Procházet zdrojové kódy

Use YouTube video duration as read time

This feature works by scraping the YouTube website.

To enable it, set the FETCH_YOUTUBE_WATCH_TIME environment variable to
1.

Resolves #972.
Ilya Mateyko před 5 roky
rodič
revize
c3f871b49b

+ 18 - 0
config/config_test.go

@@ -1413,3 +1413,21 @@ func TestAuthProxyUserCreationAdmin(t *testing.T) {
 		t.Fatalf(`Unexpected AUTH_PROXY_USER_CREATION value, got %v instead of %v`, result, expected)
 	}
 }
+
+func TestFetchYouTubeWatchTime(t *testing.T) {
+	os.Clearenv()
+	os.Setenv("FETCH_YOUTUBE_WATCH_TIME", "1")
+
+	parser := NewParser()
+	opts, err := parser.ParseEnvironmentVariables()
+	if err != nil {
+		t.Fatalf(`Parsing failure: %v`, err)
+	}
+
+	expected := true
+	result := opts.FetchYouTubeWatchTime()
+
+	if result != expected {
+		t.Fatalf(`Unexpected FETCH_YOUTUBE_WATCH_TIME value, got %v instead of %v`, result, expected)
+	}
+}

+ 10 - 0
config/options.go

@@ -43,6 +43,7 @@ const (
 	defaultCleanupArchiveUnreadDays           = 180
 	defaultCleanupRemoveSessionsDays          = 30
 	defaultProxyImages                        = "http-only"
+	defaultFetchYouTubeWatchTime              = false
 	defaultCreateAdmin                        = false
 	defaultAdminUsername                      = ""
 	defaultAdminPassword                      = ""
@@ -108,6 +109,7 @@ type Options struct {
 	adminUsername                      string
 	adminPassword                      string
 	proxyImages                        string
+	fetchYouTubeWatchTime              bool
 	oauth2UserCreationAllowed          bool
 	oauth2ClientID                     string
 	oauth2ClientSecret                 string
@@ -162,6 +164,7 @@ func NewOptions() *Options {
 		workerPoolSize:                     defaultWorkerPoolSize,
 		createAdmin:                        defaultCreateAdmin,
 		proxyImages:                        defaultProxyImages,
+		fetchYouTubeWatchTime:              defaultFetchYouTubeWatchTime,
 		oauth2UserCreationAllowed:          defaultOAuth2UserCreation,
 		oauth2ClientID:                     defaultOAuth2ClientID,
 		oauth2ClientSecret:                 defaultOAuth2ClientSecret,
@@ -373,6 +376,12 @@ func (o *Options) AdminPassword() string {
 	return o.adminPassword
 }
 
+// FetchYouTubeWatchTime reports whether the duration of a YouTube video
+// should be fetched and used as the entry reading time.
+func (o *Options) FetchYouTubeWatchTime() bool {
+	return o.fetchYouTubeWatchTime
+}
+
 // ProxyImages returns "none" to never proxy, "http-only" to proxy non-HTTPS, "all" to always proxy.
 func (o *Options) ProxyImages() string {
 	return o.proxyImages
@@ -469,6 +478,7 @@ func (o *Options) SortedOptions() []*Option {
 		"DATABASE_MIN_CONNS":                     o.databaseMinConns,
 		"DATABASE_URL":                           o.databaseURL,
 		"DEBUG":                                  o.debug,
+		"FETCH_YOUTUBE_WATCH_TIME":               o.fetchYouTubeWatchTime,
 		"HSTS":                                   o.hsts,
 		"HTTPS":                                  o.HTTPS,
 		"HTTP_CLIENT_MAX_BODY_SIZE":              o.httpClientMaxBodySize,

+ 2 - 0
config/parser.go

@@ -187,6 +187,8 @@ func (p *Parser) parseLines(lines []string) (err error) {
 			p.opts.metricsRefreshInterval = parseInt(value, defaultMetricsRefreshInterval)
 		case "METRICS_ALLOWED_NETWORKS":
 			p.opts.metricsAllowedNetworks = parseStringList(value, []string{defaultMetricsAllowedNetworks})
+		case "FETCH_YOUTUBE_WATCH_TIME":
+			p.opts.fetchYouTubeWatchTime = parseBool(value, defaultFetchYouTubeWatchTime)
 		}
 	}
 

+ 6 - 0
miniflux.1

@@ -107,6 +107,12 @@ Set the value to 1 to enable debug logs\&.
 .br
 Disabled by default\&.
 .TP
+.B FETCH_YOUTUBE_WATCH_TIME
+Set the value to 1 to scrape the video duration from the YouTube website and
+use it as the reading time\&.
+.br
+Disabled by default\&.
+.TP
 .B SERVER_TIMING_HEADER
 Set the value to 1 to enable server-timing headers\&.
 .br

+ 87 - 1
reader/processor/processor.go

@@ -5,24 +5,35 @@
 package processor
 
 import (
+	"errors"
+	"fmt"
 	"math"
 	"regexp"
+	"strconv"
 	"strings"
 	"time"
 	"unicode/utf8"
 
 	"miniflux.app/config"
+	"miniflux.app/http/client"
 	"miniflux.app/logger"
 	"miniflux.app/metric"
 	"miniflux.app/model"
+	"miniflux.app/reader/browser"
 	"miniflux.app/reader/rewrite"
 	"miniflux.app/reader/sanitizer"
 	"miniflux.app/reader/scraper"
 	"miniflux.app/storage"
 
+	"github.com/PuerkitoBio/goquery"
 	"github.com/rylans/getlang"
 )
 
+var (
+	// youtubeRegex matches a YouTube watch URL and captures everything that
+	// follows "v=".
+	youtubeRegex = regexp.MustCompile(`youtube\.com/watch\?v=(.*)`)
+	// iso8601Regex matches an ISO 8601 duration (for example "PT6M20S") and
+	// exposes each component through a named group: year, month, week, day,
+	// hour, minute and second.
+	iso8601Regex = regexp.MustCompile(`^P((?P<year>\d+)Y)?((?P<month>\d+)M)?((?P<week>\d+)W)?((?P<day>\d+)D)?(T((?P<hour>\d+)H)?((?P<minute>\d+)M)?((?P<second>\d+)S)?)?$`)
+)
+
 // ProcessFeedEntries downloads original web page for entries and apply filters.
 func ProcessFeedEntries(store *storage.Storage, feed *model.Feed) {
 	var filteredEntries model.Entries
@@ -63,7 +74,20 @@ func ProcessFeedEntries(store *storage.Storage, feed *model.Feed) {
 		// The sanitizer should always run at the end of the process to make sure unsafe HTML is filtered.
 		entry.Content = sanitizer.Sanitize(entry.URL, entry.Content)
 
-		entry.ReadingTime = calculateReadingTime(entry.Content)
+		if config.Opts.FetchYouTubeWatchTime() {
+			if matches := youtubeRegex.FindStringSubmatch(entry.URL); len(matches) == 2 {
+				watchTime, err := fetchYouTubeWatchTime(entry.URL)
+				if err != nil {
+					logger.Error("[Processor] Unable to fetch YouTube watch time: %q => %v", entry.URL, err)
+				}
+				entry.ReadingTime = watchTime
+			}
+		}
+
+		if entry.ReadingTime == 0 {
+			entry.ReadingTime = calculateReadingTime(entry.Content)
+		}
+
 		filteredEntries = append(filteredEntries, entry)
 	}
 
@@ -120,6 +144,68 @@ func ProcessEntryWebPage(entry *model.Entry) error {
 	return nil
 }
 
+// fetchYouTubeWatchTime downloads the page at url and returns the video
+// duration found in the content attribute of its first
+// `meta[itemprop="duration"]` element, in whole minutes (the fractional
+// part is truncated).
+//
+// An error is returned when the page cannot be fetched or parsed, when no
+// content attribute can be read from such an element, or when the attribute
+// value is not a duration that parseISO8601 accepts.
+func fetchYouTubeWatchTime(url string) (int, error) {
+	clt := client.NewClientWithConfig(url, config.Opts)
+	response, browserErr := browser.Exec(clt)
+	if browserErr != nil {
+		return 0, browserErr
+	}
+
+	doc, docErr := goquery.NewDocumentFromReader(response.Body)
+	if docErr != nil {
+		return 0, docErr
+	}
+
+	durs, exists := doc.Find(`meta[itemprop="duration"]`).First().Attr("content")
+	if !exists {
+		return 0, errors.New("duration has not been found")
+	}
+
+	dur, err := parseISO8601(durs)
+	if err != nil {
+		// %q keeps an empty or malformed attribute value visible in the
+		// message; %w keeps the underlying error reachable through
+		// errors.Is and errors.As.
+		return 0, fmt.Errorf("unable to parse duration %q: %w", durs, err)
+	}
+
+	return int(dur.Minutes()), nil
+}
+
+// parseISO8601 parses an ISO 8601 duration string such as "PT1H20M42S".
+//
+// Week, day, hour, minute and second components are summed into a
+// time.Duration, counting a day as 24 hours and a week as 7 days. Year and
+// month components have no fixed length, so a duration that contains one is
+// rejected with an "unknown field" error. A string that does not match
+// iso8601Regex is rejected as well.
+func parseISO8601(from string) (time.Duration, error) {
+	// FindStringSubmatch returns nil when the input does not match, so one
+	// regexp evaluation both validates the string and extracts its parts.
+	match := iso8601Regex.FindStringSubmatch(from)
+	if match == nil {
+		return 0, errors.New("could not parse duration string")
+	}
+
+	var d time.Duration
+	for i, name := range iso8601Regex.SubexpNames() {
+		part := match[i]
+		// Skip the whole-match entry, the unnamed wrapper groups and the
+		// components that are absent from the input.
+		if i == 0 || name == "" || part == "" {
+			continue
+		}
+
+		val, err := strconv.ParseInt(part, 10, 64)
+		if err != nil {
+			return 0, err
+		}
+
+		switch name {
+		case "week":
+			d += time.Duration(val) * 7 * 24 * time.Hour
+		case "day":
+			d += time.Duration(val) * 24 * time.Hour
+		case "hour":
+			d += time.Duration(val) * time.Hour
+		case "minute":
+			d += time.Duration(val) * time.Minute
+		case "second":
+			d += time.Duration(val) * time.Second
+		default:
+			// Only year and month reach this branch.
+			return 0, fmt.Errorf("unknown field %s", name)
+		}
+	}
+
+	return d, nil
+}
+
 func calculateReadingTime(content string) int {
 	sanitizedContent := sanitizer.StripTags(content)
 	languageInfo := getlang.FromString(sanitizedContent)

+ 30 - 0
reader/processor/processor_test.go

@@ -6,6 +6,7 @@ package processor // import "miniflux.app/reader/processor"
 
 import (
 	"testing"
+	"time"
 
 	"miniflux.app/model"
 )
@@ -47,3 +48,32 @@ func TestAllowEntries(t *testing.T) {
 		}
 	}
 }
+
+// TestParseISO8601 runs parseISO8601 against the duration strings of the
+// YouTube videos referenced in the comments below and compares each result
+// with the expected time.Duration.
+func TestParseISO8601(t *testing.T) {
+	type testCase struct {
+		duration string
+		expected time.Duration
+	}
+
+	cases := []testCase{
+		// Live streams and radio.
+		{duration: "PT0M0S", expected: 0},
+		// https://www.youtube.com/watch?v=HLrqNhgdiC0
+		{duration: "PT6M20S", expected: 6*time.Minute + 20*time.Second},
+		// https://www.youtube.com/watch?v=LZa5KKfqHtA
+		{duration: "PT5M41S", expected: 5*time.Minute + 41*time.Second},
+		// https://www.youtube.com/watch?v=yIxEEgEuhT4
+		{duration: "PT51M52S", expected: 51*time.Minute + 52*time.Second},
+		// https://www.youtube.com/watch?v=bpHf1XcoiFs
+		{duration: "PT80M42S", expected: time.Hour + 20*time.Minute + 42*time.Second},
+	}
+
+	for i := range cases {
+		input, want := cases[i].duration, cases[i].expected
+
+		got, err := parseISO8601(input)
+		if err != nil {
+			t.Errorf("Got an error when parsing %q: %v", input, err)
+		}
+
+		if got != want {
+			t.Errorf(`Unexpected result, got %v for duration %q`, got, input)
+		}
+	}
+}