Parcourir la source

Improve YouTube page feed detection

In order to be more resilient to YouTube URLs variation and
to address this feature_request: https://github.com/miniflux/v2/issues/2628
I've reworked a bit the way the YouTube feed extraction is done.

I've kept all the `FindSubscriptionsFromYouTube*` in order
to keep all the existing unit tests as-is ensuring little to no
regressions. By doing so, I had to call twice `youtubeURLIDExtractor`.
Small performance penalty for peace of mind in my opinion.

`youtubeURLIDExtractor` is made in a way only one kind
of page can be detected at a time. This mean I can
solve the "video in a playlist" feature_request
by prioritizing the playlist ID over the Video ID

Also, by using `url.Parse()` to get ids, it's safer
to url mangle and variation. The most common variation
being the `t=42` parameters that start the playback
at a given position. Previously, this kind of url
would not be detected as "YouTube URL".

I deliberately ignored the url parsing error
to keep previous behavior (skip the YouTube analysis and follow with the other analysis)

I also tried to keep debug logs the same as before as much as I could.

I manually tested all the YouTube cases (video,channel,playlist)
and they all work as expected except for the video. But this one
does not work either on main. The `meta` html tag that was searched for
does not seem to exist anymore.

fix: #2628
Ztec il y a 1 an
Parent
commit
e54825bf02
2 fichiers modifiés avec 163 ajouts et 48 suppressions
  1. 94 46
      internal/reader/subscription/finder.go
  2. 69 2
      internal/reader/subscription/finder_test.go

+ 94 - 46
internal/reader/subscription/finder.go

@@ -5,10 +5,13 @@ package subscription // import "miniflux.app/v2/internal/reader/subscription"
 
 import (
 	"bytes"
+	"errors"
 	"fmt"
 	"io"
 	"log/slog"
+	"net/url"
 	"regexp"
+	"strings"
 
 	"miniflux.app/v2/internal/config"
 	"miniflux.app/v2/internal/integration/rssbridge"
@@ -22,10 +25,19 @@ import (
 	"golang.org/x/net/html/charset"
 )
 
+type youtubeKind string
+
+const (
+	youtubeIDKindChannel  youtubeKind = "channel"
+	youtubeIDKindVideo    youtubeKind = "video"
+	youtubeIDKindPlaylist youtubeKind = "playlist"
+)
+
 var (
-	youtubeChannelRegex  = regexp.MustCompile(`youtube\.com/channel/(.*)$`)
-	youtubeVideoRegex    = regexp.MustCompile(`youtube\.com/watch\?v=(.*)$`)
-	youtubePlaylistRegex = regexp.MustCompile(`youtube\.com/playlist\?list=(.*)$`)
+	youtubeHostRegex    = regexp.MustCompile(`youtube\.com$`)
+	youtubeChannelRegex = regexp.MustCompile(`channel/(.*)$`)
+
+	errNotYoutubeUrl = fmt.Errorf("this website is not a YouTube page")
 )
 
 type SubscriptionFinder struct {
@@ -75,43 +87,40 @@ func (f *SubscriptionFinder) FindSubscriptions(websiteURL, rssBridgeURL string)
 		return Subscriptions{NewSubscription(responseHandler.EffectiveURL(), responseHandler.EffectiveURL(), feedFormat)}, nil
 	}
 
-	// Step 2) Check if the website URL is a YouTube channel.
-	slog.Debug("Try to detect feeds from YouTube channel page", slog.String("website_url", websiteURL))
-	subscriptions, localizedError := f.FindSubscriptionsFromYouTubeChannelPage(websiteURL)
-	if localizedError != nil {
-		return nil, localizedError
-	}
+	subscriptions := make(Subscriptions, 1)
 
-	if len(subscriptions) > 0 {
-		slog.Debug("Subscriptions found from YouTube channel page", slog.String("website_url", websiteURL), slog.Any("subscriptions", subscriptions))
-		return subscriptions, nil
-	}
+	// Step 2) Parse URL to find feeds from YouTube.
+	kind, _, err := youtubeURLIDExtractor(websiteURL)
 
-	// Step 3) Check if the website URL is a YouTube video.
-	slog.Debug("Try to detect feeds from YouTube video page", slog.String("website_url", websiteURL))
-	subscriptions, localizedError = f.FindSubscriptionsFromYouTubeVideoPage(websiteURL)
-	if localizedError != nil {
-		return nil, localizedError
-	}
-
-	if len(subscriptions) > 0 {
-		slog.Debug("Subscriptions found from YouTube video page", slog.String("website_url", websiteURL), slog.Any("subscriptions", subscriptions))
-		return subscriptions, nil
-	}
-
-	// Step 4) Check if the website URL is a YouTube playlist.
-	slog.Debug("Try to detect feeds from YouTube playlist page", slog.String("website_url", websiteURL))
-	subscriptions, localizedError = f.FindSubscriptionsFromYouTubePlaylistPage(websiteURL)
-	if localizedError != nil {
-		return nil, localizedError
-	}
-
-	if len(subscriptions) > 0 {
-		slog.Debug("Subscriptions found from YouTube playlist page", slog.String("website_url", websiteURL), slog.Any("subscriptions", subscriptions))
-		return subscriptions, nil
+	// If YouTube url has been detected, return the YouTube feed
+	if err == nil || !errors.Is(err, errNotYoutubeUrl) {
+		switch kind {
+		case youtubeIDKindChannel:
+			slog.Debug("Try to detect feeds from YouTube channel page", slog.String("website_url", websiteURL))
+			subscriptions, localizedError = f.FindSubscriptionsFromYouTubeChannelPage(websiteURL)
+			if localizedError != nil {
+				return nil, localizedError
+			}
+		case youtubeIDKindVideo:
+			slog.Debug("Try to detect feeds from YouTube video page", slog.String("website_url", websiteURL))
+			subscriptions, localizedError = f.FindSubscriptionsFromYouTubeVideoPage(websiteURL)
+			if localizedError != nil {
+				return nil, localizedError
+			}
+		case youtubeIDKindPlaylist:
+			slog.Debug("Try to detect feeds from YouTube playlist page", slog.String("website_url", websiteURL))
+			subscriptions, localizedError = f.FindSubscriptionsFromYouTubePlaylistPage(websiteURL)
+			if localizedError != nil {
+				return nil, localizedError
+			}
+		}
+		if len(subscriptions) > 0 {
+			slog.Debug("Subscriptions found from YouTube page", slog.String("website_url", websiteURL), slog.Any("subscriptions", subscriptions))
+			return subscriptions, nil
+		}
 	}
 
-	// Step 5) Parse web page to find feeds from HTML meta tags.
+	// Step 3) Parse web page to find feeds from HTML meta tags.
 	slog.Debug("Try to detect feeds from HTML meta tags",
 		slog.String("website_url", websiteURL),
 		slog.String("content_type", responseHandler.ContentType()),
@@ -126,7 +135,7 @@ func (f *SubscriptionFinder) FindSubscriptions(websiteURL, rssBridgeURL string)
 		return subscriptions, nil
 	}
 
-	// Step 6) Check if the website URL can use RSS-Bridge.
+	// Step 4) Check if the website URL can use RSS-Bridge.
 	if rssBridgeURL != "" {
 		slog.Debug("Try to detect feeds with RSS-Bridge", slog.String("website_url", websiteURL))
 		subscriptions, localizedError := f.FindSubscriptionsFromRSSBridge(websiteURL, rssBridgeURL)
@@ -140,7 +149,7 @@ func (f *SubscriptionFinder) FindSubscriptions(websiteURL, rssBridgeURL string)
 		}
 	}
 
-	// Step 7) Check if the website has a known feed URL.
+	// Step 5) Check if the website has a known feed URL.
 	slog.Debug("Try to detect feeds from well-known URLs", slog.String("website_url", websiteURL))
 	subscriptions, localizedError = f.FindSubscriptionsFromWellKnownURLs(websiteURL)
 	if localizedError != nil {
@@ -298,20 +307,24 @@ func (f *SubscriptionFinder) FindSubscriptionsFromRSSBridge(websiteURL, rssBridg
 }
 
 func (f *SubscriptionFinder) FindSubscriptionsFromYouTubeChannelPage(websiteURL string) (Subscriptions, *locale.LocalizedErrorWrapper) {
-	matches := youtubeChannelRegex.FindStringSubmatch(websiteURL)
+	kind, id, _ := youtubeURLIDExtractor(websiteURL)
 
-	if len(matches) == 2 {
-		feedURL := fmt.Sprintf(`https://www.youtube.com/feeds/videos.xml?channel_id=%s`, matches[1])
+	if kind == youtubeIDKindChannel {
+		feedURL := fmt.Sprintf(`https://www.youtube.com/feeds/videos.xml?channel_id=%s`, id)
 		return Subscriptions{NewSubscription(websiteURL, feedURL, parser.FormatAtom)}, nil
 	}
-
 	slog.Debug("This website is not a YouTube channel page, the regex doesn't match", slog.String("website_url", websiteURL))
 
 	return nil, nil
 }
 
 func (f *SubscriptionFinder) FindSubscriptionsFromYouTubeVideoPage(websiteURL string) (Subscriptions, *locale.LocalizedErrorWrapper) {
-	if !youtubeVideoRegex.MatchString(websiteURL) {
+	kind, _, err := youtubeURLIDExtractor(websiteURL)
+	if err != nil {
+		slog.Debug("Could not parse url", slog.String("website_url", websiteURL))
+	}
+
+	if kind != youtubeIDKindVideo {
 		slog.Debug("This website is not a YouTube video page, the regex doesn't match", slog.String("website_url", websiteURL))
 		return nil, nil
 	}
@@ -337,10 +350,10 @@ func (f *SubscriptionFinder) FindSubscriptionsFromYouTubeVideoPage(websiteURL st
 }
 
 func (f *SubscriptionFinder) FindSubscriptionsFromYouTubePlaylistPage(websiteURL string) (Subscriptions, *locale.LocalizedErrorWrapper) {
-	matches := youtubePlaylistRegex.FindStringSubmatch(websiteURL)
+	kind, id, _ := youtubeURLIDExtractor(websiteURL)
 
-	if len(matches) == 2 {
-		feedURL := fmt.Sprintf(`https://www.youtube.com/feeds/videos.xml?playlist_id=%s`, matches[1])
+	if kind == youtubeIDKindPlaylist {
+		feedURL := fmt.Sprintf(`https://www.youtube.com/feeds/videos.xml?playlist_id=%s`, id)
 		return Subscriptions{NewSubscription(websiteURL, feedURL, parser.FormatAtom)}, nil
 	}
 
@@ -348,3 +361,38 @@ func (f *SubscriptionFinder) FindSubscriptionsFromYouTubePlaylistPage(websiteURL
 
 	return nil, nil
 }
+
+func youtubeURLIDExtractor(websiteURL string) (idKind youtubeKind, id string, err error) {
+	decodedUrl, err := url.Parse(websiteURL)
+	if err != nil {
+		return
+	}
+
+	if !youtubeHostRegex.MatchString(decodedUrl.Host) {
+		slog.Debug("This website is not a YouTube page, the regex doesn't match", slog.String("website_url", websiteURL))
+		err = errNotYoutubeUrl
+		return
+	}
+
+	switch {
+	case strings.HasPrefix(decodedUrl.Path, "/channel"):
+		idKind = youtubeIDKindChannel
+		matches := youtubeChannelRegex.FindStringSubmatch(decodedUrl.Path)
+		id = matches[1]
+		return
+	case strings.HasPrefix(decodedUrl.Path, "/watch") && decodedUrl.Query().Has("list"):
+		idKind = youtubeIDKindPlaylist
+		id = decodedUrl.Query().Get("list")
+		return
+	case strings.HasPrefix(decodedUrl.Path, "/watch"):
+		idKind = youtubeIDKindVideo
+		id = decodedUrl.Query().Get("v")
+		return
+	case strings.HasPrefix(decodedUrl.Path, "/playlist"):
+		idKind = youtubeIDKindPlaylist
+		id = decodedUrl.Query().Get("list")
+		return
+	}
+	err = fmt.Errorf("unable to extract youtube id from URL: %s", websiteURL)
+	return
+}

+ 69 - 2
internal/reader/subscription/finder_test.go

@@ -4,14 +4,16 @@
 package subscription
 
 import (
+	"errors"
 	"strings"
 	"testing"
 )
 
 func TestFindYoutubePlaylistFeed(t *testing.T) {
 	scenarios := map[string]string{
-		"https://www.youtube.com/playlist?list=PLOOwEPgFWm_NHcQd9aCi5JXWASHO_n5uR": "https://www.youtube.com/feeds/videos.xml?playlist_id=PLOOwEPgFWm_NHcQd9aCi5JXWASHO_n5uR",
-		"https://www.youtube.com/playlist?list=PLOOwEPgFWm_N42HlCLhqyJ0ZBWr5K1QDM": "https://www.youtube.com/feeds/videos.xml?playlist_id=PLOOwEPgFWm_N42HlCLhqyJ0ZBWr5K1QDM",
+		"https://www.youtube.com/playlist?list=PLOOwEPgFWm_NHcQd9aCi5JXWASHO_n5uR":            "https://www.youtube.com/feeds/videos.xml?playlist_id=PLOOwEPgFWm_NHcQd9aCi5JXWASHO_n5uR",
+		"https://www.youtube.com/playlist?list=PLOOwEPgFWm_N42HlCLhqyJ0ZBWr5K1QDM":            "https://www.youtube.com/feeds/videos.xml?playlist_id=PLOOwEPgFWm_N42HlCLhqyJ0ZBWr5K1QDM",
+		"https://www.youtube.com/watch?v=dQw4w9WgXcQ&list=PLOOwEPgFWm_N42HlCLhqyJ0ZBWr5K1QDM": "https://www.youtube.com/feeds/videos.xml?playlist_id=PLOOwEPgFWm_N42HlCLhqyJ0ZBWr5K1QDM",
 	}
 
 	for websiteURL, expectedFeedURL := range scenarios {
@@ -30,6 +32,71 @@ func TestFindYoutubePlaylistFeed(t *testing.T) {
 	}
 }
 
+func TestItDoesNotConsiderPlaylistWatchPageAsVideoWatchPage(t *testing.T) {
+	_, localizedError := NewSubscriptionFinder(nil).FindSubscriptionsFromYouTubeVideoPage("https://www.youtube.com/watch?v=dQw4w9WgXcQ&list=PLOOwEPgFWm_N42HlCLhqyJ0ZBWr5K1QDM")
+	if localizedError != nil {
+		t.Fatalf(`Should not consider a playlist watch page as a video watch page`)
+	}
+}
+
+func TestYoutubeIdExtractor(t *testing.T) {
+	type testResult struct {
+		ID    string
+		Kind  youtubeKind
+		error error
+	}
+	urls := map[string]testResult{
+		"https://www.youtube.com/watch?v=dQw4w9WgXcQ": {
+			ID:    "dQw4w9WgXcQ",
+			Kind:  youtubeIDKindVideo,
+			error: nil,
+		},
+		"https://www.youtube.com/watch?v=dQw4w9WgXcQ&t=1": {
+			ID:    "dQw4w9WgXcQ",
+			Kind:  youtubeIDKindVideo,
+			error: nil,
+		},
+		"https://www.youtube.com/watch?t=1&v=dQw4w9WgXcQ": {
+			ID:    "dQw4w9WgXcQ",
+			Kind:  youtubeIDKindVideo,
+			error: nil,
+		},
+		"https://www.youtube.com/watch?v=dQw4w9WgXcQ&list=PLOOwEPgFWm_N42HlCLhqyJ0ZBWr5K1QDM": {
+			ID:    "PLOOwEPgFWm_N42HlCLhqyJ0ZBWr5K1QDM",
+			Kind:  youtubeIDKindPlaylist,
+			error: nil,
+		},
+		"https://www.youtube.com/playlist?list=PLOOwEPgFWm_NHcQd9aCi5JXWASHO_n5uR": {
+			ID:    "PLOOwEPgFWm_NHcQd9aCi5JXWASHO_n5uR",
+			Kind:  youtubeIDKindPlaylist,
+			error: nil,
+		},
+		"https://www.youtube.com/channel/UC-Qj80avWItNRjkZ41rzHyw": {
+			ID:    "UC-Qj80avWItNRjkZ41rzHyw",
+			Kind:  youtubeIDKindChannel,
+			error: nil,
+		},
+		"https://www.example.com/channel/UC-Qj80avWItNRjkZ41rzHyw": {
+			ID:    "",
+			Kind:  "",
+			error: errNotYoutubeUrl,
+		},
+	}
+
+	for websiteURL, expected := range urls {
+		kind, id, err := youtubeURLIDExtractor(websiteURL)
+		if !errors.Is(err, expected.error) {
+			t.Fatalf(`Unexpected error: %v got %v`, expected.error, err)
+		}
+		if id != expected.ID {
+			t.Fatalf(`Unexpected ID: %v got %v`, expected.ID, id)
+		}
+		if kind != expected.Kind {
+			t.Fatalf(`Unexpected Kind: %v got %v`, expected.Kind, kind)
+		}
+	}
+}
+
 func TestFindYoutubeChannelFeed(t *testing.T) {
 	scenarios := map[string]string{
 		"https://www.youtube.com/channel/UC-Qj80avWItNRjkZ41rzHyw": "https://www.youtube.com/feeds/videos.xml?channel_id=UC-Qj80avWItNRjkZ41rzHyw",