Browse Source

feat(finder): enhance youtube channel parsing with default playlists

Mateusz Jabłoński 4 months ago
parent
commit
3ea4aee4d6
2 changed files with 131 additions and 21 deletions
  1. 67 4
      internal/reader/subscription/finder.go
  2. 64 17
      internal/reader/subscription/finder_test.go

+ 67 - 4
internal/reader/subscription/finder.go

@@ -71,7 +71,8 @@ func (f *subscriptionFinder) FindSubscriptions(websiteURL, rssBridgeURL string,
 
 	// Step 2) Check if the website URL is a YouTube channel.
 	slog.Debug("Try to detect feeds for a YouTube page", slog.String("website_url", websiteURL))
-	if subscriptions, localizedError := f.findSubscriptionsFromYouTube(websiteURL); localizedError != nil {
+	youtubeURL := f.findCanonicalURL(websiteURL, responseHandler.ContentType(), bytes.NewReader(responseBody))
+	if subscriptions, localizedError := f.findSubscriptionsFromYouTube(youtubeURL); localizedError != nil {
 		return nil, localizedError
 	} else if len(subscriptions) > 0 {
 		slog.Debug("Subscriptions found from YouTube page", slog.String("website_url", websiteURL), slog.Any("subscriptions", subscriptions))
@@ -274,6 +275,24 @@ func (f *subscriptionFinder) findSubscriptionsFromRSSBridge(websiteURL, rssBridg
 }
 
 func (f *subscriptionFinder) findSubscriptionsFromYouTube(websiteURL string) (Subscriptions, *locale.LocalizedErrorWrapper) {
+	playlistPrefixes := []struct {
+		prefix string
+		title  string
+	}{
+		{"UULF", "Videos"},
+		{"UUSH", "Short videos"},
+		{"UULV", "Live streams"},
+
+		{"UULP", "Popular videos"},
+		{"UUPS", "Popular short videos"},
+		{"UUPV", "Popular live streams"},
+		
+		{"UUMO", "Members-only contents (videos, short videos and live streams)"},
+		{"UUMF", "Members-only videos"},
+		{"UUMS", "Members-only short videos"},
+		{"UUMV", "Members-only live streams"},
+	}
+
 	decodedURL, err := url.Parse(websiteURL)
 	if err != nil {
 		return nil, locale.NewLocalizedErrorWrapper(err, "error.invalid_site_url", err)
@@ -283,9 +302,19 @@ func (f *subscriptionFinder) findSubscriptionsFromYouTube(websiteURL string) (Su
 		slog.Debug("YouTube feed discovery skipped: not a YouTube domain", slog.String("website_url", websiteURL))
 		return nil, nil
 	}
-	if _, channelID, found := strings.Cut(decodedURL.Path, "channel/"); found {
-		feedURL := "https://www.youtube.com/feeds/videos.xml?channel_id=" + channelID
-		return Subscriptions{NewSubscription(decodedURL.String(), feedURL, parser.FormatAtom)}, nil
+
+	if _, baseID, found := strings.Cut(decodedURL.Path, "channel/UC"); found {
+		var subscriptions Subscriptions
+		
+		channelFeedURL := "https://www.youtube.com/feeds/videos.xml?channel_id=UC" + baseID
+		subscriptions = append(subscriptions, NewSubscription("Channel", channelFeedURL, parser.FormatAtom))
+			
+		for _, playlist := range playlistPrefixes {
+			playlistFeedURL := "https://www.youtube.com/feeds/videos.xml?playlist_id=" + playlist.prefix + baseID
+			subscriptions = append(subscriptions, NewSubscription(playlist.title, playlistFeedURL, parser.FormatAtom))
+		}
+		
+		return subscriptions, nil
 	}
 
 	if strings.HasPrefix(decodedURL.Path, "/watch") || strings.HasPrefix(decodedURL.Path, "/playlist") {
@@ -297,3 +326,37 @@ func (f *subscriptionFinder) findSubscriptionsFromYouTube(websiteURL string) (Su
 
 	return nil, nil
 }
+
+// findCanonicalURL parses body as HTML and returns the href of the first <link rel="canonical"> element, resolved against the page's absolute <base href> when one is present and against effectiveURL otherwise.
+// It returns effectiveURL unchanged when the charset reader or the HTML parser fails, when no non-blank canonical href is found, or when the href cannot be resolved to an absolute URL.
+func (f *subscriptionFinder) findCanonicalURL(effectiveURL, contentType string, body io.Reader) string {
+	htmlDocumentReader, err := encoding.NewCharsetReader(body, contentType) // contentType is handed to the charset reader; presumably used to pick the decoding — confirm in the encoding package
+	if err != nil {
+		return effectiveURL // best effort: any failure falls back to the URL that was actually fetched
+	}
+
+	doc, err := goquery.NewDocumentFromReader(htmlDocumentReader)
+	if err != nil {
+		return effectiveURL
+	}
+
+	baseURL := effectiveURL // default base for resolving a relative canonical href
+	if hrefValue, exists := doc.FindMatcher(goquery.Single("head base")).Attr("href"); exists {
+		hrefValue = strings.TrimSpace(hrefValue)
+		if urllib.IsAbsoluteURL(hrefValue) { // a relative <base href> is ignored and effectiveURL stays the base
+			baseURL = hrefValue
+		}
+	}
+
+	canonicalHref, exists := doc.Find("link[rel='canonical' i]").First().Attr("href") // only the first matching <link> is considered; NOTE(review): a multi-token rel such as "canonical alternate" would not match this exact-value selector — confirm whether that matters
+	if !exists || strings.TrimSpace(canonicalHref) == "" {
+		return effectiveURL // missing or whitespace-only href counts as "no canonical URL"
+	}
+
+	canonicalURL, err := urllib.AbsoluteURL(baseURL, strings.TrimSpace(canonicalHref)) // resolve the trimmed href against the chosen base
+	if err != nil {
+		return effectiveURL
+	}
+
+	return canonicalURL
+}

+ 64 - 17
internal/reader/subscription/finder_test.go

@@ -11,7 +11,7 @@ import (
 func TestFindYoutubeFeed(t *testing.T) {
 	type testResult struct {
 		websiteURL     string
-		feedURL        string
+		feedURLs       []string
 		discoveryError bool
 	}
 
@@ -19,57 +19,69 @@ func TestFindYoutubeFeed(t *testing.T) {
 		// Video URL
 		{
 			websiteURL: "https://www.youtube.com/watch?v=dQw4w9WgXcQ",
-			feedURL:    "",
+			feedURLs:   []string{},
 		},
 		// Video URL with position argument
 		{
 			websiteURL: "https://www.youtube.com/watch?v=dQw4w9WgXcQ&t=1",
-			feedURL:    "",
+			feedURLs:   []string{},
 		},
 		// Video URL with position argument
 		{
 			websiteURL: "https://www.youtube.com/watch?t=1&v=dQw4w9WgXcQ",
-			feedURL:    "",
+			feedURLs:   []string{},
 		},
 		// Channel URL
 		{
 			websiteURL: "https://www.youtube.com/channel/UC-Qj80avWItNRjkZ41rzHyw",
-			feedURL:    "https://www.youtube.com/feeds/videos.xml?channel_id=UC-Qj80avWItNRjkZ41rzHyw",
+			feedURLs: []string{
+				"https://www.youtube.com/feeds/videos.xml?channel_id=UC-Qj80avWItNRjkZ41rzHyw",
+				"https://www.youtube.com/feeds/videos.xml?playlist_id=UULF-Qj80avWItNRjkZ41rzHyw",
+				"https://www.youtube.com/feeds/videos.xml?playlist_id=UUSH-Qj80avWItNRjkZ41rzHyw",
+				"https://www.youtube.com/feeds/videos.xml?playlist_id=UULV-Qj80avWItNRjkZ41rzHyw",
+				"https://www.youtube.com/feeds/videos.xml?playlist_id=UULP-Qj80avWItNRjkZ41rzHyw",
+				"https://www.youtube.com/feeds/videos.xml?playlist_id=UUPS-Qj80avWItNRjkZ41rzHyw",
+				"https://www.youtube.com/feeds/videos.xml?playlist_id=UUPV-Qj80avWItNRjkZ41rzHyw",
+				"https://www.youtube.com/feeds/videos.xml?playlist_id=UUMO-Qj80avWItNRjkZ41rzHyw",
+				"https://www.youtube.com/feeds/videos.xml?playlist_id=UUMF-Qj80avWItNRjkZ41rzHyw",
+				"https://www.youtube.com/feeds/videos.xml?playlist_id=UUMS-Qj80avWItNRjkZ41rzHyw",
+				"https://www.youtube.com/feeds/videos.xml?playlist_id=UUMV-Qj80avWItNRjkZ41rzHyw",
+			},
 		},
 		// Channel URL with name
 		{
 			websiteURL: "https://www.youtube.com/@ABCDEFG",
-			feedURL:    "",
+			feedURLs:   []string{},
 		},
 		// Playlist URL
 		{
 			websiteURL: "https://www.youtube.com/playlist?list=PLOOwEPgFWm_NHcQd9aCi5JXWASHO_n5uR",
-			feedURL:    "https://www.youtube.com/feeds/videos.xml?playlist_id=PLOOwEPgFWm_NHcQd9aCi5JXWASHO_n5uR",
+			feedURLs:   []string{"https://www.youtube.com/feeds/videos.xml?playlist_id=PLOOwEPgFWm_NHcQd9aCi5JXWASHO_n5uR"},
 		},
 		// Playlist URL with video ID
 		{
 			websiteURL: "https://www.youtube.com/watch?v=dQw4w9WgXcQ&list=PLOOwEPgFWm_N42HlCLhqyJ0ZBWr5K1QDM",
-			feedURL:    "https://www.youtube.com/feeds/videos.xml?playlist_id=PLOOwEPgFWm_N42HlCLhqyJ0ZBWr5K1QDM",
+			feedURLs:   []string{"https://www.youtube.com/feeds/videos.xml?playlist_id=PLOOwEPgFWm_N42HlCLhqyJ0ZBWr5K1QDM"},
 		},
 		// Playlist URL with video ID and index argument
 		{
 			websiteURL: "https://www.youtube.com/watch?v=6IutBmRJNLk&list=PLOOwEPgFWm_NHcQd9aCi5JXWASHO_n5uR&index=4",
-			feedURL:    "https://www.youtube.com/feeds/videos.xml?playlist_id=PLOOwEPgFWm_NHcQd9aCi5JXWASHO_n5uR",
+			feedURLs:   []string{"https://www.youtube.com/feeds/videos.xml?playlist_id=PLOOwEPgFWm_NHcQd9aCi5JXWASHO_n5uR"},
 		},
 		// Empty playlist ID parameter
 		{
 			websiteURL: "https://www.youtube.com/playlist?list=",
-			feedURL:    "",
+			feedURLs:   []string{},
 		},
 		// Non-Youtube URL
 		{
 			websiteURL: "https://www.example.com/channel/UC-Qj80avWItNRjkZ41rzHyw",
-			feedURL:    "",
+			feedURLs:   []string{},
 		},
 		// Invalid URL
 		{
 			websiteURL:     "https://example|org/",
-			feedURL:        "",
+			feedURLs:       []string{},
 			discoveryError: true,
 		},
 	}
@@ -82,7 +94,7 @@ func TestFindYoutubeFeed(t *testing.T) {
 			}
 		}
 
-		if scenario.feedURL == "" {
+		if len(scenario.feedURLs) == 0 {
 			if len(subscriptions) > 0 {
 				t.Fatalf(`Parsing an invalid URL should not return any subscription: %q -> %v`, scenario.websiteURL, subscriptions)
 			}
@@ -91,12 +103,14 @@ func TestFindYoutubeFeed(t *testing.T) {
 				t.Fatalf(`Parsing a correctly formatted YouTube playlist or channel page should not return any error: %v`, localizedError)
 			}
 
-			if len(subscriptions) != 1 {
-				t.Fatalf(`Incorrect number of subscriptions returned`)
+			if len(subscriptions) != len(scenario.feedURLs) {
+				t.Fatalf(`Incorrect number of subscriptions returned, expected %d, got %d`, len(scenario.feedURLs), len(subscriptions))
 			}
 
-			if subscriptions[0].URL != scenario.feedURL {
-				t.Errorf(`Unexpected feed, got %s, instead of %s`, subscriptions[0].URL, scenario.feedURL)
+			for i := range scenario.feedURLs {
+				if subscriptions[i].URL != scenario.feedURLs[i] {
+					t.Errorf(`Unexpected feed, got %s, instead of %s`, subscriptions[i].URL, scenario.feedURLs[i])
+				}
 			}
 		}
 	}
@@ -397,3 +411,36 @@ func TestParseWebPageWithNoHref(t *testing.T) {
 		t.Fatal(`Incorrect number of subscriptions returned`)
 	}
 }
+
+func TestFindCanonicalURL(t *testing.T) { // checks that an absolute <link rel="canonical"> href is returned in place of the effective URL
+	htmlPage := `
+	<!doctype html>
+	<html>
+		<head>
+			<link rel="canonical" href="https://example.org/canonical-page">
+		</head>
+		<body>
+		</body>
+	</html>`
+
+	canonicalURL := NewSubscriptionFinder(nil).findCanonicalURL("https://example.org/page", "text/html", strings.NewReader(htmlPage)) // the effective URL passed here differs from the canonical href, so a fallback would be detected
+	if canonicalURL != "https://example.org/canonical-page" {
+		t.Errorf(`Unexpected canonical URL, got %q, expected %q`, canonicalURL, "https://example.org/canonical-page")
+	}
+}
+
+func TestFindCanonicalURLNotFound(t *testing.T) { // checks the fallback: a page without a canonical link yields the effective URL unchanged
+	htmlPage := `
+	<!doctype html>
+	<html>
+		<head>
+		</head>
+		<body>
+		</body>
+	</html>`
+
+	canonicalURL := NewSubscriptionFinder(nil).findCanonicalURL("https://example.org/page", "text/html", strings.NewReader(htmlPage)) // the <head> above is empty, so there is neither a <base> nor a canonical <link>
+	if canonicalURL != "https://example.org/page" {
+		t.Errorf(`Expected effective URL when canonical not found, got %q`, canonicalURL)
+	}
+}