Browse Source

Deduplicate feed URLs when parsing HTML document during discovery process

Fixes #2232
Frédéric Guillot 2 years ago
parent
commit
5de0714256

+ 7 - 2
internal/reader/subscription/finder.go

@@ -152,6 +152,7 @@ func (f *SubscriptionFinder) FindSubscriptionsFromWebPage(websiteURL string, bod
 	}
 
 	var subscriptions Subscriptions
+	subscriptionURLs := make(map[string]bool)
 	for query, kind := range queries {
 		doc.Find(query).Each(func(i int, s *goquery.Selection) {
 			subscription := new(Subscription)
@@ -163,7 +164,10 @@ func (f *SubscriptionFinder) FindSubscriptionsFromWebPage(websiteURL string, bod
 
 			if feedURL, exists := s.Attr("href"); exists {
 				if feedURL != "" {
-					subscription.URL, _ = urllib.AbsoluteURL(websiteURL, feedURL)
+					subscription.URL, err = urllib.AbsoluteURL(websiteURL, feedURL)
+					if err != nil {
+						return
+					}
 				}
 			}
 
@@ -171,7 +175,8 @@ func (f *SubscriptionFinder) FindSubscriptionsFromWebPage(websiteURL string, bod
 				subscription.Title = subscription.URL
 			}
 
-			if subscription.URL != "" {
+			if subscription.URL != "" && !subscriptionURLs[subscription.URL] {
+				subscriptionURLs[subscription.URL] = true
 				subscriptions = append(subscriptions, subscription)
 			}
 		})

+ 34 - 0
internal/reader/subscription/finder_test.go

@@ -249,6 +249,40 @@ func TestParseWebPageWithMultipleFeeds(t *testing.T) {
 	}
 }
 
+func TestParseWebPageWithDuplicatedFeeds(t *testing.T) { // Two <link> tags sharing one href must produce a single subscription.
+	htmlPage := `
+	<!doctype html>
+	<html>
+		<head>
+			<link href="http://example.org/feed.xml" rel="alternate" type="application/rss+xml" title="Feed A">
+			<link href="http://example.org/feed.xml" rel="alternate" type="application/rss+xml" title="Feed B">
+		</head>
+		<body>
+		</body>
+	</html>`
+
+	subscriptions, err := NewSubscriptionFinder(nil).FindSubscriptionsFromWebPage("http://example.org/", strings.NewReader(htmlPage))
+	if err != nil {
+		t.Fatalf(`Parsing a correctly formatted HTML page should not return any error: %v`, err)
+	}
+
+	if len(subscriptions) != 1 { // the repeated href is expected to be reported only once
+		t.Fatal(`Incorrect number of subscriptions returned`) // Fatal rather than Error: the subscriptions[0] reads below need an element to exist
+	}
+
+	if subscriptions[0].Title != "Feed A" { // the entry built from the first <link> is the one expected to be kept
+		t.Errorf(`Incorrect subscription title: %q`, subscriptions[0].Title)
+	}
+
+	if subscriptions[0].URL != "http://example.org/feed.xml" {
+		t.Errorf(`Incorrect subscription URL: %q`, subscriptions[0].URL)
+	}
+
+	if subscriptions[0].Type != "rss" { // expected type for the application/rss+xml links in the fixture above
+		t.Errorf(`Incorrect subscription type: %q`, subscriptions[0].Type)
+	}
+}
+
 func TestParseWebPageWithEmptyFeedURL(t *testing.T) {
 	htmlPage := `
 	<!doctype html>