Browse Source

fix(rss): disambiguate entries sharing the same guid

Some non-conformant feeds ship the same <guid> for every item, which
caused Miniflux to collapse all of them into a single entry. Keep the
first occurrence hashed as SHA256(guid) so existing stored entries
still match, and disambiguate later collisions using the entry URL
(falling back to the duplicate's occurrence index when no URL is available).
Frédéric Guillot 4 days ago
parent
commit
2e60923236
2 changed files with 156 additions and 1 deletion
  1. 20 1
      internal/reader/rss/adapter.go
  2. 136 0
      internal/reader/rss/parser_test.go

+ 20 - 1
internal/reader/rss/adapter.go

@@ -66,6 +66,10 @@ func (r *rssAdapter) buildFeed(baseURL string) *model.Feed {
 		}
 	}
 
+	// Track GUIDs already seen in this feed to disambiguate items from
+	// non-conformant feeds that reuse the same <guid> for every entry.
+	seenGUIDs := make(map[string]int)
+
 	for _, item := range r.rss.Channel.Items {
 		entry := model.NewEntry()
 		entry.Date = findEntryDate(&item)
@@ -105,9 +109,24 @@ func (r *rssAdapter) buildFeed(baseURL string) *model.Feed {
 		}
 
 		// Generate the entry hash.
+		//
+		// The RSS 2.0 spec requires <guid> to uniquely identify the item, but
+		// some feeds ship the same GUID for every entry. Keep the first
+		// occurrence stable (so existing stored entries still match) and
+		// disambiguate later collisions using the entry URL or, as a last
+		// resort, the duplicate's occurrence index.
 		switch {
 		case item.GUID.Data != "":
-			entry.Hash = crypto.SHA256(item.GUID.Data)
+			n := seenGUIDs[item.GUID.Data]
+			seenGUIDs[item.GUID.Data] = n + 1
+			switch {
+			case n == 0:
+				entry.Hash = crypto.SHA256(item.GUID.Data)
+			case entry.URL != "":
+				entry.Hash = crypto.SHA256(item.GUID.Data + "|" + entry.URL)
+			default:
+				entry.Hash = crypto.SHA256(item.GUID.Data + "|" + strconv.Itoa(n))
+			}
 		case entryURL != "":
 			entry.Hash = crypto.SHA256(entryURL)
 		default:

+ 136 - 0
internal/reader/rss/parser_test.go

@@ -2179,3 +2179,139 @@ func TestParseFeedWithIncorrectTTLValue(t *testing.T) {
 		t.Errorf("Incorrect TTL, got: %d", feed.TTL)
 	}
 }
+
+func TestParseEntriesWithDuplicateGUIDAndDistinctLinks(t *testing.T) {
+	// Some non-conformant feeds (e.g. fluentboards.com) ship the same <guid>
+	// for every item. The first occurrence must keep the historical
+	// SHA256(guid) hash so previously stored entries still match, while later
+	// duplicates are disambiguated using the entry URL.
+	data := `<?xml version="1.0" encoding="utf-8"?>
+		<rss version="2.0">
+		<channel>
+			<link>https://example.org/</link>
+			<item>
+				<title>Item A</title>
+				<link>https://example.org/a</link>
+				<guid isPermaLink="false">dup-guid</guid>
+			</item>
+			<item>
+				<title>Item B</title>
+				<link>https://example.org/b</link>
+				<guid isPermaLink="false">dup-guid</guid>
+			</item>
+			<item>
+				<title>Item C</title>
+				<link>https://example.org/a</link>
+				<guid isPermaLink="false">dup-guid</guid>
+			</item>
+		</channel>
+		</rss>`
+
+	feed, err := Parse("https://example.org/", bytes.NewReader([]byte(data)))
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	if len(feed.Entries) != 3 {
+		t.Fatalf("Incorrect number of entries, got: %d", len(feed.Entries))
+	}
+
+	expected := []string{
+		"22561495b53d916c228504600fda7c06cefc55a80395bbdf007102a6e9070d3f", // SHA256("dup-guid")
+		"3479bd288c08f8d2f8e992aa511a267ee50b18e22a4c475a4862c15c4d4f675b", // SHA256("dup-guid|https://example.org/b")
+		"8e4bb51d800e7a0db6b81b48d09af4711f8a25e00d09251e3d9f675e02bfe11e", // SHA256("dup-guid|https://example.org/a")
+	}
+	for i, want := range expected {
+		if feed.Entries[i].Hash != want {
+			t.Errorf("Entry %d: incorrect hash, got: %s, want: %s", i, feed.Entries[i].Hash, want)
+		}
+	}
+
+	// Sanity-check uniqueness across all three entries.
+	seen := make(map[string]bool)
+	for _, e := range feed.Entries {
+		if seen[e.Hash] {
+			t.Errorf("Duplicate hash across entries: %s", e.Hash)
+		}
+		seen[e.Hash] = true
+	}
+}
+
+func TestParseEntriesWithDuplicateGUIDAndNoLink(t *testing.T) {
+	// The items here carry no <link> of their own, but findEntryURL falls
+	// back to the channel <link>, so both entries still end up with a URL.
+	// The second duplicate is therefore disambiguated with that fallback
+	// URL; the position-based disambiguator would only apply if no URL at
+	// all were available.
+	data := `<?xml version="1.0" encoding="utf-8"?>
+		<rss version="2.0">
+		<channel>
+			<link>https://example.org/</link>
+			<item>
+				<title>Item A</title>
+				<guid isPermaLink="false">dup-guid</guid>
+			</item>
+			<item>
+				<title>Item B</title>
+				<guid isPermaLink="false">dup-guid</guid>
+			</item>
+		</channel>
+		</rss>`
+
+	feed, err := Parse("https://example.org/", bytes.NewReader([]byte(data)))
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	if len(feed.Entries) != 2 {
+		t.Fatalf("Incorrect number of entries, got: %d", len(feed.Entries))
+	}
+
+	if feed.Entries[0].Hash != "22561495b53d916c228504600fda7c06cefc55a80395bbdf007102a6e9070d3f" {
+		t.Errorf("Entry 0: incorrect hash, got: %s", feed.Entries[0].Hash)
+	}
+	// Both items fall back to the channel link as their URL, so the second
+	// hash uses "dup-guid|https://example.org/".
+	if feed.Entries[1].Hash != "175d6d57a61533fb553c5d9bc3d52aa9092553a7e0ae473e3536c32c87cfa418" {
+		t.Errorf("Entry 1: incorrect hash, got: %s", feed.Entries[1].Hash)
+	}
+	if feed.Entries[0].Hash == feed.Entries[1].Hash {
+		t.Errorf("Hashes should differ for duplicate-GUID items")
+	}
+}
+
+func TestParseEntriesWithUniqueGUIDsAreUnchanged(t *testing.T) {
+	// Regression guard: feeds with unique GUIDs must keep the historical
+	// SHA256(guid) hashing so existing stored entries are not duplicated.
+	data := `<?xml version="1.0" encoding="utf-8"?>
+		<rss version="2.0">
+		<channel>
+			<link>https://example.org/</link>
+			<item>
+				<title>Item A</title>
+				<link>https://example.org/a</link>
+				<guid isPermaLink="false">guid-a</guid>
+			</item>
+			<item>
+				<title>Item B</title>
+				<link>https://example.org/b</link>
+				<guid isPermaLink="false">guid-b</guid>
+			</item>
+		</channel>
+		</rss>`
+
+	feed, err := Parse("https://example.org/", bytes.NewReader([]byte(data)))
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	if len(feed.Entries) != 2 {
+		t.Fatalf("Incorrect number of entries, got: %d", len(feed.Entries))
+	}
+
+	if feed.Entries[0].Hash != "2b13a8de1741fb778d7e733e7a888088cb578d22e5ec6d40e0a88f82d4829cbd" {
+		t.Errorf("Entry 0: incorrect hash, got: %s", feed.Entries[0].Hash)
+	}
+	if feed.Entries[1].Hash != "33ce1b8657a81c6a49590313a36325e44683f27229e96faa690f9dc7b2a75d2a" {
+		t.Errorf("Entry 1: incorrect hash, got: %s", feed.Entries[1].Hash)
+	}
+}