Jelajahi Sumber

Refactor RDF parser to use an adapter

Avoid tight coupling between `model.Feed` and the original XML RDF feed.
Frédéric Guillot 2 tahun lalu
induk
melakukan
6bc4b35e38

+ 2 - 18
internal/reader/dublincore/dublincore.go

@@ -3,29 +3,13 @@
 
 package dublincore // import "miniflux.app/v2/internal/reader/dublincore"
 
-import (
-	"strings"
-
-	"miniflux.app/v2/internal/reader/sanitizer"
-)
-
-// DublinCoreFeedElement represents Dublin Core feed XML elements.
-type DublinCoreFeedElement struct {
-	DublinCoreCreator string `xml:"http://purl.org/dc/elements/1.1/ channel>creator"`
-}
-
-func (feed *DublinCoreFeedElement) GetSanitizedCreator() string {
-	return strings.TrimSpace(sanitizer.StripTags(feed.DublinCoreCreator))
+type DublinCoreChannelElement struct {
+	DublinCoreCreator string `xml:"http://purl.org/dc/elements/1.1/ creator"`
 }
 
-// DublinCoreItemElement represents Dublin Core entry XML elements.
 type DublinCoreItemElement struct {
 	DublinCoreTitle   string `xml:"http://purl.org/dc/elements/1.1/ title"`
 	DublinCoreDate    string `xml:"http://purl.org/dc/elements/1.1/ date"`
 	DublinCoreCreator string `xml:"http://purl.org/dc/elements/1.1/ creator"`
 	DublinCoreContent string `xml:"http://purl.org/rss/1.0/modules/content/ encoded"`
 }
-
-func (item *DublinCoreItemElement) GetSanitizedCreator() string {
-	return strings.TrimSpace(sanitizer.StripTags(item.DublinCoreCreator))
-}

+ 115 - 0
internal/reader/rdf/adapter.go

@@ -0,0 +1,115 @@
+// SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+package rdf // import "miniflux.app/v2/internal/reader/rdf"
+
+import (
+	"html"
+	"log/slog"
+	"strings"
+	"time"
+
+	"miniflux.app/v2/internal/crypto"
+	"miniflux.app/v2/internal/model"
+	"miniflux.app/v2/internal/reader/date"
+	"miniflux.app/v2/internal/reader/sanitizer"
+	"miniflux.app/v2/internal/urllib"
+)
+
+type RDFAdapter struct {
+	rdf *RDF
+}
+
+func NewRDFAdapter(rdf *RDF) *RDFAdapter {
+	return &RDFAdapter{rdf}
+}
+
+func (r *RDFAdapter) BuildFeed(feedURL string) *model.Feed {
+	feed := &model.Feed{
+		Title:   stripTags(r.rdf.Channel.Title),
+		FeedURL: feedURL,
+	}
+
+	if feed.Title == "" {
+		feed.Title = feedURL
+	}
+
+	if siteURL, err := urllib.AbsoluteURL(feedURL, r.rdf.Channel.Link); err != nil {
+		feed.SiteURL = r.rdf.Channel.Link
+	} else {
+		feed.SiteURL = siteURL
+	}
+
+	for _, item := range r.rdf.Items {
+		entry := model.NewEntry()
+		itemLink := strings.TrimSpace(item.Link)
+
+		// Populate the entry URL.
+		if itemLink == "" {
+			entry.URL = feed.SiteURL // Fall back to the feed's site URL if the entry URL is empty.
+		} else if entryURL, err := urllib.AbsoluteURL(feed.SiteURL, itemLink); err == nil {
+			entry.URL = entryURL
+		} else {
+			entry.URL = itemLink
+		}
+
+		// Populate the entry title.
+		for _, title := range []string{item.Title, item.DublinCoreTitle} {
+			title = strings.TrimSpace(title)
+			if title != "" {
+				entry.Title = html.UnescapeString(title)
+				break
+			}
+		}
+
+		// If the entry title is empty, we use the entry URL as a fallback.
+		if entry.Title == "" {
+			entry.Title = entry.URL
+		}
+
+		// Populate the entry content.
+		if item.DublinCoreContent != "" {
+			entry.Content = item.DublinCoreContent
+		} else {
+			entry.Content = item.Description
+		}
+
+		// Generate the entry hash.
+		hashValue := itemLink
+		if hashValue == "" {
+			hashValue = item.Title + item.Description // Fall back to the title and description if the link is empty.
+		}
+
+		entry.Hash = crypto.Hash(hashValue)
+
+		// Populate the entry date.
+		entry.Date = time.Now()
+		if item.DublinCoreDate != "" {
+			if itemDate, err := date.Parse(item.DublinCoreDate); err != nil {
+				slog.Debug("Unable to parse date from RDF feed",
+					slog.String("date", item.DublinCoreDate),
+					slog.String("link", itemLink),
+					slog.Any("error", err),
+				)
+			} else {
+				entry.Date = itemDate
+			}
+		}
+
+		// Populate the entry author.
+		switch {
+		case item.DublinCoreCreator != "":
+			entry.Author = stripTags(item.DublinCoreCreator)
+		case r.rdf.Channel.DublinCoreCreator != "":
+			entry.Author = stripTags(r.rdf.Channel.DublinCoreCreator)
+		}
+
+		feed.Entries = append(feed.Entries, entry)
+	}
+
+	return feed
+}
+
+func stripTags(value string) string {
+	return strings.TrimSpace(sanitizer.StripTags(value))
+}

+ 3 - 3
internal/reader/rdf/parser.go

@@ -13,10 +13,10 @@ import (
 
 // Parse returns a normalized feed struct from a RDF feed.
 func Parse(baseURL string, data io.ReadSeeker) (*model.Feed, error) {
-	feed := new(rdfFeed)
-	if err := xml.NewXMLDecoder(data).Decode(feed); err != nil {
+	xmlFeed := new(RDF)
+	if err := xml.NewXMLDecoder(data).Decode(xmlFeed); err != nil {
 		return nil, fmt.Errorf("rdf: unable to parse feed: %w", err)
 	}
 
-	return feed.Transform(baseURL), nil
+	return NewRDFAdapter(xmlFeed).BuildFeed(baseURL), nil
 }

+ 312 - 200
internal/reader/rdf/parser_test.go

@@ -228,63 +228,87 @@ func TestParseRDFSampleWithDublinCore(t *testing.T) {
 	}
 }
 
-func TestParseItemWithOnlyFeedAuthor(t *testing.T) {
+func TestParseRDFFeedWithEmptyTitle(t *testing.T) {
 	data := `<?xml version="1.0" encoding="utf-8"?>
-
 	<rdf:RDF
-	  xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
-	  xmlns:dc="http://purl.org/dc/elements/1.1/"
-	  xmlns="http://purl.org/rss/1.0/"
-	>
+		xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+		xmlns="http://purl.org/rss/1.0/">
+		<channel>
+			<link>http://example.org/item</link>
+		</channel>
+		<item>
+			<title>Example</title>
+			<link>http://example.org/item</link>
+			<description>Test</description>
+		</item>
+	</rdf:RDF>`
 
-	  <channel rdf:about="http://meerkat.oreillynet.com/?_fl=rss1.0">
-		<title>Meerkat</title>
-		<link>http://meerkat.oreillynet.com</link>
-		<dc:creator>Rael Dornfest (mailto:rael@oreilly.com)</dc:creator>
-	  </channel>
+	feed, err := Parse("http://example.org/feed", bytes.NewReader([]byte(data)))
+	if err != nil {
+		t.Fatal(err)
+	}
 
-	  <item rdf:about="http://c.moreover.com/click/here.pl?r123">
-		<title>XML: A Disruptive Technology</title>
-		<link>http://c.moreover.com/click/here.pl?r123</link>
-		<dc:description>
-		  XML is placing increasingly heavy loads on the existing technical
-		  infrastructure of the Internet.
-		</dc:description>
-	  </item>
+	if feed.Title != "http://example.org/feed" {
+		t.Errorf(`Incorrect title, got: %q`, feed.Title)
+	}
+}
+
+func TestParseRDFFeedWithEmptyLink(t *testing.T) {
+	data := `<?xml version="1.0" encoding="utf-8"?>
+	<rdf:RDF
+		xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+		xmlns="http://purl.org/rss/1.0/">
+		<channel>
+			<title>Example Feed</title>
+		</channel>
+		<item>
+			<title>Example</title>
+			<link>http://example.org/item</link>
+			<description>Test</description>
+		</item>
 	</rdf:RDF>`
 
-	feed, err := Parse("http://meerkat.oreillynet.com", bytes.NewReader([]byte(data)))
+	feed, err := Parse("http://example.org/feed", bytes.NewReader([]byte(data)))
 	if err != nil {
 		t.Fatal(err)
 	}
 
-	if feed.Entries[0].Author != "Rael Dornfest (mailto:rael@oreilly.com)" {
-		t.Errorf("Incorrect entry author, got: %s", feed.Entries[0].Author)
+	if feed.SiteURL != "http://example.org/feed" {
+		t.Errorf(`Incorrect SiteURL, got: %q`, feed.SiteURL)
+	}
+
+	if feed.FeedURL != "http://example.org/feed" {
+		t.Errorf(`Incorrect FeedURL, got: %q`, feed.FeedURL)
 	}
 }
 
-func TestParseItemRelativeURL(t *testing.T) {
+func TestParseRDFFeedWithRelativeLink(t *testing.T) {
 	data := `<?xml version="1.0" encoding="utf-8"?>
-	<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns="http://purl.org/rss/1.0/">
-	  <channel>
+	<rdf:RDF
+		xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+		xmlns="http://purl.org/rss/1.0/">
+		<channel>
+			<title>Example Feed</title>
+			<link>/test/index.html</link>
+		</channel>
+		<item>
 			<title>Example</title>
-			<link>http://example.org</link>
-	  </channel>
-
-	  <item>
-			<title>Title</title>
+			<link>http://example.org/item</link>
 			<description>Test</description>
-			<link>something.html</link>
-	  </item>
+		</item>
 	</rdf:RDF>`
 
-	feed, err := Parse("http://meerkat.oreillynet.com", bytes.NewReader([]byte(data)))
+	feed, err := Parse("http://example.org/feed", bytes.NewReader([]byte(data)))
 	if err != nil {
 		t.Fatal(err)
 	}
 
-	if feed.Entries[0].URL != "http://example.org/something.html" {
-		t.Errorf("Incorrect entry url, got: %s", feed.Entries[0].URL)
+	if feed.SiteURL != "http://example.org/test/index.html" {
+		t.Errorf(`Incorrect SiteURL, got: %q`, feed.SiteURL)
+	}
+
+	if feed.FeedURL != "http://example.org/feed" {
+		t.Errorf(`Incorrect FeedURL, got: %q`, feed.FeedURL)
 	}
 }
 
@@ -321,63 +345,7 @@ func TestParseItemWithoutLink(t *testing.T) {
 	}
 }
 
-func TestParseItemWithDublicCoreDate(t *testing.T) {
-	data := `<?xml version="1.0" encoding="utf-8"?>
-	<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns="http://purl.org/rss/1.0/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:slash="http://purl.org/rss/1.0/modules/slash/">
-	  <channel>
-			<title>Example</title>
-			<link>http://example.org</link>
-	  </channel>
-
-	  <item>
-			<title>Title</title>
-			<description>Test</description>
-			<link>http://example.org/test.html</link>
-			<dc:creator>Tester</dc:creator>
-			<dc:date>2018-04-10T05:00:00+00:00</dc:date>
-	  </item>
-	</rdf:RDF>`
-
-	feed, err := Parse("http://example.org", bytes.NewReader([]byte(data)))
-	if err != nil {
-		t.Fatal(err)
-	}
-
-	expectedDate := time.Date(2018, time.April, 10, 5, 0, 0, 0, time.UTC)
-	if !feed.Entries[0].Date.Equal(expectedDate) {
-		t.Errorf("Incorrect entry date, got: %v, want: %v", feed.Entries[0].Date, expectedDate)
-	}
-}
-
-func TestParseItemWithEncodedHTMLInDCCreatorField(t *testing.T) {
-	data := `<?xml version="1.0" encoding="utf-8"?>
-	<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns="http://purl.org/rss/1.0/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:slash="http://purl.org/rss/1.0/modules/slash/">
-	  <channel>
-			<title>Example</title>
-			<link>http://example.org</link>
-	  </channel>
-
-	  <item>
-			<title>Title</title>
-			<description>Test</description>
-			<link>http://example.org/test.html</link>
-			<dc:creator>&lt;a href=&quot;http://example.org/author1&quot;>Author 1&lt;/a&gt; (University 1), &lt;a href=&quot;http://example.org/author2&quot;>Author 2&lt;/a&gt; (University 2)</dc:creator>
-			<dc:date>2018-04-10T05:00:00+00:00</dc:date>
-	  </item>
-	</rdf:RDF>`
-
-	feed, err := Parse("http://example.org", bytes.NewReader([]byte(data)))
-	if err != nil {
-		t.Fatal(err)
-	}
-
-	expectedAuthor := "Author 1 (University 1), Author 2 (University 2)"
-	if feed.Entries[0].Author != expectedAuthor {
-		t.Errorf("Incorrect entry author, got: %s, want: %s", feed.Entries[0].Author, expectedAuthor)
-	}
-}
-
-func TestParseItemWithoutDate(t *testing.T) {
+func TestParseItemRelativeURL(t *testing.T) {
 	data := `<?xml version="1.0" encoding="utf-8"?>
 	<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns="http://purl.org/rss/1.0/">
 	  <channel>
@@ -388,90 +356,17 @@ func TestParseItemWithoutDate(t *testing.T) {
 	  <item>
 			<title>Title</title>
 			<description>Test</description>
-			<link>http://example.org/test.html</link>
-	  </item>
-	</rdf:RDF>`
-
-	feed, err := Parse("http://example.org", bytes.NewReader([]byte(data)))
-	if err != nil {
-		t.Fatal(err)
-	}
-
-	expectedDate := time.Now().In(time.Local)
-	diff := expectedDate.Sub(feed.Entries[0].Date)
-	if diff > time.Second {
-		t.Errorf("Incorrect entry date, got: %v", diff)
-	}
-}
-
-func TestParseItemWithEncodedHTMLTitle(t *testing.T) {
-	data := `<?xml version="1.0" encoding="utf-8"?>
-	<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns="http://purl.org/rss/1.0/">
-	  <channel>
-			<title>Example</title>
-			<link>http://example.org</link>
-	  </channel>
-
-	  <item>
-			<title>AT&amp;amp;T</title>
-			<description>Test</description>
-			<link>http://example.org/test.html</link>
+			<link>something.html</link>
 	  </item>
 	</rdf:RDF>`
 
-	feed, err := Parse("http://example.org", bytes.NewReader([]byte(data)))
-	if err != nil {
-		t.Fatal(err)
-	}
-
-	if feed.Entries[0].Title != `AT&T` {
-		t.Errorf("Incorrect entry title, got: %q", feed.Entries[0].Title)
-	}
-}
-
-func TestParseInvalidXml(t *testing.T) {
-	data := `garbage`
-	_, err := Parse("http://example.org", bytes.NewReader([]byte(data)))
-	if err == nil {
-		t.Fatal("Parse should returns an error")
-	}
-}
-
-func TestParseFeedWithHTMLEntity(t *testing.T) {
-	data := `<?xml version="1.0" encoding="utf-8"?>
-	<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns="http://purl.org/rss/1.0/">
-	  <channel>
-			<title>Example &nbsp; Feed</title>
-			<link>http://example.org</link>
-	  </channel>
-	</rdf:RDF>`
-
-	feed, err := Parse("http://example.org", bytes.NewReader([]byte(data)))
-	if err != nil {
-		t.Fatal(err)
-	}
-
-	if feed.Title != "Example \u00a0 Feed" {
-		t.Errorf(`Incorrect title, got: %q`, feed.Title)
-	}
-}
-
-func TestParseFeedWithInvalidCharacterEntity(t *testing.T) {
-	data := `<?xml version="1.0" encoding="utf-8"?>
-	<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns="http://purl.org/rss/1.0/">
-	  <channel>
-			<title>Example Feed</title>
-			<link>http://example.org/a&b</link>
-	  </channel>
-	</rdf:RDF>`
-
-	feed, err := Parse("http://example.org", bytes.NewReader([]byte(data)))
+	feed, err := Parse("http://meerkat.oreillynet.com", bytes.NewReader([]byte(data)))
 	if err != nil {
 		t.Fatal(err)
 	}
 
-	if feed.SiteURL != "http://example.org/a&b" {
-		t.Errorf(`Incorrect URL, got: %q`, feed.SiteURL)
+	if feed.Entries[0].URL != "http://example.org/something.html" {
+		t.Errorf("Incorrect entry url, got: %s", feed.Entries[0].URL)
 	}
 }
 
@@ -539,20 +434,19 @@ func TestParseFeedWithURLWrappedInSpaces(t *testing.T) {
 	}
 }
 
-func TestParseRDFWithContentEncoded(t *testing.T) {
+func TestParseRDFItemWitEmptyTitleElement(t *testing.T) {
 	data := `<?xml version="1.0" encoding="utf-8"?>
 	<rdf:RDF
 		xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
-		xmlns="http://purl.org/rss/1.0/"
-		xmlns:content="http://purl.org/rss/1.0/modules/content/">
+		xmlns="http://purl.org/rss/1.0/">
 		<channel>
 			<title>Example Feed</title>
 			<link>http://example.org/</link>
 		</channel>
 		<item>
-			<title>Item Title</title>
-			<link>http://example.org/</link>
-			<content:encoded><![CDATA[<p>Test</p>]]></content:encoded>
+			<title> </title>
+			<link>http://example.org/item</link>
+			<description>Test</description>
 		</item>
 	</rdf:RDF>`
 
@@ -565,27 +459,27 @@ func TestParseRDFWithContentEncoded(t *testing.T) {
 		t.Fatalf(`Unexpected number of entries, got %d`, len(feed.Entries))
 	}
 
-	expected := `<p>Test</p>`
-	result := feed.Entries[0].Content
+	expected := `http://example.org/item`
+	result := feed.Entries[0].Title
 	if result != expected {
-		t.Errorf(`Unexpected entry content, got %q instead of %q`, result, expected)
+		t.Errorf(`Unexpected entry title, got %q instead of %q`, result, expected)
 	}
 }
 
-func TestParseRDFWithEncodedHTMLDescription(t *testing.T) {
+func TestParseRDFItemWithDublinCoreTitleElement(t *testing.T) {
 	data := `<?xml version="1.0" encoding="utf-8"?>
 	<rdf:RDF
 		xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
 		xmlns="http://purl.org/rss/1.0/"
-		xmlns:content="http://purl.org/rss/1.0/modules/content/">
+		xmlns:dc="http://purl.org/dc/elements/1.1/">
 		<channel>
 			<title>Example Feed</title>
 			<link>http://example.org/</link>
 		</channel>
 		<item>
-			<title>Item Title</title>
+			<dc:title>Dublin Core Title</dc:title>
 			<link>http://example.org/</link>
-			<description>AT&amp;amp;T &lt;img src="https://example.org/img.png"&gt;&lt;/a&gt;</description>
+			<description>Test</description>
 		</item>
 	</rdf:RDF>`
 
@@ -598,10 +492,10 @@ func TestParseRDFWithEncodedHTMLDescription(t *testing.T) {
 		t.Fatalf(`Unexpected number of entries, got %d`, len(feed.Entries))
 	}
 
-	expected := `AT&amp;T <img src="https://example.org/img.png"></a>`
-	result := feed.Entries[0].Content
+	expected := `Dublin Core Title`
+	result := feed.Entries[0].Title
 	if result != expected {
-		t.Errorf(`Unexpected entry content, got %v instead of %v`, result, expected)
+		t.Errorf(`Unexpected entry title, got %q instead of %q`, result, expected)
 	}
 }
 
@@ -639,20 +533,45 @@ func TestParseRDFItemWithDuplicateTitleElement(t *testing.T) {
 	}
 }
 
-func TestParseRDFItemWithDublinCoreTitleElement(t *testing.T) {
+func TestParseItemWithEncodedHTMLTitle(t *testing.T) {
+	data := `<?xml version="1.0" encoding="utf-8"?>
+	<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns="http://purl.org/rss/1.0/">
+	  <channel>
+			<title>Example</title>
+			<link>http://example.org</link>
+	  </channel>
+
+	  <item>
+			<title>AT&amp;amp;T</title>
+			<description>Test</description>
+			<link>http://example.org/test.html</link>
+	  </item>
+	</rdf:RDF>`
+
+	feed, err := Parse("http://example.org", bytes.NewReader([]byte(data)))
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	if feed.Entries[0].Title != `AT&T` {
+		t.Errorf("Incorrect entry title, got: %q", feed.Entries[0].Title)
+	}
+}
+
+func TestParseRDFWithContentEncoded(t *testing.T) {
 	data := `<?xml version="1.0" encoding="utf-8"?>
 	<rdf:RDF
 		xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
 		xmlns="http://purl.org/rss/1.0/"
-		xmlns:dc="http://purl.org/dc/elements/1.1/">
+		xmlns:content="http://purl.org/rss/1.0/modules/content/">
 		<channel>
 			<title>Example Feed</title>
 			<link>http://example.org/</link>
 		</channel>
 		<item>
-			<dc:title>Dublin Core Title</dc:title>
+			<title>Item Title</title>
 			<link>http://example.org/</link>
-			<description>Test</description>
+			<content:encoded><![CDATA[<p>Test</p>]]></content:encoded>
 		</item>
 	</rdf:RDF>`
 
@@ -665,26 +584,27 @@ func TestParseRDFItemWithDublinCoreTitleElement(t *testing.T) {
 		t.Fatalf(`Unexpected number of entries, got %d`, len(feed.Entries))
 	}
 
-	expected := `Dublin Core Title`
-	result := feed.Entries[0].Title
+	expected := `<p>Test</p>`
+	result := feed.Entries[0].Content
 	if result != expected {
-		t.Errorf(`Unexpected entry title, got %q instead of %q`, result, expected)
+		t.Errorf(`Unexpected entry content, got %q instead of %q`, result, expected)
 	}
 }
 
-func TestParseRDFItemWitEmptyTitleElement(t *testing.T) {
+func TestParseRDFWithEncodedHTMLDescription(t *testing.T) {
 	data := `<?xml version="1.0" encoding="utf-8"?>
 	<rdf:RDF
 		xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
-		xmlns="http://purl.org/rss/1.0/">
+		xmlns="http://purl.org/rss/1.0/"
+		xmlns:content="http://purl.org/rss/1.0/modules/content/">
 		<channel>
 			<title>Example Feed</title>
 			<link>http://example.org/</link>
 		</channel>
 		<item>
-			<title> </title>
-			<link>http://example.org/item</link>
-			<description>Test</description>
+			<title>Item Title</title>
+			<link>http://example.org/</link>
+			<description>AT&amp;amp;T &lt;img src="https://example.org/img.png"&gt;&lt;/a&gt;</description>
 		</item>
 	</rdf:RDF>`
 
@@ -697,9 +617,201 @@ func TestParseRDFItemWitEmptyTitleElement(t *testing.T) {
 		t.Fatalf(`Unexpected number of entries, got %d`, len(feed.Entries))
 	}
 
-	expected := `http://example.org/item`
-	result := feed.Entries[0].Title
+	expected := `AT&amp;T <img src="https://example.org/img.png"></a>`
+	result := feed.Entries[0].Content
 	if result != expected {
-		t.Errorf(`Unexpected entry title, got %q instead of %q`, result, expected)
+		t.Errorf(`Unexpected entry content, got %v instead of %v`, result, expected)
+	}
+}
+
+func TestParseItemWithoutDate(t *testing.T) {
+	data := `<?xml version="1.0" encoding="utf-8"?>
+	<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns="http://purl.org/rss/1.0/">
+	  <channel>
+			<title>Example</title>
+			<link>http://example.org</link>
+	  </channel>
+
+	  <item>
+			<title>Title</title>
+			<description>Test</description>
+			<link>http://example.org/test.html</link>
+	  </item>
+	</rdf:RDF>`
+
+	feed, err := Parse("http://example.org", bytes.NewReader([]byte(data)))
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	expectedDate := time.Now().In(time.Local)
+	diff := expectedDate.Sub(feed.Entries[0].Date)
+	if diff > time.Second {
+		t.Errorf("Incorrect entry date, got: %v", diff)
+	}
+}
+
+func TestParseItemWithDublicCoreDate(t *testing.T) {
+	data := `<?xml version="1.0" encoding="utf-8"?>
+	<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns="http://purl.org/rss/1.0/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:slash="http://purl.org/rss/1.0/modules/slash/">
+	  <channel>
+			<title>Example</title>
+			<link>http://example.org</link>
+	  </channel>
+
+	  <item>
+			<title>Title</title>
+			<description>Test</description>
+			<link>http://example.org/test.html</link>
+			<dc:creator>Tester</dc:creator>
+			<dc:date>2018-04-10T05:00:00+00:00</dc:date>
+	  </item>
+	</rdf:RDF>`
+
+	feed, err := Parse("http://example.org", bytes.NewReader([]byte(data)))
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	expectedDate := time.Date(2018, time.April, 10, 5, 0, 0, 0, time.UTC)
+	if !feed.Entries[0].Date.Equal(expectedDate) {
+		t.Errorf("Incorrect entry date, got: %v, want: %v", feed.Entries[0].Date, expectedDate)
+	}
+}
+
+func TestParseItemWithInvalidDublicCoreDate(t *testing.T) {
+	data := `<?xml version="1.0" encoding="utf-8"?>
+	<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns="http://purl.org/rss/1.0/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:slash="http://purl.org/rss/1.0/modules/slash/">
+	  <channel>
+			<title>Example</title>
+			<link>http://example.org</link>
+	  </channel>
+
+	  <item>
+			<title>Title</title>
+			<description>Test</description>
+			<link>http://example.org/test.html</link>
+			<dc:creator>Tester</dc:creator>
+			<dc:date>20-04-10T05:00:00+00:00</dc:date>
+	  </item>
+	</rdf:RDF>`
+
+	feed, err := Parse("http://example.org", bytes.NewReader([]byte(data)))
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	expectedDate := time.Now().In(time.Local)
+	diff := expectedDate.Sub(feed.Entries[0].Date)
+	if diff > time.Second {
+		t.Errorf("Incorrect entry date, got: %v", diff)
+	}
+}
+
+func TestParseItemWithEncodedHTMLInDCCreatorField(t *testing.T) {
+	data := `<?xml version="1.0" encoding="utf-8"?>
+	<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns="http://purl.org/rss/1.0/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:slash="http://purl.org/rss/1.0/modules/slash/">
+	  <channel>
+			<title>Example</title>
+			<link>http://example.org</link>
+	  </channel>
+
+	  <item>
+			<title>Title</title>
+			<description>Test</description>
+			<link>http://example.org/test.html</link>
+			<dc:creator>&lt;a href=&quot;http://example.org/author1&quot;>Author 1&lt;/a&gt; (University 1), &lt;a href=&quot;http://example.org/author2&quot;>Author 2&lt;/a&gt; (University 2)</dc:creator>
+			<dc:date>2018-04-10T05:00:00+00:00</dc:date>
+	  </item>
+	</rdf:RDF>`
+
+	feed, err := Parse("http://example.org", bytes.NewReader([]byte(data)))
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	expectedAuthor := "Author 1 (University 1), Author 2 (University 2)"
+	if feed.Entries[0].Author != expectedAuthor {
+		t.Errorf("Incorrect entry author, got: %s, want: %s", feed.Entries[0].Author, expectedAuthor)
+	}
+}
+
+func TestParseItemWithOnlyFeedAuthor(t *testing.T) {
+	data := `<?xml version="1.0" encoding="utf-8"?>
+	<rdf:RDF
+	  xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+	  xmlns:dc="http://purl.org/dc/elements/1.1/"
+	  xmlns="http://purl.org/rss/1.0/"
+	>
+
+	  <channel rdf:about="http://meerkat.oreillynet.com/?_fl=rss1.0">
+		<title>Meerkat</title>
+		<link>http://meerkat.oreillynet.com</link>
+		<dc:creator>Rael Dornfest (mailto:rael@oreilly.com)</dc:creator>
+	  </channel>
+
+	  <item rdf:about="http://c.moreover.com/click/here.pl?r123">
+		<title>XML: A Disruptive Technology</title>
+		<link>http://c.moreover.com/click/here.pl?r123</link>
+		<dc:description>
+		  XML is placing increasingly heavy loads on the existing technical
+		  infrastructure of the Internet.
+		</dc:description>
+	  </item>
+	</rdf:RDF>`
+
+	feed, err := Parse("http://meerkat.oreillynet.com", bytes.NewReader([]byte(data)))
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	if feed.Entries[0].Author != "Rael Dornfest (mailto:rael@oreilly.com)" {
+		t.Errorf("Incorrect entry author, got: %s", feed.Entries[0].Author)
+	}
+}
+
+func TestParseInvalidXml(t *testing.T) {
+	data := `garbage`
+	_, err := Parse("http://example.org", bytes.NewReader([]byte(data)))
+	if err == nil {
+		t.Fatal("Parse should returns an error")
+	}
+}
+
+func TestParseFeedWithHTMLEntity(t *testing.T) {
+	data := `<?xml version="1.0" encoding="utf-8"?>
+	<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns="http://purl.org/rss/1.0/">
+	  <channel>
+			<title>Example &nbsp; Feed</title>
+			<link>http://example.org</link>
+	  </channel>
+	</rdf:RDF>`
+
+	feed, err := Parse("http://example.org", bytes.NewReader([]byte(data)))
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	if feed.Title != "Example \u00a0 Feed" {
+		t.Errorf(`Incorrect title, got: %q`, feed.Title)
+	}
+}
+
+func TestParseFeedWithInvalidCharacterEntity(t *testing.T) {
+	data := `<?xml version="1.0" encoding="utf-8"?>
+	<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns="http://purl.org/rss/1.0/">
+	  <channel>
+			<title>Example Feed</title>
+			<link>http://example.org/a&b</link>
+	  </channel>
+	</rdf:RDF>`
+
+	feed, err := Parse("http://example.org", bytes.NewReader([]byte(data)))
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	if feed.SiteURL != "http://example.org/a&b" {
+		t.Errorf(`Incorrect URL, got: %q`, feed.SiteURL)
 	}
 }

+ 11 - 114
internal/reader/rdf/rdf.go

@@ -5,130 +5,27 @@ package rdf // import "miniflux.app/v2/internal/reader/rdf"
 
 import (
 	"encoding/xml"
-	"html"
-	"log/slog"
-	"strings"
-	"time"
 
-	"miniflux.app/v2/internal/crypto"
-	"miniflux.app/v2/internal/model"
-	"miniflux.app/v2/internal/reader/date"
 	"miniflux.app/v2/internal/reader/dublincore"
-	"miniflux.app/v2/internal/reader/sanitizer"
-	"miniflux.app/v2/internal/urllib"
 )
 
-type rdfFeed struct {
-	XMLName xml.Name  `xml:"RDF"`
-	Title   string    `xml:"channel>title"`
-	Link    string    `xml:"channel>link"`
-	Items   []rdfItem `xml:"item"`
-	dublincore.DublinCoreFeedElement
+// RDF specs: https://web.resource.org/rss/1.0/spec
+type RDF struct {
+	XMLName xml.Name   `xml:"http://www.w3.org/1999/02/22-rdf-syntax-ns# RDF"`
+	Channel RDFChannel `xml:"channel"`
+	Items   []RDFItem  `xml:"item"`
 }
 
-func (r *rdfFeed) Transform(baseURL string) *model.Feed {
-	var err error
-	feed := new(model.Feed)
-	feed.Title = sanitizer.StripTags(r.Title)
-	feed.FeedURL = baseURL
-	feed.SiteURL, err = urllib.AbsoluteURL(baseURL, r.Link)
-	if err != nil {
-		feed.SiteURL = r.Link
-	}
-
-	for _, item := range r.Items {
-		entry := item.Transform()
-		if entry.Author == "" && r.DublinCoreCreator != "" {
-			entry.Author = r.GetSanitizedCreator()
-		}
-
-		if entry.URL == "" {
-			entry.URL = feed.SiteURL
-		} else {
-			entryURL, err := urllib.AbsoluteURL(feed.SiteURL, entry.URL)
-			if err == nil {
-				entry.URL = entryURL
-			}
-		}
-
-		feed.Entries = append(feed.Entries, entry)
-	}
-
-	return feed
+type RDFChannel struct {
+	Title       string `xml:"title"`
+	Link        string `xml:"link"`
+	Description string `xml:"description"`
+	dublincore.DublinCoreChannelElement
 }
 
-type rdfItem struct {
+type RDFItem struct {
 	Title       string `xml:"http://purl.org/rss/1.0/ title"`
 	Link        string `xml:"link"`
 	Description string `xml:"description"`
 	dublincore.DublinCoreItemElement
 }
-
-func (r *rdfItem) Transform() *model.Entry {
-	entry := model.NewEntry()
-	entry.Title = r.entryTitle()
-	entry.Author = r.entryAuthor()
-	entry.URL = r.entryURL()
-	entry.Content = r.entryContent()
-	entry.Hash = r.entryHash()
-	entry.Date = r.entryDate()
-
-	if entry.Title == "" {
-		entry.Title = entry.URL
-	}
-	return entry
-}
-
-func (r *rdfItem) entryTitle() string {
-	for _, title := range []string{r.Title, r.DublinCoreTitle} {
-		title = strings.TrimSpace(title)
-		if title != "" {
-			return html.UnescapeString(title)
-		}
-	}
-	return ""
-}
-
-func (r *rdfItem) entryContent() string {
-	switch {
-	case r.DublinCoreContent != "":
-		return r.DublinCoreContent
-	default:
-		return r.Description
-	}
-}
-
-func (r *rdfItem) entryAuthor() string {
-	return r.GetSanitizedCreator()
-}
-
-func (r *rdfItem) entryURL() string {
-	return strings.TrimSpace(r.Link)
-}
-
-func (r *rdfItem) entryDate() time.Time {
-	if r.DublinCoreDate != "" {
-		result, err := date.Parse(r.DublinCoreDate)
-		if err != nil {
-			slog.Debug("Unable to parse date from RDF feed",
-				slog.String("date", r.DublinCoreDate),
-				slog.String("link", r.Link),
-				slog.Any("error", err),
-			)
-			return time.Now()
-		}
-
-		return result
-	}
-
-	return time.Now()
-}
-
-func (r *rdfItem) entryHash() string {
-	value := r.Link
-	if value == "" {
-		value = r.Title + r.Description
-	}
-
-	return crypto.Hash(value)
-}