Browse Source

fix(rss): handle item title with CDATA content correctly

Fix regression introduced in commit a3ce03cc
Frédéric Guillot 1 year ago
parent
commit
7f54b27079
3 changed files with 205 additions and 164 deletions
  1. +1 -1
      internal/reader/rss/adapter.go
  2. +178 -157
      internal/reader/rss/parser_test.go
  3. +26 -6
      internal/reader/rss/rss.go

+ 1 - 1
internal/reader/rss/adapter.go

@@ -173,7 +173,7 @@ func findFeedAuthor(rssChannel *RSSChannel) string {
 }
 
 func findEntryTitle(rssItem *RSSItem) string {
-	title := sanitizer.StripTags(rssItem.Title.Inner)
+	title := rssItem.Title.Content
 
 	if rssItem.DublinCoreTitle != "" {
 		title = rssItem.DublinCoreTitle

+ 178 - 157
internal/reader/rss/parser_test.go

@@ -311,6 +311,184 @@ func TestParseEntryWithDCTitleOnly(t *testing.T) {
 	}
 }
 
+func TestParseFeedTitleWithHTMLEntity(t *testing.T) {
+	data := `<?xml version="1.0" encoding="utf-8"?>
+		<rss version="2.0" xmlns:slash="http://purl.org/rss/1.0/modules/slash/">
+		<channel>
+			<link>https://example.org/</link>
+			<title>Example &nbsp; Feed</title>
+		</channel>
+		</rss>`
+
+	feed, err := Parse("https://example.org/", bytes.NewReader([]byte(data)))
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	if feed.Title != "Example \u00a0 Feed" {
+		t.Errorf(`Incorrect title, got: %q`, feed.Title)
+	}
+}
+
+func TestParseFeedTitleWithUnicodeEntityAndCdata(t *testing.T) {
+	data := `<?xml version="1.0" encoding="utf-8"?>
+		<rss version="2.0" xmlns:slash="http://purl.org/rss/1.0/modules/slash/">
+		<channel>
+			<link>https://example.org/</link>
+			<title><![CDATA[Jenny&#8217;s Newsletter]]></title>
+		</channel>
+		</rss>`
+
+	feed, err := Parse("https://example.org/", bytes.NewReader([]byte(data)))
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	if feed.Title != `Jenny’s Newsletter` {
+		t.Errorf(`Incorrect title, got: %q`, feed.Title)
+	}
+}
+
+func TestParseItemTitleWithHTMLEntity(t *testing.T) {
+	data := `<?xml version="1.0" encoding="utf-8"?>
+		<rss version="2.0" xmlns:slash="http://purl.org/rss/1.0/modules/slash/">
+		<channel>
+			<link>https://example.org/</link>
+			<title>Example</title>
+			<item>
+				<title>&lt;/example&gt;</title>
+				<link>http://www.example.org/entries/1</link>
+			</item>
+		</channel>
+		</rss>`
+
+	feed, err := Parse("https://example.org/", bytes.NewReader([]byte(data)))
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	if feed.Entries[0].Title != "</example>" {
+		t.Errorf(`Incorrect title, got: %q`, feed.Entries[0].Title)
+	}
+}
+
+func TestParseItemTitleWithNumericCharacterReference(t *testing.T) {
+	data := `<?xml version="1.0" encoding="utf-8"?>
+		<rss version="2.0" xmlns:slash="http://purl.org/rss/1.0/modules/slash/">
+		<channel>
+			<link>https://example.org/</link>
+			<title>Example</title>
+			<item>
+				<title>&#931; &#xDF;</title>
+				<link>http://www.example.org/article.html</link>
+			</item>
+		</channel>
+		</rss>`
+
+	feed, err := Parse("https://example.org/", bytes.NewReader([]byte(data)))
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	if feed.Entries[0].Title != "Σ ß" {
+		t.Errorf(`Incorrect title, got: %q`, feed.Entries[0].Title)
+	}
+}
+
+func TestParseItemTitleWithDoubleEncodedEntities(t *testing.T) {
+	data := `<?xml version="1.0" encoding="utf-8"?>
+		<rss version="2.0" xmlns:slash="http://purl.org/rss/1.0/modules/slash/">
+		<channel>
+			<link>https://example.org/</link>
+			<title>Example</title>
+			<item>
+				<title>&amp;#39;Text&amp;#39;</title>
+				<link>http://www.example.org/article.html</link>
+			</item>
+		</channel>
+		</rss>`
+
+	feed, err := Parse("https://example.org/", bytes.NewReader([]byte(data)))
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	if feed.Entries[0].Title != "'Text'" {
+		t.Errorf(`Incorrect title, got: %q`, feed.Entries[0].Title)
+	}
+}
+
+func TestParseItemTitleWithWhitespaces(t *testing.T) {
+	data := `<?xml version="1.0" encoding="utf-8"?>
+	<rss version="2.0">
+	<channel>
+		<title>Example</title>
+		<link>http://example.org</link>
+		<item>
+			<title>
+				Some Title
+			</title>
+			<link>http://www.example.org/entries/1</link>
+		</item>
+	</channel>
+	</rss>`
+
+	feed, err := Parse("https://example.org/", bytes.NewReader([]byte(data)))
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	if feed.Entries[0].Title != "Some Title" {
+		t.Errorf("Incorrect entry title, got: %s", feed.Entries[0].Title)
+	}
+}
+
+func TestParseItemTitleWithCDATA(t *testing.T) {
+	data := `<?xml version="1.0" encoding="utf-8"?>
+	<rss version="2.0">
+	<channel>
+		<title>Example</title>
+		<link>http://example.org</link>
+		<item>
+			<title><![CDATA[This is a title]]></title>
+			<link>http://www.example.org/entries/1</link>
+		</item>
+	</channel>
+	</rss>`
+
+	feed, err := Parse("https://example.org/", bytes.NewReader([]byte(data)))
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	if feed.Entries[0].Title != "This is a title" {
+		t.Errorf("Incorrect entry title, got: %s", feed.Entries[0].Title)
+	}
+}
+
+func TestParseItemTitleWithInnerHTML(t *testing.T) {
+	data := `<?xml version="1.0" encoding="utf-8"?>
+	<rss version="2.0">
+	<channel>
+		<title>Example</title>
+		<link>http://example.org</link>
+		<item>
+			<title>Test: <b>bold</b></title>
+			<link>http://www.example.org/entries/1</link>
+		</item>
+	</channel>
+	</rss>`
+
+	feed, err := Parse("https://example.org/", bytes.NewReader([]byte(data)))
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	if feed.Entries[0].Title != "Test: bold" {
+		t.Errorf("Incorrect entry title, got: %s", feed.Entries[0].Title)
+	}
+}
+
 func TestParseEntryWithoutLink(t *testing.T) {
 	data := `<?xml version="1.0" encoding="utf-8"?>
 		<rss version="2.0">
@@ -997,56 +1175,6 @@ func TestParseEntryWithFeedBurnerLink(t *testing.T) {
 	}
 }
 
-func TestParseEntryTitleWithWhitespaces(t *testing.T) {
-	data := `<?xml version="1.0" encoding="utf-8"?>
-	<rss version="2.0">
-	<channel>
-		<title>Example</title>
-		<link>http://example.org</link>
-		<item>
-			<title>
-				Some Title
-			</title>
-			<link>http://www.example.org/entries/1</link>
-			<pubDate>Fri, 15 Jul 2005 00:00:00 -0500</pubDate>
-		</item>
-	</channel>
-	</rss>`
-
-	feed, err := Parse("https://example.org/", bytes.NewReader([]byte(data)))
-	if err != nil {
-		t.Fatal(err)
-	}
-
-	if feed.Entries[0].Title != "Some Title" {
-		t.Errorf("Incorrect entry title, got: %s", feed.Entries[0].Title)
-	}
-}
-
-func TestParseEntryTitleWithInnerHTML(t *testing.T) {
-	data := `<?xml version="1.0" encoding="utf-8"?>
-	<rss version="2.0">
-	<channel>
-		<title>Example</title>
-		<link>http://example.org</link>
-		<item>
-			<title>Test: <b>bold</b></title>
-			<link>http://www.example.org/entries/1</link>
-			<pubDate>Fri, 15 Jul 2005 00:00:00 -0500</pubDate>
-		</item>
-	</channel>
-	</rss>`
-
-	feed, err := Parse("https://example.org/", bytes.NewReader([]byte(data)))
-	if err != nil {
-		t.Fatal(err)
-	}
-
-	if feed.Entries[0].Title != "Test: bold" {
-		t.Errorf("Incorrect entry title, got: %s", feed.Entries[0].Title)
-	}
-}
-
 func TestParseEntryWithEnclosures(t *testing.T) {
 	data := `<?xml version="1.0" encoding="utf-8"?>
 		<rss version="2.0">
@@ -1404,113 +1532,6 @@ func TestParseInvalidXml(t *testing.T) {
 	}
 }
 
-func TestParseFeedTitleWithHTMLEntity(t *testing.T) {
-	data := `<?xml version="1.0" encoding="utf-8"?>
-		<rss version="2.0" xmlns:slash="http://purl.org/rss/1.0/modules/slash/">
-		<channel>
-			<link>https://example.org/</link>
-			<title>Example &nbsp; Feed</title>
-		</channel>
-		</rss>`
-
-	feed, err := Parse("https://example.org/", bytes.NewReader([]byte(data)))
-	if err != nil {
-		t.Fatal(err)
-	}
-
-	if feed.Title != "Example \u00a0 Feed" {
-		t.Errorf(`Incorrect title, got: %q`, feed.Title)
-	}
-}
-
-func TestParseFeedTitleWithUnicodeEntityAndCdata(t *testing.T) {
-	data := `<?xml version="1.0" encoding="utf-8"?>
-		<rss version="2.0" xmlns:slash="http://purl.org/rss/1.0/modules/slash/">
-		<channel>
-			<link>https://example.org/</link>
-			<title><![CDATA[Jenny&#8217;s Newsletter]]></title>
-		</channel>
-		</rss>`
-
-	feed, err := Parse("https://example.org/", bytes.NewReader([]byte(data)))
-	if err != nil {
-		t.Fatal(err)
-	}
-
-	if feed.Title != `Jenny’s Newsletter` {
-		t.Errorf(`Incorrect title, got: %q`, feed.Title)
-	}
-}
-
-func TestParseItemTitleWithHTMLEntity(t *testing.T) {
-	data := `<?xml version="1.0" encoding="utf-8"?>
-		<rss version="2.0" xmlns:slash="http://purl.org/rss/1.0/modules/slash/">
-		<channel>
-			<link>https://example.org/</link>
-			<title>Example</title>
-			<item>
-				<title>&lt;/example&gt;</title>
-				<link>http://www.example.org/entries/1</link>
-			</item>
-		</channel>
-		</rss>`
-
-	feed, err := Parse("https://example.org/", bytes.NewReader([]byte(data)))
-	if err != nil {
-		t.Fatal(err)
-	}
-
-	if feed.Entries[0].Title != "</example>" {
-		t.Errorf(`Incorrect title, got: %q`, feed.Entries[0].Title)
-	}
-}
-
-func TestParseItemTitleWithNumericCharacterReference(t *testing.T) {
-	data := `<?xml version="1.0" encoding="utf-8"?>
-		<rss version="2.0" xmlns:slash="http://purl.org/rss/1.0/modules/slash/">
-		<channel>
-			<link>https://example.org/</link>
-			<title>Example</title>
-			<item>
-				<title>&#931; &#xDF;</title>
-				<link>http://www.example.org/article.html</link>
-			</item>
-		</channel>
-		</rss>`
-
-	feed, err := Parse("https://example.org/", bytes.NewReader([]byte(data)))
-	if err != nil {
-		t.Fatal(err)
-	}
-
-	if feed.Entries[0].Title != "Σ ß" {
-		t.Errorf(`Incorrect title, got: %q`, feed.Entries[0].Title)
-	}
-}
-
-func TestParseItemTitleWithDoubleEncodedEntities(t *testing.T) {
-	data := `<?xml version="1.0" encoding="utf-8"?>
-		<rss version="2.0" xmlns:slash="http://purl.org/rss/1.0/modules/slash/">
-		<channel>
-			<link>https://example.org/</link>
-			<title>Example</title>
-			<item>
-				<title>&amp;#39;Text&amp;#39;</title>
-				<link>http://www.example.org/article.html</link>
-			</item>
-		</channel>
-		</rss>`
-
-	feed, err := Parse("https://example.org/", bytes.NewReader([]byte(data)))
-	if err != nil {
-		t.Fatal(err)
-	}
-
-	if feed.Entries[0].Title != "'Text'" {
-		t.Errorf(`Incorrect title, got: %q`, feed.Entries[0].Title)
-	}
-}
-
 func TestParseFeedLinkWithInvalidCharacterEntity(t *testing.T) {
 	data := `<?xml version="1.0" encoding="utf-8"?>
 		<rss version="2.0" xmlns:slash="http://purl.org/rss/1.0/modules/slash/">

+ 26 - 6
internal/reader/rss/rss.go

@@ -111,7 +111,7 @@ type RSSImage struct {
 
 type RSSItem struct {
 	// Title is the title of the item.
-	Title RSSTitle `xml:"rss title"`
+	Title InnerContent `xml:"rss title"`
 
 	// Link is the URL of the item.
 	Link string `xml:"rss link"`
@@ -169,11 +169,6 @@ type RSSItem struct {
 	googleplay.GooglePlayItemElement
 }
 
-type RSSTitle struct {
-	Data  string `xml:",chardata"`
-	Inner string `xml:",innerxml"`
-}
-
 type RSSAuthor struct {
 	XMLName xml.Name
 	Data    string `xml:",chardata"`
@@ -203,3 +198,28 @@ type RSSSource struct {
 	URL  string `xml:"url,attr"`
 	Name string `xml:",chardata"`
 }
+
+type InnerContent struct {
+	Content string
+}
+
+func (ic *InnerContent) UnmarshalXML(d *xml.Decoder, start xml.StartElement) error {
+	var content strings.Builder
+
+	for {
+		token, err := d.Token()
+		if err != nil {
+			return err
+		}
+
+		switch t := token.(type) {
+		case xml.CharData:
+			content.Write(t)
+		case xml.EndElement:
+			if t == start.End() {
+				ic.Content = strings.TrimSpace(content.String())
+				return nil
+			}
+		}
+	}
+}