Jelajahi Sumber

Strip HTML tags from DublinCore Creator tags

Frédéric Guillot 2 tahun lalu
induk
komit
36f013670e

+ 17 - 3
internal/reader/rdf/dublincore.go → internal/reader/dublincore/dublincore.go

@@ -1,16 +1,30 @@
 // SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
 // SPDX-License-Identifier: Apache-2.0
 
-package rdf // import "miniflux.app/v2/internal/reader/rdf"
+package dublincore // import "miniflux.app/v2/internal/reader/dublincore"
+
+import (
+	"strings"
+
+	"miniflux.app/v2/internal/reader/sanitizer"
+)
 
 // DublinCoreFeedElement represents Dublin Core feed XML elements.
 type DublinCoreFeedElement struct {
 	DublinCoreCreator string `xml:"http://purl.org/dc/elements/1.1/ channel>creator"`
 }
 
-// DublinCoreEntryElement represents Dublin Core entry XML elements.
-type DublinCoreEntryElement struct {
+func (feed *DublinCoreFeedElement) GetSanitizedCreator() string {
+	return strings.TrimSpace(sanitizer.StripTags(feed.DublinCoreCreator))
+}
+
+// DublinCoreItemElement represents Dublin Core entry XML elements.
+type DublinCoreItemElement struct {
 	DublinCoreDate    string `xml:"http://purl.org/dc/elements/1.1/ date"`
 	DublinCoreCreator string `xml:"http://purl.org/dc/elements/1.1/ creator"`
 	DublinCoreContent string `xml:"http://purl.org/rss/1.0/modules/content/ encoded"`
 }
+
+func (item *DublinCoreItemElement) GetSanitizedCreator() string {
+	return strings.TrimSpace(sanitizer.StripTags(item.DublinCoreCreator))
+}

+ 28 - 0
internal/reader/rdf/parser_test.go

@@ -349,6 +349,34 @@ func TestParseItemWithDublicCoreDate(t *testing.T) {
 	}
 }
 
+func TestParseItemWithEncodedHTMLInDCCreatorField(t *testing.T) {
+	data := `<?xml version="1.0" encoding="utf-8"?>
+	<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns="http://purl.org/rss/1.0/" xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:slash="http://purl.org/rss/1.0/modules/slash/">
+	  <channel>
+			<title>Example</title>
+			<link>http://example.org</link>
+	  </channel>
+
+	  <item>
+			<title>Title</title>
+			<description>Test</description>
+			<link>http://example.org/test.html</link>
+			<dc:creator>&lt;a href=&quot;http://example.org/author1&quot;>Author 1&lt;/a&gt; (University 1), &lt;a href=&quot;http://example.org/author2&quot;>Author 2&lt;/a&gt; (University 2)</dc:creator>
+			<dc:date>2018-04-10T05:00:00+00:00</dc:date>
+	  </item>
+	</rdf:RDF>`
+
+	feed, err := Parse("http://example.org", bytes.NewBufferString(data))
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	expectedAuthor := "Author 1 (University 1), Author 2 (University 2)"
+	if feed.Entries[0].Author != expectedAuthor {
+		t.Errorf("Incorrect entry author, got: %s, want: %s", feed.Entries[0].Author, expectedAuthor)
+	}
+}
+
 func TestParseItemWithoutDate(t *testing.T) {
 	data := `<?xml version="1.0" encoding="utf-8"?>
 	<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns="http://purl.org/rss/1.0/">

+ 5 - 4
internal/reader/rdf/rdf.go

@@ -13,6 +13,7 @@ import (
 	"miniflux.app/v2/internal/logger"
 	"miniflux.app/v2/internal/model"
 	"miniflux.app/v2/internal/reader/date"
+	"miniflux.app/v2/internal/reader/dublincore"
 	"miniflux.app/v2/internal/reader/sanitizer"
 	"miniflux.app/v2/internal/urllib"
 )
@@ -22,7 +23,7 @@ type rdfFeed struct {
 	Title   string    `xml:"channel>title"`
 	Link    string    `xml:"channel>link"`
 	Items   []rdfItem `xml:"item"`
-	DublinCoreFeedElement
+	dublincore.DublinCoreFeedElement
 }
 
 func (r *rdfFeed) Transform(baseURL string) *model.Feed {
@@ -38,7 +39,7 @@ func (r *rdfFeed) Transform(baseURL string) *model.Feed {
 	for _, item := range r.Items {
 		entry := item.Transform()
 		if entry.Author == "" && r.DublinCoreCreator != "" {
-			entry.Author = strings.TrimSpace(r.DublinCoreCreator)
+			entry.Author = r.GetSanitizedCreator()
 		}
 
 		if entry.URL == "" {
@@ -60,7 +61,7 @@ type rdfItem struct {
 	Title       string `xml:"title"`
 	Link        string `xml:"link"`
 	Description string `xml:"description"`
-	DublinCoreEntryElement
+	dublincore.DublinCoreItemElement
 }
 
 func (r *rdfItem) Transform() *model.Entry {
@@ -88,7 +89,7 @@ func (r *rdfItem) entryContent() string {
 }
 
 func (r *rdfItem) entryAuthor() string {
-	return strings.TrimSpace(r.DublinCoreCreator)
+	return r.GetSanitizedCreator()
 }
 
 func (r *rdfItem) entryURL() string {

+ 0 - 11
internal/reader/rss/dublincore.go

@@ -1,11 +0,0 @@
-// SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
-// SPDX-License-Identifier: Apache-2.0
-
-package rss // import "miniflux.app/v2/internal/reader/rss"
-
-// DublinCoreElement represents Dublin Core XML elements.
-type DublinCoreElement struct {
-	DublinCoreDate    string `xml:"http://purl.org/dc/elements/1.1/ date"`
-	DublinCoreCreator string `xml:"http://purl.org/dc/elements/1.1/ creator"`
-	DublinCoreContent string `xml:"http://purl.org/rss/1.0/modules/content/ encoded"`
-}

+ 3 - 2
internal/reader/rss/rss.go

@@ -15,6 +15,7 @@ import (
 	"miniflux.app/v2/internal/logger"
 	"miniflux.app/v2/internal/model"
 	"miniflux.app/v2/internal/reader/date"
+	"miniflux.app/v2/internal/reader/dublincore"
 	"miniflux.app/v2/internal/reader/media"
 	"miniflux.app/v2/internal/reader/sanitizer"
 	"miniflux.app/v2/internal/urllib"
@@ -182,7 +183,7 @@ type rssItem struct {
 	CommentLinks   []rssCommentLink `xml:"comments"`
 	EnclosureLinks []rssEnclosure   `xml:"enclosure"`
 	Categories     []rssCategory    `xml:"category"`
-	DublinCoreElement
+	dublincore.DublinCoreItemElement
 	FeedBurnerElement
 	PodcastEntryElement
 	media.Element
@@ -250,7 +251,7 @@ func (r *rssItem) entryAuthor() string {
 	}
 
 	if author == "" {
-		author = r.DublinCoreCreator
+		author = r.GetSanitizedCreator()
 	}
 
 	return sanitizer.StripTags(strings.TrimSpace(author))