Explorar el Código

Handle RDF feeds with duplicated <title> elements

Frédéric Guillot hace 2 años
padre
commit
c595c80356

+ 1 - 0
internal/reader/dublincore/dublincore.go

@@ -20,6 +20,7 @@ func (feed *DublinCoreFeedElement) GetSanitizedCreator() string {
 
 // DublinCoreItemElement represents Dublin Core entry XML elements.
 type DublinCoreItemElement struct {
+	DublinCoreTitle   string `xml:"http://purl.org/dc/elements/1.1/ title"`
 	DublinCoreDate    string `xml:"http://purl.org/dc/elements/1.1/ date"`
 	DublinCoreCreator string `xml:"http://purl.org/dc/elements/1.1/ creator"`
 	DublinCoreContent string `xml:"http://purl.org/rss/1.0/modules/content/ encoded"`

+ 104 - 5
internal/reader/rdf/parser_test.go

@@ -406,7 +406,7 @@ func TestParseItemWithoutDate(t *testing.T) {
 
 func TestParseItemWithEncodedHTMLTitle(t *testing.T) {
 	data := `<?xml version="1.0" encoding="utf-8"?>
-	<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#">
+	<rdf:RDF xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#" xmlns="http://purl.org/rss/1.0/">
 	  <channel>
 			<title>Example</title>
 			<link>http://example.org</link>
@@ -425,7 +425,7 @@ func TestParseItemWithEncodedHTMLTitle(t *testing.T) {
 	}
 
 	if feed.Entries[0].Title != `AT&T` {
-		t.Errorf("Incorrect entry title, got: %v", feed.Entries[0].Title)
+		t.Errorf("Incorrect entry title, got: %q", feed.Entries[0].Title)
 	}
 }
 
@@ -502,7 +502,7 @@ func TestParseFeedWithURLWrappedInSpaces(t *testing.T) {
 	<item rdf:about="http://biorxiv.org/cgi/content/short/857789v1?rss=1">
 		<title>
 			<![CDATA[
-			Microscale Collagen and Fibroblast Interactions Enhance Primary Human Hepatocyte Functions in 3-Dimensional Models 
+			Microscale Collagen and Fibroblast Interactions Enhance Primary Human Hepatocyte Functions in 3-Dimensional Models
 			]]>
 		</title>
 		<link>
@@ -568,7 +568,7 @@ func TestParseRDFWithContentEncoded(t *testing.T) {
 	expected := `<p>Test</p>`
 	result := feed.Entries[0].Content
 	if result != expected {
-		t.Errorf(`Unexpected entry URL, got %q instead of %q`, result, expected)
+		t.Errorf(`Unexpected entry content, got %q instead of %q`, result, expected)
 	}
 }
 
@@ -601,6 +601,105 @@ func TestParseRDFWithEncodedHTMLDescription(t *testing.T) {
 	expected := `AT&amp;T <img src="https://example.org/img.png"></a>`
 	result := feed.Entries[0].Content
 	if result != expected {
-		t.Errorf(`Unexpected entry URL, got %v instead of %v`, result, expected)
+		t.Errorf(`Unexpected entry content, got %v instead of %v`, result, expected)
+	}
+}
+
+func TestParseRDFItemWithDuplicateTitleElement(t *testing.T) { // Item carries both an RSS 1.0 <title> and an empty <dc:title/>.
+	data := `<?xml version="1.0" encoding="utf-8"?>
+	<rdf:RDF
+		xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+		xmlns="http://purl.org/rss/1.0/"
+		xmlns:dc="http://purl.org/dc/elements/1.1/">
+		<channel>
+			<title>Example Feed</title>
+			<link>http://example.org/</link>
+		</channel>
+		<item>
+			<title>Item Title</title>
+			<dc:title/>
+			<link>http://example.org/</link>
+			<description>Test</description>
+		</item>
+	</rdf:RDF>`
+
+	feed, err := Parse("http://example.org/", bytes.NewBufferString(data))
+	if err != nil {
+		t.Fatal(err) // A parse failure makes the remaining checks meaningless.
+	}
+
+	if len(feed.Entries) != 1 {
+		t.Fatalf(`Unexpected number of entries, got %d`, len(feed.Entries)) // Fatal: Entries[0] is indexed below.
+	}
+
+	expected := `Item Title` // The non-empty RSS 1.0 title is expected; the empty dc:title must not replace it.
+	result := feed.Entries[0].Title
+	if result != expected {
+		t.Errorf(`Unexpected entry title, got %q instead of %q`, result, expected)
+	}
+}
+
+func TestParseRDFItemWithDublinCoreTitleElement(t *testing.T) { // Item has only a <dc:title>, no RSS 1.0 <title>.
+	data := `<?xml version="1.0" encoding="utf-8"?>
+	<rdf:RDF
+		xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+		xmlns="http://purl.org/rss/1.0/"
+		xmlns:dc="http://purl.org/dc/elements/1.1/">
+		<channel>
+			<title>Example Feed</title>
+			<link>http://example.org/</link>
+		</channel>
+		<item>
+			<dc:title>Dublin Core Title</dc:title>
+			<link>http://example.org/</link>
+			<description>Test</description>
+		</item>
+	</rdf:RDF>`
+
+	feed, err := Parse("http://example.org/", bytes.NewBufferString(data))
+	if err != nil {
+		t.Fatal(err) // A parse failure makes the remaining checks meaningless.
+	}
+
+	if len(feed.Entries) != 1 {
+		t.Fatalf(`Unexpected number of entries, got %d`, len(feed.Entries)) // Fatal: Entries[0] is indexed below.
+	}
+
+	expected := `Dublin Core Title` // The Dublin Core title is expected to be used as the entry title.
+	result := feed.Entries[0].Title
+	if result != expected {
+		t.Errorf(`Unexpected entry title, got %q instead of %q`, result, expected)
+	}
+}
+
+func TestParseRDFItemWitEmptyTitleElement(t *testing.T) { // Item <title> holds only whitespace. NOTE(review): "Wit" is presumably a typo for "With" - confirm before renaming.
+	data := `<?xml version="1.0" encoding="utf-8"?>
+	<rdf:RDF
+		xmlns:rdf="http://www.w3.org/1999/02/22-rdf-syntax-ns#"
+		xmlns="http://purl.org/rss/1.0/">
+		<channel>
+			<title>Example Feed</title>
+			<link>http://example.org/</link>
+		</channel>
+		<item>
+			<title> </title>
+			<link>http://example.org/item</link>
+			<description>Test</description>
+		</item>
+	</rdf:RDF>`
+
+	feed, err := Parse("http://example.org/", bytes.NewBufferString(data))
+	if err != nil {
+		t.Fatal(err) // A parse failure makes the remaining checks meaningless.
+	}
+
+	if len(feed.Entries) != 1 {
+		t.Fatalf(`Unexpected number of entries, got %d`, len(feed.Entries)) // Fatal: Entries[0] is indexed below.
+	}
+
+	expected := `http://example.org/item` // Same value as the item's <link>: the entry title is expected to fall back to it.
+	result := feed.Entries[0].Title
+	if result != expected {
+		t.Errorf(`Unexpected entry title, got %q instead of %q`, result, expected)
 	}
 }

+ 12 - 2
internal/reader/rdf/rdf.go

@@ -58,7 +58,7 @@ func (r *rdfFeed) Transform(baseURL string) *model.Feed {
 }
 
 type rdfItem struct {
-	Title       string `xml:"title"`
+	Title       string `xml:"http://purl.org/rss/1.0/ title"`
 	Link        string `xml:"link"`
 	Description string `xml:"description"`
 	dublincore.DublinCoreItemElement
@@ -72,11 +72,21 @@ func (r *rdfItem) Transform() *model.Entry {
 	entry.Content = r.entryContent()
 	entry.Hash = r.entryHash()
 	entry.Date = r.entryDate()
+
+	if entry.Title == "" {
+		entry.Title = entry.URL
+	}
 	return entry
 }
 
 func (r *rdfItem) entryTitle() string { // Returns the first title candidate that is non-empty after trimming, or "" if none is.
-	return html.UnescapeString(strings.TrimSpace(r.Title))
+	for _, title := range []string{r.Title, r.DublinCoreTitle} { // r.Title is tried before r.DublinCoreTitle.
+		title = strings.TrimSpace(title)
+		if title != "" {
+			return html.UnescapeString(title) // HTML entities are unescaped only on the value that is returned.
+		}
+	}
+	return "" // Every candidate was empty or whitespace-only.
 }
 
 func (r *rdfItem) entryContent() string {