Bläddra i källkod

Add better support for Atom text constructs

- Note that Miniflux does not render HTML tags in entry titles as of now
- Omit the XHTML div wrapper element because it should not be part of the content
Frédéric Guillot 5 år sedan
förälder
incheckning
c8c1f05328
2 ändrade filer med 172 tillägg och 68 borttagningar
  1. 19 5
      reader/atom/atom_10.go
  2. 153 63
      reader/atom/atom_10_test.go

+ 19 - 5
reader/atom/atom_10.go

@@ -221,19 +221,33 @@ func (a *atom10Entry) entryCommentsURL() string {
 }
 
 type atom10Text struct {
-	Type     string `xml:"type,attr"`
-	CharData string `xml:",chardata"`
-	InnerXML string `xml:",innerxml"`
+	Type             string               `xml:"type,attr"`
+	CharData         string               `xml:",chardata"`
+	InnerXML         string               `xml:",innerxml"`
+	XHTMLRootElement atomXHTMLRootElement `xml:"http://www.w3.org/1999/xhtml div"`
 }
 
 func (a *atom10Text) String() string {
 	var content string
 
-	if a.Type == "xhtml" {
+	switch {
+	case strings.HasPrefix(a.InnerXML, `<![CDATA[`):
+		content = a.CharData
+	case a.Type == "", a.Type == "text", a.Type == "text/plain":
 		content = a.InnerXML
-	} else {
+	case a.Type == "xhtml":
+		if a.XHTMLRootElement.InnerXML != "" {
+			content = a.XHTMLRootElement.InnerXML
+		} else {
+			content = a.InnerXML
+		}
+	default:
 		content = a.CharData
 	}
 
 	return strings.TrimSpace(content)
 }
+
+type atomXHTMLRootElement struct {
+	InnerXML string `xml:",innerxml"`
+}

+ 153 - 63
reader/atom/atom_10_test.go

@@ -258,26 +258,8 @@ func TestParseEntryWithPlainTextTitle(t *testing.T) {
 		<summary>Some text.</summary>
 	  </entry>
 
-	</feed>`
-
-	feed, err := Parse("https://example.org/", bytes.NewBufferString(data))
-	if err != nil {
-		t.Fatal(err)
-	}
-
-	if feed.Entries[0].Title != `AT&T bought by SBC!` {
-		t.Errorf("Incorrect entry title, got: %q", feed.Entries[0].Title)
-	}
-}
-
-func TestParseEntryWithHTMLAndCDATATitle(t *testing.T) {
-	data := `<?xml version="1.0" encoding="utf-8"?>
-	<feed xmlns="http://www.w3.org/2005/Atom">
-	  <title>Example Feed</title>
-	  <link href="http://example.org/"/>
-
 	  <entry>
-		<title type="html"><![CDATA[Test &#8220;Test&#8221;]]></title>
+		<title>AT&amp;T bought by SBC!</title>
 		<link href="http://example.org/2003/12/13/atom03"/>
 		<id>urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a</id>
 		<updated>2003-12-13T18:30:02Z</updated>
@@ -291,8 +273,11 @@ func TestParseEntryWithHTMLAndCDATATitle(t *testing.T) {
 		t.Fatal(err)
 	}
 
-	if feed.Entries[0].Title != "Test “Test”" {
-		t.Errorf("Incorrect entry title, got: %q", feed.Entries[0].Title)
+	expected := `AT&T bought by SBC!`
+	for i := 0; i < 2; i++ {
+		if feed.Entries[i].Title != expected {
+			t.Errorf("Incorrect title for entry #%d, got: %q", i, feed.Entries[i].Title)
+		}
 	}
 }
 
@@ -310,6 +295,14 @@ func TestParseEntryWithHTMLTitle(t *testing.T) {
 		<summary>Some text.</summary>
 	  </entry>
 
+	  <entry>
+		<title type="html"><![CDATA[Test &#8220;Test&#8221;]]></title>
+		<link href="http://example.org/2003/12/13/atom03"/>
+		<id>urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a</id>
+		<updated>2003-12-13T18:30:02Z</updated>
+		<summary>Some text.</summary>
+	  </entry>
+
 	</feed>`
 
 	feed, err := Parse("https://example.org/", bytes.NewBufferString(data))
@@ -320,6 +313,10 @@ func TestParseEntryWithHTMLTitle(t *testing.T) {
 	if feed.Entries[0].Title != "<code>Test</code> Test" {
 		t.Errorf("Incorrect entry title, got: %q", feed.Entries[0].Title)
 	}
+
+	if feed.Entries[1].Title != "Test “Test”" {
+		t.Errorf("Incorrect entry title, got: %q", feed.Entries[1].Title)
+	}
 }
 
 func TestParseEntryWithXHTMLTitle(t *testing.T) {
@@ -330,7 +327,19 @@ func TestParseEntryWithXHTMLTitle(t *testing.T) {
 
 	  <entry>
 		<title type="xhtml"><code>Test</code> Test</title>
-		<link href="http://example.org/2003/12/13/atom03"/>
+		<link href="http://example.org/a"/>
+		<id>urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a</id>
+		<updated>2003-12-13T18:30:02Z</updated>
+		<summary>Some text.</summary>
+	  </entry>
+
+	  <entry>
+		<title type="xhtml">
+			<div xmlns="http://www.w3.org/1999/xhtml">
+				This is <b>XHTML</b> content.
+	 		</div>
+		</title>
+		<link href="http://example.org/b"/>
 		<id>urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a</id>
 		<updated>2003-12-13T18:30:02Z</updated>
 		<summary>Some text.</summary>
@@ -343,9 +352,13 @@ func TestParseEntryWithXHTMLTitle(t *testing.T) {
 		t.Fatal(err)
 	}
 
-	if feed.Entries[0].Title != "<code>Test</code> Test" {
+	if feed.Entries[0].Title != `<code>Test</code> Test` {
 		t.Errorf("Incorrect entry title, got: %q", feed.Entries[0].Title)
 	}
+
+	if feed.Entries[1].Title != `This is <b>XHTML</b> content.` {
+		t.Errorf("Incorrect entry title, got: %q", feed.Entries[1].Title)
+	}
 }
 
 func TestParseEntryWithNumericCharacterReferenceTitle(t *testing.T) {
@@ -395,7 +408,7 @@ func TestParseEntryWithDoubleEncodedEntitiesTitle(t *testing.T) {
 		t.Fatal(err)
 	}
 
-	if feed.Entries[0].Title != `'AT&T'` {
+	if feed.Entries[0].Title != `&#39;AT&amp;T&#39;` {
 		t.Errorf("Incorrect entry title, got: %q", feed.Entries[0].Title)
 	}
 }
@@ -414,6 +427,14 @@ func TestParseEntryWithXHTMLSummary(t *testing.T) {
 		<summary type="xhtml"><p>Some text.</p></summary>
 	  </entry>
 
+	  <entry>
+		<title type="xhtml">Example</title>
+		<link href="http://example.org/2003/12/13/atom03"/>
+		<id>urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a</id>
+		<updated>2003-12-13T18:30:02Z</updated>
+		<summary type="xhtml"><div xmlns="http://www.w3.org/1999/xhtml"><p>Test: <code>std::unique_ptr&lt;S&gt;</code></p></div></summary>
+	  </entry>
+
 	</feed>`
 
 	feed, err := Parse("https://example.org/", bytes.NewBufferString(data))
@@ -421,12 +442,16 @@ func TestParseEntryWithXHTMLSummary(t *testing.T) {
 		t.Fatal(err)
 	}
 
-	if feed.Entries[0].Content != "<p>Some text.</p>" {
+	if feed.Entries[0].Content != `<p>Some text.</p>` {
 		t.Errorf("Incorrect entry content, got: %s", feed.Entries[0].Content)
 	}
+
+	if feed.Entries[1].Content != `<p>Test: <code>std::unique_ptr&lt;S&gt;</code></p>` {
+		t.Errorf("Incorrect entry content, got: %s", feed.Entries[1].Content)
+	}
 }
 
-func TestParseEntryWithHTMLAndCDATASummary(t *testing.T) {
+func TestParseEntryWithHTMLSummary(t *testing.T) {
 	data := `<?xml version="1.0" encoding="utf-8"?>
 	<feed xmlns="http://www.w3.org/2005/Atom">
 	  <title>Example Feed</title>
@@ -434,36 +459,26 @@ func TestParseEntryWithHTMLAndCDATASummary(t *testing.T) {
 
 	  <entry>
 		<title type="html">Example</title>
-		<link href="http://example.org/2003/12/13/atom03"/>
+		<link href="http://example.org/1"/>
 		<id>urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a</id>
 		<updated>2003-12-13T18:30:02Z</updated>
-		<summary type="html"><![CDATA[<p>Some text.</p>]]></summary>
+		<summary type="html">&lt;code&gt;std::unique_ptr&amp;lt;S&amp;gt;&lt;/code&gt;</summary>
 	  </entry>
 
-	</feed>`
-
-	feed, err := Parse("https://example.org/", bytes.NewBufferString(data))
-	if err != nil {
-		t.Fatal(err)
-	}
-
-	if feed.Entries[0].Content != "<p>Some text.</p>" {
-		t.Errorf("Incorrect entry content, got: %s", feed.Entries[0].Content)
-	}
-}
-
-func TestParseEntryWithPlainTextAndCDATASummary(t *testing.T) {
-	data := `<?xml version="1.0" encoding="utf-8"?>
-	<feed xmlns="http://www.w3.org/2005/Atom">
-	  <title>Example Feed</title>
-	  <link href="http://example.org/"/>
+	  <entry>
+		<title type="html">Example</title>
+		<link href="http://example.org/2"/>
+		<id>urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a</id>
+		<updated>2003-12-13T18:30:02Z</updated>
+		<summary type="text/html">&lt;code&gt;std::unique_ptr&amp;lt;S&amp;gt;&lt;/code&gt;</summary>
+	  </entry>
 
 	  <entry>
 		<title type="html">Example</title>
-		<link href="http://example.org/2003/12/13/atom03"/>
+		<link href="http://example.org/3"/>
 		<id>urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a</id>
 		<updated>2003-12-13T18:30:02Z</updated>
-		<summary type="text"><![CDATA[<Some text.>]]></summary>
+		<summary type="html"><![CDATA[<code>std::unique_ptr&lt;S&gt;</code>]]></summary>
 	  </entry>
 
 	</feed>`
@@ -473,12 +488,15 @@ func TestParseEntryWithPlainTextAndCDATASummary(t *testing.T) {
 		t.Fatal(err)
 	}
 
-	if feed.Entries[0].Content != "<Some text.>" {
-		t.Errorf("Incorrect entry content, got: %s", feed.Entries[0].Content)
+	expected := `<code>std::unique_ptr&lt;S&gt;</code>`
+	for i := 0; i < 3; i++ {
+		if feed.Entries[i].Content != expected {
+			t.Errorf("Incorrect content for entry #%d, got: %q", i, feed.Entries[i].Content)
+		}
 	}
 }
 
-func TestParseEntryWithTextAndCDATAContent(t *testing.T) {
+func TestParseEntryWithTextSummary(t *testing.T) {
 	data := `<?xml version="1.0" encoding="utf-8"?>
 	<feed xmlns="http://www.w3.org/2005/Atom">
 	  <title>Example Feed</title>
@@ -486,12 +504,35 @@ func TestParseEntryWithTextAndCDATAContent(t *testing.T) {
 
 	  <entry>
 		<title type="html">Example</title>
-		<link href="http://example.org/2003/12/13/atom03"/>
+		<link href="http://example.org/a"/>
 		<id>urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a</id>
 		<updated>2003-12-13T18:30:02Z</updated>
-		<content><![CDATA[AT&amp;T bought by SBC!]]></content>
+		<summary>AT&amp;T &lt;S&gt;</summary>
 	  </entry>
 
+	  <entry>
+		<title type="html">Example</title>
+		<link href="http://example.org/b"/>
+		<id>urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a</id>
+		<updated>2003-12-13T18:30:02Z</updated>
+		<summary type="text">AT&amp;T &lt;S&gt;</summary>
+	  </entry>
+
+	  <entry>
+		<title type="html">Example</title>
+		<link href="http://example.org/c"/>
+		<id>urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a</id>
+		<updated>2003-12-13T18:30:02Z</updated>
+		<summary type="text/plain">AT&amp;T &lt;S&gt;</summary>
+	  </entry>
+
+	  <entry>
+		<title type="html">Example</title>
+		<link href="http://example.org/d"/>
+		<id>urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a</id>
+		<updated>2003-12-13T18:30:02Z</updated>
+		<summary type="text"><![CDATA[AT&amp;T &lt;S&gt;]]></summary>
+	  </entry>
 	</feed>`
 
 	feed, err := Parse("https://example.org/", bytes.NewBufferString(data))
@@ -499,8 +540,11 @@ func TestParseEntryWithTextAndCDATAContent(t *testing.T) {
 		t.Fatal(err)
 	}
 
-	if feed.Entries[0].Content != "AT&amp;T bought by SBC!" {
-		t.Errorf("Incorrect entry content, got: %q", feed.Entries[0].Content)
+	expected := `AT&amp;T &lt;S&gt;`
+	for i := 0; i < 4; i++ {
+		if feed.Entries[i].Content != expected {
+			t.Errorf("Incorrect content for entry #%d, got: %q", i, feed.Entries[i].Content)
+		}
 	}
 }
 
@@ -512,10 +556,34 @@ func TestParseEntryWithTextContent(t *testing.T) {
 
 	  <entry>
 		<title type="html">Example</title>
-		<link href="http://example.org/2003/12/13/atom03"/>
+		<link href="http://example.org/a"/>
 		<id>urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a</id>
 		<updated>2003-12-13T18:30:02Z</updated>
-		<content>AT&amp;T bought by SBC!</content>
+		<content>AT&amp;T &lt;S&gt;</content>
+	  </entry>
+
+	  <entry>
+		<title type="html">Example</title>
+		<link href="http://example.org/b"/>
+		<id>urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a</id>
+		<updated>2003-12-13T18:30:02Z</updated>
+		<content type="text">AT&amp;T &lt;S&gt;</content>
+	  </entry>
+
+	  <entry>
+		<title type="html">Example</title>
+		<link href="http://example.org/c"/>
+		<id>urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a</id>
+		<updated>2003-12-13T18:30:02Z</updated>
+		<content type="text/plain">AT&amp;T &lt;S&gt;</content>
+	  </entry>
+
+	  <entry>
+		<title type="html">Example</title>
+		<link href="http://example.org/d"/>
+		<id>urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a</id>
+		<updated>2003-12-13T18:30:02Z</updated>
+		<content><![CDATA[AT&amp;T &lt;S&gt;]]></content>
 	  </entry>
 
 	</feed>`
@@ -525,8 +593,11 @@ func TestParseEntryWithTextContent(t *testing.T) {
 		t.Fatal(err)
 	}
 
-	if feed.Entries[0].Content != "AT&T bought by SBC!" {
-		t.Errorf("Incorrect entry content, got: %q", feed.Entries[0].Content)
+	expected := `AT&amp;T &lt;S&gt;`
+	for i := 0; i < 4; i++ {
+		if feed.Entries[i].Content != expected {
+			t.Errorf("Incorrect content for entry #%d, got: %q", i, feed.Entries[i].Content)
+		}
 	}
 }
 
@@ -538,12 +609,28 @@ func TestParseEntryWithHTMLContent(t *testing.T) {
 
 	  <entry>
 		<title type="html">Example</title>
-		<link href="http://example.org/2003/12/13/atom03"/>
+		<link href="http://example.org/a"/>
 		<id>urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a</id>
 		<updated>2003-12-13T18:30:02Z</updated>
 		<content type="html">AT&amp;amp;T bought &lt;b&gt;by SBC&lt;/b&gt;!</content>
 	  </entry>
 
+	  <entry>
+		<title type="html">Example</title>
+		<link href="http://example.org/b"/>
+		<id>urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a</id>
+		<updated>2003-12-13T18:30:02Z</updated>
+		<content type="text/html">AT&amp;amp;T bought &lt;b&gt;by SBC&lt;/b&gt;!</content>
+	  </entry>
+
+	  <entry>
+		<title type="html">Example</title>
+		<link href="http://example.org/c"/>
+		<id>urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a</id>
+		<updated>2003-12-13T18:30:02Z</updated>
+		<content type="html"><![CDATA[AT&amp;T bought <b>by SBC</b>!]]></content>
+	  </entry>
+
 	</feed>`
 
 	feed, err := Parse("https://example.org/", bytes.NewBufferString(data))
@@ -551,8 +638,11 @@ func TestParseEntryWithHTMLContent(t *testing.T) {
 		t.Fatal(err)
 	}
 
-	if feed.Entries[0].Content != "AT&amp;T bought <b>by SBC</b>!" {
-		t.Errorf("Incorrect entry content, got: %q", feed.Entries[0].Content)
+	expected := `AT&amp;T bought <b>by SBC</b>!`
+	for i := 0; i < 3; i++ {
+		if feed.Entries[i].Content != expected {
+			t.Errorf("Incorrect content for entry #%d, got: %q", i, feed.Entries[i].Content)
+		}
 	}
 }
 
@@ -563,7 +653,7 @@ func TestParseEntryWithXHTMLContent(t *testing.T) {
 	  <link href="http://example.org/"/>
 
 	  <entry>
-		<title type="html">Example</title>
+		<title>Example</title>
 		<link href="http://example.org/2003/12/13/atom03"/>
 		<id>urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a</id>
 		<updated>2003-12-13T18:30:02Z</updated>
@@ -579,7 +669,7 @@ func TestParseEntryWithXHTMLContent(t *testing.T) {
 		t.Fatal(err)
 	}
 
-	if feed.Entries[0].Content != `<div xmlns="http://www.w3.org/1999/xhtml">AT&amp;T bought <b>by SBC</b>!</div>` {
+	if feed.Entries[0].Content != `AT&amp;T bought <b>by SBC</b>!` {
 		t.Errorf("Incorrect entry content, got: %q", feed.Entries[0].Content)
 	}
 }