Procházet zdrojové kódy

Allow only absolute URLs in comments URL

Some feeds are using invalid URLs (random text).
Frédéric Guillot před 6 roky
rodič
revize
bf632fad2e
6 změnil soubory, kde provedl 105 přidání a 2 odebrání
  1. 10 1
      reader/atom/atom_10.go
  2. 40 0
      reader/atom/atom_10_test.go
  3. 25 0
      reader/rss/parser_test.go
  4. 6 1
      reader/rss/rss.go
  5. 9 0
      url/url.go
  6. 15 0
      url/url_test.go

+ 10 - 1
reader/atom/atom_10.go

@@ -84,7 +84,7 @@ func (a *atom10Entry) Transform() *model.Entry {
 	entry.Content = a.entryContent()
 	entry.Title = a.entryTitle()
 	entry.Enclosures = a.entryEnclosures()
-	entry.CommentsURL = a.Links.firstLinkWithRelationAndType("replies", "text/html")
+	entry.CommentsURL = a.entryCommentsURL()
 	return entry
 }
 
@@ -194,6 +194,15 @@ func (a *atom10Entry) entryEnclosures() model.EnclosureList {
 	return enclosures
 }
 
+// entryCommentsURL returns the entry's "replies" link of type text/html when it is an absolute URL and "" otherwise. See https://tools.ietf.org/html/rfc4685#section-3
+func (a *atom10Entry) entryCommentsURL() string {
+	commentsURL := a.Links.firstLinkWithRelationAndType("replies", "text/html")
+	if url.IsAbsoluteURL(commentsURL) {
+		return commentsURL
+	}
+	return "" // anything that is not an absolute URL is dropped instead of being returned as the comments URL
+}
+
 type atom10Text struct {
 	Type string `xml:"type,attr"`
 	Data string `xml:",chardata"`

+ 40 - 0
reader/atom/atom_10_test.go

@@ -777,3 +777,43 @@ func TestParseRepliesLinkRelation(t *testing.T) {
 		t.Errorf("Incorrect entry comments URL, got: %s", feed.Entries[0].CommentsURL)
 	}
 }
+
+// TestAbsoluteCommentsURL checks that a "replies" link whose href is not an
+// absolute URL ("invalid url") leaves the entry's CommentsURL empty.
+func TestAbsoluteCommentsURL(t *testing.T) {
+	data := `<?xml version="1.0" encoding="utf-8"?>
+		<feed xmlns="http://www.w3.org/2005/Atom"
+			xmlns:thr="http://purl.org/syndication/thread/1.0">
+		<id>http://www.example.org/myfeed</id>
+		<title>My Example Feed</title>
+		<updated>2005-07-28T12:00:00Z</updated>
+		<link href="http://www.example.org/myfeed" />
+		<author><name>James</name></author>
+		<entry>
+			<id>tag:entries.com,2005:1</id>
+			<title>My original entry</title>
+			<updated>2006-03-01T12:12:12Z</updated>
+			<link href="http://www.example.org/entries/1" />
+			<link rel="replies"
+				type="text/html"
+				href="invalid url"
+				thr:count="10" thr:updated="2005-07-28T12:10:00Z" />
+			<summary>This is my original entry</summary>
+		</entry>
+	</feed>`
+
+	feed, err := Parse(bytes.NewBufferString(data))
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	// Fatalf, not Errorf: the checks below index feed.Entries[0] and would
+	// panic with an index-out-of-range error if the feed had no entries.
+	if len(feed.Entries) != 1 {
+		t.Fatalf("Incorrect number of entries, got: %d", len(feed.Entries))
+	}
+
+	if feed.Entries[0].URL != "http://www.example.org/entries/1" {
+		t.Errorf("Incorrect entry URL, got: %s", feed.Entries[0].URL)
+	}
+
+	if feed.Entries[0].CommentsURL != "" {
+		t.Errorf("Incorrect entry comments URL, got: %s", feed.Entries[0].CommentsURL)
+	}
+}

+ 25 - 0
reader/rss/parser_test.go

@@ -837,6 +837,31 @@ func TestParseEntryWithCommentsURL(t *testing.T) {
 	}
 }
 
+// TestParseEntryWithInvalidCommentsURL checks that an RSS <comments> element
+// holding free text instead of a URL leaves the entry's CommentsURL empty.
+func TestParseEntryWithInvalidCommentsURL(t *testing.T) {
+	data := `<?xml version="1.0" encoding="utf-8"?>
+		<rss version="2.0" xmlns:slash="http://purl.org/rss/1.0/modules/slash/">
+		<channel>
+			<link>https://example.org/</link>
+			<item>
+				<title>Item 1</title>
+				<link>https://example.org/item1</link>
+				<comments>
+					Some text
+				</comments>
+			</item>
+		</channel>
+		</rss>`
+
+	feed, err := Parse(bytes.NewBufferString(data))
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	// Guard the index below so a feed parsed without entries fails the test
+	// cleanly instead of panicking.
+	if len(feed.Entries) == 0 {
+		t.Fatalf("Incorrect number of entries, got: %d", len(feed.Entries))
+	}
+
+	if feed.Entries[0].CommentsURL != "" {
+		t.Errorf("Incorrect entry comments URL, got: %q", feed.Entries[0].CommentsURL)
+	}
+}
+
 func TestParseInvalidXml(t *testing.T) {
 	data := `garbage`
 	_, err := Parse(bytes.NewBufferString(data))

+ 6 - 1
reader/rss/rss.go

@@ -317,7 +317,12 @@ func (r *rssItem) entryEnclosures() model.EnclosureList {
 func (r *rssItem) entryCommentsURL() string {
 	for _, commentLink := range r.CommentLinks {
 		if commentLink.XMLName.Space == "" {
-			return strings.TrimSpace(commentLink.Data)
+			commentsURL := strings.TrimSpace(commentLink.Data)
+			// The comments URL is supposed to be absolute (some feeds publish incorrect comments URLs)
+			// See https://cyber.harvard.edu/rss/rss.html#ltcommentsgtSubelementOfLtitemgt
+			if url.IsAbsoluteURL(commentsURL) {
+				return commentsURL
+			}
 		}
 	}
 

+ 9 - 0
url/url.go

@@ -11,6 +11,15 @@ import (
 	"strings"
 )
 
+// IsAbsoluteURL reports whether link parses as a URL for which IsAbs is true; input that fails to parse yields false.
+func IsAbsoluteURL(link string) bool {
+	u, err := url.Parse(link) // NOTE(review): url is presumably net/url (the import is not visible in this hunk) — confirm
+	if err != nil {
+		return false
+	}
+	return u.IsAbs() // with net/url, IsAbs means the URL has a non-empty scheme
+}
+
 // AbsoluteURL converts the input URL as absolute URL if necessary.
 func AbsoluteURL(baseURL, input string) (string, error) {
 	if strings.HasPrefix(input, "//") {

+ 15 - 0
url/url_test.go

@@ -6,6 +6,21 @@ package url // import "miniflux.app/url"
 
 import "testing"
 
+func TestIsAbsoluteURL(t *testing.T) { // table-driven check of IsAbsoluteURL
+	scenarios := map[string]bool{ // input link -> expected IsAbsoluteURL result
+		"https://example.org/file.pdf": true,
+		"magnet:?xt.1=urn:sha1:YNCKHTQCWBTRNJIV4WNAE52SJUQCZO5C&xt.2=urn:sha1:TXGCZQTH26NL6OUQAJJPFALHG2LTGBC7": true, // a non-HTTP scheme is still expected to count as absolute
+		"invalid url": false, // free text without a scheme is expected to be rejected
+	}
+
+	for input, expected := range scenarios {
+		actual := IsAbsoluteURL(input)
+		if actual != expected {
+			t.Errorf(`Unexpected result, got %v instead of %v for %q`, actual, expected, input) // Errorf so every failing scenario is reported
+		}
+	}
+}
+
 func TestAbsoluteURL(t *testing.T) {
 	scenarios := [][]string{
 		[]string{"https://example.org/path/file.ext", "https://example.org/folder/", "/path/file.ext"},