Browse Source

Parse `<category>` from Feeds (RSS, Atom and JSON)

privatmamtora 3 years ago
parent
commit
8f9ccc6540

+ 3 - 0
api/entry.go

@@ -132,6 +132,8 @@ func (h *handler) findEntries(w http.ResponseWriter, r *http.Request, feedID int
 		return
 	}
 
+	tags := request.QueryStringParamList(r, "tags")
+
 	builder := h.store.NewEntryQueryBuilder(userID)
 	builder.WithFeedID(feedID)
 	builder.WithCategoryID(categoryID)
@@ -140,6 +142,7 @@ func (h *handler) findEntries(w http.ResponseWriter, r *http.Request, feedID int
 	builder.WithDirection(direction)
 	builder.WithOffset(offset)
 	builder.WithLimit(limit)
+	builder.WithTags(tags)
 	configureFilters(builder, r)
 
 	entries, err := builder.GetEntries()

+ 1 - 0
client/model.go

@@ -218,6 +218,7 @@ type Entry struct {
 	ReadingTime int        `json:"reading_time"`
 	Enclosures  Enclosures `json:"enclosures,omitempty"`
 	Feed        *Feed      `json:"feed,omitempty"`
+	Tags        []string   `json:"tags"`
 }
 
 // Entries represents a list of entries.

+ 6 - 0
database/migrations.go

@@ -638,4 +638,10 @@ var migrations = []func(tx *sql.Tx) error{
 		_, err = tx.Exec(sql)
 		return err
 	},
+	// Adds the tags column to the entries table as a text array that
+	// defaults to the empty array.
+	func(tx *sql.Tx) (err error) {
+		const query = `
+			ALTER TABLE entries ADD COLUMN tags text[] default '{}';
+		`
+		_, err = tx.Exec(query)
+		return err
+	},
 }

+ 1 - 0
model/entry.go

@@ -37,6 +37,7 @@ type Entry struct {
 	ReadingTime int           `json:"reading_time"`
 	Enclosures  EnclosureList `json:"enclosures"`
 	Feed        *Feed         `json:"feed,omitempty"`
+	Tags        []string      `json:"tags"`
 }
 
 // Entries represents a list of entries.

+ 29 - 8
reader/atom/atom_10.go

@@ -80,14 +80,15 @@ func (a *atom10Feed) Transform(baseURL string) *model.Feed {
 }
 
 type atom10Entry struct {
-	ID        string      `xml:"id"`
-	Title     atom10Text  `xml:"title"`
-	Published string      `xml:"published"`
-	Updated   string      `xml:"updated"`
-	Links     atomLinks   `xml:"link"`
-	Summary   atom10Text  `xml:"summary"`
-	Content   atom10Text  `xml:"http://www.w3.org/2005/Atom content"`
-	Authors   atomAuthors `xml:"author"`
+	ID         string           `xml:"id"`
+	Title      atom10Text       `xml:"title"`
+	Published  string           `xml:"published"`
+	Updated    string           `xml:"updated"`
+	Links      atomLinks        `xml:"link"`
+	Summary    atom10Text       `xml:"summary"`
+	Content    atom10Text       `xml:"http://www.w3.org/2005/Atom content"`
+	Authors    atomAuthors      `xml:"author"`
+	Categories []atom10Category `xml:"category"`
 	media.Element
 }
 
@@ -101,6 +102,7 @@ func (a *atom10Entry) Transform() *model.Entry {
 	entry.Title = a.entryTitle()
 	entry.Enclosures = a.entryEnclosures()
 	entry.CommentsURL = a.entryCommentsURL()
+	entry.Tags = a.entryCategories()
 	return entry
 }
 
@@ -214,6 +216,20 @@ func (a *atom10Entry) entryEnclosures() model.EnclosureList {
 	return enclosures
 }
 
+// entryCategories returns the names of the entry's <category> elements.
+//
+// For each category the "label" attribute is used when it is not blank;
+// otherwise the "term" attribute is used. Values are trimmed of surrounding
+// whitespace, and categories whose label and term are both blank are skipped
+// so that no empty tag is produced. The result is nil when the entry has no
+// usable category.
+func (a *atom10Entry) entryCategories() []string {
+	var categoryList []string
+
+	for _, atomCategory := range a.Categories {
+		name := strings.TrimSpace(atomCategory.Label)
+		if name == "" {
+			name = strings.TrimSpace(atomCategory.Term)
+		}
+		if name == "" {
+			continue
+		}
+		categoryList = append(categoryList, name)
+	}
+
+	return categoryList
+}
+
 // See https://tools.ietf.org/html/rfc4685#section-4
 // If the type attribute of the atom:link is omitted, its value is assumed to be "application/atom+xml".
 // We accept only HTML or XHTML documents for now since the intention is to have the same behavior as RSS.
@@ -232,6 +248,11 @@ type atom10Text struct {
 	XHTMLRootElement atomXHTMLRootElement `xml:"http://www.w3.org/1999/xhtml div"`
 }
 
+// atom10Category holds the attributes decoded from an Atom <category> element.
+type atom10Category struct {
+	// Term is the value of the "term" attribute.
+	Term  string `xml:"term,attr"`
+	// Label is the value of the "label" attribute.
+	Label string `xml:"label,attr"`
+}
+
 // Text: https://datatracker.ietf.org/doc/html/rfc4287#section-3.1.1.1
 // HTML: https://datatracker.ietf.org/doc/html/rfc4287#section-3.1.1.2
 // XHTML: https://datatracker.ietf.org/doc/html/rfc4287#section-3.1.1.3

+ 45 - 0
reader/atom/atom_10_test.go

@@ -1604,3 +1604,48 @@ func TestAbsoluteCommentsURL(t *testing.T) {
 		t.Errorf("Incorrect entry comments URL, got: %s", feed.Entries[0].CommentsURL)
 	}
 }
+
+// TestParseFeedWithCategories checks that Atom <category> elements become
+// entry tags: the term is used when no label is present, and the label is
+// preferred when both attributes are set.
+func TestParseFeedWithCategories(t *testing.T) {
+	data := `<?xml version="1.0" encoding="utf-8"?>
+	<feed xmlns="http://www.w3.org/2005/Atom">
+	  <title>Example Feed</title>
+	  <link href="http://example.org/"/>
+	  <author>
+		<name>Alice</name>
+	  </author>
+	  <author>
+		<name>Bob</name>
+	  </author>
+
+	  <entry>
+		<link href="http://example.org/2003/12/13/atom03"/>
+		<id>urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a</id>
+		<updated>2003-12-13T18:30:02Z</updated>
+		<summary>Some text.</summary>
+		<category term='Tech' />
+		<category term='Technology' label='Science' />
+	  </entry>
+
+	</feed>`
+
+	feed, err := Parse("https://example.org/", bytes.NewBufferString(data))
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	// Fatal rather than Error: the assertions below index into Tags and
+	// would panic with an out-of-range error if the length were wrong.
+	if len(feed.Entries[0].Tags) != 2 {
+		t.Fatalf("Incorrect number of tags, got: %d", len(feed.Entries[0].Tags))
+	}
+
+	expected := "Tech"
+	result := feed.Entries[0].Tags[0]
+	if result != expected {
+		t.Errorf("Incorrect entry category, got %q instead of %q", result, expected)
+	}
+
+	expected = "Science"
+	result = feed.Entries[0].Tags[1]
+	if result != expected {
+		t.Errorf("Incorrect entry category, got %q instead of %q", result, expected)
+	}
+}

+ 2 - 0
reader/json/json.go

@@ -43,6 +43,7 @@ type jsonItem struct {
 	Authors       []jsonAuthor     `json:"authors"`
 	Author        jsonAuthor       `json:"author"`
 	Attachments   []jsonAttachment `json:"attachments"`
+	Tags          []string         `json:"tags"`
 }
 
 type jsonAttachment struct {
@@ -181,6 +182,7 @@ func (j *jsonItem) Transform() *model.Entry {
 	entry.Content = j.GetContent()
 	entry.Title = strings.TrimSpace(j.GetTitle())
 	entry.Enclosures = j.GetEnclosures()
+	entry.Tags = j.Tags
 	return entry
 }
 

+ 42 - 0
reader/json/parser_test.go

@@ -575,3 +575,45 @@ func TestParseInvalidJSON(t *testing.T) {
 		t.Error("Parse should returns an error")
 	}
 }
+
+// TestParseTags checks that the "tags" array of a JSON Feed item is copied
+// to the entry's tags in order.
+func TestParseTags(t *testing.T) {
+	data := `{
+		"version": "https://jsonfeed.org/version/1",
+		"user_comment": "This is a microblog feed. You can add this to your feed reader using the following URL: https://example.org/feed.json",
+		"title": "Brent Simmons’s Microblog",
+		"home_page_url": "https://example.org/",
+		"feed_url": "https://example.org/feed.json",
+		"author": {
+			"name": "Brent Simmons",
+			"url": "http://example.org/",
+			"avatar": "https://example.org/avatar.png"
+		},
+		"items": [
+			{
+				"id": "2347259",
+				"url": "https://example.org/2347259",
+				"content_text": "Cats are neat. \n\nhttps://example.org/cats",
+				"date_published": "2016-02-09T14:22:00-07:00",
+				"tags": [
+					"tag 1",
+					"tag 2"
+				]
+			}
+		]
+	}`
+
+	feed, err := Parse("https://example.org/feed.json", bytes.NewBufferString(data))
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	// Fatal rather than Error: the assertion below indexes into Tags and
+	// would panic with an out-of-range error if the length were wrong.
+	if len(feed.Entries[0].Tags) != 2 {
+		t.Fatalf("Incorrect number of Tags, got: %d", len(feed.Entries[0].Tags))
+	}
+
+	expected := "tag 2"
+	result := feed.Entries[0].Tags[1]
+	if result != expected {
+		t.Errorf("Incorrect entry tag, got %q instead of %q", result, expected)
+	}
+}

+ 66 - 0
reader/rss/parser_test.go

@@ -1426,3 +1426,69 @@ func TestEntryDescriptionFromGooglePlayDescription(t *testing.T) {
 		t.Errorf(`Unexpected podcast content, got %q instead of %q`, result, expected)
 	}
 }
+
+// TestParseEntryWithCategoryAndInnerHTML checks that the text of each RSS
+// <category> element of an item becomes an entry tag, in document order.
+func TestParseEntryWithCategoryAndInnerHTML(t *testing.T) {
+	data := `<?xml version="1.0" encoding="utf-8"?>
+		<rss xmlns:atom="http://www.w3.org/2005/Atom" version="2.0">
+		<channel>
+			<title>Example</title>
+			<link>https://example.org/</link>
+			<atom:link href="https://example.org/rss" type="application/rss+xml" rel="self"></atom:link>
+			<item>
+				<title>Test</title>
+				<link>https://example.org/item</link>
+				<category>Category 1</category>
+				<category>Category 2</category>
+			</item>
+		</channel>
+		</rss>`
+
+	feed, err := Parse("https://example.org/", bytes.NewBufferString(data))
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	// Fatal rather than Error: the assertion below indexes into Tags and
+	// would panic with an out-of-range error if the length were wrong.
+	if len(feed.Entries[0].Tags) != 2 {
+		t.Fatalf("Incorrect number of tags, got: %d", len(feed.Entries[0].Tags))
+	}
+
+	expected := "Category 2"
+	result := feed.Entries[0].Tags[1]
+	if result != expected {
+		t.Errorf("Incorrect entry category, got %q instead of %q", result, expected)
+	}
+}
+
+// TestParseEntryWithCategoryAndCDATA checks that an RSS <category> whose
+// content is wrapped in a CDATA section yields the unwrapped text as the
+// entry tag. The item also carries an <author> containing CDATA.
+func TestParseEntryWithCategoryAndCDATA(t *testing.T) {
+	data := `<?xml version="1.0" encoding="utf-8"?>
+		<rss xmlns:atom="http://www.w3.org/2005/Atom" version="2.0">
+		<channel>
+			<title>Example</title>
+			<link>https://example.org/</link>
+			<atom:link href="https://example.org/rss" type="application/rss+xml" rel="self"></atom:link>
+			<item>
+				<title>Test</title>
+				<link>https://example.org/item</link>
+				<author>
+					by <![CDATA[Foo Bar]]>
+				</author>
+				<category><![CDATA[Sample Category]]></category>
+			</item>
+		</channel>
+		</rss>`
+
+	feed, err := Parse("https://example.org/", bytes.NewBufferString(data))
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	// Fatal rather than Error: the assertion below indexes into Tags and
+	// would panic with an out-of-range error if the length were wrong.
+	if len(feed.Entries[0].Tags) != 1 {
+		t.Fatalf("Incorrect number of tags, got: %d", len(feed.Entries[0].Tags))
+	}
+
+	expected := "Sample Category"
+	result := feed.Entries[0].Tags[0]
+	if result != expected {
+		t.Errorf("Incorrect entry category, got %q instead of %q", result, expected)
+	}
+}

+ 23 - 0
reader/rss/rss.go

@@ -156,6 +156,12 @@ type rssEnclosure struct {
 	Length string `xml:"length,attr"`
 }
 
+// rssCategory holds the content decoded from an RSS <category> element,
+// both as character data and as raw inner XML.
+type rssCategory struct {
+	// XMLName receives the element's XML name.
+	XMLName xml.Name
+	// Data receives the element's character data.
+	Data    string `xml:",chardata"`
+	// Inner receives the raw XML nested inside the element.
+	Inner   string `xml:",innerxml"`
+}
+
 func (enclosure *rssEnclosure) Size() int64 {
 	if enclosure.Length == "" {
 		return 0
@@ -173,6 +179,7 @@ type rssItem struct {
 	Authors        []rssAuthor      `xml:"author"`
 	CommentLinks   []rssCommentLink `xml:"comments"`
 	EnclosureLinks []rssEnclosure   `xml:"enclosure"`
+	Categories     []rssCategory    `xml:"category"`
 	DublinCoreElement
 	FeedBurnerElement
 	PodcastEntryElement
@@ -189,6 +196,8 @@ func (r *rssItem) Transform() *model.Entry {
 	entry.Content = r.entryContent()
 	entry.Title = r.entryTitle()
 	entry.Enclosures = r.entryEnclosures()
+	entry.Tags = r.entryCategories()
+
 	return entry
 }
 
@@ -372,6 +381,20 @@ func (r *rssItem) entryEnclosures() model.EnclosureList {
 	return enclosures
 }
 
+// entryCategories returns the names of the item's <category> elements.
+//
+// When the raw inner XML of a category contains a CDATA section, the decoded
+// character data is used; otherwise the raw inner XML is used as-is. Values
+// are trimmed of surrounding whitespace, and blank categories are skipped so
+// that no empty tag is produced. The result is nil when the item has no
+// usable category.
+//
+// NOTE(review): the non-CDATA branch keeps the raw inner XML, so escaped
+// entities such as "&amp;" are presumably left undecoded — confirm whether
+// that is intended.
+func (r *rssItem) entryCategories() []string {
+	var categoryList []string
+
+	// The loop variable is not named rssCategory, to avoid shadowing the type.
+	for _, category := range r.Categories {
+		name := category.Inner
+		if strings.Contains(category.Inner, "<![CDATA[") {
+			name = category.Data
+		}
+
+		name = strings.TrimSpace(name)
+		if name == "" {
+			continue
+		}
+		categoryList = append(categoryList, name)
+	}
+
+	return categoryList
+}
+
 func (r *rssItem) entryCommentsURL() string {
 	for _, commentLink := range r.CommentLinks {
 		if commentLink.XMLName.Space == "" {

+ 21 - 3
storage/entry.go

@@ -119,7 +119,8 @@ func (s *Storage) createEntry(tx *sql.Tx, entry *model.Entry) error {
 				feed_id,
 				reading_time,
 				changed_at,
-				document_vectors
+				document_vectors,
+				tags
 			)
 		VALUES
 			(
@@ -134,7 +135,8 @@ func (s *Storage) createEntry(tx *sql.Tx, entry *model.Entry) error {
 				$9,
 				$10,
 				now(),
-				setweight(to_tsvector(left(coalesce($1, ''), 500000)), 'A') || setweight(to_tsvector(left(coalesce($6, ''), 500000)), 'B')
+				setweight(to_tsvector(left(coalesce($1, ''), 500000)), 'A') || setweight(to_tsvector(left(coalesce($6, ''), 500000)), 'B'),
+				$11
 			)
 		RETURNING
 			id, status
@@ -151,6 +153,7 @@ func (s *Storage) createEntry(tx *sql.Tx, entry *model.Entry) error {
 		entry.UserID,
 		entry.FeedID,
 		entry.ReadingTime,
+		pq.Array(removeDuplicates(entry.Tags)),
 	).Scan(&entry.ID, &entry.Status)
 
 	if err != nil {
@@ -183,7 +186,8 @@ func (s *Storage) updateEntry(tx *sql.Tx, entry *model.Entry) error {
 			content=$4,
 			author=$5,
 			reading_time=$6,
-			document_vectors = setweight(to_tsvector(left(coalesce($1, ''), 500000)), 'A') || setweight(to_tsvector(left(coalesce($4, ''), 500000)), 'B')
+			document_vectors = setweight(to_tsvector(left(coalesce($1, ''), 500000)), 'A') || setweight(to_tsvector(left(coalesce($4, ''), 500000)), 'B'),
+			tags=$10
 		WHERE
 			user_id=$7 AND feed_id=$8 AND hash=$9
 		RETURNING
@@ -200,6 +204,7 @@ func (s *Storage) updateEntry(tx *sql.Tx, entry *model.Entry) error {
 		entry.UserID,
 		entry.FeedID,
 		entry.Hash,
+		pq.Array(removeDuplicates(entry.Tags)),
 	).Scan(&entry.ID)
 
 	if err != nil {
@@ -535,3 +540,16 @@ func (s *Storage) UnshareEntry(userID int64, entryID int64) (err error) {
 	}
 	return
 }
+
+// removeDuplicates returns a new slice holding the distinct items of
+// sliceList, in order of first appearance. The input slice is not modified.
+//
+// The result is never nil: a nil or empty input yields an empty, non-nil
+// slice.
+// NOTE(review): callers in this file pass the result to pq.Array; a nil
+// slice would presumably be stored as NULL instead of an empty array —
+// confirm before changing this.
+func removeDuplicates[T comparable](sliceList []T) []T {
+	seen := make(map[T]struct{}, len(sliceList))
+	list := make([]T, 0, len(sliceList))
+	for _, item := range sliceList {
+		if _, found := seen[item]; found {
+			continue
+		}
+		seen[item] = struct{}{}
+		list = append(list, item)
+	}
+	return list
+}

+ 13 - 0
storage/entry_query_builder.go

@@ -135,6 +135,17 @@ func (e *EntryQueryBuilder) WithStatuses(statuses []string) *EntryQueryBuilder {
 	return e
 }
 
+// WithTags filters entries by a list of tags. Each tag appends its own
+// "$N = ANY(e.tags)" condition, with the tag bound as the matching query
+// argument. An empty or nil list adds nothing.
+func (e *EntryQueryBuilder) WithTags(tags []string) *EntryQueryBuilder {
+	for _, tag := range tags {
+		// The placeholder index is the position the tag takes in e.args.
+		placeholder := len(e.args) + 1
+		e.args = append(e.args, tag)
+		e.conditions = append(e.conditions, fmt.Sprintf("$%d = ANY(e.tags)", placeholder))
+	}
+	return e
+}
+
 // WithoutStatus set the entry status that should not be returned.
 func (e *EntryQueryBuilder) WithoutStatus(status string) *EntryQueryBuilder {
 	if status != "" {
@@ -250,6 +261,7 @@ func (e *EntryQueryBuilder) GetEntries() (model.Entries, error) {
 			e.reading_time,
 			e.created_at,
 			e.changed_at,
+			e.tags,
 			f.title as feed_title,
 			f.feed_url,
 			f.site_url,
@@ -312,6 +324,7 @@ func (e *EntryQueryBuilder) GetEntries() (model.Entries, error) {
 			&entry.ReadingTime,
 			&entry.CreatedAt,
 			&entry.ChangedAt,
+			pq.Array(&entry.Tags),
 			&entry.Feed.Title,
 			&entry.Feed.FeedURL,
 			&entry.Feed.SiteURL,