Przeglądaj źródła

Refactor Atom parser to use an adapter

Frédéric Guillot 2 lat temu
rodzic
commit
dd4fb660c1

+ 95 - 139
internal/reader/atom/atom_03.go

@@ -6,158 +6,114 @@ package atom // import "miniflux.app/v2/internal/reader/atom"
 import (
 	"encoding/base64"
 	"html"
-	"log/slog"
 	"strings"
-	"time"
-
-	"miniflux.app/v2/internal/crypto"
-	"miniflux.app/v2/internal/model"
-	"miniflux.app/v2/internal/reader/date"
-	"miniflux.app/v2/internal/reader/sanitizer"
-	"miniflux.app/v2/internal/urllib"
 )
 
 // Specs: http://web.archive.org/web/20060811235523/http://www.mnot.net/drafts/draft-nottingham-atom-format-02.html
-type atom03Feed struct {
-	ID      string        `xml:"id"`
-	Title   atom03Text    `xml:"title"`
-	Author  atomPerson    `xml:"author"`
-	Links   atomLinks     `xml:"link"`
-	Entries []atom03Entry `xml:"entry"`
+type Atom03Feed struct {
+	Version string `xml:"version,attr"`
+
+	// The "atom:id" element's content conveys a permanent, globally unique identifier for the feed.
+	// It MUST NOT change over time, even if the feed is relocated. atom:feed elements MAY contain an atom:id element,
+	// but MUST NOT contain more than one. The content of this element, when present, MUST be a URI.
+	ID string `xml:"http://purl.org/atom/ns# id"`
+
+	// The "atom:title" element is a Content construct that conveys a human-readable title for the feed.
+	// atom:feed elements MUST contain exactly one atom:title element.
+	// If the feed describes a Web resource, its content SHOULD be the same as that resource's title.
+	Title Atom03Content `xml:"http://purl.org/atom/ns# title"`
+
+	// The "atom:link" element is a Link construct that conveys a URI associated with the feed.
+	// The nature of the relationship as well as the link itself is determined by the element's content.
+	// atom:feed elements MUST contain at least one atom:link element with a rel attribute value of "alternate".
+	// atom:feed elements MUST NOT contain more than one atom:link element with a rel attribute value of "alternate" that has the same type attribute value.
+	// atom:feed elements MAY contain additional atom:link elements beyond those described above.
+	Links AtomLinks `xml:"http://purl.org/atom/ns# link"`
+
+	// The "atom:author" element is a Person construct that indicates the default author of the feed.
+	// atom:feed elements MUST contain exactly one atom:author element,
+	// UNLESS all of the atom:feed element's child atom:entry elements contain an atom:author element.
+	// atom:feed elements MUST NOT contain more than one atom:author element.
+	Author AtomPerson `xml:"http://purl.org/atom/ns# author"`
+
+	// The "atom:entry" element represents an individual entry that is contained by the feed.
+	// atom:feed elements MAY contain one or more atom:entry elements.
+	Entries []Atom03Entry `xml:"http://purl.org/atom/ns# entry"`
 }
 
-func (a *atom03Feed) Transform(baseURL string) *model.Feed {
-	var err error
-
-	feed := new(model.Feed)
-
-	feedURL := a.Links.firstLinkWithRelation("self")
-	feed.FeedURL, err = urllib.AbsoluteURL(baseURL, feedURL)
-	if err != nil {
-		feed.FeedURL = feedURL
-	}
-
-	siteURL := a.Links.originalLink()
-	feed.SiteURL, err = urllib.AbsoluteURL(baseURL, siteURL)
-	if err != nil {
-		feed.SiteURL = siteURL
-	}
-
-	feed.Title = a.Title.String()
-	if feed.Title == "" {
-		feed.Title = feed.SiteURL
-	}
-
-	for _, entry := range a.Entries {
-		item := entry.Transform()
-		entryURL, err := urllib.AbsoluteURL(feed.SiteURL, item.URL)
-		if err == nil {
-			item.URL = entryURL
-		}
-
-		if item.Author == "" {
-			item.Author = a.Author.String()
-		}
-
-		if item.Title == "" {
-			item.Title = sanitizer.TruncateHTML(item.Content, 100)
-		}
-
-		if item.Title == "" {
-			item.Title = item.URL
-		}
-
-		feed.Entries = append(feed.Entries, item)
-	}
-
-	return feed
+type Atom03Entry struct {
+	// The "atom:id" element's content conveys a permanent, globally unique identifier for the entry.
+	// It MUST NOT change over time, even if other representations of the entry (such as a web representation pointed to by the entry's atom:link element) are relocated.
+	// If the same entry is syndicated in two atom:feeds published by the same entity, the entry's atom:id MUST be the same in both feeds.
+	ID string `xml:"id"`
+
+	// The "atom:title" element is a Content construct that conveys a human-readable title for the entry.
+	// atom:entry elements MUST have exactly one "atom:title" element.
+	// If an entry describes a Web resource, its content SHOULD be the same as that resource's title.
+	Title Atom03Content `xml:"title"`
+
+	// The "atom:modified" element is a Date construct that indicates the time that the entry was last modified.
+	// atom:entry elements MUST contain an atom:modified element, but MUST NOT contain more than one.
+	// The content of an atom:modified element MUST have a time zone whose value SHOULD be "UTC".
+	Modified string `xml:"modified"`
+
+	// The "atom:issued" element is a Date construct that indicates the time that the entry was issued.
+	// atom:entry elements MUST contain an atom:issued element, but MUST NOT contain more than one.
+	// The content of an atom:issued element MAY omit a time zone.
+	Issued string `xml:"issued"`
+
+	// The "atom:created" element is a Date construct that indicates the time that the entry was created.
+	// atom:entry elements MAY contain an atom:created element, but MUST NOT contain more than one.
+	// The content of an atom:created element MUST have a time zone whose value SHOULD be "UTC".
+	// If atom:created is not present, its content MUST be considered to be the same as that of atom:modified.
+	Created string `xml:"created"`
+
+	// The "atom:link" element is a Link construct that conveys a URI associated with the entry.
+	// The nature of the relationship as well as the link itself is determined by the element's content.
+	// atom:entry elements MUST contain at least one atom:link element with a rel attribute value of "alternate".
+	// atom:entry elements MUST NOT contain more than one atom:link element with a rel attribute value of "alternate" that has the same type attribute value.
+	// atom:entry elements MAY contain additional atom:link elements beyond those described above.
+	Links AtomLinks `xml:"link"`
+
+	// The "atom:summary" element is a Content construct that conveys a short summary, abstract or excerpt of the entry.
+	// atom:entry elements MAY contain an atom:summary element, but MUST NOT contain more than one.
+	Summary Atom03Content `xml:"summary"`
+
+	// The "atom:content" element is a Content construct that conveys the content of the entry.
+	// atom:entry elements MAY contain one or more atom:content elements.
+	Content Atom03Content `xml:"content"`
+
+	// The "atom:author" element is a Person construct that indicates the default author of the entry.
+	// atom:entry elements MUST contain exactly one atom:author element,
+	// UNLESS the atom:feed element containing them contains an atom:author element itself.
+	// atom:entry elements MUST NOT contain more than one atom:author element.
+	Author AtomPerson `xml:"author"`
 }
 
-type atom03Entry struct {
-	ID       string     `xml:"id"`
-	Title    atom03Text `xml:"title"`
-	Modified string     `xml:"modified"`
-	Issued   string     `xml:"issued"`
-	Created  string     `xml:"created"`
-	Links    atomLinks  `xml:"link"`
-	Summary  atom03Text `xml:"summary"`
-	Content  atom03Text `xml:"content"`
-	Author   atomPerson `xml:"author"`
-}
-
-func (a *atom03Entry) Transform() *model.Entry {
-	entry := model.NewEntry()
-	entry.URL = a.Links.originalLink()
-	entry.Date = a.entryDate()
-	entry.Author = a.Author.String()
-	entry.Hash = a.entryHash()
-	entry.Content = a.entryContent()
-	entry.Title = a.entryTitle()
-	return entry
-}
-
-func (a *atom03Entry) entryTitle() string {
-	return sanitizer.StripTags(a.Title.String())
-}
-
-func (a *atom03Entry) entryContent() string {
-	content := a.Content.String()
-	if content != "" {
-		return content
-	}
-
-	summary := a.Summary.String()
-	if summary != "" {
-		return summary
-	}
-
-	return ""
-}
-
-func (a *atom03Entry) entryDate() time.Time {
-	dateText := ""
-	for _, value := range []string{a.Issued, a.Modified, a.Created} {
-		if value != "" {
-			dateText = value
-			break
-		}
-	}
-
-	if dateText != "" {
-		result, err := date.Parse(dateText)
-		if err != nil {
-			slog.Debug("Unable to parse date from Atom 0.3 feed",
-				slog.String("date", dateText),
-				slog.String("id", a.ID),
-				slog.Any("error", err),
-			)
-			return time.Now()
-		}
-
-		return result
-	}
-
-	return time.Now()
-}
-
-func (a *atom03Entry) entryHash() string {
-	for _, value := range []string{a.ID, a.Links.originalLink()} {
-		if value != "" {
-			return crypto.Hash(value)
-		}
-	}
-
-	return ""
-}
+type Atom03Content struct {
+	// Content constructs MAY have a "type" attribute, whose value indicates the media type of the content.
+	// When present, this attribute's value MUST be a registered media type [RFC2045].
+	// If not present, its value MUST be considered to be "text/plain".
+	Type string `xml:"type,attr"`
+
+	// Content constructs MAY have a "mode" attribute, whose value indicates the method used to encode the content.
+	// When present, this attribute's value MUST be listed below.
+	// If not present, its value MUST be considered to be "xml".
+	//
+	// "xml": A mode attribute with the value "xml" indicates that the element's content is inline xml (for example, namespace-qualified XHTML).
+	//
+	// "escaped": A mode attribute with the value "escaped" indicates that the element's content is an escaped string.
+	// Processors MUST unescape the element's content before considering it as content of the indicated media type.
+	//
+	// "base64": A mode attribute with the value "base64" indicates that the element's content is base64-encoded [RFC2045].
+	// Processors MUST decode the element's content before considering it as content of the indicated media type.
+	Mode string `xml:"mode,attr"`
 
-type atom03Text struct {
-	Type     string `xml:"type,attr"`
-	Mode     string `xml:"mode,attr"`
 	CharData string `xml:",chardata"`
 	InnerXML string `xml:",innerxml"`
 }
 
-func (a *atom03Text) String() string {
+func (a *Atom03Content) Content() string {
 	content := ""
 
 	switch {

+ 115 - 0
internal/reader/atom/atom_03_adapter.go

@@ -0,0 +1,115 @@
+// SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+package atom // import "miniflux.app/v2/internal/reader/atom"
+
+import (
+	"log/slog"
+	"time"
+
+	"miniflux.app/v2/internal/crypto"
+	"miniflux.app/v2/internal/model"
+	"miniflux.app/v2/internal/reader/date"
+	"miniflux.app/v2/internal/reader/sanitizer"
+	"miniflux.app/v2/internal/urllib"
+)
+
+type Atom03Adapter struct {
+	atomFeed *Atom03Feed
+}
+
+func NewAtom03Adapter(atomFeed *Atom03Feed) *Atom03Adapter {
+	return &Atom03Adapter{atomFeed}
+}
+
+func (a *Atom03Adapter) BuildFeed(baseURL string) *model.Feed {
+	feed := new(model.Feed)
+
+	// Populate the feed URL.
+	feedURL := a.atomFeed.Links.firstLinkWithRelation("self")
+	if feedURL != "" {
+		if absoluteFeedURL, err := urllib.AbsoluteURL(baseURL, feedURL); err == nil {
+			feed.FeedURL = absoluteFeedURL
+		}
+	} else {
+		feed.FeedURL = baseURL
+	}
+
+	// Populate the site URL.
+	siteURL := a.atomFeed.Links.OriginalLink()
+	if siteURL != "" {
+		if absoluteSiteURL, err := urllib.AbsoluteURL(baseURL, siteURL); err == nil {
+			feed.SiteURL = absoluteSiteURL
+		}
+	} else {
+		feed.SiteURL = baseURL
+	}
+
+	// Populate the feed title.
+	feed.Title = a.atomFeed.Title.Content()
+	if feed.Title == "" {
+		feed.Title = feed.SiteURL
+	}
+
+	for _, atomEntry := range a.atomFeed.Entries {
+		entry := model.NewEntry()
+
+		// Populate the entry URL.
+		entry.URL = atomEntry.Links.OriginalLink()
+		if entry.URL != "" {
+			if absoluteEntryURL, err := urllib.AbsoluteURL(feed.SiteURL, entry.URL); err == nil {
+				entry.URL = absoluteEntryURL
+			}
+		}
+
+		// Populate the entry content.
+		entry.Content = atomEntry.Content.Content()
+		if entry.Content == "" {
+			entry.Content = atomEntry.Summary.Content()
+		}
+
+		// Populate the entry title.
+		entry.Title = atomEntry.Title.Content()
+		if entry.Title == "" {
+			entry.Title = sanitizer.TruncateHTML(entry.Content, 100)
+		}
+		if entry.Title == "" {
+			entry.Title = entry.URL
+		}
+
+		// Populate the entry author.
+		entry.Author = atomEntry.Author.PersonName()
+		if entry.Author == "" {
+			entry.Author = a.atomFeed.Author.PersonName()
+		}
+
+		// Populate the entry date.
+		for _, value := range []string{atomEntry.Issued, atomEntry.Modified, atomEntry.Created} {
+			if parsedDate, err := date.Parse(value); err == nil {
+				entry.Date = parsedDate
+				break
+			} else {
+				slog.Debug("Unable to parse date from Atom 0.3 feed",
+					slog.String("date", value),
+					slog.String("id", atomEntry.ID),
+					slog.Any("error", err),
+				)
+			}
+		}
+		if entry.Date.IsZero() {
+			entry.Date = time.Now()
+		}
+
+		// Generate the entry hash.
+		for _, value := range []string{atomEntry.ID, atomEntry.Links.OriginalLink()} {
+			if value != "" {
+				entry.Hash = crypto.Hash(value)
+				break
+			}
+		}
+
+		feed.Entries = append(feed.Entries, entry)
+	}
+
+	return feed
+}

+ 24 - 2
internal/reader/atom/atom_03_test.go

@@ -27,7 +27,7 @@ func TestParseAtom03(t *testing.T) {
 		</entry>
 	</feed>`
 
-	feed, err := Parse("http://diveintomark.org/", bytes.NewReader([]byte(data)), "0.3")
+	feed, err := Parse("http://diveintomark.org/atom.xml", bytes.NewReader([]byte(data)), "0.3")
 	if err != nil {
 		t.Fatal(err)
 	}
@@ -36,7 +36,7 @@ func TestParseAtom03(t *testing.T) {
 		t.Errorf("Incorrect title, got: %s", feed.Title)
 	}
 
-	if feed.FeedURL != "http://diveintomark.org/" {
+	if feed.FeedURL != "http://diveintomark.org/atom.xml" {
 		t.Errorf("Incorrect feed URL, got: %s", feed.FeedURL)
 	}
 
@@ -74,6 +74,28 @@ func TestParseAtom03(t *testing.T) {
 	}
 }
 
+func TestParseAtom03WithoutSiteURL(t *testing.T) {
+	data := `<?xml version="1.0" encoding="utf-8"?>
+	<feed version="0.3" xmlns="http://purl.org/atom/ns#">
+		<modified>2003-12-13T18:30:02Z</modified>
+		<author><name>Mark Pilgrim</name></author>
+		<entry>
+			<title>Atom 0.3 snapshot</title>
+			<link rel="alternate" type="text/html" href="http://diveintomark.org/2003/12/13/atom03"/>
+			<id>tag:diveintomark.org,2003:3.2397</id>
+		</entry>
+	</feed>`
+
+	feed, err := Parse("http://diveintomark.org/atom.xml", bytes.NewReader([]byte(data)), "0.3")
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	if feed.SiteURL != "http://diveintomark.org/atom.xml" {
+		t.Errorf("Incorrect title, got: %s", feed.Title)
+	}
+}
+
 func TestParseAtom03WithoutFeedTitle(t *testing.T) {
 	data := `<?xml version="1.0" encoding="utf-8"?>
 	<feed version="0.3" xmlns="http://purl.org/atom/ns#">

+ 163 - 250
internal/reader/atom/atom_10.go

@@ -6,286 +6,199 @@ package atom // import "miniflux.app/v2/internal/reader/atom"
 import (
 	"encoding/xml"
 	"html"
-	"log/slog"
-	"strconv"
 	"strings"
-	"time"
 
-	"miniflux.app/v2/internal/crypto"
-	"miniflux.app/v2/internal/model"
-	"miniflux.app/v2/internal/reader/date"
 	"miniflux.app/v2/internal/reader/media"
 	"miniflux.app/v2/internal/reader/sanitizer"
-	"miniflux.app/v2/internal/urllib"
 )
 
+// The "atom:feed" element is the document (i.e., top-level) element of
+// an Atom Feed Document, acting as a container for metadata and data
+// associated with the feed. Its element children consist of metadata
+// elements followed by zero or more atom:entry child elements.
+//
 // Specs:
 // https://tools.ietf.org/html/rfc4287
 // https://validator.w3.org/feed/docs/atom.html
-type atom10Feed struct {
-	XMLName xml.Name      `xml:"http://www.w3.org/2005/Atom feed"`
-	ID      string        `xml:"id"`
-	Title   atom10Text    `xml:"title"`
-	Authors atomAuthors   `xml:"author"`
-	Icon    string        `xml:"icon"`
-	Links   atomLinks     `xml:"link"`
-	Entries []atom10Entry `xml:"entry"`
+type Atom10Feed struct {
+	XMLName xml.Name `xml:"http://www.w3.org/2005/Atom feed"`
+
+	// The "atom:id" element conveys a permanent, universally unique
+	// identifier for an entry or feed.
+	//
+	// Its content MUST be an IRI, as defined by [RFC3987].  Note that the
+	// definition of "IRI" excludes relative references.  Though the IRI
+	// might use a dereferencable scheme, Atom Processors MUST NOT assume it
+	// can be dereferenced.
+	//
+	// atom:feed elements MUST contain exactly one atom:id element.
+	ID string `xml:"http://www.w3.org/2005/Atom id"`
+
+	// The "atom:title" element is a Text construct that conveys a human-
+	// readable title for an entry or feed.
+	//
+	// atom:feed elements MUST contain exactly one atom:title element.
+	Title Atom10Text `xml:"http://www.w3.org/2005/Atom title"`
+
+	// The "atom:author" element is a Person construct that indicates the
+	// author of the entry or feed.
+	//
+	// atom:feed elements MUST contain one or more atom:author elements,
+	// unless all of the atom:feed element's child atom:entry elements
+	// contain at least one atom:author element.
+	Authors AtomPersons `xml:"http://www.w3.org/2005/Atom author"`
+
+	// The "atom:icon" element's content is an IRI reference [RFC3987] that
+	// identifies an image that provides iconic visual identification for a
+	// feed.
+	//
+	// atom:feed elements MUST NOT contain more than one atom:icon element.
+	Icon string `xml:"http://www.w3.org/2005/Atom icon"`
+
+	// The "atom:logo" element's content is an IRI reference [RFC3987] that
+	// identifies an image that provides visual identification for a feed.
+	//
+	// atom:feed elements MUST NOT contain more than one atom:logo element.
+	Logo string `xml:"http://www.w3.org/2005/Atom logo"`
+
+	// atom:feed elements SHOULD contain one atom:link element with a rel
+	// attribute value of "self". This is the preferred URI for
+	// retrieving Atom Feed Documents representing this Atom feed.
+	//
+	// atom:feed elements MUST NOT contain more than one atom:link
+	// element with a rel attribute value of "alternate" that has the
+	// same combination of type and hreflang attribute values.
+	Links AtomLinks `xml:"http://www.w3.org/2005/Atom link"`
+
+	// The "atom:category" element conveys information about a category
+	// associated with an entry or feed.  This specification assigns no
+	// meaning to the content (if any) of this element.
+	//
+	// atom:feed elements MAY contain any number of atom:category
+	// elements.
+	Categories AtomCategories `xml:"http://www.w3.org/2005/Atom category"`
+
+	Entries []Atom10Entry `xml:"http://www.w3.org/2005/Atom entry"`
 }
 
-func (a *atom10Feed) Transform(baseURL string) *model.Feed {
-	var err error
+type Atom10Entry struct {
+	// The "atom:id" element conveys a permanent, universally unique
+	// identifier for an entry or feed.
+	//
+	// Its content MUST be an IRI, as defined by [RFC3987].  Note that the
+	// definition of "IRI" excludes relative references.  Though the IRI
+	// might use a dereferencable scheme, Atom Processors MUST NOT assume it
+	// can be dereferenced.
+	//
+	// atom:entry elements MUST contain exactly one atom:id element.
+	ID string `xml:"http://www.w3.org/2005/Atom id"`
+
+	// The "atom:title" element is a Text construct that conveys a human-
+	// readable title for an entry or feed.
+	//
+	// atom:entry elements MUST contain exactly one atom:title element.
+	Title Atom10Text `xml:"http://www.w3.org/2005/Atom title"`
+
+	// The "atom:published" element is a Date construct indicating an
+	// instant in time associated with an event early in the life cycle of
+	// the entry.
+	Published string `xml:"http://www.w3.org/2005/Atom published"`
+
+	// The "atom:updated" element is a Date construct indicating the most
+	// recent instant in time when an entry or feed was modified in a way
+	// the publisher considers significant. Therefore, not all
+	// modifications necessarily result in a changed atom:updated value.
+	//
+	// atom:entry elements MUST contain exactly one atom:updated element.
+	Updated string `xml:"http://www.w3.org/2005/Atom updated"`
+
+	// atom:entry elements MUST NOT contain more than one atom:link
+	// element with a rel attribute value of "alternate" that has the
+	// same combination of type and hreflang attribute values.
+	Links AtomLinks `xml:"http://www.w3.org/2005/Atom link"`
+
+	// atom:entry elements MUST contain an atom:summary element in either
+	// of the following cases:
+	// *  the atom:entry contains an atom:content that has a "src"
+	//    attribute (and is thus empty).
+	// *  the atom:entry contains content that is encoded in Base64;
+	//    i.e., the "type" attribute of atom:content is a MIME media type
+	//    [MIMEREG], but is not an XML media type [RFC3023], does not
+	//    begin with "text/", and does not end with "/xml" or "+xml".
+	//
+	// atom:entry elements MUST NOT contain more than one atom:summary
+	// element.
+	Summary Atom10Text `xml:"http://www.w3.org/2005/Atom summary"`
+
+	// atom:entry elements MUST NOT contain more than one atom:content
+	// element.
+	Content Atom10Text `xml:"http://www.w3.org/2005/Atom content"`
+
+	// The "atom:author" element is a Person construct that indicates the
+	// author of the entry or feed.
+	//
+	// atom:entry elements MUST contain one or more atom:author elements, unless the entry's atom:source or the enclosing atom:feed contains an atom:author element.
+	Authors AtomPersons `xml:"http://www.w3.org/2005/Atom author"`
+
+	// The "atom:category" element conveys information about a category
+	// associated with an entry or feed.  This specification assigns no
+	// meaning to the content (if any) of this element.
+	//
+	// atom:entry elements MAY contain any number of atom:category
+	// elements.
+	Categories AtomCategories `xml:"http://www.w3.org/2005/Atom category"`
 
-	feed := new(model.Feed)
-
-	feedURL := a.Links.firstLinkWithRelation("self")
-	feed.FeedURL, err = urllib.AbsoluteURL(baseURL, feedURL)
-	if err != nil {
-		feed.FeedURL = feedURL
-	}
-
-	siteURL := a.Links.originalLink()
-	feed.SiteURL, err = urllib.AbsoluteURL(baseURL, siteURL)
-	if err != nil {
-		feed.SiteURL = siteURL
-	}
-
-	feed.Title = html.UnescapeString(a.Title.String())
-	if feed.Title == "" {
-		feed.Title = feed.SiteURL
-	}
-
-	feed.IconURL = strings.TrimSpace(a.Icon)
-
-	for _, entry := range a.Entries {
-		item := entry.Transform()
-		entryURL, err := urllib.AbsoluteURL(feed.SiteURL, item.URL)
-		if err == nil {
-			item.URL = entryURL
-		}
-
-		if item.Author == "" {
-			item.Author = a.Authors.String()
-		}
-
-		if item.Title == "" {
-			item.Title = sanitizer.TruncateHTML(item.Content, 100)
-		}
-
-		if item.Title == "" {
-			item.Title = item.URL
-		}
-
-		feed.Entries = append(feed.Entries, item)
-	}
-
-	return feed
-}
-
-type atom10Entry struct {
-	ID         string           `xml:"id"`
-	Title      atom10Text       `xml:"title"`
-	Published  string           `xml:"published"`
-	Updated    string           `xml:"updated"`
-	Links      atomLinks        `xml:"link"`
-	Summary    atom10Text       `xml:"summary"`
-	Content    atom10Text       `xml:"http://www.w3.org/2005/Atom content"`
-	Authors    atomAuthors      `xml:"author"`
-	Categories []atom10Category `xml:"category"`
 	media.MediaItemElement
 }
 
-func (a *atom10Entry) Transform() *model.Entry {
-	entry := model.NewEntry()
-	entry.URL = a.Links.originalLink()
-	entry.Date = a.entryDate()
-	entry.Author = a.Authors.String()
-	entry.Hash = a.entryHash()
-	entry.Content = a.entryContent()
-	entry.Title = a.entryTitle()
-	entry.Enclosures = a.entryEnclosures()
-	entry.CommentsURL = a.entryCommentsURL()
-	entry.Tags = a.entryCategories()
-	return entry
-}
-
-func (a *atom10Entry) entryTitle() string {
-	return html.UnescapeString(a.Title.String())
-}
-
-func (a *atom10Entry) entryContent() string {
-	content := a.Content.String()
-	if content != "" {
-		return content
-	}
-
-	summary := a.Summary.String()
-	if summary != "" {
-		return summary
-	}
-
-	mediaDescription := a.FirstMediaDescription()
-	if mediaDescription != "" {
-		return mediaDescription
-	}
-
-	return ""
-}
-
-// Note: The published date represents the original creation date for YouTube feeds.
-// Example:
-// <published>2019-01-26T08:02:28+00:00</published>
-// <updated>2019-01-29T07:27:27+00:00</updated>
-func (a *atom10Entry) entryDate() time.Time {
-	dateText := a.Published
-	if dateText == "" {
-		dateText = a.Updated
-	}
-
-	if dateText != "" {
-		result, err := date.Parse(dateText)
-		if err != nil {
-			slog.Debug("Unable to parse date from Atom 0.3 feed",
-				slog.String("date", dateText),
-				slog.String("id", a.ID),
-				slog.Any("error", err),
-			)
-			return time.Now()
-		}
-
-		return result
-	}
-
-	return time.Now()
-}
-
-func (a *atom10Entry) entryHash() string {
-	for _, value := range []string{a.ID, a.Links.originalLink()} {
-		if value != "" {
-			return crypto.Hash(value)
-		}
-	}
-
-	return ""
+// A Text construct contains human-readable text, usually in small
+// quantities. The content of Text constructs is Language-Sensitive.
+// Specs: https://datatracker.ietf.org/doc/html/rfc4287#section-3.1
+// Text: https://datatracker.ietf.org/doc/html/rfc4287#section-3.1.1.1
+// HTML: https://datatracker.ietf.org/doc/html/rfc4287#section-3.1.1.2
+// XHTML: https://datatracker.ietf.org/doc/html/rfc4287#section-3.1.1.3
+type Atom10Text struct {
+	Type             string               `xml:"type,attr"`
+	CharData         string               `xml:",chardata"`
+	InnerXML         string               `xml:",innerxml"`
+	XHTMLRootElement AtomXHTMLRootElement `xml:"http://www.w3.org/1999/xhtml div"`
 }
 
-func (a *atom10Entry) entryEnclosures() model.EnclosureList {
-	enclosures := make(model.EnclosureList, 0)
-	duplicates := make(map[string]bool)
-
-	for _, mediaThumbnail := range a.AllMediaThumbnails() {
-		if _, found := duplicates[mediaThumbnail.URL]; !found {
-			duplicates[mediaThumbnail.URL] = true
-			enclosures = append(enclosures, &model.Enclosure{
-				URL:      mediaThumbnail.URL,
-				MimeType: mediaThumbnail.MimeType(),
-				Size:     mediaThumbnail.Size(),
-			})
-		}
-	}
-
-	for _, link := range a.Links {
-		if strings.EqualFold(link.Rel, "enclosure") {
-			if link.URL == "" {
-				continue
-			}
-
-			if _, found := duplicates[link.URL]; !found {
-				duplicates[link.URL] = true
-				length, _ := strconv.ParseInt(link.Length, 10, 0)
-				enclosures = append(enclosures, &model.Enclosure{URL: link.URL, MimeType: link.Type, Size: length})
-			}
-		}
-	}
-
-	for _, mediaContent := range a.AllMediaContents() {
-		if _, found := duplicates[mediaContent.URL]; !found {
-			duplicates[mediaContent.URL] = true
-			enclosures = append(enclosures, &model.Enclosure{
-				URL:      mediaContent.URL,
-				MimeType: mediaContent.MimeType(),
-				Size:     mediaContent.Size(),
-			})
-		}
-	}
+func (a *Atom10Text) Body() string {
+	var content string
 
-	for _, mediaPeerLink := range a.AllMediaPeerLinks() {
-		if _, found := duplicates[mediaPeerLink.URL]; !found {
-			duplicates[mediaPeerLink.URL] = true
-			enclosures = append(enclosures, &model.Enclosure{
-				URL:      mediaPeerLink.URL,
-				MimeType: mediaPeerLink.MimeType(),
-				Size:     mediaPeerLink.Size(),
-			})
-		}
+	if strings.EqualFold(a.Type, "xhtml") {
+		content = a.xhtmlContent()
+	} else {
+		content = a.CharData
 	}
 
-	return enclosures
+	return strings.TrimSpace(content)
 }
 
-func (r *atom10Entry) entryCategories() []string {
-	categoryList := make([]string, 0)
-
-	for _, atomCategory := range r.Categories {
-		if strings.TrimSpace(atomCategory.Label) != "" {
-			categoryList = append(categoryList, strings.TrimSpace(atomCategory.Label))
-		} else {
-			categoryList = append(categoryList, strings.TrimSpace(atomCategory.Term))
-		}
-	}
-
-	return categoryList
-}
+func (a *Atom10Text) Title() string {
+	var content string
 
-// See https://tools.ietf.org/html/rfc4685#section-4
-// If the type attribute of the atom:link is omitted, its value is assumed to be "application/atom+xml".
-// We accept only HTML or XHTML documents for now since the intention is to have the same behavior as RSS.
-func (a *atom10Entry) entryCommentsURL() string {
-	commentsURL := a.Links.firstLinkWithRelationAndType("replies", "text/html", "application/xhtml+xml")
-	if urllib.IsAbsoluteURL(commentsURL) {
-		return commentsURL
+	if strings.EqualFold(a.Type, "xhtml") {
+		content = a.xhtmlContent()
+	} else if strings.Contains(a.InnerXML, "<![CDATA[") {
+		content = html.UnescapeString(a.CharData)
+	} else {
+		content = a.CharData
 	}
-	return ""
-}
-
-type atom10Text struct {
-	Type             string               `xml:"type,attr"`
-	CharData         string               `xml:",chardata"`
-	InnerXML         string               `xml:",innerxml"`
-	XHTMLRootElement atomXHTMLRootElement `xml:"http://www.w3.org/1999/xhtml div"`
-}
 
-type atom10Category struct {
-	Term  string `xml:"term,attr"`
-	Label string `xml:"label,attr"`
+	content = sanitizer.StripTags(content)
+	return strings.TrimSpace(content)
 }
 
-// Text: https://datatracker.ietf.org/doc/html/rfc4287#section-3.1.1.1
-// HTML: https://datatracker.ietf.org/doc/html/rfc4287#section-3.1.1.2
-// XHTML: https://datatracker.ietf.org/doc/html/rfc4287#section-3.1.1.3
-func (a *atom10Text) String() string {
-	var content string
-	switch {
-	case a.Type == "", a.Type == "text", a.Type == "text/plain":
-		if strings.HasPrefix(strings.TrimSpace(a.InnerXML), `<![CDATA[`) {
-			content = html.EscapeString(a.CharData)
-		} else {
-			content = a.InnerXML
-		}
-	case a.Type == "xhtml":
-		var root = a.XHTMLRootElement
-		if root.XMLName.Local == "div" {
-			content = root.InnerXML
-		} else {
-			content = a.InnerXML
-		}
-	default:
-		content = a.CharData
+func (a *Atom10Text) xhtmlContent() string {
+	if a.XHTMLRootElement.XMLName.Local == "div" {
+		return a.XHTMLRootElement.InnerXML
 	}
-
-	return strings.TrimSpace(content)
+	return a.InnerXML
 }
 
-type atomXHTMLRootElement struct {
+type AtomXHTMLRootElement struct {
 	XMLName  xml.Name `xml:"div"`
 	InnerXML string   `xml:",innerxml"`
 }

+ 210 - 0
internal/reader/atom/atom_10_adapter.go

@@ -0,0 +1,210 @@
+// SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0
+
+package atom // import "miniflux.app/v2/internal/reader/atom"
+
+import (
+	"log/slog"
+	"slices"
+	"sort"
+	"strconv"
+	"strings"
+	"time"
+
+	"miniflux.app/v2/internal/crypto"
+	"miniflux.app/v2/internal/model"
+	"miniflux.app/v2/internal/reader/date"
+	"miniflux.app/v2/internal/reader/sanitizer"
+	"miniflux.app/v2/internal/urllib"
+)
+
+// Atom10Adapter wraps a parsed Atom10Feed so that it can be converted into a model.Feed.
+type Atom10Adapter struct {
+	atomFeed *Atom10Feed
+}
+
+// NewAtom10Adapter returns an adapter around the given parsed Atom 1.0 feed.
+func NewAtom10Adapter(atomFeed *Atom10Feed) *Atom10Adapter {
+	adapter := Atom10Adapter{atomFeed: atomFeed}
+	return &adapter
+}
+
+// BuildFeed converts the wrapped Atom 1.0 document into a model.Feed.
+//
+// baseURL is used to resolve relative feed and site links, and is the fallback
+// value for the feed URL and the site URL when the document declares no "self"
+// or alternate link. Entry links and the feed icon are resolved against the
+// resulting site URL.
+func (a *Atom10Adapter) BuildFeed(baseURL string) *model.Feed {
+	feed := new(model.Feed)
+
+	// Populate the feed URL.
+	feedURL := a.atomFeed.Links.firstLinkWithRelation("self")
+	if feedURL != "" {
+		if absoluteFeedURL, err := urllib.AbsoluteURL(baseURL, feedURL); err == nil {
+			feed.FeedURL = absoluteFeedURL
+		}
+	} else {
+		feed.FeedURL = baseURL
+	}
+
+	// Populate the site URL.
+	siteURL := a.atomFeed.Links.OriginalLink()
+	if siteURL != "" {
+		if absoluteSiteURL, err := urllib.AbsoluteURL(baseURL, siteURL); err == nil {
+			feed.SiteURL = absoluteSiteURL
+		}
+	} else {
+		feed.SiteURL = baseURL
+	}
+
+	// Populate the feed title, falling back to the site URL.
+	feed.Title = a.atomFeed.Title.Body()
+	if feed.Title == "" {
+		feed.Title = feed.SiteURL
+	}
+
+	// Populate the feed icon: the icon element wins over the logo element.
+	if a.atomFeed.Icon != "" {
+		if absoluteIconURL, err := urllib.AbsoluteURL(feed.SiteURL, a.atomFeed.Icon); err == nil {
+			feed.IconURL = absoluteIconURL
+		}
+	} else if a.atomFeed.Logo != "" {
+		if absoluteLogoURL, err := urllib.AbsoluteURL(feed.SiteURL, a.atomFeed.Logo); err == nil {
+			feed.IconURL = absoluteLogoURL
+		}
+	}
+
+	for _, atomEntry := range a.atomFeed.Entries {
+		entry := model.NewEntry()
+
+		// Populate the entry URL.
+		entry.URL = atomEntry.Links.OriginalLink()
+		if entry.URL != "" {
+			if absoluteEntryURL, err := urllib.AbsoluteURL(feed.SiteURL, entry.URL); err == nil {
+				entry.URL = absoluteEntryURL
+			}
+		}
+
+		// Populate the entry content: content, then summary, then media description.
+		entry.Content = atomEntry.Content.Body()
+		if entry.Content == "" {
+			entry.Content = atomEntry.Summary.Body()
+		}
+		if entry.Content == "" {
+			entry.Content = atomEntry.FirstMediaDescription()
+		}
+
+		// Populate the entry title: title, then truncated content, then the URL.
+		entry.Title = atomEntry.Title.Title()
+		if entry.Title == "" {
+			entry.Title = sanitizer.TruncateHTML(entry.Content, 100)
+		}
+		if entry.Title == "" {
+			entry.Title = entry.URL
+		}
+
+		// Populate the entry author, falling back to the feed authors.
+		authors := atomEntry.Authors.PersonNames()
+		if len(authors) == 0 {
+			authors = append(authors, a.atomFeed.Authors.PersonNames()...)
+		}
+		// Sort before compacting: slices.Compact only removes adjacent duplicates.
+		sort.Strings(authors)
+		authors = slices.Compact(authors)
+		entry.Author = strings.Join(authors, ", ")
+
+		// Populate the entry date: the first parsable value of published, then updated.
+		for _, value := range []string{atomEntry.Published, atomEntry.Updated} {
+			if parsedDate, err := date.Parse(value); err != nil {
+				slog.Debug("Unable to parse date from Atom 1.0 feed",
+					slog.String("date", value),
+					slog.String("url", entry.URL),
+					slog.Any("error", err),
+				)
+			} else {
+				entry.Date = parsedDate
+				break
+			}
+		}
+		if entry.Date.IsZero() {
+			entry.Date = time.Now()
+		}
+
+		// Populate categories, falling back to the feed categories.
+		categories := atomEntry.Categories.CategoryNames()
+		if len(categories) == 0 {
+			categories = append(categories, a.atomFeed.Categories.CategoryNames()...)
+		}
+		if len(categories) > 0 {
+			// Sort before compacting so that non-adjacent duplicates are removed too.
+			sort.Strings(categories)
+			categories = slices.Compact(categories)
+			entry.Tags = categories
+		}
+
+		// Populate the commentsURL if defined.
+		// See https://tools.ietf.org/html/rfc4685#section-4
+		// If the type attribute of the atom:link is omitted, its value is assumed to be "application/atom+xml".
+		// We accept only HTML or XHTML documents for now since the intention is to have the same behavior as RSS.
+		commentsURL := atomEntry.Links.firstLinkWithRelationAndType("replies", "text/html", "application/xhtml+xml")
+		if urllib.IsAbsoluteURL(commentsURL) {
+			entry.CommentsURL = commentsURL
+		}
+
+		// Generate the entry hash from the first non-empty value of ID, then original link.
+		for _, value := range []string{atomEntry.ID, atomEntry.Links.OriginalLink()} {
+			if value != "" {
+				entry.Hash = crypto.Hash(value)
+				break
+			}
+		}
+
+		// Populate the entry enclosures; the map tracks URLs already added
+		// so that each URL yields at most one enclosure.
+		uniqueEnclosuresMap := make(map[string]bool)
+
+		for _, mediaThumbnail := range atomEntry.AllMediaThumbnails() {
+			if _, found := uniqueEnclosuresMap[mediaThumbnail.URL]; !found {
+				uniqueEnclosuresMap[mediaThumbnail.URL] = true
+				entry.Enclosures = append(entry.Enclosures, &model.Enclosure{
+					URL:      mediaThumbnail.URL,
+					MimeType: mediaThumbnail.MimeType(),
+					Size:     mediaThumbnail.Size(),
+				})
+			}
+		}
+
+		for _, link := range atomEntry.Links {
+			if strings.EqualFold(link.Rel, "enclosure") {
+				if link.Href == "" {
+					continue
+				}
+
+				if _, found := uniqueEnclosuresMap[link.Href]; !found {
+					uniqueEnclosuresMap[link.Href] = true
+					// Best effort: the parse error for the length attribute is deliberately ignored.
+					length, _ := strconv.ParseInt(link.Length, 10, 0)
+					entry.Enclosures = append(entry.Enclosures, &model.Enclosure{
+						URL:      link.Href,
+						MimeType: link.Type,
+						Size:     length,
+					})
+				}
+			}
+		}
+
+		for _, mediaContent := range atomEntry.AllMediaContents() {
+			if _, found := uniqueEnclosuresMap[mediaContent.URL]; !found {
+				uniqueEnclosuresMap[mediaContent.URL] = true
+				entry.Enclosures = append(entry.Enclosures, &model.Enclosure{
+					URL:      mediaContent.URL,
+					MimeType: mediaContent.MimeType(),
+					Size:     mediaContent.Size(),
+				})
+			}
+		}
+
+		for _, mediaPeerLink := range atomEntry.AllMediaPeerLinks() {
+			if _, found := uniqueEnclosuresMap[mediaPeerLink.URL]; !found {
+				uniqueEnclosuresMap[mediaPeerLink.URL] = true
+				entry.Enclosures = append(entry.Enclosures, &model.Enclosure{
+					URL:      mediaPeerLink.URL,
+					MimeType: mediaPeerLink.MimeType(),
+					Size:     mediaPeerLink.Size(),
+				})
+			}
+		}
+
+		feed.Entries = append(feed.Entries, entry)
+	}
+
+	return feed
+}

+ 70 - 75
internal/reader/atom/atom_10_test.go

@@ -12,7 +12,6 @@ import (
 func TestParseAtomSample(t *testing.T) {
 	data := `<?xml version="1.0" encoding="utf-8"?>
 	<feed xmlns="http://www.w3.org/2005/Atom">
-
 	  <title>Example Feed</title>
 	  <link href="http://example.org/"/>
 	  <updated>2003-12-13T18:30:02Z</updated>
@@ -20,7 +19,6 @@ func TestParseAtomSample(t *testing.T) {
 		<name>John Doe</name>
 	  </author>
 	  <id>urn:uuid:60a76c80-d399-11d9-b93C-0003939e0af6</id>
-
 	  <entry>
 		<title>Atom-Powered Robots Run Amok</title>
 		<link href="http://example.org/2003/12/13/atom03"/>
@@ -28,7 +26,6 @@ func TestParseAtomSample(t *testing.T) {
 		<updated>2003-12-13T18:30:02Z</updated>
 		<summary>Some text.</summary>
 	  </entry>
-
 	</feed>`
 
 	feed, err := Parse("http://example.org/feed.xml", bytes.NewReader([]byte(data)), "10")
@@ -420,7 +417,7 @@ func TestParseEntryWithPlainTextTitle(t *testing.T) {
 	expected := `AT&T bought by SBC!`
 	for i := range 2 {
 		if feed.Entries[i].Title != expected {
-			t.Errorf("Incorrect title for entry #%d, got: %q", i, feed.Entries[i].Title)
+			t.Errorf("Incorrect title for entry #%d, got: %q instead of %q", i, feed.Entries[i].Title, expected)
 		}
 	}
 }
@@ -430,33 +427,20 @@ func TestParseEntryWithHTMLTitle(t *testing.T) {
 	<feed xmlns="http://www.w3.org/2005/Atom">
 	  <title>Example Feed</title>
 	  <link href="http://example.org/"/>
-
 	  <entry>
-		<title type="html">&lt;code&gt;Test&lt;/code&gt; Test</title>
-		<link href="http://example.org/2003/12/13/atom03"/>
-		<id>urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a</id>
-		<updated>2003-12-13T18:30:02Z</updated>
-		<summary>Some text.</summary>
+		<title type="html">&lt;code&gt;Code&lt;/code&gt; Test</title>
+		<link href="http://example.org/z"/>
 	  </entry>
-
 	  <entry>
-		<title type="html"><![CDATA[Test &#8220;Test&#8221;]]></title>
-		<link href="http://example.org/2003/12/13/atom03"/>
-		<id>urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a</id>
-		<updated>2003-12-13T18:30:02Z</updated>
-		<summary>Some text.</summary>
+		<title type="html"><![CDATA[Test with &#8220;unicode quote&#8221;]]></title>
+		<link href="http://example.org/b"/>
 	  </entry>
-
 	  <entry>
 		<title>
 			<![CDATA[Entry title with space around CDATA]]>
 		</title>
-		<link href="http://example.org/2003/12/13/atom03"/>
-		<id>urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a</id>
-		<updated>2003-12-13T18:30:02Z</updated>
-		<summary>Some text.</summary>
+		<link href="http://example.org/c"/>
 	  </entry>
-
 	</feed>`
 
 	feed, err := Parse("https://example.org/", bytes.NewReader([]byte(data)), "10")
@@ -464,11 +448,11 @@ func TestParseEntryWithHTMLTitle(t *testing.T) {
 		t.Fatal(err)
 	}
 
-	if feed.Entries[0].Title != "<code>Test</code> Test" {
+	if feed.Entries[0].Title != "Code Test" {
 		t.Errorf("Incorrect entry title, got: %q", feed.Entries[0].Title)
 	}
 
-	if feed.Entries[1].Title != "Test “Test”" {
+	if feed.Entries[1].Title != "Test with “unicode quote”" {
 		t.Errorf("Incorrect entry title, got: %q", feed.Entries[1].Title)
 	}
 
@@ -502,8 +486,8 @@ func TestParseEntryWithXHTMLTitle(t *testing.T) {
 		t.Fatal(err)
 	}
 
-	if feed.Entries[0].Title != `This is <b>XHTML</b> content.` {
-		t.Errorf("Incorrect entry title, got: %q", feed.Entries[1].Title)
+	if feed.Entries[0].Title != `This is XHTML content.` {
+		t.Errorf("Incorrect entry title, got: %q", feed.Entries[0].Title)
 	}
 }
 
@@ -608,7 +592,7 @@ func TestParseEntryWithDoubleEncodedEntitiesTitle(t *testing.T) {
 		t.Fatal(err)
 	}
 
-	if feed.Entries[0].Title != `&#39;AT&amp;T&#39;` {
+	if feed.Entries[0].Title != `'AT&T'` {
 		t.Errorf("Incorrect entry title, got: %q", feed.Entries[0].Title)
 	}
 }
@@ -644,31 +628,21 @@ func TestParseEntryWithHTMLSummary(t *testing.T) {
 	<feed xmlns="http://www.w3.org/2005/Atom">
 	  <title>Example Feed</title>
 	  <link href="http://example.org/"/>
-
 	  <entry>
-		<title type="html">Example</title>
+		<title type="html">Example 1</title>
 		<link href="http://example.org/1"/>
-		<id>urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a</id>
-		<updated>2003-12-13T18:30:02Z</updated>
-		<summary type="html">&lt;code&gt;std::unique_ptr&amp;lt;S&amp;gt;&lt;/code&gt;</summary>
+		<summary type="html">&lt;code&gt;std::unique_ptr&amp;lt;S&amp;gt; myvar;&lt;/code&gt;</summary>
 	  </entry>
-
 	  <entry>
-		<title type="html">Example</title>
+		<title type="html">Example 2</title>
 		<link href="http://example.org/2"/>
-		<id>urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a</id>
-		<updated>2003-12-13T18:30:02Z</updated>
-		<summary type="text/html">&lt;code&gt;std::unique_ptr&amp;lt;S&amp;gt;&lt;/code&gt;</summary>
+		<summary type="text/html">&lt;code&gt;std::unique_ptr&amp;lt;S&amp;gt; myvar;&lt;/code&gt;</summary>
 	  </entry>
-
 	  <entry>
-		<title type="html">Example</title>
+		<title type="html">Example 3</title>
 		<link href="http://example.org/3"/>
-		<id>urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a</id>
-		<updated>2003-12-13T18:30:02Z</updated>
-		<summary type="html"><![CDATA[<code>std::unique_ptr&lt;S&gt;</code>]]></summary>
+		<summary type="html"><![CDATA[<code>std::unique_ptr&lt;S&gt; myvar;</code>]]></summary>
 	  </entry>
-
 	</feed>`
 
 	feed, err := Parse("https://example.org/", bytes.NewReader([]byte(data)), "10")
@@ -676,7 +650,11 @@ func TestParseEntryWithHTMLSummary(t *testing.T) {
 		t.Fatal(err)
 	}
 
-	expected := `<code>std::unique_ptr&lt;S&gt;</code>`
+	if len(feed.Entries) != 3 {
+		t.Fatalf("Incorrect number of entries, got: %d", len(feed.Entries))
+	}
+
+	expected := `<code>std::unique_ptr&lt;S&gt; myvar;</code>`
 	for i := range 3 {
 		if feed.Entries[i].Content != expected {
 			t.Errorf("Incorrect content for entry #%d, got: %q", i, feed.Entries[i].Content)
@@ -728,7 +706,7 @@ func TestParseEntryWithTextSummary(t *testing.T) {
 		t.Fatal(err)
 	}
 
-	expected := `AT&amp;T &lt;S&gt;`
+	expected := `AT&T <S>`
 	for i := range 4 {
 		if feed.Entries[i].Content != expected {
 			t.Errorf("Incorrect content for entry #%d, got: %q", i, feed.Entries[i].Content)
@@ -747,7 +725,7 @@ func TestParseEntryWithTextContent(t *testing.T) {
 		<link href="http://example.org/a"/>
 		<id>urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a</id>
 		<updated>2003-12-13T18:30:02Z</updated>
-		<content>AT&amp;T &lt;S&gt;</content>
+		<content>AT&amp;T &lt;strong&gt;Strong Element&lt;/strong&gt;</content>
 	  </entry>
 
 	  <entry>
@@ -755,7 +733,7 @@ func TestParseEntryWithTextContent(t *testing.T) {
 		<link href="http://example.org/b"/>
 		<id>urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a</id>
 		<updated>2003-12-13T18:30:02Z</updated>
-		<content type="text">AT&amp;T &lt;S&gt;</content>
+		<content type="text">AT&amp;T &lt;strong&gt;Strong Element&lt;/strong&gt;</content>
 	  </entry>
 
 	  <entry>
@@ -763,7 +741,7 @@ func TestParseEntryWithTextContent(t *testing.T) {
 		<link href="http://example.org/c"/>
 		<id>urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a</id>
 		<updated>2003-12-13T18:30:02Z</updated>
-		<content type="text/plain">AT&amp;T &lt;S&gt;</content>
+		<content type="text/plain">AT&amp;T &lt;strong&gt;Strong Element&lt;/strong&gt;</content>
 	  </entry>
 
 	  <entry>
@@ -771,7 +749,7 @@ func TestParseEntryWithTextContent(t *testing.T) {
 		<link href="http://example.org/d"/>
 		<id>urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a</id>
 		<updated>2003-12-13T18:30:02Z</updated>
-		<content><![CDATA[AT&T <S>]]></content>
+		<content><![CDATA[AT&T <strong>Strong Element</strong>]]></content>
 	  </entry>
 
 	</feed>`
@@ -781,10 +759,10 @@ func TestParseEntryWithTextContent(t *testing.T) {
 		t.Fatal(err)
 	}
 
-	expected := `AT&amp;T &lt;S&gt;`
+	expected := `AT&T <strong>Strong Element</strong>`
 	for i := range 4 {
 		if feed.Entries[i].Content != expected {
-			t.Errorf("Incorrect content for entry #%d, got: %q", i, feed.Entries[i].Content)
+			t.Errorf("Incorrect content for entry #%d, got: %q instead of %q", i, feed.Entries[i].Content, expected)
 		}
 	}
 }
@@ -925,7 +903,6 @@ func TestParseEntryWithMultipleAuthors(t *testing.T) {
 	<feed xmlns="http://www.w3.org/2005/Atom">
 	  <title>Example Feed</title>
 	  <link href="http://example.org/"/>
-
 	  <entry>
 		<link href="http://example.org/2003/12/13/atom03"/>
 		<id>urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a</id>
@@ -938,7 +915,6 @@ func TestParseEntryWithMultipleAuthors(t *testing.T) {
 			<name>Bob</name>
 		</author>
 	  </entry>
-
 	</feed>`
 
 	feed, err := Parse("https://example.org/", bytes.NewReader([]byte(data)), "10")
@@ -951,7 +927,7 @@ func TestParseEntryWithMultipleAuthors(t *testing.T) {
 	}
 }
 
-func TestParseEntryWithoutAuthor(t *testing.T) {
+func TestParseFeedWithEntryWithoutAuthor(t *testing.T) {
 	data := `<?xml version="1.0" encoding="utf-8"?>
 	<feed xmlns="http://www.w3.org/2005/Atom">
 	  <title>Example Feed</title>
@@ -959,14 +935,12 @@ func TestParseEntryWithoutAuthor(t *testing.T) {
 	  <author>
 		<name>John Doe</name>
 	  </author>
-
 	  <entry>
 		<link href="http://example.org/2003/12/13/atom03"/>
 		<id>urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a</id>
 		<updated>2003-12-13T18:30:02Z</updated>
 		<summary>Some text.</summary>
 	  </entry>
-
 	</feed>`
 
 	feed, err := Parse("https://example.org/", bytes.NewReader([]byte(data)), "10")
@@ -990,14 +964,15 @@ func TestParseFeedWithMultipleAuthors(t *testing.T) {
 	  <author>
 		<name>Bob</name>
 	  </author>
-
+	  <author>
+		<name>Bob</name>
+	  </author>
 	  <entry>
 		<link href="http://example.org/2003/12/13/atom03"/>
 		<id>urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a</id>
 		<updated>2003-12-13T18:30:02Z</updated>
 		<summary>Some text.</summary>
 	  </entry>
-
 	</feed>`
 
 	feed, err := Parse("https://example.org/", bytes.NewReader([]byte(data)), "10")
@@ -1015,14 +990,12 @@ func TestParseFeedWithoutAuthor(t *testing.T) {
 	<feed xmlns="http://www.w3.org/2005/Atom">
 	  <title>Example Feed</title>
 	  <link href="http://example.org/"/>
-
 	  <entry>
 		<link href="http://example.org/2003/12/13/atom03"/>
 		<id>urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a</id>
 		<updated>2003-12-13T18:30:02Z</updated>
 		<summary>Some text.</summary>
 	  </entry>
-
 	</feed>`
 
 	feed, err := Parse("https://example.org/", bytes.NewReader([]byte(data)), "10")
@@ -1608,27 +1581,18 @@ func TestAbsoluteCommentsURL(t *testing.T) {
 	}
 }
 
-func TestParseFeedWithCategories(t *testing.T) {
+func TestParseItemWithCategories(t *testing.T) {
 	data := `<?xml version="1.0" encoding="utf-8"?>
 	<feed xmlns="http://www.w3.org/2005/Atom">
 	  <title>Example Feed</title>
 	  <link href="http://example.org/"/>
-	  <author>
-		<name>Alice</name>
-	  </author>
-	  <author>
-		<name>Bob</name>
-	  </author>
-
 	  <entry>
-		<link href="http://example.org/2003/12/13/atom03"/>
-		<id>urn:uuid:1225c695-cfb8-4ebb-aaaa-80da344efa6a</id>
+	  	<link href="http://www.example.org/entries/1" />
 		<updated>2003-12-13T18:30:02Z</updated>
 		<summary>Some text.</summary>
-		<category term='Tech' />
+		<category term='ZZZZ' />
 		<category term='Technology' label='Science' />
 	  </entry>
-
 	</feed>`
 
 	feed, err := Parse("https://example.org/", bytes.NewReader([]byte(data)), "10")
@@ -1637,22 +1601,53 @@ func TestParseFeedWithCategories(t *testing.T) {
 	}
 
 	if len(feed.Entries[0].Tags) != 2 {
-		t.Errorf("Incorrect number of tags, got: %d", len(feed.Entries[0].Tags))
+		t.Fatalf("Incorrect number of tags, got: %d", len(feed.Entries[0].Tags))
 	}
 
-	expected := "Tech"
+	expected := "Science"
 	result := feed.Entries[0].Tags[0]
 	if result != expected {
 		t.Errorf("Incorrect entry category, got %q instead of %q", result, expected)
 	}
 
-	expected = "Science"
+	expected = "ZZZZ"
 	result = feed.Entries[0].Tags[1]
 	if result != expected {
 		t.Errorf("Incorrect entry category, got %q instead of %q", result, expected)
 	}
 }
 
+func TestParseFeedWithCategories(t *testing.T) {
+	data := `<?xml version="1.0" encoding="utf-8"?>
+	<feed xmlns="http://www.w3.org/2005/Atom">
+	  <title>Example Feed</title>
+	  <link href="http://example.org/"/>
+	  <category term='Test' label='Some Label' />
+	  <category term='Test' label='Some Label' />
+	  <category term='Test' label='Some Label' />
+	  <entry>
+	  	<link href="http://www.example.org/entries/1" />
+		<updated>2003-12-13T18:30:02Z</updated>
+		<summary>Some text.</summary>
+	  </entry>
+	</feed>`
+
+	feed, err := Parse("https://example.org/", bytes.NewReader([]byte(data)), "10")
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	if len(feed.Entries[0].Tags) != 1 {
+		t.Fatalf("Incorrect number of tags, got: %d", len(feed.Entries[0].Tags))
+	}
+
+	expected := "Some Label"
+	result := feed.Entries[0].Tags[0]
+	if result != expected {
+		t.Errorf("Incorrect entry category, got %q instead of %q", result, expected)
+	}
+}
+
 func TestParseFeedWithIconURL(t *testing.T) {
 	data := `<?xml version="1.0" encoding="utf-8"?>
 	<feed xmlns="http://www.w3.org/2005/Atom">

+ 85 - 28
internal/reader/atom/atom_common.go

@@ -3,77 +3,91 @@
 
 package atom // import "miniflux.app/v2/internal/reader/atom"
 
-import "strings"
-
-type atomPerson struct {
-	Name  string `xml:"name"`
+import (
+	"strings"
+)
+
+// Specs: https://datatracker.ietf.org/doc/html/rfc4287#section-3.2
+type AtomPerson struct {
+	// The "atom:name" element's content conveys a human-readable name for the author.
+	// It MAY be the name of a corporation or other entity when no individual authors can be named.
+	// Person constructs MUST contain exactly one "atom:name" element, whose content MUST be a string.
+	Name string `xml:"name"`
+
+	// The "atom:email" element's content conveys an e-mail address associated with the Person construct.
+	// Person constructs MAY contain an atom:email element, but MUST NOT contain more than one.
+	// Its content MUST be an e-mail address [RFC2822].
+	// Ordering of the element children of Person constructs MUST NOT be considered significant.
 	Email string `xml:"email"`
 }
 
-func (a *atomPerson) String() string {
-	name := ""
-
-	switch {
-	case a.Name != "":
-		name = a.Name
-	case a.Email != "":
-		name = a.Email
+func (a *AtomPerson) PersonName() string {
+	name := strings.TrimSpace(a.Name)
+	if name != "" {
+		return name
 	}
 
-	return strings.TrimSpace(name)
+	return strings.TrimSpace(a.Email)
 }
 
-type atomAuthors []*atomPerson
+type AtomPersons []*AtomPerson
 
-func (a atomAuthors) String() string {
-	var authors []string
+func (a AtomPersons) PersonNames() []string {
+	var names []string
+	authorNamesMap := make(map[string]bool)
 
 	for _, person := range a {
-		authors = append(authors, person.String())
+		personName := person.PersonName()
+		if _, ok := authorNamesMap[personName]; !ok {
+			names = append(names, personName)
+			authorNamesMap[personName] = true
+		}
 	}
 
-	return strings.Join(authors, ", ")
+	return names
 }
 
-type atomLink struct {
-	URL    string `xml:"href,attr"`
+// Specs: https://datatracker.ietf.org/doc/html/rfc4287#section-4.2.7
+type AtomLink struct {
+	Href   string `xml:"href,attr"`
 	Type   string `xml:"type,attr"`
 	Rel    string `xml:"rel,attr"`
 	Length string `xml:"length,attr"`
+	Title  string `xml:"title,attr"`
 }
 
-type atomLinks []*atomLink
+type AtomLinks []*AtomLink
 
-func (a atomLinks) originalLink() string {
+func (a AtomLinks) OriginalLink() string {
 	for _, link := range a {
 		if strings.EqualFold(link.Rel, "alternate") {
-			return strings.TrimSpace(link.URL)
+			return strings.TrimSpace(link.Href)
 		}
 
 		if link.Rel == "" && (link.Type == "" || link.Type == "text/html") {
-			return strings.TrimSpace(link.URL)
+			return strings.TrimSpace(link.Href)
 		}
 	}
 
 	return ""
 }
 
-func (a atomLinks) firstLinkWithRelation(relation string) string {
+func (a AtomLinks) firstLinkWithRelation(relation string) string {
 	for _, link := range a {
 		if strings.EqualFold(link.Rel, relation) {
-			return strings.TrimSpace(link.URL)
+			return strings.TrimSpace(link.Href)
 		}
 	}
 
 	return ""
 }
 
-func (a atomLinks) firstLinkWithRelationAndType(relation string, contentTypes ...string) string {
+func (a AtomLinks) firstLinkWithRelationAndType(relation string, contentTypes ...string) string {
 	for _, link := range a {
 		if strings.EqualFold(link.Rel, relation) {
 			for _, contentType := range contentTypes {
 				if strings.EqualFold(link.Type, contentType) {
-					return strings.TrimSpace(link.URL)
+					return strings.TrimSpace(link.Href)
 				}
 			}
 		}
@@ -81,3 +95,46 @@ func (a atomLinks) firstLinkWithRelationAndType(relation string, contentTypes ..
 
 	return ""
 }
+
+// The "atom:category" element conveys information about a category
+// associated with an entry or feed.  This specification assigns no
+// meaning to the content (if any) of this element.
+//
+// Specs: https://datatracker.ietf.org/doc/html/rfc4287#section-4.2.2
+type AtomCategory struct {
+	// The "term" attribute is a string that identifies the category to
+	// which the entry or feed belongs. Category elements MUST have a
+	// "term" attribute.
+	Term string `xml:"term,attr"`
+
+	// The "scheme" attribute is an IRI that identifies a categorization
+	// scheme. Category elements MAY have a "scheme" attribute.
+	Scheme string `xml:"scheme,attr"`
+
+	// The "label" attribute provides a human-readable label for display in
+	// end-user applications. The content of the "label" attribute is
+	// Language-Sensitive. Entities such as "&amp;" and "&lt;" represent
+	// their corresponding characters ("&" and "<", respectively), not
+	// markup. Category elements MAY have a "label" attribute.
+	Label string `xml:"label,attr"`
+}
+
+// AtomCategories is a list of atom:category elements.
+type AtomCategories []AtomCategory
+
+// CategoryNames returns one display name per category, in document order:
+// the trimmed label when it is non-empty, otherwise the trimmed term.
+// Categories with neither a label nor a term are skipped.
+func (ac AtomCategories) CategoryNames() []string {
+	var categories []string
+
+	for _, category := range ac {
+		name := strings.TrimSpace(category.Label)
+		if name == "" {
+			name = strings.TrimSpace(category.Term)
+		}
+		if name == "" {
+			continue
+		}
+		categories = append(categories, name)
+	}
+
+	return categories
+}

+ 13 - 15
internal/reader/atom/parser.go

@@ -11,22 +11,20 @@ import (
 	xml_decoder "miniflux.app/v2/internal/reader/xml"
 )
 
-type atomFeed interface {
-	Transform(baseURL string) *model.Feed
-}
-
 // Parse returns a normalized feed struct from a Atom feed.
 func Parse(baseURL string, r io.ReadSeeker, version string) (*model.Feed, error) {
-	var rawFeed atomFeed
-	if version == "0.3" {
-		rawFeed = new(atom03Feed)
-	} else {
-		rawFeed = new(atom10Feed)
+	switch version {
+	case "0.3":
+		atomFeed := new(Atom03Feed)
+		if err := xml_decoder.NewXMLDecoder(r).Decode(atomFeed); err != nil {
+			return nil, fmt.Errorf("atom: unable to parse Atom 0.3 feed: %w", err)
+		}
+		return NewAtom03Adapter(atomFeed).BuildFeed(baseURL), nil
+	default:
+		atomFeed := new(Atom10Feed)
+		if err := xml_decoder.NewXMLDecoder(r).Decode(atomFeed); err != nil {
+			return nil, fmt.Errorf("atom: unable to parse Atom 1.0 feed: %w", err)
+		}
+		return NewAtom10Adapter(atomFeed).BuildFeed(baseURL), nil
 	}
-
-	if err := xml_decoder.NewXMLDecoder(r).Decode(rawFeed); err != nil {
-		return nil, fmt.Errorf("atom: unable to parse feed: %w", err)
-	}
-
-	return rawFeed.Transform(baseURL), nil
 }

+ 3 - 1
internal/reader/json/adapter.go

@@ -98,7 +98,6 @@ func (j *JSONAdapter) BuildFeed(feedURL string) *model.Feed {
 		}
 
 		// Populate the entry date.
-		entry.Date = time.Now()
 		for _, value := range []string{item.DatePublished, item.DateModified} {
 			value = strings.TrimSpace(value)
 			if value != "" {
@@ -114,6 +113,9 @@ func (j *JSONAdapter) BuildFeed(feedURL string) *model.Feed {
 				}
 			}
 		}
+		if entry.Date.IsZero() {
+			entry.Date = time.Now()
+		}
 
 		// Populate the entry author.
 		itemAuthors := append(item.Authors, j.jsonFeed.Authors...)

+ 29 - 1
internal/reader/parser/parser_test.go

@@ -85,7 +85,35 @@ func FuzzParse(f *testing.F) {
 	})
 }
 
-func TestParseAtom(t *testing.T) {
+// TestParseAtom03Feed checks that ParseFeed accepts an Atom 0.3 document and extracts the feed title.
+func TestParseAtom03Feed(t *testing.T) {
+	data := `<?xml version="1.0" encoding="utf-8"?>
+	<feed version="0.3" xmlns="http://purl.org/atom/ns#">
+		<title>dive into mark</title>
+		<link rel="alternate" type="text/html" href="http://diveintomark.org/"/>
+		<modified>2003-12-13T18:30:02Z</modified>
+		<author><name>Mark Pilgrim</name></author>
+		<entry>
+			<title>Atom 0.3 snapshot</title>
+			<link rel="alternate" type="text/html" href="http://diveintomark.org/2003/12/13/atom03"/>
+			<id>tag:diveintomark.org,2003:3.2397</id>
+			<issued>2003-12-13T08:29:29-04:00</issued>
+			<modified>2003-12-13T18:30:02Z</modified>
+			<summary type="text/plain">It&apos;s a test</summary>
+			<content type="text/html" mode="escaped"><![CDATA[<p>HTML content</p>]]></content>
+		</entry>
+	</feed>`
+
+	feed, err := ParseFeed("https://example.org/", strings.NewReader(data))
+	if err != nil {
+		// Stop here: feed may be nil when parsing fails, and it is dereferenced below.
+		t.Fatal(err)
+	}
+
+	if feed.Title != "dive into mark" {
+		t.Errorf("Incorrect title, got: %s", feed.Title)
+	}
+}
+
+func TestParseAtom10Feed(t *testing.T) {
 	data := `<?xml version="1.0" encoding="utf-8"?>
 	<feed xmlns="http://www.w3.org/2005/Atom">
 

+ 1 - 2
internal/reader/rss/adapter.go

@@ -69,7 +69,6 @@ func (r *RSSAdapter) BuildFeed(feedURL string) *model.Feed {
 
 	for _, item := range r.rss.Channel.Items {
 		entry := model.NewEntry()
-		entry.Author = findEntryAuthor(&item)
 		entry.Date = findEntryDate(&item)
 		entry.Content = findEntryContent(&item)
 		entry.Enclosures = findEntryEnclosures(&item)
@@ -91,11 +90,11 @@ func (r *RSSAdapter) BuildFeed(feedURL string) *model.Feed {
 		if entry.Title == "" {
 			entry.Title = sanitizer.TruncateHTML(entry.Content, 100)
 		}
-
 		if entry.Title == "" {
 			entry.Title = entry.URL
 		}
 
+		entry.Author = findEntryAuthor(&item)
 		if entry.Author == "" {
 			entry.Author = findFeedAuthor(&r.rss.Channel)
 		}