atom_10.go 7.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291
  1. // SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
  2. // SPDX-License-Identifier: Apache-2.0
  3. package atom // import "miniflux.app/v2/internal/reader/atom"
  4. import (
  5. "encoding/xml"
  6. "html"
  7. "log/slog"
  8. "strconv"
  9. "strings"
  10. "time"
  11. "miniflux.app/v2/internal/crypto"
  12. "miniflux.app/v2/internal/model"
  13. "miniflux.app/v2/internal/reader/date"
  14. "miniflux.app/v2/internal/reader/media"
  15. "miniflux.app/v2/internal/reader/sanitizer"
  16. "miniflux.app/v2/internal/urllib"
  17. )
  18. // Specs:
  19. // https://tools.ietf.org/html/rfc4287
  20. // https://validator.w3.org/feed/docs/atom.html
  21. type atom10Feed struct {
  22. XMLName xml.Name `xml:"http://www.w3.org/2005/Atom feed"`
  23. ID string `xml:"id"`
  24. Title atom10Text `xml:"title"`
  25. Authors atomAuthors `xml:"author"`
  26. Icon string `xml:"icon"`
  27. Links atomLinks `xml:"link"`
  28. Entries []atom10Entry `xml:"entry"`
  29. }
  30. func (a *atom10Feed) Transform(baseURL string) *model.Feed {
  31. var err error
  32. feed := new(model.Feed)
  33. feedURL := a.Links.firstLinkWithRelation("self")
  34. feed.FeedURL, err = urllib.AbsoluteURL(baseURL, feedURL)
  35. if err != nil {
  36. feed.FeedURL = feedURL
  37. }
  38. siteURL := a.Links.originalLink()
  39. feed.SiteURL, err = urllib.AbsoluteURL(baseURL, siteURL)
  40. if err != nil {
  41. feed.SiteURL = siteURL
  42. }
  43. feed.Title = html.UnescapeString(a.Title.String())
  44. if feed.Title == "" {
  45. feed.Title = feed.SiteURL
  46. }
  47. feed.IconURL = strings.TrimSpace(a.Icon)
  48. for _, entry := range a.Entries {
  49. item := entry.Transform()
  50. entryURL, err := urllib.AbsoluteURL(feed.SiteURL, item.URL)
  51. if err == nil {
  52. item.URL = entryURL
  53. }
  54. if item.Author == "" {
  55. item.Author = a.Authors.String()
  56. }
  57. if item.Title == "" {
  58. item.Title = sanitizer.TruncateHTML(item.Content, 100)
  59. }
  60. if item.Title == "" {
  61. item.Title = item.URL
  62. }
  63. feed.Entries = append(feed.Entries, item)
  64. }
  65. return feed
  66. }
  67. type atom10Entry struct {
  68. ID string `xml:"id"`
  69. Title atom10Text `xml:"title"`
  70. Published string `xml:"published"`
  71. Updated string `xml:"updated"`
  72. Links atomLinks `xml:"link"`
  73. Summary atom10Text `xml:"summary"`
  74. Content atom10Text `xml:"http://www.w3.org/2005/Atom content"`
  75. Authors atomAuthors `xml:"author"`
  76. Categories []atom10Category `xml:"category"`
  77. media.Element
  78. }
  79. func (a *atom10Entry) Transform() *model.Entry {
  80. entry := model.NewEntry()
  81. entry.URL = a.Links.originalLink()
  82. entry.Date = a.entryDate()
  83. entry.Author = a.Authors.String()
  84. entry.Hash = a.entryHash()
  85. entry.Content = a.entryContent()
  86. entry.Title = a.entryTitle()
  87. entry.Enclosures = a.entryEnclosures()
  88. entry.CommentsURL = a.entryCommentsURL()
  89. entry.Tags = a.entryCategories()
  90. return entry
  91. }
  92. func (a *atom10Entry) entryTitle() string {
  93. return html.UnescapeString(a.Title.String())
  94. }
  95. func (a *atom10Entry) entryContent() string {
  96. content := a.Content.String()
  97. if content != "" {
  98. return content
  99. }
  100. summary := a.Summary.String()
  101. if summary != "" {
  102. return summary
  103. }
  104. mediaDescription := a.FirstMediaDescription()
  105. if mediaDescription != "" {
  106. return mediaDescription
  107. }
  108. return ""
  109. }
  110. // Note: The published date represents the original creation date for YouTube feeds.
  111. // Example:
  112. // <published>2019-01-26T08:02:28+00:00</published>
  113. // <updated>2019-01-29T07:27:27+00:00</updated>
  114. func (a *atom10Entry) entryDate() time.Time {
  115. dateText := a.Published
  116. if dateText == "" {
  117. dateText = a.Updated
  118. }
  119. if dateText != "" {
  120. result, err := date.Parse(dateText)
  121. if err != nil {
  122. slog.Debug("Unable to parse date from Atom 0.3 feed",
  123. slog.String("date", dateText),
  124. slog.String("id", a.ID),
  125. slog.Any("error", err),
  126. )
  127. return time.Now()
  128. }
  129. return result
  130. }
  131. return time.Now()
  132. }
  133. func (a *atom10Entry) entryHash() string {
  134. for _, value := range []string{a.ID, a.Links.originalLink()} {
  135. if value != "" {
  136. return crypto.Hash(value)
  137. }
  138. }
  139. return ""
  140. }
  141. func (a *atom10Entry) entryEnclosures() model.EnclosureList {
  142. enclosures := make(model.EnclosureList, 0)
  143. duplicates := make(map[string]bool)
  144. for _, mediaThumbnail := range a.AllMediaThumbnails() {
  145. if _, found := duplicates[mediaThumbnail.URL]; !found {
  146. duplicates[mediaThumbnail.URL] = true
  147. enclosures = append(enclosures, &model.Enclosure{
  148. URL: mediaThumbnail.URL,
  149. MimeType: mediaThumbnail.MimeType(),
  150. Size: mediaThumbnail.Size(),
  151. })
  152. }
  153. }
  154. for _, link := range a.Links {
  155. if strings.ToLower(link.Rel) == "enclosure" {
  156. if link.URL == "" {
  157. continue
  158. }
  159. if _, found := duplicates[link.URL]; !found {
  160. duplicates[link.URL] = true
  161. length, _ := strconv.ParseInt(link.Length, 10, 0)
  162. enclosures = append(enclosures, &model.Enclosure{URL: link.URL, MimeType: link.Type, Size: length})
  163. }
  164. }
  165. }
  166. for _, mediaContent := range a.AllMediaContents() {
  167. if _, found := duplicates[mediaContent.URL]; !found {
  168. duplicates[mediaContent.URL] = true
  169. enclosures = append(enclosures, &model.Enclosure{
  170. URL: mediaContent.URL,
  171. MimeType: mediaContent.MimeType(),
  172. Size: mediaContent.Size(),
  173. })
  174. }
  175. }
  176. for _, mediaPeerLink := range a.AllMediaPeerLinks() {
  177. if _, found := duplicates[mediaPeerLink.URL]; !found {
  178. duplicates[mediaPeerLink.URL] = true
  179. enclosures = append(enclosures, &model.Enclosure{
  180. URL: mediaPeerLink.URL,
  181. MimeType: mediaPeerLink.MimeType(),
  182. Size: mediaPeerLink.Size(),
  183. })
  184. }
  185. }
  186. return enclosures
  187. }
  188. func (r *atom10Entry) entryCategories() []string {
  189. categoryList := make([]string, 0)
  190. for _, atomCategory := range r.Categories {
  191. if strings.TrimSpace(atomCategory.Label) != "" {
  192. categoryList = append(categoryList, strings.TrimSpace(atomCategory.Label))
  193. } else {
  194. categoryList = append(categoryList, strings.TrimSpace(atomCategory.Term))
  195. }
  196. }
  197. return categoryList
  198. }
  199. // See https://tools.ietf.org/html/rfc4685#section-4
  200. // If the type attribute of the atom:link is omitted, its value is assumed to be "application/atom+xml".
  201. // We accept only HTML or XHTML documents for now since the intention is to have the same behavior as RSS.
  202. func (a *atom10Entry) entryCommentsURL() string {
  203. commentsURL := a.Links.firstLinkWithRelationAndType("replies", "text/html", "application/xhtml+xml")
  204. if urllib.IsAbsoluteURL(commentsURL) {
  205. return commentsURL
  206. }
  207. return ""
  208. }
  209. type atom10Text struct {
  210. Type string `xml:"type,attr"`
  211. CharData string `xml:",chardata"`
  212. InnerXML string `xml:",innerxml"`
  213. XHTMLRootElement atomXHTMLRootElement `xml:"http://www.w3.org/1999/xhtml div"`
  214. }
  215. type atom10Category struct {
  216. Term string `xml:"term,attr"`
  217. Label string `xml:"label,attr"`
  218. }
  219. // Text: https://datatracker.ietf.org/doc/html/rfc4287#section-3.1.1.1
  220. // HTML: https://datatracker.ietf.org/doc/html/rfc4287#section-3.1.1.2
  221. // XHTML: https://datatracker.ietf.org/doc/html/rfc4287#section-3.1.1.3
  222. func (a *atom10Text) String() string {
  223. var content string
  224. switch {
  225. case a.Type == "", a.Type == "text", a.Type == "text/plain":
  226. if strings.HasPrefix(strings.TrimSpace(a.InnerXML), `<![CDATA[`) {
  227. content = html.EscapeString(a.CharData)
  228. } else {
  229. content = a.InnerXML
  230. }
  231. case a.Type == "xhtml":
  232. var root = a.XHTMLRootElement
  233. if root.XMLName.Local == "div" {
  234. content = root.InnerXML
  235. } else {
  236. content = a.InnerXML
  237. }
  238. default:
  239. content = a.CharData
  240. }
  241. return strings.TrimSpace(content)
  242. }
  243. type atomXHTMLRootElement struct {
  244. XMLName xml.Name `xml:"div"`
  245. InnerXML string `xml:",innerxml"`
  246. }