atom_10.go 6.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255
  1. // Copyright 2019 Frédéric Guillot. All rights reserved.
  2. // Use of this source code is governed by the Apache 2.0
  3. // license that can be found in the LICENSE file.
  4. package atom // import "miniflux.app/reader/atom"
  5. import (
  6. "encoding/xml"
  7. "html"
  8. "strconv"
  9. "strings"
  10. "time"
  11. "miniflux.app/crypto"
  12. "miniflux.app/logger"
  13. "miniflux.app/model"
  14. "miniflux.app/reader/date"
  15. "miniflux.app/reader/media"
  16. "miniflux.app/url"
  17. )
  18. // Specs:
  19. // https://tools.ietf.org/html/rfc4287
  20. // https://validator.w3.org/feed/docs/atom.html
  21. type atom10Feed struct {
  22. XMLName xml.Name `xml:"http://www.w3.org/2005/Atom feed"`
  23. ID string `xml:"id"`
  24. Title atom10Text `xml:"title"`
  25. Authors atomAuthors `xml:"author"`
  26. Links atomLinks `xml:"link"`
  27. Entries []atom10Entry `xml:"entry"`
  28. }
  29. func (a *atom10Feed) Transform(baseURL string) *model.Feed {
  30. var err error
  31. feed := new(model.Feed)
  32. feedURL := a.Links.firstLinkWithRelation("self")
  33. feed.FeedURL, err = url.AbsoluteURL(baseURL, feedURL)
  34. if err != nil {
  35. feed.FeedURL = feedURL
  36. }
  37. siteURL := a.Links.originalLink()
  38. feed.SiteURL, err = url.AbsoluteURL(baseURL, siteURL)
  39. if err != nil {
  40. feed.SiteURL = siteURL
  41. }
  42. feed.Title = html.UnescapeString(a.Title.String())
  43. if feed.Title == "" {
  44. feed.Title = feed.SiteURL
  45. }
  46. for _, entry := range a.Entries {
  47. item := entry.Transform()
  48. entryURL, err := url.AbsoluteURL(feed.SiteURL, item.URL)
  49. if err == nil {
  50. item.URL = entryURL
  51. }
  52. if item.Author == "" {
  53. item.Author = a.Authors.String()
  54. }
  55. if item.Title == "" {
  56. item.Title = item.URL
  57. }
  58. feed.Entries = append(feed.Entries, item)
  59. }
  60. return feed
  61. }
  62. type atom10Entry struct {
  63. ID string `xml:"id"`
  64. Title atom10Text `xml:"title"`
  65. Published string `xml:"published"`
  66. Updated string `xml:"updated"`
  67. Links atomLinks `xml:"link"`
  68. Summary atom10Text `xml:"summary"`
  69. Content atom10Text `xml:"http://www.w3.org/2005/Atom content"`
  70. Authors atomAuthors `xml:"author"`
  71. media.Element
  72. }
  73. func (a *atom10Entry) Transform() *model.Entry {
  74. entry := new(model.Entry)
  75. entry.URL = a.Links.originalLink()
  76. entry.Date = a.entryDate()
  77. entry.Author = a.Authors.String()
  78. entry.Hash = a.entryHash()
  79. entry.Content = a.entryContent()
  80. entry.Title = a.entryTitle()
  81. entry.Enclosures = a.entryEnclosures()
  82. entry.CommentsURL = a.entryCommentsURL()
  83. return entry
  84. }
  85. func (a *atom10Entry) entryTitle() string {
  86. return html.UnescapeString(a.Title.String())
  87. }
  88. func (a *atom10Entry) entryContent() string {
  89. content := a.Content.String()
  90. if content != "" {
  91. return content
  92. }
  93. summary := a.Summary.String()
  94. if summary != "" {
  95. return summary
  96. }
  97. mediaDescription := a.FirstMediaDescription()
  98. if mediaDescription != "" {
  99. return mediaDescription
  100. }
  101. return ""
  102. }
  103. // Note: The published date represents the original creation date for YouTube feeds.
  104. // Example:
  105. // <published>2019-01-26T08:02:28+00:00</published>
  106. // <updated>2019-01-29T07:27:27+00:00</updated>
  107. func (a *atom10Entry) entryDate() time.Time {
  108. dateText := a.Published
  109. if dateText == "" {
  110. dateText = a.Updated
  111. }
  112. if dateText != "" {
  113. result, err := date.Parse(dateText)
  114. if err != nil {
  115. logger.Error("atom: %v (entry ID = %s)", err, a.ID)
  116. return time.Now()
  117. }
  118. return result
  119. }
  120. return time.Now()
  121. }
  122. func (a *atom10Entry) entryHash() string {
  123. for _, value := range []string{a.ID, a.Links.originalLink()} {
  124. if value != "" {
  125. return crypto.Hash(value)
  126. }
  127. }
  128. return ""
  129. }
  130. func (a *atom10Entry) entryEnclosures() model.EnclosureList {
  131. enclosures := make(model.EnclosureList, 0)
  132. duplicates := make(map[string]bool)
  133. for _, mediaThumbnail := range a.AllMediaThumbnails() {
  134. if _, found := duplicates[mediaThumbnail.URL]; !found {
  135. duplicates[mediaThumbnail.URL] = true
  136. enclosures = append(enclosures, &model.Enclosure{
  137. URL: mediaThumbnail.URL,
  138. MimeType: mediaThumbnail.MimeType(),
  139. Size: mediaThumbnail.Size(),
  140. })
  141. }
  142. }
  143. for _, link := range a.Links {
  144. if strings.ToLower(link.Rel) == "enclosure" {
  145. if link.URL == "" {
  146. continue
  147. }
  148. if _, found := duplicates[link.URL]; !found {
  149. duplicates[link.URL] = true
  150. length, _ := strconv.ParseInt(link.Length, 10, 0)
  151. enclosures = append(enclosures, &model.Enclosure{URL: link.URL, MimeType: link.Type, Size: length})
  152. }
  153. }
  154. }
  155. for _, mediaContent := range a.AllMediaContents() {
  156. if _, found := duplicates[mediaContent.URL]; !found {
  157. duplicates[mediaContent.URL] = true
  158. enclosures = append(enclosures, &model.Enclosure{
  159. URL: mediaContent.URL,
  160. MimeType: mediaContent.MimeType(),
  161. Size: mediaContent.Size(),
  162. })
  163. }
  164. }
  165. for _, mediaPeerLink := range a.AllMediaPeerLinks() {
  166. if _, found := duplicates[mediaPeerLink.URL]; !found {
  167. duplicates[mediaPeerLink.URL] = true
  168. enclosures = append(enclosures, &model.Enclosure{
  169. URL: mediaPeerLink.URL,
  170. MimeType: mediaPeerLink.MimeType(),
  171. Size: mediaPeerLink.Size(),
  172. })
  173. }
  174. }
  175. return enclosures
  176. }
  177. // See https://tools.ietf.org/html/rfc4685#section-4
  178. // If the type attribute of the atom:link is omitted, its value is assumed to be "application/atom+xml".
  179. // We accept only HTML or XHTML documents for now since the intention is to have the same behavior as RSS.
  180. func (a *atom10Entry) entryCommentsURL() string {
  181. commentsURL := a.Links.firstLinkWithRelationAndType("replies", "text/html", "application/xhtml+xml")
  182. if url.IsAbsoluteURL(commentsURL) {
  183. return commentsURL
  184. }
  185. return ""
  186. }
  187. type atom10Text struct {
  188. Type string `xml:"type,attr"`
  189. CharData string `xml:",chardata"`
  190. InnerXML string `xml:",innerxml"`
  191. XHTMLRootElement atomXHTMLRootElement `xml:"http://www.w3.org/1999/xhtml div"`
  192. }
  193. func (a *atom10Text) String() string {
  194. var content string
  195. switch {
  196. case a.Type == "", a.Type == "text", a.Type == "text/plain":
  197. if strings.HasPrefix(a.InnerXML, `<![CDATA[`) {
  198. content = html.EscapeString(a.CharData)
  199. } else {
  200. content = a.InnerXML
  201. }
  202. case a.Type == "xhtml":
  203. if a.XHTMLRootElement.InnerXML != "" {
  204. content = a.XHTMLRootElement.InnerXML
  205. } else {
  206. content = a.InnerXML
  207. }
  208. default:
  209. content = a.CharData
  210. }
  211. return strings.TrimSpace(content)
  212. }
  213. type atomXHTMLRootElement struct {
  214. InnerXML string `xml:",innerxml"`
  215. }