atom_10.go 5.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231
  1. // Copyright 2019 Frédéric Guillot. All rights reserved.
  2. // Use of this source code is governed by the Apache 2.0
  3. // license that can be found in the LICENSE file.
  4. package atom // import "miniflux.app/reader/atom"
  5. import (
  6. "encoding/xml"
  7. "html"
  8. "strconv"
  9. "strings"
  10. "time"
  11. "miniflux.app/crypto"
  12. "miniflux.app/logger"
  13. "miniflux.app/model"
  14. "miniflux.app/reader/date"
  15. "miniflux.app/reader/media"
  16. "miniflux.app/reader/sanitizer"
  17. "miniflux.app/url"
  18. )
  19. // Specs:
  20. // https://tools.ietf.org/html/rfc4287
  21. // https://validator.w3.org/feed/docs/atom.html
  22. type atom10Feed struct {
  23. XMLName xml.Name `xml:"http://www.w3.org/2005/Atom feed"`
  24. ID string `xml:"id"`
  25. Title atom10Text `xml:"title"`
  26. Author atomPerson `xml:"author"`
  27. Links atomLinks `xml:"link"`
  28. Entries []atom10Entry `xml:"entry"`
  29. }
  30. func (a *atom10Feed) Transform() *model.Feed {
  31. feed := new(model.Feed)
  32. feed.FeedURL = a.Links.firstLinkWithRelation("self")
  33. feed.SiteURL = a.Links.originalLink()
  34. feed.Title = a.Title.String()
  35. if feed.Title == "" {
  36. feed.Title = feed.SiteURL
  37. }
  38. for _, entry := range a.Entries {
  39. item := entry.Transform()
  40. entryURL, err := url.AbsoluteURL(feed.SiteURL, item.URL)
  41. if err == nil {
  42. item.URL = entryURL
  43. }
  44. if item.Author == "" {
  45. item.Author = a.Author.String()
  46. }
  47. if item.Title == "" {
  48. item.Title = item.URL
  49. }
  50. feed.Entries = append(feed.Entries, item)
  51. }
  52. return feed
  53. }
  54. type atom10Entry struct {
  55. ID string `xml:"id"`
  56. Title atom10Text `xml:"title"`
  57. Published string `xml:"published"`
  58. Updated string `xml:"updated"`
  59. Links atomLinks `xml:"link"`
  60. Summary atom10Text `xml:"summary"`
  61. Content atom10Text `xml:"http://www.w3.org/2005/Atom content"`
  62. Author atomPerson `xml:"author"`
  63. media.Element
  64. }
  65. func (a *atom10Entry) Transform() *model.Entry {
  66. entry := new(model.Entry)
  67. entry.URL = a.Links.originalLink()
  68. entry.Date = a.entryDate()
  69. entry.Author = a.Author.String()
  70. entry.Hash = a.entryHash()
  71. entry.Content = a.entryContent()
  72. entry.Title = a.entryTitle()
  73. entry.Enclosures = a.entryEnclosures()
  74. entry.CommentsURL = a.entryCommentsURL()
  75. return entry
  76. }
  77. func (a *atom10Entry) entryTitle() string {
  78. return sanitizer.StripTags(a.Title.String())
  79. }
  80. func (a *atom10Entry) entryContent() string {
  81. content := a.Content.String()
  82. if content != "" {
  83. return content
  84. }
  85. summary := a.Summary.String()
  86. if summary != "" {
  87. return summary
  88. }
  89. mediaDescription := a.FirstMediaDescription()
  90. if mediaDescription != "" {
  91. return mediaDescription
  92. }
  93. return ""
  94. }
  95. // Note: The published date represents the original creation date for YouTube feeds.
  96. // Example:
  97. // <published>2019-01-26T08:02:28+00:00</published>
  98. // <updated>2019-01-29T07:27:27+00:00</updated>
  99. func (a *atom10Entry) entryDate() time.Time {
  100. dateText := a.Published
  101. if dateText == "" {
  102. dateText = a.Updated
  103. }
  104. if dateText != "" {
  105. result, err := date.Parse(dateText)
  106. if err != nil {
  107. logger.Error("atom: %v", err)
  108. return time.Now()
  109. }
  110. return result
  111. }
  112. return time.Now()
  113. }
  114. func (a *atom10Entry) entryHash() string {
  115. for _, value := range []string{a.ID, a.Links.originalLink()} {
  116. if value != "" {
  117. return crypto.Hash(value)
  118. }
  119. }
  120. return ""
  121. }
  122. func (a *atom10Entry) entryEnclosures() model.EnclosureList {
  123. enclosures := make(model.EnclosureList, 0)
  124. duplicates := make(map[string]bool, 0)
  125. for _, mediaThumbnail := range a.AllMediaThumbnails() {
  126. if _, found := duplicates[mediaThumbnail.URL]; !found {
  127. duplicates[mediaThumbnail.URL] = true
  128. enclosures = append(enclosures, &model.Enclosure{
  129. URL: mediaThumbnail.URL,
  130. MimeType: mediaThumbnail.MimeType(),
  131. Size: mediaThumbnail.Size(),
  132. })
  133. }
  134. }
  135. for _, link := range a.Links {
  136. if strings.ToLower(link.Rel) == "enclosure" {
  137. if link.URL == "" {
  138. continue
  139. }
  140. if _, found := duplicates[link.URL]; !found {
  141. duplicates[link.URL] = true
  142. length, _ := strconv.ParseInt(link.Length, 10, 0)
  143. enclosures = append(enclosures, &model.Enclosure{URL: link.URL, MimeType: link.Type, Size: length})
  144. }
  145. }
  146. }
  147. for _, mediaContent := range a.AllMediaContents() {
  148. if _, found := duplicates[mediaContent.URL]; !found {
  149. duplicates[mediaContent.URL] = true
  150. enclosures = append(enclosures, &model.Enclosure{
  151. URL: mediaContent.URL,
  152. MimeType: mediaContent.MimeType(),
  153. Size: mediaContent.Size(),
  154. })
  155. }
  156. }
  157. for _, mediaPeerLink := range a.AllMediaPeerLinks() {
  158. if _, found := duplicates[mediaPeerLink.URL]; !found {
  159. duplicates[mediaPeerLink.URL] = true
  160. enclosures = append(enclosures, &model.Enclosure{
  161. URL: mediaPeerLink.URL,
  162. MimeType: mediaPeerLink.MimeType(),
  163. Size: mediaPeerLink.Size(),
  164. })
  165. }
  166. }
  167. return enclosures
  168. }
  169. // See https://tools.ietf.org/html/rfc4685#section-4
  170. // If the type attribute of the atom:link is omitted, its value is assumed to be "application/atom+xml".
  171. // We accept only HTML or XHTML documents for now since the intention is to have the same behavior as RSS.
  172. func (a *atom10Entry) entryCommentsURL() string {
  173. commentsURL := a.Links.firstLinkWithRelationAndType("replies", "text/html", "application/xhtml+xml")
  174. if url.IsAbsoluteURL(commentsURL) {
  175. return commentsURL
  176. }
  177. return ""
  178. }
  179. type atom10Text struct {
  180. Type string `xml:"type,attr"`
  181. Data string `xml:",chardata"`
  182. XML string `xml:",innerxml"`
  183. }
  184. func (a *atom10Text) String() string {
  185. content := ""
  186. switch {
  187. case a.Type == "xhtml":
  188. content = a.XML
  189. case a.Type == "html":
  190. content = a.Data
  191. case a.Type == "text" || a.Type == "":
  192. content = html.EscapeString(a.Data)
  193. }
  194. return strings.TrimSpace(content)
  195. }