atom.go 6.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273
  1. // Copyright 2017 Frédéric Guillot. All rights reserved.
  2. // Use of this source code is governed by the Apache 2.0
  3. // license that can be found in the LICENSE file.
  4. package atom // import "miniflux.app/reader/atom"
  5. import (
  6. "encoding/xml"
  7. "html"
  8. "strconv"
  9. "strings"
  10. "time"
  11. "miniflux.app/crypto"
  12. "miniflux.app/logger"
  13. "miniflux.app/model"
  14. "miniflux.app/reader/date"
  15. "miniflux.app/reader/media"
  16. "miniflux.app/reader/sanitizer"
  17. "miniflux.app/url"
  18. )
// atomFeed is the decoding target for the root <feed> element of an Atom
// document in the http://www.w3.org/2005/Atom namespace.
type atomFeed struct {
	// XMLName pins the expected root element name and namespace.
	XMLName xml.Name `xml:"http://www.w3.org/2005/Atom feed"`
	// ID holds the text of the feed-level <id> element.
	ID string `xml:"id"`
	// Title holds the text of the feed-level <title> element.
	Title string `xml:"title"`
	// Author is the feed-level <author>; Transform uses it for entries that
	// carry no author of their own.
	Author atomAuthor `xml:"author"`
	// Links collects every feed-level <link> element.
	Links []atomLink `xml:"link"`
	// Entries collects every <entry> element of the feed.
	Entries []atomEntry `xml:"entry"`
}
// atomEntry is the decoding target for a single <entry> element.
type atomEntry struct {
	// ID holds the text of the entry <id> element; getHash prefers it over
	// the entry URL.
	ID string `xml:"id"`
	// Title is the entry <title>, kept with its type attribute.
	Title atomContent `xml:"title"`
	// Published is the raw <published> text; getDate tries it first.
	Published string `xml:"published"`
	// Updated is the raw <updated> text; getDate falls back to it.
	Updated string `xml:"updated"`
	// Links collects every <link> element of the entry.
	Links []atomLink `xml:"link"`
	// Summary is the entry <summary>; getContent uses it when Content
	// yields nothing.
	Summary atomContent `xml:"summary"`
	// Content is the entry <content>, matched explicitly in the Atom
	// namespace.
	// NOTE(review): presumably namespaced so it is not confused with a
	// <content> element from another namespace handled by media.Element — confirm.
	Content atomContent `xml:"http://www.w3.org/2005/Atom content"`
	// Author is the entry-level <author>.
	Author atomAuthor `xml:"author"`
	// media.Element is embedded; the media helpers called on entries in this
	// file (FirstMediaDescription, AllMediaThumbnails, AllMediaContents,
	// AllMediaPeerLinks) are presumably promoted from it.
	media.Element
}
// atomAuthor is the decoding target for an <author> element.
type atomAuthor struct {
	// Name holds the text of the <name> child element.
	Name string `xml:"name"`
	// Email holds the text of the <email> child element.
	Email string `xml:"email"`
}
// atomLink is the decoding target for a <link> element; every field is read
// from an XML attribute and kept as the raw string.
type atomLink struct {
	// URL is the href attribute.
	URL string `xml:"href,attr"`
	// Type is the type attribute (a media type such as "text/html").
	Type string `xml:"type,attr"`
	// Rel is the rel attribute (for example "alternate", "self", "enclosure").
	Rel string `xml:"rel,attr"`
	// Length is the length attribute; getEnclosures parses it as an integer.
	Length string `xml:"length,attr"`
}
// atomContent is the decoding target for Atom text constructs (<title>,
// <summary>, <content>). Both the character data and the raw inner XML are
// captured; atomContentToString picks one of them based on Type.
type atomContent struct {
	// Type is the type attribute ("text", "html", "xhtml" or empty).
	Type string `xml:"type,attr"`
	// Data is the character data of the element.
	Data string `xml:",chardata"`
	// XML is the unparsed inner XML of the element.
	XML string `xml:",innerxml"`
}
  53. func (a *atomFeed) Transform() *model.Feed {
  54. feed := new(model.Feed)
  55. feed.FeedURL = getRelationURL(a.Links, "self")
  56. feed.SiteURL = getURL(a.Links)
  57. feed.Title = strings.TrimSpace(a.Title)
  58. if feed.Title == "" {
  59. feed.Title = feed.SiteURL
  60. }
  61. for _, entry := range a.Entries {
  62. item := entry.Transform()
  63. entryURL, err := url.AbsoluteURL(feed.SiteURL, item.URL)
  64. if err == nil {
  65. item.URL = entryURL
  66. }
  67. if item.Author == "" {
  68. item.Author = getAuthor(a.Author)
  69. }
  70. if item.Title == "" {
  71. item.Title = item.URL
  72. }
  73. feed.Entries = append(feed.Entries, item)
  74. }
  75. return feed
  76. }
  77. func (a *atomEntry) Transform() *model.Entry {
  78. entry := new(model.Entry)
  79. entry.URL = getURL(a.Links)
  80. entry.Date = getDate(a)
  81. entry.Author = getAuthor(a.Author)
  82. entry.Hash = getHash(a)
  83. entry.Content = getContent(a)
  84. entry.Title = getTitle(a)
  85. entry.Enclosures = getEnclosures(a)
  86. entry.CommentsURL = getRelationURLWithType(a.Links, "replies", "text/html")
  87. return entry
  88. }
  89. func getURL(links []atomLink) string {
  90. for _, link := range links {
  91. if strings.ToLower(link.Rel) == "alternate" {
  92. return strings.TrimSpace(link.URL)
  93. }
  94. if link.Rel == "" && link.Type == "" {
  95. return strings.TrimSpace(link.URL)
  96. }
  97. }
  98. return ""
  99. }
  100. func getRelationURL(links []atomLink, relation string) string {
  101. for _, link := range links {
  102. if strings.ToLower(link.Rel) == relation {
  103. return strings.TrimSpace(link.URL)
  104. }
  105. }
  106. return ""
  107. }
  108. func getRelationURLWithType(links []atomLink, relation, contentType string) string {
  109. for _, link := range links {
  110. if strings.ToLower(link.Rel) == relation && strings.ToLower(link.Type) == contentType {
  111. return strings.TrimSpace(link.URL)
  112. }
  113. }
  114. return ""
  115. }
  116. func getDate(a *atomEntry) time.Time {
  117. // Note: The published date represents the original creation date for YouTube feeds.
  118. // Example:
  119. // <published>2019-01-26T08:02:28+00:00</published>
  120. // <updated>2019-01-29T07:27:27+00:00</updated>
  121. dateText := a.Published
  122. if dateText == "" {
  123. dateText = a.Updated
  124. }
  125. if dateText != "" {
  126. result, err := date.Parse(dateText)
  127. if err != nil {
  128. logger.Error("atom: %v", err)
  129. return time.Now()
  130. }
  131. return result
  132. }
  133. return time.Now()
  134. }
  135. func atomContentToString(c atomContent) string {
  136. if c.Type == "xhtml" {
  137. return c.XML
  138. }
  139. if c.Type == "html" {
  140. return c.Data
  141. }
  142. if c.Type == "text" || c.Type == "" {
  143. return html.EscapeString(c.Data)
  144. }
  145. return ""
  146. }
  147. func getContent(a *atomEntry) string {
  148. r := atomContentToString(a.Content)
  149. if r != "" {
  150. return r
  151. }
  152. r = atomContentToString(a.Summary)
  153. if r != "" {
  154. return r
  155. }
  156. mediaDescription := a.FirstMediaDescription()
  157. if mediaDescription != "" {
  158. return mediaDescription
  159. }
  160. return ""
  161. }
  162. func getTitle(a *atomEntry) string {
  163. title := atomContentToString(a.Title)
  164. return strings.TrimSpace(sanitizer.StripTags(title))
  165. }
  166. func getHash(a *atomEntry) string {
  167. for _, value := range []string{a.ID, getURL(a.Links)} {
  168. if value != "" {
  169. return crypto.Hash(value)
  170. }
  171. }
  172. return ""
  173. }
  174. func getEnclosures(a *atomEntry) model.EnclosureList {
  175. enclosures := make(model.EnclosureList, 0)
  176. duplicates := make(map[string]bool, 0)
  177. for _, mediaThumbnail := range a.AllMediaThumbnails() {
  178. if _, found := duplicates[mediaThumbnail.URL]; !found {
  179. duplicates[mediaThumbnail.URL] = true
  180. enclosures = append(enclosures, &model.Enclosure{
  181. URL: mediaThumbnail.URL,
  182. MimeType: mediaThumbnail.MimeType(),
  183. Size: mediaThumbnail.Size(),
  184. })
  185. }
  186. }
  187. for _, link := range a.Links {
  188. if strings.ToLower(link.Rel) == "enclosure" {
  189. if _, found := duplicates[link.URL]; !found {
  190. duplicates[link.URL] = true
  191. length, _ := strconv.ParseInt(link.Length, 10, 0)
  192. enclosures = append(enclosures, &model.Enclosure{URL: link.URL, MimeType: link.Type, Size: length})
  193. }
  194. }
  195. }
  196. for _, mediaContent := range a.AllMediaContents() {
  197. if _, found := duplicates[mediaContent.URL]; !found {
  198. duplicates[mediaContent.URL] = true
  199. enclosures = append(enclosures, &model.Enclosure{
  200. URL: mediaContent.URL,
  201. MimeType: mediaContent.MimeType(),
  202. Size: mediaContent.Size(),
  203. })
  204. }
  205. }
  206. for _, mediaPeerLink := range a.AllMediaPeerLinks() {
  207. if _, found := duplicates[mediaPeerLink.URL]; !found {
  208. duplicates[mediaPeerLink.URL] = true
  209. enclosures = append(enclosures, &model.Enclosure{
  210. URL: mediaPeerLink.URL,
  211. MimeType: mediaPeerLink.MimeType(),
  212. Size: mediaPeerLink.Size(),
  213. })
  214. }
  215. }
  216. return enclosures
  217. }
  218. func getAuthor(author atomAuthor) string {
  219. if author.Name != "" {
  220. return strings.TrimSpace(author.Name)
  221. }
  222. if author.Email != "" {
  223. return strings.TrimSpace(author.Email)
  224. }
  225. return ""
  226. }