rss.go 7.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305
  1. // Copyright 2017 Frédéric Guillot. All rights reserved.
  2. // Use of this source code is governed by the Apache 2.0
  3. // license that can be found in the LICENSE file.
  4. package rss // import "miniflux.app/reader/rss"
  5. import (
  6. "encoding/xml"
  7. "path"
  8. "strconv"
  9. "strings"
  10. "time"
  11. "miniflux.app/crypto"
  12. "miniflux.app/logger"
  13. "miniflux.app/model"
  14. "miniflux.app/reader/date"
  15. "miniflux.app/reader/media"
  16. "miniflux.app/reader/sanitizer"
  17. "miniflux.app/url"
  18. )
  19. type rssFeed struct {
  20. XMLName xml.Name `xml:"rss"`
  21. Version string `xml:"version,attr"`
  22. Title string `xml:"channel>title"`
  23. Links []rssLink `xml:"channel>link"`
  24. Language string `xml:"channel>language"`
  25. Description string `xml:"channel>description"`
  26. PubDate string `xml:"channel>pubDate"`
  27. ItunesAuthor string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd channel>author"`
  28. Items []rssItem `xml:"channel>item"`
  29. }
  30. type rssLink struct {
  31. XMLName xml.Name
  32. Data string `xml:",chardata"`
  33. Href string `xml:"href,attr"`
  34. Rel string `xml:"rel,attr"`
  35. }
  36. type rssCommentLink struct {
  37. XMLName xml.Name
  38. Data string `xml:",chardata"`
  39. }
  40. type rssAuthor struct {
  41. XMLName xml.Name
  42. Data string `xml:",chardata"`
  43. Name string `xml:"name"`
  44. Inner string `xml:",innerxml"`
  45. }
  46. type rssEnclosure struct {
  47. URL string `xml:"url,attr"`
  48. Type string `xml:"type,attr"`
  49. Length string `xml:"length,attr"`
  50. }
  51. func (enclosure *rssEnclosure) Size() int64 {
  52. if enclosure.Length == "" {
  53. return 0
  54. }
  55. size, _ := strconv.ParseInt(enclosure.Length, 10, 0)
  56. return size
  57. }
  58. type rssItem struct {
  59. GUID string `xml:"guid"`
  60. Title string `xml:"title"`
  61. Links []rssLink `xml:"link"`
  62. OriginalLink string `xml:"http://rssnamespace.org/feedburner/ext/1.0 origLink"`
  63. CommentLinks []rssCommentLink `xml:"comments"`
  64. Description string `xml:"description"`
  65. EncodedContent string `xml:"http://purl.org/rss/1.0/modules/content/ encoded"`
  66. PubDate string `xml:"pubDate"`
  67. Date string `xml:"http://purl.org/dc/elements/1.1/ date"`
  68. Authors []rssAuthor `xml:"author"`
  69. Creator string `xml:"http://purl.org/dc/elements/1.1/ creator"`
  70. EnclosureLinks []rssEnclosure `xml:"enclosure"`
  71. OrigEnclosureLink string `xml:"http://rssnamespace.org/feedburner/ext/1.0 origEnclosureLink"`
  72. media.Element
  73. }
  74. func (r *rssFeed) SiteURL() string {
  75. for _, element := range r.Links {
  76. if element.XMLName.Space == "" {
  77. return strings.TrimSpace(element.Data)
  78. }
  79. }
  80. return ""
  81. }
  82. func (r *rssFeed) FeedURL() string {
  83. for _, element := range r.Links {
  84. if element.XMLName.Space == "http://www.w3.org/2005/Atom" {
  85. return strings.TrimSpace(element.Href)
  86. }
  87. }
  88. return ""
  89. }
  90. func (r *rssFeed) Transform() *model.Feed {
  91. feed := new(model.Feed)
  92. feed.SiteURL = r.SiteURL()
  93. feed.FeedURL = r.FeedURL()
  94. feed.Title = strings.TrimSpace(r.Title)
  95. if feed.Title == "" {
  96. feed.Title = feed.SiteURL
  97. }
  98. for _, item := range r.Items {
  99. entry := item.Transform()
  100. if entry.Author == "" && r.ItunesAuthor != "" {
  101. entry.Author = r.ItunesAuthor
  102. }
  103. entry.Author = strings.TrimSpace(sanitizer.StripTags(entry.Author))
  104. if entry.URL == "" {
  105. entry.URL = feed.SiteURL
  106. } else {
  107. entryURL, err := url.AbsoluteURL(feed.SiteURL, entry.URL)
  108. if err == nil {
  109. entry.URL = entryURL
  110. }
  111. }
  112. if entry.Title == "" {
  113. entry.Title = entry.URL
  114. }
  115. feed.Entries = append(feed.Entries, entry)
  116. }
  117. return feed
  118. }
  119. func (r *rssItem) PublishedDate() time.Time {
  120. value := r.PubDate
  121. if r.Date != "" {
  122. value = r.Date
  123. }
  124. if value != "" {
  125. result, err := date.Parse(value)
  126. if err != nil {
  127. logger.Error("rss: %v", err)
  128. return time.Now()
  129. }
  130. return result
  131. }
  132. return time.Now()
  133. }
  134. func (r *rssItem) Author() string {
  135. for _, element := range r.Authors {
  136. if element.Name != "" {
  137. return element.Name
  138. }
  139. if element.Inner != "" {
  140. return element.Inner
  141. }
  142. }
  143. return r.Creator
  144. }
  145. func (r *rssItem) Hash() string {
  146. for _, value := range []string{r.GUID, r.URL()} {
  147. if value != "" {
  148. return crypto.Hash(value)
  149. }
  150. }
  151. return ""
  152. }
  153. func (r *rssItem) Content() string {
  154. if r.EncodedContent != "" {
  155. return r.EncodedContent
  156. }
  157. return r.Description
  158. }
  159. func (r *rssItem) URL() string {
  160. if r.OriginalLink != "" {
  161. return r.OriginalLink
  162. }
  163. for _, link := range r.Links {
  164. if link.XMLName.Space == "http://www.w3.org/2005/Atom" && link.Href != "" && isValidLinkRelation(link.Rel) {
  165. return strings.TrimSpace(link.Href)
  166. }
  167. if link.Data != "" {
  168. return strings.TrimSpace(link.Data)
  169. }
  170. }
  171. return ""
  172. }
  173. func (r *rssItem) Enclosures() model.EnclosureList {
  174. enclosures := make(model.EnclosureList, 0)
  175. duplicates := make(map[string]bool, 0)
  176. for _, mediaThumbnail := range r.AllMediaThumbnails() {
  177. if _, found := duplicates[mediaThumbnail.URL]; !found {
  178. duplicates[mediaThumbnail.URL] = true
  179. enclosures = append(enclosures, &model.Enclosure{
  180. URL: mediaThumbnail.URL,
  181. MimeType: mediaThumbnail.MimeType(),
  182. Size: mediaThumbnail.Size(),
  183. })
  184. }
  185. }
  186. for _, enclosure := range r.EnclosureLinks {
  187. enclosureURL := enclosure.URL
  188. if r.OrigEnclosureLink != "" {
  189. filename := path.Base(r.OrigEnclosureLink)
  190. if strings.Contains(enclosureURL, filename) {
  191. enclosureURL = r.OrigEnclosureLink
  192. }
  193. }
  194. if _, found := duplicates[enclosureURL]; !found {
  195. duplicates[enclosureURL] = true
  196. enclosures = append(enclosures, &model.Enclosure{
  197. URL: enclosureURL,
  198. MimeType: enclosure.Type,
  199. Size: enclosure.Size(),
  200. })
  201. }
  202. }
  203. for _, mediaContent := range r.AllMediaContents() {
  204. if _, found := duplicates[mediaContent.URL]; !found {
  205. duplicates[mediaContent.URL] = true
  206. enclosures = append(enclosures, &model.Enclosure{
  207. URL: mediaContent.URL,
  208. MimeType: mediaContent.MimeType(),
  209. Size: mediaContent.Size(),
  210. })
  211. }
  212. }
  213. for _, mediaPeerLink := range r.AllMediaPeerLinks() {
  214. if _, found := duplicates[mediaPeerLink.URL]; !found {
  215. duplicates[mediaPeerLink.URL] = true
  216. enclosures = append(enclosures, &model.Enclosure{
  217. URL: mediaPeerLink.URL,
  218. MimeType: mediaPeerLink.MimeType(),
  219. Size: mediaPeerLink.Size(),
  220. })
  221. }
  222. }
  223. return enclosures
  224. }
  225. func (r *rssItem) CommentsURL() string {
  226. for _, commentLink := range r.CommentLinks {
  227. if commentLink.XMLName.Space == "" {
  228. return strings.TrimSpace(commentLink.Data)
  229. }
  230. }
  231. return ""
  232. }
  233. func (r *rssItem) Transform() *model.Entry {
  234. entry := new(model.Entry)
  235. entry.URL = r.URL()
  236. entry.CommentsURL = r.CommentsURL()
  237. entry.Date = r.PublishedDate()
  238. entry.Author = r.Author()
  239. entry.Hash = r.Hash()
  240. entry.Content = r.Content()
  241. entry.Title = strings.TrimSpace(r.Title)
  242. entry.Enclosures = r.Enclosures()
  243. return entry
  244. }
  245. func isValidLinkRelation(rel string) bool {
  246. switch rel {
  247. case "", "alternate", "enclosure", "related", "self", "via":
  248. return true
  249. default:
  250. if strings.HasPrefix(rel, "http") {
  251. return true
  252. }
  253. return false
  254. }
  255. }