rss.go 8.4 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380
  1. // Copyright 2017 Frédéric Guillot. All rights reserved.
  2. // Use of this source code is governed by the Apache 2.0
  3. // license that can be found in the LICENSE file.
  4. package rss // import "miniflux.app/reader/rss"
  5. import (
  6. "encoding/xml"
  7. "path"
  8. "strconv"
  9. "strings"
  10. "time"
  11. "miniflux.app/crypto"
  12. "miniflux.app/logger"
  13. "miniflux.app/model"
  14. "miniflux.app/reader/date"
  15. "miniflux.app/reader/media"
  16. "miniflux.app/reader/sanitizer"
  17. "miniflux.app/url"
  18. )
  19. // Specs: https://cyber.harvard.edu/rss/rss.html
// rssFeed is the decoding target for an RSS document. XMLName and Version
// come from the root <rss> element; every other tagged field is read from a
// child of <channel>.
type rssFeed struct {
	XMLName xml.Name `xml:"rss"`
	Version string `xml:"version,attr"`
	Title string `xml:"channel>title"`
	// Links holds the channel-level <link> elements; siteURL and feedURL
	// choose between them by XML namespace.
	Links []rssLink `xml:"channel>link"`
	Language string `xml:"channel>language"`
	Description string `xml:"channel>description"`
	PubDate string `xml:"channel>pubDate"`
	// ManagingEditor and Webmaster are the author candidates consulted by
	// feedAuthor.
	ManagingEditor string `xml:"channel>managingEditor"`
	Webmaster string `xml:"channel>webMaster"`
	// Items holds one rssItem per <item> element of the channel.
	Items []rssItem `xml:"channel>item"`
	// PodcastFeedElement is declared outside this file; presumably it supplies
	// the PodcastAuthor method used by feedAuthor (confirm against its
	// definition).
	PodcastFeedElement
}
  33. func (r *rssFeed) Transform(baseURL string) *model.Feed {
  34. var err error
  35. feed := new(model.Feed)
  36. siteURL := r.siteURL()
  37. feed.SiteURL, err = url.AbsoluteURL(baseURL, siteURL)
  38. if err != nil {
  39. feed.SiteURL = siteURL
  40. }
  41. feedURL := r.feedURL()
  42. feed.FeedURL, err = url.AbsoluteURL(baseURL, feedURL)
  43. if err != nil {
  44. feed.FeedURL = feedURL
  45. }
  46. feed.Title = strings.TrimSpace(r.Title)
  47. if feed.Title == "" {
  48. feed.Title = feed.SiteURL
  49. }
  50. for _, item := range r.Items {
  51. entry := item.Transform()
  52. if entry.Author == "" {
  53. entry.Author = r.feedAuthor()
  54. }
  55. if entry.URL == "" {
  56. entry.URL = feed.SiteURL
  57. } else {
  58. entryURL, err := url.AbsoluteURL(feed.SiteURL, entry.URL)
  59. if err == nil {
  60. entry.URL = entryURL
  61. }
  62. }
  63. if entry.Title == "" {
  64. entry.Title = entry.URL
  65. }
  66. feed.Entries = append(feed.Entries, entry)
  67. }
  68. return feed
  69. }
  70. func (r *rssFeed) siteURL() string {
  71. for _, element := range r.Links {
  72. if element.XMLName.Space == "" {
  73. return strings.TrimSpace(element.Data)
  74. }
  75. }
  76. return ""
  77. }
  78. func (r *rssFeed) feedURL() string {
  79. for _, element := range r.Links {
  80. if element.XMLName.Space == "http://www.w3.org/2005/Atom" {
  81. return strings.TrimSpace(element.Href)
  82. }
  83. }
  84. return ""
  85. }
  86. func (r rssFeed) feedAuthor() string {
  87. author := r.PodcastAuthor()
  88. switch {
  89. case r.ManagingEditor != "":
  90. author = r.ManagingEditor
  91. case r.Webmaster != "":
  92. author = r.Webmaster
  93. }
  94. return sanitizer.StripTags(strings.TrimSpace(author))
  95. }
// rssLink captures one <link> element, at channel or item level.
type rssLink struct {
	// XMLName records the element name and namespace as decoded; callers in
	// this file compare its Space to tell plain RSS links from Atom links.
	XMLName xml.Name
	// Data is the element's character data.
	Data string `xml:",chardata"`
	// Href and Rel are the element's href and rel attributes.
	Href string `xml:"href,attr"`
	Rel string `xml:"rel,attr"`
}
// rssCommentLink captures one <comments> element of an item.
type rssCommentLink struct {
	// XMLName records the element name and namespace as decoded;
	// entryCommentsURL only accepts elements whose namespace is empty.
	XMLName xml.Name
	// Data is the element's character data.
	Data string `xml:",chardata"`
}
// rssAuthor captures one <author> element of an item. The element is decoded
// in several shapes at once so that entryAuthor can pick the one matching
// the element's namespace.
type rssAuthor struct {
	// XMLName records the element name and namespace as decoded.
	XMLName xml.Name
	// Data is the element's character data.
	Data string `xml:",chardata"`
	// Name and Email are the contents of the <name> and <email> children.
	Name string `xml:"name"`
	Email string `xml:"email"`
	// Inner is the raw XML nested inside the element.
	Inner string `xml:",innerxml"`
}
// rssTitle captures one <title> element of an item.
type rssTitle struct {
	// XMLName records the element name and namespace as decoded; entryTitle
	// uses its Space to skip titles from the media namespace.
	XMLName xml.Name
	// Data is the element's character data.
	Data string `xml:",chardata"`
	// Inner is the raw XML nested inside the element.
	Inner string `xml:",innerxml"`
}
  118. type rssEnclosure struct {
  119. URL string `xml:"url,attr"`
  120. Type string `xml:"type,attr"`
  121. Length string `xml:"length,attr"`
  122. }
  123. func (enclosure *rssEnclosure) Size() int64 {
  124. if enclosure.Length == "" {
  125. return 0
  126. }
  127. size, _ := strconv.ParseInt(enclosure.Length, 10, 0)
  128. return size
  129. }
// rssItem is the decoding target for one <item> element of the channel.
type rssItem struct {
	GUID string `xml:"guid"`
	// Title and Links are slices: entryTitle and entryURL walk them and
	// select an element by its XML namespace.
	Title []rssTitle `xml:"title"`
	Links []rssLink `xml:"link"`
	Description string `xml:"description"`
	PubDate string `xml:"pubDate"`
	// Authors holds every <author> element; entryAuthor interprets each one
	// according to its namespace.
	Authors []rssAuthor `xml:"author"`
	// CommentLinks holds every <comments> element; see entryCommentsURL.
	CommentLinks []rssCommentLink `xml:"comments"`
	// EnclosureLinks holds every <enclosure> element; see entryEnclosures.
	EnclosureLinks []rssEnclosure `xml:"enclosure"`
	// The embedded types below are declared outside this file. Presumably
	// they supply the DublinCore*, FeedBurner*, PodcastDescription and
	// AllMedia* members used by the methods of rssItem (confirm against
	// their definitions).
	DublinCoreElement
	FeedBurnerElement
	PodcastEntryElement
	media.Element
}
  144. func (r *rssItem) Transform() *model.Entry {
  145. entry := new(model.Entry)
  146. entry.URL = r.entryURL()
  147. entry.CommentsURL = r.entryCommentsURL()
  148. entry.Date = r.entryDate()
  149. entry.Author = r.entryAuthor()
  150. entry.Hash = r.entryHash()
  151. entry.Content = r.entryContent()
  152. entry.Title = r.entryTitle()
  153. entry.Enclosures = r.entryEnclosures()
  154. return entry
  155. }
  156. func (r *rssItem) entryDate() time.Time {
  157. value := r.PubDate
  158. if r.DublinCoreDate != "" {
  159. value = r.DublinCoreDate
  160. }
  161. if value != "" {
  162. result, err := date.Parse(value)
  163. if err != nil {
  164. logger.Error("rss: %v (entry GUID = %s)", err, r.GUID)
  165. return time.Now()
  166. }
  167. return result
  168. }
  169. return time.Now()
  170. }
  171. func (r *rssItem) entryAuthor() string {
  172. author := ""
  173. for _, rssAuthor := range r.Authors {
  174. switch rssAuthor.XMLName.Space {
  175. case "http://www.itunes.com/dtds/podcast-1.0.dtd", "http://www.google.com/schemas/play-podcasts/1.0":
  176. author = rssAuthor.Data
  177. case "http://www.w3.org/2005/Atom":
  178. if rssAuthor.Name != "" {
  179. author = rssAuthor.Name
  180. } else if rssAuthor.Email != "" {
  181. author = rssAuthor.Email
  182. }
  183. default:
  184. if rssAuthor.Name != "" {
  185. author = rssAuthor.Name
  186. } else {
  187. author = rssAuthor.Inner
  188. }
  189. }
  190. }
  191. if author == "" {
  192. author = r.DublinCoreCreator
  193. }
  194. return sanitizer.StripTags(strings.TrimSpace(author))
  195. }
  196. func (r *rssItem) entryHash() string {
  197. for _, value := range []string{r.GUID, r.entryURL()} {
  198. if value != "" {
  199. return crypto.Hash(value)
  200. }
  201. }
  202. return ""
  203. }
  204. func (r *rssItem) entryTitle() string {
  205. var title string
  206. for _, rssTitle := range r.Title {
  207. switch rssTitle.XMLName.Space {
  208. case "http://search.yahoo.com/mrss/":
  209. // Ignore title in media namespace
  210. case "http://purl.org/dc/elements/1.1/":
  211. title = rssTitle.Data
  212. default:
  213. title = rssTitle.Data
  214. }
  215. if title != "" {
  216. break
  217. }
  218. }
  219. return strings.TrimSpace(title)
  220. }
  221. func (r *rssItem) entryContent() string {
  222. for _, value := range []string{r.DublinCoreContent, r.Description, r.PodcastDescription()} {
  223. if value != "" {
  224. return value
  225. }
  226. }
  227. return ""
  228. }
  229. func (r *rssItem) entryURL() string {
  230. if r.FeedBurnerLink != "" {
  231. return r.FeedBurnerLink
  232. }
  233. for _, link := range r.Links {
  234. if link.XMLName.Space == "http://www.w3.org/2005/Atom" && link.Href != "" && isValidLinkRelation(link.Rel) {
  235. return strings.TrimSpace(link.Href)
  236. }
  237. if link.Data != "" {
  238. return strings.TrimSpace(link.Data)
  239. }
  240. }
  241. return ""
  242. }
  243. func (r *rssItem) entryEnclosures() model.EnclosureList {
  244. enclosures := make(model.EnclosureList, 0)
  245. duplicates := make(map[string]bool, 0)
  246. for _, mediaThumbnail := range r.AllMediaThumbnails() {
  247. if _, found := duplicates[mediaThumbnail.URL]; !found {
  248. duplicates[mediaThumbnail.URL] = true
  249. enclosures = append(enclosures, &model.Enclosure{
  250. URL: mediaThumbnail.URL,
  251. MimeType: mediaThumbnail.MimeType(),
  252. Size: mediaThumbnail.Size(),
  253. })
  254. }
  255. }
  256. for _, enclosure := range r.EnclosureLinks {
  257. enclosureURL := enclosure.URL
  258. if r.FeedBurnerEnclosureLink != "" {
  259. filename := path.Base(r.FeedBurnerEnclosureLink)
  260. if strings.Contains(enclosureURL, filename) {
  261. enclosureURL = r.FeedBurnerEnclosureLink
  262. }
  263. }
  264. if enclosureURL == "" {
  265. continue
  266. }
  267. if _, found := duplicates[enclosureURL]; !found {
  268. duplicates[enclosureURL] = true
  269. enclosures = append(enclosures, &model.Enclosure{
  270. URL: enclosureURL,
  271. MimeType: enclosure.Type,
  272. Size: enclosure.Size(),
  273. })
  274. }
  275. }
  276. for _, mediaContent := range r.AllMediaContents() {
  277. if _, found := duplicates[mediaContent.URL]; !found {
  278. duplicates[mediaContent.URL] = true
  279. enclosures = append(enclosures, &model.Enclosure{
  280. URL: mediaContent.URL,
  281. MimeType: mediaContent.MimeType(),
  282. Size: mediaContent.Size(),
  283. })
  284. }
  285. }
  286. for _, mediaPeerLink := range r.AllMediaPeerLinks() {
  287. if _, found := duplicates[mediaPeerLink.URL]; !found {
  288. duplicates[mediaPeerLink.URL] = true
  289. enclosures = append(enclosures, &model.Enclosure{
  290. URL: mediaPeerLink.URL,
  291. MimeType: mediaPeerLink.MimeType(),
  292. Size: mediaPeerLink.Size(),
  293. })
  294. }
  295. }
  296. return enclosures
  297. }
  298. func (r *rssItem) entryCommentsURL() string {
  299. for _, commentLink := range r.CommentLinks {
  300. if commentLink.XMLName.Space == "" {
  301. commentsURL := strings.TrimSpace(commentLink.Data)
  302. // The comments URL is supposed to be absolute (some feeds publishes incorrect comments URL)
  303. // See https://cyber.harvard.edu/rss/rss.html#ltcommentsgtSubelementOfLtitemgt
  304. if url.IsAbsoluteURL(commentsURL) {
  305. return commentsURL
  306. }
  307. }
  308. }
  309. return ""
  310. }
  311. func isValidLinkRelation(rel string) bool {
  312. switch rel {
  313. case "", "alternate", "enclosure", "related", "self", "via":
  314. return true
  315. default:
  316. if strings.HasPrefix(rel, "http") {
  317. return true
  318. }
  319. return false
  320. }
  321. }