rss.go 8.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387
  1. // Copyright 2017 Frédéric Guillot. All rights reserved.
  2. // Use of this source code is governed by the Apache 2.0
  3. // license that can be found in the LICENSE file.
  4. package rss // import "miniflux.app/reader/rss"
  5. import (
  6. "encoding/xml"
  7. "html"
  8. "path"
  9. "strconv"
  10. "strings"
  11. "time"
  12. "miniflux.app/crypto"
  13. "miniflux.app/logger"
  14. "miniflux.app/model"
  15. "miniflux.app/reader/date"
  16. "miniflux.app/reader/media"
  17. "miniflux.app/reader/sanitizer"
  18. "miniflux.app/url"
  19. )
  20. // Specs: https://cyber.harvard.edu/rss/rss.html
  21. type rssFeed struct {
  22. XMLName xml.Name `xml:"rss"`
  23. Version string `xml:"version,attr"`
  24. Title string `xml:"channel>title"`
  25. Links []rssLink `xml:"channel>link"`
  26. Language string `xml:"channel>language"`
  27. Description string `xml:"channel>description"`
  28. PubDate string `xml:"channel>pubDate"`
  29. ManagingEditor string `xml:"channel>managingEditor"`
  30. Webmaster string `xml:"channel>webMaster"`
  31. Items []rssItem `xml:"channel>item"`
  32. PodcastFeedElement
  33. }
  34. func (r *rssFeed) Transform(baseURL string) *model.Feed {
  35. var err error
  36. feed := new(model.Feed)
  37. siteURL := r.siteURL()
  38. feed.SiteURL, err = url.AbsoluteURL(baseURL, siteURL)
  39. if err != nil {
  40. feed.SiteURL = siteURL
  41. }
  42. feedURL := r.feedURL()
  43. feed.FeedURL, err = url.AbsoluteURL(baseURL, feedURL)
  44. if err != nil {
  45. feed.FeedURL = feedURL
  46. }
  47. feed.Title = html.UnescapeString(strings.TrimSpace(r.Title))
  48. if feed.Title == "" {
  49. feed.Title = feed.SiteURL
  50. }
  51. for _, item := range r.Items {
  52. entry := item.Transform()
  53. if entry.Author == "" {
  54. entry.Author = r.feedAuthor()
  55. }
  56. if entry.URL == "" {
  57. entry.URL = feed.SiteURL
  58. } else {
  59. entryURL, err := url.AbsoluteURL(feed.SiteURL, entry.URL)
  60. if err == nil {
  61. entry.URL = entryURL
  62. }
  63. }
  64. if entry.Title == "" {
  65. entry.Title = sanitizer.TruncateHTML(entry.Content, 100)
  66. }
  67. if entry.Title == "" {
  68. entry.Title = entry.URL
  69. }
  70. feed.Entries = append(feed.Entries, entry)
  71. }
  72. return feed
  73. }
  74. func (r *rssFeed) siteURL() string {
  75. for _, element := range r.Links {
  76. if element.XMLName.Space == "" {
  77. return strings.TrimSpace(element.Data)
  78. }
  79. }
  80. return ""
  81. }
  82. func (r *rssFeed) feedURL() string {
  83. for _, element := range r.Links {
  84. if element.XMLName.Space == "http://www.w3.org/2005/Atom" {
  85. return strings.TrimSpace(element.Href)
  86. }
  87. }
  88. return ""
  89. }
  90. func (r rssFeed) feedAuthor() string {
  91. author := r.PodcastAuthor()
  92. switch {
  93. case r.ManagingEditor != "":
  94. author = r.ManagingEditor
  95. case r.Webmaster != "":
  96. author = r.Webmaster
  97. }
  98. return sanitizer.StripTags(strings.TrimSpace(author))
  99. }
  // rssLink captures a <link> element in any namespace.
  //
  // XMLName records the element's name and namespace as decoded. Data holds
  // the element's character data, while Href and Rel hold the attributes of
  // the same names.
  type rssLink struct {
  	XMLName xml.Name
  	Data    string `xml:",chardata"`
  	Href    string `xml:"href,attr"`
  	Rel     string `xml:"rel,attr"`
  }
  // rssCommentLink captures a <comments> element in any namespace: XMLName
  // records the decoded element name and namespace, Data its character data.
  type rssCommentLink struct {
  	XMLName xml.Name
  	Data    string `xml:",chardata"`
  }
  // rssAuthor captures an <author> element in any namespace.
  //
  // Data holds the element's own character data, Name and Email hold the text
  // of <name> and <email> child elements, and Inner keeps the raw inner XML so
  // that callers can inspect the markup as written in the document.
  type rssAuthor struct {
  	XMLName xml.Name
  	Data    string `xml:",chardata"`
  	Name    string `xml:"name"`
  	Email   string `xml:"email"`
  	Inner   string `xml:",innerxml"`
  }
  // rssTitle captures a <title> element in any namespace: XMLName records the
  // decoded element name and namespace, Data the character data, and Inner the
  // raw inner XML of the element.
  type rssTitle struct {
  	XMLName xml.Name
  	Data    string `xml:",chardata"`
  	Inner   string `xml:",innerxml"`
  }
  // rssEnclosure captures the attributes of an <enclosure> element. Length is
  // kept as the raw attribute text and converted on demand by Size.
  type rssEnclosure struct {
  	URL    string `xml:"url,attr"`
  	Type   string `xml:"type,attr"`
  	Length string `xml:"length,attr"`
  }

  // Size returns the enclosure length in bytes as declared by the length
  // attribute.
  //
  // It returns 0 when the attribute is missing, is not a base-10 integer, does
  // not fit in an int64, or is negative. Surrounding whitespace is ignored.
  func (enclosure *rssEnclosure) Size() int64 {
  	length := strings.TrimSpace(enclosure.Length)
  	if length == "" {
  		return 0
  	}

  	// Parse as 64-bit explicitly: a bitSize of 0 means the platform "int",
  	// which clamps values of 2 GiB and more on 32-bit builds.
  	size, err := strconv.ParseInt(length, 10, 64)
  	if err != nil || size < 0 {
  		// Feeds are untrusted input; an unusable length is treated as unknown.
  		return 0
  	}
  	return size
  }
  134. type rssItem struct {
  135. GUID string `xml:"guid"`
  136. Title []rssTitle `xml:"title"`
  137. Links []rssLink `xml:"link"`
  138. Description string `xml:"description"`
  139. PubDate string `xml:"pubDate"`
  140. Authors []rssAuthor `xml:"author"`
  141. CommentLinks []rssCommentLink `xml:"comments"`
  142. EnclosureLinks []rssEnclosure `xml:"enclosure"`
  143. DublinCoreElement
  144. FeedBurnerElement
  145. PodcastEntryElement
  146. media.Element
  147. }
  148. func (r *rssItem) Transform() *model.Entry {
  149. entry := new(model.Entry)
  150. entry.URL = r.entryURL()
  151. entry.CommentsURL = r.entryCommentsURL()
  152. entry.Date = r.entryDate()
  153. entry.Author = r.entryAuthor()
  154. entry.Hash = r.entryHash()
  155. entry.Content = r.entryContent()
  156. entry.Title = r.entryTitle()
  157. entry.Enclosures = r.entryEnclosures()
  158. return entry
  159. }
  160. func (r *rssItem) entryDate() time.Time {
  161. value := r.PubDate
  162. if r.DublinCoreDate != "" {
  163. value = r.DublinCoreDate
  164. }
  165. if value != "" {
  166. result, err := date.Parse(value)
  167. if err != nil {
  168. logger.Error("rss: %v (entry GUID = %s)", err, r.GUID)
  169. return time.Now()
  170. }
  171. return result
  172. }
  173. return time.Now()
  174. }
  175. func (r *rssItem) entryAuthor() string {
  176. author := ""
  177. for _, rssAuthor := range r.Authors {
  178. switch rssAuthor.XMLName.Space {
  179. case "http://www.itunes.com/dtds/podcast-1.0.dtd", "http://www.google.com/schemas/play-podcasts/1.0":
  180. author = rssAuthor.Data
  181. case "http://www.w3.org/2005/Atom":
  182. if rssAuthor.Name != "" {
  183. author = rssAuthor.Name
  184. } else if rssAuthor.Email != "" {
  185. author = rssAuthor.Email
  186. }
  187. default:
  188. if rssAuthor.Name != "" {
  189. author = rssAuthor.Name
  190. } else if strings.Contains(rssAuthor.Inner, "<![CDATA[") {
  191. author = rssAuthor.Data
  192. } else {
  193. author = rssAuthor.Inner
  194. }
  195. }
  196. }
  197. if author == "" {
  198. author = r.DublinCoreCreator
  199. }
  200. return sanitizer.StripTags(strings.TrimSpace(author))
  201. }
  202. func (r *rssItem) entryHash() string {
  203. for _, value := range []string{r.GUID, r.entryURL()} {
  204. if value != "" {
  205. return crypto.Hash(value)
  206. }
  207. }
  208. return ""
  209. }
  210. func (r *rssItem) entryTitle() string {
  211. var title string
  212. for _, rssTitle := range r.Title {
  213. switch rssTitle.XMLName.Space {
  214. case "http://search.yahoo.com/mrss/":
  215. // Ignore title in media namespace
  216. case "http://purl.org/dc/elements/1.1/":
  217. title = rssTitle.Data
  218. default:
  219. title = rssTitle.Data
  220. }
  221. if title != "" {
  222. break
  223. }
  224. }
  225. return html.UnescapeString(strings.TrimSpace(title))
  226. }
  227. func (r *rssItem) entryContent() string {
  228. for _, value := range []string{r.DublinCoreContent, r.Description, r.PodcastDescription()} {
  229. if value != "" {
  230. return value
  231. }
  232. }
  233. return ""
  234. }
  235. func (r *rssItem) entryURL() string {
  236. if r.FeedBurnerLink != "" {
  237. return r.FeedBurnerLink
  238. }
  239. for _, link := range r.Links {
  240. if link.XMLName.Space == "http://www.w3.org/2005/Atom" && link.Href != "" && isValidLinkRelation(link.Rel) {
  241. return strings.TrimSpace(link.Href)
  242. }
  243. if link.Data != "" {
  244. return strings.TrimSpace(link.Data)
  245. }
  246. }
  247. return ""
  248. }
  249. func (r *rssItem) entryEnclosures() model.EnclosureList {
  250. enclosures := make(model.EnclosureList, 0)
  251. duplicates := make(map[string]bool)
  252. for _, mediaThumbnail := range r.AllMediaThumbnails() {
  253. if _, found := duplicates[mediaThumbnail.URL]; !found {
  254. duplicates[mediaThumbnail.URL] = true
  255. enclosures = append(enclosures, &model.Enclosure{
  256. URL: mediaThumbnail.URL,
  257. MimeType: mediaThumbnail.MimeType(),
  258. Size: mediaThumbnail.Size(),
  259. })
  260. }
  261. }
  262. for _, enclosure := range r.EnclosureLinks {
  263. enclosureURL := enclosure.URL
  264. if r.FeedBurnerEnclosureLink != "" {
  265. filename := path.Base(r.FeedBurnerEnclosureLink)
  266. if strings.Contains(enclosureURL, filename) {
  267. enclosureURL = r.FeedBurnerEnclosureLink
  268. }
  269. }
  270. if enclosureURL == "" {
  271. continue
  272. }
  273. if _, found := duplicates[enclosureURL]; !found {
  274. duplicates[enclosureURL] = true
  275. enclosures = append(enclosures, &model.Enclosure{
  276. URL: enclosureURL,
  277. MimeType: enclosure.Type,
  278. Size: enclosure.Size(),
  279. })
  280. }
  281. }
  282. for _, mediaContent := range r.AllMediaContents() {
  283. if _, found := duplicates[mediaContent.URL]; !found {
  284. duplicates[mediaContent.URL] = true
  285. enclosures = append(enclosures, &model.Enclosure{
  286. URL: mediaContent.URL,
  287. MimeType: mediaContent.MimeType(),
  288. Size: mediaContent.Size(),
  289. })
  290. }
  291. }
  292. for _, mediaPeerLink := range r.AllMediaPeerLinks() {
  293. if _, found := duplicates[mediaPeerLink.URL]; !found {
  294. duplicates[mediaPeerLink.URL] = true
  295. enclosures = append(enclosures, &model.Enclosure{
  296. URL: mediaPeerLink.URL,
  297. MimeType: mediaPeerLink.MimeType(),
  298. Size: mediaPeerLink.Size(),
  299. })
  300. }
  301. }
  302. return enclosures
  303. }
  304. func (r *rssItem) entryCommentsURL() string {
  305. for _, commentLink := range r.CommentLinks {
  306. if commentLink.XMLName.Space == "" {
  307. commentsURL := strings.TrimSpace(commentLink.Data)
  308. // The comments URL is supposed to be absolute (some feeds publishes incorrect comments URL)
  309. // See https://cyber.harvard.edu/rss/rss.html#ltcommentsgtSubelementOfLtitemgt
  310. if url.IsAbsoluteURL(commentsURL) {
  311. return commentsURL
  312. }
  313. }
  314. }
  315. return ""
  316. }
  // isValidLinkRelation reports whether rel is accepted as the relation of an
  // entry link: an empty value, one of "alternate", "enclosure", "related",
  // "self" or "via", or any value starting with "http".
  func isValidLinkRelation(rel string) bool {
  	switch rel {
  	case "", "alternate", "enclosure", "related", "self", "via":
  		return true
  	}
  	return strings.HasPrefix(rel, "http")
  }