rss.go 9.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428
  1. // SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
  2. // SPDX-License-Identifier: Apache-2.0
  3. package rss // import "miniflux.app/v2/internal/reader/rss"
  4. import (
  5. "encoding/xml"
  6. "html"
  7. "path"
  8. "strconv"
  9. "strings"
  10. "time"
  11. "miniflux.app/v2/internal/crypto"
  12. "miniflux.app/v2/internal/logger"
  13. "miniflux.app/v2/internal/model"
  14. "miniflux.app/v2/internal/reader/date"
  15. "miniflux.app/v2/internal/reader/media"
  16. "miniflux.app/v2/internal/reader/sanitizer"
  17. "miniflux.app/v2/internal/urllib"
  18. )
// rssFeed is the decoding target for an RSS document: the <rss> root element
// together with the <channel> children that this package reads.
//
// Specs: https://cyber.harvard.edu/rss/rss.html
type rssFeed struct {
	XMLName xml.Name `xml:"rss"`
	// Version is the "version" attribute of the root element.
	Version string `xml:"version,attr"`
	// Title is the channel title; Transform trims and HTML-unescapes it.
	Title string `xml:"channel>title"`
	// Links holds the channel-level <link> elements. siteURL picks the first one
	// without a namespace and feedURL the first one in the Atom namespace.
	Links []rssLink `xml:"channel>link"`
	// ImageURL is the <url> of the channel <image>; Transform uses it as the icon URL.
	ImageURL    string `xml:"channel>image>url"`
	Language    string `xml:"channel>language"`
	Description string `xml:"channel>description"`
	PubDate     string `xml:"channel>pubDate"`
	// ManagingEditor and Webmaster are the feed-level author candidates
	// consulted by feedAuthor, in that order.
	ManagingEditor string `xml:"channel>managingEditor"`
	Webmaster      string `xml:"channel>webMaster"`
	// Items holds one element per <item> of the channel.
	Items []rssItem `xml:"channel>item"`
	// PodcastFeedElement is declared elsewhere in the package; feedAuthor relies
	// on its PodcastAuthor method.
	PodcastFeedElement
}
  34. func (r *rssFeed) Transform(baseURL string) *model.Feed {
  35. var err error
  36. feed := new(model.Feed)
  37. siteURL := r.siteURL()
  38. feed.SiteURL, err = urllib.AbsoluteURL(baseURL, siteURL)
  39. if err != nil {
  40. feed.SiteURL = siteURL
  41. }
  42. feedURL := r.feedURL()
  43. feed.FeedURL, err = urllib.AbsoluteURL(baseURL, feedURL)
  44. if err != nil {
  45. feed.FeedURL = feedURL
  46. }
  47. feed.Title = html.UnescapeString(strings.TrimSpace(r.Title))
  48. if feed.Title == "" {
  49. feed.Title = feed.SiteURL
  50. }
  51. feed.IconURL = strings.TrimSpace(r.ImageURL)
  52. for _, item := range r.Items {
  53. entry := item.Transform()
  54. if entry.Author == "" {
  55. entry.Author = r.feedAuthor()
  56. }
  57. if entry.URL == "" {
  58. entry.URL = feed.SiteURL
  59. } else {
  60. entryURL, err := urllib.AbsoluteURL(feed.SiteURL, entry.URL)
  61. if err == nil {
  62. entry.URL = entryURL
  63. }
  64. }
  65. if entry.Title == "" {
  66. entry.Title = sanitizer.TruncateHTML(entry.Content, 100)
  67. }
  68. if entry.Title == "" {
  69. entry.Title = entry.URL
  70. }
  71. feed.Entries = append(feed.Entries, entry)
  72. }
  73. return feed
  74. }
  75. func (r *rssFeed) siteURL() string {
  76. for _, element := range r.Links {
  77. if element.XMLName.Space == "" {
  78. return strings.TrimSpace(element.Data)
  79. }
  80. }
  81. return ""
  82. }
  83. func (r *rssFeed) feedURL() string {
  84. for _, element := range r.Links {
  85. if element.XMLName.Space == "http://www.w3.org/2005/Atom" {
  86. return strings.TrimSpace(element.Href)
  87. }
  88. }
  89. return ""
  90. }
  91. func (r rssFeed) feedAuthor() string {
  92. author := r.PodcastAuthor()
  93. switch {
  94. case r.ManagingEditor != "":
  95. author = r.ManagingEditor
  96. case r.Webmaster != "":
  97. author = r.Webmaster
  98. }
  99. return sanitizer.StripTags(strings.TrimSpace(author))
  100. }
// rssGUID holds an item's <guid> element.
type rssGUID struct {
	XMLName xml.Name
	// Data is the character data of the element, i.e. the identifier itself.
	Data string `xml:",chardata"`
	// IsPermaLink is the raw "isPermaLink" attribute. entryURL treats both
	// "true" and an absent attribute (empty string) as "the GUID is a URL".
	IsPermaLink string `xml:"isPermaLink,attr"`
}
// rssLink holds a <link> element, at channel or item level.
//
// Callers in this file read Data for links without a namespace and Href/Rel for
// links whose XMLName is in the Atom namespace.
type rssLink struct {
	// XMLName records the element name and namespace so callers can tell the
	// different kinds of <link> apart.
	XMLName xml.Name
	// Data is the character data of the element.
	Data string `xml:",chardata"`
	// Href is the "href" attribute.
	Href string `xml:"href,attr"`
	// Rel is the "rel" attribute; see isValidLinkRelation for accepted values.
	Rel string `xml:"rel,attr"`
}
// rssCommentLink holds an item's <comments> element.
type rssCommentLink struct {
	// XMLName records the namespace; entryCommentsURL only accepts elements
	// that have none.
	XMLName xml.Name
	// Data is the character data of the element, expected to be a URL.
	Data string `xml:",chardata"`
}
// rssAuthor holds an item's <author> element in any of the shapes handled by
// entryAuthor: plain text, CDATA, or nested <name>/<email> children.
type rssAuthor struct {
	// XMLName records the namespace, which entryAuthor uses to decide how to
	// read the element.
	XMLName xml.Name
	// Data is the character data of the element.
	Data string `xml:",chardata"`
	// Name is the text of a nested <name> child, if any.
	Name string `xml:"name"`
	// Email is the text of a nested <email> child, if any.
	Email string `xml:"email"`
	// Inner is the raw, unparsed inner XML of the element.
	Inner string `xml:",innerxml"`
}
// rssTitle holds an item's <title> element.
type rssTitle struct {
	// XMLName records the namespace; entryTitle ignores titles declared in the
	// media namespace.
	XMLName xml.Name
	// Data is the character data of the element.
	Data string `xml:",chardata"`
	// Inner is the raw, unparsed inner XML of the element.
	Inner string `xml:",innerxml"`
}
// rssEnclosure holds the attributes of an item's <enclosure> element.
type rssEnclosure struct {
	// URL is the "url" attribute.
	URL string `xml:"url,attr"`
	// Type is the "type" attribute; entryEnclosures uses it as the MIME type.
	Type string `xml:"type,attr"`
	// Length is the raw "length" attribute; use Size to get it as a number.
	Length string `xml:"length,attr"`
}
// rssCategory holds an item's <category> element.
type rssCategory struct {
	XMLName xml.Name
	// Data is the character data of the element.
	Data string `xml:",chardata"`
	// Inner is the raw, unparsed inner XML of the element; entryCategories
	// inspects it to detect CDATA sections.
	Inner string `xml:",innerxml"`
}
  138. func (enclosure *rssEnclosure) Size() int64 {
  139. if enclosure.Length == "" {
  140. return 0
  141. }
  142. size, _ := strconv.ParseInt(enclosure.Length, 10, 0)
  143. return size
  144. }
// rssItem is the decoding target for one <item> of the channel.
//
// Several fields are slices because the same element name can occur more than
// once, possibly in different namespaces; the entry* methods choose among them.
type rssItem struct {
	GUID rssGUID `xml:"guid"`
	// Title holds every <title> element of the item; see entryTitle.
	Title []rssTitle `xml:"title"`
	// Links holds every <link> element of the item; see entryURL.
	Links       []rssLink `xml:"link"`
	Description string    `xml:"description"`
	PubDate     string    `xml:"pubDate"`
	// Authors holds every <author> element of the item; see entryAuthor.
	Authors []rssAuthor `xml:"author"`
	// CommentLinks holds every <comments> element of the item; see entryCommentsURL.
	CommentLinks []rssCommentLink `xml:"comments"`
	// EnclosureLinks holds every <enclosure> element of the item; see entryEnclosures.
	EnclosureLinks []rssEnclosure `xml:"enclosure"`
	// Categories holds every <category> element of the item; see entryCategories.
	Categories []rssCategory `xml:"category"`
	// The embedded types below are declared outside this file. The methods of
	// rssItem rely on names promoted from them (DublinCoreDate,
	// DublinCoreCreator, DublinCoreContent, FeedBurnerLink,
	// FeedBurnerEnclosureLink, Duration, PodcastDescription, AllMediaThumbnails,
	// AllMediaContents, AllMediaPeerLinks).
	// NOTE(review): which embedded type provides which name is presumably
	// implied by the prefixes — confirm against their declarations.
	DublinCoreElement
	FeedBurnerElement
	PodcastEntryElement
	media.Element
}
  160. func (r *rssItem) Transform() *model.Entry {
  161. entry := new(model.Entry)
  162. entry.URL = r.entryURL()
  163. entry.CommentsURL = r.entryCommentsURL()
  164. entry.Date = r.entryDate()
  165. entry.Author = r.entryAuthor()
  166. entry.Hash = r.entryHash()
  167. entry.Content = r.entryContent()
  168. entry.Title = r.entryTitle()
  169. entry.Enclosures = r.entryEnclosures()
  170. entry.Tags = r.entryCategories()
  171. if duration, err := normalizeDuration(r.Duration); err == nil {
  172. entry.ReadingTime = duration
  173. }
  174. return entry
  175. }
  176. func (r *rssItem) entryDate() time.Time {
  177. value := r.PubDate
  178. if r.DublinCoreDate != "" {
  179. value = r.DublinCoreDate
  180. }
  181. if value != "" {
  182. result, err := date.Parse(value)
  183. if err != nil {
  184. logger.Error("rss: %v (entry GUID = %s)", err, r.GUID)
  185. return time.Now()
  186. }
  187. return result
  188. }
  189. return time.Now()
  190. }
  191. func (r *rssItem) entryAuthor() string {
  192. author := ""
  193. for _, rssAuthor := range r.Authors {
  194. switch rssAuthor.XMLName.Space {
  195. case "http://www.itunes.com/dtds/podcast-1.0.dtd", "http://www.google.com/schemas/play-podcasts/1.0":
  196. author = rssAuthor.Data
  197. case "http://www.w3.org/2005/Atom":
  198. if rssAuthor.Name != "" {
  199. author = rssAuthor.Name
  200. } else if rssAuthor.Email != "" {
  201. author = rssAuthor.Email
  202. }
  203. default:
  204. if rssAuthor.Name != "" {
  205. author = rssAuthor.Name
  206. } else if strings.Contains(rssAuthor.Inner, "<![CDATA[") {
  207. author = rssAuthor.Data
  208. } else {
  209. author = rssAuthor.Inner
  210. }
  211. }
  212. }
  213. if author == "" {
  214. author = r.DublinCoreCreator
  215. }
  216. return sanitizer.StripTags(strings.TrimSpace(author))
  217. }
  218. func (r *rssItem) entryHash() string {
  219. for _, value := range []string{r.GUID.Data, r.entryURL()} {
  220. if value != "" {
  221. return crypto.Hash(value)
  222. }
  223. }
  224. return ""
  225. }
  226. func (r *rssItem) entryTitle() string {
  227. var title string
  228. for _, rssTitle := range r.Title {
  229. switch rssTitle.XMLName.Space {
  230. case "http://search.yahoo.com/mrss/":
  231. // Ignore title in media namespace
  232. case "http://purl.org/dc/elements/1.1/":
  233. title = rssTitle.Data
  234. default:
  235. title = rssTitle.Data
  236. }
  237. if title != "" {
  238. break
  239. }
  240. }
  241. return html.UnescapeString(strings.TrimSpace(title))
  242. }
  243. func (r *rssItem) entryContent() string {
  244. for _, value := range []string{r.DublinCoreContent, r.Description, r.PodcastDescription()} {
  245. if value != "" {
  246. return value
  247. }
  248. }
  249. return ""
  250. }
  251. func (r *rssItem) entryURL() string {
  252. if r.FeedBurnerLink != "" {
  253. return r.FeedBurnerLink
  254. }
  255. for _, link := range r.Links {
  256. if link.XMLName.Space == "http://www.w3.org/2005/Atom" && link.Href != "" && isValidLinkRelation(link.Rel) {
  257. return strings.TrimSpace(link.Href)
  258. }
  259. if link.Data != "" {
  260. return strings.TrimSpace(link.Data)
  261. }
  262. }
  263. // Specs: https://cyber.harvard.edu/rss/rss.html#ltguidgtSubelementOfLtitemgt
  264. // isPermaLink is optional, its default value is true.
  265. // If its value is false, the guid may not be assumed to be a url, or a url to anything in particular.
  266. if r.GUID.IsPermaLink == "true" || r.GUID.IsPermaLink == "" {
  267. return strings.TrimSpace(r.GUID.Data)
  268. }
  269. return ""
  270. }
  271. func (r *rssItem) entryEnclosures() model.EnclosureList {
  272. enclosures := make(model.EnclosureList, 0)
  273. duplicates := make(map[string]bool)
  274. for _, mediaThumbnail := range r.AllMediaThumbnails() {
  275. if _, found := duplicates[mediaThumbnail.URL]; !found {
  276. duplicates[mediaThumbnail.URL] = true
  277. enclosures = append(enclosures, &model.Enclosure{
  278. URL: mediaThumbnail.URL,
  279. MimeType: mediaThumbnail.MimeType(),
  280. Size: mediaThumbnail.Size(),
  281. })
  282. }
  283. }
  284. for _, enclosure := range r.EnclosureLinks {
  285. enclosureURL := enclosure.URL
  286. if r.FeedBurnerEnclosureLink != "" {
  287. filename := path.Base(r.FeedBurnerEnclosureLink)
  288. if strings.Contains(enclosureURL, filename) {
  289. enclosureURL = r.FeedBurnerEnclosureLink
  290. }
  291. }
  292. if enclosureURL == "" {
  293. continue
  294. }
  295. if _, found := duplicates[enclosureURL]; !found {
  296. duplicates[enclosureURL] = true
  297. enclosures = append(enclosures, &model.Enclosure{
  298. URL: enclosureURL,
  299. MimeType: enclosure.Type,
  300. Size: enclosure.Size(),
  301. })
  302. }
  303. }
  304. for _, mediaContent := range r.AllMediaContents() {
  305. if _, found := duplicates[mediaContent.URL]; !found {
  306. duplicates[mediaContent.URL] = true
  307. enclosures = append(enclosures, &model.Enclosure{
  308. URL: mediaContent.URL,
  309. MimeType: mediaContent.MimeType(),
  310. Size: mediaContent.Size(),
  311. })
  312. }
  313. }
  314. for _, mediaPeerLink := range r.AllMediaPeerLinks() {
  315. if _, found := duplicates[mediaPeerLink.URL]; !found {
  316. duplicates[mediaPeerLink.URL] = true
  317. enclosures = append(enclosures, &model.Enclosure{
  318. URL: mediaPeerLink.URL,
  319. MimeType: mediaPeerLink.MimeType(),
  320. Size: mediaPeerLink.Size(),
  321. })
  322. }
  323. }
  324. return enclosures
  325. }
  326. func (r *rssItem) entryCategories() []string {
  327. var categoryList []string
  328. for _, rssCategory := range r.Categories {
  329. if strings.Contains(rssCategory.Inner, "<![CDATA[") {
  330. categoryList = append(categoryList, strings.TrimSpace(rssCategory.Data))
  331. } else {
  332. categoryList = append(categoryList, strings.TrimSpace(rssCategory.Inner))
  333. }
  334. }
  335. return categoryList
  336. }
  337. func (r *rssItem) entryCommentsURL() string {
  338. for _, commentLink := range r.CommentLinks {
  339. if commentLink.XMLName.Space == "" {
  340. commentsURL := strings.TrimSpace(commentLink.Data)
  341. // The comments URL is supposed to be absolute (some feeds publishes incorrect comments URL)
  342. // See https://cyber.harvard.edu/rss/rss.html#ltcommentsgtSubelementOfLtitemgt
  343. if urllib.IsAbsoluteURL(commentsURL) {
  344. return commentsURL
  345. }
  346. }
  347. }
  348. return ""
  349. }
  350. func isValidLinkRelation(rel string) bool {
  351. switch rel {
  352. case "", "alternate", "enclosure", "related", "self", "via":
  353. return true
  354. default:
  355. if strings.HasPrefix(rel, "http") {
  356. return true
  357. }
  358. return false
  359. }
  360. }