rss.go 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452
  1. // SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
  2. // SPDX-License-Identifier: Apache-2.0
  3. package rss // import "miniflux.app/v2/internal/reader/rss"
  4. import (
  5. "encoding/xml"
  6. "html"
  7. "log/slog"
  8. "path"
  9. "strconv"
  10. "strings"
  11. "time"
  12. "miniflux.app/v2/internal/crypto"
  13. "miniflux.app/v2/internal/model"
  14. "miniflux.app/v2/internal/reader/date"
  15. "miniflux.app/v2/internal/reader/dublincore"
  16. "miniflux.app/v2/internal/reader/media"
  17. "miniflux.app/v2/internal/reader/sanitizer"
  18. "miniflux.app/v2/internal/urllib"
  19. )
  20. // Specs: https://cyber.harvard.edu/rss/rss.html
  21. type rssFeed struct {
  22. XMLName xml.Name `xml:"rss"`
  23. Version string `xml:"version,attr"`
  24. Title string `xml:"channel>title"`
  25. Links []rssLink `xml:"channel>link"`
  26. ImageURL string `xml:"channel>image>url"`
  27. Language string `xml:"channel>language"`
  28. Description string `xml:"channel>description"`
  29. PubDate string `xml:"channel>pubDate"`
  30. ManagingEditor string `xml:"channel>managingEditor"`
  31. Webmaster string `xml:"channel>webMaster"`
  32. TimeToLive rssTTL `xml:"channel>ttl"`
  33. Items []rssItem `xml:"channel>item"`
  34. PodcastFeedElement
  35. }
  36. type rssTTL struct {
  37. Data string `xml:",chardata"`
  38. }
  39. func (r *rssTTL) Value() int {
  40. if r.Data == "" {
  41. return 0
  42. }
  43. value, err := strconv.Atoi(r.Data)
  44. if err != nil {
  45. return 0
  46. }
  47. return value
  48. }
  49. func (r *rssFeed) Transform(baseURL string) *model.Feed {
  50. var err error
  51. feed := new(model.Feed)
  52. siteURL := r.siteURL()
  53. feed.SiteURL, err = urllib.AbsoluteURL(baseURL, siteURL)
  54. if err != nil {
  55. feed.SiteURL = siteURL
  56. }
  57. feedURL := r.feedURL()
  58. feed.FeedURL, err = urllib.AbsoluteURL(baseURL, feedURL)
  59. if err != nil {
  60. feed.FeedURL = feedURL
  61. }
  62. feed.Title = html.UnescapeString(strings.TrimSpace(r.Title))
  63. if feed.Title == "" {
  64. feed.Title = feed.SiteURL
  65. }
  66. feed.IconURL = strings.TrimSpace(r.ImageURL)
  67. feed.TTL = r.TimeToLive.Value()
  68. for _, item := range r.Items {
  69. entry := item.Transform()
  70. if entry.Author == "" {
  71. entry.Author = r.feedAuthor()
  72. }
  73. if entry.URL == "" {
  74. entry.URL = feed.SiteURL
  75. } else {
  76. entryURL, err := urllib.AbsoluteURL(feed.SiteURL, entry.URL)
  77. if err == nil {
  78. entry.URL = entryURL
  79. }
  80. }
  81. if entry.Title == "" {
  82. entry.Title = sanitizer.TruncateHTML(entry.Content, 100)
  83. }
  84. if entry.Title == "" {
  85. entry.Title = entry.URL
  86. }
  87. feed.Entries = append(feed.Entries, entry)
  88. }
  89. return feed
  90. }
  91. func (r *rssFeed) siteURL() string {
  92. for _, element := range r.Links {
  93. if element.XMLName.Space == "" {
  94. return strings.TrimSpace(element.Data)
  95. }
  96. }
  97. return ""
  98. }
  99. func (r *rssFeed) feedURL() string {
  100. for _, element := range r.Links {
  101. if element.XMLName.Space == "http://www.w3.org/2005/Atom" {
  102. return strings.TrimSpace(element.Href)
  103. }
  104. }
  105. return ""
  106. }
  107. func (r rssFeed) feedAuthor() string {
  108. author := r.PodcastAuthor()
  109. switch {
  110. case r.ManagingEditor != "":
  111. author = r.ManagingEditor
  112. case r.Webmaster != "":
  113. author = r.Webmaster
  114. }
  115. return sanitizer.StripTags(strings.TrimSpace(author))
  116. }
  117. type rssGUID struct {
  118. XMLName xml.Name
  119. Data string `xml:",chardata"`
  120. IsPermaLink string `xml:"isPermaLink,attr"`
  121. }
  122. type rssLink struct {
  123. XMLName xml.Name
  124. Data string `xml:",chardata"`
  125. Href string `xml:"href,attr"`
  126. Rel string `xml:"rel,attr"`
  127. }
  128. type rssCommentLink struct {
  129. XMLName xml.Name
  130. Data string `xml:",chardata"`
  131. }
  132. type rssAuthor struct {
  133. XMLName xml.Name
  134. Data string `xml:",chardata"`
  135. Name string `xml:"name"`
  136. Email string `xml:"email"`
  137. Inner string `xml:",innerxml"`
  138. }
  139. type rssTitle struct {
  140. XMLName xml.Name
  141. Data string `xml:",chardata"`
  142. Inner string `xml:",innerxml"`
  143. }
  144. type rssEnclosure struct {
  145. URL string `xml:"url,attr"`
  146. Type string `xml:"type,attr"`
  147. Length string `xml:"length,attr"`
  148. }
  149. type rssCategory struct {
  150. XMLName xml.Name
  151. Data string `xml:",chardata"`
  152. Inner string `xml:",innerxml"`
  153. }
  154. func (enclosure *rssEnclosure) Size() int64 {
  155. if enclosure.Length == "" {
  156. return 0
  157. }
  158. size, _ := strconv.ParseInt(enclosure.Length, 10, 0)
  159. return size
  160. }
  161. type rssItem struct {
  162. GUID rssGUID `xml:"guid"`
  163. Title []rssTitle `xml:"title"`
  164. Links []rssLink `xml:"link"`
  165. Description string `xml:"description"`
  166. PubDate string `xml:"pubDate"`
  167. Authors []rssAuthor `xml:"author"`
  168. CommentLinks []rssCommentLink `xml:"comments"`
  169. EnclosureLinks []rssEnclosure `xml:"enclosure"`
  170. Categories []rssCategory `xml:"category"`
  171. dublincore.DublinCoreItemElement
  172. FeedBurnerElement
  173. PodcastEntryElement
  174. media.Element
  175. }
  176. func (r *rssItem) Transform() *model.Entry {
  177. entry := model.NewEntry()
  178. entry.URL = r.entryURL()
  179. entry.CommentsURL = r.entryCommentsURL()
  180. entry.Date = r.entryDate()
  181. entry.Author = r.entryAuthor()
  182. entry.Hash = r.entryHash()
  183. entry.Content = r.entryContent()
  184. entry.Title = r.entryTitle()
  185. entry.Enclosures = r.entryEnclosures()
  186. entry.Tags = r.entryCategories()
  187. if duration, err := normalizeDuration(r.Duration); err == nil {
  188. entry.ReadingTime = duration
  189. }
  190. return entry
  191. }
  192. func (r *rssItem) entryDate() time.Time {
  193. value := r.PubDate
  194. if r.DublinCoreDate != "" {
  195. value = r.DublinCoreDate
  196. }
  197. if value != "" {
  198. result, err := date.Parse(value)
  199. if err != nil {
  200. slog.Debug("Unable to parse date from RSS feed",
  201. slog.String("date", value),
  202. slog.String("guid", r.GUID.Data),
  203. slog.Any("error", err),
  204. )
  205. return time.Now()
  206. }
  207. return result
  208. }
  209. return time.Now()
  210. }
  211. func (r *rssItem) entryAuthor() string {
  212. author := ""
  213. for _, rssAuthor := range r.Authors {
  214. switch rssAuthor.XMLName.Space {
  215. case "http://www.itunes.com/dtds/podcast-1.0.dtd", "http://www.google.com/schemas/play-podcasts/1.0":
  216. author = rssAuthor.Data
  217. case "http://www.w3.org/2005/Atom":
  218. if rssAuthor.Name != "" {
  219. author = rssAuthor.Name
  220. } else if rssAuthor.Email != "" {
  221. author = rssAuthor.Email
  222. }
  223. default:
  224. if rssAuthor.Name != "" {
  225. author = rssAuthor.Name
  226. } else if strings.Contains(rssAuthor.Inner, "<![CDATA[") {
  227. author = rssAuthor.Data
  228. } else {
  229. author = rssAuthor.Inner
  230. }
  231. }
  232. }
  233. if author == "" {
  234. author = r.GetSanitizedCreator()
  235. }
  236. return sanitizer.StripTags(strings.TrimSpace(author))
  237. }
  238. func (r *rssItem) entryHash() string {
  239. for _, value := range []string{r.GUID.Data, r.entryURL()} {
  240. if value != "" {
  241. return crypto.Hash(value)
  242. }
  243. }
  244. return ""
  245. }
  246. func (r *rssItem) entryTitle() string {
  247. var title string
  248. for _, rssTitle := range r.Title {
  249. switch rssTitle.XMLName.Space {
  250. case "http://search.yahoo.com/mrss/":
  251. // Ignore title in media namespace
  252. case "http://purl.org/dc/elements/1.1/":
  253. title = rssTitle.Data
  254. default:
  255. title = rssTitle.Data
  256. }
  257. if title != "" {
  258. break
  259. }
  260. }
  261. return html.UnescapeString(strings.TrimSpace(title))
  262. }
  263. func (r *rssItem) entryContent() string {
  264. for _, value := range []string{r.DublinCoreContent, r.Description, r.PodcastDescription()} {
  265. if value != "" {
  266. return value
  267. }
  268. }
  269. return ""
  270. }
  271. func (r *rssItem) entryURL() string {
  272. if r.FeedBurnerLink != "" {
  273. return r.FeedBurnerLink
  274. }
  275. for _, link := range r.Links {
  276. if link.XMLName.Space == "http://www.w3.org/2005/Atom" && link.Href != "" && isValidLinkRelation(link.Rel) {
  277. return strings.TrimSpace(link.Href)
  278. }
  279. if link.Data != "" {
  280. return strings.TrimSpace(link.Data)
  281. }
  282. }
  283. // Specs: https://cyber.harvard.edu/rss/rss.html#ltguidgtSubelementOfLtitemgt
  284. // isPermaLink is optional, its default value is true.
  285. // If its value is false, the guid may not be assumed to be a url, or a url to anything in particular.
  286. if r.GUID.IsPermaLink == "true" || r.GUID.IsPermaLink == "" {
  287. return strings.TrimSpace(r.GUID.Data)
  288. }
  289. return ""
  290. }
  291. func (r *rssItem) entryEnclosures() model.EnclosureList {
  292. enclosures := make(model.EnclosureList, 0)
  293. duplicates := make(map[string]bool)
  294. for _, mediaThumbnail := range r.AllMediaThumbnails() {
  295. if _, found := duplicates[mediaThumbnail.URL]; !found {
  296. duplicates[mediaThumbnail.URL] = true
  297. enclosures = append(enclosures, &model.Enclosure{
  298. URL: mediaThumbnail.URL,
  299. MimeType: mediaThumbnail.MimeType(),
  300. Size: mediaThumbnail.Size(),
  301. })
  302. }
  303. }
  304. for _, enclosure := range r.EnclosureLinks {
  305. enclosureURL := enclosure.URL
  306. if r.FeedBurnerEnclosureLink != "" {
  307. filename := path.Base(r.FeedBurnerEnclosureLink)
  308. if strings.Contains(enclosureURL, filename) {
  309. enclosureURL = r.FeedBurnerEnclosureLink
  310. }
  311. }
  312. if enclosureURL == "" {
  313. continue
  314. }
  315. if _, found := duplicates[enclosureURL]; !found {
  316. duplicates[enclosureURL] = true
  317. enclosures = append(enclosures, &model.Enclosure{
  318. URL: enclosureURL,
  319. MimeType: enclosure.Type,
  320. Size: enclosure.Size(),
  321. })
  322. }
  323. }
  324. for _, mediaContent := range r.AllMediaContents() {
  325. if _, found := duplicates[mediaContent.URL]; !found {
  326. duplicates[mediaContent.URL] = true
  327. enclosures = append(enclosures, &model.Enclosure{
  328. URL: mediaContent.URL,
  329. MimeType: mediaContent.MimeType(),
  330. Size: mediaContent.Size(),
  331. })
  332. }
  333. }
  334. for _, mediaPeerLink := range r.AllMediaPeerLinks() {
  335. if _, found := duplicates[mediaPeerLink.URL]; !found {
  336. duplicates[mediaPeerLink.URL] = true
  337. enclosures = append(enclosures, &model.Enclosure{
  338. URL: mediaPeerLink.URL,
  339. MimeType: mediaPeerLink.MimeType(),
  340. Size: mediaPeerLink.Size(),
  341. })
  342. }
  343. }
  344. return enclosures
  345. }
  346. func (r *rssItem) entryCategories() []string {
  347. categoryList := make([]string, 0)
  348. for _, rssCategory := range r.Categories {
  349. if strings.Contains(rssCategory.Inner, "<![CDATA[") {
  350. categoryList = append(categoryList, strings.TrimSpace(rssCategory.Data))
  351. } else {
  352. categoryList = append(categoryList, strings.TrimSpace(rssCategory.Inner))
  353. }
  354. }
  355. return categoryList
  356. }
  357. func (r *rssItem) entryCommentsURL() string {
  358. for _, commentLink := range r.CommentLinks {
  359. if commentLink.XMLName.Space == "" {
  360. commentsURL := strings.TrimSpace(commentLink.Data)
  361. // The comments URL is supposed to be absolute (some feeds publishes incorrect comments URL)
  362. // See https://cyber.harvard.edu/rss/rss.html#ltcommentsgtSubelementOfLtitemgt
  363. if urllib.IsAbsoluteURL(commentsURL) {
  364. return commentsURL
  365. }
  366. }
  367. }
  368. return ""
  369. }
  370. func isValidLinkRelation(rel string) bool {
  371. switch rel {
  372. case "", "alternate", "enclosure", "related", "self", "via":
  373. return true
  374. default:
  375. if strings.HasPrefix(rel, "http") {
  376. return true
  377. }
  378. return false
  379. }
  380. }