rss.go 9.3 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395
  1. // SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
  2. // SPDX-License-Identifier: Apache-2.0
  3. package rss // import "miniflux.app/v2/internal/reader/rss"
  4. import (
  5. "encoding/xml"
  6. "html"
  7. "log/slog"
  8. "path"
  9. "strconv"
  10. "strings"
  11. "time"
  12. "miniflux.app/v2/internal/crypto"
  13. "miniflux.app/v2/internal/model"
  14. "miniflux.app/v2/internal/reader/date"
  15. "miniflux.app/v2/internal/reader/dublincore"
  16. "miniflux.app/v2/internal/reader/googleplay"
  17. "miniflux.app/v2/internal/reader/itunes"
  18. "miniflux.app/v2/internal/reader/media"
  19. "miniflux.app/v2/internal/reader/sanitizer"
  20. "miniflux.app/v2/internal/urllib"
  21. )
  22. // Specs: https://www.rssboard.org/rss-specification
  23. type rssFeed struct {
  24. XMLName xml.Name `xml:"rss"`
  25. Version string `xml:"rss version,attr"`
  26. Channel rssChannel `xml:"rss channel"`
  27. }
  28. type rssChannel struct {
  29. Categories []string `xml:"rss category"`
  30. Title string `xml:"rss title"`
  31. Link string `xml:"rss link"`
  32. ImageURL string `xml:"rss image>url"`
  33. Language string `xml:"rss language"`
  34. Description string `xml:"rss description"`
  35. PubDate string `xml:"rss pubDate"`
  36. ManagingEditor string `xml:"rss managingEditor"`
  37. Webmaster string `xml:"rss webMaster"`
  38. TimeToLive rssTTL `xml:"rss ttl"`
  39. Items []rssItem `xml:"rss item"`
  40. AtomLinks
  41. itunes.ItunesFeedElement
  42. googleplay.GooglePlayFeedElement
  43. }
  44. type rssTTL struct {
  45. Data string `xml:",chardata"`
  46. }
  47. func (r *rssTTL) Value() int {
  48. if r.Data == "" {
  49. return 0
  50. }
  51. value, err := strconv.Atoi(r.Data)
  52. if err != nil {
  53. return 0
  54. }
  55. return value
  56. }
  57. func (r *rssFeed) Transform(baseURL string) *model.Feed {
  58. var err error
  59. feed := new(model.Feed)
  60. siteURL := r.siteURL()
  61. feed.SiteURL, err = urllib.AbsoluteURL(baseURL, siteURL)
  62. if err != nil {
  63. feed.SiteURL = siteURL
  64. }
  65. feedURL := r.feedURL()
  66. feed.FeedURL, err = urllib.AbsoluteURL(baseURL, feedURL)
  67. if err != nil {
  68. feed.FeedURL = feedURL
  69. }
  70. feed.Title = html.UnescapeString(strings.TrimSpace(r.Channel.Title))
  71. if feed.Title == "" {
  72. feed.Title = feed.SiteURL
  73. }
  74. feed.IconURL = strings.TrimSpace(r.Channel.ImageURL)
  75. feed.TTL = r.Channel.TimeToLive.Value()
  76. for _, item := range r.Channel.Items {
  77. entry := item.Transform()
  78. if entry.Author == "" {
  79. entry.Author = r.feedAuthor()
  80. }
  81. if entry.URL == "" {
  82. entry.URL = feed.SiteURL
  83. } else {
  84. entryURL, err := urllib.AbsoluteURL(feed.SiteURL, entry.URL)
  85. if err == nil {
  86. entry.URL = entryURL
  87. }
  88. }
  89. if entry.Title == "" {
  90. entry.Title = sanitizer.TruncateHTML(entry.Content, 100)
  91. }
  92. if entry.Title == "" {
  93. entry.Title = entry.URL
  94. }
  95. entry.Tags = append(entry.Tags, r.Channel.Categories...)
  96. entry.Tags = append(entry.Tags, r.Channel.GetItunesCategories()...)
  97. if r.Channel.GooglePlayCategory.Text != "" {
  98. entry.Tags = append(entry.Tags, r.Channel.GooglePlayCategory.Text)
  99. }
  100. feed.Entries = append(feed.Entries, entry)
  101. }
  102. return feed
  103. }
  104. func (r *rssFeed) siteURL() string {
  105. return strings.TrimSpace(r.Channel.Link)
  106. }
  107. func (r *rssFeed) feedURL() string {
  108. for _, atomLink := range r.Channel.AtomLinks.Links {
  109. if atomLink.Rel == "self" {
  110. return strings.TrimSpace(atomLink.URL)
  111. }
  112. }
  113. return ""
  114. }
  115. func (r rssFeed) feedAuthor() string {
  116. var author string
  117. switch {
  118. case r.Channel.ItunesAuthor != "":
  119. author = r.Channel.ItunesAuthor
  120. case r.Channel.GooglePlayAuthor != "":
  121. author = r.Channel.GooglePlayAuthor
  122. case r.Channel.ItunesOwner.String() != "":
  123. author = r.Channel.ItunesOwner.String()
  124. case r.Channel.ManagingEditor != "":
  125. author = r.Channel.ManagingEditor
  126. case r.Channel.Webmaster != "":
  127. author = r.Channel.Webmaster
  128. }
  129. return sanitizer.StripTags(strings.TrimSpace(author))
  130. }
  131. type rssGUID struct {
  132. XMLName xml.Name
  133. Data string `xml:",chardata"`
  134. IsPermaLink string `xml:"isPermaLink,attr"`
  135. }
  136. type rssAuthor struct {
  137. XMLName xml.Name
  138. Data string `xml:",chardata"`
  139. Inner string `xml:",innerxml"`
  140. }
  141. type rssEnclosure struct {
  142. URL string `xml:"url,attr"`
  143. Type string `xml:"type,attr"`
  144. Length string `xml:"length,attr"`
  145. }
  146. func (enclosure *rssEnclosure) Size() int64 {
  147. if enclosure.Length == "" {
  148. return 0
  149. }
  150. size, _ := strconv.ParseInt(enclosure.Length, 10, 0)
  151. return size
  152. }
  153. type rssItem struct {
  154. GUID rssGUID `xml:"rss guid"`
  155. Title string `xml:"rss title"`
  156. Link string `xml:"rss link"`
  157. Description string `xml:"rss description"`
  158. PubDate string `xml:"rss pubDate"`
  159. Author rssAuthor `xml:"rss author"`
  160. Comments string `xml:"rss comments"`
  161. EnclosureLinks []rssEnclosure `xml:"rss enclosure"`
  162. Categories []string `xml:"rss category"`
  163. dublincore.DublinCoreItemElement
  164. FeedBurnerElement
  165. media.Element
  166. AtomAuthor
  167. AtomLinks
  168. itunes.ItunesItemElement
  169. googleplay.GooglePlayItemElement
  170. }
  171. func (r *rssItem) Transform() *model.Entry {
  172. entry := model.NewEntry()
  173. entry.URL = r.entryURL()
  174. entry.CommentsURL = r.entryCommentsURL()
  175. entry.Date = r.entryDate()
  176. entry.Author = r.entryAuthor()
  177. entry.Hash = r.entryHash()
  178. entry.Content = r.entryContent()
  179. entry.Title = r.entryTitle()
  180. entry.Enclosures = r.entryEnclosures()
  181. entry.Tags = r.Categories
  182. if duration, err := normalizeDuration(r.ItunesDuration); err == nil {
  183. entry.ReadingTime = duration
  184. }
  185. return entry
  186. }
  187. func (r *rssItem) entryDate() time.Time {
  188. value := r.PubDate
  189. if r.DublinCoreDate != "" {
  190. value = r.DublinCoreDate
  191. }
  192. if value != "" {
  193. result, err := date.Parse(value)
  194. if err != nil {
  195. slog.Debug("Unable to parse date from RSS feed",
  196. slog.String("date", value),
  197. slog.String("guid", r.GUID.Data),
  198. slog.Any("error", err),
  199. )
  200. return time.Now()
  201. }
  202. return result
  203. }
  204. return time.Now()
  205. }
  206. func (r *rssItem) entryAuthor() string {
  207. var author string
  208. switch {
  209. case r.GooglePlayAuthor != "":
  210. author = r.GooglePlayAuthor
  211. case r.ItunesAuthor != "":
  212. author = r.ItunesAuthor
  213. case r.DublinCoreCreator != "":
  214. author = r.DublinCoreCreator
  215. case r.AtomAuthor.String() != "":
  216. author = r.AtomAuthor.String()
  217. case strings.Contains(r.Author.Inner, "<![CDATA["):
  218. author = r.Author.Data
  219. default:
  220. author = r.Author.Inner
  221. }
  222. return strings.TrimSpace(sanitizer.StripTags(author))
  223. }
  224. func (r *rssItem) entryHash() string {
  225. for _, value := range []string{r.GUID.Data, r.entryURL()} {
  226. if value != "" {
  227. return crypto.Hash(value)
  228. }
  229. }
  230. return ""
  231. }
  232. func (r *rssItem) entryTitle() string {
  233. title := r.Title
  234. if r.DublinCoreTitle != "" {
  235. title = r.DublinCoreTitle
  236. }
  237. return html.UnescapeString(strings.TrimSpace(title))
  238. }
  239. func (r *rssItem) entryContent() string {
  240. for _, value := range []string{
  241. r.DublinCoreContent,
  242. r.Description,
  243. r.GooglePlayDescription,
  244. r.ItunesSummary,
  245. r.ItunesSubtitle,
  246. } {
  247. if value != "" {
  248. return value
  249. }
  250. }
  251. return ""
  252. }
  253. func (r *rssItem) entryURL() string {
  254. for _, link := range []string{r.FeedBurnerLink, r.Link} {
  255. if link != "" {
  256. return strings.TrimSpace(link)
  257. }
  258. }
  259. for _, atomLink := range r.AtomLinks.Links {
  260. if atomLink.URL != "" && (strings.EqualFold(atomLink.Rel, "alternate") || atomLink.Rel == "") {
  261. return strings.TrimSpace(atomLink.URL)
  262. }
  263. }
  264. // Specs: https://cyber.harvard.edu/rss/rss.html#ltguidgtSubelementOfLtitemgt
  265. // isPermaLink is optional, its default value is true.
  266. // If its value is false, the guid may not be assumed to be a url, or a url to anything in particular.
  267. if r.GUID.IsPermaLink == "true" || r.GUID.IsPermaLink == "" {
  268. return strings.TrimSpace(r.GUID.Data)
  269. }
  270. return ""
  271. }
  272. func (r *rssItem) entryEnclosures() model.EnclosureList {
  273. enclosures := make(model.EnclosureList, 0)
  274. duplicates := make(map[string]bool)
  275. for _, mediaThumbnail := range r.AllMediaThumbnails() {
  276. if _, found := duplicates[mediaThumbnail.URL]; !found {
  277. duplicates[mediaThumbnail.URL] = true
  278. enclosures = append(enclosures, &model.Enclosure{
  279. URL: mediaThumbnail.URL,
  280. MimeType: mediaThumbnail.MimeType(),
  281. Size: mediaThumbnail.Size(),
  282. })
  283. }
  284. }
  285. for _, enclosure := range r.EnclosureLinks {
  286. enclosureURL := enclosure.URL
  287. if r.FeedBurnerEnclosureLink != "" {
  288. filename := path.Base(r.FeedBurnerEnclosureLink)
  289. if strings.Contains(enclosureURL, filename) {
  290. enclosureURL = r.FeedBurnerEnclosureLink
  291. }
  292. }
  293. if enclosureURL == "" {
  294. continue
  295. }
  296. if _, found := duplicates[enclosureURL]; !found {
  297. duplicates[enclosureURL] = true
  298. enclosures = append(enclosures, &model.Enclosure{
  299. URL: enclosureURL,
  300. MimeType: enclosure.Type,
  301. Size: enclosure.Size(),
  302. })
  303. }
  304. }
  305. for _, mediaContent := range r.AllMediaContents() {
  306. if _, found := duplicates[mediaContent.URL]; !found {
  307. duplicates[mediaContent.URL] = true
  308. enclosures = append(enclosures, &model.Enclosure{
  309. URL: mediaContent.URL,
  310. MimeType: mediaContent.MimeType(),
  311. Size: mediaContent.Size(),
  312. })
  313. }
  314. }
  315. for _, mediaPeerLink := range r.AllMediaPeerLinks() {
  316. if _, found := duplicates[mediaPeerLink.URL]; !found {
  317. duplicates[mediaPeerLink.URL] = true
  318. enclosures = append(enclosures, &model.Enclosure{
  319. URL: mediaPeerLink.URL,
  320. MimeType: mediaPeerLink.MimeType(),
  321. Size: mediaPeerLink.Size(),
  322. })
  323. }
  324. }
  325. return enclosures
  326. }
  327. func (r *rssItem) entryCommentsURL() string {
  328. commentsURL := strings.TrimSpace(r.Comments)
  329. if commentsURL != "" && urllib.IsAbsoluteURL(commentsURL) {
  330. return commentsURL
  331. }
  332. return ""
  333. }