adapter.go 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458
  1. // SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
  2. // SPDX-License-Identifier: Apache-2.0
  3. package rss // import "miniflux.app/v2/internal/reader/rss"
  4. import (
  5. "cmp"
  6. "html"
  7. "iter"
  8. "log/slog"
  9. "path"
  10. "slices"
  11. "strconv"
  12. "strings"
  13. "time"
  14. "miniflux.app/v2/internal/crypto"
  15. "miniflux.app/v2/internal/model"
  16. "miniflux.app/v2/internal/reader/date"
  17. "miniflux.app/v2/internal/reader/sanitizer"
  18. "miniflux.app/v2/internal/urllib"
  19. )
  20. type rssAdapter struct {
  21. rss *rss
  22. }
  23. func (r *rssAdapter) buildFeed(baseURL string) *model.Feed {
  24. feed := &model.Feed{
  25. Title: html.UnescapeString(strings.TrimSpace(r.rss.Channel.Title)),
  26. FeedURL: strings.TrimSpace(baseURL),
  27. SiteURL: strings.TrimSpace(r.rss.Channel.Link),
  28. Description: strings.TrimSpace(r.rss.Channel.Description),
  29. }
  30. // Ensure the Site URL is absolute.
  31. if absoluteSiteURL, err := urllib.ResolveToAbsoluteURL(baseURL, feed.SiteURL); err == nil {
  32. feed.SiteURL = absoluteSiteURL
  33. }
  34. // Try to find the feed URL from the Channel links.
  35. for _, link := range r.rss.Channel.Links {
  36. href := strings.TrimSpace(link.Href)
  37. if href == "" || link.Rel != "self" {
  38. continue
  39. }
  40. if absoluteFeedURL, err := urllib.ResolveToAbsoluteURL(feed.FeedURL, href); err == nil {
  41. feed.FeedURL = absoluteFeedURL
  42. break
  43. }
  44. }
  45. // Fallback to the site URL if the title is empty.
  46. if feed.Title == "" {
  47. feed.Title = feed.SiteURL
  48. }
  49. // Get TTL if defined.
  50. if r.rss.Channel.TTL != "" {
  51. if ttl, err := strconv.Atoi(r.rss.Channel.TTL); err == nil {
  52. feed.TTL = time.Duration(ttl) * time.Minute
  53. }
  54. }
  55. // Get the feed icon URL if defined.
  56. if r.rss.Channel.Image != nil {
  57. if absoluteIconURL, err := urllib.ResolveToAbsoluteURL(feed.SiteURL, r.rss.Channel.Image.URL); err == nil {
  58. feed.IconURL = absoluteIconURL
  59. }
  60. }
  61. // Track GUIDs already seen in this feed to disambiguate items from
  62. // non-conformant feeds that reuse the same <guid> for every entry.
  63. seenGUIDs := make(map[string]int)
  64. for _, item := range r.rss.Channel.Items {
  65. entry := model.NewEntry()
  66. entry.Date = findEntryDate(&item)
  67. entry.Content = findEntryContent(&item)
  68. entry.Enclosures = findEntryEnclosures(&item, feed.SiteURL)
  69. // Populate the entry URL.
  70. entryURL := findEntryURL(&item)
  71. if entryURL != "" {
  72. entry.URL = entryURL
  73. if absoluteEntryURL, err := urllib.ResolveToAbsoluteURL(feed.SiteURL, entryURL); err == nil {
  74. entry.URL = absoluteEntryURL
  75. }
  76. }
  77. if entry.URL == "" {
  78. // Fallback to the feed URL if no entry URL is found.
  79. entry.URL = feed.SiteURL
  80. // Fallback to the first enclosure URL if it exists.
  81. if len(entry.Enclosures) > 0 && entry.Enclosures[0].URL != "" {
  82. entry.URL = entry.Enclosures[0].URL
  83. }
  84. }
  85. // Populate the entry title.
  86. entry.Title = findEntryTitle(&item)
  87. if entry.Title == "" {
  88. entry.Title = sanitizer.TruncateHTML(entry.Content, 100)
  89. if entry.Title == "" {
  90. entry.Title = entry.URL
  91. }
  92. }
  93. entry.Author = findEntryAuthor(&item)
  94. if entry.Author == "" {
  95. entry.Author = findFeedAuthor(&r.rss.Channel)
  96. }
  97. // Generate the entry hash.
  98. //
  99. // The RSS 2.0 spec requires <guid> to uniquely identify the item, but
  100. // some feeds ship the same GUID for every entry. Keep the first
  101. // occurrence stable (so existing stored entries still match) and
  102. // disambiguate later collisions using the entry URL or, as a last
  103. // resort, the item position.
  104. switch {
  105. case item.GUID.Data != "":
  106. n := seenGUIDs[item.GUID.Data]
  107. seenGUIDs[item.GUID.Data] = n + 1
  108. switch {
  109. case n == 0:
  110. entry.Hash = crypto.SHA256(item.GUID.Data)
  111. case entry.URL != "":
  112. entry.Hash = crypto.SHA256(item.GUID.Data + "|" + entry.URL)
  113. default:
  114. entry.Hash = crypto.SHA256(item.GUID.Data + "|" + strconv.Itoa(n))
  115. }
  116. case entryURL != "":
  117. entry.Hash = crypto.SHA256(entryURL)
  118. default:
  119. entry.Hash = crypto.SHA256(entry.Title + entry.Content)
  120. }
  121. // Find CommentsURL if defined.
  122. if absoluteCommentsURL := strings.TrimSpace(item.CommentsURL); absoluteCommentsURL != "" && urllib.IsAbsoluteURL(absoluteCommentsURL) {
  123. entry.CommentsURL = absoluteCommentsURL
  124. }
  125. // Set podcast listening time.
  126. if item.ItunesDuration != "" {
  127. if duration, err := getDurationInMinutes(item.ItunesDuration); err == nil {
  128. entry.ReadingTime = duration
  129. }
  130. }
  131. // Populate entry categories.
  132. entry.Tags = findEntryTags(&item)
  133. if len(entry.Tags) == 0 {
  134. entry.Tags = findFeedTags(&r.rss.Channel)
  135. }
  136. feed.Entries = append(feed.Entries, entry)
  137. }
  138. return feed
  139. }
  140. func findFeedAuthor(rssChannel *rssChannel) string {
  141. var author string
  142. switch {
  143. case rssChannel.ItunesAuthor != "":
  144. author = rssChannel.ItunesAuthor
  145. case rssChannel.GooglePlayAuthor != "":
  146. author = rssChannel.GooglePlayAuthor
  147. case rssChannel.ItunesOwner.String() != "":
  148. author = rssChannel.ItunesOwner.String()
  149. case rssChannel.ManagingEditor != "":
  150. author = rssChannel.ManagingEditor
  151. case rssChannel.Webmaster != "":
  152. author = rssChannel.Webmaster
  153. default:
  154. return ""
  155. }
  156. return sanitizer.StripTags(author)
  157. }
  158. func findFeedTags(rssChannel *rssChannel) []string {
  159. tags := make([]string, 0, len(rssChannel.Categories)+2*len(rssChannel.ItunesCategories)+1)
  160. tags = appendSorted(tags, strings.TrimSpace, rssChannel.Categories...)
  161. tags = appendSortedSeq(tags, strings.TrimSpace, rssChannel.ItunesCategoriesSeq())
  162. tags = appendSorted(tags, strings.TrimSpace, rssChannel.GooglePlayCategory.Text)
  163. return tags
  164. }
  165. func findEntryTitle(rssItem *rssItem) string {
  166. title := rssItem.Title.Content
  167. if rssItem.DublinCoreTitle != "" {
  168. title = rssItem.DublinCoreTitle
  169. }
  170. return html.UnescapeString(html.UnescapeString(strings.TrimSpace(title)))
  171. }
  172. func findEntryURL(rssItem *rssItem) string {
  173. for _, link := range []string{rssItem.FeedBurnerLink, rssItem.Link} {
  174. if link != "" {
  175. return strings.TrimSpace(link)
  176. }
  177. }
  178. for _, atomLink := range rssItem.Links {
  179. if atomLink.Href != "" && (strings.EqualFold(atomLink.Rel, "alternate") || atomLink.Rel == "") {
  180. return strings.TrimSpace(atomLink.Href)
  181. }
  182. }
  183. // Specs: https://cyber.harvard.edu/rss/rss.html#ltguidgtSubelementOfLtitemgt
  184. // isPermaLink is optional, its default value is true.
  185. // If its value is false, the guid may not be assumed to be a url, or a url to anything in particular.
  186. if rssItem.GUID.IsPermaLink == "true" || rssItem.GUID.IsPermaLink == "" {
  187. return strings.TrimSpace(rssItem.GUID.Data)
  188. }
  189. return ""
  190. }
  191. func findEntryContent(rssItem *rssItem) string {
  192. for _, value := range []string{
  193. rssItem.DublinCoreContent,
  194. rssItem.Description,
  195. rssItem.GooglePlayDescription,
  196. rssItem.ItunesSummary,
  197. rssItem.ItunesSubtitle,
  198. } {
  199. if value != "" {
  200. return value
  201. }
  202. }
  203. return ""
  204. }
  205. func findEntryDate(rssItem *rssItem) time.Time {
  206. value := rssItem.PubDate
  207. if rssItem.DublinCoreDate != "" {
  208. value = rssItem.DublinCoreDate
  209. }
  210. if value = strings.TrimSpace(value); value == "" {
  211. return time.Now()
  212. }
  213. parsedDate, err := date.Parse(value)
  214. if err != nil {
  215. slog.Debug("Unable to parse date from RSS feed",
  216. slog.String("date", value),
  217. slog.String("guid", rssItem.GUID.Data),
  218. slog.Any("error", err),
  219. )
  220. return time.Now()
  221. }
  222. return parsedDate
  223. }
  224. func findEntryAuthor(rssItem *rssItem) string {
  225. var author string
  226. switch {
  227. case rssItem.GooglePlayAuthor != "":
  228. author = rssItem.GooglePlayAuthor
  229. case rssItem.ItunesAuthor != "":
  230. author = rssItem.ItunesAuthor
  231. case rssItem.DublinCoreCreator != "":
  232. author = rssItem.DublinCoreCreator
  233. case rssItem.PersonName() != "":
  234. author = rssItem.PersonName()
  235. case strings.Contains(rssItem.Author.Inner, "<![CDATA["):
  236. author = rssItem.Author.Data
  237. case rssItem.Author.Inner != "":
  238. author = rssItem.Author.Inner
  239. default:
  240. return ""
  241. }
  242. return sanitizer.StripTags(author)
  243. }
  244. func findEntryTags(rssItem *rssItem) []string {
  245. tags := make([]string, 0, len(rssItem.Categories)+len(rssItem.MediaCategories))
  246. tags = appendSorted(tags, strings.TrimSpace, rssItem.Categories...)
  247. tags = appendSortedSeq(tags, strings.TrimSpace, rssItem.MediaCategories.LabelsSeq())
  248. return tags
  249. }
  250. func findEntryEnclosures(rssItem *rssItem, siteURL string) model.EnclosureList {
  251. mediaThumbnails := rssItem.AllMediaThumbnails()
  252. mediaContents := rssItem.AllMediaContents()
  253. mediaPeerLinks := rssItem.AllMediaPeerLinks()
  254. capacity := len(mediaThumbnails) + len(rssItem.Enclosures) + len(mediaContents) + len(mediaPeerLinks)
  255. enclosures := make(model.EnclosureList, 0, capacity)
  256. duplicates := make(map[string]bool, capacity)
  257. for _, mediaThumbnail := range mediaThumbnails {
  258. mediaURL := strings.TrimSpace(mediaThumbnail.URL)
  259. if mediaURL == "" {
  260. continue
  261. }
  262. mediaURL, err := urllib.ResolveToAbsoluteURL(siteURL, mediaURL)
  263. if err != nil {
  264. slog.Debug("Unable to build absolute URL for media thumbnail",
  265. slog.String("url", mediaThumbnail.URL),
  266. slog.String("site_url", siteURL),
  267. slog.Any("error", err),
  268. )
  269. continue
  270. }
  271. if _, found := duplicates[mediaURL]; found {
  272. continue
  273. }
  274. duplicates[mediaURL] = true
  275. enclosures = append(enclosures, &model.Enclosure{
  276. URL: mediaURL,
  277. MimeType: mediaThumbnail.MimeType(),
  278. Size: mediaThumbnail.Size(),
  279. })
  280. }
  281. for _, enclosure := range rssItem.Enclosures {
  282. enclosureURL := enclosure.URL
  283. if rssItem.FeedBurnerEnclosureLink != "" {
  284. filename := path.Base(rssItem.FeedBurnerEnclosureLink)
  285. if strings.HasSuffix(enclosureURL, filename) {
  286. enclosureURL = rssItem.FeedBurnerEnclosureLink
  287. }
  288. }
  289. enclosureURL = strings.TrimSpace(enclosureURL)
  290. if enclosureURL == "" {
  291. continue
  292. }
  293. if absoluteEnclosureURL, err := urllib.ResolveToAbsoluteURL(siteURL, enclosureURL); err == nil {
  294. enclosureURL = absoluteEnclosureURL
  295. }
  296. if _, found := duplicates[enclosureURL]; found {
  297. continue
  298. }
  299. duplicates[enclosureURL] = true
  300. enclosures = append(enclosures, &model.Enclosure{
  301. URL: enclosureURL,
  302. MimeType: enclosure.Type,
  303. Size: enclosure.Size(),
  304. })
  305. }
  306. for _, mediaContent := range mediaContents {
  307. mediaURL := strings.TrimSpace(mediaContent.URL)
  308. if mediaURL == "" {
  309. continue
  310. }
  311. mediaURL, err := urllib.ResolveToAbsoluteURL(siteURL, mediaURL)
  312. if err != nil {
  313. slog.Debug("Unable to build absolute URL for media content",
  314. slog.String("url", mediaContent.URL),
  315. slog.String("site_url", siteURL),
  316. slog.Any("error", err),
  317. )
  318. continue
  319. }
  320. if _, found := duplicates[mediaURL]; found {
  321. continue
  322. }
  323. duplicates[mediaURL] = true
  324. enclosures = append(enclosures, &model.Enclosure{
  325. URL: mediaURL,
  326. MimeType: mediaContent.MimeType(),
  327. Size: mediaContent.Size(),
  328. })
  329. }
  330. for _, mediaPeerLink := range mediaPeerLinks {
  331. mediaURL := strings.TrimSpace(mediaPeerLink.URL)
  332. if mediaURL == "" {
  333. continue
  334. }
  335. mediaURL, err := urllib.ResolveToAbsoluteURL(siteURL, mediaURL)
  336. if err != nil {
  337. slog.Debug("Unable to build absolute URL for media peer link",
  338. slog.String("url", mediaPeerLink.URL),
  339. slog.String("site_url", siteURL),
  340. slog.Any("error", err),
  341. )
  342. continue
  343. }
  344. if _, found := duplicates[mediaURL]; found {
  345. continue
  346. }
  347. duplicates[mediaURL] = true
  348. enclosures = append(enclosures, &model.Enclosure{
  349. URL: mediaURL,
  350. MimeType: mediaPeerLink.MimeType(),
  351. Size: mediaPeerLink.Size(),
  352. })
  353. }
  354. return enclosures
  355. }
  356. // appendSorted is identical to [appendSortedSeq] except receives variadic values rather than [iter.Seq].
  357. func appendSorted[I any, O cmp.Ordered](sorted []O, fn func(I) O, values ...I) []O {
  358. sorted = slices.Grow(sorted, len(values))
  359. return appendSortedSeq(sorted, fn, slices.Values(values))
  360. }
  361. // appendSortedSeq appends elements from "values" iterator into "sorted" slice.
  362. // - "fn" applied to every element of "values"
  363. // - elements inserted into "sorted" slice so it stays sorted
  364. // - duplicate elements are not inserted
  365. func appendSortedSeq[I any, O cmp.Ordered](sorted []O, fn func(I) O, values iter.Seq[I]) []O {
  366. var zero O
  367. for in := range values {
  368. out := fn(in)
  369. if out == zero {
  370. continue
  371. }
  372. where, found := slices.BinarySearch(sorted, out)
  373. if found {
  374. continue
  375. }
  376. // Insert sorted to avoid duplicates.
  377. sorted = slices.Insert(sorted, where, out)
  378. }
  379. return sorted
  380. }