adapter.go 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433
  1. // SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
  2. // SPDX-License-Identifier: Apache-2.0
  3. package rss // import "miniflux.app/v2/internal/reader/rss"
  4. import (
  5. "html"
  6. "log/slog"
  7. "path"
  8. "slices"
  9. "strconv"
  10. "strings"
  11. "time"
  12. "miniflux.app/v2/internal/crypto"
  13. "miniflux.app/v2/internal/model"
  14. "miniflux.app/v2/internal/reader/date"
  15. "miniflux.app/v2/internal/reader/sanitizer"
  16. "miniflux.app/v2/internal/urllib"
  17. )
// rssAdapter wraps a parsed RSS document so it can be converted into a
// model.Feed (see buildFeed).
type rssAdapter struct {
	// rss is the parsed RSS document the adapter reads from.
	rss *rss
}
  21. func (r *rssAdapter) buildFeed(baseURL string) *model.Feed {
  22. feed := &model.Feed{
  23. Title: html.UnescapeString(strings.TrimSpace(r.rss.Channel.Title)),
  24. FeedURL: strings.TrimSpace(baseURL),
  25. SiteURL: strings.TrimSpace(r.rss.Channel.Link),
  26. Description: strings.TrimSpace(r.rss.Channel.Description),
  27. }
  28. // Ensure the Site URL is absolute.
  29. if absoluteSiteURL, err := urllib.ResolveToAbsoluteURL(baseURL, feed.SiteURL); err == nil {
  30. feed.SiteURL = absoluteSiteURL
  31. }
  32. // Try to find the feed URL from the Atom links.
  33. for _, atomLink := range r.rss.Channel.Links {
  34. atomLinkHref := strings.TrimSpace(atomLink.Href)
  35. if atomLinkHref != "" && atomLink.Rel == "self" {
  36. if absoluteFeedURL, err := urllib.ResolveToAbsoluteURL(feed.FeedURL, atomLinkHref); err == nil {
  37. feed.FeedURL = absoluteFeedURL
  38. break
  39. }
  40. }
  41. }
  42. // Fallback to the site URL if the title is empty.
  43. if feed.Title == "" {
  44. feed.Title = feed.SiteURL
  45. }
  46. // Get TTL if defined.
  47. if r.rss.Channel.TTL != "" {
  48. if ttl, err := strconv.Atoi(r.rss.Channel.TTL); err == nil {
  49. feed.TTL = time.Duration(ttl) * time.Minute
  50. }
  51. }
  52. // Get the feed icon URL if defined.
  53. if r.rss.Channel.Image != nil {
  54. if absoluteIconURL, err := urllib.ResolveToAbsoluteURL(feed.SiteURL, r.rss.Channel.Image.URL); err == nil {
  55. feed.IconURL = absoluteIconURL
  56. }
  57. }
  58. // Track GUIDs already seen in this feed to disambiguate items from
  59. // non-conformant feeds that reuse the same <guid> for every entry.
  60. seenGUIDs := make(map[string]int)
  61. for _, item := range r.rss.Channel.Items {
  62. entry := model.NewEntry()
  63. entry.Date = findEntryDate(&item)
  64. entry.Content = findEntryContent(&item)
  65. entry.Enclosures = findEntryEnclosures(&item, feed.SiteURL)
  66. // Populate the entry URL.
  67. entryURL := findEntryURL(&item)
  68. if entryURL == "" {
  69. // Fallback to the first enclosure URL if it exists.
  70. if len(entry.Enclosures) > 0 && entry.Enclosures[0].URL != "" {
  71. entry.URL = entry.Enclosures[0].URL
  72. } else {
  73. // Fallback to the feed URL if no entry URL is found.
  74. entry.URL = feed.SiteURL
  75. }
  76. } else {
  77. if absoluteEntryURL, err := urllib.ResolveToAbsoluteURL(feed.SiteURL, entryURL); err == nil {
  78. entry.URL = absoluteEntryURL
  79. } else {
  80. entry.URL = entryURL
  81. }
  82. }
  83. // Populate the entry title.
  84. entry.Title = findEntryTitle(&item)
  85. if entry.Title == "" {
  86. entry.Title = sanitizer.TruncateHTML(entry.Content, 100)
  87. if entry.Title == "" {
  88. entry.Title = entry.URL
  89. }
  90. }
  91. entry.Author = findEntryAuthor(&item)
  92. if entry.Author == "" {
  93. entry.Author = findFeedAuthor(&r.rss.Channel)
  94. }
  95. // Generate the entry hash.
  96. //
  97. // The RSS 2.0 spec requires <guid> to uniquely identify the item, but
  98. // some feeds ship the same GUID for every entry. Keep the first
  99. // occurrence stable (so existing stored entries still match) and
  100. // disambiguate later collisions using the entry URL or, as a last
  101. // resort, the item position.
  102. switch {
  103. case item.GUID.Data != "":
  104. n := seenGUIDs[item.GUID.Data]
  105. seenGUIDs[item.GUID.Data] = n + 1
  106. switch {
  107. case n == 0:
  108. entry.Hash = crypto.SHA256(item.GUID.Data)
  109. case entry.URL != "":
  110. entry.Hash = crypto.SHA256(item.GUID.Data + "|" + entry.URL)
  111. default:
  112. entry.Hash = crypto.SHA256(item.GUID.Data + "|" + strconv.Itoa(n))
  113. }
  114. case entryURL != "":
  115. entry.Hash = crypto.SHA256(entryURL)
  116. default:
  117. entry.Hash = crypto.SHA256(entry.Title + entry.Content)
  118. }
  119. // Find CommentsURL if defined.
  120. if absoluteCommentsURL := strings.TrimSpace(item.CommentsURL); absoluteCommentsURL != "" && urllib.IsAbsoluteURL(absoluteCommentsURL) {
  121. entry.CommentsURL = absoluteCommentsURL
  122. }
  123. // Set podcast listening time.
  124. if item.ItunesDuration != "" {
  125. if duration, err := getDurationInMinutes(item.ItunesDuration); err == nil {
  126. entry.ReadingTime = duration
  127. }
  128. }
  129. // Populate entry categories.
  130. entry.Tags = findEntryTags(&item)
  131. if len(entry.Tags) == 0 {
  132. entry.Tags = findFeedTags(&r.rss.Channel)
  133. }
  134. // Sort and deduplicate tags.
  135. slices.Sort(entry.Tags)
  136. entry.Tags = slices.Compact(entry.Tags)
  137. feed.Entries = append(feed.Entries, entry)
  138. }
  139. return feed
  140. }
  141. func findFeedAuthor(rssChannel *rssChannel) string {
  142. var author string
  143. switch {
  144. case rssChannel.ItunesAuthor != "":
  145. author = rssChannel.ItunesAuthor
  146. case rssChannel.GooglePlayAuthor != "":
  147. author = rssChannel.GooglePlayAuthor
  148. case rssChannel.ItunesOwner.String() != "":
  149. author = rssChannel.ItunesOwner.String()
  150. case rssChannel.ManagingEditor != "":
  151. author = rssChannel.ManagingEditor
  152. case rssChannel.Webmaster != "":
  153. author = rssChannel.Webmaster
  154. default:
  155. return ""
  156. }
  157. return strings.TrimSpace(sanitizer.StripTags(author))
  158. }
  159. func findFeedTags(rssChannel *rssChannel) []string {
  160. itunesCategories := rssChannel.GetItunesCategories()
  161. tags := make([]string, 0, len(rssChannel.Categories)+len(itunesCategories)+1)
  162. for _, tag := range rssChannel.Categories {
  163. tag = strings.TrimSpace(tag)
  164. if tag != "" {
  165. tags = append(tags, tag)
  166. }
  167. }
  168. for _, tag := range itunesCategories {
  169. tag = strings.TrimSpace(tag)
  170. if tag != "" {
  171. tags = append(tags, tag)
  172. }
  173. }
  174. if tag := strings.TrimSpace(rssChannel.GooglePlayCategory.Text); tag != "" {
  175. tags = append(tags, tag)
  176. }
  177. return tags
  178. }
  179. func findEntryTitle(rssItem *rssItem) string {
  180. title := rssItem.Title.Content
  181. if rssItem.DublinCoreTitle != "" {
  182. title = rssItem.DublinCoreTitle
  183. }
  184. return html.UnescapeString(html.UnescapeString(strings.TrimSpace(title)))
  185. }
  186. func findEntryURL(rssItem *rssItem) string {
  187. for _, link := range []string{rssItem.FeedBurnerLink, rssItem.Link} {
  188. if link != "" {
  189. return strings.TrimSpace(link)
  190. }
  191. }
  192. for _, atomLink := range rssItem.Links {
  193. if atomLink.Href != "" && (strings.EqualFold(atomLink.Rel, "alternate") || atomLink.Rel == "") {
  194. return strings.TrimSpace(atomLink.Href)
  195. }
  196. }
  197. // Specs: https://cyber.harvard.edu/rss/rss.html#ltguidgtSubelementOfLtitemgt
  198. // isPermaLink is optional, its default value is true.
  199. // If its value is false, the guid may not be assumed to be a url, or a url to anything in particular.
  200. if rssItem.GUID.IsPermaLink == "true" || rssItem.GUID.IsPermaLink == "" {
  201. return strings.TrimSpace(rssItem.GUID.Data)
  202. }
  203. return ""
  204. }
  205. func findEntryContent(rssItem *rssItem) string {
  206. for _, value := range []string{
  207. rssItem.DublinCoreContent,
  208. rssItem.Description,
  209. rssItem.GooglePlayDescription,
  210. rssItem.ItunesSummary,
  211. rssItem.ItunesSubtitle,
  212. } {
  213. if value != "" {
  214. return value
  215. }
  216. }
  217. return ""
  218. }
  219. func findEntryDate(rssItem *rssItem) time.Time {
  220. value := rssItem.PubDate
  221. if rssItem.DublinCoreDate != "" {
  222. value = rssItem.DublinCoreDate
  223. }
  224. if value != "" {
  225. result, err := date.Parse(value)
  226. if err != nil {
  227. slog.Debug("Unable to parse date from RSS feed",
  228. slog.String("date", value),
  229. slog.String("guid", rssItem.GUID.Data),
  230. slog.Any("error", err),
  231. )
  232. return time.Now()
  233. }
  234. return result
  235. }
  236. return time.Now()
  237. }
  238. func findEntryAuthor(rssItem *rssItem) string {
  239. var author string
  240. switch {
  241. case rssItem.GooglePlayAuthor != "":
  242. author = rssItem.GooglePlayAuthor
  243. case rssItem.ItunesAuthor != "":
  244. author = rssItem.ItunesAuthor
  245. case rssItem.DublinCoreCreator != "":
  246. author = rssItem.DublinCoreCreator
  247. case rssItem.PersonName() != "":
  248. author = rssItem.PersonName()
  249. case strings.Contains(rssItem.Author.Inner, "<![CDATA["):
  250. author = rssItem.Author.Data
  251. case rssItem.Author.Inner != "":
  252. author = rssItem.Author.Inner
  253. default:
  254. return ""
  255. }
  256. return strings.TrimSpace(sanitizer.StripTags(author))
  257. }
  258. func findEntryTags(rssItem *rssItem) []string {
  259. mediaLabels := rssItem.MediaCategories.Labels()
  260. tags := make([]string, 0, len(rssItem.Categories)+len(mediaLabels))
  261. for _, tag := range rssItem.Categories {
  262. tag = strings.TrimSpace(tag)
  263. if tag != "" {
  264. tags = append(tags, tag)
  265. }
  266. }
  267. for _, tag := range mediaLabels {
  268. tag = strings.TrimSpace(tag)
  269. if tag != "" {
  270. tags = append(tags, tag)
  271. }
  272. }
  273. return tags
  274. }
  275. func findEntryEnclosures(rssItem *rssItem, siteURL string) model.EnclosureList {
  276. mediaThumbnails := rssItem.AllMediaThumbnails()
  277. mediaContents := rssItem.AllMediaContents()
  278. mediaPeerLinks := rssItem.AllMediaPeerLinks()
  279. capacity := len(mediaThumbnails) + len(rssItem.Enclosures) + len(mediaContents) + len(mediaPeerLinks)
  280. enclosures := make(model.EnclosureList, 0, capacity)
  281. duplicates := make(map[string]bool, capacity)
  282. for _, mediaThumbnail := range mediaThumbnails {
  283. mediaURL := strings.TrimSpace(mediaThumbnail.URL)
  284. if mediaURL == "" {
  285. continue
  286. }
  287. if _, found := duplicates[mediaURL]; !found {
  288. if mediaAbsoluteURL, err := urllib.ResolveToAbsoluteURL(siteURL, mediaURL); err != nil {
  289. slog.Debug("Unable to build absolute URL for media thumbnail",
  290. slog.String("url", mediaThumbnail.URL),
  291. slog.String("site_url", siteURL),
  292. slog.Any("error", err),
  293. )
  294. } else {
  295. duplicates[mediaAbsoluteURL] = true
  296. enclosures = append(enclosures, &model.Enclosure{
  297. URL: mediaAbsoluteURL,
  298. MimeType: mediaThumbnail.MimeType(),
  299. Size: mediaThumbnail.Size(),
  300. })
  301. }
  302. }
  303. }
  304. for _, enclosure := range rssItem.Enclosures {
  305. enclosureURL := enclosure.URL
  306. if rssItem.FeedBurnerEnclosureLink != "" {
  307. filename := path.Base(rssItem.FeedBurnerEnclosureLink)
  308. if strings.HasSuffix(enclosureURL, filename) {
  309. enclosureURL = rssItem.FeedBurnerEnclosureLink
  310. }
  311. }
  312. enclosureURL = strings.TrimSpace(enclosureURL)
  313. if enclosureURL == "" {
  314. continue
  315. }
  316. if absoluteEnclosureURL, err := urllib.ResolveToAbsoluteURL(siteURL, enclosureURL); err == nil {
  317. enclosureURL = absoluteEnclosureURL
  318. }
  319. if _, found := duplicates[enclosureURL]; !found {
  320. duplicates[enclosureURL] = true
  321. enclosures = append(enclosures, &model.Enclosure{
  322. URL: enclosureURL,
  323. MimeType: enclosure.Type,
  324. Size: enclosure.Size(),
  325. })
  326. }
  327. }
  328. for _, mediaContent := range mediaContents {
  329. mediaURL := strings.TrimSpace(mediaContent.URL)
  330. if mediaURL == "" {
  331. continue
  332. }
  333. if _, found := duplicates[mediaURL]; !found {
  334. mediaURL := strings.TrimSpace(mediaContent.URL)
  335. if mediaAbsoluteURL, err := urllib.ResolveToAbsoluteURL(siteURL, mediaURL); err != nil {
  336. slog.Debug("Unable to build absolute URL for media content",
  337. slog.String("url", mediaContent.URL),
  338. slog.String("site_url", siteURL),
  339. slog.Any("error", err),
  340. )
  341. } else {
  342. duplicates[mediaAbsoluteURL] = true
  343. enclosures = append(enclosures, &model.Enclosure{
  344. URL: mediaAbsoluteURL,
  345. MimeType: mediaContent.MimeType(),
  346. Size: mediaContent.Size(),
  347. })
  348. }
  349. }
  350. }
  351. for _, mediaPeerLink := range mediaPeerLinks {
  352. mediaURL := strings.TrimSpace(mediaPeerLink.URL)
  353. if mediaURL == "" {
  354. continue
  355. }
  356. if _, found := duplicates[mediaURL]; !found {
  357. mediaURL := strings.TrimSpace(mediaPeerLink.URL)
  358. if mediaAbsoluteURL, err := urllib.ResolveToAbsoluteURL(siteURL, mediaURL); err != nil {
  359. slog.Debug("Unable to build absolute URL for media peer link",
  360. slog.String("url", mediaPeerLink.URL),
  361. slog.String("site_url", siteURL),
  362. slog.Any("error", err),
  363. )
  364. } else {
  365. duplicates[mediaAbsoluteURL] = true
  366. enclosures = append(enclosures, &model.Enclosure{
  367. URL: mediaAbsoluteURL,
  368. MimeType: mediaPeerLink.MimeType(),
  369. Size: mediaPeerLink.Size(),
  370. })
  371. }
  372. }
  373. }
  374. return enclosures
  375. }