adapter.go 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412
  1. // SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
  2. // SPDX-License-Identifier: Apache-2.0
  3. package rss // import "miniflux.app/v2/internal/reader/rss"
  4. import (
  5. "html"
  6. "log/slog"
  7. "path"
  8. "slices"
  9. "strconv"
  10. "strings"
  11. "time"
  12. "miniflux.app/v2/internal/crypto"
  13. "miniflux.app/v2/internal/model"
  14. "miniflux.app/v2/internal/reader/date"
  15. "miniflux.app/v2/internal/reader/sanitizer"
  16. "miniflux.app/v2/internal/urllib"
  17. )
  18. type RSSAdapter struct {
  19. rss *RSS
  20. }
  21. func NewRSSAdapter(rss *RSS) *RSSAdapter {
  22. return &RSSAdapter{rss}
  23. }
  24. func (r *RSSAdapter) BuildFeed(baseURL string) *model.Feed {
  25. feed := &model.Feed{
  26. Title: html.UnescapeString(strings.TrimSpace(r.rss.Channel.Title)),
  27. FeedURL: strings.TrimSpace(baseURL),
  28. SiteURL: strings.TrimSpace(r.rss.Channel.Link),
  29. Description: strings.TrimSpace(r.rss.Channel.Description),
  30. }
  31. // Ensure the Site URL is absolute.
  32. if absoluteSiteURL, err := urllib.AbsoluteURL(baseURL, feed.SiteURL); err == nil {
  33. feed.SiteURL = absoluteSiteURL
  34. }
  35. // Try to find the feed URL from the Atom links.
  36. for _, atomLink := range r.rss.Channel.Links {
  37. atomLinkHref := strings.TrimSpace(atomLink.Href)
  38. if atomLinkHref != "" && atomLink.Rel == "self" {
  39. if absoluteFeedURL, err := urllib.AbsoluteURL(feed.FeedURL, atomLinkHref); err == nil {
  40. feed.FeedURL = absoluteFeedURL
  41. break
  42. }
  43. }
  44. }
  45. // Fallback to the site URL if the title is empty.
  46. if feed.Title == "" {
  47. feed.Title = feed.SiteURL
  48. }
  49. // Get TTL if defined.
  50. if r.rss.Channel.TTL != "" {
  51. if ttl, err := strconv.Atoi(r.rss.Channel.TTL); err == nil {
  52. feed.TTL = ttl
  53. }
  54. }
  55. // Get the feed icon URL if defined.
  56. if r.rss.Channel.Image != nil {
  57. if absoluteIconURL, err := urllib.AbsoluteURL(feed.SiteURL, r.rss.Channel.Image.URL); err == nil {
  58. feed.IconURL = absoluteIconURL
  59. }
  60. }
  61. for _, item := range r.rss.Channel.Items {
  62. entry := model.NewEntry()
  63. entry.Date = findEntryDate(&item)
  64. entry.Content = findEntryContent(&item)
  65. entry.Enclosures = findEntryEnclosures(&item, feed.SiteURL)
  66. // Populate the entry URL.
  67. entryURL := findEntryURL(&item)
  68. if entryURL == "" {
  69. // Fallback to the first enclosure URL if it exists.
  70. if len(entry.Enclosures) > 0 && entry.Enclosures[0].URL != "" {
  71. entry.URL = entry.Enclosures[0].URL
  72. } else {
  73. // Fallback to the feed URL if no entry URL is found.
  74. entry.URL = feed.SiteURL
  75. }
  76. } else {
  77. if absoluteEntryURL, err := urllib.AbsoluteURL(feed.SiteURL, entryURL); err == nil {
  78. entry.URL = absoluteEntryURL
  79. } else {
  80. entry.URL = entryURL
  81. }
  82. }
  83. // Populate the entry title.
  84. entry.Title = findEntryTitle(&item)
  85. if entry.Title == "" {
  86. entry.Title = sanitizer.TruncateHTML(entry.Content, 100)
  87. if entry.Title == "" {
  88. entry.Title = entry.URL
  89. }
  90. }
  91. entry.Author = findEntryAuthor(&item)
  92. if entry.Author == "" {
  93. entry.Author = findFeedAuthor(&r.rss.Channel)
  94. }
  95. // Generate the entry hash.
  96. switch {
  97. case item.GUID.Data != "":
  98. entry.Hash = crypto.SHA256(item.GUID.Data)
  99. case entryURL != "":
  100. entry.Hash = crypto.SHA256(entryURL)
  101. default:
  102. entry.Hash = crypto.SHA256(entry.Title + entry.Content)
  103. }
  104. // Find CommentsURL if defined.
  105. if absoluteCommentsURL := strings.TrimSpace(item.CommentsURL); absoluteCommentsURL != "" && urllib.IsAbsoluteURL(absoluteCommentsURL) {
  106. entry.CommentsURL = absoluteCommentsURL
  107. }
  108. // Set podcast listening time.
  109. if item.ItunesDuration != "" {
  110. if duration, err := getDurationInMinutes(item.ItunesDuration); err == nil {
  111. entry.ReadingTime = duration
  112. }
  113. }
  114. // Populate entry categories.
  115. entry.Tags = findEntryTags(&item)
  116. if len(entry.Tags) == 0 {
  117. entry.Tags = findFeedTags(&r.rss.Channel)
  118. }
  119. // Sort and deduplicate tags.
  120. slices.Sort(entry.Tags)
  121. entry.Tags = slices.Compact(entry.Tags)
  122. feed.Entries = append(feed.Entries, entry)
  123. }
  124. return feed
  125. }
  126. func findFeedAuthor(rssChannel *RSSChannel) string {
  127. var author string
  128. switch {
  129. case rssChannel.ItunesAuthor != "":
  130. author = rssChannel.ItunesAuthor
  131. case rssChannel.GooglePlayAuthor != "":
  132. author = rssChannel.GooglePlayAuthor
  133. case rssChannel.ItunesOwner.String() != "":
  134. author = rssChannel.ItunesOwner.String()
  135. case rssChannel.ManagingEditor != "":
  136. author = rssChannel.ManagingEditor
  137. case rssChannel.Webmaster != "":
  138. author = rssChannel.Webmaster
  139. default:
  140. return ""
  141. }
  142. return strings.TrimSpace(sanitizer.StripTags(author))
  143. }
  144. func findFeedTags(rssChannel *RSSChannel) []string {
  145. tags := make([]string, 0)
  146. for _, tag := range rssChannel.Categories {
  147. tag = strings.TrimSpace(tag)
  148. if tag != "" {
  149. tags = append(tags, tag)
  150. }
  151. }
  152. for _, tag := range rssChannel.GetItunesCategories() {
  153. tag = strings.TrimSpace(tag)
  154. if tag != "" {
  155. tags = append(tags, tag)
  156. }
  157. }
  158. if tag := strings.TrimSpace(rssChannel.GooglePlayCategory.Text); tag != "" {
  159. tags = append(tags, tag)
  160. }
  161. return tags
  162. }
  163. func findEntryTitle(rssItem *RSSItem) string {
  164. title := rssItem.Title.Content
  165. if rssItem.DublinCoreTitle != "" {
  166. title = rssItem.DublinCoreTitle
  167. }
  168. return html.UnescapeString(html.UnescapeString(strings.TrimSpace(title)))
  169. }
  170. func findEntryURL(rssItem *RSSItem) string {
  171. for _, link := range []string{rssItem.FeedBurnerLink, rssItem.Link} {
  172. if link != "" {
  173. return strings.TrimSpace(link)
  174. }
  175. }
  176. for _, atomLink := range rssItem.Links {
  177. if atomLink.Href != "" && (strings.EqualFold(atomLink.Rel, "alternate") || atomLink.Rel == "") {
  178. return strings.TrimSpace(atomLink.Href)
  179. }
  180. }
  181. // Specs: https://cyber.harvard.edu/rss/rss.html#ltguidgtSubelementOfLtitemgt
  182. // isPermaLink is optional, its default value is true.
  183. // If its value is false, the guid may not be assumed to be a url, or a url to anything in particular.
  184. if rssItem.GUID.IsPermaLink == "true" || rssItem.GUID.IsPermaLink == "" {
  185. return strings.TrimSpace(rssItem.GUID.Data)
  186. }
  187. return ""
  188. }
  189. func findEntryContent(rssItem *RSSItem) string {
  190. for _, value := range []string{
  191. rssItem.DublinCoreContent,
  192. rssItem.Description,
  193. rssItem.GooglePlayDescription,
  194. rssItem.ItunesSummary,
  195. rssItem.ItunesSubtitle,
  196. } {
  197. if value != "" {
  198. return value
  199. }
  200. }
  201. return ""
  202. }
  203. func findEntryDate(rssItem *RSSItem) time.Time {
  204. value := rssItem.PubDate
  205. if rssItem.DublinCoreDate != "" {
  206. value = rssItem.DublinCoreDate
  207. }
  208. if value != "" {
  209. result, err := date.Parse(value)
  210. if err != nil {
  211. slog.Debug("Unable to parse date from RSS feed",
  212. slog.String("date", value),
  213. slog.String("guid", rssItem.GUID.Data),
  214. slog.Any("error", err),
  215. )
  216. return time.Now()
  217. }
  218. return result
  219. }
  220. return time.Now()
  221. }
  222. func findEntryAuthor(rssItem *RSSItem) string {
  223. var author string
  224. switch {
  225. case rssItem.GooglePlayAuthor != "":
  226. author = rssItem.GooglePlayAuthor
  227. case rssItem.ItunesAuthor != "":
  228. author = rssItem.ItunesAuthor
  229. case rssItem.DublinCoreCreator != "":
  230. author = rssItem.DublinCoreCreator
  231. case rssItem.PersonName() != "":
  232. author = rssItem.PersonName()
  233. case strings.Contains(rssItem.Author.Inner, "<![CDATA["):
  234. author = rssItem.Author.Data
  235. case rssItem.Author.Inner != "":
  236. author = rssItem.Author.Inner
  237. default:
  238. return ""
  239. }
  240. return strings.TrimSpace(sanitizer.StripTags(author))
  241. }
  242. func findEntryTags(rssItem *RSSItem) []string {
  243. tags := make([]string, 0)
  244. for _, tag := range rssItem.Categories {
  245. tag = strings.TrimSpace(tag)
  246. if tag != "" {
  247. tags = append(tags, tag)
  248. }
  249. }
  250. for _, tag := range rssItem.MediaCategories.Labels() {
  251. tag = strings.TrimSpace(tag)
  252. if tag != "" {
  253. tags = append(tags, tag)
  254. }
  255. }
  256. return tags
  257. }
  258. func findEntryEnclosures(rssItem *RSSItem, siteURL string) model.EnclosureList {
  259. enclosures := make(model.EnclosureList, 0)
  260. duplicates := make(map[string]bool)
  261. for _, mediaThumbnail := range rssItem.AllMediaThumbnails() {
  262. mediaURL := strings.TrimSpace(mediaThumbnail.URL)
  263. if mediaURL == "" {
  264. continue
  265. }
  266. if _, found := duplicates[mediaURL]; !found {
  267. if mediaAbsoluteURL, err := urllib.AbsoluteURL(siteURL, mediaURL); err != nil {
  268. slog.Debug("Unable to build absolute URL for media thumbnail",
  269. slog.String("url", mediaThumbnail.URL),
  270. slog.String("site_url", siteURL),
  271. slog.Any("error", err),
  272. )
  273. } else {
  274. duplicates[mediaAbsoluteURL] = true
  275. enclosures = append(enclosures, &model.Enclosure{
  276. URL: mediaAbsoluteURL,
  277. MimeType: mediaThumbnail.MimeType(),
  278. Size: mediaThumbnail.Size(),
  279. })
  280. }
  281. }
  282. }
  283. for _, enclosure := range rssItem.Enclosures {
  284. enclosureURL := enclosure.URL
  285. if rssItem.FeedBurnerEnclosureLink != "" {
  286. filename := path.Base(rssItem.FeedBurnerEnclosureLink)
  287. if strings.HasSuffix(enclosureURL, filename) {
  288. enclosureURL = rssItem.FeedBurnerEnclosureLink
  289. }
  290. }
  291. enclosureURL = strings.TrimSpace(enclosureURL)
  292. if enclosureURL == "" {
  293. continue
  294. }
  295. if absoluteEnclosureURL, err := urllib.AbsoluteURL(siteURL, enclosureURL); err == nil {
  296. enclosureURL = absoluteEnclosureURL
  297. }
  298. if _, found := duplicates[enclosureURL]; !found {
  299. duplicates[enclosureURL] = true
  300. enclosures = append(enclosures, &model.Enclosure{
  301. URL: enclosureURL,
  302. MimeType: enclosure.Type,
  303. Size: enclosure.Size(),
  304. })
  305. }
  306. }
  307. for _, mediaContent := range rssItem.AllMediaContents() {
  308. mediaURL := strings.TrimSpace(mediaContent.URL)
  309. if mediaURL == "" {
  310. continue
  311. }
  312. if _, found := duplicates[mediaURL]; !found {
  313. mediaURL := strings.TrimSpace(mediaContent.URL)
  314. if mediaAbsoluteURL, err := urllib.AbsoluteURL(siteURL, mediaURL); err != nil {
  315. slog.Debug("Unable to build absolute URL for media content",
  316. slog.String("url", mediaContent.URL),
  317. slog.String("site_url", siteURL),
  318. slog.Any("error", err),
  319. )
  320. } else {
  321. duplicates[mediaAbsoluteURL] = true
  322. enclosures = append(enclosures, &model.Enclosure{
  323. URL: mediaAbsoluteURL,
  324. MimeType: mediaContent.MimeType(),
  325. Size: mediaContent.Size(),
  326. })
  327. }
  328. }
  329. }
  330. for _, mediaPeerLink := range rssItem.AllMediaPeerLinks() {
  331. mediaURL := strings.TrimSpace(mediaPeerLink.URL)
  332. if mediaURL == "" {
  333. continue
  334. }
  335. if _, found := duplicates[mediaURL]; !found {
  336. mediaURL := strings.TrimSpace(mediaPeerLink.URL)
  337. if mediaAbsoluteURL, err := urllib.AbsoluteURL(siteURL, mediaURL); err != nil {
  338. slog.Debug("Unable to build absolute URL for media peer link",
  339. slog.String("url", mediaPeerLink.URL),
  340. slog.String("site_url", siteURL),
  341. slog.Any("error", err),
  342. )
  343. } else {
  344. duplicates[mediaAbsoluteURL] = true
  345. enclosures = append(enclosures, &model.Enclosure{
  346. URL: mediaAbsoluteURL,
  347. MimeType: mediaPeerLink.MimeType(),
  348. Size: mediaPeerLink.Size(),
  349. })
  350. }
  351. }
  352. }
  353. return enclosures
  354. }