adapter.go 9.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373
  1. // SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
  2. // SPDX-License-Identifier: Apache-2.0
  3. package rss // import "miniflux.app/v2/internal/reader/rss"
  4. import (
  5. "html"
  6. "log/slog"
  7. "path"
  8. "strconv"
  9. "strings"
  10. "time"
  11. "miniflux.app/v2/internal/crypto"
  12. "miniflux.app/v2/internal/model"
  13. "miniflux.app/v2/internal/reader/date"
  14. "miniflux.app/v2/internal/reader/sanitizer"
  15. "miniflux.app/v2/internal/urllib"
  16. )
  17. type RSSAdapter struct {
  18. rss *RSS
  19. }
  20. func NewRSSAdapter(rss *RSS) *RSSAdapter {
  21. return &RSSAdapter{rss}
  22. }
  23. func (r *RSSAdapter) BuildFeed(baseURL string) *model.Feed {
  24. feed := &model.Feed{
  25. Title: html.UnescapeString(strings.TrimSpace(r.rss.Channel.Title)),
  26. FeedURL: strings.TrimSpace(baseURL),
  27. SiteURL: strings.TrimSpace(r.rss.Channel.Link),
  28. }
  29. // Ensure the Site URL is absolute.
  30. if siteURL, err := urllib.AbsoluteURL(baseURL, feed.SiteURL); err == nil {
  31. feed.SiteURL = siteURL
  32. }
  33. // Try to find the feed URL from the Atom links.
  34. for _, atomLink := range r.rss.Channel.AtomLinks.Links {
  35. atomLinkHref := strings.TrimSpace(atomLink.Href)
  36. if atomLinkHref != "" && atomLink.Rel == "self" {
  37. if absoluteFeedURL, err := urllib.AbsoluteURL(feed.FeedURL, atomLinkHref); err == nil {
  38. feed.FeedURL = absoluteFeedURL
  39. break
  40. }
  41. }
  42. }
  43. // Fallback to the site URL if the title is empty.
  44. if feed.Title == "" {
  45. feed.Title = feed.SiteURL
  46. }
  47. // Get TTL if defined.
  48. if r.rss.Channel.TTL != "" {
  49. if ttl, err := strconv.Atoi(r.rss.Channel.TTL); err == nil {
  50. feed.TTL = ttl
  51. }
  52. }
  53. // Get the feed icon URL if defined.
  54. if r.rss.Channel.Image != nil {
  55. if absoluteIconURL, err := urllib.AbsoluteURL(feed.SiteURL, r.rss.Channel.Image.URL); err == nil {
  56. feed.IconURL = absoluteIconURL
  57. }
  58. }
  59. for _, item := range r.rss.Channel.Items {
  60. entry := model.NewEntry()
  61. entry.Date = findEntryDate(&item)
  62. entry.Content = findEntryContent(&item)
  63. entry.Enclosures = findEntryEnclosures(&item, feed.SiteURL)
  64. // Populate the entry URL.
  65. entryURL := findEntryURL(&item)
  66. if entryURL == "" {
  67. entry.URL = feed.SiteURL
  68. } else {
  69. if absoluteEntryURL, err := urllib.AbsoluteURL(feed.SiteURL, entryURL); err == nil {
  70. entry.URL = absoluteEntryURL
  71. } else {
  72. entry.URL = entryURL
  73. }
  74. }
  75. // Populate the entry title.
  76. entry.Title = findEntryTitle(&item)
  77. if entry.Title == "" {
  78. entry.Title = sanitizer.TruncateHTML(entry.Content, 100)
  79. if entry.Title == "" {
  80. entry.Title = entry.URL
  81. }
  82. }
  83. entry.Author = findEntryAuthor(&item)
  84. if entry.Author == "" {
  85. entry.Author = findFeedAuthor(&r.rss.Channel)
  86. }
  87. // Generate the entry hash.
  88. switch {
  89. case item.GUID.Data != "":
  90. entry.Hash = crypto.Hash(item.GUID.Data)
  91. case entryURL != "":
  92. entry.Hash = crypto.Hash(entryURL)
  93. default:
  94. entry.Hash = crypto.Hash(entry.Title + entry.Content)
  95. }
  96. // Find CommentsURL if defined.
  97. if absoluteCommentsURL := strings.TrimSpace(item.CommentsURL); absoluteCommentsURL != "" && urllib.IsAbsoluteURL(absoluteCommentsURL) {
  98. entry.CommentsURL = absoluteCommentsURL
  99. }
  100. // Set podcast listening time.
  101. if item.ItunesDuration != "" {
  102. if duration, err := getDurationInMinutes(item.ItunesDuration); err == nil {
  103. entry.ReadingTime = duration
  104. }
  105. }
  106. // Populate entry categories.
  107. for _, tag := range item.Categories {
  108. if tag != "" {
  109. entry.Tags = append(entry.Tags, tag)
  110. }
  111. }
  112. for _, tag := range item.MediaCategories.Labels() {
  113. if tag != "" {
  114. entry.Tags = append(entry.Tags, tag)
  115. }
  116. }
  117. if len(entry.Tags) == 0 {
  118. for _, tag := range r.rss.Channel.Categories {
  119. if tag != "" {
  120. entry.Tags = append(entry.Tags, tag)
  121. }
  122. }
  123. for _, tag := range r.rss.Channel.GetItunesCategories() {
  124. if tag != "" {
  125. entry.Tags = append(entry.Tags, tag)
  126. }
  127. }
  128. if r.rss.Channel.GooglePlayCategory.Text != "" {
  129. entry.Tags = append(entry.Tags, r.rss.Channel.GooglePlayCategory.Text)
  130. }
  131. }
  132. feed.Entries = append(feed.Entries, entry)
  133. }
  134. return feed
  135. }
  136. func findFeedAuthor(rssChannel *RSSChannel) string {
  137. var author string
  138. switch {
  139. case rssChannel.ItunesAuthor != "":
  140. author = rssChannel.ItunesAuthor
  141. case rssChannel.GooglePlayAuthor != "":
  142. author = rssChannel.GooglePlayAuthor
  143. case rssChannel.ItunesOwner.String() != "":
  144. author = rssChannel.ItunesOwner.String()
  145. case rssChannel.ManagingEditor != "":
  146. author = rssChannel.ManagingEditor
  147. case rssChannel.Webmaster != "":
  148. author = rssChannel.Webmaster
  149. }
  150. return sanitizer.StripTags(strings.TrimSpace(author))
  151. }
  152. func findEntryTitle(rssItem *RSSItem) string {
  153. title := rssItem.Title
  154. if rssItem.DublinCoreTitle != "" {
  155. title = rssItem.DublinCoreTitle
  156. }
  157. return html.UnescapeString(strings.TrimSpace(title))
  158. }
  159. func findEntryURL(rssItem *RSSItem) string {
  160. for _, link := range []string{rssItem.FeedBurnerLink, rssItem.Link} {
  161. if link != "" {
  162. return strings.TrimSpace(link)
  163. }
  164. }
  165. for _, atomLink := range rssItem.AtomLinks.Links {
  166. if atomLink.Href != "" && (strings.EqualFold(atomLink.Rel, "alternate") || atomLink.Rel == "") {
  167. return strings.TrimSpace(atomLink.Href)
  168. }
  169. }
  170. // Specs: https://cyber.harvard.edu/rss/rss.html#ltguidgtSubelementOfLtitemgt
  171. // isPermaLink is optional, its default value is true.
  172. // If its value is false, the guid may not be assumed to be a url, or a url to anything in particular.
  173. if rssItem.GUID.IsPermaLink == "true" || rssItem.GUID.IsPermaLink == "" {
  174. return strings.TrimSpace(rssItem.GUID.Data)
  175. }
  176. return ""
  177. }
  178. func findEntryContent(rssItem *RSSItem) string {
  179. for _, value := range []string{
  180. rssItem.DublinCoreContent,
  181. rssItem.Description,
  182. rssItem.GooglePlayDescription,
  183. rssItem.ItunesSummary,
  184. rssItem.ItunesSubtitle,
  185. } {
  186. if value != "" {
  187. return value
  188. }
  189. }
  190. return ""
  191. }
  192. func findEntryDate(rssItem *RSSItem) time.Time {
  193. value := rssItem.PubDate
  194. if rssItem.DublinCoreDate != "" {
  195. value = rssItem.DublinCoreDate
  196. }
  197. if value != "" {
  198. result, err := date.Parse(value)
  199. if err != nil {
  200. slog.Debug("Unable to parse date from RSS feed",
  201. slog.String("date", value),
  202. slog.String("guid", rssItem.GUID.Data),
  203. slog.Any("error", err),
  204. )
  205. return time.Now()
  206. }
  207. return result
  208. }
  209. return time.Now()
  210. }
  211. func findEntryAuthor(rssItem *RSSItem) string {
  212. var author string
  213. switch {
  214. case rssItem.GooglePlayAuthor != "":
  215. author = rssItem.GooglePlayAuthor
  216. case rssItem.ItunesAuthor != "":
  217. author = rssItem.ItunesAuthor
  218. case rssItem.DublinCoreCreator != "":
  219. author = rssItem.DublinCoreCreator
  220. case rssItem.AtomAuthor.PersonName() != "":
  221. author = rssItem.AtomAuthor.PersonName()
  222. case strings.Contains(rssItem.Author.Inner, "<![CDATA["):
  223. author = rssItem.Author.Data
  224. default:
  225. author = rssItem.Author.Inner
  226. }
  227. return strings.TrimSpace(sanitizer.StripTags(author))
  228. }
  229. func findEntryEnclosures(rssItem *RSSItem, siteURL string) model.EnclosureList {
  230. enclosures := make(model.EnclosureList, 0)
  231. duplicates := make(map[string]bool)
  232. for _, mediaThumbnail := range rssItem.AllMediaThumbnails() {
  233. mediaURL := strings.TrimSpace(mediaThumbnail.URL)
  234. if mediaURL == "" {
  235. continue
  236. }
  237. if _, found := duplicates[mediaURL]; !found {
  238. if mediaAbsoluteURL, err := urllib.AbsoluteURL(siteURL, mediaURL); err != nil {
  239. slog.Debug("Unable to build absolute URL for media thumbnail",
  240. slog.String("url", mediaThumbnail.URL),
  241. slog.String("site_url", siteURL),
  242. slog.Any("error", err),
  243. )
  244. } else {
  245. duplicates[mediaAbsoluteURL] = true
  246. enclosures = append(enclosures, &model.Enclosure{
  247. URL: mediaAbsoluteURL,
  248. MimeType: mediaThumbnail.MimeType(),
  249. Size: mediaThumbnail.Size(),
  250. })
  251. }
  252. }
  253. }
  254. for _, enclosure := range rssItem.Enclosures {
  255. enclosureURL := enclosure.URL
  256. if rssItem.FeedBurnerEnclosureLink != "" {
  257. filename := path.Base(rssItem.FeedBurnerEnclosureLink)
  258. if strings.HasSuffix(enclosureURL, filename) {
  259. enclosureURL = rssItem.FeedBurnerEnclosureLink
  260. }
  261. }
  262. enclosureURL = strings.TrimSpace(enclosureURL)
  263. if enclosureURL == "" {
  264. continue
  265. }
  266. if absoluteEnclosureURL, err := urllib.AbsoluteURL(siteURL, enclosureURL); err == nil {
  267. enclosureURL = absoluteEnclosureURL
  268. }
  269. if _, found := duplicates[enclosureURL]; !found {
  270. duplicates[enclosureURL] = true
  271. enclosures = append(enclosures, &model.Enclosure{
  272. URL: enclosureURL,
  273. MimeType: enclosure.Type,
  274. Size: enclosure.Size(),
  275. })
  276. }
  277. }
  278. for _, mediaContent := range rssItem.AllMediaContents() {
  279. mediaURL := strings.TrimSpace(mediaContent.URL)
  280. if mediaURL == "" {
  281. continue
  282. }
  283. if _, found := duplicates[mediaURL]; !found {
  284. mediaURL := strings.TrimSpace(mediaContent.URL)
  285. if mediaAbsoluteURL, err := urllib.AbsoluteURL(siteURL, mediaURL); err != nil {
  286. slog.Debug("Unable to build absolute URL for media content",
  287. slog.String("url", mediaContent.URL),
  288. slog.String("site_url", siteURL),
  289. slog.Any("error", err),
  290. )
  291. } else {
  292. duplicates[mediaAbsoluteURL] = true
  293. enclosures = append(enclosures, &model.Enclosure{
  294. URL: mediaAbsoluteURL,
  295. MimeType: mediaContent.MimeType(),
  296. Size: mediaContent.Size(),
  297. })
  298. }
  299. }
  300. }
  301. for _, mediaPeerLink := range rssItem.AllMediaPeerLinks() {
  302. mediaURL := strings.TrimSpace(mediaPeerLink.URL)
  303. if mediaURL == "" {
  304. continue
  305. }
  306. if _, found := duplicates[mediaURL]; !found {
  307. mediaURL := strings.TrimSpace(mediaPeerLink.URL)
  308. if mediaAbsoluteURL, err := urllib.AbsoluteURL(siteURL, mediaURL); err != nil {
  309. slog.Debug("Unable to build absolute URL for media peer link",
  310. slog.String("url", mediaPeerLink.URL),
  311. slog.String("site_url", siteURL),
  312. slog.Any("error", err),
  313. )
  314. } else {
  315. duplicates[mediaAbsoluteURL] = true
  316. enclosures = append(enclosures, &model.Enclosure{
  317. URL: mediaAbsoluteURL,
  318. MimeType: mediaPeerLink.MimeType(),
  319. Size: mediaPeerLink.Size(),
  320. })
  321. }
  322. }
  323. }
  324. return enclosures
  325. }