adapter.go 9.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379
  // SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
  // SPDX-License-Identifier: Apache-2.0
  3. package rss // import "miniflux.app/v2/internal/reader/rss"
  4. import (
  5. "html"
  6. "log/slog"
  7. "path"
  8. "strconv"
  9. "strings"
  10. "time"
  11. "miniflux.app/v2/internal/crypto"
  12. "miniflux.app/v2/internal/model"
  13. "miniflux.app/v2/internal/reader/date"
  14. "miniflux.app/v2/internal/reader/sanitizer"
  15. "miniflux.app/v2/internal/urllib"
  16. )
  17. type RSSAdapter struct {
  18. rss *RSS
  19. }
  20. func NewRSSAdapter(rss *RSS) *RSSAdapter {
  21. return &RSSAdapter{rss}
  22. }
  23. func (r *RSSAdapter) BuildFeed(baseURL string) *model.Feed {
  24. feed := &model.Feed{
  25. Title: html.UnescapeString(strings.TrimSpace(r.rss.Channel.Title)),
  26. FeedURL: strings.TrimSpace(baseURL),
  27. SiteURL: strings.TrimSpace(r.rss.Channel.Link),
  28. Description: strings.TrimSpace(r.rss.Channel.Description),
  29. }
  30. // Ensure the Site URL is absolute.
  31. if absoluteSiteURL, err := urllib.AbsoluteURL(baseURL, feed.SiteURL); err == nil {
  32. feed.SiteURL = absoluteSiteURL
  33. }
  34. // Try to find the feed URL from the Atom links.
  35. for _, atomLink := range r.rss.Channel.Links {
  36. atomLinkHref := strings.TrimSpace(atomLink.Href)
  37. if atomLinkHref != "" && atomLink.Rel == "self" {
  38. if absoluteFeedURL, err := urllib.AbsoluteURL(feed.FeedURL, atomLinkHref); err == nil {
  39. feed.FeedURL = absoluteFeedURL
  40. break
  41. }
  42. }
  43. }
  44. // Fallback to the site URL if the title is empty.
  45. if feed.Title == "" {
  46. feed.Title = feed.SiteURL
  47. }
  48. // Get TTL if defined.
  49. if r.rss.Channel.TTL != "" {
  50. if ttl, err := strconv.Atoi(r.rss.Channel.TTL); err == nil {
  51. feed.TTL = ttl
  52. }
  53. }
  54. // Get the feed icon URL if defined.
  55. if r.rss.Channel.Image != nil {
  56. if absoluteIconURL, err := urllib.AbsoluteURL(feed.SiteURL, r.rss.Channel.Image.URL); err == nil {
  57. feed.IconURL = absoluteIconURL
  58. }
  59. }
  60. for _, item := range r.rss.Channel.Items {
  61. entry := model.NewEntry()
  62. entry.Date = findEntryDate(&item)
  63. entry.Content = findEntryContent(&item)
  64. entry.Enclosures = findEntryEnclosures(&item, feed.SiteURL)
  65. // Populate the entry URL.
  66. entryURL := findEntryURL(&item)
  67. if entryURL == "" {
  68. entry.URL = feed.SiteURL
  69. } else {
  70. if absoluteEntryURL, err := urllib.AbsoluteURL(feed.SiteURL, entryURL); err == nil {
  71. entry.URL = absoluteEntryURL
  72. } else {
  73. entry.URL = entryURL
  74. }
  75. }
  76. // Populate the entry title.
  77. entry.Title = findEntryTitle(&item)
  78. if entry.Title == "" {
  79. entry.Title = sanitizer.TruncateHTML(entry.Content, 100)
  80. if entry.Title == "" {
  81. entry.Title = entry.URL
  82. }
  83. }
  84. entry.Author = findEntryAuthor(&item)
  85. if entry.Author == "" {
  86. entry.Author = findFeedAuthor(&r.rss.Channel)
  87. }
  88. // Generate the entry hash.
  89. switch {
  90. case item.GUID.Data != "":
  91. entry.Hash = crypto.SHA256(item.GUID.Data)
  92. case entryURL != "":
  93. entry.Hash = crypto.SHA256(entryURL)
  94. default:
  95. entry.Hash = crypto.SHA256(entry.Title + entry.Content)
  96. }
  97. // Find CommentsURL if defined.
  98. if absoluteCommentsURL := strings.TrimSpace(item.CommentsURL); absoluteCommentsURL != "" && urllib.IsAbsoluteURL(absoluteCommentsURL) {
  99. entry.CommentsURL = absoluteCommentsURL
  100. }
  101. // Set podcast listening time.
  102. if item.ItunesDuration != "" {
  103. if duration, err := getDurationInMinutes(item.ItunesDuration); err == nil {
  104. entry.ReadingTime = duration
  105. }
  106. }
  107. // Populate entry categories.
  108. for _, tag := range item.Categories {
  109. if tag != "" {
  110. entry.Tags = append(entry.Tags, tag)
  111. }
  112. }
  113. for _, tag := range item.MediaCategories.Labels() {
  114. if tag != "" {
  115. entry.Tags = append(entry.Tags, tag)
  116. }
  117. }
  118. if len(entry.Tags) == 0 {
  119. for _, tag := range r.rss.Channel.Categories {
  120. if tag != "" {
  121. entry.Tags = append(entry.Tags, tag)
  122. }
  123. }
  124. for _, tag := range r.rss.Channel.GetItunesCategories() {
  125. if tag != "" {
  126. entry.Tags = append(entry.Tags, tag)
  127. }
  128. }
  129. if r.rss.Channel.GooglePlayCategory.Text != "" {
  130. entry.Tags = append(entry.Tags, r.rss.Channel.GooglePlayCategory.Text)
  131. }
  132. }
  133. feed.Entries = append(feed.Entries, entry)
  134. }
  135. return feed
  136. }
  137. func findFeedAuthor(rssChannel *RSSChannel) string {
  138. var author string
  139. switch {
  140. case rssChannel.ItunesAuthor != "":
  141. author = rssChannel.ItunesAuthor
  142. case rssChannel.GooglePlayAuthor != "":
  143. author = rssChannel.GooglePlayAuthor
  144. case rssChannel.ItunesOwner.String() != "":
  145. author = rssChannel.ItunesOwner.String()
  146. case rssChannel.ManagingEditor != "":
  147. author = rssChannel.ManagingEditor
  148. case rssChannel.Webmaster != "":
  149. author = rssChannel.Webmaster
  150. default:
  151. return ""
  152. }
  153. return strings.TrimSpace(sanitizer.StripTags(author))
  154. }
  155. func findEntryTitle(rssItem *RSSItem) string {
  156. title := rssItem.Title.Content
  157. if rssItem.DublinCoreTitle != "" {
  158. title = rssItem.DublinCoreTitle
  159. }
  160. return html.UnescapeString(html.UnescapeString(strings.TrimSpace(title)))
  161. }
  162. func findEntryURL(rssItem *RSSItem) string {
  163. for _, link := range []string{rssItem.FeedBurnerLink, rssItem.Link} {
  164. if link != "" {
  165. return strings.TrimSpace(link)
  166. }
  167. }
  168. for _, atomLink := range rssItem.Links {
  169. if atomLink.Href != "" && (strings.EqualFold(atomLink.Rel, "alternate") || atomLink.Rel == "") {
  170. return strings.TrimSpace(atomLink.Href)
  171. }
  172. }
  173. // Specs: https://cyber.harvard.edu/rss/rss.html#ltguidgtSubelementOfLtitemgt
  174. // isPermaLink is optional, its default value is true.
  175. // If its value is false, the guid may not be assumed to be a url, or a url to anything in particular.
  176. if rssItem.GUID.IsPermaLink == "true" || rssItem.GUID.IsPermaLink == "" {
  177. return strings.TrimSpace(rssItem.GUID.Data)
  178. }
  179. return ""
  180. }
  181. func findEntryContent(rssItem *RSSItem) string {
  182. for _, value := range []string{
  183. rssItem.DublinCoreContent,
  184. rssItem.Description,
  185. rssItem.GooglePlayDescription,
  186. rssItem.ItunesSummary,
  187. rssItem.ItunesSubtitle,
  188. } {
  189. if value != "" {
  190. return value
  191. }
  192. }
  193. return ""
  194. }
  195. func findEntryDate(rssItem *RSSItem) time.Time {
  196. value := rssItem.PubDate
  197. if rssItem.DublinCoreDate != "" {
  198. value = rssItem.DublinCoreDate
  199. }
  200. if value != "" {
  201. result, err := date.Parse(value)
  202. if err != nil {
  203. slog.Debug("Unable to parse date from RSS feed",
  204. slog.String("date", value),
  205. slog.String("guid", rssItem.GUID.Data),
  206. slog.Any("error", err),
  207. )
  208. return time.Now()
  209. }
  210. return result
  211. }
  212. return time.Now()
  213. }
  214. func findEntryAuthor(rssItem *RSSItem) string {
  215. var author string
  216. switch {
  217. case rssItem.GooglePlayAuthor != "":
  218. author = rssItem.GooglePlayAuthor
  219. case rssItem.ItunesAuthor != "":
  220. author = rssItem.ItunesAuthor
  221. case rssItem.DublinCoreCreator != "":
  222. author = rssItem.DublinCoreCreator
  223. case rssItem.PersonName() != "":
  224. author = rssItem.PersonName()
  225. case strings.Contains(rssItem.Author.Inner, "<![CDATA["):
  226. author = rssItem.Author.Data
  227. case rssItem.Author.Inner != "":
  228. author = rssItem.Author.Inner
  229. default:
  230. return ""
  231. }
  232. return strings.TrimSpace(sanitizer.StripTags(author))
  233. }
  234. func findEntryEnclosures(rssItem *RSSItem, siteURL string) model.EnclosureList {
  235. enclosures := make(model.EnclosureList, 0)
  236. duplicates := make(map[string]bool)
  237. for _, mediaThumbnail := range rssItem.AllMediaThumbnails() {
  238. mediaURL := strings.TrimSpace(mediaThumbnail.URL)
  239. if mediaURL == "" {
  240. continue
  241. }
  242. if _, found := duplicates[mediaURL]; !found {
  243. if mediaAbsoluteURL, err := urllib.AbsoluteURL(siteURL, mediaURL); err != nil {
  244. slog.Debug("Unable to build absolute URL for media thumbnail",
  245. slog.String("url", mediaThumbnail.URL),
  246. slog.String("site_url", siteURL),
  247. slog.Any("error", err),
  248. )
  249. } else {
  250. duplicates[mediaAbsoluteURL] = true
  251. enclosures = append(enclosures, &model.Enclosure{
  252. URL: mediaAbsoluteURL,
  253. MimeType: mediaThumbnail.MimeType(),
  254. Size: mediaThumbnail.Size(),
  255. })
  256. }
  257. }
  258. }
  259. for _, enclosure := range rssItem.Enclosures {
  260. enclosureURL := enclosure.URL
  261. if rssItem.FeedBurnerEnclosureLink != "" {
  262. filename := path.Base(rssItem.FeedBurnerEnclosureLink)
  263. if strings.HasSuffix(enclosureURL, filename) {
  264. enclosureURL = rssItem.FeedBurnerEnclosureLink
  265. }
  266. }
  267. enclosureURL = strings.TrimSpace(enclosureURL)
  268. if enclosureURL == "" {
  269. continue
  270. }
  271. if absoluteEnclosureURL, err := urllib.AbsoluteURL(siteURL, enclosureURL); err == nil {
  272. enclosureURL = absoluteEnclosureURL
  273. }
  274. if _, found := duplicates[enclosureURL]; !found {
  275. duplicates[enclosureURL] = true
  276. enclosures = append(enclosures, &model.Enclosure{
  277. URL: enclosureURL,
  278. MimeType: enclosure.Type,
  279. Size: enclosure.Size(),
  280. })
  281. }
  282. }
  283. for _, mediaContent := range rssItem.AllMediaContents() {
  284. mediaURL := strings.TrimSpace(mediaContent.URL)
  285. if mediaURL == "" {
  286. continue
  287. }
  288. if _, found := duplicates[mediaURL]; !found {
  289. mediaURL := strings.TrimSpace(mediaContent.URL)
  290. if mediaAbsoluteURL, err := urllib.AbsoluteURL(siteURL, mediaURL); err != nil {
  291. slog.Debug("Unable to build absolute URL for media content",
  292. slog.String("url", mediaContent.URL),
  293. slog.String("site_url", siteURL),
  294. slog.Any("error", err),
  295. )
  296. } else {
  297. duplicates[mediaAbsoluteURL] = true
  298. enclosures = append(enclosures, &model.Enclosure{
  299. URL: mediaAbsoluteURL,
  300. MimeType: mediaContent.MimeType(),
  301. Size: mediaContent.Size(),
  302. })
  303. }
  304. }
  305. }
  306. for _, mediaPeerLink := range rssItem.AllMediaPeerLinks() {
  307. mediaURL := strings.TrimSpace(mediaPeerLink.URL)
  308. if mediaURL == "" {
  309. continue
  310. }
  311. if _, found := duplicates[mediaURL]; !found {
  312. mediaURL := strings.TrimSpace(mediaPeerLink.URL)
  313. if mediaAbsoluteURL, err := urllib.AbsoluteURL(siteURL, mediaURL); err != nil {
  314. slog.Debug("Unable to build absolute URL for media peer link",
  315. slog.String("url", mediaPeerLink.URL),
  316. slog.String("site_url", siteURL),
  317. slog.Any("error", err),
  318. )
  319. } else {
  320. duplicates[mediaAbsoluteURL] = true
  321. enclosures = append(enclosures, &model.Enclosure{
  322. URL: mediaAbsoluteURL,
  323. MimeType: mediaPeerLink.MimeType(),
  324. Size: mediaPeerLink.Size(),
  325. })
  326. }
  327. }
  328. }
  329. return enclosures
  330. }