adapter.go 7.9 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310
  1. // SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
  2. // SPDX-License-Identifier: Apache-2.0
  3. package rss // import "miniflux.app/v2/internal/reader/rss"
  4. import (
  5. "html"
  6. "log/slog"
  7. "path"
  8. "strconv"
  9. "strings"
  10. "time"
  11. "miniflux.app/v2/internal/crypto"
  12. "miniflux.app/v2/internal/model"
  13. "miniflux.app/v2/internal/reader/date"
  14. "miniflux.app/v2/internal/reader/sanitizer"
  15. "miniflux.app/v2/internal/urllib"
  16. )
  17. type RSSAdapter struct {
  18. rss *RSS
  19. }
  20. func NewRSSAdapter(rss *RSS) *RSSAdapter {
  21. return &RSSAdapter{rss}
  22. }
  23. func (r *RSSAdapter) BuildFeed(feedURL string) *model.Feed {
  24. feed := &model.Feed{
  25. Title: html.UnescapeString(strings.TrimSpace(r.rss.Channel.Title)),
  26. FeedURL: feedURL,
  27. SiteURL: r.rss.Channel.Link,
  28. }
  29. if siteURL, err := urllib.AbsoluteURL(feedURL, r.rss.Channel.Link); err == nil {
  30. feed.SiteURL = siteURL
  31. }
  32. // Try to find the feed URL from the Atom links.
  33. for _, atomLink := range r.rss.Channel.AtomLinks.Links {
  34. atomLinkHref := strings.TrimSpace(atomLink.URL)
  35. if atomLinkHref != "" && atomLink.Rel == "self" {
  36. if absoluteFeedURL, err := urllib.AbsoluteURL(feedURL, atomLinkHref); err == nil {
  37. feed.FeedURL = absoluteFeedURL
  38. break
  39. }
  40. }
  41. }
  42. // Fallback to the site URL if the title is empty.
  43. if feed.Title == "" {
  44. feed.Title = feed.SiteURL
  45. }
  46. // Get TTL if defined.
  47. if r.rss.Channel.TTL != "" {
  48. if ttl, err := strconv.Atoi(r.rss.Channel.TTL); err == nil {
  49. feed.TTL = ttl
  50. }
  51. }
  52. // Get the feed icon URL if defined.
  53. if r.rss.Channel.Image != nil {
  54. if absoluteIconURL, err := urllib.AbsoluteURL(feed.SiteURL, r.rss.Channel.Image.URL); err == nil {
  55. feed.IconURL = absoluteIconURL
  56. }
  57. }
  58. for _, item := range r.rss.Channel.Items {
  59. entry := model.NewEntry()
  60. entry.Date = findEntryDate(&item)
  61. entry.Content = findEntryContent(&item)
  62. entry.Enclosures = findEntryEnclosures(&item)
  63. // Populate the entry URL.
  64. entryURL := findEntryURL(&item)
  65. if entryURL == "" {
  66. entry.URL = feed.SiteURL
  67. } else {
  68. if absoluteEntryURL, err := urllib.AbsoluteURL(feed.SiteURL, entryURL); err == nil {
  69. entry.URL = absoluteEntryURL
  70. } else {
  71. entry.URL = entryURL
  72. }
  73. }
  74. // Populate the entry title.
  75. entry.Title = findEntryTitle(&item)
  76. if entry.Title == "" {
  77. entry.Title = sanitizer.TruncateHTML(entry.Content, 100)
  78. }
  79. if entry.Title == "" {
  80. entry.Title = entry.URL
  81. }
  82. entry.Author = findEntryAuthor(&item)
  83. if entry.Author == "" {
  84. entry.Author = findFeedAuthor(&r.rss.Channel)
  85. }
  86. // Generate the entry hash.
  87. for _, value := range []string{item.GUID.Data, entryURL} {
  88. if value != "" {
  89. entry.Hash = crypto.Hash(value)
  90. break
  91. }
  92. }
  93. // Find CommentsURL if defined.
  94. if absoluteCommentsURL := strings.TrimSpace(item.CommentsURL); absoluteCommentsURL != "" && urllib.IsAbsoluteURL(absoluteCommentsURL) {
  95. entry.CommentsURL = absoluteCommentsURL
  96. }
  97. // Set podcast listening time.
  98. if item.ItunesDuration != "" {
  99. if duration, err := getDurationInMinutes(item.ItunesDuration); err == nil {
  100. entry.ReadingTime = duration
  101. }
  102. }
  103. // Populate entry categories.
  104. entry.Tags = append(entry.Tags, item.Categories...)
  105. entry.Tags = append(entry.Tags, item.MediaCategories.Labels()...)
  106. entry.Tags = append(entry.Tags, r.rss.Channel.Categories...)
  107. entry.Tags = append(entry.Tags, r.rss.Channel.GetItunesCategories()...)
  108. if r.rss.Channel.GooglePlayCategory.Text != "" {
  109. entry.Tags = append(entry.Tags, r.rss.Channel.GooglePlayCategory.Text)
  110. }
  111. feed.Entries = append(feed.Entries, entry)
  112. }
  113. return feed
  114. }
  115. func findFeedAuthor(rssChannel *RSSChannel) string {
  116. var author string
  117. switch {
  118. case rssChannel.ItunesAuthor != "":
  119. author = rssChannel.ItunesAuthor
  120. case rssChannel.GooglePlayAuthor != "":
  121. author = rssChannel.GooglePlayAuthor
  122. case rssChannel.ItunesOwner.String() != "":
  123. author = rssChannel.ItunesOwner.String()
  124. case rssChannel.ManagingEditor != "":
  125. author = rssChannel.ManagingEditor
  126. case rssChannel.Webmaster != "":
  127. author = rssChannel.Webmaster
  128. }
  129. return sanitizer.StripTags(strings.TrimSpace(author))
  130. }
  131. func findEntryTitle(rssItem *RSSItem) string {
  132. title := rssItem.Title
  133. if rssItem.DublinCoreTitle != "" {
  134. title = rssItem.DublinCoreTitle
  135. }
  136. return html.UnescapeString(strings.TrimSpace(title))
  137. }
  138. func findEntryURL(rssItem *RSSItem) string {
  139. for _, link := range []string{rssItem.FeedBurnerLink, rssItem.Link} {
  140. if link != "" {
  141. return strings.TrimSpace(link)
  142. }
  143. }
  144. for _, atomLink := range rssItem.AtomLinks.Links {
  145. if atomLink.URL != "" && (strings.EqualFold(atomLink.Rel, "alternate") || atomLink.Rel == "") {
  146. return strings.TrimSpace(atomLink.URL)
  147. }
  148. }
  149. // Specs: https://cyber.harvard.edu/rss/rss.html#ltguidgtSubelementOfLtitemgt
  150. // isPermaLink is optional, its default value is true.
  151. // If its value is false, the guid may not be assumed to be a url, or a url to anything in particular.
  152. if rssItem.GUID.IsPermaLink == "true" || rssItem.GUID.IsPermaLink == "" {
  153. return strings.TrimSpace(rssItem.GUID.Data)
  154. }
  155. return ""
  156. }
  157. func findEntryContent(rssItem *RSSItem) string {
  158. for _, value := range []string{
  159. rssItem.DublinCoreContent,
  160. rssItem.Description,
  161. rssItem.GooglePlayDescription,
  162. rssItem.ItunesSummary,
  163. rssItem.ItunesSubtitle,
  164. } {
  165. if value != "" {
  166. return value
  167. }
  168. }
  169. return ""
  170. }
  171. func findEntryDate(rssItem *RSSItem) time.Time {
  172. value := rssItem.PubDate
  173. if rssItem.DublinCoreDate != "" {
  174. value = rssItem.DublinCoreDate
  175. }
  176. if value != "" {
  177. result, err := date.Parse(value)
  178. if err != nil {
  179. slog.Debug("Unable to parse date from RSS feed",
  180. slog.String("date", value),
  181. slog.String("guid", rssItem.GUID.Data),
  182. slog.Any("error", err),
  183. )
  184. return time.Now()
  185. }
  186. return result
  187. }
  188. return time.Now()
  189. }
  190. func findEntryAuthor(rssItem *RSSItem) string {
  191. var author string
  192. switch {
  193. case rssItem.GooglePlayAuthor != "":
  194. author = rssItem.GooglePlayAuthor
  195. case rssItem.ItunesAuthor != "":
  196. author = rssItem.ItunesAuthor
  197. case rssItem.DublinCoreCreator != "":
  198. author = rssItem.DublinCoreCreator
  199. case rssItem.AtomAuthor.String() != "":
  200. author = rssItem.AtomAuthor.String()
  201. case strings.Contains(rssItem.Author.Inner, "<![CDATA["):
  202. author = rssItem.Author.Data
  203. default:
  204. author = rssItem.Author.Inner
  205. }
  206. return strings.TrimSpace(sanitizer.StripTags(author))
  207. }
  208. func findEntryEnclosures(rssItem *RSSItem) model.EnclosureList {
  209. enclosures := make(model.EnclosureList, 0)
  210. duplicates := make(map[string]bool)
  211. for _, mediaThumbnail := range rssItem.AllMediaThumbnails() {
  212. if _, found := duplicates[mediaThumbnail.URL]; !found {
  213. duplicates[mediaThumbnail.URL] = true
  214. enclosures = append(enclosures, &model.Enclosure{
  215. URL: mediaThumbnail.URL,
  216. MimeType: mediaThumbnail.MimeType(),
  217. Size: mediaThumbnail.Size(),
  218. })
  219. }
  220. }
  221. for _, enclosure := range rssItem.Enclosures {
  222. enclosureURL := enclosure.URL
  223. if rssItem.FeedBurnerEnclosureLink != "" {
  224. filename := path.Base(rssItem.FeedBurnerEnclosureLink)
  225. if strings.Contains(enclosureURL, filename) {
  226. enclosureURL = rssItem.FeedBurnerEnclosureLink
  227. }
  228. }
  229. if enclosureURL == "" {
  230. continue
  231. }
  232. if _, found := duplicates[enclosureURL]; !found {
  233. duplicates[enclosureURL] = true
  234. enclosures = append(enclosures, &model.Enclosure{
  235. URL: enclosureURL,
  236. MimeType: enclosure.Type,
  237. Size: enclosure.Size(),
  238. })
  239. }
  240. }
  241. for _, mediaContent := range rssItem.AllMediaContents() {
  242. if _, found := duplicates[mediaContent.URL]; !found {
  243. duplicates[mediaContent.URL] = true
  244. enclosures = append(enclosures, &model.Enclosure{
  245. URL: mediaContent.URL,
  246. MimeType: mediaContent.MimeType(),
  247. Size: mediaContent.Size(),
  248. })
  249. }
  250. }
  251. for _, mediaPeerLink := range rssItem.AllMediaPeerLinks() {
  252. if _, found := duplicates[mediaPeerLink.URL]; !found {
  253. duplicates[mediaPeerLink.URL] = true
  254. enclosures = append(enclosures, &model.Enclosure{
  255. URL: mediaPeerLink.URL,
  256. MimeType: mediaPeerLink.MimeType(),
  257. Size: mediaPeerLink.Size(),
  258. })
  259. }
  260. }
  261. return enclosures
  262. }