rss.go 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429
  1. // SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
  2. // SPDX-License-Identifier: Apache-2.0
  3. package rss // import "miniflux.app/v2/internal/reader/rss"
  4. import (
  5. "encoding/xml"
  6. "html"
  7. "path"
  8. "strconv"
  9. "strings"
  10. "time"
  11. "miniflux.app/v2/internal/crypto"
  12. "miniflux.app/v2/internal/logger"
  13. "miniflux.app/v2/internal/model"
  14. "miniflux.app/v2/internal/reader/date"
  15. "miniflux.app/v2/internal/reader/dublincore"
  16. "miniflux.app/v2/internal/reader/media"
  17. "miniflux.app/v2/internal/reader/sanitizer"
  18. "miniflux.app/v2/internal/urllib"
  19. )
  20. // Specs: https://cyber.harvard.edu/rss/rss.html
  21. type rssFeed struct {
  22. XMLName xml.Name `xml:"rss"`
  23. Version string `xml:"version,attr"`
  24. Title string `xml:"channel>title"`
  25. Links []rssLink `xml:"channel>link"`
  26. ImageURL string `xml:"channel>image>url"`
  27. Language string `xml:"channel>language"`
  28. Description string `xml:"channel>description"`
  29. PubDate string `xml:"channel>pubDate"`
  30. ManagingEditor string `xml:"channel>managingEditor"`
  31. Webmaster string `xml:"channel>webMaster"`
  32. Items []rssItem `xml:"channel>item"`
  33. PodcastFeedElement
  34. }
  35. func (r *rssFeed) Transform(baseURL string) *model.Feed {
  36. var err error
  37. feed := new(model.Feed)
  38. siteURL := r.siteURL()
  39. feed.SiteURL, err = urllib.AbsoluteURL(baseURL, siteURL)
  40. if err != nil {
  41. feed.SiteURL = siteURL
  42. }
  43. feedURL := r.feedURL()
  44. feed.FeedURL, err = urllib.AbsoluteURL(baseURL, feedURL)
  45. if err != nil {
  46. feed.FeedURL = feedURL
  47. }
  48. feed.Title = html.UnescapeString(strings.TrimSpace(r.Title))
  49. if feed.Title == "" {
  50. feed.Title = feed.SiteURL
  51. }
  52. feed.IconURL = strings.TrimSpace(r.ImageURL)
  53. for _, item := range r.Items {
  54. entry := item.Transform()
  55. if entry.Author == "" {
  56. entry.Author = r.feedAuthor()
  57. }
  58. if entry.URL == "" {
  59. entry.URL = feed.SiteURL
  60. } else {
  61. entryURL, err := urllib.AbsoluteURL(feed.SiteURL, entry.URL)
  62. if err == nil {
  63. entry.URL = entryURL
  64. }
  65. }
  66. if entry.Title == "" {
  67. entry.Title = sanitizer.TruncateHTML(entry.Content, 100)
  68. }
  69. if entry.Title == "" {
  70. entry.Title = entry.URL
  71. }
  72. feed.Entries = append(feed.Entries, entry)
  73. }
  74. return feed
  75. }
  76. func (r *rssFeed) siteURL() string {
  77. for _, element := range r.Links {
  78. if element.XMLName.Space == "" {
  79. return strings.TrimSpace(element.Data)
  80. }
  81. }
  82. return ""
  83. }
  84. func (r *rssFeed) feedURL() string {
  85. for _, element := range r.Links {
  86. if element.XMLName.Space == "http://www.w3.org/2005/Atom" {
  87. return strings.TrimSpace(element.Href)
  88. }
  89. }
  90. return ""
  91. }
  92. func (r rssFeed) feedAuthor() string {
  93. author := r.PodcastAuthor()
  94. switch {
  95. case r.ManagingEditor != "":
  96. author = r.ManagingEditor
  97. case r.Webmaster != "":
  98. author = r.Webmaster
  99. }
  100. return sanitizer.StripTags(strings.TrimSpace(author))
  101. }
  102. type rssGUID struct {
  103. XMLName xml.Name
  104. Data string `xml:",chardata"`
  105. IsPermaLink string `xml:"isPermaLink,attr"`
  106. }
  107. type rssLink struct {
  108. XMLName xml.Name
  109. Data string `xml:",chardata"`
  110. Href string `xml:"href,attr"`
  111. Rel string `xml:"rel,attr"`
  112. }
  113. type rssCommentLink struct {
  114. XMLName xml.Name
  115. Data string `xml:",chardata"`
  116. }
  117. type rssAuthor struct {
  118. XMLName xml.Name
  119. Data string `xml:",chardata"`
  120. Name string `xml:"name"`
  121. Email string `xml:"email"`
  122. Inner string `xml:",innerxml"`
  123. }
  124. type rssTitle struct {
  125. XMLName xml.Name
  126. Data string `xml:",chardata"`
  127. Inner string `xml:",innerxml"`
  128. }
  129. type rssEnclosure struct {
  130. URL string `xml:"url,attr"`
  131. Type string `xml:"type,attr"`
  132. Length string `xml:"length,attr"`
  133. }
  134. type rssCategory struct {
  135. XMLName xml.Name
  136. Data string `xml:",chardata"`
  137. Inner string `xml:",innerxml"`
  138. }
  139. func (enclosure *rssEnclosure) Size() int64 {
  140. if enclosure.Length == "" {
  141. return 0
  142. }
  143. size, _ := strconv.ParseInt(enclosure.Length, 10, 0)
  144. return size
  145. }
  146. type rssItem struct {
  147. GUID rssGUID `xml:"guid"`
  148. Title []rssTitle `xml:"title"`
  149. Links []rssLink `xml:"link"`
  150. Description string `xml:"description"`
  151. PubDate string `xml:"pubDate"`
  152. Authors []rssAuthor `xml:"author"`
  153. CommentLinks []rssCommentLink `xml:"comments"`
  154. EnclosureLinks []rssEnclosure `xml:"enclosure"`
  155. Categories []rssCategory `xml:"category"`
  156. dublincore.DublinCoreItemElement
  157. FeedBurnerElement
  158. PodcastEntryElement
  159. media.Element
  160. }
  161. func (r *rssItem) Transform() *model.Entry {
  162. entry := model.NewEntry()
  163. entry.URL = r.entryURL()
  164. entry.CommentsURL = r.entryCommentsURL()
  165. entry.Date = r.entryDate()
  166. entry.Author = r.entryAuthor()
  167. entry.Hash = r.entryHash()
  168. entry.Content = r.entryContent()
  169. entry.Title = r.entryTitle()
  170. entry.Enclosures = r.entryEnclosures()
  171. entry.Tags = r.entryCategories()
  172. if duration, err := normalizeDuration(r.Duration); err == nil {
  173. entry.ReadingTime = duration
  174. }
  175. return entry
  176. }
  177. func (r *rssItem) entryDate() time.Time {
  178. value := r.PubDate
  179. if r.DublinCoreDate != "" {
  180. value = r.DublinCoreDate
  181. }
  182. if value != "" {
  183. result, err := date.Parse(value)
  184. if err != nil {
  185. logger.Error("rss: %v (entry GUID = %s)", err, r.GUID)
  186. return time.Now()
  187. }
  188. return result
  189. }
  190. return time.Now()
  191. }
  192. func (r *rssItem) entryAuthor() string {
  193. author := ""
  194. for _, rssAuthor := range r.Authors {
  195. switch rssAuthor.XMLName.Space {
  196. case "http://www.itunes.com/dtds/podcast-1.0.dtd", "http://www.google.com/schemas/play-podcasts/1.0":
  197. author = rssAuthor.Data
  198. case "http://www.w3.org/2005/Atom":
  199. if rssAuthor.Name != "" {
  200. author = rssAuthor.Name
  201. } else if rssAuthor.Email != "" {
  202. author = rssAuthor.Email
  203. }
  204. default:
  205. if rssAuthor.Name != "" {
  206. author = rssAuthor.Name
  207. } else if strings.Contains(rssAuthor.Inner, "<![CDATA[") {
  208. author = rssAuthor.Data
  209. } else {
  210. author = rssAuthor.Inner
  211. }
  212. }
  213. }
  214. if author == "" {
  215. author = r.GetSanitizedCreator()
  216. }
  217. return sanitizer.StripTags(strings.TrimSpace(author))
  218. }
  219. func (r *rssItem) entryHash() string {
  220. for _, value := range []string{r.GUID.Data, r.entryURL()} {
  221. if value != "" {
  222. return crypto.Hash(value)
  223. }
  224. }
  225. return ""
  226. }
  227. func (r *rssItem) entryTitle() string {
  228. var title string
  229. for _, rssTitle := range r.Title {
  230. switch rssTitle.XMLName.Space {
  231. case "http://search.yahoo.com/mrss/":
  232. // Ignore title in media namespace
  233. case "http://purl.org/dc/elements/1.1/":
  234. title = rssTitle.Data
  235. default:
  236. title = rssTitle.Data
  237. }
  238. if title != "" {
  239. break
  240. }
  241. }
  242. return html.UnescapeString(strings.TrimSpace(title))
  243. }
  244. func (r *rssItem) entryContent() string {
  245. for _, value := range []string{r.DublinCoreContent, r.Description, r.PodcastDescription()} {
  246. if value != "" {
  247. return value
  248. }
  249. }
  250. return ""
  251. }
  252. func (r *rssItem) entryURL() string {
  253. if r.FeedBurnerLink != "" {
  254. return r.FeedBurnerLink
  255. }
  256. for _, link := range r.Links {
  257. if link.XMLName.Space == "http://www.w3.org/2005/Atom" && link.Href != "" && isValidLinkRelation(link.Rel) {
  258. return strings.TrimSpace(link.Href)
  259. }
  260. if link.Data != "" {
  261. return strings.TrimSpace(link.Data)
  262. }
  263. }
  264. // Specs: https://cyber.harvard.edu/rss/rss.html#ltguidgtSubelementOfLtitemgt
  265. // isPermaLink is optional, its default value is true.
  266. // If its value is false, the guid may not be assumed to be a url, or a url to anything in particular.
  267. if r.GUID.IsPermaLink == "true" || r.GUID.IsPermaLink == "" {
  268. return strings.TrimSpace(r.GUID.Data)
  269. }
  270. return ""
  271. }
  272. func (r *rssItem) entryEnclosures() model.EnclosureList {
  273. enclosures := make(model.EnclosureList, 0)
  274. duplicates := make(map[string]bool)
  275. for _, mediaThumbnail := range r.AllMediaThumbnails() {
  276. if _, found := duplicates[mediaThumbnail.URL]; !found {
  277. duplicates[mediaThumbnail.URL] = true
  278. enclosures = append(enclosures, &model.Enclosure{
  279. URL: mediaThumbnail.URL,
  280. MimeType: mediaThumbnail.MimeType(),
  281. Size: mediaThumbnail.Size(),
  282. })
  283. }
  284. }
  285. for _, enclosure := range r.EnclosureLinks {
  286. enclosureURL := enclosure.URL
  287. if r.FeedBurnerEnclosureLink != "" {
  288. filename := path.Base(r.FeedBurnerEnclosureLink)
  289. if strings.Contains(enclosureURL, filename) {
  290. enclosureURL = r.FeedBurnerEnclosureLink
  291. }
  292. }
  293. if enclosureURL == "" {
  294. continue
  295. }
  296. if _, found := duplicates[enclosureURL]; !found {
  297. duplicates[enclosureURL] = true
  298. enclosures = append(enclosures, &model.Enclosure{
  299. URL: enclosureURL,
  300. MimeType: enclosure.Type,
  301. Size: enclosure.Size(),
  302. })
  303. }
  304. }
  305. for _, mediaContent := range r.AllMediaContents() {
  306. if _, found := duplicates[mediaContent.URL]; !found {
  307. duplicates[mediaContent.URL] = true
  308. enclosures = append(enclosures, &model.Enclosure{
  309. URL: mediaContent.URL,
  310. MimeType: mediaContent.MimeType(),
  311. Size: mediaContent.Size(),
  312. })
  313. }
  314. }
  315. for _, mediaPeerLink := range r.AllMediaPeerLinks() {
  316. if _, found := duplicates[mediaPeerLink.URL]; !found {
  317. duplicates[mediaPeerLink.URL] = true
  318. enclosures = append(enclosures, &model.Enclosure{
  319. URL: mediaPeerLink.URL,
  320. MimeType: mediaPeerLink.MimeType(),
  321. Size: mediaPeerLink.Size(),
  322. })
  323. }
  324. }
  325. return enclosures
  326. }
  327. func (r *rssItem) entryCategories() []string {
  328. categoryList := make([]string, 0)
  329. for _, rssCategory := range r.Categories {
  330. if strings.Contains(rssCategory.Inner, "<![CDATA[") {
  331. categoryList = append(categoryList, strings.TrimSpace(rssCategory.Data))
  332. } else {
  333. categoryList = append(categoryList, strings.TrimSpace(rssCategory.Inner))
  334. }
  335. }
  336. return categoryList
  337. }
  338. func (r *rssItem) entryCommentsURL() string {
  339. for _, commentLink := range r.CommentLinks {
  340. if commentLink.XMLName.Space == "" {
  341. commentsURL := strings.TrimSpace(commentLink.Data)
  342. // The comments URL is supposed to be absolute (some feeds publishes incorrect comments URL)
  343. // See https://cyber.harvard.edu/rss/rss.html#ltcommentsgtSubelementOfLtitemgt
  344. if urllib.IsAbsoluteURL(commentsURL) {
  345. return commentsURL
  346. }
  347. }
  348. }
  349. return ""
  350. }
  351. func isValidLinkRelation(rel string) bool {
  352. switch rel {
  353. case "", "alternate", "enclosure", "related", "self", "via":
  354. return true
  355. default:
  356. if strings.HasPrefix(rel, "http") {
  357. return true
  358. }
  359. return false
  360. }
  361. }