rss.go 8.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341
  1. // Copyright 2017 Frédéric Guillot. All rights reserved.
  2. // Use of this source code is governed by the Apache 2.0
  3. // license that can be found in the LICENSE file.
  4. package rss // import "miniflux.app/reader/rss"
  5. import (
  6. "encoding/xml"
  7. "path"
  8. "strconv"
  9. "strings"
  10. "time"
  11. "miniflux.app/crypto"
  12. "miniflux.app/logger"
  13. "miniflux.app/model"
  14. "miniflux.app/reader/date"
  15. "miniflux.app/reader/sanitizer"
  16. "miniflux.app/url"
  17. )
  18. type rssFeed struct {
  19. XMLName xml.Name `xml:"rss"`
  20. Version string `xml:"version,attr"`
  21. Title string `xml:"channel>title"`
  22. Links []rssLink `xml:"channel>link"`
  23. Language string `xml:"channel>language"`
  24. Description string `xml:"channel>description"`
  25. PubDate string `xml:"channel>pubDate"`
  26. ItunesAuthor string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd channel>author"`
  27. Items []rssItem `xml:"channel>item"`
  28. }
  29. type rssLink struct {
  30. XMLName xml.Name
  31. Data string `xml:",chardata"`
  32. Href string `xml:"href,attr"`
  33. Rel string `xml:"rel,attr"`
  34. }
  35. type rssCommentLink struct {
  36. XMLName xml.Name
  37. Data string `xml:",chardata"`
  38. }
  39. type rssAuthor struct {
  40. XMLName xml.Name
  41. Data string `xml:",chardata"`
  42. Name string `xml:"name"`
  43. Inner string `xml:",innerxml"`
  44. }
  45. type rssEnclosure struct {
  46. URL string `xml:"url,attr"`
  47. Type string `xml:"type,attr"`
  48. Length string `xml:"length,attr"`
  49. }
  50. func (enclosure *rssEnclosure) Size() int64 {
  51. if enclosure.Length == "" {
  52. return 0
  53. }
  54. size, _ := strconv.ParseInt(enclosure.Length, 10, 0)
  55. return size
  56. }
  57. type rssItem struct {
  58. GUID string `xml:"guid"`
  59. Title string `xml:"title"`
  60. Links []rssLink `xml:"link"`
  61. OriginalLink string `xml:"http://rssnamespace.org/feedburner/ext/1.0 origLink"`
  62. CommentLinks []rssCommentLink `xml:"comments"`
  63. Description string `xml:"description"`
  64. EncodedContent string `xml:"http://purl.org/rss/1.0/modules/content/ encoded"`
  65. PubDate string `xml:"pubDate"`
  66. Date string `xml:"http://purl.org/dc/elements/1.1/ date"`
  67. Authors []rssAuthor `xml:"author"`
  68. Creator string `xml:"http://purl.org/dc/elements/1.1/ creator"`
  69. EnclosureLinks []rssEnclosure `xml:"enclosure"`
  70. OrigEnclosureLink string `xml:"http://rssnamespace.org/feedburner/ext/1.0 origEnclosureLink"`
  71. MediaGroup []rssMediaGroup `xml:"http://search.yahoo.com/mrss/ group"`
  72. MediaContents []rssMediaContent `xml:"http://search.yahoo.com/mrss/ content"`
  73. MediaThumbnails []rssMediaThumbnails `xml:"http://search.yahoo.com/mrss/ thumbnail"`
  74. }
  75. type rssMediaGroup struct {
  76. MediaList []rssMediaContent `xml:"content"`
  77. }
  78. type rssMediaContent struct {
  79. URL string `xml:"url,attr"`
  80. Type string `xml:"type,attr"`
  81. FileSize string `xml:"fileSize,attr"`
  82. Medium string `xml:"medium,attr"`
  83. }
  84. func (mediaContent *rssMediaContent) MimeType() string {
  85. switch {
  86. case mediaContent.Type == "" && mediaContent.Medium == "image":
  87. return "image/*"
  88. case mediaContent.Type == "" && mediaContent.Medium == "video":
  89. return "video/*"
  90. case mediaContent.Type == "" && mediaContent.Medium == "audio":
  91. return "audio/*"
  92. case mediaContent.Type == "" && mediaContent.Medium == "video":
  93. return "video/*"
  94. case mediaContent.Type != "":
  95. return mediaContent.Type
  96. default:
  97. return "application/octet-stream"
  98. }
  99. }
  100. func (mediaContent *rssMediaContent) Size() int64 {
  101. if mediaContent.FileSize == "" {
  102. return 0
  103. }
  104. size, _ := strconv.ParseInt(mediaContent.FileSize, 10, 0)
  105. return size
  106. }
  107. type rssMediaThumbnails struct {
  108. URL string `xml:"url,attr"`
  109. }
  110. func (r *rssFeed) SiteURL() string {
  111. for _, element := range r.Links {
  112. if element.XMLName.Space == "" {
  113. return strings.TrimSpace(element.Data)
  114. }
  115. }
  116. return ""
  117. }
  118. func (r *rssFeed) FeedURL() string {
  119. for _, element := range r.Links {
  120. if element.XMLName.Space == "http://www.w3.org/2005/Atom" {
  121. return strings.TrimSpace(element.Href)
  122. }
  123. }
  124. return ""
  125. }
  126. func (r *rssFeed) Transform() *model.Feed {
  127. feed := new(model.Feed)
  128. feed.SiteURL = r.SiteURL()
  129. feed.FeedURL = r.FeedURL()
  130. feed.Title = strings.TrimSpace(r.Title)
  131. if feed.Title == "" {
  132. feed.Title = feed.SiteURL
  133. }
  134. for _, item := range r.Items {
  135. entry := item.Transform()
  136. if entry.Author == "" && r.ItunesAuthor != "" {
  137. entry.Author = r.ItunesAuthor
  138. }
  139. entry.Author = strings.TrimSpace(sanitizer.StripTags(entry.Author))
  140. if entry.URL == "" {
  141. entry.URL = feed.SiteURL
  142. } else {
  143. entryURL, err := url.AbsoluteURL(feed.SiteURL, entry.URL)
  144. if err == nil {
  145. entry.URL = entryURL
  146. }
  147. }
  148. if entry.Title == "" {
  149. entry.Title = entry.URL
  150. }
  151. feed.Entries = append(feed.Entries, entry)
  152. }
  153. return feed
  154. }
  155. func (r *rssItem) PublishedDate() time.Time {
  156. value := r.PubDate
  157. if r.Date != "" {
  158. value = r.Date
  159. }
  160. if value != "" {
  161. result, err := date.Parse(value)
  162. if err != nil {
  163. logger.Error("rss: %v", err)
  164. return time.Now()
  165. }
  166. return result
  167. }
  168. return time.Now()
  169. }
  170. func (r *rssItem) Author() string {
  171. for _, element := range r.Authors {
  172. if element.Name != "" {
  173. return element.Name
  174. }
  175. if element.Inner != "" {
  176. return element.Inner
  177. }
  178. }
  179. return r.Creator
  180. }
  181. func (r *rssItem) Hash() string {
  182. for _, value := range []string{r.GUID, r.URL()} {
  183. if value != "" {
  184. return crypto.Hash(value)
  185. }
  186. }
  187. return ""
  188. }
  189. func (r *rssItem) Content() string {
  190. if r.EncodedContent != "" {
  191. return r.EncodedContent
  192. }
  193. return r.Description
  194. }
  195. func (r *rssItem) URL() string {
  196. if r.OriginalLink != "" {
  197. return r.OriginalLink
  198. }
  199. for _, link := range r.Links {
  200. if link.XMLName.Space == "http://www.w3.org/2005/Atom" && link.Href != "" && isValidLinkRelation(link.Rel) {
  201. return strings.TrimSpace(link.Href)
  202. }
  203. if link.Data != "" {
  204. return strings.TrimSpace(link.Data)
  205. }
  206. }
  207. return ""
  208. }
  209. func (r *rssItem) Enclosures() model.EnclosureList {
  210. enclosures := make(model.EnclosureList, 0)
  211. duplicates := make(map[string]bool, 0)
  212. for _, mediaThumbnail := range r.MediaThumbnails {
  213. if _, found := duplicates[mediaThumbnail.URL]; !found {
  214. duplicates[mediaThumbnail.URL] = true
  215. enclosures = append(enclosures, &model.Enclosure{
  216. URL: mediaThumbnail.URL,
  217. MimeType: "image/*",
  218. Size: 0,
  219. })
  220. }
  221. }
  222. for _, enclosure := range r.EnclosureLinks {
  223. enclosureURL := enclosure.URL
  224. if r.OrigEnclosureLink != "" {
  225. filename := path.Base(r.OrigEnclosureLink)
  226. if strings.Contains(enclosureURL, filename) {
  227. enclosureURL = r.OrigEnclosureLink
  228. }
  229. }
  230. if _, found := duplicates[enclosureURL]; !found {
  231. duplicates[enclosureURL] = true
  232. enclosures = append(enclosures, &model.Enclosure{
  233. URL: enclosureURL,
  234. MimeType: enclosure.Type,
  235. Size: enclosure.Size(),
  236. })
  237. }
  238. }
  239. for _, mediaContentItem := range r.MediaGroup {
  240. for _, mediaContent := range mediaContentItem.MediaList {
  241. r.MediaContents = append(r.MediaContents, mediaContent)
  242. }
  243. }
  244. for _, mediaContent := range r.MediaContents {
  245. if _, found := duplicates[mediaContent.URL]; !found {
  246. duplicates[mediaContent.URL] = true
  247. enclosures = append(enclosures, &model.Enclosure{
  248. URL: mediaContent.URL,
  249. MimeType: mediaContent.MimeType(),
  250. Size: mediaContent.Size(),
  251. })
  252. }
  253. }
  254. return enclosures
  255. }
  256. func (r *rssItem) CommentsURL() string {
  257. for _, commentLink := range r.CommentLinks {
  258. if commentLink.XMLName.Space == "" {
  259. return strings.TrimSpace(commentLink.Data)
  260. }
  261. }
  262. return ""
  263. }
  264. func (r *rssItem) Transform() *model.Entry {
  265. entry := new(model.Entry)
  266. entry.URL = r.URL()
  267. entry.CommentsURL = r.CommentsURL()
  268. entry.Date = r.PublishedDate()
  269. entry.Author = r.Author()
  270. entry.Hash = r.Hash()
  271. entry.Content = r.Content()
  272. entry.Title = strings.TrimSpace(r.Title)
  273. entry.Enclosures = r.Enclosures()
  274. return entry
  275. }
  276. func isValidLinkRelation(rel string) bool {
  277. switch rel {
  278. case "", "alternate", "enclosure", "related", "self", "via":
  279. return true
  280. default:
  281. if strings.HasPrefix(rel, "http") {
  282. return true
  283. }
  284. return false
  285. }
  286. }