rss.go 5.8 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258
  1. // Copyright 2017 Frédéric Guillot. All rights reserved.
  2. // Use of this source code is governed by the Apache 2.0
  3. // license that can be found in the LICENSE file.
  4. package rss
  5. import (
  6. "encoding/xml"
  7. "path"
  8. "strconv"
  9. "strings"
  10. "time"
  11. "github.com/miniflux/miniflux/crypto"
  12. "github.com/miniflux/miniflux/logger"
  13. "github.com/miniflux/miniflux/model"
  14. "github.com/miniflux/miniflux/reader/date"
  15. "github.com/miniflux/miniflux/reader/sanitizer"
  16. "github.com/miniflux/miniflux/url"
  17. )
  18. type rssFeed struct {
  19. XMLName xml.Name `xml:"rss"`
  20. Version string `xml:"version,attr"`
  21. Title string `xml:"channel>title"`
  22. Links []rssLink `xml:"channel>link"`
  23. Language string `xml:"channel>language"`
  24. Description string `xml:"channel>description"`
  25. PubDate string `xml:"channel>pubDate"`
  26. ItunesAuthor string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd channel>author"`
  27. Items []rssItem `xml:"channel>item"`
  28. }
  29. type rssLink struct {
  30. XMLName xml.Name
  31. Data string `xml:",chardata"`
  32. Href string `xml:"href,attr"`
  33. Rel string `xml:"rel,attr"`
  34. }
  35. type rssCommentLink struct {
  36. XMLName xml.Name
  37. Data string `xml:",chardata"`
  38. }
  39. type rssAuthor struct {
  40. XMLName xml.Name
  41. Data string `xml:",chardata"`
  42. Name string `xml:"name"`
  43. Inner string `xml:",innerxml"`
  44. }
  45. type rssEnclosure struct {
  46. URL string `xml:"url,attr"`
  47. Type string `xml:"type,attr"`
  48. Length string `xml:"length,attr"`
  49. }
  50. type rssItem struct {
  51. GUID string `xml:"guid"`
  52. Title string `xml:"title"`
  53. Links []rssLink `xml:"link"`
  54. OriginalLink string `xml:"http://rssnamespace.org/feedburner/ext/1.0 origLink"`
  55. CommentLinks []rssCommentLink `xml:"comments"`
  56. Description string `xml:"description"`
  57. EncodedContent string `xml:"http://purl.org/rss/1.0/modules/content/ encoded"`
  58. PubDate string `xml:"pubDate"`
  59. Date string `xml:"http://purl.org/dc/elements/1.1/ date"`
  60. Authors []rssAuthor `xml:"author"`
  61. Creator string `xml:"http://purl.org/dc/elements/1.1/ creator"`
  62. EnclosureLinks []rssEnclosure `xml:"enclosure"`
  63. OrigEnclosureLink string `xml:"http://rssnamespace.org/feedburner/ext/1.0 origEnclosureLink"`
  64. }
  65. func (r *rssFeed) SiteURL() string {
  66. for _, element := range r.Links {
  67. if element.XMLName.Space == "" {
  68. return strings.TrimSpace(element.Data)
  69. }
  70. }
  71. return ""
  72. }
  73. func (r *rssFeed) FeedURL() string {
  74. for _, element := range r.Links {
  75. if element.XMLName.Space == "http://www.w3.org/2005/Atom" {
  76. return strings.TrimSpace(element.Href)
  77. }
  78. }
  79. return ""
  80. }
  81. func (r *rssFeed) Transform() *model.Feed {
  82. feed := new(model.Feed)
  83. feed.SiteURL = r.SiteURL()
  84. feed.FeedURL = r.FeedURL()
  85. feed.Title = strings.TrimSpace(r.Title)
  86. if feed.Title == "" {
  87. feed.Title = feed.SiteURL
  88. }
  89. for _, item := range r.Items {
  90. entry := item.Transform()
  91. if entry.Author == "" && r.ItunesAuthor != "" {
  92. entry.Author = r.ItunesAuthor
  93. }
  94. entry.Author = strings.TrimSpace(sanitizer.StripTags(entry.Author))
  95. if entry.URL == "" {
  96. entry.URL = feed.SiteURL
  97. } else {
  98. entryURL, err := url.AbsoluteURL(feed.SiteURL, entry.URL)
  99. if err == nil {
  100. entry.URL = entryURL
  101. }
  102. }
  103. if entry.Title == "" {
  104. entry.Title = entry.URL
  105. }
  106. feed.Entries = append(feed.Entries, entry)
  107. }
  108. return feed
  109. }
  110. func (r *rssItem) PublishedDate() time.Time {
  111. value := r.PubDate
  112. if r.Date != "" {
  113. value = r.Date
  114. }
  115. if value != "" {
  116. result, err := date.Parse(value)
  117. if err != nil {
  118. logger.Error("rss: %v", err)
  119. return time.Now()
  120. }
  121. return result
  122. }
  123. return time.Now()
  124. }
  125. func (r *rssItem) Author() string {
  126. for _, element := range r.Authors {
  127. if element.Name != "" {
  128. return element.Name
  129. }
  130. if element.Inner != "" {
  131. return element.Inner
  132. }
  133. }
  134. return r.Creator
  135. }
  136. func (r *rssItem) Hash() string {
  137. for _, value := range []string{r.GUID, r.URL()} {
  138. if value != "" {
  139. return crypto.Hash(value)
  140. }
  141. }
  142. return ""
  143. }
  144. func (r *rssItem) Content() string {
  145. if r.EncodedContent != "" {
  146. return r.EncodedContent
  147. }
  148. return r.Description
  149. }
  150. func (r *rssItem) URL() string {
  151. if r.OriginalLink != "" {
  152. return r.OriginalLink
  153. }
  154. for _, link := range r.Links {
  155. if link.XMLName.Space == "http://www.w3.org/2005/Atom" && link.Href != "" && isValidLinkRelation(link.Rel) {
  156. return strings.TrimSpace(link.Href)
  157. }
  158. if link.Data != "" {
  159. return strings.TrimSpace(link.Data)
  160. }
  161. }
  162. return ""
  163. }
  164. func (r *rssItem) Enclosures() model.EnclosureList {
  165. enclosures := make(model.EnclosureList, 0)
  166. for _, enclosure := range r.EnclosureLinks {
  167. length, _ := strconv.ParseInt(enclosure.Length, 10, 0)
  168. enclosureURL := enclosure.URL
  169. if r.OrigEnclosureLink != "" {
  170. filename := path.Base(r.OrigEnclosureLink)
  171. if strings.Contains(enclosureURL, filename) {
  172. enclosureURL = r.OrigEnclosureLink
  173. }
  174. }
  175. enclosures = append(enclosures, &model.Enclosure{
  176. URL: enclosureURL,
  177. MimeType: enclosure.Type,
  178. Size: length,
  179. })
  180. }
  181. return enclosures
  182. }
  183. func (r *rssItem) CommentsURL() string {
  184. for _, commentLink := range r.CommentLinks {
  185. if commentLink.XMLName.Space == "" {
  186. return strings.TrimSpace(commentLink.Data)
  187. }
  188. }
  189. return ""
  190. }
  191. func (r *rssItem) Transform() *model.Entry {
  192. entry := new(model.Entry)
  193. entry.URL = r.URL()
  194. entry.CommentsURL = r.CommentsURL()
  195. entry.Date = r.PublishedDate()
  196. entry.Author = r.Author()
  197. entry.Hash = r.Hash()
  198. entry.Content = r.Content()
  199. entry.Title = strings.TrimSpace(r.Title)
  200. entry.Enclosures = r.Enclosures()
  201. return entry
  202. }
  203. func isValidLinkRelation(rel string) bool {
  204. switch rel {
  205. case "", "alternate", "enclosure", "related", "self", "via":
  206. return true
  207. default:
  208. if strings.HasPrefix(rel, "http") {
  209. return true
  210. }
  211. return false
  212. }
  213. }