rss.go 5.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235
  1. // Copyright 2017 Frédéric Guillot. All rights reserved.
  2. // Use of this source code is governed by the Apache 2.0
  3. // license that can be found in the LICENSE file.
  4. package rss
  5. import (
  6. "encoding/xml"
  7. "log"
  8. "path"
  9. "strconv"
  10. "strings"
  11. "time"
  12. "github.com/miniflux/miniflux2/helper"
  13. "github.com/miniflux/miniflux2/model"
  14. "github.com/miniflux/miniflux2/reader/date"
  15. "github.com/miniflux/miniflux2/reader/processor"
  16. "github.com/miniflux/miniflux2/reader/sanitizer"
  17. )
  18. type rssFeed struct {
  19. XMLName xml.Name `xml:"rss"`
  20. Version string `xml:"version,attr"`
  21. Title string `xml:"channel>title"`
  22. Links []rssLink `xml:"channel>link"`
  23. Language string `xml:"channel>language"`
  24. Description string `xml:"channel>description"`
  25. PubDate string `xml:"channel>pubDate"`
  26. ItunesAuthor string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd channel>author"`
  27. Items []rssItem `xml:"channel>item"`
  28. }
  29. type rssLink struct {
  30. XMLName xml.Name
  31. Data string `xml:",chardata"`
  32. Href string `xml:"href,attr"`
  33. Rel string `xml:"rel,attr"`
  34. }
  35. type rssItem struct {
  36. GUID string `xml:"guid"`
  37. Title string `xml:"title"`
  38. Links []rssLink `xml:"link"`
  39. OriginalLink string `xml:"http://rssnamespace.org/feedburner/ext/1.0 origLink"`
  40. Description string `xml:"description"`
  41. Content string `xml:"http://purl.org/rss/1.0/modules/content/ encoded"`
  42. PubDate string `xml:"pubDate"`
  43. Date string `xml:"http://purl.org/dc/elements/1.1/ date"`
  44. Authors []rssAuthor `xml:"author"`
  45. Creator string `xml:"http://purl.org/dc/elements/1.1/ creator"`
  46. Enclosures []rssEnclosure `xml:"enclosure"`
  47. OrigEnclosureLink string `xml:"http://rssnamespace.org/feedburner/ext/1.0 origEnclosureLink"`
  48. }
  49. type rssAuthor struct {
  50. XMLName xml.Name
  51. Data string `xml:",chardata"`
  52. Name string `xml:"name"`
  53. }
  54. type rssEnclosure struct {
  55. URL string `xml:"url,attr"`
  56. Type string `xml:"type,attr"`
  57. Length string `xml:"length,attr"`
  58. }
  59. func (r *rssFeed) GetSiteURL() string {
  60. for _, element := range r.Links {
  61. if element.XMLName.Space == "" {
  62. return element.Data
  63. }
  64. }
  65. return ""
  66. }
  67. func (r *rssFeed) GetFeedURL() string {
  68. for _, element := range r.Links {
  69. if element.XMLName.Space == "http://www.w3.org/2005/Atom" {
  70. return element.Href
  71. }
  72. }
  73. return ""
  74. }
  75. func (r *rssFeed) Transform() *model.Feed {
  76. feed := new(model.Feed)
  77. feed.SiteURL = r.GetSiteURL()
  78. feed.FeedURL = r.GetFeedURL()
  79. feed.Title = sanitizer.StripTags(r.Title)
  80. if feed.Title == "" {
  81. feed.Title = feed.SiteURL
  82. }
  83. for _, item := range r.Items {
  84. entry := item.Transform()
  85. if entry.Author == "" && r.ItunesAuthor != "" {
  86. entry.Author = r.ItunesAuthor
  87. }
  88. entry.Author = sanitizer.StripTags(entry.Author)
  89. if entry.URL == "" {
  90. entry.URL = feed.SiteURL
  91. }
  92. feed.Entries = append(feed.Entries, entry)
  93. }
  94. return feed
  95. }
  96. func (r *rssItem) GetDate() time.Time {
  97. value := r.PubDate
  98. if r.Date != "" {
  99. value = r.Date
  100. }
  101. if value != "" {
  102. result, err := date.Parse(value)
  103. if err != nil {
  104. log.Println(err)
  105. return time.Now()
  106. }
  107. return result
  108. }
  109. return time.Now()
  110. }
  111. func (r *rssItem) GetAuthor() string {
  112. for _, element := range r.Authors {
  113. if element.Name != "" {
  114. return element.Name
  115. }
  116. if element.Data != "" {
  117. return element.Data
  118. }
  119. }
  120. return r.Creator
  121. }
  122. func (r *rssItem) GetHash() string {
  123. for _, value := range []string{r.GUID, r.GetURL()} {
  124. if value != "" {
  125. return helper.Hash(value)
  126. }
  127. }
  128. return ""
  129. }
  130. func (r *rssItem) GetContent() string {
  131. if r.Content != "" {
  132. return r.Content
  133. }
  134. return r.Description
  135. }
  136. func (r *rssItem) GetURL() string {
  137. if r.OriginalLink != "" {
  138. return r.OriginalLink
  139. }
  140. for _, link := range r.Links {
  141. if link.XMLName.Space == "http://www.w3.org/2005/Atom" && link.Href != "" && isValidLinkRelation(link.Rel) {
  142. return link.Href
  143. }
  144. if link.Data != "" {
  145. return link.Data
  146. }
  147. }
  148. return ""
  149. }
  150. func (r *rssItem) GetEnclosures() model.EnclosureList {
  151. enclosures := make(model.EnclosureList, 0)
  152. for _, enclosure := range r.Enclosures {
  153. length, _ := strconv.Atoi(enclosure.Length)
  154. enclosureURL := enclosure.URL
  155. if r.OrigEnclosureLink != "" {
  156. filename := path.Base(r.OrigEnclosureLink)
  157. if strings.Contains(enclosureURL, filename) {
  158. enclosureURL = r.OrigEnclosureLink
  159. }
  160. }
  161. enclosures = append(enclosures, &model.Enclosure{
  162. URL: enclosureURL,
  163. MimeType: enclosure.Type,
  164. Size: length,
  165. })
  166. }
  167. return enclosures
  168. }
  169. func (r *rssItem) Transform() *model.Entry {
  170. entry := new(model.Entry)
  171. entry.URL = r.GetURL()
  172. entry.Date = r.GetDate()
  173. entry.Author = r.GetAuthor()
  174. entry.Hash = r.GetHash()
  175. entry.Content = processor.ItemContentProcessor(entry.URL, r.GetContent())
  176. entry.Title = sanitizer.StripTags(strings.Trim(r.Title, " \n\t"))
  177. entry.Enclosures = r.GetEnclosures()
  178. if entry.Title == "" {
  179. entry.Title = entry.URL
  180. }
  181. return entry
  182. }
  183. func isValidLinkRelation(rel string) bool {
  184. switch rel {
  185. case "", "alternate", "enclosure", "related", "self", "via":
  186. return true
  187. default:
  188. if strings.HasPrefix(rel, "http") {
  189. return true
  190. }
  191. return false
  192. }
  193. }