rss.go 4.7 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207
  1. // Copyright 2017 Frédéric Guillot. All rights reserved.
  2. // Use of this source code is governed by the Apache 2.0
  3. // license that can be found in the LICENSE file.
  4. package rss
  5. import (
  6. "encoding/xml"
  7. "github.com/miniflux/miniflux2/helper"
  8. "github.com/miniflux/miniflux2/model"
  9. "github.com/miniflux/miniflux2/reader/feed/date"
  10. "github.com/miniflux/miniflux2/reader/processor"
  11. "github.com/miniflux/miniflux2/reader/sanitizer"
  12. "log"
  13. "path"
  14. "strconv"
  15. "strings"
  16. "time"
  17. )
  18. type RssLink struct {
  19. XMLName xml.Name
  20. Data string `xml:",chardata"`
  21. Href string `xml:"href,attr"`
  22. }
  23. type RssFeed struct {
  24. XMLName xml.Name `xml:"rss"`
  25. Version string `xml:"version,attr"`
  26. Title string `xml:"channel>title"`
  27. Links []RssLink `xml:"channel>link"`
  28. Language string `xml:"channel>language"`
  29. Description string `xml:"channel>description"`
  30. PubDate string `xml:"channel>pubDate"`
  31. ItunesAuthor string `xml:"http://www.itunes.com/dtds/podcast-1.0.dtd channel>author"`
  32. Items []RssItem `xml:"channel>item"`
  33. }
  34. type RssItem struct {
  35. Guid string `xml:"guid"`
  36. Title string `xml:"title"`
  37. Link string `xml:"link"`
  38. OriginalLink string `xml:"http://rssnamespace.org/feedburner/ext/1.0 origLink"`
  39. Description string `xml:"description"`
  40. Content string `xml:"http://purl.org/rss/1.0/modules/content/ encoded"`
  41. PubDate string `xml:"pubDate"`
  42. Date string `xml:"http://purl.org/dc/elements/1.1/ date"`
  43. Authors []RssAuthor `xml:"author"`
  44. Creator string `xml:"http://purl.org/dc/elements/1.1/ creator"`
  45. Enclosures []RssEnclosure `xml:"enclosure"`
  46. OrigEnclosureLink string `xml:"http://rssnamespace.org/feedburner/ext/1.0 origEnclosureLink"`
  47. }
  48. type RssAuthor struct {
  49. XMLName xml.Name
  50. Data string `xml:",chardata"`
  51. Name string `xml:"name"`
  52. }
  53. type RssEnclosure struct {
  54. Url string `xml:"url,attr"`
  55. Type string `xml:"type,attr"`
  56. Length string `xml:"length,attr"`
  57. }
  58. func (r *RssFeed) GetSiteURL() string {
  59. for _, elem := range r.Links {
  60. if elem.XMLName.Space == "" {
  61. return elem.Data
  62. }
  63. }
  64. return ""
  65. }
  66. func (r *RssFeed) GetFeedURL() string {
  67. for _, elem := range r.Links {
  68. if elem.XMLName.Space == "http://www.w3.org/2005/Atom" {
  69. return elem.Href
  70. }
  71. }
  72. return ""
  73. }
  74. func (r *RssFeed) Transform() *model.Feed {
  75. feed := new(model.Feed)
  76. feed.SiteURL = r.GetSiteURL()
  77. feed.FeedURL = r.GetFeedURL()
  78. feed.Title = sanitizer.StripTags(r.Title)
  79. if feed.Title == "" {
  80. feed.Title = feed.SiteURL
  81. }
  82. for _, item := range r.Items {
  83. entry := item.Transform()
  84. if entry.Author == "" && r.ItunesAuthor != "" {
  85. entry.Author = r.ItunesAuthor
  86. }
  87. entry.Author = sanitizer.StripTags(entry.Author)
  88. feed.Entries = append(feed.Entries, entry)
  89. }
  90. return feed
  91. }
  92. func (i *RssItem) GetDate() time.Time {
  93. value := i.PubDate
  94. if i.Date != "" {
  95. value = i.Date
  96. }
  97. if value != "" {
  98. result, err := date.Parse(value)
  99. if err != nil {
  100. log.Println(err)
  101. return time.Now()
  102. }
  103. return result
  104. }
  105. return time.Now()
  106. }
  107. func (i *RssItem) GetAuthor() string {
  108. for _, element := range i.Authors {
  109. if element.Name != "" {
  110. return element.Name
  111. }
  112. if element.Data != "" {
  113. return element.Data
  114. }
  115. }
  116. return i.Creator
  117. }
  118. func (i *RssItem) GetHash() string {
  119. for _, value := range []string{i.Guid, i.Link} {
  120. if value != "" {
  121. return helper.Hash(value)
  122. }
  123. }
  124. return ""
  125. }
  126. func (i *RssItem) GetContent() string {
  127. if i.Content != "" {
  128. return i.Content
  129. }
  130. return i.Description
  131. }
  132. func (i *RssItem) GetURL() string {
  133. if i.OriginalLink != "" {
  134. return i.OriginalLink
  135. }
  136. return i.Link
  137. }
  138. func (i *RssItem) GetEnclosures() model.EnclosureList {
  139. enclosures := make(model.EnclosureList, 0)
  140. for _, enclosure := range i.Enclosures {
  141. length, _ := strconv.Atoi(enclosure.Length)
  142. enclosureURL := enclosure.Url
  143. if i.OrigEnclosureLink != "" {
  144. filename := path.Base(i.OrigEnclosureLink)
  145. if strings.Contains(enclosureURL, filename) {
  146. enclosureURL = i.OrigEnclosureLink
  147. }
  148. }
  149. enclosures = append(enclosures, &model.Enclosure{
  150. URL: enclosureURL,
  151. MimeType: enclosure.Type,
  152. Size: length,
  153. })
  154. }
  155. return enclosures
  156. }
  157. func (i *RssItem) Transform() *model.Entry {
  158. entry := new(model.Entry)
  159. entry.URL = i.GetURL()
  160. entry.Date = i.GetDate()
  161. entry.Author = i.GetAuthor()
  162. entry.Hash = i.GetHash()
  163. entry.Content = processor.ItemContentProcessor(entry.URL, i.GetContent())
  164. entry.Title = sanitizer.StripTags(strings.Trim(i.Title, " \n\t"))
  165. entry.Enclosures = i.GetEnclosures()
  166. if entry.Title == "" {
  167. entry.Title = entry.URL
  168. }
  169. return entry
  170. }