processor.go 6.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233
  1. // Copyright 2018 Frédéric Guillot. All rights reserved.
  2. // Use of this source code is governed by the Apache 2.0
  3. // license that can be found in the LICENSE file.
  4. package processor
  5. import (
  6. "errors"
  7. "fmt"
  8. "math"
  9. "regexp"
  10. "strconv"
  11. "strings"
  12. "time"
  13. "unicode/utf8"
  14. "miniflux.app/config"
  15. "miniflux.app/http/client"
  16. "miniflux.app/logger"
  17. "miniflux.app/metric"
  18. "miniflux.app/model"
  19. "miniflux.app/reader/browser"
  20. "miniflux.app/reader/rewrite"
  21. "miniflux.app/reader/sanitizer"
  22. "miniflux.app/reader/scraper"
  23. "miniflux.app/storage"
  24. "github.com/PuerkitoBio/goquery"
  25. "github.com/rylans/getlang"
  26. )
var (
	// youtubeRegex captures the video ID from a YouTube watch URL
	// (everything after "watch?v=").
	youtubeRegex = regexp.MustCompile(`youtube\.com/watch\?v=(.*)`)
	// iso8601Regex matches an ISO 8601 duration string (e.g. "PT1H30M5S"),
	// with a named capture group per component; consumed by parseISO8601.
	iso8601Regex = regexp.MustCompile(`^P((?P<year>\d+)Y)?((?P<month>\d+)M)?((?P<week>\d+)W)?((?P<day>\d+)D)?(T((?P<hour>\d+)H)?((?P<minute>\d+)M)?((?P<second>\d+)S)?)?$`)
)
  31. // ProcessFeedEntries downloads original web page for entries and apply filters.
// ProcessFeedEntries downloads original web page for entries and apply filters.
//
// Entries matching the blocklist (or failing the keeplist) are dropped; the
// survivors are optionally crawled (feed.Crawler), rewritten, sanitized and
// given a reading time, then feed.Entries is replaced with the filtered set.
func ProcessFeedEntries(store *storage.Storage, feed *model.Feed) {
	var filteredEntries model.Entries

	for _, entry := range feed.Entries {
		logger.Debug("[Processor] Processing entry %q from feed %q", entry.URL, feed.FeedURL)

		// Filtered-out entries are skipped entirely and never appended below.
		if isBlockedEntry(feed, entry) || !isAllowedEntry(feed, entry) {
			continue
		}

		if feed.Crawler {
			// Crawl only entries not already stored, so previously fetched
			// content is never re-downloaded or overwritten.
			if !store.EntryURLExists(feed.ID, entry.URL) {
				logger.Debug("[Processor] Crawling entry %q from feed %q", entry.URL, feed.FeedURL)

				startTime := time.Now()
				content, scraperErr := scraper.Fetch(
					entry.URL,
					feed.ScraperRules,
					feed.UserAgent,
					feed.AllowSelfSignedCertificates,
				)

				// Record scraper latency, labeled by outcome, when metrics are enabled.
				if config.Opts.HasMetricsCollector() {
					status := "success"
					if scraperErr != nil {
						status = "error"
					}
					metric.ScraperRequestDuration.WithLabelValues(status).Observe(time.Since(startTime).Seconds())
				}

				if scraperErr != nil {
					// On failure, keep the content that came with the feed.
					logger.Error(`[Processor] Unable to crawl this entry: %q => %v`, entry.URL, scraperErr)
				} else if content != "" {
					// We replace the entry content only if the scraper doesn't return any error.
					entry.Content = content
				}
			}
		}

		entry.Content = rewrite.Rewriter(entry.URL, entry.Content, feed.RewriteRules)

		// The sanitizer should always run at the end of the process to make sure unsafe HTML is filtered.
		entry.Content = sanitizer.Sanitize(entry.URL, entry.Content)

		if config.Opts.FetchYouTubeWatchTime() {
			if matches := youtubeRegex.FindStringSubmatch(entry.URL); len(matches) == 2 {
				watchTime, err := fetchYouTubeWatchTime(entry.URL)
				if err != nil {
					logger.Error("[Processor] Unable to fetch YouTube watch time: %q => %v", entry.URL, err)
				}
				// NOTE(review): on error watchTime is 0, so the fallback
				// below still computes a content-based reading time.
				entry.ReadingTime = watchTime
			}
		}

		if entry.ReadingTime == 0 {
			entry.ReadingTime = calculateReadingTime(entry.Content)
		}

		filteredEntries = append(filteredEntries, entry)
	}

	feed.Entries = filteredEntries
}
  83. func isBlockedEntry(feed *model.Feed, entry *model.Entry) bool {
  84. if feed.BlocklistRules != "" {
  85. match, _ := regexp.MatchString(feed.BlocklistRules, entry.Title)
  86. if match {
  87. logger.Debug("[Processor] Blocking entry %q from feed %q based on rule %q", entry.Title, feed.FeedURL, feed.BlocklistRules)
  88. return true
  89. }
  90. }
  91. return false
  92. }
  93. func isAllowedEntry(feed *model.Feed, entry *model.Entry) bool {
  94. if feed.KeeplistRules != "" {
  95. match, _ := regexp.MatchString(feed.KeeplistRules, entry.Title)
  96. if match {
  97. logger.Debug("[Processor] Allow entry %q from feed %q based on rule %q", entry.Title, feed.FeedURL, feed.KeeplistRules)
  98. return true
  99. }
  100. return false
  101. }
  102. return true
  103. }
  104. // ProcessEntryWebPage downloads the entry web page and apply rewrite rules.
  105. func ProcessEntryWebPage(feed *model.Feed, entry *model.Entry) error {
  106. startTime := time.Now()
  107. content, scraperErr := scraper.Fetch(
  108. entry.URL,
  109. entry.Feed.ScraperRules,
  110. entry.Feed.UserAgent,
  111. feed.AllowSelfSignedCertificates,
  112. )
  113. if config.Opts.HasMetricsCollector() {
  114. status := "success"
  115. if scraperErr != nil {
  116. status = "error"
  117. }
  118. metric.ScraperRequestDuration.WithLabelValues(status).Observe(time.Since(startTime).Seconds())
  119. }
  120. if scraperErr != nil {
  121. return scraperErr
  122. }
  123. content = rewrite.Rewriter(entry.URL, content, entry.Feed.RewriteRules)
  124. content = sanitizer.Sanitize(entry.URL, content)
  125. if content != "" {
  126. entry.Content = content
  127. entry.ReadingTime = calculateReadingTime(content)
  128. }
  129. return nil
  130. }
  131. func fetchYouTubeWatchTime(url string) (int, error) {
  132. clt := client.NewClientWithConfig(url, config.Opts)
  133. response, browserErr := browser.Exec(clt)
  134. if browserErr != nil {
  135. return 0, browserErr
  136. }
  137. doc, docErr := goquery.NewDocumentFromReader(response.Body)
  138. if docErr != nil {
  139. return 0, docErr
  140. }
  141. durs, exists := doc.Find(`meta[itemprop="duration"]`).First().Attr("content")
  142. if !exists {
  143. return 0, errors.New("duration has not found")
  144. }
  145. dur, err := parseISO8601(durs)
  146. if err != nil {
  147. return 0, fmt.Errorf("unable to parse duration %s: %v", durs, err)
  148. }
  149. return int(dur.Minutes()), nil
  150. }
  151. // parseISO8601 parses an ISO 8601 duration string.
  152. func parseISO8601(from string) (time.Duration, error) {
  153. var match []string
  154. var d time.Duration
  155. if iso8601Regex.MatchString(from) {
  156. match = iso8601Regex.FindStringSubmatch(from)
  157. } else {
  158. return 0, errors.New("could not parse duration string")
  159. }
  160. for i, name := range iso8601Regex.SubexpNames() {
  161. part := match[i]
  162. if i == 0 || name == "" || part == "" {
  163. continue
  164. }
  165. val, err := strconv.ParseInt(part, 10, 64)
  166. if err != nil {
  167. return 0, err
  168. }
  169. switch name {
  170. case "hour":
  171. d = d + (time.Duration(val) * time.Hour)
  172. case "minute":
  173. d = d + (time.Duration(val) * time.Minute)
  174. case "second":
  175. d = d + (time.Duration(val) * time.Second)
  176. default:
  177. return 0, fmt.Errorf("unknown field %s", name)
  178. }
  179. }
  180. return d, nil
  181. }
  182. func calculateReadingTime(content string) int {
  183. sanitizedContent := sanitizer.StripTags(content)
  184. languageInfo := getlang.FromString(sanitizedContent)
  185. var timeToReadInt int
  186. if languageInfo.LanguageCode() == "ko" || languageInfo.LanguageCode() == "zh" || languageInfo.LanguageCode() == "jp" {
  187. timeToReadInt = int(math.Ceil(float64(utf8.RuneCountInString(sanitizedContent)) / 500))
  188. } else {
  189. nbOfWords := len(strings.Fields(sanitizedContent))
  190. timeToReadInt = int(math.Ceil(float64(nbOfWords) / 265))
  191. }
  192. return timeToReadInt
  193. }