scraper.go 2.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106
  1. // Copyright 2017 Frédéric Guillot. All rights reserved.
  2. // Use of this source code is governed by the Apache 2.0
  3. // license that can be found in the LICENSE file.
  4. package scraper // import "miniflux.app/reader/scraper"
  5. import (
  6. "errors"
  7. "fmt"
  8. "io"
  9. "strings"
  10. "miniflux.app/config"
  11. "miniflux.app/http/client"
  12. "miniflux.app/logger"
  13. "miniflux.app/reader/readability"
  14. "miniflux.app/url"
  15. "github.com/PuerkitoBio/goquery"
  16. )
  17. // Fetch downloads a web page and returns relevant contents.
  18. func Fetch(websiteURL, rules, userAgent string, cookie string, allowSelfSignedCertificates, useProxy bool) (string, error) {
  19. clt := client.NewClientWithConfig(websiteURL, config.Opts)
  20. clt.WithUserAgent(userAgent)
  21. clt.WithCookie(cookie)
  22. if useProxy {
  23. clt.WithProxy()
  24. }
  25. clt.AllowSelfSignedCertificates = allowSelfSignedCertificates
  26. response, err := clt.Get()
  27. if err != nil {
  28. return "", err
  29. }
  30. if response.HasServerFailure() {
  31. return "", errors.New("scraper: unable to download web page")
  32. }
  33. if !isAllowedContentType(response.ContentType) {
  34. return "", fmt.Errorf("scraper: this resource is not a HTML document (%s)", response.ContentType)
  35. }
  36. if err = response.EnsureUnicodeBody(); err != nil {
  37. return "", err
  38. }
  39. // The entry URL could redirect somewhere else.
  40. sameSite := url.Domain(websiteURL) == url.Domain(response.EffectiveURL)
  41. websiteURL = response.EffectiveURL
  42. if rules == "" {
  43. rules = getPredefinedScraperRules(websiteURL)
  44. }
  45. var content string
  46. if sameSite && rules != "" {
  47. logger.Debug(`[Scraper] Using rules %q for %q`, rules, websiteURL)
  48. content, err = scrapContent(response.Body, rules)
  49. } else {
  50. logger.Debug(`[Scraper] Using readability for %q`, websiteURL)
  51. content, err = readability.ExtractContent(response.Body)
  52. }
  53. if err != nil {
  54. return "", err
  55. }
  56. return content, nil
  57. }
  58. func scrapContent(page io.Reader, rules string) (string, error) {
  59. document, err := goquery.NewDocumentFromReader(page)
  60. if err != nil {
  61. return "", err
  62. }
  63. contents := ""
  64. document.Find(rules).Each(func(i int, s *goquery.Selection) {
  65. var content string
  66. content, _ = goquery.OuterHtml(s)
  67. contents += content
  68. })
  69. return contents, nil
  70. }
  71. func getPredefinedScraperRules(websiteURL string) string {
  72. urlDomain := url.Domain(websiteURL)
  73. for domain, rules := range predefinedRules {
  74. if strings.Contains(urlDomain, domain) {
  75. return rules
  76. }
  77. }
  78. return ""
  79. }
  // isAllowedContentType reports whether contentType, compared without regard
  // to case, starts with one of the media types accepted by the scraper:
  // "text/html" or "application/xhtml+xml".
  func isAllowedContentType(contentType string) bool {
  	lowered := strings.ToLower(contentType)
  	for _, prefix := range []string{"text/html", "application/xhtml+xml"} {
  		if strings.HasPrefix(lowered, prefix) {
  			return true
  		}
  	}
  	return false
  }