scraper.go

// Copyright 2017 Frédéric Guillot. All rights reserved.
// Use of this source code is governed by the Apache 2.0
// license that can be found in the LICENSE file.

package scraper // import "miniflux.app/reader/scraper"

import (
	"errors"
	"fmt"
	"io"
	"strings"

	"miniflux.app/config"
	"miniflux.app/http/client"
	"miniflux.app/logger"
	"miniflux.app/reader/readability"
	"miniflux.app/url"

	"github.com/PuerkitoBio/goquery"
)
// Fetch downloads a web page and returns its relevant contents.
func Fetch(websiteURL, rules, userAgent, cookie string, allowSelfSignedCertificates, useProxy bool) (string, error) {
	clt := client.NewClientWithConfig(websiteURL, config.Opts)
	clt.WithUserAgent(userAgent)
	clt.WithCookie(cookie)
	if useProxy {
		clt.WithProxy()
	}
	clt.AllowSelfSignedCertificates = allowSelfSignedCertificates

	response, err := clt.Get()
	if err != nil {
		return "", err
	}

	if response.HasServerFailure() {
		return "", errors.New("scraper: unable to download web page")
	}

	if !isAllowedContentType(response.ContentType) {
		return "", fmt.Errorf("scraper: this resource is not an HTML document (%s)", response.ContentType)
	}

	if err = response.EnsureUnicodeBody(); err != nil {
		return "", err
	}

	// The entry URL could redirect somewhere else.
	websiteURL = response.EffectiveURL

	if rules == "" {
		rules = getPredefinedScraperRules(websiteURL)
	}

	var content string
	if rules != "" {
		logger.Debug(`[Scraper] Using rules %q for %q`, rules, websiteURL)
		content, err = scrapContent(response.Body, rules)
	} else {
		logger.Debug(`[Scraper] Using readability for %q`, websiteURL)
		content, err = readability.ExtractContent(response.Body)
	}

	if err != nil {
		return "", err
	}

	return content, nil
}
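
// A minimal usage sketch (hypothetical values; assumes config.Opts has been
// initialized by the caller before Fetch runs):
//
//	content, err := Fetch("https://example.org/article", "", "Mozilla/5.0", "", false, false)
//	if err != nil {
//		// handle the download/extraction error
//	}
//	_ = content // the scraped HTML fragment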

// scrapContent applies the given CSS selector rules to the page and
// concatenates the outer HTML of every matching element.
func scrapContent(page io.Reader, rules string) (string, error) {
	document, err := goquery.NewDocumentFromReader(page)
	if err != nil {
		return "", err
	}

	contents := ""
	document.Find(rules).Each(func(i int, s *goquery.Selection) {
		// Serialization errors are ignored; a failed match contributes nothing.
		content, _ := goquery.OuterHtml(s)
		contents += content
	})

	return contents, nil
}
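
// For example (a sketch with a made-up selector and document):
//
//	html := strings.NewReader(`<article><p>Hello</p></article><aside>skip</aside>`)
//	fragment, _ := scrapContent(html, "article")
//	// fragment == "<article><p>Hello</p></article>"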

// getPredefinedScraperRules returns the scraper rules registered for the
// website's domain, or an empty string when no predefined rules match.
func getPredefinedScraperRules(websiteURL string) string {
	urlDomain := url.Domain(websiteURL)

	for domain, rules := range predefinedRules {
		if strings.Contains(urlDomain, domain) {
			return rules
		}
	}

	return ""
}
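
// predefinedRules is declared elsewhere in this package (rules.go in the
// upstream Miniflux source). It is a plain domain-to-selector map, roughly:
//
//	var predefinedRules = map[string]string{
//		"example.org": "div.article-body", // illustrative entry, not verbatim
//	}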

// isAllowedContentType reports whether the response looks like an HTML or
// XHTML document based on its Content-Type header.
func isAllowedContentType(contentType string) bool {
	contentType = strings.ToLower(contentType)
	return strings.HasPrefix(contentType, "text/html") ||
		strings.HasPrefix(contentType, "application/xhtml+xml")
}
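
// The prefix check accepts headers that carry parameters, for example:
//
//	isAllowedContentType("text/html; charset=utf-8")  // true
//	isAllowedContentType("application/xhtml+xml")     // true
//	isAllowedContentType("application/pdf")           // false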