  1. // Copyright 2017 Frédéric Guillot. All rights reserved.
  2. // Use of this source code is governed by the Apache 2.0
  3. // license that can be found in the LICENSE file.
  4. package scraper
  5. import (
  6. "errors"
  7. "io"
  8. "log"
  9. "strings"
  10. "github.com/PuerkitoBio/goquery"
  11. "github.com/miniflux/miniflux2/http"
  12. "github.com/miniflux/miniflux2/reader/readability"
  13. "github.com/miniflux/miniflux2/reader/sanitizer"
  14. "github.com/miniflux/miniflux2/url"
  15. )
  16. // Fetch download a web page a returns relevant contents.
  17. func Fetch(websiteURL, rules string) (string, error) {
  18. client := http.NewClient(websiteURL)
  19. response, err := client.Get()
  20. if err != nil {
  21. return "", err
  22. }
  23. if response.HasServerFailure() {
  24. return "", errors.New("unable to download web page")
  25. }
  26. page, err := response.NormalizeBodyEncoding()
  27. if err != nil {
  28. return "", err
  29. }
  30. var content string
  31. if rules == "" {
  32. rules = getPredefinedScraperRules(websiteURL)
  33. }
  34. if rules != "" {
  35. log.Printf(`[Scraper] Using rules "%s" for "%s"`, rules, websiteURL)
  36. content, err = scrapContent(page, rules)
  37. } else {
  38. log.Printf(`[Scraper] Using readability for "%s"`, websiteURL)
  39. content, err = readability.ExtractContent(page)
  40. }
  41. if err != nil {
  42. return "", err
  43. }
  44. return sanitizer.Sanitize(websiteURL, content), nil
  45. }
  46. func scrapContent(page io.Reader, rules string) (string, error) {
  47. document, err := goquery.NewDocumentFromReader(page)
  48. if err != nil {
  49. return "", err
  50. }
  51. contents := ""
  52. document.Find(rules).Each(func(i int, s *goquery.Selection) {
  53. var content string
  54. // For some inline elements, we get the parent.
  55. if s.Is("img") {
  56. content, _ = s.Parent().Html()
  57. } else {
  58. content, _ = s.Html()
  59. }
  60. contents += content
  61. })
  62. return contents, nil
  63. }
  64. func getPredefinedScraperRules(websiteURL string) string {
  65. urlDomain := url.Domain(websiteURL)
  66. for domain, rules := range predefinedRules {
  67. if strings.Contains(urlDomain, domain) {
  68. return rules
  69. }
  70. }
  71. return ""
  72. }