  1. // SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
  2. // SPDX-License-Identifier: Apache-2.0
  3. package scraper // import "miniflux.app/v2/internal/reader/scraper"
  4. import (
  5. "errors"
  6. "fmt"
  7. "io"
  8. "log/slog"
  9. "strings"
  10. "miniflux.app/v2/internal/config"
  11. "miniflux.app/v2/internal/http/client"
  12. "miniflux.app/v2/internal/reader/readability"
  13. "miniflux.app/v2/internal/urllib"
  14. "github.com/PuerkitoBio/goquery"
  15. )
  16. // Fetch downloads a web page and returns relevant contents.
  17. func Fetch(websiteURL, rules, userAgent string, cookie string, allowSelfSignedCertificates, useProxy bool) (string, error) {
  18. clt := client.NewClientWithConfig(websiteURL, config.Opts)
  19. clt.WithUserAgent(userAgent)
  20. clt.WithCookie(cookie)
  21. if useProxy {
  22. clt.WithProxy()
  23. }
  24. clt.AllowSelfSignedCertificates = allowSelfSignedCertificates
  25. response, err := clt.Get()
  26. if err != nil {
  27. return "", err
  28. }
  29. if response.HasServerFailure() {
  30. return "", errors.New("scraper: unable to download web page")
  31. }
  32. if !isAllowedContentType(response.ContentType) {
  33. return "", fmt.Errorf("scraper: this resource is not a HTML document (%s)", response.ContentType)
  34. }
  35. if err = response.EnsureUnicodeBody(); err != nil {
  36. return "", err
  37. }
  38. // The entry URL could redirect somewhere else.
  39. sameSite := urllib.Domain(websiteURL) == urllib.Domain(response.EffectiveURL)
  40. websiteURL = response.EffectiveURL
  41. if rules == "" {
  42. rules = getPredefinedScraperRules(websiteURL)
  43. }
  44. var content string
  45. if sameSite && rules != "" {
  46. slog.Debug("Extracting content with custom rules",
  47. "url", websiteURL,
  48. "rules", rules,
  49. )
  50. content, err = scrapContent(response.Body, rules)
  51. } else {
  52. slog.Debug("Extracting content with readability",
  53. "url", websiteURL,
  54. )
  55. content, err = readability.ExtractContent(response.Body)
  56. }
  57. if err != nil {
  58. return "", err
  59. }
  60. return content, nil
  61. }
  62. func scrapContent(page io.Reader, rules string) (string, error) {
  63. document, err := goquery.NewDocumentFromReader(page)
  64. if err != nil {
  65. return "", err
  66. }
  67. contents := ""
  68. document.Find(rules).Each(func(i int, s *goquery.Selection) {
  69. var content string
  70. content, _ = goquery.OuterHtml(s)
  71. contents += content
  72. })
  73. return contents, nil
  74. }
  75. func getPredefinedScraperRules(websiteURL string) string {
  76. urlDomain := urllib.Domain(websiteURL)
  77. for domain, rules := range predefinedRules {
  78. if strings.Contains(urlDomain, domain) {
  79. return rules
  80. }
  81. }
  82. return ""
  83. }
  84. func isAllowedContentType(contentType string) bool {
  85. contentType = strings.ToLower(contentType)
  86. return strings.HasPrefix(contentType, "text/html") ||
  87. strings.HasPrefix(contentType, "application/xhtml+xml")
  88. }