scraper.go 2.6 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697
  1. // SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
  2. // SPDX-License-Identifier: Apache-2.0
  3. package scraper // import "miniflux.app/v2/internal/reader/scraper"
  4. import (
  5. "fmt"
  6. "io"
  7. "log/slog"
  8. "strings"
  9. "miniflux.app/v2/internal/config"
  10. "miniflux.app/v2/internal/reader/fetcher"
  11. "miniflux.app/v2/internal/reader/readability"
  12. "miniflux.app/v2/internal/urllib"
  13. "github.com/PuerkitoBio/goquery"
  14. )
  15. func ScrapeWebsite(requestBuilder *fetcher.RequestBuilder, websiteURL, rules string) (string, error) {
  16. responseHandler := fetcher.NewResponseHandler(requestBuilder.ExecuteRequest(websiteURL))
  17. defer responseHandler.Close()
  18. if localizedError := responseHandler.LocalizedError(); localizedError != nil {
  19. slog.Warn("Unable to scrape website", slog.String("website_url", websiteURL), slog.Any("error", localizedError.Error()))
  20. return "", localizedError.Error()
  21. }
  22. if !isAllowedContentType(responseHandler.ContentType()) {
  23. return "", fmt.Errorf("scraper: this resource is not a HTML document (%s)", responseHandler.ContentType())
  24. }
  25. // The entry URL could redirect somewhere else.
  26. sameSite := urllib.Domain(websiteURL) == urllib.Domain(responseHandler.EffectiveURL())
  27. websiteURL = responseHandler.EffectiveURL()
  28. if rules == "" {
  29. rules = getPredefinedScraperRules(websiteURL)
  30. }
  31. var content string
  32. var err error
  33. if sameSite && rules != "" {
  34. slog.Debug("Extracting content with custom rules",
  35. "url", websiteURL,
  36. "rules", rules,
  37. )
  38. content, err = findContentUsingCustomRules(responseHandler.Body(config.Opts.HTTPClientMaxBodySize()), rules)
  39. } else {
  40. slog.Debug("Extracting content with readability",
  41. "url", websiteURL,
  42. )
  43. content, err = readability.ExtractContent(responseHandler.Body(config.Opts.HTTPClientMaxBodySize()))
  44. }
  45. if err != nil {
  46. return "", err
  47. }
  48. return content, nil
  49. }
  50. func findContentUsingCustomRules(page io.Reader, rules string) (string, error) {
  51. document, err := goquery.NewDocumentFromReader(page)
  52. if err != nil {
  53. return "", err
  54. }
  55. contents := ""
  56. document.Find(rules).Each(func(i int, s *goquery.Selection) {
  57. var content string
  58. content, _ = goquery.OuterHtml(s)
  59. contents += content
  60. })
  61. return contents, nil
  62. }
  63. func getPredefinedScraperRules(websiteURL string) string {
  64. urlDomain := urllib.Domain(websiteURL)
  65. for domain, rules := range predefinedRules {
  66. if strings.Contains(urlDomain, domain) {
  67. return rules
  68. }
  69. }
  70. return ""
  71. }
  72. func isAllowedContentType(contentType string) bool {
  73. contentType = strings.ToLower(contentType)
  74. return strings.HasPrefix(contentType, "text/html") ||
  75. strings.HasPrefix(contentType, "application/xhtml+xml")
  76. }