scraper.go 3.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108
  1. // SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
  2. // SPDX-License-Identifier: Apache-2.0
  3. package scraper // import "miniflux.app/v2/internal/reader/scraper"
  4. import (
  5. "fmt"
  6. "io"
  7. "log/slog"
  8. "strings"
  9. "miniflux.app/v2/internal/config"
  10. "miniflux.app/v2/internal/reader/encoding"
  11. "miniflux.app/v2/internal/reader/fetcher"
  12. "miniflux.app/v2/internal/reader/readability"
  13. "miniflux.app/v2/internal/urllib"
  14. "github.com/PuerkitoBio/goquery"
  15. )
  16. func ScrapeWebsite(requestBuilder *fetcher.RequestBuilder, pageURL, rules string) (baseURL string, extractedContent string, err error) {
  17. responseHandler := fetcher.NewResponseHandler(requestBuilder.ExecuteRequest(pageURL))
  18. defer responseHandler.Close()
  19. if localizedError := responseHandler.LocalizedError(); localizedError != nil {
  20. slog.Warn("Unable to scrape website", slog.String("website_url", pageURL), slog.Any("error", localizedError.Error()))
  21. return "", "", localizedError.Error()
  22. }
  23. if !isAllowedContentType(responseHandler.ContentType()) {
  24. return "", "", fmt.Errorf("scraper: this resource is not a HTML document (%s)", responseHandler.ContentType())
  25. }
  26. // The entry URL could redirect somewhere else.
  27. sameSite := urllib.Domain(pageURL) == urllib.Domain(responseHandler.EffectiveURL())
  28. pageURL = responseHandler.EffectiveURL()
  29. if rules == "" {
  30. rules = getPredefinedScraperRules(pageURL)
  31. }
  32. htmlDocumentReader, err := encoding.NewCharsetReader(
  33. responseHandler.Body(config.Opts.HTTPClientMaxBodySize()),
  34. responseHandler.ContentType(),
  35. )
  36. if err != nil {
  37. return "", "", fmt.Errorf("scraper: unable to read HTML document with charset reader: %v", err)
  38. }
  39. if sameSite && rules != "" {
  40. slog.Debug("Extracting content with custom rules",
  41. "url", pageURL,
  42. "rules", rules,
  43. )
  44. baseURL, extractedContent, err = findContentUsingCustomRules(htmlDocumentReader, rules)
  45. } else {
  46. slog.Debug("Extracting content with readability",
  47. "url", pageURL,
  48. )
  49. baseURL, extractedContent, err = readability.ExtractContent(htmlDocumentReader)
  50. }
  51. if baseURL == "" {
  52. baseURL = pageURL
  53. } else {
  54. slog.Debug("Using base URL from HTML document", "base_url", baseURL)
  55. }
  56. return baseURL, extractedContent, nil
  57. }
  58. func findContentUsingCustomRules(page io.Reader, rules string) (baseURL string, extractedContent string, err error) {
  59. document, err := goquery.NewDocumentFromReader(page)
  60. if err != nil {
  61. return "", "", err
  62. }
  63. if hrefValue, exists := document.FindMatcher(goquery.Single("head base")).Attr("href"); exists {
  64. hrefValue = strings.TrimSpace(hrefValue)
  65. if urllib.IsAbsoluteURL(hrefValue) {
  66. baseURL = hrefValue
  67. }
  68. }
  69. document.Find(rules).Each(func(i int, s *goquery.Selection) {
  70. if content, err := goquery.OuterHtml(s); err == nil {
  71. extractedContent += content
  72. }
  73. })
  74. return baseURL, extractedContent, nil
  75. }
  76. func getPredefinedScraperRules(websiteURL string) string {
  77. urlDomain := urllib.DomainWithoutWWW(websiteURL)
  78. if rules, ok := predefinedRules[urlDomain]; ok {
  79. return rules
  80. }
  81. return ""
  82. }
  // isAllowedContentType reports whether contentType, compared
  // case-insensitively, starts with "text/html" or "application/xhtml+xml".
  func isAllowedContentType(contentType string) bool {
  	lowered := strings.ToLower(contentType)
  	for _, prefix := range [...]string{"text/html", "application/xhtml+xml"} {
  		if strings.HasPrefix(lowered, prefix) {
  			return true
  		}
  	}
  	return false
  }