|
|
@@ -18,72 +18,83 @@ import (
 	"golang.org/x/net/html/charset"
 )
 
-func ScrapeWebsite(requestBuilder *fetcher.RequestBuilder, websiteURL, rules string) (string, error) {
-	responseHandler := fetcher.NewResponseHandler(requestBuilder.ExecuteRequest(websiteURL))
+func ScrapeWebsite(requestBuilder *fetcher.RequestBuilder, pageURL, rules string) (baseURL string, extractedContent string, err error) {
+	responseHandler := fetcher.NewResponseHandler(requestBuilder.ExecuteRequest(pageURL))
 	defer responseHandler.Close()
 
 	if localizedError := responseHandler.LocalizedError(); localizedError != nil {
-		slog.Warn("Unable to scrape website", slog.String("website_url", websiteURL), slog.Any("error", localizedError.Error()))
-		return "", localizedError.Error()
+		slog.Warn("Unable to scrape website", slog.String("website_url", pageURL), slog.Any("error", localizedError.Error()))
+		return "", "", localizedError.Error()
 	}
 
 	if !isAllowedContentType(responseHandler.ContentType()) {
-		return "", fmt.Errorf("scraper: this resource is not a HTML document (%s)", responseHandler.ContentType())
+		return "", "", fmt.Errorf("scraper: this resource is not a HTML document (%s)", responseHandler.ContentType())
 	}
 
 	// The entry URL could redirect somewhere else.
-	sameSite := urllib.Domain(websiteURL) == urllib.Domain(responseHandler.EffectiveURL())
-	websiteURL = responseHandler.EffectiveURL()
+	sameSite := urllib.Domain(pageURL) == urllib.Domain(responseHandler.EffectiveURL())
+	pageURL = responseHandler.EffectiveURL()
 
 	if rules == "" {
-		rules = getPredefinedScraperRules(websiteURL)
+		rules = getPredefinedScraperRules(pageURL)
 	}
 
-	var content string
-	var err error
-
 	htmlDocumentReader, err := charset.NewReader(
 		responseHandler.Body(config.Opts.HTTPClientMaxBodySize()),
 		responseHandler.ContentType(),
 	)
 	if err != nil {
-		return "", fmt.Errorf("scraper: unable to read HTML document: %v", err)
+		return "", "", fmt.Errorf("scraper: unable to read HTML document with charset reader: %v", err)
 	}
 
 	if sameSite && rules != "" {
 		slog.Debug("Extracting content with custom rules",
-			"url", websiteURL,
+			"url", pageURL,
 			"rules", rules,
 		)
-		content, err = findContentUsingCustomRules(htmlDocumentReader, rules)
+		baseURL, extractedContent, err = findContentUsingCustomRules(htmlDocumentReader, rules)
 	} else {
 		slog.Debug("Extracting content with readability",
-			"url", websiteURL,
+			"url", pageURL,
 		)
-		content, err = readability.ExtractContent(htmlDocumentReader)
+		baseURL, extractedContent, err = readability.ExtractContent(htmlDocumentReader)
 	}
 
 	if err != nil {
-		return "", err
+		return "", "", err
+	}
+
+	// Fall back to the effective page URL when the document has no absolute <base href>.
+	if baseURL == "" {
+		baseURL = pageURL
+	} else {
+		slog.Debug("Using base URL from HTML document", "base_url", baseURL)
 	}
 
-	return content, nil
+	return baseURL, extractedContent, nil
 }
 
-func findContentUsingCustomRules(page io.Reader, rules string) (string, error) {
+func findContentUsingCustomRules(page io.Reader, rules string) (baseURL string, extractedContent string, err error) {
 	document, err := goquery.NewDocumentFromReader(page)
 	if err != nil {
-		return "", err
+		return "", "", err
+	}
+
+	// Honor an absolute <base href> declared in the document head, if any.
+	if hrefValue, exists := document.Find("head base").First().Attr("href"); exists {
+		hrefValue = strings.TrimSpace(hrefValue)
+		if urllib.IsAbsoluteURL(hrefValue) {
+			baseURL = hrefValue
+		}
 	}
 
-	contents := ""
 	document.Find(rules).Each(func(i int, s *goquery.Selection) {
 		if content, err := goquery.OuterHtml(s); err == nil {
-			contents += content
+			extractedContent += content
 		}
 	})
 
-	return contents, nil
+	return baseURL, extractedContent, nil
 }
 
 func getPredefinedScraperRules(websiteURL string) string {
|