Просмотр исходного кода

perf(finder): don't parse the candidate page twice

Instead of parsing the HTML once in findSubscriptionsFromWebPage and another
time in findCanonicalURL, do it once in a `parseHTMLDocument` function.
This was also the opportunity to restrict the `link[rel='canonical' i`
query to `head link[rel='canonical' i`.
Julien Voisin 4 месяцев назад
Родитель
Сommit
72784c4943
2 измененных файлов с 96 добавлено и 45 удалено
  1. 31 32
      internal/reader/subscription/finder.go
  2. 65 13
      internal/reader/subscription/finder_test.go

+ 31 - 32
internal/reader/subscription/finder.go

@@ -68,9 +68,16 @@ func (f *subscriptionFinder) FindSubscriptions(websiteURL, rssBridgeURL string,
 		return Subscriptions{NewSubscription(responseHandler.EffectiveURL(), responseHandler.EffectiveURL(), feedFormat)}, nil
 	}
 
+	// It's not a feed, so we have to process its HTML.
+	doc, err := parseHTMLDocument(responseHandler.ContentType(), responseBody)
+	if err != nil {
+		return nil, locale.NewLocalizedErrorWrapper(err, "error.unable_to_parse_html_document", err)
+	}
+	baseURL := getBaseURL(websiteURL, doc)
+
 	// Step 2) Find the canonical URL of the website.
 	slog.Debug("Try to find the canonical URL of the website", slog.String("website_url", websiteURL))
-	websiteURL = f.findCanonicalURL(websiteURL, responseHandler.ContentType(), responseBody)
+	websiteURL = f.findCanonicalURL(websiteURL, baseURL, doc)
 
 	// Step 3) Check if the website URL is a YouTube channel.
 	slog.Debug("Try to detect feeds for a YouTube page", slog.String("website_url", websiteURL))
@@ -86,7 +93,8 @@ func (f *subscriptionFinder) FindSubscriptions(websiteURL, rssBridgeURL string,
 		slog.String("website_url", websiteURL),
 		slog.String("content_type", responseHandler.ContentType()),
 	)
-	if subscriptions, localizedError := f.findSubscriptionsFromWebPage(websiteURL, responseHandler.ContentType(), responseBody); localizedError != nil {
+
+	if subscriptions, localizedError := f.findSubscriptionsFromWebPage(baseURL, doc); localizedError != nil {
 		return nil, localizedError
 	} else if len(subscriptions) > 0 {
 		slog.Debug("Subscriptions found from web page", slog.String("website_url", websiteURL), slog.Any("subscriptions", subscriptions))
@@ -116,7 +124,7 @@ func (f *subscriptionFinder) FindSubscriptions(websiteURL, rssBridgeURL string,
 	return nil, nil
 }
 
-func (f *subscriptionFinder) findSubscriptionsFromWebPage(websiteURL, contentType string, body []byte) (Subscriptions, *locale.LocalizedErrorWrapper) {
+func (f *subscriptionFinder) findSubscriptionsFromWebPage(websiteURL string, doc *goquery.Document) (Subscriptions, *locale.LocalizedErrorWrapper) {
 	queries := map[string]string{
 		"link[type='application/rss+xml']":   parser.FormatRSS,
 		"link[type='application/atom+xml']":  parser.FormatAtom,
@@ -127,23 +135,6 @@ func (f *subscriptionFinder) findSubscriptionsFromWebPage(websiteURL, contentTyp
 		"link[type='application/json']:not([href*='/wp-json/'])": parser.FormatJSON,
 	}
 
-	htmlDocumentReader, err := encoding.NewCharsetReaderFromBytes(body, contentType)
-	if err != nil {
-		return nil, locale.NewLocalizedErrorWrapper(err, "error.unable_to_parse_html_document", err)
-	}
-
-	doc, err := goquery.NewDocumentFromReader(htmlDocumentReader)
-	if err != nil {
-		return nil, locale.NewLocalizedErrorWrapper(err, "error.unable_to_parse_html_document", err)
-	}
-
-	if hrefValue, exists := doc.FindMatcher(goquery.Single("head base")).Attr("href"); exists {
-		hrefValue = strings.TrimSpace(hrefValue)
-		if urllib.IsAbsoluteURL(hrefValue) {
-			websiteURL = hrefValue
-		}
-	}
-
 	var subscriptions Subscriptions
 	subscriptionURLs := make(map[string]bool)
 	for feedQuerySelector, feedFormat := range queries {
@@ -152,6 +143,7 @@ func (f *subscriptionFinder) findSubscriptionsFromWebPage(websiteURL, contentTyp
 			subscription.Type = feedFormat
 
 			if feedURL, exists := s.Attr("href"); exists && feedURL != "" {
+				var err error
 				subscription.URL, err = urllib.ResolveToAbsoluteURL(websiteURL, feedURL)
 				if err != nil {
 					return
@@ -326,34 +318,41 @@ func (f *subscriptionFinder) findSubscriptionsFromYouTube(websiteURL string) (Su
 
 // findCanonicalURL extracts the canonical URL from the HTML <link rel="canonical"> tag.
 // Returns the canonical URL if found, otherwise returns the effective URL.
-func (f *subscriptionFinder) findCanonicalURL(effectiveURL, contentType string, body []byte) string {
-	htmlDocumentReader, err := encoding.NewCharsetReaderFromBytes(body, contentType)
-	if err != nil {
+func (f *subscriptionFinder) findCanonicalURL(effectiveURL, baseURL string, doc *goquery.Document) string {
+	canonicalHref, exists := doc.Find("head link[rel='canonical' i]").First().Attr("href")
+	if !exists || strings.TrimSpace(canonicalHref) == "" {
 		return effectiveURL
 	}
 
-	doc, err := goquery.NewDocumentFromReader(htmlDocumentReader)
+	canonicalURL, err := urllib.ResolveToAbsoluteURL(baseURL, strings.TrimSpace(canonicalHref))
 	if err != nil {
 		return effectiveURL
 	}
 
-	baseURL := effectiveURL
+	return canonicalURL
+}
+
+// getBaseURL returns the url specified in the <base> tag, and `websiteURL` otherwise.
+func getBaseURL(websiteURL string, doc *goquery.Document) string {
+	baseURL := websiteURL
 	if hrefValue, exists := doc.FindMatcher(goquery.Single("head base")).Attr("href"); exists {
 		hrefValue = strings.TrimSpace(hrefValue)
 		if urllib.IsAbsoluteURL(hrefValue) {
 			baseURL = hrefValue
 		}
 	}
+	return baseURL
+}
 
-	canonicalHref, exists := doc.Find("link[rel='canonical' i]").First().Attr("href")
-	if !exists || strings.TrimSpace(canonicalHref) == "" {
-		return effectiveURL
+func parseHTMLDocument(contentType string, body []byte) (*goquery.Document, error) {
+	htmlDocumentReader, err := encoding.NewCharsetReaderFromBytes(body, contentType)
+	if err != nil {
+		return nil, err
 	}
 
-	canonicalURL, err := urllib.ResolveToAbsoluteURL(baseURL, strings.TrimSpace(canonicalHref))
+	doc, err := goquery.NewDocumentFromReader(htmlDocumentReader)
 	if err != nil {
-		return effectiveURL
+		return nil, err
 	}
-
-	return canonicalURL
+	return doc, nil
 }

+ 65 - 13
internal/reader/subscription/finder_test.go

@@ -118,8 +118,12 @@ func TestParseWebPageWithRssFeed(t *testing.T) {
 		<body>
 		</body>
 	</html>`
+	doc, shouldNeverHappenErr := parseHTMLDocument("text/html", []byte(htmlPage))
+	if shouldNeverHappenErr != nil {
+		t.Fatalf(`Unable to parse the HTML: %v`, shouldNeverHappenErr)
+	}
 
-	subscriptions, err := NewSubscriptionFinder(nil).findSubscriptionsFromWebPage("http://example.org/", "text/html", []byte(htmlPage))
+	subscriptions, err := NewSubscriptionFinder(nil).findSubscriptionsFromWebPage("http://example.org/", doc)
 	if err != nil {
 		t.Fatalf(`Parsing a correctly formatted HTML page should not return any error: %v`, err)
 	}
@@ -151,8 +155,12 @@ func TestParseWebPageWithAtomFeed(t *testing.T) {
 		<body>
 		</body>
 	</html>`
+	doc, shouldNeverHappenErr := parseHTMLDocument("text/html", []byte(htmlPage))
+	if shouldNeverHappenErr != nil {
+		t.Fatalf(`Unable to parse the HTML: %v`, shouldNeverHappenErr)
+	}
 
-	subscriptions, err := NewSubscriptionFinder(nil).findSubscriptionsFromWebPage("http://example.org/", "text/html", []byte(htmlPage))
+	subscriptions, err := NewSubscriptionFinder(nil).findSubscriptionsFromWebPage("http://example.org/", doc)
 	if err != nil {
 		t.Fatalf(`Parsing a correctly formatted HTML page should not return any error: %v`, err)
 	}
@@ -184,8 +192,12 @@ func TestParseWebPageWithJSONFeed(t *testing.T) {
 		<body>
 		</body>
 	</html>`
+	doc, shouldNeverHappenErr := parseHTMLDocument("text/html", []byte(htmlPage))
+	if shouldNeverHappenErr != nil {
+		t.Fatalf(`Unable to parse the HTML: %v`, shouldNeverHappenErr)
+	}
 
-	subscriptions, err := NewSubscriptionFinder(nil).findSubscriptionsFromWebPage("http://example.org/", "text/html", []byte(htmlPage))
+	subscriptions, err := NewSubscriptionFinder(nil).findSubscriptionsFromWebPage("http://example.org/", doc)
 	if err != nil {
 		t.Fatalf(`Parsing a correctly formatted HTML page should not return any error: %v`, err)
 	}
@@ -217,8 +229,12 @@ func TestParseWebPageWithOldJSONFeedMimeType(t *testing.T) {
 		<body>
 		</body>
 	</html>`
+	doc, shouldNeverHappenErr := parseHTMLDocument("text/html", []byte(htmlPage))
+	if shouldNeverHappenErr != nil {
+		t.Fatalf(`Unable to parse the HTML: %v`, shouldNeverHappenErr)
+	}
 
-	subscriptions, err := NewSubscriptionFinder(nil).findSubscriptionsFromWebPage("http://example.org/", "text/html", []byte(htmlPage))
+	subscriptions, err := NewSubscriptionFinder(nil).findSubscriptionsFromWebPage("http://example.org/", doc)
 	if err != nil {
 		t.Fatalf(`Parsing a correctly formatted HTML page should not return any error: %v`, err)
 	}
@@ -251,8 +267,12 @@ func TestParseWebPageWithJSONFeedWpJsonIgnored(t *testing.T) {
 		<body>
 		</body>
 	</html>`
+	doc, shouldNeverHappenErr := parseHTMLDocument("text/html", []byte(htmlPage))
+	if shouldNeverHappenErr != nil {
+		t.Fatalf(`Unable to parse the HTML: %v`, shouldNeverHappenErr)
+	}
 
-	subscriptions, err := NewSubscriptionFinder(nil).findSubscriptionsFromWebPage("http://example.org/", "text/html", []byte(htmlPage))
+	subscriptions, err := NewSubscriptionFinder(nil).findSubscriptionsFromWebPage("http://example.org/", doc)
 	if err != nil {
 		t.Fatalf(`Parsing a correctly formatted HTML page should not return any error: %v`, err)
 	}
@@ -272,8 +292,12 @@ func TestParseWebPageWithRelativeFeedURL(t *testing.T) {
 		<body>
 		</body>
 	</html>`
+	doc, shouldNeverHappenErr := parseHTMLDocument("text/html", []byte(htmlPage))
+	if shouldNeverHappenErr != nil {
+		t.Fatalf(`Unable to parse the HTML: %v`, shouldNeverHappenErr)
+	}
 
-	subscriptions, err := NewSubscriptionFinder(nil).findSubscriptionsFromWebPage("http://example.org/", "text/html", []byte(htmlPage))
+	subscriptions, err := NewSubscriptionFinder(nil).findSubscriptionsFromWebPage("http://example.org/", doc)
 	if err != nil {
 		t.Fatalf(`Parsing a correctly formatted HTML page should not return any error: %v`, err)
 	}
@@ -305,8 +329,12 @@ func TestParseWebPageWithEmptyTitle(t *testing.T) {
 		<body>
 		</body>
 	</html>`
+	doc, shouldNeverHappenErr := parseHTMLDocument("text/html", []byte(htmlPage))
+	if shouldNeverHappenErr != nil {
+		t.Fatalf(`Unable to parse the HTML: %v`, shouldNeverHappenErr)
+	}
 
-	subscriptions, err := NewSubscriptionFinder(nil).findSubscriptionsFromWebPage("http://example.org/", "text/html", []byte(htmlPage))
+	subscriptions, err := NewSubscriptionFinder(nil).findSubscriptionsFromWebPage("http://example.org/", doc)
 	if err != nil {
 		t.Fatalf(`Parsing a correctly formatted HTML page should not return any error: %v`, err)
 	}
@@ -339,8 +367,12 @@ func TestParseWebPageWithMultipleFeeds(t *testing.T) {
 		<body>
 		</body>
 	</html>`
+	doc, shouldNeverHappenErr := parseHTMLDocument("text/html", []byte(htmlPage))
+	if shouldNeverHappenErr != nil {
+		t.Fatalf(`Unable to parse the HTML: %v`, shouldNeverHappenErr)
+	}
 
-	subscriptions, err := NewSubscriptionFinder(nil).findSubscriptionsFromWebPage("http://example.org/", "text/html", []byte(htmlPage))
+	subscriptions, err := NewSubscriptionFinder(nil).findSubscriptionsFromWebPage("http://example.org/", doc)
 	if err != nil {
 		t.Fatalf(`Parsing a correctly formatted HTML page should not return any error: %v`, err)
 	}
@@ -361,8 +393,12 @@ func TestParseWebPageWithDuplicatedFeeds(t *testing.T) {
 		<body>
 		</body>
 	</html>`
+	doc, shouldNeverHappenErr := parseHTMLDocument("text/html", []byte(htmlPage))
+	if shouldNeverHappenErr != nil {
+		t.Fatalf(`Unable to parse the HTML: %v`, shouldNeverHappenErr)
+	}
 
-	subscriptions, err := NewSubscriptionFinder(nil).findSubscriptionsFromWebPage("http://example.org/", "text/html", []byte(htmlPage))
+	subscriptions, err := NewSubscriptionFinder(nil).findSubscriptionsFromWebPage("http://example.org/", doc)
 	if err != nil {
 		t.Fatalf(`Parsing a correctly formatted HTML page should not return any error: %v`, err)
 	}
@@ -394,8 +430,12 @@ func TestParseWebPageWithEmptyFeedURL(t *testing.T) {
 		<body>
 		</body>
 	</html>`
+	doc, shouldNeverHappenErr := parseHTMLDocument("text/html", []byte(htmlPage))
+	if shouldNeverHappenErr != nil {
+		t.Fatalf(`Unable to parse the HTML: %v`, shouldNeverHappenErr)
+	}
 
-	subscriptions, err := NewSubscriptionFinder(nil).findSubscriptionsFromWebPage("http://example.org/", "text/html", []byte(htmlPage))
+	subscriptions, err := NewSubscriptionFinder(nil).findSubscriptionsFromWebPage("http://example.org/", doc)
 	if err != nil {
 		t.Fatalf(`Parsing a correctly formatted HTML page should not return any error: %v`, err)
 	}
@@ -415,8 +455,12 @@ func TestParseWebPageWithNoHref(t *testing.T) {
 		<body>
 		</body>
 	</html>`
+	doc, shouldNeverHappenErr := parseHTMLDocument("text/html", []byte(htmlPage))
+	if shouldNeverHappenErr != nil {
+		t.Fatalf(`Unable to parse the HTML: %v`, shouldNeverHappenErr)
+	}
 
-	subscriptions, err := NewSubscriptionFinder(nil).findSubscriptionsFromWebPage("http://example.org/", "text/html", []byte(htmlPage))
+	subscriptions, err := NewSubscriptionFinder(nil).findSubscriptionsFromWebPage("http://example.org/", doc)
 	if err != nil {
 		t.Fatalf(`Parsing a correctly formatted HTML page should not return any error: %v`, err)
 	}
@@ -436,8 +480,12 @@ func TestFindCanonicalURL(t *testing.T) {
 		<body>
 		</body>
 	</html>`
+	doc, shouldNeverHappenErr := parseHTMLDocument("text/html", []byte(htmlPage))
+	if shouldNeverHappenErr != nil {
+		t.Fatalf(`Unable to parse the HTML: %v`, shouldNeverHappenErr)
+	}
 
-	canonicalURL := NewSubscriptionFinder(nil).findCanonicalURL("https://example.org/page", "text/html", []byte(htmlPage))
+	canonicalURL := NewSubscriptionFinder(nil).findCanonicalURL("https://example.org/page", "http://example.org", doc)
 	if canonicalURL != "https://example.org/canonical-page" {
 		t.Errorf(`Unexpected canonical URL, got %q, expected %q`, canonicalURL, "https://example.org/canonical-page")
 	}
@@ -452,8 +500,12 @@ func TestFindCanonicalURLNotFound(t *testing.T) {
 		<body>
 		</body>
 	</html>`
+	doc, shouldNeverHappenErr := parseHTMLDocument("text/html", []byte(htmlPage))
+	if shouldNeverHappenErr != nil {
+		t.Fatalf(`Unable to parse the HTML: %v`, shouldNeverHappenErr)
+	}
 
-	canonicalURL := NewSubscriptionFinder(nil).findCanonicalURL("https://example.org/page", "text/html", []byte(htmlPage))
+	canonicalURL := NewSubscriptionFinder(nil).findCanonicalURL("https://example.org/page", "https://example.org", doc)
 	if canonicalURL != "https://example.org/page" {
 		t.Errorf(`Expected effective URL when canonical not found, got %q`, canonicalURL)
 	}