Procházet zdrojové kódy

perf(reader): optimize RemoveTrackingParameters

A bit more than 10% of processor.ProcessFeedEntries' CPU time is spent in
urlcleaner.RemoveTrackingParameters, specifically in its calls to url.Parse.
Hoist that parsing out of the function so the feed and site URLs are parsed
once, before urlcleaner.RemoveTrackingParameters is called multiple times.

Co-authored-by: Frédéric Guillot <f@miniflux.net>
jvoisin před 10 měsíci
rodič
revize
7c857bdc72

+ 7 - 1
internal/reader/processor/processor.go

@@ -5,6 +5,7 @@ package processor // import "miniflux.app/v2/internal/reader/processor"
 
 import (
 	"log/slog"
+	"net/url"
 	"regexp"
 	"time"
 
@@ -36,6 +37,10 @@ func ProcessFeedEntries(store *storage.Storage, feed *model.Feed, userID int64,
 		return
 	}
 
+	// The errors are handled in RemoveTrackingParameters.
+	parsedFeedURL, _ := url.Parse(feed.FeedURL)
+	parsedSiteURL, _ := url.Parse(feed.SiteURL)
+
 	// Process older entries first
 	for i := len(feed.Entries) - 1; i >= 0; i-- {
 		entry := feed.Entries[i]
@@ -52,7 +57,8 @@ func ProcessFeedEntries(store *storage.Storage, feed *model.Feed, userID int64,
 			continue
 		}
 
-		if cleanedURL, err := urlcleaner.RemoveTrackingParameters(feed.FeedURL, feed.SiteURL, entry.URL); err == nil {
+		parsedInputUrl, _ := url.Parse(entry.URL)
+		if cleanedURL, err := urlcleaner.RemoveTrackingParameters(parsedFeedURL, parsedSiteURL, parsedInputUrl); err == nil {
 			entry.URL = cleanedURL
 		}
 

+ 4 - 1
internal/reader/sanitizer/sanitizer.go

@@ -227,6 +227,8 @@ func sanitizeAttributes(baseURL, tagName string, attributes []html.Attribute, sa
 		isImageLargerThanLayout = imgWidth > 750
 	}
 
+	parsedBaseUrl, _ := url.Parse(baseURL)
+
 	for _, attribute := range attributes {
 		value := attribute.Val
 
@@ -283,7 +285,8 @@ func sanitizeAttributes(baseURL, tagName string, attributes []html.Attribute, sa
 				}
 
 				// TODO use feedURL instead of baseURL twice.
-				if cleanedURL, err := urlcleaner.RemoveTrackingParameters(baseURL, baseURL, value); err == nil {
+				parsedValueUrl, _ := url.Parse(value)
+				if cleanedURL, err := urlcleaner.RemoveTrackingParameters(parsedBaseUrl, parsedBaseUrl, parsedValueUrl); err == nil {
 					value = cleanedURL
 				}
 			}

+ 8 - 25
internal/reader/urlcleaner/urlcleaner.go

@@ -95,26 +95,12 @@ var trackingParamsOutbound = map[string]bool{
 	"ref": true,
 }
 
-func RemoveTrackingParameters(baseUrl, feedUrl, inputURL string) (string, error) {
-	parsedURL, err := url.Parse(inputURL)
-	if err != nil {
-		return "", fmt.Errorf("urlcleaner: error parsing URL: %v", err)
+func RemoveTrackingParameters(parsedFeedURL, parsedSiteURL, parsedInputUrl *url.URL) (string, error) {
+	if parsedFeedURL == nil || parsedSiteURL == nil || parsedInputUrl == nil {
+		return "", fmt.Errorf("urlcleaner: one of the URLs is nil")
 	}
 
-	if !strings.HasPrefix(parsedURL.Scheme, "http") {
-		return inputURL, nil
-	}
-
-	parsedBaseUrl, err := url.Parse(baseUrl)
-	if err != nil {
-		return "", fmt.Errorf("urlcleaner: error parsing base URL: %v", err)
-	}
-	parsedFeedUrl, err := url.Parse(feedUrl)
-	if err != nil {
-		return "", fmt.Errorf("urlcleaner: error parsing feed URL: %v", err)
-	}
-
-	queryParams := parsedURL.Query()
+	queryParams := parsedInputUrl.Query()
 	hasTrackers := false
 
 	// Remove tracking parameters
@@ -127,7 +113,7 @@ func RemoveTrackingParameters(baseUrl, feedUrl, inputURL string) (string, error)
 		if trackingParamsOutbound[lowerParam] {
 			// handle duplicate parameters like ?a=b&a=c&a=d…
 			for _, value := range queryParams[param] {
-				if value == parsedBaseUrl.Hostname() || value == parsedFeedUrl.Hostname() {
+				if value == parsedFeedURL.Hostname() || value == parsedSiteURL.Hostname() {
 					queryParams.Del(param)
 					hasTrackers = true
 					break
@@ -138,14 +124,11 @@ func RemoveTrackingParameters(baseUrl, feedUrl, inputURL string) (string, error)
 
 	// Do not modify the URL if there are no tracking parameters
 	if !hasTrackers {
-		return inputURL, nil
+		return parsedInputUrl.String(), nil
 	}
 
-	parsedURL.RawQuery = queryParams.Encode()
-
-	// Remove trailing "?" if query string is empty
-	cleanedURL := parsedURL.String()
-	cleanedURL = strings.TrimSuffix(cleanedURL, "?")
+	parsedInputUrl.RawQuery = queryParams.Encode()
+	cleanedURL := strings.TrimSuffix(parsedInputUrl.String(), "?")
 
 	return cleanedURL, nil
 }

+ 4 - 1
internal/reader/urlcleaner/urlcleaner_test.go

@@ -121,7 +121,10 @@ func TestRemoveTrackingParams(t *testing.T) {
 
 	for _, tt := range tests {
 		t.Run(tt.name, func(t *testing.T) {
-			result, err := RemoveTrackingParameters(tt.baseUrl, tt.feedUrl, tt.input)
+			parsedBaseUrl, _ := url.Parse(tt.baseUrl)
+			parsedFeedUrl, _ := url.Parse(tt.feedUrl)
+			parsedInputUrl, _ := url.Parse(tt.input)
+			result, err := RemoveTrackingParameters(parsedBaseUrl, parsedFeedUrl, parsedInputUrl)
 			if tt.expected == "" {
 				if err == nil {
 					t.Errorf("Expected an error for invalid URL, but got none")