Przeglądaj źródła

perf(sanitizer): improve the performance of the sanitizer (#3497)

- Grow the underlying buffer of SanitizeHTML's strings.Builder to 3/4 of the
  size of the raw HTML up front, to reduce the number of incremental
  reallocations. This ratio is a rough guesstimate, but it sounds reasonable to me.
- Add an `absoluteURLParsedBase` function to avoid parsing baseURL over and over.
Julien Voisin 9 miesięcy temu
rodzic
commit
a8b4e88742
2 zmienionych plików z 48 dodań i 15 usunięć
  1. 30 9
      internal/reader/sanitizer/sanitizer.go
  2. 18 6
      internal/urllib/url.go

+ 30 - 9
internal/reader/sanitizer/sanitizer.go

@@ -204,10 +204,15 @@ func SanitizeHTMLWithDefaultOptions(baseURL, rawHTML string) string {
 }
 
 func SanitizeHTML(baseURL, rawHTML string, sanitizerOptions *SanitizerOptions) string {
-	var buffer strings.Builder
 	var tagStack []string
 	var parentTag string
 	var blockedStack []string
+	var buffer strings.Builder
+
+	// Educated guess about how big the sanitized HTML will be,
+	// to reduce the amount of buffer re-allocations in this function.
+	estimatedRatio := len(rawHTML) * 3 / 4
+	buffer.Grow(estimatedRatio)
 
 	// Errors are a non-issue, so they're handled later in the function.
 	parsedBaseUrl, _ := url.Parse(baseURL)
@@ -259,7 +264,7 @@ func SanitizeHTML(baseURL, rawHTML string, sanitizerOptions *SanitizerOptions) s
 			}
 
 			if len(blockedStack) == 0 && isValidTag(tagName) {
-				attrNames, htmlAttributes := sanitizeAttributes(parsedBaseUrl, baseURL, tagName, token.Attr, sanitizerOptions)
+				attrNames, htmlAttributes := sanitizeAttributes(parsedBaseUrl, tagName, token.Attr, sanitizerOptions)
 				if hasRequiredAttributes(tagName, attrNames) {
 					if len(attrNames) > 0 {
 						// Rewrite the start tag with allowed attributes.
@@ -287,7 +292,7 @@ func SanitizeHTML(baseURL, rawHTML string, sanitizerOptions *SanitizerOptions) s
 				continue
 			}
 			if len(blockedStack) == 0 && isValidTag(tagName) {
-				attrNames, htmlAttributes := sanitizeAttributes(parsedBaseUrl, baseURL, tagName, token.Attr, sanitizerOptions)
+				attrNames, htmlAttributes := sanitizeAttributes(parsedBaseUrl, tagName, token.Attr, sanitizerOptions)
 				if hasRequiredAttributes(tagName, attrNames) {
 					if len(attrNames) > 0 {
 						buffer.WriteString("<" + tagName + " " + htmlAttributes + "/>")
@@ -300,7 +305,7 @@ func SanitizeHTML(baseURL, rawHTML string, sanitizerOptions *SanitizerOptions) s
 	}
 }
 
-func sanitizeAttributes(parsedBaseUrl *url.URL, baseURL, tagName string, attributes []html.Attribute, sanitizerOptions *SanitizerOptions) ([]string, string) {
+func sanitizeAttributes(parsedBaseUrl *url.URL, tagName string, attributes []html.Attribute, sanitizerOptions *SanitizerOptions) ([]string, string) {
 	var htmlAttrs, attrNames []string
 	var err error
 	var isAnchorLink bool
@@ -339,11 +344,11 @@ func sanitizeAttributes(parsedBaseUrl *url.URL, baseURL, tagName string, attribu
 					continue
 				}
 			case "srcset":
-				value = sanitizeSrcsetAttr(baseURL, value)
+				value = sanitizeSrcsetAttr(parsedBaseUrl, value)
 			}
 		case "source":
 			if attribute.Key == "srcset" {
-				value = sanitizeSrcsetAttr(baseURL, value)
+				value = sanitizeSrcsetAttr(parsedBaseUrl, value)
 			}
 		}
 
@@ -360,7 +365,7 @@ func sanitizeAttributes(parsedBaseUrl *url.URL, baseURL, tagName string, attribu
 				value = attribute.Val
 				isAnchorLink = true
 			default:
-				value, err = urllib.AbsoluteURL(baseURL, value)
+				value, err = absoluteURLParsedBase(parsedBaseUrl, value)
 				if err != nil {
 					continue
 				}
@@ -541,11 +546,11 @@ func isBlockedTag(tagName string) bool {
 	return false
 }
 
-func sanitizeSrcsetAttr(baseURL, value string) string {
+func sanitizeSrcsetAttr(parsedBaseURL *url.URL, value string) string {
 	imageCandidates := ParseSrcSetAttribute(value)
 
 	for _, imageCandidate := range imageCandidates {
-		if absoluteURL, err := urllib.AbsoluteURL(baseURL, imageCandidate.ImageURL); err == nil {
+		if absoluteURL, err := absoluteURLParsedBase(parsedBaseURL, imageCandidate.ImageURL); err == nil {
 			imageCandidate.ImageURL = absoluteURL
 		}
 	}
@@ -597,3 +602,19 @@ func isValidDecodingValue(value string) bool {
 	}
 	return false
 }
+
+// absoluteURLParsedBase is used instead of urllib.AbsoluteURL to avoid parsing baseURL over and over.
+func absoluteURLParsedBase(parsedBaseURL *url.URL, input string) (string, error) {
+	absURL, u, err := urllib.GetAbsoluteURL(input)
+	if err != nil {
+		return "", err
+	}
+	if absURL != "" {
+		return absURL, nil
+	}
+	if parsedBaseURL == nil {
+		return "", nil
+	}
+
+	return parsedBaseURL.ResolveReference(u).String(), nil
+}

+ 18 - 6
internal/urllib/url.go

@@ -18,22 +18,34 @@ func IsAbsoluteURL(link string) bool {
 	return u.IsAbs()
 }
 
-// AbsoluteURL converts the input URL as absolute URL if necessary.
-func AbsoluteURL(baseURL, input string) (string, error) {
+// GetAbsoluteURL returns the absolute form of `input` if possible, as well as its parsed form.
+func GetAbsoluteURL(input string) (string, *url.URL, error) {
 	if strings.HasPrefix(input, "//") {
-		return "https:" + input, nil
+		return "https:" + input, nil, nil
 	}
 	if strings.HasPrefix(input, "https://") || strings.HasPrefix(input, "http://") {
-		return input, nil
+		return input, nil, nil
 	}
 
 	u, err := url.Parse(input)
 	if err != nil {
-		return "", fmt.Errorf("unable to parse input URL: %v", err)
+		return "", nil, fmt.Errorf("unable to parse input URL: %v", err)
 	}
 
 	if u.IsAbs() {
-		return u.String(), nil
+		return u.String(), u, nil
+	}
+	return "", u, nil
+}
+
+// AbsoluteURL converts the input URL to an absolute URL if necessary.
+func AbsoluteURL(baseURL, input string) (string, error) {
+	absURL, u, err := GetAbsoluteURL(input)
+	if err != nil {
+		return "", err
+	}
+	if absURL != "" {
+		return absURL, nil
 	}
 
 	base, err := url.Parse(baseURL)