Procházet zdrojové kódy

refactor(sanitizer): improve how attributes are sanitized

Move hasRequiredAttributes above getExtraAttributes, as extra attributes,
as their name implies, aren't required. This allows to return a []string
instead of a ([]string, []string), as well as simplifying getExtraAttributes
of course. Note that this resulted in an ordering change of <iframe> attributes
so the testsuite had to be updated.

This improves the performances of SanitizeHTML by a bit less than 10% on local
benchmarks.
jvoisin před 5 měsíci
rodič
revize
6a9d7894d1

+ 19 - 29
internal/reader/sanitizer/sanitizer.go

@@ -276,15 +276,15 @@ func filterAndRenderHTML(buf *strings.Builder, n *html.Node, parsedBaseUrl *url.
 			return
 		}
 
-		attrNames, htmlAttributes := sanitizeAttributes(parsedBaseUrl, tag, n.Attr, sanitizerOptions)
-		if !hasRequiredAttributes(tag, attrNames) {
+		htmlAttributes, hasAllRequiredAttributes := sanitizeAttributes(parsedBaseUrl, tag, n.Attr, sanitizerOptions)
+		if !hasAllRequiredAttributes {
 			// The tag doesn't have every required attributes but we're still interested in its content
 			filterAndRenderHTMLChildren(buf, n, parsedBaseUrl, sanitizerOptions)
 			return
 		}
 		buf.WriteString("<")
 		buf.WriteString(n.Data)
-		if len(attrNames) > 0 {
+		if len(htmlAttributes) > 0 {
 			buf.WriteString(" " + htmlAttributes)
 		}
 		buf.WriteString(">")
@@ -311,37 +311,30 @@ func filterAndRenderHTMLChildren(buf *strings.Builder, n *html.Node, parsedBaseU
 	}
 }
 
-func getExtraAttributes(tagName string, isYouTubeEmbed bool, sanitizerOptions *SanitizerOptions) ([]string, []string) {
+func getExtraAttributes(tagName string, isYouTubeEmbed bool, sanitizerOptions *SanitizerOptions) []string {
 	switch tagName {
 	case "a":
-		attributeNames := []string{"rel", "referrerpolicy"}
 		htmlAttributes := []string{`rel="noopener noreferrer"`, `referrerpolicy="no-referrer"`}
 		if sanitizerOptions.OpenLinksInNewTab {
-			attributeNames = append(attributeNames, "target")
 			htmlAttributes = append(htmlAttributes, `target="_blank"`)
 		}
-		return attributeNames, htmlAttributes
+		return htmlAttributes
 	case "video", "audio":
-		return []string{"controls"}, []string{"controls"}
+		return []string{"controls"}
 	case "iframe":
-		extraAttrNames := []string{}
-		extraHTMLAttributes := []string{}
+		extraHTMLAttributes := []string{`sandbox="allow-scripts allow-same-origin allow-popups allow-popups-to-escape-sandbox"`, `loading="lazy"`}
 
 		// Note: the referrerpolicy seems to be required to avoid YouTube error 153 video player configuration error
 		// See https://developers.google.com/youtube/terms/required-minimum-functionality#embedded-player-api-client-identity
 		if isYouTubeEmbed {
-			extraAttrNames = append(extraAttrNames, "referrerpolicy")
 			extraHTMLAttributes = append(extraHTMLAttributes, `referrerpolicy="strict-origin-when-cross-origin"`)
 		}
 
-		extraAttrNames = append(extraAttrNames, "sandbox", "loading")
-		extraHTMLAttributes = append(extraHTMLAttributes, `sandbox="allow-scripts allow-same-origin allow-popups allow-popups-to-escape-sandbox"`, `loading="lazy"`)
-
-		return extraAttrNames, extraHTMLAttributes
+		return extraHTMLAttributes
 	case "img":
-		return []string{"loading"}, []string{`loading="lazy"`}
+		return []string{`loading="lazy"`}
 	default:
-		return nil, nil
+		return nil
 	}
 }
 
@@ -499,18 +492,15 @@ func rewriteIframeURL(link string) string {
 	return link
 }
 
-func sanitizeAttributes(parsedBaseUrl *url.URL, tagName string, attributes []html.Attribute, sanitizerOptions *SanitizerOptions) ([]string, string) {
+func sanitizeAttributes(parsedBaseUrl *url.URL, tagName string, attributes []html.Attribute, sanitizerOptions *SanitizerOptions) (string, bool) {
 	htmlAttrs := make([]string, 0, len(attributes))
 	attrNames := make([]string, 0, len(attributes))
 
 	var isAnchorLink bool
 	var isYouTubeEmbed bool
 
-	allowedAttributes, ok := allowedHTMLTagsAndAttributes[tagName]
-	if !ok {
-		// This should never happen, as the tag was validated in the caller of `sanitizeAttributes`
-		return []string{}, ""
-	}
+	// We know the element is present, as the tag was validated in the caller of `sanitizeAttributes`
+	allowedAttributes := allowedHTMLTagsAndAttributes[tagName]
 
 	for _, attribute := range attributes {
 		if !slices.Contains(allowedAttributes, attribute.Key) {
@@ -596,15 +586,15 @@ func sanitizeAttributes(parsedBaseUrl *url.URL, tagName string, attributes []htm
 		htmlAttrs = append(htmlAttrs, attribute.Key+`="`+html.EscapeString(value)+`"`)
 	}
 
+	if !hasRequiredAttributes(tagName, attrNames) {
+		return "", false
+	}
+
 	if !isAnchorLink {
-		extraAttrNames, extraHTMLAttributes := getExtraAttributes(tagName, isYouTubeEmbed, sanitizerOptions)
-		if len(extraAttrNames) > 0 {
-			attrNames = append(attrNames, extraAttrNames...)
-			htmlAttrs = append(htmlAttrs, extraHTMLAttributes...)
-		}
+		htmlAttrs = append(htmlAttrs, getExtraAttributes(tagName, isYouTubeEmbed, sanitizerOptions)...)
 	}
 
-	return attrNames, strings.Join(htmlAttrs, " ")
+	return strings.Join(htmlAttrs, " "), true
 }
 
 func sanitizeSrcsetAttr(parsedBaseURL *url.URL, value string) string {

+ 8 - 8
internal/reader/sanitizer/sanitizer_test.go

@@ -443,7 +443,7 @@ func TestIFrameWithChildElements(t *testing.T) {
 	config.Opts = config.NewConfigOptions()
 
 	input := `<iframe src="https://www.youtube.com/"><p>test</p></iframe>`
-	expected := `<iframe src="https://www.youtube.com/" referrerpolicy="strict-origin-when-cross-origin" sandbox="allow-scripts allow-same-origin allow-popups allow-popups-to-escape-sandbox" loading="lazy"></iframe>`
+	expected := `<iframe src="https://www.youtube.com/" sandbox="allow-scripts allow-same-origin allow-popups allow-popups-to-escape-sandbox" loading="lazy" referrerpolicy="strict-origin-when-cross-origin"></iframe>`
 	output := sanitizeHTMLWithDefaultOptions("http://example.com/", input)
 
 	if expected != output {
@@ -455,7 +455,7 @@ func TestIFrameWithReferrerPolicy(t *testing.T) {
 	config.Opts = config.NewConfigOptions()
 
 	input := `<iframe src="https://www.youtube.com/embed/test123" referrerpolicy="strict-origin-when-cross-origin"></iframe>`
-	expected := `<iframe src="https://www.youtube-nocookie.com/embed/test123" referrerpolicy="strict-origin-when-cross-origin" sandbox="allow-scripts allow-same-origin allow-popups allow-popups-to-escape-sandbox" loading="lazy"></iframe>`
+	expected := `<iframe src="https://www.youtube-nocookie.com/embed/test123" sandbox="allow-scripts allow-same-origin allow-popups allow-popups-to-escape-sandbox" loading="lazy" referrerpolicy="strict-origin-when-cross-origin"></iframe>`
 	output := sanitizeHTMLWithDefaultOptions("http://example.com/", input)
 
 	if expected != output {
@@ -722,7 +722,7 @@ func TestReplaceYoutubeURL(t *testing.T) {
 	}
 
 	input := `<iframe src="http://www.youtube.com/embed/test123?version=3&#038;rel=1&#038;fs=1&#038;autohide=2&#038;showsearch=0&#038;showinfo=1&#038;iv_load_policy=1&#038;wmode=transparent"></iframe>`
-	expected := `<iframe src="https://www.youtube-nocookie.com/embed/test123?version=3&amp;rel=1&amp;fs=1&amp;autohide=2&amp;showsearch=0&amp;showinfo=1&amp;iv_load_policy=1&amp;wmode=transparent" referrerpolicy="strict-origin-when-cross-origin" sandbox="allow-scripts allow-same-origin allow-popups allow-popups-to-escape-sandbox" loading="lazy"></iframe>`
+	expected := `<iframe src="https://www.youtube-nocookie.com/embed/test123?version=3&amp;rel=1&amp;fs=1&amp;autohide=2&amp;showsearch=0&amp;showinfo=1&amp;iv_load_policy=1&amp;wmode=transparent" sandbox="allow-scripts allow-same-origin allow-popups allow-popups-to-escape-sandbox" loading="lazy" referrerpolicy="strict-origin-when-cross-origin"></iframe>`
 	output := sanitizeHTMLWithDefaultOptions("http://example.org/", input)
 
 	if expected != output {
@@ -740,7 +740,7 @@ func TestReplaceSecureYoutubeURL(t *testing.T) {
 	}
 
 	input := `<iframe src="https://www.youtube.com/embed/test123"></iframe>`
-	expected := `<iframe src="https://www.youtube-nocookie.com/embed/test123" referrerpolicy="strict-origin-when-cross-origin" sandbox="allow-scripts allow-same-origin allow-popups allow-popups-to-escape-sandbox" loading="lazy"></iframe>`
+	expected := `<iframe src="https://www.youtube-nocookie.com/embed/test123" sandbox="allow-scripts allow-same-origin allow-popups allow-popups-to-escape-sandbox" loading="lazy" referrerpolicy="strict-origin-when-cross-origin"></iframe>`
 	output := sanitizeHTMLWithDefaultOptions("http://example.org/", input)
 
 	if expected != output {
@@ -758,7 +758,7 @@ func TestReplaceSecureYoutubeURLWithParameters(t *testing.T) {
 	}
 
 	input := `<iframe src="https://www.youtube.com/embed/test123?rel=0&amp;controls=0"></iframe>`
-	expected := `<iframe src="https://www.youtube-nocookie.com/embed/test123?rel=0&amp;controls=0" referrerpolicy="strict-origin-when-cross-origin" sandbox="allow-scripts allow-same-origin allow-popups allow-popups-to-escape-sandbox" loading="lazy"></iframe>`
+	expected := `<iframe src="https://www.youtube-nocookie.com/embed/test123?rel=0&amp;controls=0" sandbox="allow-scripts allow-same-origin allow-popups allow-popups-to-escape-sandbox" loading="lazy" referrerpolicy="strict-origin-when-cross-origin"></iframe>`
 	output := sanitizeHTMLWithDefaultOptions("http://example.org/", input)
 
 	if expected != output {
@@ -776,7 +776,7 @@ func TestReplaceYoutubeURLAlreadyReplaced(t *testing.T) {
 	}
 
 	input := `<iframe src="https://www.youtube-nocookie.com/embed/test123?rel=0&amp;controls=0" sandbox="allow-scripts allow-same-origin"></iframe>`
-	expected := `<iframe src="https://www.youtube-nocookie.com/embed/test123?rel=0&amp;controls=0" referrerpolicy="strict-origin-when-cross-origin" sandbox="allow-scripts allow-same-origin allow-popups allow-popups-to-escape-sandbox" loading="lazy"></iframe>`
+	expected := `<iframe src="https://www.youtube-nocookie.com/embed/test123?rel=0&amp;controls=0" sandbox="allow-scripts allow-same-origin allow-popups allow-popups-to-escape-sandbox" loading="lazy" referrerpolicy="strict-origin-when-cross-origin"></iframe>`
 	output := sanitizeHTMLWithDefaultOptions("http://example.org/", input)
 
 	if expected != output {
@@ -794,7 +794,7 @@ func TestReplaceProtocolRelativeYoutubeURL(t *testing.T) {
 	}
 
 	input := `<iframe src="//www.youtube.com/embed/Bf2W84jrGqs" width="560" height="314" allowfullscreen="allowfullscreen"></iframe>`
-	expected := `<iframe src="https://www.youtube-nocookie.com/embed/Bf2W84jrGqs" width="560" height="314" allowfullscreen="allowfullscreen" referrerpolicy="strict-origin-when-cross-origin" sandbox="allow-scripts allow-same-origin allow-popups allow-popups-to-escape-sandbox" loading="lazy"></iframe>`
+	expected := `<iframe src="https://www.youtube-nocookie.com/embed/Bf2W84jrGqs" width="560" height="314" allowfullscreen="allowfullscreen" sandbox="allow-scripts allow-same-origin allow-popups allow-popups-to-escape-sandbox" loading="lazy" referrerpolicy="strict-origin-when-cross-origin"></iframe>`
 	output := sanitizeHTMLWithDefaultOptions("http://example.org/", input)
 
 	if expected != output {
@@ -813,7 +813,7 @@ func TestReplaceYoutubeURLWithCustomURL(t *testing.T) {
 	}
 
 	input := `<iframe src="https://www.youtube.com/embed/test123?version=3&#038;rel=1&#038;fs=1&#038;autohide=2&#038;showsearch=0&#038;showinfo=1&#038;iv_load_policy=1&#038;wmode=transparent"></iframe>`
-	expected := `<iframe src="https://invidious.custom/embed/test123?version=3&amp;rel=1&amp;fs=1&amp;autohide=2&amp;showsearch=0&amp;showinfo=1&amp;iv_load_policy=1&amp;wmode=transparent" referrerpolicy="strict-origin-when-cross-origin" sandbox="allow-scripts allow-same-origin allow-popups allow-popups-to-escape-sandbox" loading="lazy"></iframe>`
+	expected := `<iframe src="https://invidious.custom/embed/test123?version=3&amp;rel=1&amp;fs=1&amp;autohide=2&amp;showsearch=0&amp;showinfo=1&amp;iv_load_policy=1&amp;wmode=transparent" sandbox="allow-scripts allow-same-origin allow-popups allow-popups-to-escape-sandbox" loading="lazy" referrerpolicy="strict-origin-when-cross-origin"></iframe>`
 	output := sanitizeHTMLWithDefaultOptions("http://example.org/", input)
 
 	if expected != output {