فهرست منبع

refactor: remove the now-useless maxDepth limit in the sanitizer

As stated in html.Parse's documentation, "Parse will reject HTML that is nested
deeper than 512 elements." So there is no need to do it ourselves.
jvoisin 2 هفته پیش
والد
کامیت
87d7891600
2 فایلهای تغییر یافته به همراه 11 افزوده شده و 20 حذف شده
  1. 7 17
      internal/reader/sanitizer/sanitizer.go
  2. 4 3
      internal/reader/sanitizer/sanitizer_test.go

+ 7 - 17
internal/reader/sanitizer/sanitizer.go

@@ -4,7 +4,6 @@
 package sanitizer // import "miniflux.app/v2/internal/reader/sanitizer"
 
 import (
-	"errors"
 	"io"
 	"net/url"
 	"slices"
@@ -18,10 +17,6 @@ import (
 	"golang.org/x/net/html"
 )
 
-const (
-	maxDepth = 512 // The maximum allowed depths for nested HTML tags, same was WebKit.
-)
-
 var (
 	allowedHTMLTagsAndAttributes = map[string][]string{
 		"a":          {"href", "title", "id"},
@@ -197,8 +192,7 @@ func SanitizeHTML(baseURL, rawHTML string, sanitizerOptions *SanitizerOptions) s
 	// Errors are a non-issue, so they're handled in filterAndRenderHTML
 	parsedBaseUrl, _ := url.Parse(baseURL)
 	for c := body.FirstChild; c != nil; c = c.NextSibling {
-		// -2 because of `<html><body>…`
-		if err := filterAndRenderHTML(&buffer, c, parsedBaseUrl, sanitizerOptions, maxDepth-2); err != nil {
+		if err := filterAndRenderHTML(&buffer, c, parsedBaseUrl, sanitizerOptions); err != nil {
 			return ""
 		}
 	}
@@ -224,15 +218,11 @@ func findAllowedIframeSourceDomain(iframeSourceURL string) (string, bool) {
 	return "", false
 }
 
-func filterAndRenderHTML(buf *strings.Builder, n *html.Node, parsedBaseUrl *url.URL, sanitizerOptions *SanitizerOptions, depth uint) error {
+func filterAndRenderHTML(buf *strings.Builder, n *html.Node, parsedBaseUrl *url.URL, sanitizerOptions *SanitizerOptions) error {
 	if n == nil {
 		return nil
 	}
 
-	if depth == 0 {
-		return errors.New("maximum nested tags limit reached")
-	}
-
 	switch n.Type {
 	case html.TextNode:
 		buf.WriteString(html.EscapeString(n.Data))
@@ -245,7 +235,7 @@ func filterAndRenderHTML(buf *strings.Builder, n *html.Node, parsedBaseUrl *url.
 		_, ok := allowedHTMLTagsAndAttributes[tag]
 		if !ok {
 			// The tag isn't allowed, but we're still interested in its content
-			return filterAndRenderHTMLChildren(buf, n, parsedBaseUrl, sanitizerOptions, depth-1)
+			return filterAndRenderHTMLChildren(buf, n, parsedBaseUrl, sanitizerOptions)
 		}
 
 		htmlAttributes, hasAllRequiredAttributes := sanitizeAttributes(parsedBaseUrl, tag, n.Attr, sanitizerOptions)
@@ -255,7 +245,7 @@ func filterAndRenderHTML(buf *strings.Builder, n *html.Node, parsedBaseUrl *url.
 				return nil
 			}
 			// The tag doesn't have every required attributes but we're still interested in its content
-			return filterAndRenderHTMLChildren(buf, n, parsedBaseUrl, sanitizerOptions, depth-1)
+			return filterAndRenderHTMLChildren(buf, n, parsedBaseUrl, sanitizerOptions)
 		}
 		buf.WriteByte('<')
 		buf.WriteString(n.Data)
@@ -271,7 +261,7 @@ func filterAndRenderHTML(buf *strings.Builder, n *html.Node, parsedBaseUrl *url.
 
 		if tag != "iframe" {
 			// iframes aren't allowed to have child nodes.
-			filterAndRenderHTMLChildren(buf, n, parsedBaseUrl, sanitizerOptions, depth-1)
+			filterAndRenderHTMLChildren(buf, n, parsedBaseUrl, sanitizerOptions)
 		}
 
 		buf.WriteString("</")
@@ -282,9 +272,9 @@ func filterAndRenderHTML(buf *strings.Builder, n *html.Node, parsedBaseUrl *url.
 	return nil
 }
 
-func filterAndRenderHTMLChildren(buf *strings.Builder, n *html.Node, parsedBaseUrl *url.URL, sanitizerOptions *SanitizerOptions, depth uint) error {
+func filterAndRenderHTMLChildren(buf *strings.Builder, n *html.Node, parsedBaseUrl *url.URL, sanitizerOptions *SanitizerOptions) error {
 	for c := n.FirstChild; c != nil; c = c.NextSibling {
-		if err := filterAndRenderHTML(buf, c, parsedBaseUrl, sanitizerOptions, depth); err != nil {
+		if err := filterAndRenderHTML(buf, c, parsedBaseUrl, sanitizerOptions); err != nil {
 			return err
 		}
 	}

+ 4 - 3
internal/reader/sanitizer/sanitizer_test.go

@@ -1011,9 +1011,10 @@ func TestAttrLowerCase(t *testing.T) {
 }
 
 func TestDeeplyNestedpage(t *testing.T) {
+	maxDepth := 512 // html.Parse has a maximum depth of 512
 	input := "test"
-	// -3 instead of -1 because <html><body> is automatically added.
-	for range maxDepth - 3 {
+	// -2 instead of -1 because <html><body> is automatically added.
+	for range maxDepth - 2 {
 		input = "<div>" + input + "</div>"
 	}
 	output := sanitizeHTMLWithDefaultOptions("http://example.org/", input)
@@ -1024,7 +1025,7 @@ func TestDeeplyNestedpage(t *testing.T) {
 	}
 
 	input = "test"
-	for range maxDepth - 2 {
+	for range maxDepth - 1 {
 		input = "<div>" + input + "</div>"
 	}
 	output = sanitizeHTMLWithDefaultOptions("http://example.org/", input)