Kaynağa Gözat

refactor(sanitizer): use a parser instead of a tokenizer

As stated in [golang.org/x/net/html's documentation](https://pkg.go.dev/golang.org/x/net):

> If your use case requires semantically well-formed HTML documents, as defined
by the WHATWG specification, the parser should be used rather than the
tokenizer.

The sanitizer is modifying the HTML document, and I'm pretty sure we don't want
to implement a WHATWG-compliant parser ourself, as there are _so many_
edge cases everywhere. Picking the parser instead of the tokenizer ensures that
miniflux has the same view of the DOM than web browsers, removing the risk of
disparities, and thus injection-related security issues. Another benefit of
this commit is to make the code way more readable/simple.

The only changes made to the testsuites are:

- Removing the trailing `/` on self-contained tags, as the spec doesn't require
  them. Adding them systematically could also have been done, but as the
  testsuite isn't consistent in this regard, it would have significantly
  increased the size of the commit to normalize it.
- Some weird things that were wrong in the first place, like how
  `<td rowspan="<b>injection</b>">text</td>` is interpreted by the browser as
  `text`, and not as `<td rowspan="&lt;b&gt;test&lt;/b&gt;">test</td>`.

Care has been taken to re-use as much code from the tokenizer-based sanitizer
as possible.
jvoisin 2 ay önce
ebeveyn
işleme
d8181b1f65

+ 105 - 96
internal/reader/sanitizer/sanitizer.go

@@ -4,7 +4,6 @@
 package sanitizer // import "miniflux.app/v2/internal/reader/sanitizer"
 
 import (
-	"io"
 	"net/url"
 	"slices"
 	"strconv"
@@ -204,110 +203,125 @@ type SanitizerOptions struct {
 	OpenLinksInNewTab bool
 }
 
-func SanitizeHTML(baseURL, rawHTML string, sanitizerOptions *SanitizerOptions) string {
-	var tagStack []string
-	var parentTag string
-	var blockedStack []string
-	var buffer strings.Builder
+func isHidden(n *html.Node) bool {
+	for _, attr := range n.Attr {
+		if strings.ToLower(attr.Key) == "hidden" {
+			return true
+		}
+	}
+	return false
+}
 
-	// Educated guess about how big the sanitized HTML will be,
-	// to reduce the amount of buffer re-allocations in this function.
-	estimatedRatio := len(rawHTML) * 3 / 4
-	buffer.Grow(estimatedRatio)
+func shouldIgnoreTag(n *html.Node, tag string) bool {
+	if isPixelTracker(tag, n.Attr) {
+		return true
+	}
+	if isBlockedTag(tag) {
+		return true
+	}
+	if isHidden(n) {
+		return true
+	}
 
-	// Errors are a non-issue, so they're handled later in the function.
-	parsedBaseUrl, _ := url.Parse(baseURL)
+	return false
+}
 
-	tokenizer := html.NewTokenizer(strings.NewReader(rawHTML))
-	for {
-		if tokenizer.Next() == html.ErrorToken {
-			err := tokenizer.Err()
-			if err == io.EOF {
-				return buffer.String()
-			}
+func isSelfContainedTag(tag string) bool {
+	switch tag {
+	case "area", "base", "br", "col", "embed", "hr", "img", "input",
+		"link", "meta", "param", "source", "track", "wbr":
+		return true
+	}
+	return false
+}
 
-			return ""
+func filterAndRenderHTMLChildren(buf *strings.Builder, n *html.Node, parsedBaseUrl *url.URL, sanitizerOptions *SanitizerOptions) {
+	for c := n.FirstChild; c != nil; c = c.NextSibling {
+		filterAndRenderHTML(buf, c, parsedBaseUrl, sanitizerOptions)
+	}
+}
+
+func filterAndRenderHTML(buf *strings.Builder, n *html.Node, parsedBaseUrl *url.URL, sanitizerOptions *SanitizerOptions) {
+	if n == nil {
+		return
+	}
+
+	switch n.Type {
+	case html.TextNode:
+		buf.WriteString(html.EscapeString(n.Data))
+	case html.ElementNode:
+		tag := strings.ToLower(n.Data)
+		if shouldIgnoreTag(n, tag) {
+			return
 		}
 
-		token := tokenizer.Token()
+		_, ok := allowedHTMLTagsAndAttributes[tag]
+		if !ok {
+			// The tag isn't allowed, but we're still interested in its content
+			filterAndRenderHTMLChildren(buf, n, parsedBaseUrl, sanitizerOptions)
+			return
+		}
 
-		// Note: MathML elements are not fully supported by golang.org/x/net/html.
-		// See https://github.com/golang/net/blob/master/html/atom/gen.go
-		// and https://github.com/golang/net/blob/master/html/atom/table.go
-		tagName := token.Data
-		if tagName == "" {
-			continue
+		attrNames, htmlAttributes := sanitizeAttributes(parsedBaseUrl, tag, n.Attr, sanitizerOptions)
+		if !hasRequiredAttributes(tag, attrNames) {
+			// The tag doesn't have every required attributes but we're still interested in its content
+			filterAndRenderHTMLChildren(buf, n, parsedBaseUrl, sanitizerOptions)
+			return
+		}
+		buf.WriteString("<")
+		buf.WriteString(n.Data)
+		if len(attrNames) > 0 {
+			buf.WriteString(" " + htmlAttributes)
 		}
+		buf.WriteString(">")
 
-		switch token.Type {
-		case html.TextToken:
-			if len(blockedStack) > 0 {
-				continue
-			}
+		if isSelfContainedTag(tag) {
+			return
+		}
 
-			// An iframe element never has fallback content.
-			// See https://www.w3.org/TR/2010/WD-html5-20101019/the-iframe-element.html#the-iframe-element
-			if parentTag == "iframe" {
-				continue
-			}
+		if tag != "iframe" {
+			// iframes aren't allowed to have child nodes.
+			filterAndRenderHTMLChildren(buf, n, parsedBaseUrl, sanitizerOptions)
+		}
 
-			buffer.WriteString(token.String())
-		case html.StartTagToken:
-			if len(blockedStack) != 0 {
-				continue
-			}
+		buf.WriteString("</")
+		buf.WriteString(n.Data)
+		buf.WriteString(">")
+	case html.DocumentNode:
+		filterAndRenderHTMLChildren(buf, n, parsedBaseUrl, sanitizerOptions)
+	default:
+	}
+}
 
-			parentTag = tagName
+func SanitizeHTML(baseURL, rawHTML string, sanitizerOptions *SanitizerOptions) string {
+	var buffer strings.Builder
 
-			if isPixelTracker(tagName, token.Attr) {
-				continue
-			}
+	// Educated guess about how big the sanitized HTML will be,
+	// to reduce the amount of buffer re-allocations in this function.
+	estimatedRatio := len(rawHTML) * 3 / 4
+	buffer.Grow(estimatedRatio)
 
-			if isBlockedTag(tagName) || slices.ContainsFunc(token.Attr, func(attr html.Attribute) bool { return attr.Key == "hidden" }) {
-				blockedStack = append(blockedStack, tagName)
-				continue
-			}
+	// We need to surround `rawHTML` with body tags so that html.Parse
+	// will consider it a valid html document.
+	doc, err := html.Parse(strings.NewReader("<body>" + rawHTML + "</body>"))
+	if err != nil {
+		return ""
+	}
 
-			if isAllowedTag(tagName) {
-				attrNames, htmlAttributes := sanitizeAttributes(parsedBaseUrl, tagName, token.Attr, sanitizerOptions)
-				if hasRequiredAttributes(tagName, attrNames) {
-					if len(attrNames) > 0 {
-						// Rewrite the start tag with allowed attributes.
-						buffer.WriteString("<" + tagName + " " + htmlAttributes + ">")
-					} else {
-						// Rewrite the start tag without any attributes.
-						buffer.WriteString("<" + tagName + ">")
-					}
-
-					tagStack = append(tagStack, tagName)
-				}
-			}
-		case html.EndTagToken:
-			if len(blockedStack) == 0 {
-				if isAllowedTag(tagName) && slices.Contains(tagStack, tagName) {
-					buffer.WriteString("</" + tagName + ">")
-				}
-			} else {
-				if blockedStack[len(blockedStack)-1] == tagName {
-					blockedStack = blockedStack[:len(blockedStack)-1]
-				}
-			}
-		case html.SelfClosingTagToken:
-			if isPixelTracker(tagName, token.Attr) {
-				continue
-			}
-			if len(blockedStack) == 0 && isAllowedTag(tagName) {
-				attrNames, htmlAttributes := sanitizeAttributes(parsedBaseUrl, tagName, token.Attr, sanitizerOptions)
-				if hasRequiredAttributes(tagName, attrNames) {
-					if len(attrNames) > 0 {
-						buffer.WriteString("<" + tagName + " " + htmlAttributes + "/>")
-					} else {
-						buffer.WriteString("<" + tagName + "/>")
-					}
-				}
-			}
-		}
+	/* The structure of `doc` is always:
+	<html>
+	<head>...</head>
+	<body>..</body>
+	*/
+	body := doc.FirstChild.FirstChild.NextSibling
+
+	// Errors are a non-issue, so they're handled in filterAndRenderHTML
+	parsedBaseUrl, _ := url.Parse(baseURL)
+	for c := body.FirstChild; c != nil; c = c.NextSibling {
+		filterAndRenderHTML(&buffer, c, parsedBaseUrl, sanitizerOptions)
 	}
+
+	return buffer.String()
 }
 
 func sanitizeAttributes(parsedBaseUrl *url.URL, tagName string, attributes []html.Attribute, sanitizerOptions *SanitizerOptions) ([]string, string) {
@@ -444,11 +458,6 @@ func getExtraAttributes(tagName string, isYouTubeEmbed bool, sanitizerOptions *S
 	}
 }
 
-func isAllowedTag(tagName string) bool {
-	_, ok := allowedHTMLTagsAndAttributes[tagName]
-	return ok
-}
-
 func isValidAttribute(tagName, attributeName string) bool {
 	if attributes, ok := allowedHTMLTagsAndAttributes[tagName]; ok {
 		return slices.Contains(attributes, attributeName)
@@ -465,8 +474,8 @@ func isExternalResourceAttribute(attribute string) bool {
 	}
 }
 
-func isPixelTracker(tagName string, attributes []html.Attribute) bool {
-	if tagName != "img" {
+func isPixelTracker(tag string, attributes []html.Attribute) bool {
+	if tag != "img" {
 		return false
 	}
 	hasHeight := false

+ 6 - 6
internal/reader/sanitizer/sanitizer_test.go

@@ -304,7 +304,7 @@ func TestMediumImgWithSrcset(t *testing.T) {
 }
 
 func TestSelfClosingTags(t *testing.T) {
-	input := `<p>This <br> is a <strong>text</strong> <br/>with an image: <img src="http://example.org/" alt="Test" loading="lazy"/>.</p>`
+	input := `<p>This <br> is a <strong>text</strong> <br>with an image: <img src="http://example.org/" alt="Test" loading="lazy">.</p>`
 	output := sanitizeHTMLWithDefaultOptions("http://example.org/", input)
 
 	if input != output {
@@ -322,8 +322,8 @@ func TestTable(t *testing.T) {
 }
 
 func TestRelativeURL(t *testing.T) {
-	input := `This <a href="/test.html">link is relative</a> and this image: <img src="../folder/image.png"/>`
-	expected := `This <a href="http://example.org/test.html" rel="noopener noreferrer" referrerpolicy="no-referrer" target="_blank">link is relative</a> and this image: <img src="http://example.org/folder/image.png" loading="lazy"/>`
+	input := `This <a href="/test.html">link is relative</a> and this image: <img src="../folder/image.png">`
+	expected := `This <a href="http://example.org/test.html" rel="noopener noreferrer" referrerpolicy="no-referrer" target="_blank">link is relative</a> and this image: <img src="http://example.org/folder/image.png" loading="lazy">`
 	output := sanitizeHTMLWithDefaultOptions("http://example.org/", input)
 
 	if expected != output {
@@ -798,8 +798,8 @@ func TestXmlEntities(t *testing.T) {
 }
 
 func TestEspaceAttributes(t *testing.T) {
-	input := `<td rowspan="<b>test</b>">test</td>`
-	expected := `<td rowspan="&lt;b&gt;test&lt;/b&gt;">test</td>`
+	input := `<td rowspan="<b>injection</b>">text</td>`
+	expected := `text`
 	output := sanitizeHTMLWithDefaultOptions("http://example.org/", input)
 
 	if expected != output {
@@ -978,7 +978,7 @@ func TestHiddenParagraph(t *testing.T) {
 
 func TestAttributesAreStripped(t *testing.T) {
 	input := `<p style="color: red;">Some text.<hr style="color: blue"/>Test.</p>`
-	expected := `<p>Some text.<hr/>Test.</p>`
+	expected := `<p>Some text.</p><hr>Test.<p></p>`
 
 	output := sanitizeHTMLWithDefaultOptions("http://example.org/", input)
 	if expected != output {