4 luni în urmă · d8181b1f65
--- a/internal/reader/sanitizer/sanitizer.go
+++ b/internal/reader/sanitizer/sanitizer.go
@@ -4,7 +4,6 @@
 
				 package sanitizer // import "miniflux.app/v2/internal/reader/sanitizer"
			
 
				 
			
 
				 import (
			
 
				-	"io"
			
 
				 	"net/url"
			
 
				 	"slices"
			
 
				 	"strconv"
			
@@ -204,110 +203,125 @@ type SanitizerOptions struct {
 
				 	OpenLinksInNewTab bool
			
 
				 }
			
 
				 
			
 
				-func SanitizeHTML(baseURL, rawHTML string, sanitizerOptions *SanitizerOptions) string {
			
 
				-	var tagStack []string
			
 
				-	var parentTag string
			
 
				-	var blockedStack []string
			
 
				-	var buffer strings.Builder
			
 
				+func isHidden(n *html.Node) bool {
			
 
				+	for _, attr := range n.Attr {
			
 
				+		if strings.ToLower(attr.Key) == "hidden" {
			
 
				+			return true
			
 
				+		}
			
 
				+	}
			
 
				+	return false
			
 
				+}
			
 
				 
			
 
				-	// Educated guess about how big the sanitized HTML will be,
			
 
				-	// to reduce the amount of buffer re-allocations in this function.
			
 
				-	estimatedRatio := len(rawHTML) * 3 / 4
			
 
				-	buffer.Grow(estimatedRatio)
			
 
				+func shouldIgnoreTag(n *html.Node, tag string) bool {
			
 
				+	if isPixelTracker(tag, n.Attr) {
			
 
				+		return true
			
 
				+	}
			
 
				+	if isBlockedTag(tag) {
			
 
				+		return true
			
 
				+	}
			
 
				+	if isHidden(n) {
			
 
				+		return true
			
 
				+	}
			
 
				 
			
 
				-	// Errors are a non-issue, so they're handled later in the function.
			
 
				-	parsedBaseUrl, _ := url.Parse(baseURL)
			
 
				+	return false
			
 
				+}
			
 
				 
			
 
				-	tokenizer := html.NewTokenizer(strings.NewReader(rawHTML))
			
 
				-	for {
			
 
				-		if tokenizer.Next() == html.ErrorToken {
			
 
				-			err := tokenizer.Err()
			
 
				-			if err == io.EOF {
			
 
				-				return buffer.String()
			
 
				-			}
			
 
				+func isSelfContainedTag(tag string) bool {
			
 
				+	switch tag {
			
 
				+	case "area", "base", "br", "col", "embed", "hr", "img", "input",
			
 
				+		"link", "meta", "param", "source", "track", "wbr":
			
 
				+		return true
			
 
				+	}
			
 
				+	return false
			
 
				+}
			
 
				 
			
 
				-			return ""
			
 
				+func filterAndRenderHTMLChildren(buf *strings.Builder, n *html.Node, parsedBaseUrl *url.URL, sanitizerOptions *SanitizerOptions) {
			
 
				+	for c := n.FirstChild; c != nil; c = c.NextSibling {
			
 
				+		filterAndRenderHTML(buf, c, parsedBaseUrl, sanitizerOptions)
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+func filterAndRenderHTML(buf *strings.Builder, n *html.Node, parsedBaseUrl *url.URL, sanitizerOptions *SanitizerOptions) {
			
 
				+	if n == nil {
			
 
				+		return
			
 
				+	}
			
 
				+
			
 
				+	switch n.Type {
			
 
				+	case html.TextNode:
			
 
				+		buf.WriteString(html.EscapeString(n.Data))
			
 
				+	case html.ElementNode:
			
 
				+		tag := strings.ToLower(n.Data)
			
 
				+		if shouldIgnoreTag(n, tag) {
			
 
				+			return
			
 
				 		}
			
 
				 
			
 
				-		token := tokenizer.Token()
			
 
				+		_, ok := allowedHTMLTagsAndAttributes[tag]
			
 
				+		if !ok {
			
 
				+			// The tag isn't allowed, but we're still interested in its content
			
 
				+			filterAndRenderHTMLChildren(buf, n, parsedBaseUrl, sanitizerOptions)
			
 
				+			return
			
 
				+		}
			
 
				 
			
 
				-		// Note: MathML elements are not fully supported by golang.org/x/net/html.
			
 
				-		// See https://github.com/golang/net/blob/master/html/atom/gen.go
			
 
				-		// and https://github.com/golang/net/blob/master/html/atom/table.go
			
 
				-		tagName := token.Data
			
 
				-		if tagName == "" {
			
 
				-			continue
			
 
				+		attrNames, htmlAttributes := sanitizeAttributes(parsedBaseUrl, tag, n.Attr, sanitizerOptions)
			
 
				+		if !hasRequiredAttributes(tag, attrNames) {
			
 
				+			// The tag doesn't have every required attributes but we're still interested in its content
			
 
				+			filterAndRenderHTMLChildren(buf, n, parsedBaseUrl, sanitizerOptions)
			
 
				+			return
			
 
				+		}
			
 
				+		buf.WriteString("<")
			
 
				+		buf.WriteString(n.Data)
			
 
				+		if len(attrNames) > 0 {
			
 
				+			buf.WriteString(" " + htmlAttributes)
			
 
				 		}
			
 
				+		buf.WriteString(">")
			
 
				 
			
 
				-		switch token.Type {
			
 
				-		case html.TextToken:
			
 
				-			if len(blockedStack) > 0 {
			
 
				-				continue
			
 
				-			}
			
 
				+		if isSelfContainedTag(tag) {
			
 
				+			return
			
 
				+		}
			
 
				 
			
 
				-			// An iframe element never has fallback content.
			
 
				-			// See https://www.w3.org/TR/2010/WD-html5-20101019/the-iframe-element.html#the-iframe-element
			
 
				-			if parentTag == "iframe" {
			
 
				-				continue
			
 
				-			}
			
 
				+		if tag != "iframe" {
			
 
				+			// iframes aren't allowed to have child nodes.
			
 
				+			filterAndRenderHTMLChildren(buf, n, parsedBaseUrl, sanitizerOptions)
			
 
				+		}
			
 
				 
			
 
				-			buffer.WriteString(token.String())
			
 
				-		case html.StartTagToken:
			
 
				-			if len(blockedStack) != 0 {
			
 
				-				continue
			
 
				-			}
			
 
				+		buf.WriteString("</")
			
 
				+		buf.WriteString(n.Data)
			
 
				+		buf.WriteString(">")
			
 
				+	case html.DocumentNode:
			
 
				+		filterAndRenderHTMLChildren(buf, n, parsedBaseUrl, sanitizerOptions)
			
 
				+	default:
			
 
				+	}
			
 
				+}
			
 
				 
			
 
				-			parentTag = tagName
			
 
				+func SanitizeHTML(baseURL, rawHTML string, sanitizerOptions *SanitizerOptions) string {
			
 
				+	var buffer strings.Builder
			
 
				 
			
 
				-			if isPixelTracker(tagName, token.Attr) {
			
 
				-				continue
			
 
				-			}
			
 
				+	// Educated guess about how big the sanitized HTML will be,
			
 
				+	// to reduce the amount of buffer re-allocations in this function.
			
 
				+	estimatedRatio := len(rawHTML) * 3 / 4
			
 
				+	buffer.Grow(estimatedRatio)
			
 
				 
			
 
				-			if isBlockedTag(tagName) || slices.ContainsFunc(token.Attr, func(attr html.Attribute) bool { return attr.Key == "hidden" }) {
			
 
				-				blockedStack = append(blockedStack, tagName)
			
 
				-				continue
			
 
				-			}
			
 
				+	// We need to surround `rawHTML` with body tags so that html.Parse
			
 
				+	// will consider it a valid html document.
			
 
				+	doc, err := html.Parse(strings.NewReader("<body>" + rawHTML + "</body>"))
			
 
				+	if err != nil {
			
 
				+		return ""
			
 
				+	}
			
 
				 
			
 
				-			if isAllowedTag(tagName) {
			
 
				-				attrNames, htmlAttributes := sanitizeAttributes(parsedBaseUrl, tagName, token.Attr, sanitizerOptions)
			
 
				-				if hasRequiredAttributes(tagName, attrNames) {
			
 
				-					if len(attrNames) > 0 {
			
 
				-						// Rewrite the start tag with allowed attributes.
			
 
				-						buffer.WriteString("<" + tagName + " " + htmlAttributes + ">")
			
 
				-					} else {
			
 
				-						// Rewrite the start tag without any attributes.
			
 
				-						buffer.WriteString("<" + tagName + ">")
			
 
				-					}
			
 
				-
			
 
				-					tagStack = append(tagStack, tagName)
			
 
				-				}
			
 
				-			}
			
 
				-		case html.EndTagToken:
			
 
				-			if len(blockedStack) == 0 {
			
 
				-				if isAllowedTag(tagName) && slices.Contains(tagStack, tagName) {
			
 
				-					buffer.WriteString("</" + tagName + ">")
			
 
				-				}
			
 
				-			} else {
			
 
				-				if blockedStack[len(blockedStack)-1] == tagName {
			
 
				-					blockedStack = blockedStack[:len(blockedStack)-1]
			
 
				-				}
			
 
				-			}
			
 
				-		case html.SelfClosingTagToken:
			
 
				-			if isPixelTracker(tagName, token.Attr) {
			
 
				-				continue
			
 
				-			}
			
 
				-			if len(blockedStack) == 0 && isAllowedTag(tagName) {
			
 
				-				attrNames, htmlAttributes := sanitizeAttributes(parsedBaseUrl, tagName, token.Attr, sanitizerOptions)
			
 
				-				if hasRequiredAttributes(tagName, attrNames) {
			
 
				-					if len(attrNames) > 0 {
			
 
				-						buffer.WriteString("<" + tagName + " " + htmlAttributes + "/>")
			
 
				-					} else {
			
 
				-						buffer.WriteString("<" + tagName + "/>")
			
 
				-					}
			
 
				-				}
			
 
				-			}
			
 
				-		}
			
 
				+	/* The structure of `doc` is always:
			
 
				+	<html>
			
 
				+	<head>...</head>
			
 
				+	<body>..</body>
			
 
				+	*/
			
 
				+	body := doc.FirstChild.FirstChild.NextSibling
			
 
				+
			
 
				+	// Errors are a non-issue, so they're handled in filterAndRenderHTML
			
 
				+	parsedBaseUrl, _ := url.Parse(baseURL)
			
 
				+	for c := body.FirstChild; c != nil; c = c.NextSibling {
			
 
				+		filterAndRenderHTML(&buffer, c, parsedBaseUrl, sanitizerOptions)
			
 
				 	}
			
 
				+
			
 
				+	return buffer.String()
			
 
				 }
			
 
				 
			
 
				 func sanitizeAttributes(parsedBaseUrl *url.URL, tagName string, attributes []html.Attribute, sanitizerOptions *SanitizerOptions) ([]string, string) {
			
@@ -444,11 +458,6 @@ func getExtraAttributes(tagName string, isYouTubeEmbed bool, sanitizerOptions *S
 
				 	}
			
 
				 }
			
 
				 
			
 
				-func isAllowedTag(tagName string) bool {
			
 
				-	_, ok := allowedHTMLTagsAndAttributes[tagName]
			
 
				-	return ok
			
 
				-}
			
 
				-
			
 
				 func isValidAttribute(tagName, attributeName string) bool {
			
 
				 	if attributes, ok := allowedHTMLTagsAndAttributes[tagName]; ok {
			
 
				 		return slices.Contains(attributes, attributeName)
			
@@ -465,8 +474,8 @@ func isExternalResourceAttribute(attribute string) bool {
 
				 	}
			
 
				 }
			
 
				 
			
 
				-func isPixelTracker(tagName string, attributes []html.Attribute) bool {
			
 
				-	if tagName != "img" {
			
 
				+func isPixelTracker(tag string, attributes []html.Attribute) bool {
			
 
				+	if tag != "img" {
			
 
				 		return false
			
 
				 	}
			
 
				 	hasHeight := false
			
--- a/internal/reader/sanitizer/sanitizer_test.go
+++ b/internal/reader/sanitizer/sanitizer_test.go
@@ -304,7 +304,7 @@ func TestMediumImgWithSrcset(t *testing.T) {
 
				 }
			
 
				 
			
 
				 func TestSelfClosingTags(t *testing.T) {
			
 
				-	input := `<p>This <br> is a <strong>text</strong> <br/>with an image: <img src="http://example.org/" alt="Test" loading="lazy"/>.</p>`
			
 
				+	input := `<p>This <br> is a <strong>text</strong> <br>with an image: <img src="http://example.org/" alt="Test" loading="lazy">.</p>`
			
 
				 	output := sanitizeHTMLWithDefaultOptions("http://example.org/", input)
			
 
				 
			
 
				 	if input != output {
			
@@ -322,8 +322,8 @@ func TestTable(t *testing.T) {
 
				 }
			
 
				 
			
 
				 func TestRelativeURL(t *testing.T) {
			
 
				-	input := `This <a href="/test.html">link is relative</a> and this image: <img src="../folder/image.png"/>`
			
 
				-	expected := `This <a href="http://example.org/test.html" rel="noopener noreferrer" referrerpolicy="no-referrer" target="_blank">link is relative</a> and this image: <img src="http://example.org/folder/image.png" loading="lazy"/>`
			
 
				+	input := `This <a href="/test.html">link is relative</a> and this image: <img src="../folder/image.png">`
			
 
				+	expected := `This <a href="http://example.org/test.html" rel="noopener noreferrer" referrerpolicy="no-referrer" target="_blank">link is relative</a> and this image: <img src="http://example.org/folder/image.png" loading="lazy">`
			
 
				 	output := sanitizeHTMLWithDefaultOptions("http://example.org/", input)
			
 
				 
			
 
				 	if expected != output {
			
@@ -798,8 +798,8 @@ func TestXmlEntities(t *testing.T) {
 
				 }
			
 
				 
			
 
				 func TestEspaceAttributes(t *testing.T) {
			
 
				-	input := `<td rowspan="<b>test</b>">test</td>`
			
 
				-	expected := `<td rowspan="&lt;b&gt;test&lt;/b&gt;">test</td>`
			
 
				+	input := `<td rowspan="<b>injection</b>">text</td>`
			
 
				+	expected := `text`
			
 
				 	output := sanitizeHTMLWithDefaultOptions("http://example.org/", input)
			
 
				 
			
 
				 	if expected != output {
			
@@ -978,7 +978,7 @@ func TestHiddenParagraph(t *testing.T) {
 
				 
			
 
				 func TestAttributesAreStripped(t *testing.T) {
			
 
				 	input := `<p style="color: red;">Some text.<hr style="color: blue"/>Test.</p>`
			
 
				-	expected := `<p>Some text.<hr/>Test.</p>`
			
 
				+	expected := `<p>Some text.</p><hr>Test.<p></p>`
			
 
				 
			
 
				 	output := sanitizeHTMLWithDefaultOptions("http://example.org/", input)
			
 
				 	if expected != output {