Browse Source

Speed the sanitizer up a bit, again

- allow youtube urls to start with `www`
- use `strings.Builder` instead of a `bytes.Buffer`
- use a `strings.NewReader` instead of a `bytes.NewBufferString`
- sprinkles a couple of `continue` to make the code-flow more obvious
- inline calls to `inList`, and put their parameters in the right order
- simplify isPixelTracker
- simplify `isValidIframeSource`, by extracting the hostname and comparing it
  directly, instead of using the full url and checking if it starts with
  multiple variations of the same one (`//`, `http:`, `https://` multiplied by
  ``/`www.`)
- add a benchmark
jvoisin 2 years ago
parent
commit
3d0126be0b

+ 41 - 51
internal/reader/sanitizer/sanitizer.go

@@ -4,7 +4,6 @@
 package sanitizer // import "miniflux.app/v2/internal/reader/sanitizer"
 
 import (
-	"bytes"
 	"fmt"
 	"io"
 	"regexp"
@@ -19,7 +18,7 @@ import (
 )
 
 var (
-	youtubeEmbedRegex = regexp.MustCompile(`//www\.youtube\.com/embed/(.*)$`)
+	youtubeEmbedRegex = regexp.MustCompile(`//(?:www\.)?youtube\.com/embed/(.+)$`)
 	tagAllowList      = map[string][]string{
 		"a":          {"href", "title", "id"},
 		"abbr":       {"title"},
@@ -80,12 +79,12 @@ var (
 
 // Sanitize returns safe HTML.
 func Sanitize(baseURL, input string) string {
-	var buffer bytes.Buffer
+	var buffer strings.Builder
 	var tagStack []string
 	var parentTag string
 	blacklistedTagDepth := 0
 
-	tokenizer := html.NewTokenizer(bytes.NewBufferString(input))
+	tokenizer := html.NewTokenizer(strings.NewReader(input))
 	for {
 		if tokenizer.Next() == html.ErrorToken {
 			err := tokenizer.Err()
@@ -114,7 +113,10 @@ func Sanitize(baseURL, input string) string {
 			tagName := token.DataAtom.String()
 			parentTag = tagName
 
-			if !isPixelTracker(tagName, token.Attr) && isValidTag(tagName) {
+			if isPixelTracker(tagName, token.Attr) {
+				continue
+			}
+			if isValidTag(tagName) {
 				attrNames, htmlAttributes := sanitizeAttributes(baseURL, tagName, token.Attr)
 
 				if hasRequiredAttributes(tagName, attrNames) {
@@ -131,16 +133,18 @@ func Sanitize(baseURL, input string) string {
 			}
 		case html.EndTagToken:
 			tagName := token.DataAtom.String()
-			if isValidTag(tagName) && inList(tagName, tagStack) {
-				buffer.WriteString(fmt.Sprintf("</%s>", tagName))
+			if isValidTag(tagName) && slices.Contains(tagStack, tagName) {
+				buffer.WriteString("</" + tagName + ">")
 			} else if isBlockedTag(tagName) {
 				blacklistedTagDepth--
 			}
 		case html.SelfClosingTagToken:
 			tagName := token.DataAtom.String()
-			if !isPixelTracker(tagName, token.Attr) && isValidTag(tagName) {
+			if isPixelTracker(tagName, token.Attr) {
+				continue
+			}
+			if isValidTag(tagName) {
 				attrNames, htmlAttributes := sanitizeAttributes(baseURL, tagName, token.Attr)
-
 				if hasRequiredAttributes(tagName, attrNames) {
 					if len(attrNames) > 0 {
 						buffer.WriteString("<" + tagName + " " + htmlAttributes + "/>")
@@ -187,11 +191,10 @@ func sanitizeAttributes(baseURL, tagName string, attributes []html.Attribute) ([
 
 		if isExternalResourceAttribute(attribute.Key) {
 			if tagName == "iframe" {
-				if isValidIframeSource(baseURL, attribute.Val) {
-					value = rewriteIframeURL(attribute.Val)
-				} else {
+				if !isValidIframeSource(baseURL, attribute.Val) {
 					continue
 				}
+				value = rewriteIframeURL(attribute.Val)
 			} else if tagName == "img" && attribute.Key == "src" && isValidDataAttribute(attribute.Val) {
 				value = attribute.Val
 			} else if isAnchor("a", attribute) {
@@ -248,7 +251,7 @@ func isValidTag(tagName string) bool {
 
 func isValidAttribute(tagName, attributeName string) bool {
 	if attributes, ok := tagAllowList[tagName]; ok {
-		return inList(attributeName, attributes)
+		return slices.Contains(attributes, attributeName)
 	}
 	return false
 }
@@ -263,24 +266,23 @@ func isExternalResourceAttribute(attribute string) bool {
 }
 
 func isPixelTracker(tagName string, attributes []html.Attribute) bool {
-	if tagName == "img" {
-		hasHeight := false
-		hasWidth := false
+	if tagName != "img" {
+		return false
+	}
+	hasHeight := false
+	hasWidth := false
 
-		for _, attribute := range attributes {
-			if attribute.Key == "height" && attribute.Val == "1" {
+	for _, attribute := range attributes {
+		if attribute.Val == "1" {
+			if attribute.Key == "height" {
 				hasHeight = true
-			}
-
-			if attribute.Key == "width" && attribute.Val == "1" {
+			} else if attribute.Key == "width" {
 				hasWidth = true
 			}
 		}
-
-		return hasHeight && hasWidth
 	}
 
-	return false
+	return hasHeight && hasWidth
 }
 
 func hasRequiredAttributes(tagName string, attributes []string) bool {
@@ -371,43 +373,31 @@ func isBlockedResource(src string) bool {
 
 func isValidIframeSource(baseURL, src string) bool {
 	whitelist := []string{
-		"//www.youtube.com",
-		"http://www.youtube.com",
-		"https://www.youtube.com",
-		"https://www.youtube-nocookie.com",
-		"http://player.vimeo.com",
-		"https://player.vimeo.com",
-		"http://www.dailymotion.com",
-		"https://www.dailymotion.com",
-		"http://vk.com",
-		"https://vk.com",
-		"http://soundcloud.com",
-		"https://soundcloud.com",
-		"http://w.soundcloud.com",
-		"https://w.soundcloud.com",
-		"http://bandcamp.com",
-		"https://bandcamp.com",
-		"https://cdn.embedly.com",
-		"https://player.bilibili.com",
-		"https://player.twitch.tv",
+		"bandcamp.com",
+		"cdn.embedly.com",
+		"player.bilibili.com",
+		"player.twitch.tv",
+		"player.vimeo.com",
+		"soundcloud.com",
+		"vk.com",
+		"w.soundcloud.com",
+		"dailymotion.com",
+		"youtube-nocookie.com",
+		"youtube.com",
 	}
+	domain := urllib.Domain(src)
 
 	// allow iframe from same origin
-	if urllib.Domain(baseURL) == urllib.Domain(src) {
+	if urllib.Domain(baseURL) == domain {
 		return true
 	}
 
 	// allow iframe from custom invidious instance
-	if config.Opts != nil && config.Opts.InvidiousInstance() == urllib.Domain(src) {
+	if config.Opts != nil && config.Opts.InvidiousInstance() == domain {
 		return true
 	}
 
-	return slices.ContainsFunc(whitelist, func(prefix string) bool {
-		return strings.HasPrefix(src, prefix)
-	})
-}
-func inList(needle string, haystack []string) bool {
-	return slices.Contains(haystack, needle)
+	return slices.Contains(whitelist, strings.TrimPrefix(domain, "www."))
 }
 
 func rewriteIframeURL(link string) string {

+ 19 - 0
internal/reader/sanitizer/sanitizer_test.go

@@ -16,6 +16,25 @@ func TestMain(m *testing.M) {
 	os.Exit(exitCode)
 }
 
+func BenchmarkSanitize(b *testing.B) {
+	var testCases = map[string][]string{
+		"miniflux_github.html":    {"https://github.com/miniflux/v2", ""},
+		"miniflux_wikipedia.html": {"https://fr.wikipedia.org/wiki/Miniflux", ""},
+	}
+	for filename := range testCases {
+		data, err := os.ReadFile("testdata/" + filename)
+		if err != nil {
+			b.Fatalf(`Unable to read file %q: %v`, filename, err)
+		}
+		testCases[filename][1] = string(data)
+	}
+	for range b.N {
+		for _, v := range testCases {
+			Sanitize(v[0], v[1])
+		}
+	}
+}
+
 func TestValidInput(t *testing.T) {
 	input := `<p>This is a <strong>text</strong> with an image: <img src="http://example.org/" alt="Test" loading="lazy">.</p>`
 	output := Sanitize("http://example.org/", input)

File diff suppressed because it is too large
+ 1584 - 0
internal/reader/sanitizer/testdata/miniflux_github.html


File diff suppressed because it is too large
+ 480 - 0
internal/reader/sanitizer/testdata/miniflux_wikipedia.html


Some files were not shown because too many files changed in this diff