Quellcode durchsuchen

Improve sanitizer to remove script and noscript contents

These tags were removed but the content was rendered as escaped HTML.

See #157
Dave Z vor 7 Jahren
Ursprung
Commit
d847b10e32
2 geänderte Dateien mit 33 neuen und 0 gelöschten Zeilen
  1. 13 0
      reader/sanitizer/sanitizer.go
  2. 20 0
      reader/sanitizer/sanitizer_test.go

+ 13 - 0
reader/sanitizer/sanitizer.go

@@ -25,6 +25,7 @@ func Sanitize(baseURL, input string) string {
 	tokenizer := html.NewTokenizer(bytes.NewBufferString(input))
 	var buffer bytes.Buffer
 	var tagStack []string
+	scriptTagDepth := 0
 
 	for {
 		if tokenizer.Next() == html.ErrorToken {
@@ -39,6 +40,10 @@ func Sanitize(baseURL, input string) string {
 		token := tokenizer.Token()
 		switch token.Type {
 		case html.TextToken:
+			if scriptTagDepth > 0 {
+				continue
+			}
+
 			buffer.WriteString(html.EscapeString(token.Data))
 		case html.StartTagToken:
 			tagName := token.DataAtom.String()
@@ -55,11 +60,15 @@ func Sanitize(baseURL, input string) string {
 
 					tagStack = append(tagStack, tagName)
 				}
+			} else if isScriptTag(tagName) {
+				scriptTagDepth++
 			}
 		case html.EndTagToken:
 			tagName := token.DataAtom.String()
 			if isValidTag(tagName) && inList(tagName, tagStack) {
 				buffer.WriteString(fmt.Sprintf("</%s>", tagName))
+			} else if isScriptTag(tagName) {
+				scriptTagDepth--
 			}
 		case html.SelfClosingTagToken:
 			tagName := token.DataAtom.String()
@@ -384,3 +393,7 @@ func rewriteIframeURL(link string) string {
 
 	return link
 }
+
+func isScriptTag(tagName string) bool {
+	return tagName == "script" || tagName == "noscript"
+}

+ 20 - 0
reader/sanitizer/sanitizer_test.go

@@ -212,3 +212,23 @@ func TestReplaceIframeURL(t *testing.T) {
 		t.Errorf(`Wrong output: "%s" != "%s"`, expected, output)
 	}
 }
+
+func TestReplaceNoScript(t *testing.T) {
+	input := `<p>Before paragraph.</p><noscript>Inside <code>noscript</code> tag with an image: <img src="http://example.org/" alt="Test"></noscript><p>After paragraph.</p>`
+	expected := `<p>Before paragraph.</p><p>After paragraph.</p>`
+	output := Sanitize("http://example.org/", input)
+
+	if expected != output {
+		t.Errorf(`Wrong output: "%s" != "%s"`, expected, output)
+	}
+}
+
+func TestReplaceScript(t *testing.T) {
+	input := `<p>Before paragraph.</p><script type="text/javascript">alert("1");</script><p>After paragraph.</p>`
+	expected := `<p>Before paragraph.</p><p>After paragraph.</p>`
+	output := Sanitize("http://example.org/", input)
+
+	if expected != output {
+		t.Errorf(`Wrong output: "%s" != "%s"`, expected, output)
+	}
+}