// SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved. // SPDX-License-Identifier: Apache-2.0 package sanitizer // import "miniflux.app/v2/internal/reader/sanitizer" import ( "fmt" "os" "strings" "testing" "golang.org/x/net/html" "miniflux.app/v2/internal/config" ) func sanitizeHTMLWithDefaultOptions(baseURL, rawHTML string) string { return SanitizeHTML(baseURL, rawHTML, &SanitizerOptions{ OpenLinksInNewTab: true, }) } func BenchmarkSanitize(b *testing.B) { var testCases = map[string][]string{ "miniflux_github.html": {"https://github.com/miniflux/v2", ""}, "miniflux_wikipedia.html": {"https://fr.wikipedia.org/wiki/Miniflux", ""}, } for filename := range testCases { data, err := os.ReadFile("testdata/" + filename) if err != nil { b.Fatalf(`Unable to read file %q: %v`, filename, err) } testCases[filename][1] = string(data) } for b.Loop() { for _, v := range testCases { sanitizeHTMLWithDefaultOptions(v[0], v[1]) } } } func FuzzSanitizer(f *testing.F) { f.Fuzz(func(t *testing.T, orig string) { tok := html.NewTokenizer(strings.NewReader(orig)) i := 0 for tok.Next() != html.ErrorToken { i++ } out := sanitizeHTMLWithDefaultOptions("", orig) tok = html.NewTokenizer(strings.NewReader(out)) j := 0 for tok.Next() != html.ErrorToken { j++ } if j > i { t.Errorf("Got more html tokens in the sanitized html.") } }) } func TestValidInput(t *testing.T) { input := `
This is a text with an image: .
`,
expected: `
`,
},
{
name: "invalid-width-attribute",
input: `
`,
expected: `
`,
},
{
name: "invalid-height-attribute",
input: `
`,
expected: `
`,
},
{
name: "srcset-attribute-without-src",
input: `
`,
expected: `
`,
},
{
name: "srcset-attribute-all-candidates-invalid",
input: `
`,
expected: `
`,
},
{
name: "fetchpriority-low",
input: `
`,
expected: `
`,
},
{
name: "fetchpriority-auto",
input: `
`,
expected: `
`,
},
{
name: "fetchpriority-invalid",
input: `
`,
expected: `
`,
},
{
name: "decoding-sync",
input: `
`,
expected: `
`,
},
{
name: "decoding-async",
input: `
`,
expected: `
`,
},
{
name: "decoding-auto",
input: `
`,
expected: `
`,
},
{
name: "decoding-invalid",
input: `
`,
expected: `
`,
},
}
for _, tc := range testCases {
t.Run(tc.name, func(t *testing.T) {
output := sanitizeHTMLWithDefaultOptions(baseURL, tc.input)
if output != tc.expected {
t.Errorf(`Wrong output for input %q: expected %q, got %q`, tc.input, tc.expected, output)
}
})
}
}
func TestNonImgWithFetchPriorityAttribute(t *testing.T) {
input := `Text
` expected := `Text
` output := sanitizeHTMLWithDefaultOptions("http://example.org/", input) if output != expected { t.Errorf(`Wrong output: expected %q, got %q`, expected, output) } } func TestNonImgWithDecodingAttribute(t *testing.T) { input := `Text
` expected := `Text
` output := sanitizeHTMLWithDefaultOptions("http://example.org/", input) if output != expected { t.Errorf(`Wrong output: expected %q, got %q`, expected, output) } } func TestMediumImgWithSrcset(t *testing.T) { input := `
`
expected := `
`
output := sanitizeHTMLWithDefaultOptions("http://example.org/", input)
if output != expected {
t.Errorf(`Wrong output: %s`, output)
}
}
func TestSelfClosingTags(t *testing.T) {
baseURL := "http://example.org/"
testCases := []struct {
name string
input string
expected string
}{
{
name: "br",
input: `Line
Break
Line
Break
Before
After
`, expected: `Before
After
`, }, { name: "img", input: `Image 
Image 
soft
soft
| A | B | |
|---|---|---|
| C | D | E |
`
expected := `This link is relative and this image:
`
output := sanitizeHTMLWithDefaultOptions("http://example.org/", input)
if expected != output {
t.Errorf(`Wrong output: "%s" != "%s"`, expected, output)
}
}
// TestProtocolRelativeURL runs the sanitizer with the default test options
// against the base URL http://example.org/ and reports an error when the
// result is not equal to the expected fixture.
func TestProtocolRelativeURL(t *testing.T) {
	const (
		raw  = `This link is relative.`
		want = `This link is relative.`
	)

	got := sanitizeHTMLWithDefaultOptions("http://example.org/", raw)
	if want == got {
		return
	}
	t.Errorf(`Wrong output: "%s" != "%s"`, want, got)
}
func TestInvalidTag(t *testing.T) {
input := `My invalid
My invalid tag.
` output := sanitizeHTMLWithDefaultOptions("http://example.org/", input) if expected != output { t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) } } func TestSourceSanitization(t *testing.T) { baseURL := "http://example.org/" testCases := []struct { name string input string expected string }{ { name: "srcset-and-media", input: `My valid .
` expected := `My valid .
` output := sanitizeHTMLWithDefaultOptions("http://example.org/", input) if expected != output { t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) } } func TestAudioAndSourceTag(t *testing.T) { input := `My music .
` expected := `My music .
` output := sanitizeHTMLWithDefaultOptions("http://example.org/", input) if expected != output { t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) } } func TestUnknownTag(t *testing.T) { input := `My invalid
My invalid tag.
` output := sanitizeHTMLWithDefaultOptions("http://example.org/", input) if expected != output { t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) } } func TestInvalidNestedTag(t *testing.T) { input := `My invalid
My invalid tag with some valid tag.
` output := sanitizeHTMLWithDefaultOptions("http://example.org/", input) if expected != output { t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) } } func TestInvalidIFrame(t *testing.T) { config.Opts = config.NewConfigOptions() input := `` expected := `` output := sanitizeHTMLWithDefaultOptions("http://example.com/", input) if expected != output { t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) } } func TestBlockedIFrameWithChildElements(t *testing.T) { config.Opts = config.NewConfigOptions() input := `` expected := `` output := sanitizeHTMLWithDefaultOptions("http://example.com/", input) if expected != output { t.Errorf(`Wrong output: %q != %q`, expected, output) } } func TestSameDomainIFrame(t *testing.T) { config.Opts = config.NewConfigOptions() input := `` expected := `` output := sanitizeHTMLWithDefaultOptions("http://example.com/", input) if expected != output { t.Errorf(`Wrong output: %q != %q`, expected, output) } } func TestInvidiousIFrame(t *testing.T) { config.Opts = config.NewConfigOptions() input := `` expected := `` output := sanitizeHTMLWithDefaultOptions("http://example.com/", input) if expected != output { t.Errorf(`Wrong output: %q != %q`, expected, output) } } func TestIFrameAllowList(t *testing.T) { config.Opts = config.NewConfigOptions() allowedDomains := []string{ "bandcamp.com", "cdn.embedly.com", "dailymotion.com", "framatube.org", "open.spotify.com", "player.bilibili.com", "player.twitch.tv", "player.vimeo.com", "soundcloud.com", "vk.com", "w.soundcloud.com", "youtube-nocookie.com", "youtube.com", } for _, domain := range allowedDomains { t.Run(domain, func(t *testing.T) { input := fmt.Sprintf(``, domain) output := sanitizeHTMLWithDefaultOptions("http://example.com/", input) if !strings.Contains(output, "` expected := `` output := sanitizeHTMLWithDefaultOptions("http://example.com/", input) if expected != output { t.Errorf(`Wrong output: %q != %q`, expected, output) } } func TestIFrameWithChildElements(t *testing.T) { 
config.Opts = config.NewConfigOptions() input := `` expected := `` output := sanitizeHTMLWithDefaultOptions("http://example.com/", input) if expected != output { t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) } } func TestIFrameWithReferrerPolicy(t *testing.T) { config.Opts = config.NewConfigOptions() input := `` expected := `` output := sanitizeHTMLWithDefaultOptions("http://example.com/", input) if expected != output { t.Errorf(`Wrong output: %q != %q`, expected, output) } } func TestLinkWithTarget(t *testing.T) { input := `This link is an anchor
` expected := `This link is an anchor
` output := SanitizeHTML("http://example.org/", input, &SanitizerOptions{OpenLinksInNewTab: true}) if expected != output { t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) } } func TestLinkWithNoTarget(t *testing.T) { input := `This link is an anchor
` expected := `This link is an anchor
` output := SanitizeHTML("http://example.org/", input, &SanitizerOptions{OpenLinksInNewTab: false}) if expected != output { t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) } } func TestAnchorLink(t *testing.T) { input := `This link is an anchor
` expected := `This link is an anchor
` output := sanitizeHTMLWithDefaultOptions("http://example.org/", input) if expected != output { t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) } } func TestInvalidURLScheme(t *testing.T) { input := `This link is not valid
` expected := `This link is not valid
` output := sanitizeHTMLWithDefaultOptions("http://example.org/", input) if expected != output { t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) } } func TestURISchemes(t *testing.T) { baseURL := "http://example.org/" testCases := []struct { name string input string expected string }{ { name: "apt", input: `This link is valid
`, expected: `This link is valid
`, }, { name: "bitcoin", input: `This link is valid
`, expected: `This link is valid
`, }, { name: "callto", input: `This link is valid
`, expected: `This link is valid
`, }, { name: "feed-double-slash", input: `This link is valid
`, expected: `This link is valid
`, }, { name: "feed-https", input: `This link is valid
`, expected: `This link is valid
`, }, { name: "geo", input: `This link is valid
`, expected: `This link is valid
`, }, { name: "itms", input: `This link is valid
`, expected: `This link is valid
`, }, { name: "itms-apps", input: `This link is valid
`, expected: `This link is valid
`, }, { name: "magnet", input: `This link is valid
`, expected: `This link is valid
`, }, { name: "mailto", input: `This link is valid
`, expected: `This link is valid
`, }, { name: "news-double-slash", input: `This link is valid
`, expected: `This link is valid
`, }, { name: "news-single-colon", input: `This link is valid
`, expected: `This link is valid
`, }, { name: "nntp", input: `This link is valid
`, expected: `This link is valid
`, }, { name: "rtmp", input: `This link is valid
`, expected: `This link is valid
`, }, { name: "sip", input: `This link is valid
`, expected: `This link is valid
`, }, { name: "sips", input: `This link is valid
`, expected: `This link is valid
`, }, { name: "skype", input: `This link is valid
`, expected: `This link is valid
`, }, { name: "spotify", input: `This link is valid
`, expected: `This link is valid
`, }, { name: "steam", input: `This link is valid
`, expected: `This link is valid
`, }, { name: "svn", input: `This link is valid
`, expected: `This link is valid
`, }, { name: "svn-ssh", input: `This link is valid
`, expected: `This link is valid
`, }, { name: "tel", input: `This link is valid
`, expected: `This link is valid
`, }, { name: "webcal", input: `This link is valid
`, expected: `This link is valid
`, }, { name: "xmpp", input: `This link is valid
`, expected: `This link is valid
`, }, } for _, tc := range testCases { t.Run(tc.name, func(t *testing.T) { output := sanitizeHTMLWithDefaultOptions(baseURL, tc.input) if tc.expected != output { t.Errorf(`Wrong output for input %q: expected %q, got %q`, tc.input, tc.expected, output) } }) } } func TestBlacklistedLink(t *testing.T) { input := `This image is not valid
This image is not valid
` output := sanitizeHTMLWithDefaultOptions("http://example.org/", input) if expected != output { t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) } } func TestLinkWithTrackers(t *testing.T) { input := `This link has trackers Test
` expected := `This link has trackers Test
` output := sanitizeHTMLWithDefaultOptions("http://example.org/", input) if expected != output { t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) } } func TestImageSrcWithTrackers(t *testing.T) { input := `This image has trackers
This image has trackers
and
and
` output := sanitizeHTMLWithDefaultOptions("http://example.org/", input) if expected != output { t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) } } func Test0x0PixelTracker(t *testing.T) { input := ` and
and
` output := sanitizeHTMLWithDefaultOptions("http://example.org/", input) if expected != output { t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) } } func TestXmlEntities(t *testing.T) { input := `echo "test" > /etc/hosts` expected := `
echo "test" > /etc/hosts` output := sanitizeHTMLWithDefaultOptions("http://example.org/", input) if expected != output { t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) } } func TestEspaceAttributes(t *testing.T) { input := `
Before paragraph.
After paragraph.
` expected := `Before paragraph.
After paragraph.
` output := sanitizeHTMLWithDefaultOptions("http://example.org/", input) if expected != output { t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) } } func TestReplaceScript(t *testing.T) { input := `Before paragraph.
After paragraph.
` expected := `Before paragraph.
After paragraph.
` output := sanitizeHTMLWithDefaultOptions("http://example.org/", input) if expected != output { t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) } } func TestReplaceStyle(t *testing.T) { input := `Before paragraph.
After paragraph.
` expected := `Before paragraph.
After paragraph.
` output := sanitizeHTMLWithDefaultOptions("http://example.org/", input) if expected != output { t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) } } func TestHiddenParagraph(t *testing.T) { input := `Before paragraph.
This should not appear in the output
After paragraph.
` expected := `Before paragraph.
After paragraph.
` output := sanitizeHTMLWithDefaultOptions("http://example.org/", input) if expected != output { t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) } } func TestAttributesAreStripped(t *testing.T) { input := `Some text.
Some text.
Before paragraph.
After paragraph.
` expected := `Before paragraph.
After paragraph.
` output := sanitizeHTMLWithDefaultOptions("http://example.org/", input) if expected != output { t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) } input = `Before paragraph.
After paragraph.
` expected = `Before paragraph.
After paragraph.
` output = sanitizeHTMLWithDefaultOptions("http://example.org/", input) if expected != output { t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) } input = `Before paragraph.
After paragraph.
` expected = `Before paragraph.
After paragraph.
` output = sanitizeHTMLWithDefaultOptions("http://example.org/", input) if expected != output { t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) } } func TestAttrLowerCase(t *testing.T) { baseURL := "http://example.org/" testCases := []struct { name string input string expected string }{ { name: "href-and-hidden-mixed-case", input: `test`, expected: ``, }, { name: "href-mixed-case", input: `test`, expected: `test`, }, } for _, tc := range testCases { t.Run(tc.name, func(t *testing.T) { output := sanitizeHTMLWithDefaultOptions(baseURL, tc.input) if tc.expected != output { t.Errorf(`Wrong output for input %q: expected %q, got %q`, tc.input, tc.expected, output) } }) } } func TestDeeplyNestedpage(t *testing.T) { input := "test" // -3 instead of -1 because is automatically added. for range maxDepth - 3 { input = "
`,
expected: `
`,
},
{
name: "srcset-attribute-with-blocked-candidate",
input: `