// SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved. // SPDX-License-Identifier: Apache-2.0 package sanitizer // import "miniflux.app/v2/internal/reader/sanitizer" import ( "fmt" "os" "strings" "testing" "golang.org/x/net/html" "miniflux.app/v2/internal/config" ) func sanitizeHTMLWithDefaultOptions(baseURL, rawHTML string) string { return SanitizeHTML(baseURL, rawHTML, &SanitizerOptions{ OpenLinksInNewTab: true, }) } func BenchmarkSanitize(b *testing.B) { var testCases = map[string][]string{ "miniflux_github.html": {"https://github.com/miniflux/v2", ""}, "miniflux_wikipedia.html": {"https://fr.wikipedia.org/wiki/Miniflux", ""}, } for filename := range testCases { data, err := os.ReadFile("testdata/" + filename) if err != nil { b.Fatalf(`Unable to read file %q: %v`, filename, err) } testCases[filename][1] = string(data) } for b.Loop() { for _, v := range testCases { sanitizeHTMLWithDefaultOptions(v[0], v[1]) } } } func FuzzSanitizer(f *testing.F) { f.Fuzz(func(t *testing.T, orig string) { tok := html.NewTokenizer(strings.NewReader(orig)) i := 0 for tok.Next() != html.ErrorToken { i++ } out := sanitizeHTMLWithDefaultOptions("", orig) tok = html.NewTokenizer(strings.NewReader(out)) j := 0 for tok.Next() != html.ErrorToken { j++ } if j > i { t.Errorf("Got more html tokens in the sanitized html.") } }) } func TestValidInput(t *testing.T) { input := `

This is a text with an image: Test.

` output := sanitizeHTMLWithDefaultOptions("http://example.org/", input) if input != output { t.Errorf(`Wrong output: "%s" != "%s"`, input, output) } } func TestImgSanitization(t *testing.T) { baseURL := "http://example.org/" testCases := []struct { name string input string expected string }{ { name: "width-and-height-attributes", input: ``, expected: ``, }, { name: "invalid-width-and-height-attributes", input: ``, expected: ``, }, { name: "invalid-width-attribute", input: ``, expected: ``, }, { name: "empty-width-and-height-attributes", input: ``, expected: ``, }, { name: "invalid-height-attribute", input: ``, expected: ``, }, { name: "negative-width-attribute", input: ``, expected: ``, }, { name: "negative-height-attribute", input: ``, expected: ``, }, { name: "text-data-url", input: `Example`, expected: ``, }, { name: "image-data-url", input: `Example`, expected: `Example`, }, { name: "srcset-attribute", input: `Example`, expected: `Example`, }, { name: "srcset-attribute-without-src", input: `Example`, expected: `Example`, }, { name: "srcset-attribute-with-blocked-candidate", input: `Example`, expected: `Example`, }, { name: "srcset-attribute-all-candidates-invalid", input: `Example`, expected: ``, }, { name: "fetchpriority-high", input: ``, expected: ``, }, { name: "fetchpriority-low", input: ``, expected: ``, }, { name: "fetchpriority-auto", input: ``, expected: ``, }, { name: "fetchpriority-invalid", input: ``, expected: ``, }, { name: "decoding-sync", input: ``, expected: ``, }, { name: "decoding-async", input: ``, expected: ``, }, { name: "decoding-auto", input: ``, expected: ``, }, { name: "decoding-invalid", input: ``, expected: ``, }, } for _, tc := range testCases { t.Run(tc.name, func(t *testing.T) { output := sanitizeHTMLWithDefaultOptions(baseURL, tc.input) if output != tc.expected { t.Errorf(`Wrong output for input %q: expected %q, got %q`, tc.input, tc.expected, output) } }) } } func TestNonImgWithFetchPriorityAttribute(t *testing.T) { input := `

Text

` expected := `

Text

` output := sanitizeHTMLWithDefaultOptions("http://example.org/", input) if output != expected { t.Errorf(`Wrong output: expected %q, got %q`, expected, output) } } func TestNonImgWithDecodingAttribute(t *testing.T) { input := `

Text

` expected := `

Text

` output := sanitizeHTMLWithDefaultOptions("http://example.org/", input) if output != expected { t.Errorf(`Wrong output: expected %q, got %q`, expected, output) } } func TestMediumImgWithSrcset(t *testing.T) { input := `Image for post` expected := `Image for post` output := sanitizeHTMLWithDefaultOptions("http://example.org/", input) if output != expected { t.Errorf(`Wrong output: %s`, output) } } func TestSelfClosingTags(t *testing.T) { baseURL := "http://example.org/" testCases := []struct { name string input string expected string }{ { name: "br", input: `

Line
Break

`, expected: `

Line
Break

`, }, { name: "hr", input: `

Before


After

`, expected: `

Before


After

`, }, { name: "img", input: `

Image Test

`, expected: `

Image Test

`, }, { name: "source", input: ``, expected: ``, }, { name: "wbr", input: `

softbreak

`, expected: `

softbreak

`, }, } for _, tc := range testCases { t.Run(tc.name, func(t *testing.T) { output := sanitizeHTMLWithDefaultOptions(baseURL, tc.input) if output != tc.expected { t.Errorf(`Wrong output for input %q: expected %q, got %q`, tc.input, tc.expected, output) } }) } } func TestTable(t *testing.T) { input := `
AB
CDE
` output := sanitizeHTMLWithDefaultOptions("http://example.org/", input) if input != output { t.Errorf(`Wrong output: "%s" != "%s"`, input, output) } } func TestRelativeURL(t *testing.T) { input := `This link is relative and this image: ` expected := `This link is relative and this image: ` output := sanitizeHTMLWithDefaultOptions("http://example.org/", input) if expected != output { t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) } } func TestProtocolRelativeURL(t *testing.T) { input := `This link is relative.` expected := `This link is relative.` output := sanitizeHTMLWithDefaultOptions("http://example.org/", input) if expected != output { t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) } } func TestInvalidTag(t *testing.T) { input := `

My invalid tag.

` expected := `

My invalid tag.

` output := sanitizeHTMLWithDefaultOptions("http://example.org/", input) if expected != output { t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) } } func TestSourceSanitization(t *testing.T) { baseURL := "http://example.org/" testCases := []struct { name string input string expected string }{ { name: "srcset-and-media", input: ``, expected: ``, }, { name: "src-attribute", input: ``, expected: ``, }, { name: "srcset-with-blocked-candidate", input: ``, expected: ``, }, { name: "srcset-all-invalid", input: ``, expected: ``, }, } for _, tc := range testCases { t.Run(tc.name, func(t *testing.T) { output := sanitizeHTMLWithDefaultOptions(baseURL, tc.input) if output != tc.expected { t.Errorf(`Wrong output for input %q: expected %q, got %q`, tc.input, tc.expected, output) } }) } } func TestVideoTag(t *testing.T) { input := `

My valid .

` expected := `

My valid .

` output := sanitizeHTMLWithDefaultOptions("http://example.org/", input) if expected != output { t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) } } func TestAudioAndSourceTag(t *testing.T) { input := `

My music .

` expected := `

My music .

` output := sanitizeHTMLWithDefaultOptions("http://example.org/", input) if expected != output { t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) } } func TestUnknownTag(t *testing.T) { input := `

My invalid tag.

` expected := `

My invalid tag.

` output := sanitizeHTMLWithDefaultOptions("http://example.org/", input) if expected != output { t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) } } func TestInvalidNestedTag(t *testing.T) { input := `

My invalid tag with some valid tag.

` expected := `

My invalid tag with some valid tag.

` output := sanitizeHTMLWithDefaultOptions("http://example.org/", input) if expected != output { t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) } } func TestInvalidIFrame(t *testing.T) { config.Opts = config.NewConfigOptions() input := `` expected := `` output := sanitizeHTMLWithDefaultOptions("http://example.com/", input) if expected != output { t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) } } func TestBlockedIFrameWithChildElements(t *testing.T) { config.Opts = config.NewConfigOptions() input := `` expected := `` output := sanitizeHTMLWithDefaultOptions("http://example.com/", input) if expected != output { t.Errorf(`Wrong output: %q != %q`, expected, output) } } func TestSameDomainIFrame(t *testing.T) { config.Opts = config.NewConfigOptions() input := `` expected := `` output := sanitizeHTMLWithDefaultOptions("http://example.com/", input) if expected != output { t.Errorf(`Wrong output: %q != %q`, expected, output) } } func TestInvidiousIFrame(t *testing.T) { config.Opts = config.NewConfigOptions() input := `` expected := `` output := sanitizeHTMLWithDefaultOptions("http://example.com/", input) if expected != output { t.Errorf(`Wrong output: %q != %q`, expected, output) } } func TestIFrameAllowList(t *testing.T) { config.Opts = config.NewConfigOptions() allowedDomains := []string{ "bandcamp.com", "cdn.embedly.com", "dailymotion.com", "framatube.org", "open.spotify.com", "player.bilibili.com", "player.twitch.tv", "player.vimeo.com", "soundcloud.com", "vk.com", "w.soundcloud.com", "youtube-nocookie.com", "youtube.com", } for _, domain := range allowedDomains { t.Run(domain, func(t *testing.T) { input := fmt.Sprintf(``, domain) output := sanitizeHTMLWithDefaultOptions("http://example.com/", input) if !strings.Contains(output, "` expected := `` output := sanitizeHTMLWithDefaultOptions("http://example.com/", input) if expected != output { t.Errorf(`Wrong output: %q != %q`, expected, output) } } func TestIFrameWithChildElements(t *testing.T) { config.Opts = config.NewConfigOptions() input := `` expected := `` output := sanitizeHTMLWithDefaultOptions("http://example.com/", input) if expected != output { t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) } } func TestIFrameWithReferrerPolicy(t *testing.T) { config.Opts = config.NewConfigOptions() input := `` expected := `` output := sanitizeHTMLWithDefaultOptions("http://example.com/", input) if expected != output { t.Errorf(`Wrong output: %q != %q`, expected, output) } } func TestLinkWithTarget(t *testing.T) { input := `

This link is an anchor

` expected := `

This link is an anchor

` output := SanitizeHTML("http://example.org/", input, &SanitizerOptions{OpenLinksInNewTab: true}) if expected != output { t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) } } func TestLinkWithNoTarget(t *testing.T) { input := `

This link is an anchor

` expected := `

This link is an anchor

` output := SanitizeHTML("http://example.org/", input, &SanitizerOptions{OpenLinksInNewTab: false}) if expected != output { t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) } } func TestAnchorLink(t *testing.T) { input := `

This link is an anchor

` expected := `

This link is an anchor

` output := sanitizeHTMLWithDefaultOptions("http://example.org/", input) if expected != output { t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) } } func TestInvalidURLScheme(t *testing.T) { input := `

This link is not valid

` expected := `

This link is not valid

` output := sanitizeHTMLWithDefaultOptions("http://example.org/", input) if expected != output { t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) } } func TestURISchemes(t *testing.T) { baseURL := "http://example.org/" testCases := []struct { name string input string expected string }{ { name: "apt", input: `

This link is valid

`, expected: `

This link is valid

`, }, { name: "bitcoin", input: `

This link is valid

`, expected: `

This link is valid

`, }, { name: "callto", input: `

This link is valid

`, expected: `

This link is valid

`, }, { name: "feed-double-slash", input: `

This link is valid

`, expected: `

This link is valid

`, }, { name: "feed-https", input: `

This link is valid

`, expected: `

This link is valid

`, }, { name: "geo", input: `

This link is valid

`, expected: `

This link is valid

`, }, { name: "itms", input: `

This link is valid

`, expected: `

This link is valid

`, }, { name: "itms-apps", input: `

This link is valid

`, expected: `

This link is valid

`, }, { name: "magnet", input: `

This link is valid

`, expected: `

This link is valid

`, }, { name: "mailto", input: `

This link is valid

`, expected: `

This link is valid

`, }, { name: "news-double-slash", input: `

This link is valid

`, expected: `

This link is valid

`, }, { name: "news-single-colon", input: `

This link is valid

`, expected: `

This link is valid

`, }, { name: "nntp", input: `

This link is valid

`, expected: `

This link is valid

`, }, { name: "rtmp", input: `

This link is valid

`, expected: `

This link is valid

`, }, { name: "sip", input: `

This link is valid

`, expected: `

This link is valid

`, }, { name: "sips", input: `

This link is valid

`, expected: `

This link is valid

`, }, { name: "skype", input: `

This link is valid

`, expected: `

This link is valid

`, }, { name: "spotify", input: `

This link is valid

`, expected: `

This link is valid

`, }, { name: "steam", input: `

This link is valid

`, expected: `

This link is valid

`, }, { name: "svn", input: `

This link is valid

`, expected: `

This link is valid

`, }, { name: "svn-ssh", input: `

This link is valid

`, expected: `

This link is valid

`, }, { name: "tel", input: `

This link is valid

`, expected: `

This link is valid

`, }, { name: "webcal", input: `

This link is valid

`, expected: `

This link is valid

`, }, { name: "xmpp", input: `

This link is valid

`, expected: `

This link is valid

`, }, } for _, tc := range testCases { t.Run(tc.name, func(t *testing.T) { output := sanitizeHTMLWithDefaultOptions(baseURL, tc.input) if tc.expected != output { t.Errorf(`Wrong output for input %q: expected %q, got %q`, tc.input, tc.expected, output) } }) } } func TestBlacklistedLink(t *testing.T) { input := `

This image is not valid

` expected := `

This image is not valid

` output := sanitizeHTMLWithDefaultOptions("http://example.org/", input) if expected != output { t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) } } func TestLinkWithTrackers(t *testing.T) { input := `

This link has trackers Test

` expected := `

This link has trackers Test

` output := sanitizeHTMLWithDefaultOptions("http://example.org/", input) if expected != output { t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) } } func TestImageSrcWithTrackers(t *testing.T) { input := `

This image has trackers

` expected := `

This image has trackers

` output := sanitizeHTMLWithDefaultOptions("http://example.org/", input) if expected != output { t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) } } func Test1x1PixelTracker(t *testing.T) { input := `

and

` expected := `

and

` output := sanitizeHTMLWithDefaultOptions("http://example.org/", input) if expected != output { t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) } } func Test0x0PixelTracker(t *testing.T) { input := `

and

` expected := `

and

` output := sanitizeHTMLWithDefaultOptions("http://example.org/", input) if expected != output { t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) } } func TestXmlEntities(t *testing.T) { input := `
echo "test" > /etc/hosts
` expected := `
echo "test" > /etc/hosts
` output := sanitizeHTMLWithDefaultOptions("http://example.org/", input) if expected != output { t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) } } func TestEspaceAttributes(t *testing.T) { input := `text` expected := `text` output := sanitizeHTMLWithDefaultOptions("http://example.org/", input) if expected != output { t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) } } func TestReplaceYoutubeURL(t *testing.T) { os.Clearenv() var err error config.Opts, err = config.NewConfigParser().ParseEnvironmentVariables() if err != nil { t.Fatalf(`Parsing failure: %v`, err) } input := `` expected := `` output := sanitizeHTMLWithDefaultOptions("http://example.org/", input) if expected != output { t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) } } func TestReplaceSecureYoutubeURL(t *testing.T) { os.Clearenv() var err error config.Opts, err = config.NewConfigParser().ParseEnvironmentVariables() if err != nil { t.Fatalf(`Parsing failure: %v`, err) } input := `` expected := `` output := sanitizeHTMLWithDefaultOptions("http://example.org/", input) if expected != output { t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) } } func TestReplaceSecureYoutubeURLWithParameters(t *testing.T) { os.Clearenv() var err error config.Opts, err = config.NewConfigParser().ParseEnvironmentVariables() if err != nil { t.Fatalf(`Parsing failure: %v`, err) } input := `` expected := `` output := sanitizeHTMLWithDefaultOptions("http://example.org/", input) if expected != output { t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) } } func TestReplaceYoutubeURLAlreadyReplaced(t *testing.T) { os.Clearenv() var err error config.Opts, err = config.NewConfigParser().ParseEnvironmentVariables() if err != nil { t.Fatalf(`Parsing failure: %v`, err) } input := `` expected := `` output := sanitizeHTMLWithDefaultOptions("http://example.org/", input) if expected != output { t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) } } func TestReplaceProtocolRelativeYoutubeURL(t *testing.T) { os.Clearenv() var err error config.Opts, err = config.NewConfigParser().ParseEnvironmentVariables() if err != nil { t.Fatalf(`Parsing failure: %v`, err) } input := `` expected := `` output := sanitizeHTMLWithDefaultOptions("http://example.org/", input) if expected != output { t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) } } func TestReplaceYoutubeURLWithCustomURL(t *testing.T) { defer os.Clearenv() os.Setenv("YOUTUBE_EMBED_URL_OVERRIDE", "https://invidious.custom/embed/") var err error config.Opts, err = config.NewConfigParser().ParseEnvironmentVariables() if err != nil { t.Fatalf(`Parsing failure: %v`, err) } input := `` expected := `` output := sanitizeHTMLWithDefaultOptions("http://example.org/", input) if expected != output { t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) } } func TestVimeoIframeRewriteWithQueryString(t *testing.T) { input := `` expected := `` output := sanitizeHTMLWithDefaultOptions("http://example.org/", input) if expected != output { t.Errorf(`Wrong output: %q != %q`, expected, output) } } func TestVimeoIframeRewriteWithoutQueryString(t *testing.T) { input := `` expected := `` output := sanitizeHTMLWithDefaultOptions("http://example.org/", input) if expected != output { t.Errorf(`Wrong output: %q != %q`, expected, output) } } func TestReplaceNoScript(t *testing.T) { input := `

Before paragraph.

After paragraph.

` expected := `

Before paragraph.

After paragraph.

` output := sanitizeHTMLWithDefaultOptions("http://example.org/", input) if expected != output { t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) } } func TestReplaceScript(t *testing.T) { input := `

Before paragraph.

After paragraph.

` expected := `

Before paragraph.

After paragraph.

` output := sanitizeHTMLWithDefaultOptions("http://example.org/", input) if expected != output { t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) } } func TestReplaceStyle(t *testing.T) { input := `

Before paragraph.

After paragraph.

` expected := `

Before paragraph.

After paragraph.

` output := sanitizeHTMLWithDefaultOptions("http://example.org/", input) if expected != output { t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) } } func TestHiddenParagraph(t *testing.T) { input := `

Before paragraph.

After paragraph.

` expected := `

Before paragraph.

After paragraph.

` output := sanitizeHTMLWithDefaultOptions("http://example.org/", input) if expected != output { t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) } } func TestAttributesAreStripped(t *testing.T) { input := `

Some text.


Test.

` expected := `

Some text.


Test.

` output := sanitizeHTMLWithDefaultOptions("http://example.org/", input) if expected != output { t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) } } func TestMathML(t *testing.T) { input := `x2` expected := `x2` output := sanitizeHTMLWithDefaultOptions("http://example.org/", input) if expected != output { t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) } } func TestInvalidMathMLXMLNamespace(t *testing.T) { input := `x2` expected := `x2` output := sanitizeHTMLWithDefaultOptions("http://example.org/", input) if expected != output { t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) } } func TestBlockedResourcesSubstrings(t *testing.T) { input := `

Before paragraph.

Blocked Resource

After paragraph.

` expected := `

Before paragraph.

After paragraph.

` output := sanitizeHTMLWithDefaultOptions("http://example.org/", input) if expected != output { t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) } input = `

Before paragraph.

Blocked Resource

After paragraph.

` expected = `

Before paragraph.

After paragraph.

` output = sanitizeHTMLWithDefaultOptions("http://example.org/", input) if expected != output { t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) } input = `

Before paragraph.

Blocked Resource

After paragraph.

` expected = `

Before paragraph.

After paragraph.

` output = sanitizeHTMLWithDefaultOptions("http://example.org/", input) if expected != output { t.Errorf(`Wrong output: "%s" != "%s"`, expected, output) } } func TestAttrLowerCase(t *testing.T) { baseURL := "http://example.org/" testCases := []struct { name string input string expected string }{ { name: "href-and-hidden-mixed-case", input: ``, expected: ``, }, { name: "href-mixed-case", input: `test`, expected: `test`, }, } for _, tc := range testCases { t.Run(tc.name, func(t *testing.T) { output := sanitizeHTMLWithDefaultOptions(baseURL, tc.input) if tc.expected != output { t.Errorf(`Wrong output for input %q: expected %q, got %q`, tc.input, tc.expected, output) } }) } } func TestDeeplyNestedpage(t *testing.T) { input := "test" // -3 instead of -1 because is automatically added. for range maxDepth - 3 { input = "
" + input + "
" } output := sanitizeHTMLWithDefaultOptions("http://example.org/", input) want := "test" if output != want { t.Errorf(`Wrong output: "%s" != "%s"`, want, output) } input = "test" for range maxDepth - 2 { input = "
" + input + "
" } output = sanitizeHTMLWithDefaultOptions("http://example.org/", input) if output != "" { t.Errorf(`Wrong output: "%s" != "%s"`, "", output) } }