// SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
// SPDX-License-Identifier: Apache-2.0
package sanitizer
import (
"os"
"strconv"
"testing"
)
// TestTruncateHTML is a table-driven test: every entry feeds input and maxLen
// to TruncateHTML and requires the returned string to equal expected exactly.
//
// Taken together, the expectations in the table require that:
//   - leading/trailing whitespace is dropped and runs of spaces, tabs and
//     newlines come back as a single space;
//   - maxLen is measured in characters rather than bytes (see the Cyrillic and
//     emoji cases);
//   - a "…" is appended only when the text had to be cut;
//   - a zero or negative maxLen yields just "…".
//
// NOTE(review): several cases are named after HTML tags ("html stripping",
// "spaces around tag", "leading space before tag", ...) yet their inputs hold
// no markup in this copy of the file; presumably the tags were lost somewhere
// along the way — confirm against the upstream test before trusting them.
func TestTruncateHTML(t *testing.T) {
tests := []struct {
name string
input string
maxLen int
expected string
}{
{
name: "text lower than limit",
input: "This is a bug 🐛.",
maxLen: 50,
expected: "This is a bug 🐛.",
},
{
name: "text above limit",
input: "This is HTML.",
maxLen: 4,
expected: "This…",
},
{
name: "unicode text above limit",
input: "This is a bike 🚲.",
maxLen: 4,
expected: "This…",
},
{
name: "multiline text above limit",
input: "\n\t\tThis is a bike\n\t\t🚲.\n\n\t",
maxLen: 15,
expected: "This is a bike…",
},
{
name: "multiline text lower than limit",
input: "\n\t\tThis is a bike\n 🚲.\n\n\t",
maxLen: 20,
expected: "This is a bike 🚲.",
},
{
name: "multiple spaces",
input: "hello world test",
maxLen: 20,
expected: "hello world test",
},
{
name: "tabs and newlines",
input: "hello\t\tworld\n\ntest",
maxLen: 20,
expected: "hello world test",
},
{
name: "truncation with unicode",
input: "hello world 你好",
maxLen: 11,
expected: "hello world…",
},
// NOTE(review): the input literal of the next case spans several lines
// inside double quotes, which Go's interpreted string literals do not
// allow; the original (presumably tagged) input needs to be restored.
{
name: "html stripping",
input: "
hello world test
",
maxLen: 20,
expected: "hello world test",
},
{
name: "no truncation needed",
input: "hello world",
maxLen: 20,
expected: "hello world",
},
// The next two cases sit exactly on the limit: no ellipsis is expected.
{
name: "just enough characters",
input: "Hello",
maxLen: 5,
expected: "Hello",
},
{
name: "just enough unicode characters",
input: "Привет",
maxLen: 6,
expected: "Привет",
},
// NOTE(review): same problem as "html stripping" — the input literal is
// broken across two lines inside double quotes.
{
name: "spaces around tag",
input: "hello
world",
maxLen: 20,
expected: "hello world",
},
{
name: "leading spaces",
input: " hello world",
maxLen: 5,
expected: "hello…",
},
// The cut lands right after the space following "hello"; the expected
// value has no space before the ellipsis.
{
name: "text above limit with space at the end",
input: "hello world",
maxLen: 6,
expected: "hello…",
},
{
name: "leading space before tag",
input: " hello",
maxLen: 15,
expected: "hello",
},
// NOTE(review): the input literal below is also split across two lines
// inside double quotes; restore the original single-line input.
{
name: "space-only tokens in between tags",
input: "hello
\t world",
maxLen: 15,
expected: "hello world",
},
{
name: "truncate mid-word",
input: "hello world",
maxLen: 8,
expected: "hello wo…",
},
{
name: "truncate mid-word with unicode",
input: "Съешь ещё этих мягких французских булок, да выпей же чаю",
maxLen: 25,
expected: "Съешь ещё этих мягких фра…",
},
// Non-positive limits: only the ellipsis is expected back.
{
name: "negative limit",
input: "whatever",
maxLen: -10,
expected: "…",
},
{
name: "zero limit",
input: "whatever",
maxLen: 0,
expected: "…",
},
}
// One subtest per entry, so a failure is reported under the case name.
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
result := TruncateHTML(tt.input, tt.maxLen)
if result != tt.expected {
t.Errorf("TruncateHTML(%q, %d) = %q, want %q",
tt.input, tt.maxLen, result, tt.expected)
}
})
}
}
// BenchmarkTruncateHTML times TruncateHTML on the HTML fixtures stored under
// testdata, running one sub-benchmark per (file, limit) pair in the table.
// Each sub-benchmark is named "<filename>_<limit>" and calls TruncateHTML with
// that same limit, so the small-limit and large-limit runs of a given file
// measure different workloads.
func BenchmarkTruncateHTML(b *testing.B) {
	benches := []struct {
		filename string // fixture name, relative to testdata/
		limit    int    // limit handed to TruncateHTML
	}{
		{
			filename: "miniflux_github.html",
			limit:    100,
		},
		{
			filename: "miniflux_github.html",
			limit:    10_000,
		},
		{
			filename: "miniflux_wikipedia.html",
			limit:    100,
		},
		{
			filename: "miniflux_wikipedia.html",
			limit:    100_000,
		},
	}

	for _, bench := range benches {
		// The fixture is read outside b.Run, so file I/O is never part of a
		// sub-benchmark's measurement.
		data, err := os.ReadFile("testdata/" + bench.filename)
		if err != nil {
			b.Fatalf(`Unable to read file %q: %v`, bench.filename, err)
		}

		b.Run(bench.filename+"_"+strconv.Itoa(bench.limit), func(b *testing.B) {
			var junk string
			str := string(data)
			for b.Loop() {
				// Use the limit from the table: a hard-coded value here would
				// make every sub-benchmark of a file do identical work.
				junk = TruncateHTML(str, bench.limit)
			}
			// Keep the last result referenced so the assignment is not unused.
			_ = junk
		})
	}
}