Pārlūkot izejas kodu

feat(sanitizer): improve text truncation with better space handling

Frédéric Guillot 1 gadu atpakaļ
vecāks
revīzija
f2f60a8f73

+ 3 - 2
internal/reader/sanitizer/truncate.go

@@ -9,8 +9,9 @@ func TruncateHTML(input string, max int) string {
 	text := StripTags(input)
 	text = strings.ReplaceAll(text, "\n", " ")
 	text = strings.ReplaceAll(text, "\t", " ")
-	text = strings.ReplaceAll(text, "  ", " ")
-	text = strings.TrimSpace(text)
+
+	// Collapse every run of whitespace into a single space and trim both ends
+	text = strings.Join(strings.Fields(text), " ")
 
 	// Convert to runes to be safe with unicode
 	runes := []rune(text)

+ 50 - 0
internal/reader/sanitizer/truncate_test.go

@@ -62,3 +62,53 @@ func TestTruncateHTMLWithMultilineTextLowerThanLimit(t *testing.T) {
 		t.Errorf(`Wrong output: %q != %q`, expected, output)
 	}
 }
+
+func TestTruncateHTMLWithMultipleSpaces(t *testing.T) {
+	tests := []struct {
+		name     string
+		input    string
+		maxLen   int
+		expected string
+	}{
+		{
+			name:     "multiple spaces",
+			input:    "hello    world   test",
+			maxLen:   20,
+			expected: "hello world test",
+		},
+		{
+			name:     "tabs and newlines",
+			input:    "hello\t\tworld\n\ntest",
+			maxLen:   20,
+			expected: "hello world test",
+		},
+		{
+			name:     "truncation with unicode",
+			input:    "hello world 你好",
+			maxLen:   11,
+			expected: "hello world…",
+		},
+		{
+			name:     "html stripping",
+			input:    "<p>hello    <b>world</b>   test</p>",
+			maxLen:   20,
+			expected: "hello world test",
+		},
+		{
+			name:     "no truncation needed",
+			input:    "hello world",
+			maxLen:   20,
+			expected: "hello world",
+		},
+	}
+
+	for _, tt := range tests {
+		t.Run(tt.name, func(t *testing.T) {
+			result := TruncateHTML(tt.input, tt.maxLen)
+			if result != tt.expected {
+				t.Errorf("TruncateHTML(%q, %d) = %q, want %q",
+					tt.input, tt.maxLen, result, tt.expected)
+			}
+		})
+	}
+}