Bläddra i källkod

feat(sanitizer): speed up TruncateHTML by a lot

gudvinr 5 dagar sedan
förälder
incheckning
f66772e911
2 ändrade filer med 143 tillägg och 10 borttagningar
  1. 77 10
      internal/reader/sanitizer/truncate.go
  2. 66 0
      internal/reader/sanitizer/truncate_test.go

+ 77 - 10
internal/reader/sanitizer/truncate.go

@@ -3,19 +3,86 @@
 
 
 package sanitizer
 package sanitizer
 
 
-import "strings"
+import (
+	"strings"
+	"unicode"
+	"unicode/utf8"
+)
 
 
-func TruncateHTML(input string, max int) string {
-	text := StripTags(input)
+// TruncateHTML returns a cleaned-up and shortened plain-text version of input.
+//   - HTML tags are removed.
+//   - Consecutive whitespace characters are replaced with a single SPACE (0x20)
+//     character; leading and trailing whitespace is not emitted.
+//   - If input has more runes than limit, the output is cut to at most limit
+//     runes and "…" is appended.
+//
+// Every invalid UTF-8 byte counts as one rune. An empty string is returned
+// when stripIter reports an error.
+func TruncateHTML(input string, limit int) string {
+	dst := &strings.Builder{}
+	src := strings.NewReader(input)
 
 
-	// Collapse multiple spaces into a single space
-	text = strings.Join(strings.Fields(text), " ")
+	words := 0         // Number of words seen so far.
+	count := 0         // Number of runes accounted for, including separating spaces.
+	needspace := false // Whether a space must precede the next word.
 
 
-	// Convert to runes to be safe with unicode
-	runes := []rune(text)
-	if len(runes) > max {
-		return strings.TrimSpace(string(runes[:max])) + "…"
+	err := stripIter(src, func(token string) bool {
+		// Skip leading space.
+		if words > 0 {
+			// Add a space between tokens if there's one before HTML tag.
+			r, _ := utf8.DecodeRuneInString(token)
+			needspace = needspace || unicode.IsSpace(r)
+		}
+
+		for word := range strings.FieldsSeq(token) {
+			// The separating space counts towards the limit as well.
+			if needspace {
+				if count += 1; count > limit {
+					return false
+				}
+			}
+
+			// Compute how much of the word we can use later.
+			wordlen := 0
+			for wordlen < len(word) {
+				if count += 1; count > limit {
+					break
+				}
+
+				// word[wordlen:] is never empty here, so w is at least 1:
+				// an invalid byte decodes as (RuneError, 1) and a validly
+				// encoded U+FFFD as (RuneError, 3). Advancing by w in both
+				// cases counts each as a single rune and never cuts through
+				// the middle of an encoded rune.
+				_, w := utf8.DecodeRuneInString(word[wordlen:])
+				wordlen += w
+			}
+
+			// This is the only place where a space is written.
+			// That way any sequence of space characters ends up as a single SPACE (0x20) character.
+			//
+			// wordlen > 0 skips the space if no room is left for printable characters.
+			if needspace && wordlen > 0 {
+				dst.WriteByte(' ')
+			}
+
+			dst.WriteString(word[:wordlen])
+
+			// Limit exceeded: stop iterating, the ellipsis is added by the caller below.
+			if count > limit {
+				return false
+			}
+
+			needspace = true // To insert spaces in-between words in a token.
+			words++
+		}
+
+		// Add a space between tokens if there's one after HTML tag.
+		r, _ := utf8.DecodeLastRuneInString(token)
+		needspace = unicode.IsSpace(r) && words > 0
+
+		return true
+	})
+	if err != nil {
+		return ""
+	}
+
+	// count only exceeds limit when some part of the input had to be dropped.
+	if count > limit {
+		dst.WriteRune('…')
 	}
 	}
 
 
-	return text
+	return dst.String()
 }
 }

+ 66 - 0
internal/reader/sanitizer/truncate_test.go

@@ -76,6 +76,72 @@ func TestTruncateHTML(t *testing.T) {
 			maxLen:   20,
 			maxLen:   20,
 			expected: "hello world",
 			expected: "hello world",
 		},
 		},
+		{
+			name:     "just enough characters",
+			input:    "Hello",
+			maxLen:   5,
+			expected: "Hello",
+		},
+		{
+			name:     "just enough unicode characters",
+			input:    "Привет",
+			maxLen:   6,
+			expected: "Привет",
+		},
+		{
+			name:     "spaces around tag",
+			input:    "hello <br/> world",
+			maxLen:   20,
+			expected: "hello world",
+		},
+		{
+			name:     "leading spaces",
+			input:    "  hello world",
+			maxLen:   5,
+			expected: "hello…",
+		},
+		{
+			name:     "text above limit with space at the end",
+			input:    "hello world",
+			maxLen:   6,
+			expected: "hello…",
+		},
+		{
+			name:     "leading space before tag",
+			input:    " <a>hello</a>",
+			maxLen:   15,
+			expected: "hello",
+		},
+		{
+			name:     "space-only tokens in between tags",
+			input:    "hello <br/>\t<a> </a>world",
+			maxLen:   15,
+			expected: "hello world",
+		},
+		{
+			name:     "truncate mid-word",
+			input:    "hello world",
+			maxLen:   8,
+			expected: "hello wo…",
+		},
+		{
+			name:     "truncate mid-word with unicode",
+			input:    "Съешь ещё этих мягких французских булок, да выпей же чаю",
+			maxLen:   25,
+			expected: "Съешь ещё этих мягких фра…",
+		},
+		{
+			name:     "negative limit",
+			input:    "whatever",
+			maxLen:   -10,
+			expected: "…",
+		},
+		{
+			name:     "zero limit",
+			input:    "whatever",
+			maxLen:   0,
+			expected: "…",
+		},
 	}
 	}
 
 
 	for _, tt := range tests {
 	for _, tt := range tests {