2 kuukautta sitten · f66772e911
--- a/internal/reader/sanitizer/truncate.go
+++ b/internal/reader/sanitizer/truncate.go
@@ -3,19 +3,86 @@
 
															 package sanitizer
														
 
															-import "strings"
														
 
															+import (
														
 
															+	"strings"
														
 
															+	"unicode"
														
 
															+	"unicode/utf8"
														
 
															+)
														
 
															-func TruncateHTML(input string, max int) string {
														
 
															-	text := StripTags(input)
														
 
															+// TruncateHTML returns a cleaned-up, shortened plain-text version of input, or "" if stripping fails.
														
 
															+//   - HTML tags are removed
														
 
															+//   - Consecutive whitespace characters are replaced with a single SPACE (0x20) character
														
 
															+//   - If the result would exceed limit runes, it's cut to at most limit runes and "…" is appended
														
 
															+func TruncateHTML(input string, limit int) string {
														
 
															+	dst := &strings.Builder{}
														
 
															+	src := strings.NewReader(input)
														
 
															-	// Collapse multiple spaces into a single space
														
 
															-	text = strings.Join(strings.Fields(text), " ")
														
 
															+	words := 0
														
 
															+	count := 0
														
 
															+	needspace := false
														
 
															-	// Convert to runes to be safe with unicode
														
 
															-	runes := []rune(text)
														
 
															-	if len(runes) > max {
														
 
															-		return strings.TrimSpace(string(runes[:max])) + "…"
														
 
															+	err := stripIter(src, func(token string) bool {
														
 
															+		// Skip leading space: no separator is requested until a word has been written.
														
 
															+		if words > 0 {
														
 
															+			// Request a separator if this token begins with whitespace.
														
 
															+			r, _ := utf8.DecodeRuneInString(token)
														
 
															+			needspace = needspace || unicode.IsSpace(r)
														
 
															+		}
														
 
															+
														
 
															+		for word := range strings.FieldsSeq(token) {
														
 
															+			if needspace {
														
 
															+				if count += 1; count > limit {
														
 
															+					return false
														
 
															+				}
														
 
															+			}
														
 
															+
														
 
															+			// Count runes to find how many bytes of the word fit within limit.
														
 
															+			wordlen := 0
														
 
															+			for wordlen < len(word) {
														
 
															+				if count += 1; count > limit {
														
 
															+					break
														
 
															+				}
														
 
															+				// An invalid byte decodes as (RuneError, 1); a valid U+FFFD has width 3 and must be consumed whole.
														
 
															+				r, w := utf8.DecodeRuneInString(word[wordlen:])
														
 
															+				if r == utf8.RuneError && w == 1 {
														
 
															+					wordlen += 1
														
 
															+					continue
														
 
															+				}
														
 
															+
														
 
															+				wordlen += w
														
 
															+			}
														
 
															+
														
 
															+			// This is the only place where a space is written.
														
 
															+			// That way any sequence of space characters ends up as a single SPACE (0x20) character.
														
 
															+			//
														
 
															+			// wordlen > 0 skips the space if the limit left no room for any part of the word.
														
 
															+			if needspace && wordlen > 0 {
														
 
															+				dst.WriteByte(' ')
														
 
															+			}
														
 
															+
														
 
															+			dst.WriteString(word[:wordlen])
														
 
															+
														
 
															+			if count > limit {
														
 
															+				return false
														
 
															+			}
														
 
															+
														
 
															+			needspace = true // To insert spaces in-between words in a token.
														
 
															+			words++
														
 
															+		}
														
 
															+
														
 
															+		// Carry a pending separator to the next token if this one ends with whitespace.
														
 
															+		r, _ := utf8.DecodeLastRuneInString(token)
														
 
															+		needspace = unicode.IsSpace(r) && words > 0
														
 
															+
														
 
															+		return true
														
 
															+	})
														
 
															+	if err != nil {
														
 
															+		return ""
														
 
															+	}
														
 
															+
														
 
															+	if count > limit {
														
 
															+		dst.WriteRune('…')
														
 
															 	}
														
 
															-	return text
														
 
															+	return dst.String()
														
 
															 }
														
--- a/internal/reader/sanitizer/truncate_test.go
+++ b/internal/reader/sanitizer/truncate_test.go
@@ -76,6 +76,72 @@ func TestTruncateHTML(t *testing.T) {
 
															 			maxLen:   20,
														
 
															 			expected: "hello world",
														
 
															 		},
														
 
															+		{
														
 
															+			name:     "just enough characters",
														
 
															+			input:    "Hello",
														
 
															+			maxLen:   5,
														
 
															+			expected: "Hello",
														
 
															+		},
														
 
															+		{
														
 
															+			name:     "just enough unicode characters",
														
 
															+			input:    "Привет",
														
 
															+			maxLen:   6,
														
 
															+			expected: "Привет",
														
 
															+		},
														
 
															+		{
														
 
															+			name:     "spaces around tag",
														
 
															+			input:    "hello <br/> world",
														
 
															+			maxLen:   20,
														
 
															+			expected: "hello world",
														
 
															+		},
														
 
															+		{
														
 
															+			name:     "leading spaces",
														
 
															+			input:    "  hello world",
														
 
															+			maxLen:   5,
														
 
															+			expected: "hello…",
														
 
															+		},
														
 
															+		{
														
 
															+			name:     "text above limit with space at the end",
														
 
															+			input:    "hello world",
														
 
															+			maxLen:   6,
														
 
															+			expected: "hello…",
														
 
															+		},
														
 
															+		{
														
 
															+			name:     "leading space before tag",
														
 
															+			input:    " <a>hello</a>",
														
 
															+			maxLen:   15,
														
 
															+			expected: "hello",
														
 
															+		},
														
 
															+		{
														
 
															+			name:     "space-only tokens in between tags",
														
 
															+			input:    "hello <br/>\t<a> </a>world",
														
 
															+			maxLen:   15,
														
 
															+			expected: "hello world",
														
 
															+		},
														
 
															+		{
														
 
															+			name:     "truncate mid-word",
														
 
															+			input:    "hello world",
														
 
															+			maxLen:   8,
														
 
															+			expected: "hello wo…",
														
 
															+		},
														
 
															+		{
														
 
															+			name:     "truncate mid-word with unicode",
														
 
															+			input:    "Съешь ещё этих мягких французских булок, да выпей же чаю",
														
 
															+			maxLen:   25,
														
 
															+			expected: "Съешь ещё этих мягких фра…",
														
 
															+		},
														
 
															+		{
														
 
															+			name:     "negative limit",
														
 
															+			input:    "whatever",
														
 
															+			maxLen:   -10,
														
 
															+			expected: "…",
														
 
															+		},
														
 
															+		{
														
 
															+			name:     "zero limit",
														
 
															+			input:    "whatever",
														
 
															+			maxLen:   0,
														
 
															+			expected: "…",
														
 
															+		},
														
 
															 	}
														
 
															 	for _, tt := range tests {