| 12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697 |
- // SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
- // SPDX-License-Identifier: Apache-2.0
- package storage
- import (
- "strings"
- "testing"
- )
- func TestTruncateStringForTSVectorField(t *testing.T) {
- const megabyte = 1024 * 1024
- // Test case 1: Short Chinese text should not be truncated
- shortText := "这是一个简短的中文测试文本"
- result := truncateStringForTSVectorField(shortText, megabyte)
- if result != shortText {
- t.Errorf("Short text should not be truncated, got %s", result)
- }
- // Test case 2: Long Chinese text should be truncated to stay under 1MB
- // Generate a long Chinese string that would exceed 1MB
- chineseChar := "汉"
- longText := strings.Repeat(chineseChar, megabyte/len(chineseChar)+1000) // Ensure it exceeds 1MB
- result = truncateStringForTSVectorField(longText, megabyte)
- // Verify the result is under 1MB
- if len(result) >= megabyte {
- t.Errorf("Truncated text should be under 1MB, got %d bytes", len(result))
- }
- // Verify the result is still valid UTF-8 and doesn't cut in the middle of a character
- if !strings.HasPrefix(longText, result) {
- t.Error("Truncated text should be a prefix of original text")
- }
- // Test case 3: Text exactly at limit should not be truncated
- limitText := strings.Repeat("a", megabyte-1)
- result = truncateStringForTSVectorField(limitText, megabyte)
- if result != limitText {
- t.Error("Text under limit should not be truncated")
- }
- // Test case 4: Mixed Chinese and ASCII text
- mixedText := strings.Repeat("测试Test汉字", megabyte/20) // Create large mixed text
- result = truncateStringForTSVectorField(mixedText, megabyte)
- if len(result) >= megabyte {
- t.Errorf("Mixed text should be truncated under 1MB, got %d bytes", len(result))
- }
- // Verify no broken UTF-8 sequences
- if !strings.HasPrefix(mixedText, result) {
- t.Error("Truncated mixed text should be a valid prefix")
- }
- // Test case 5: Large text ending with ASCII characters
- asciiSuffix := strings.Repeat("a", megabyte-100) + strings.Repeat("测试", 50) + "abcdef"
- result = truncateStringForTSVectorField(asciiSuffix, megabyte)
- if len(result) >= megabyte {
- t.Errorf("ASCII suffix text should be truncated under 1MB, got %d bytes", len(result))
- }
- // Should end with ASCII character
- if !strings.HasPrefix(asciiSuffix, result) {
- t.Error("Truncated ASCII suffix text should be a valid prefix")
- }
- // Test case 6: Large ASCII text to cover ASCII branch in UTF-8 detection
- largeAscii := strings.Repeat("abcdefghijklmnopqrstuvwxyz", megabyte/26+1000)
- result = truncateStringForTSVectorField(largeAscii, megabyte)
- if len(result) >= megabyte {
- t.Errorf("Large ASCII text should be truncated under 1MB, got %d bytes", len(result))
- }
- // Should be a prefix
- if !strings.HasPrefix(largeAscii, result) {
- t.Error("Truncated ASCII text should be a valid prefix")
- }
- // Test case 7: Edge case - string that would trigger the fallback
- // Create a pathological case: all continuation bytes without start bytes
- // This should trigger the fallback because there are no valid UTF-8 boundaries
- invalidBytes := make([]byte, megabyte)
- for i := range invalidBytes {
- invalidBytes[i] = 0x80 // Continuation byte without start byte
- }
- result = truncateStringForTSVectorField(string(invalidBytes), megabyte)
- // Should return empty string as fallback
- if result != "" {
- t.Errorf("Invalid UTF-8 continuation bytes should return empty string, got %d bytes", len(result))
- }
- }
|