entry_test.go 3.4 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697
  1. // SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
  2. // SPDX-License-Identifier: Apache-2.0
  3. package storage
  4. import (
  5. "strings"
  6. "testing"
  7. )
  8. func TestTruncateStringForTSVectorField(t *testing.T) {
  9. const megabyte = 1024 * 1024
  10. // Test case 1: Short Chinese text should not be truncated
  11. shortText := "这是一个简短的中文测试文本"
  12. result := truncateStringForTSVectorField(shortText, megabyte)
  13. if result != shortText {
  14. t.Errorf("Short text should not be truncated, got %s", result)
  15. }
  16. // Test case 2: Long Chinese text should be truncated to stay under 1MB
  17. // Generate a long Chinese string that would exceed 1MB
  18. chineseChar := "汉"
  19. longText := strings.Repeat(chineseChar, megabyte/len(chineseChar)+1000) // Ensure it exceeds 1MB
  20. result = truncateStringForTSVectorField(longText, megabyte)
  21. // Verify the result is under 1MB
  22. if len(result) >= megabyte {
  23. t.Errorf("Truncated text should be under 1MB, got %d bytes", len(result))
  24. }
  25. // Verify the result is still valid UTF-8 and doesn't cut in the middle of a character
  26. if !strings.HasPrefix(longText, result) {
  27. t.Error("Truncated text should be a prefix of original text")
  28. }
  29. // Test case 3: Text exactly at limit should not be truncated
  30. limitText := strings.Repeat("a", megabyte-1)
  31. result = truncateStringForTSVectorField(limitText, megabyte)
  32. if result != limitText {
  33. t.Error("Text under limit should not be truncated")
  34. }
  35. // Test case 4: Mixed Chinese and ASCII text
  36. mixedText := strings.Repeat("测试Test汉字", megabyte/20) // Create large mixed text
  37. result = truncateStringForTSVectorField(mixedText, megabyte)
  38. if len(result) >= megabyte {
  39. t.Errorf("Mixed text should be truncated under 1MB, got %d bytes", len(result))
  40. }
  41. // Verify no broken UTF-8 sequences
  42. if !strings.HasPrefix(mixedText, result) {
  43. t.Error("Truncated mixed text should be a valid prefix")
  44. }
  45. // Test case 5: Large text ending with ASCII characters
  46. asciiSuffix := strings.Repeat("a", megabyte-100) + strings.Repeat("测试", 50) + "abcdef"
  47. result = truncateStringForTSVectorField(asciiSuffix, megabyte)
  48. if len(result) >= megabyte {
  49. t.Errorf("ASCII suffix text should be truncated under 1MB, got %d bytes", len(result))
  50. }
  51. // Should end with ASCII character
  52. if !strings.HasPrefix(asciiSuffix, result) {
  53. t.Error("Truncated ASCII suffix text should be a valid prefix")
  54. }
  55. // Test case 6: Large ASCII text to cover ASCII branch in UTF-8 detection
  56. largeAscii := strings.Repeat("abcdefghijklmnopqrstuvwxyz", megabyte/26+1000)
  57. result = truncateStringForTSVectorField(largeAscii, megabyte)
  58. if len(result) >= megabyte {
  59. t.Errorf("Large ASCII text should be truncated under 1MB, got %d bytes", len(result))
  60. }
  61. // Should be a prefix
  62. if !strings.HasPrefix(largeAscii, result) {
  63. t.Error("Truncated ASCII text should be a valid prefix")
  64. }
  65. // Test case 7: Edge case - string that would trigger the fallback
  66. // Create a pathological case: all continuation bytes without start bytes
  67. // This should trigger the fallback because there are no valid UTF-8 boundaries
  68. invalidBytes := make([]byte, megabyte)
  69. for i := range invalidBytes {
  70. invalidBytes[i] = 0x80 // Continuation byte without start byte
  71. }
  72. result = truncateStringForTSVectorField(string(invalidBytes), megabyte)
  73. // Should return empty string as fallback
  74. if result != "" {
  75. t.Errorf("Invalid UTF-8 continuation bytes should return empty string, got %d bytes", len(result))
  76. }
  77. }