Ver Fonte

fix(storage): index only the first 500K characters of the article contents to avoid tsvector limits

The length of a tsvector (lexemes + positions) must be less than 1 megabyte.

We don't need to index the entire content, and we need to keep a buffer for the positions.
Frédéric Guillot há 10 meses atrás
pai
commit
84ae1d5dc0
2 ficheiros alterados com 26 adições e 19 exclusões
  1. 17 11
      internal/storage/entry.go
  2. 9 8
      internal/storage/entry_test.go

+ 17 - 11
internal/storage/entry.go

@@ -69,6 +69,7 @@ func (s *Storage) NewEntryQueryBuilder(userID int64) *EntryQueryBuilder {
 
 // UpdateEntryTitleAndContent updates entry title and content.
 func (s *Storage) UpdateEntryTitleAndContent(entry *model.Entry) error {
+	truncatedTitle, truncatedContent := truncateTitleAndContentForTSVectorField(entry.Title, entry.Content)
 	query := `
 		UPDATE
 			entries
@@ -86,8 +87,8 @@ func (s *Storage) UpdateEntryTitleAndContent(entry *model.Entry) error {
 		entry.Title,
 		entry.Content,
 		entry.ReadingTime,
-		truncateStringForTSVectorField(entry.Title),
-		truncateStringForTSVectorField(entry.Content),
+		truncatedTitle,
+		truncatedContent,
 		entry.ID,
 		entry.UserID); err != nil {
 		return fmt.Errorf(`store: unable to update entry #%d: %v`, entry.ID, err)
@@ -98,6 +99,7 @@ func (s *Storage) UpdateEntryTitleAndContent(entry *model.Entry) error {
 
 // createEntry adds a new entry.
 func (s *Storage) createEntry(tx *sql.Tx, entry *model.Entry) error {
+	truncatedTitle, truncatedContent := truncateTitleAndContentForTSVectorField(entry.Title, entry.Content)
 	query := `
 		INSERT INTO entries
 			(
@@ -146,8 +148,8 @@ func (s *Storage) createEntry(tx *sql.Tx, entry *model.Entry) error {
 		entry.UserID,
 		entry.FeedID,
 		entry.ReadingTime,
-		truncateStringForTSVectorField(entry.Title),
-		truncateStringForTSVectorField(entry.Content),
+		truncatedTitle,
+		truncatedContent,
 		pq.Array(entry.Tags),
 	).Scan(
 		&entry.ID,
@@ -175,6 +177,7 @@ func (s *Storage) createEntry(tx *sql.Tx, entry *model.Entry) error {
 // Note: we do not update the published date because some feeds do not contain any date;
 // it defaults to time.Now(), which could change the order of items on the history page.
 func (s *Storage) updateEntry(tx *sql.Tx, entry *model.Entry) error {
+	truncatedTitle, truncatedContent := truncateTitleAndContentForTSVectorField(entry.Title, entry.Content)
 	query := `
 		UPDATE
 			entries
@@ -200,8 +203,8 @@ func (s *Storage) updateEntry(tx *sql.Tx, entry *model.Entry) error {
 		entry.Content,
 		entry.Author,
 		entry.ReadingTime,
-		truncateStringForTSVectorField(entry.Title),
-		truncateStringForTSVectorField(entry.Content),
+		truncatedTitle,
+		truncatedContent,
 		entry.UserID,
 		entry.FeedID,
 		entry.Hash,
@@ -702,17 +705,20 @@ func (s *Storage) UnshareEntry(userID int64, entryID int64) (err error) {
 	return
 }
 
-// truncateStringForTSVectorField truncates a string to fit within the maximum size for a TSVector field in PostgreSQL.
-func truncateStringForTSVectorField(s string) string {
+func truncateTitleAndContentForTSVectorField(title, content string) (string, string) {
 	// The length of a tsvector (lexemes + positions) must be less than 1 megabyte.
-	const maxTSVectorSize = 1024 * 1024
+	// We don't need to index the entire content, and we need to keep a buffer for the positions.
+	return truncateStringForTSVectorField(title, 200000), truncateStringForTSVectorField(content, 500000)
+}
 
-	if len(s) < maxTSVectorSize {
+// truncateStringForTSVectorField truncates a string to fewer than maxSize bytes without breaking UTF-8 characters.
+func truncateStringForTSVectorField(s string, maxSize int) string {
+	if len(s) < maxSize {
 		return s
 	}
 
 	// Truncate to fit under the limit, ensuring we don't break UTF-8 characters
-	truncated := s[:maxTSVectorSize-1]
+	truncated := s[:maxSize-1]
 
 	// Walk backwards to find the last complete UTF-8 character
 	for i := len(truncated) - 1; i >= 0; i-- {

+ 9 - 8
internal/storage/entry_test.go

@@ -9,20 +9,21 @@ import (
 )
 
 func TestTruncateStringForTSVectorField(t *testing.T) {
+	const megabyte = 1024 * 1024
+
 	// Test case 1: Short Chinese text should not be truncated
 	shortText := "这是一个简短的中文测试文本"
-	result := truncateStringForTSVectorField(shortText)
+	result := truncateStringForTSVectorField(shortText, megabyte)
 	if result != shortText {
 		t.Errorf("Short text should not be truncated, got %s", result)
 	}
 
 	// Test case 2: Long Chinese text should be truncated to stay under 1MB
 	// Generate a long Chinese string that would exceed 1MB
-	const megabyte = 1024 * 1024
 	chineseChar := "汉"
 	longText := strings.Repeat(chineseChar, megabyte/len(chineseChar)+1000) // Ensure it exceeds 1MB
 
-	result = truncateStringForTSVectorField(longText)
+	result = truncateStringForTSVectorField(longText, megabyte)
 
 	// Verify the result is under 1MB
 	if len(result) >= megabyte {
@@ -36,14 +37,14 @@ func TestTruncateStringForTSVectorField(t *testing.T) {
 
 	// Test case 3: Text one byte under the limit should not be truncated
 	limitText := strings.Repeat("a", megabyte-1)
-	result = truncateStringForTSVectorField(limitText)
+	result = truncateStringForTSVectorField(limitText, megabyte)
 	if result != limitText {
 		t.Error("Text under limit should not be truncated")
 	}
 
 	// Test case 4: Mixed Chinese and ASCII text
 	mixedText := strings.Repeat("测试Test汉字", megabyte/20) // Create large mixed text
-	result = truncateStringForTSVectorField(mixedText)
+	result = truncateStringForTSVectorField(mixedText, megabyte)
 
 	if len(result) >= megabyte {
 		t.Errorf("Mixed text should be truncated under 1MB, got %d bytes", len(result))
@@ -56,7 +57,7 @@ func TestTruncateStringForTSVectorField(t *testing.T) {
 
 	// Test case 5: Large text ending with ASCII characters
 	asciiSuffix := strings.Repeat("a", megabyte-100) + strings.Repeat("测试", 50) + "abcdef"
-	result = truncateStringForTSVectorField(asciiSuffix)
+	result = truncateStringForTSVectorField(asciiSuffix, megabyte)
 
 	if len(result) >= megabyte {
 		t.Errorf("ASCII suffix text should be truncated under 1MB, got %d bytes", len(result))
@@ -69,7 +70,7 @@ func TestTruncateStringForTSVectorField(t *testing.T) {
 
 	// Test case 6: Large ASCII text to cover ASCII branch in UTF-8 detection
 	largeAscii := strings.Repeat("abcdefghijklmnopqrstuvwxyz", megabyte/26+1000)
-	result = truncateStringForTSVectorField(largeAscii)
+	result = truncateStringForTSVectorField(largeAscii, megabyte)
 
 	if len(result) >= megabyte {
 		t.Errorf("Large ASCII text should be truncated under 1MB, got %d bytes", len(result))
@@ -87,7 +88,7 @@ func TestTruncateStringForTSVectorField(t *testing.T) {
 	for i := range invalidBytes {
 		invalidBytes[i] = 0x80 // Continuation byte without start byte
 	}
-	result = truncateStringForTSVectorField(string(invalidBytes))
+	result = truncateStringForTSVectorField(string(invalidBytes), megabyte)
 
 	// Should return empty string as fallback
 	if result != "" {