Browse Source

feat(storage): add limit parameter to `ClearRemovedEntriesContent`

Without the limit, this query is going to hang forever on large
databases with millions of entries.
Frédéric Guillot 7 months ago
parent
commit
5403ca09f6
2 changed files with 13 additions and 6 deletions
  1. 1 1
      internal/cli/cleanup_tasks.go
  2. 12 5
      internal/storage/entry.go

+ 1 - 1
internal/cli/cleanup_tasks.go

@@ -54,7 +54,7 @@ func runCleanupTasks(store *storage.Storage) {
 			slog.Int64("removed_entries_enclosures_deleted", enclosuresAffected))
 	}
 
-	if contentAffected, err := store.ClearRemovedEntriesContent(); err != nil {
+	if contentAffected, err := store.ClearRemovedEntriesContent(config.Opts.CleanupArchiveBatchSize()); err != nil {
 		slog.Error("Unable to clear content from removed entries", slog.Any("error", err))
 	} else {
 		slog.Info("Clearing content from removed entries completed",

+ 12 - 5
internal/storage/entry.go

@@ -297,7 +297,7 @@ func (s *Storage) DeleteRemovedEntriesEnclosures() (int64, error) {
 }
 
 // ClearRemovedEntriesContent clears the content fields of entries marked as "removed", keeping only their metadata.
-func (s *Storage) ClearRemovedEntriesContent() (int64, error) {
+func (s *Storage) ClearRemovedEntriesContent(limit int) (int64, error) {
 	query := `
 		UPDATE
 			entries
@@ -305,12 +305,19 @@ func (s *Storage) ClearRemovedEntriesContent() (int64, error) {
 			title='',
 			content=NULL,
 			url='',
-			author=NULL
-		WHERE
-			status=$1 AND content IS NOT NULL
+			author=NULL,
+			comments_url=NULL,
+			document_vectors=NULL
+		WHERE id IN (
+			SELECT id
+			FROM entries
+			WHERE status = $1 AND content IS NOT NULL
+			ORDER BY id ASC
+			LIMIT $2
+		)
 	`
 
-	result, err := s.db.Exec(query, model.EntryStatusRemoved)
+	result, err := s.db.Exec(query, model.EntryStatusRemoved, limit)
 	if err != nil {
 		return 0, fmt.Errorf(`store: unable to clear content from removed entries: %v`, err)
 	}