11 kuukautta sitten · cb59944d6b
--- a/internal/reader/processor/processor.go
+++ b/internal/reader/processor/processor.go
@@ -6,7 +6,6 @@ package processor // import "miniflux.app/v2/internal/reader/processor"
 
				 import (
			
 
				 	"log/slog"
			
 
				 	"net/url"
			
 
				-	"regexp"
			
 
				 	"slices"
			
 
				 	"time"
			
 
				 
			
@@ -24,8 +23,6 @@ import (
 
				 	"miniflux.app/v2/internal/storage"
			
 
				 )
			
 
				 
			
 
				-var customReplaceRuleRegex = regexp.MustCompile(`rewrite\("([^"]+)"\|"([^"]+)"\)`)
			
 
				-
			
 
				 // ProcessFeedEntries downloads original web page for entries and apply filters.
			
 
				 func ProcessFeedEntries(store *storage.Storage, feed *model.Feed, userID int64, forceRefresh bool) {
			
 
				 	var filteredEntries model.Entries
			
@@ -60,7 +57,7 @@ func ProcessFeedEntries(store *storage.Storage, feed *model.Feed, userID int64,
 
				 		}
			
 
				 
			
 
				 		webpageBaseURL := ""
			
 
				-		entry.URL = rewriteEntryURL(feed, entry)
			
 
				+		entry.URL = rewrite.RewriteEntryURL(feed, entry)
			
 
				 		entryIsNew := store.IsNewEntry(feed.ID, entry.Hash)
			
 
				 		if feed.Crawler && (entryIsNew || forceRefresh) {
			
 
				 			slog.Debug("Scraping entry",
			
@@ -143,7 +140,7 @@ func ProcessFeedEntries(store *storage.Storage, feed *model.Feed, userID int64,
 
				 // ProcessEntryWebPage downloads the entry web page and apply rewrite rules.
			
 
				 func ProcessEntryWebPage(feed *model.Feed, entry *model.Entry, user *model.User) error {
			
 
				 	startTime := time.Now()
			
 
				-	entry.URL = rewriteEntryURL(feed, entry)
			
 
				+	entry.URL = rewrite.RewriteEntryURL(feed, entry)
			
 
				 
			
 
				 	requestBuilder := fetcher.NewRequestBuilder()
			
 
				 	requestBuilder.WithUserAgent(feed.UserAgent, config.Opts.HTTPClientUserAgent())
			
@@ -187,41 +184,6 @@ func ProcessEntryWebPage(feed *model.Feed, entry *model.Entry, user *model.User)
 
				 	return nil
			
 
				 }
			
 
				 
			
 
				-func rewriteEntryURL(feed *model.Feed, entry *model.Entry) string {
			
 
				-	var rewrittenURL = entry.URL
			
 
				-	if feed.UrlRewriteRules != "" {
			
 
				-		parts := customReplaceRuleRegex.FindStringSubmatch(feed.UrlRewriteRules)
			
 
				-
			
 
				-		if len(parts) >= 3 {
			
 
				-			re, err := regexp.Compile(parts[1])
			
 
				-			if err != nil {
			
 
				-				slog.Error("Failed on regexp compilation",
			
 
				-					slog.String("url_rewrite_rules", feed.UrlRewriteRules),
			
 
				-					slog.Any("error", err),
			
 
				-				)
			
 
				-				return rewrittenURL
			
 
				-			}
			
 
				-			rewrittenURL = re.ReplaceAllString(entry.URL, parts[2])
			
 
				-			slog.Debug("Rewriting entry URL",
			
 
				-				slog.String("original_entry_url", entry.URL),
			
 
				-				slog.String("rewritten_entry_url", rewrittenURL),
			
 
				-				slog.Int64("feed_id", feed.ID),
			
 
				-				slog.String("feed_url", feed.FeedURL),
			
 
				-			)
			
 
				-		} else {
			
 
				-			slog.Debug("Cannot find search and replace terms for replace rule",
			
 
				-				slog.String("original_entry_url", entry.URL),
			
 
				-				slog.String("rewritten_entry_url", rewrittenURL),
			
 
				-				slog.Int64("feed_id", feed.ID),
			
 
				-				slog.String("feed_url", feed.FeedURL),
			
 
				-				slog.String("url_rewrite_rules", feed.UrlRewriteRules),
			
 
				-			)
			
 
				-		}
			
 
				-	}
			
 
				-
			
 
				-	return rewrittenURL
			
 
				-}
			
 
				-
			
 
				 func isRecentEntry(entry *model.Entry) bool {
			
 
				 	if config.Opts.FilterEntryMaxAgeDays() == 0 || entry.Date.After(time.Now().AddDate(0, 0, -config.Opts.FilterEntryMaxAgeDays())) {
			
 
				 		return true
			
--- a/internal/reader/rewrite/content_rewrite.go
+++ b/internal/reader/rewrite/content_rewrite.go
--- a/internal/reader/rewrite/content_rewrite_functions.go
+++ b/internal/reader/rewrite/content_rewrite_functions.go
--- a/internal/reader/rewrite/content_rewrite_rules.go
+++ b/internal/reader/rewrite/content_rewrite_rules.go
--- a/internal/reader/rewrite/content_rewrite_test.go
+++ b/internal/reader/rewrite/content_rewrite_test.go
--- a/internal/reader/rewrite/url_rewrite.go
+++ b/internal/reader/rewrite/url_rewrite.go
@@ -0,0 +1,48 @@
 
				+// SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
			
 
				+// SPDX-License-Identifier: Apache-2.0
			
 
				+
			
 
				+package rewrite // import "miniflux.app/v2/internal/reader/rewrite"
			
 
				+
			
 
				+import (
			
 
				+	"log/slog"
			
 
				+	"regexp"
			
 
				+
			
 
				+	"miniflux.app/v2/internal/model"
			
 
				+)
			
 
				+
			
 
				+var customReplaceRuleRegex = regexp.MustCompile(`rewrite\("([^"]+)"\|"([^"]+)"\)`)
			
 
				+
			
 
				+func RewriteEntryURL(feed *model.Feed, entry *model.Entry) string {
			
 
				+	var rewrittenURL = entry.URL
			
 
				+	if feed.UrlRewriteRules != "" {
			
 
				+		parts := customReplaceRuleRegex.FindStringSubmatch(feed.UrlRewriteRules)
			
 
				+
			
 
				+		if len(parts) >= 3 {
			
 
				+			re, err := regexp.Compile(parts[1])
			
 
				+			if err != nil {
			
 
				+				slog.Error("Failed on regexp compilation",
			
 
				+					slog.String("url_rewrite_rules", feed.UrlRewriteRules),
			
 
				+					slog.Any("error", err),
			
 
				+				)
			
 
				+				return rewrittenURL
			
 
				+			}
			
 
				+			rewrittenURL = re.ReplaceAllString(entry.URL, parts[2])
			
 
				+			slog.Debug("Rewriting entry URL",
			
 
				+				slog.String("original_entry_url", entry.URL),
			
 
				+				slog.String("rewritten_entry_url", rewrittenURL),
			
 
				+				slog.Int64("feed_id", feed.ID),
			
 
				+				slog.String("feed_url", feed.FeedURL),
			
 
				+			)
			
 
				+		} else {
			
 
				+			slog.Debug("Cannot find search and replace terms for replace rule",
			
 
				+				slog.String("original_entry_url", entry.URL),
			
 
				+				slog.String("rewritten_entry_url", rewrittenURL),
			
 
				+				slog.Int64("feed_id", feed.ID),
			
 
				+				slog.String("feed_url", feed.FeedURL),
			
 
				+				slog.String("url_rewrite_rules", feed.UrlRewriteRules),
			
 
				+			)
			
 
				+		}
			
 
				+	}
			
 
				+
			
 
				+	return rewrittenURL
			
 
				+}
			
--- a/internal/reader/rewrite/url_rewrite_test.go
+++ b/internal/reader/rewrite/url_rewrite_test.go
@@ -0,0 +1,297 @@
 
				+// SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
			
 
				+// SPDX-License-Identifier: Apache-2.0
			
 
				+
			
 
				+package rewrite // import "miniflux.app/v2/internal/reader/rewrite"
			
 
				+
			
 
				+import (
			
 
				+	"testing"
			
 
				+
			
 
				+	"miniflux.app/v2/internal/model"
			
 
				+)
			
 
				+
			
 
				+func TestRewriteEntryURL(t *testing.T) {
			
 
				+	scenarios := []struct {
			
 
				+		name        string
			
 
				+		feed        *model.Feed
			
 
				+		entry       *model.Entry
			
 
				+		expectedURL string
			
 
				+		description string
			
 
				+	}{
			
 
				+		{
			
 
				+			name: "NoRewriteRules",
			
 
				+			feed: &model.Feed{
			
 
				+				ID:              1,
			
 
				+				FeedURL:         "https://example.com/feed.xml",
			
 
				+				UrlRewriteRules: "",
			
 
				+			},
			
 
				+			entry: &model.Entry{
			
 
				+				URL: "https://example.com/article/123",
			
 
				+			},
			
 
				+			expectedURL: "https://example.com/article/123",
			
 
				+			description: "Should return original URL when no rewrite rules are specified",
			
 
				+		},
			
 
				+		{
			
 
				+			name: "EmptyRewriteRules",
			
 
				+			feed: &model.Feed{
			
 
				+				ID:              1,
			
 
				+				FeedURL:         "https://example.com/feed.xml",
			
 
				+				UrlRewriteRules: "   ",
			
 
				+			},
			
 
				+			entry: &model.Entry{
			
 
				+				URL: "https://example.com/article/123",
			
 
				+			},
			
 
				+			expectedURL: "https://example.com/article/123",
			
 
				+			description: "Should return original URL when rewrite rules are empty/whitespace",
			
 
				+		},
			
 
				+		{
			
 
				+			name: "ValidRewriteRule",
			
 
				+			feed: &model.Feed{
			
 
				+				ID:              1,
			
 
				+				FeedURL:         "https://example.com/feed.xml",
			
 
				+				UrlRewriteRules: `rewrite("^https://example.com/article/(.+)"|"https://example.com/full-article/$1")`,
			
 
				+			},
			
 
				+			entry: &model.Entry{
			
 
				+				URL: "https://example.com/article/123",
			
 
				+			},
			
 
				+			expectedURL: "https://example.com/full-article/123",
			
 
				+			description: "Should rewrite URL according to the regex pattern",
			
 
				+		},
			
 
				+		{
			
 
				+			name: "ComplexRegexRewrite",
			
 
				+			feed: &model.Feed{
			
 
				+				ID:              1,
			
 
				+				FeedURL:         "https://news.ycombinator.com/rss",
			
 
				+				UrlRewriteRules: `rewrite("^https://news\.ycombinator\.com/item\?id=(.+)"|"https://hn.algolia.com/api/v1/items/$1")`,
			
 
				+			},
			
 
				+			entry: &model.Entry{
			
 
				+				URL: "https://news.ycombinator.com/item?id=12345",
			
 
				+			},
			
 
				+			expectedURL: "https://hn.algolia.com/api/v1/items/12345",
			
 
				+			description: "Should handle complex regex patterns with escaped characters",
			
 
				+		},
			
 
				+		{
			
 
				+			name: "NoMatchingPattern",
			
 
				+			feed: &model.Feed{
			
 
				+				ID:              1,
			
 
				+				FeedURL:         "https://example.com/feed.xml",
			
 
				+				UrlRewriteRules: `rewrite("^https://different.com/(.+)"|"https://rewritten.com/$1")`,
			
 
				+			},
			
 
				+			entry: &model.Entry{
			
 
				+				URL: "https://example.com/article/123",
			
 
				+			},
			
 
				+			expectedURL: "https://example.com/article/123",
			
 
				+			description: "Should return original URL when regex pattern doesn't match",
			
 
				+		},
			
 
				+		{
			
 
				+			name: "InvalidRegexPattern",
			
 
				+			feed: &model.Feed{
			
 
				+				ID:              1,
			
 
				+				FeedURL:         "https://example.com/feed.xml",
			
 
				+				UrlRewriteRules: `rewrite("^https://example.com/[invalid"|"https://rewritten.com/$1")`,
			
 
				+			},
			
 
				+			entry: &model.Entry{
			
 
				+				URL: "https://example.com/article/123",
			
 
				+			},
			
 
				+			expectedURL: "https://example.com/article/123",
			
 
				+			description: "Should return original URL when regex pattern is invalid",
			
 
				+		},
			
 
				+		{
			
 
				+			name: "MalformedRewriteRule",
			
 
				+			feed: &model.Feed{
			
 
				+				ID:              1,
			
 
				+				FeedURL:         "https://example.com/feed.xml",
			
 
				+				UrlRewriteRules: `rewrite("invalid format")`,
			
 
				+			},
			
 
				+			entry: &model.Entry{
			
 
				+				URL: "https://example.com/article/123",
			
 
				+			},
			
 
				+			expectedURL: "https://example.com/article/123",
			
 
				+			description: "Should return original URL when rewrite rule format is malformed",
			
 
				+		},
			
 
				+		{
			
 
				+			name: "MultipleGroups",
			
 
				+			feed: &model.Feed{
			
 
				+				ID:              1,
			
 
				+				FeedURL:         "https://example.com/feed.xml",
			
 
				+				UrlRewriteRules: `rewrite("^https://example.com/([^/]+)/article/(.+)"|"https://example.com/full/$1/story/$2")`,
			
 
				+			},
			
 
				+			entry: &model.Entry{
			
 
				+				URL: "https://example.com/tech/article/ai-news",
			
 
				+			},
			
 
				+			expectedURL: "https://example.com/full/tech/story/ai-news",
			
 
				+			description: "Should handle multiple capture groups in regex",
			
 
				+		},
			
 
				+		{
			
 
				+			name: "URLWithSpecialCharacters",
			
 
				+			feed: &model.Feed{
			
 
				+				ID:              1,
			
 
				+				FeedURL:         "https://example.com/feed.xml",
			
 
				+				UrlRewriteRules: `rewrite("^https://example.com/(.+)"|"https://proxy.example.com/$1")`,
			
 
				+			},
			
 
				+			entry: &model.Entry{
			
 
				+				URL: "https://example.com/article/test?param=value&other=123#section",
			
 
				+			},
			
 
				+			expectedURL: "https://proxy.example.com/article/test?param=value&other=123#section",
			
 
				+			description: "Should handle URLs with query parameters and fragments",
			
 
				+		},
			
 
				+		{
			
 
				+			name: "ReplaceWithStaticURL",
			
 
				+			feed: &model.Feed{
			
 
				+				ID:              1,
			
 
				+				FeedURL:         "https://example.com/feed.xml",
			
 
				+				UrlRewriteRules: `rewrite("^https://example.com/(.+)"|"https://static.example.com/reader")`,
			
 
				+			},
			
 
				+			entry: &model.Entry{
			
 
				+				URL: "https://example.com/article/123",
			
 
				+			},
			
 
				+			expectedURL: "https://static.example.com/reader",
			
 
				+			description: "Should replace with static URL when no capture groups are used in replacement",
			
 
				+		},
			
 
				+		{
			
 
				+			name: "EmptyReplacementString",
			
 
				+			feed: &model.Feed{
			
 
				+				ID:              1,
			
 
				+				FeedURL:         "https://example.com/feed.xml",
			
 
				+				UrlRewriteRules: `rewrite("^https://example.com/(.+)"|"x")`,
			
 
				+			},
			
 
				+			entry: &model.Entry{
			
 
				+				URL: "https://example.com/article/123",
			
 
				+			},
			
 
				+			expectedURL: "x",
			
 
				+			description: "Should replace with specified string",
			
 
				+		},
			
 
				+		{
			
 
				+			name: "EmptyReplacementNotSupported",
			
 
				+			feed: &model.Feed{
			
 
				+				ID:              1,
			
 
				+				FeedURL:         "https://example.com/feed.xml",
			
 
				+				UrlRewriteRules: `rewrite("^https://example.com/(.+)"|"")`,
			
 
				+			},
			
 
				+			entry: &model.Entry{
			
 
				+				URL: "https://example.com/article/123",
			
 
				+			},
			
 
				+			expectedURL: "https://example.com/article/123",
			
 
				+			description: "Should return original URL when replacement is empty string (not supported by regex pattern)",
			
 
				+		},
			
 
				+		{
			
 
				+			name: "InvalidRewriteRuleFormat",
			
 
				+			feed: &model.Feed{
			
 
				+				ID:              1,
			
 
				+				FeedURL:         "https://example.com/feed.xml",
			
 
				+				UrlRewriteRules: `not-a-rewrite-rule`,
			
 
				+			},
			
 
				+			entry: &model.Entry{
			
 
				+				URL: "https://example.com/article/123",
			
 
				+			},
			
 
				+			expectedURL: "https://example.com/article/123",
			
 
				+			description: "Should return original URL when rewrite rule doesn't match expected format",
			
 
				+		},
			
 
				+	}
			
 
				+
			
 
				+	for _, scenario := range scenarios {
			
 
				+		t.Run(scenario.name, func(t *testing.T) {
			
 
				+			result := RewriteEntryURL(scenario.feed, scenario.entry)
			
 
				+			if result != scenario.expectedURL {
			
 
				+				t.Errorf("Expected URL %q, got %q. Description: %s", scenario.expectedURL, result, scenario.description)
			
 
				+			}
			
 
				+		})
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+func TestRewriteEntryURLWithNilValues(t *testing.T) {
			
 
				+	t.Run("NilFeed", func(t *testing.T) {
			
 
				+		entry := &model.Entry{URL: "https://example.com/article/123"}
			
 
				+
			
 
				+		// This should panic or handle gracefully - let's see what happens
			
 
				+		defer func() {
			
 
				+			if r := recover(); r == nil {
			
 
				+				t.Error("Expected panic when feed is nil, but function completed normally")
			
 
				+			}
			
 
				+		}()
			
 
				+
			
 
				+		RewriteEntryURL(nil, entry)
			
 
				+	})
			
 
				+
			
 
				+	t.Run("NilEntry", func(t *testing.T) {
			
 
				+		feed := &model.Feed{
			
 
				+			ID:              1,
			
 
				+			FeedURL:         "https://example.com/feed.xml",
			
 
				+			UrlRewriteRules: `rewrite("^https://example.com/(.+)"|"https://rewritten.com/$1")`,
			
 
				+		}
			
 
				+
			
 
				+		// This should panic or handle gracefully - let's see what happens
			
 
				+		defer func() {
			
 
				+			if r := recover(); r == nil {
			
 
				+				t.Error("Expected panic when entry is nil, but function completed normally")
			
 
				+			}
			
 
				+		}()
			
 
				+
			
 
				+		RewriteEntryURL(feed, nil)
			
 
				+	})
			
 
				+}
			
 
				+
			
 
				+func TestCustomReplaceRuleRegex(t *testing.T) {
			
 
				+	scenarios := []struct {
			
 
				+		name     string
			
 
				+		input    string
			
 
				+		expected []string
			
 
				+		matches  bool
			
 
				+	}{
			
 
				+		{
			
 
				+			name:     "ValidRule",
			
 
				+			input:    `rewrite("^https://example.com/(.+)"|"https://rewritten.com/$1")`,
			
 
				+			expected: []string{`rewrite("^https://example.com/(.+)"|"https://rewritten.com/$1")`, `^https://example.com/(.+)`, `https://rewritten.com/$1`},
			
 
				+			matches:  true,
			
 
				+		},
			
 
				+		{
			
 
				+			name:     "ValidRuleWithEscapedCharacters",
			
 
				+			input:    `rewrite("^https://news\\.ycombinator\\.com/item\\?id=(.+)"|"https://hn.algolia.com/api/v1/items/$1")`,
			
 
				+			expected: []string{`rewrite("^https://news\\.ycombinator\\.com/item\\?id=(.+)"|"https://hn.algolia.com/api/v1/items/$1")`, `^https://news\\.ycombinator\\.com/item\\?id=(.+)`, `https://hn.algolia.com/api/v1/items/$1`},
			
 
				+			matches:  true,
			
 
				+		},
			
 
				+		{
			
 
				+			name:     "InvalidFormat",
			
 
				+			input:    `rewrite("invalid")`,
			
 
				+			expected: nil,
			
 
				+			matches:  false,
			
 
				+		},
			
 
				+		{
			
 
				+			name:     "EmptyString",
			
 
				+			input:    ``,
			
 
				+			expected: nil,
			
 
				+			matches:  false,
			
 
				+		},
			
 
				+		{
			
 
				+			name:     "RandomText",
			
 
				+			input:    `some random text`,
			
 
				+			expected: nil,
			
 
				+			matches:  false,
			
 
				+		},
			
 
				+	}
			
 
				+
			
 
				+	for _, scenario := range scenarios {
			
 
				+		t.Run(scenario.name, func(t *testing.T) {
			
 
				+			parts := customReplaceRuleRegex.FindStringSubmatch(scenario.input)
			
 
				+
			
 
				+			if scenario.matches {
			
 
				+				if len(parts) < 3 {
			
 
				+					t.Errorf("Expected regex to match and return at least 3 parts, got %d parts: %v", len(parts), parts)
			
 
				+					return
			
 
				+				}
			
 
				+
			
 
				+				// Check the full match and captured groups
			
 
				+				if parts[0] != scenario.expected[0] {
			
 
				+					t.Errorf("Expected full match %q, got %q", scenario.expected[0], parts[0])
			
 
				+				}
			
 
				+				if parts[1] != scenario.expected[1] {
			
 
				+					t.Errorf("Expected first capture group %q, got %q", scenario.expected[1], parts[1])
			
 
				+				}
			
 
				+				if parts[2] != scenario.expected[2] {
			
 
				+					t.Errorf("Expected second capture group %q, got %q", scenario.expected[2], parts[2])
			
 
				+				}
			
 
				+			} else if len(parts) >= 3 {
			
 
				+				t.Errorf("Expected regex not to match, but got %d parts: %v", len(parts), parts)
			
 
				+			}
			
 
				+		})
			
 
				+	}
			
 
				+}