10 tháng trước · 69a74c4abf
--- a/internal/reader/readability/readability.go
+++ b/internal/reader/readability/readability.go
@@ -7,7 +7,6 @@ import (
 
				 	"fmt"
			
 
				 	"io"
			
 
				 	"log/slog"
			
 
				-	"regexp"
			
 
				 	"strings"
			
 
				 
			
 
				 	"miniflux.app/v2/internal/urllib"
			
@@ -16,13 +15,9 @@ import (
 
				 	"golang.org/x/net/html"
			
 
				 )
			
 
				 
			
 
				-const (
			
 
				-	defaultTagsToScore = "section,h2,h3,h4,h5,h6,p,td,pre,div"
			
 
				-)
			
 
				+const defaultTagsToScore = "section,h2,h3,h4,h5,h6,p,td,pre,div"
			
 
				 
			
 
				 var (
			
 
				-	divToPElementsRegexp = regexp.MustCompile(`(?i)<(?:a|blockquote|dl|div|img|ol|p|pre|table|ul)[ />]`)
			
 
				-
			
 
				 	strongCandidates  = [...]string{"popupbody", "-ad", "g-plus"}
			
 
				 	maybeCandidate    = [...]string{"and", "article", "body", "column", "main", "shadow"}
			
 
				 	unlikelyCandidate = [...]string{"banner", "breadcrumbs", "combx", "comment", "community", "cover-wrap", "disqus", "extra", "foot", "header", "legends", "menu", "modal", "related", "remark", "replies", "rss", "shoutbox", "sidebar", "skyscraper", "social", "sponsor", "supplemental", "ad-break", "agegate", "pagination", "pager", "popup", "yom-remote"}
			
--- a/internal/reader/readability/readability_test.go
+++ b/internal/reader/readability/readability_test.go
@@ -1880,152 +1880,6 @@ func TestTransformMisusedDivsIntoParagraphs(t *testing.T) {
 
				 	}
			
 
				 }
			
 
				 
			
 
				-func TestTransformMisusedDivsIntoParagraphsRegexPattern(t *testing.T) {
			
 
				-	// Test the regex pattern directly to ensure it matches the expected elements
			
 
				-	testCases := []struct {
			
 
				-		name        string
			
 
				-		html        string
			
 
				-		shouldMatch bool
			
 
				-		description string
			
 
				-	}{
			
 
				-		{
			
 
				-			name:        "anchor tag",
			
 
				-			html:        `<a href="#">link</a>`,
			
 
				-			shouldMatch: true,
			
 
				-			description: "should match anchor tags",
			
 
				-		},
			
 
				-		{
			
 
				-			name:        "blockquote tag",
			
 
				-			html:        `<blockquote>quote</blockquote>`,
			
 
				-			shouldMatch: true,
			
 
				-			description: "should match blockquote tags",
			
 
				-		},
			
 
				-		{
			
 
				-			name:        "dl tag",
			
 
				-			html:        `<dl><dt>term</dt></dl>`,
			
 
				-			shouldMatch: true,
			
 
				-			description: "should match dl tags",
			
 
				-		},
			
 
				-		{
			
 
				-			name:        "div tag",
			
 
				-			html:        `<div>content</div>`,
			
 
				-			shouldMatch: true,
			
 
				-			description: "should match div tags",
			
 
				-		},
			
 
				-		{
			
 
				-			name:        "img tag",
			
 
				-			html:        `<img src="test.jpg">`,
			
 
				-			shouldMatch: true,
			
 
				-			description: "should match img tags",
			
 
				-		},
			
 
				-		{
			
 
				-			name:        "ol tag",
			
 
				-			html:        `<ol><li>item</li></ol>`,
			
 
				-			shouldMatch: true,
			
 
				-			description: "should match ol tags",
			
 
				-		},
			
 
				-		{
			
 
				-			name:        "p tag",
			
 
				-			html:        `<p>paragraph</p>`,
			
 
				-			shouldMatch: true,
			
 
				-			description: "should match p tags",
			
 
				-		},
			
 
				-		{
			
 
				-			name:        "pre tag",
			
 
				-			html:        `<pre>code</pre>`,
			
 
				-			shouldMatch: true,
			
 
				-			description: "should match pre tags",
			
 
				-		},
			
 
				-		{
			
 
				-			name:        "table tag",
			
 
				-			html:        `<table><tr></tr></table>`,
			
 
				-			shouldMatch: true,
			
 
				-			description: "should match table tags",
			
 
				-		},
			
 
				-		{
			
 
				-			name:        "ul tag",
			
 
				-			html:        `<ul><li>item</li></ul>`,
			
 
				-			shouldMatch: true,
			
 
				-			description: "should match ul tags",
			
 
				-		},
			
 
				-		{
			
 
				-			name:        "self-closing anchor",
			
 
				-			html:        `<a/>`,
			
 
				-			shouldMatch: true,
			
 
				-			description: "should match self-closing anchor tags",
			
 
				-		},
			
 
				-		{
			
 
				-			name:        "tag with attributes",
			
 
				-			html:        `<a href="#" class="link">text</a>`,
			
 
				-			shouldMatch: true,
			
 
				-			description: "should match tags with attributes",
			
 
				-		},
			
 
				-		{
			
 
				-			name:        "uppercase tags",
			
 
				-			html:        `<A href="#">link</A>`,
			
 
				-			shouldMatch: true,
			
 
				-			description: "should be case insensitive",
			
 
				-		},
			
 
				-		{
			
 
				-			name:        "mixed case tags",
			
 
				-			html:        `<Img src="test.jpg">`,
			
 
				-			shouldMatch: true,
			
 
				-			description: "should match mixed case tags",
			
 
				-		},
			
 
				-		{
			
 
				-			name:        "span tag",
			
 
				-			html:        `<span>text</span>`,
			
 
				-			shouldMatch: false,
			
 
				-			description: "should NOT match span tags",
			
 
				-		},
			
 
				-		{
			
 
				-			name:        "em tag",
			
 
				-			html:        `<em>emphasis</em>`,
			
 
				-			shouldMatch: false,
			
 
				-			description: "should NOT match em tags",
			
 
				-		},
			
 
				-		{
			
 
				-			name:        "strong tag",
			
 
				-			html:        `<strong>bold</strong>`,
			
 
				-			shouldMatch: false,
			
 
				-			description: "should NOT match strong tags",
			
 
				-		},
			
 
				-		{
			
 
				-			name:        "i tag",
			
 
				-			html:        `<i>italic</i>`,
			
 
				-			shouldMatch: false,
			
 
				-			description: "should NOT match i tags",
			
 
				-		},
			
 
				-		{
			
 
				-			name:        "b tag",
			
 
				-			html:        `<b>bold</b>`,
			
 
				-			shouldMatch: false,
			
 
				-			description: "should NOT match b tags",
			
 
				-		},
			
 
				-		{
			
 
				-			name:        "plain text",
			
 
				-			html:        `just plain text`,
			
 
				-			shouldMatch: false,
			
 
				-			description: "should NOT match plain text",
			
 
				-		},
			
 
				-		{
			
 
				-			name:        "empty string",
			
 
				-			html:        ``,
			
 
				-			shouldMatch: false,
			
 
				-			description: "should NOT match empty string",
			
 
				-		},
			
 
				-	}
			
 
				-
			
 
				-	for _, tc := range testCases {
			
 
				-		t.Run(tc.name, func(t *testing.T) {
			
 
				-			result := divToPElementsRegexp.MatchString(tc.html)
			
 
				-			if result != tc.shouldMatch {
			
 
				-				t.Errorf("%s\nHTML: %s\nExpected match: %v, Got: %v", tc.description, tc.html, tc.shouldMatch, result)
			
 
				-			}
			
 
				-		})
			
 
				-	}
			
 
				-}
			
 
				-
			
 
				 func TestTransformMisusedDivsIntoParagraphsEdgeCases(t *testing.T) {
			
 
				 	t.Run("document with no divs", func(t *testing.T) {
			
 
				 		html := `<html><body><p>No divs here</p><span>Just other elements</span></body></html>`