Parcourir la source

refactor(readability): minor cleanup

Remove a now-useless regex and its associated test.
jvoisin il y a 9 mois
Parent
commit
69a74c4abf

+ 1 - 6
internal/reader/readability/readability.go

@@ -7,7 +7,6 @@ import (
 	"fmt"
 	"io"
 	"log/slog"
-	"regexp"
 	"strings"
 
 	"miniflux.app/v2/internal/urllib"
@@ -16,13 +15,9 @@ import (
 	"golang.org/x/net/html"
 )
 
-const (
-	defaultTagsToScore = "section,h2,h3,h4,h5,h6,p,td,pre,div"
-)
+const defaultTagsToScore = "section,h2,h3,h4,h5,h6,p,td,pre,div"
 
 var (
-	divToPElementsRegexp = regexp.MustCompile(`(?i)<(?:a|blockquote|dl|div|img|ol|p|pre|table|ul)[ />]`)
-
 	strongCandidates  = [...]string{"popupbody", "-ad", "g-plus"}
 	maybeCandidate    = [...]string{"and", "article", "body", "column", "main", "shadow"}
 	unlikelyCandidate = [...]string{"banner", "breadcrumbs", "combx", "comment", "community", "cover-wrap", "disqus", "extra", "foot", "header", "legends", "menu", "modal", "related", "remark", "replies", "rss", "shoutbox", "sidebar", "skyscraper", "social", "sponsor", "supplemental", "ad-break", "agegate", "pagination", "pager", "popup", "yom-remote"}

+ 0 - 146
internal/reader/readability/readability_test.go

@@ -1880,152 +1880,6 @@ func TestTransformMisusedDivsIntoParagraphs(t *testing.T) {
 	}
 }
 
-func TestTransformMisusedDivsIntoParagraphsRegexPattern(t *testing.T) {
-	// Test the regex pattern directly to ensure it matches the expected elements
-	testCases := []struct {
-		name        string
-		html        string
-		shouldMatch bool
-		description string
-	}{
-		{
-			name:        "anchor tag",
-			html:        `<a href="#">link</a>`,
-			shouldMatch: true,
-			description: "should match anchor tags",
-		},
-		{
-			name:        "blockquote tag",
-			html:        `<blockquote>quote</blockquote>`,
-			shouldMatch: true,
-			description: "should match blockquote tags",
-		},
-		{
-			name:        "dl tag",
-			html:        `<dl><dt>term</dt></dl>`,
-			shouldMatch: true,
-			description: "should match dl tags",
-		},
-		{
-			name:        "div tag",
-			html:        `<div>content</div>`,
-			shouldMatch: true,
-			description: "should match div tags",
-		},
-		{
-			name:        "img tag",
-			html:        `<img src="test.jpg">`,
-			shouldMatch: true,
-			description: "should match img tags",
-		},
-		{
-			name:        "ol tag",
-			html:        `<ol><li>item</li></ol>`,
-			shouldMatch: true,
-			description: "should match ol tags",
-		},
-		{
-			name:        "p tag",
-			html:        `<p>paragraph</p>`,
-			shouldMatch: true,
-			description: "should match p tags",
-		},
-		{
-			name:        "pre tag",
-			html:        `<pre>code</pre>`,
-			shouldMatch: true,
-			description: "should match pre tags",
-		},
-		{
-			name:        "table tag",
-			html:        `<table><tr></tr></table>`,
-			shouldMatch: true,
-			description: "should match table tags",
-		},
-		{
-			name:        "ul tag",
-			html:        `<ul><li>item</li></ul>`,
-			shouldMatch: true,
-			description: "should match ul tags",
-		},
-		{
-			name:        "self-closing anchor",
-			html:        `<a/>`,
-			shouldMatch: true,
-			description: "should match self-closing anchor tags",
-		},
-		{
-			name:        "tag with attributes",
-			html:        `<a href="#" class="link">text</a>`,
-			shouldMatch: true,
-			description: "should match tags with attributes",
-		},
-		{
-			name:        "uppercase tags",
-			html:        `<A href="#">link</A>`,
-			shouldMatch: true,
-			description: "should be case insensitive",
-		},
-		{
-			name:        "mixed case tags",
-			html:        `<Img src="test.jpg">`,
-			shouldMatch: true,
-			description: "should match mixed case tags",
-		},
-		{
-			name:        "span tag",
-			html:        `<span>text</span>`,
-			shouldMatch: false,
-			description: "should NOT match span tags",
-		},
-		{
-			name:        "em tag",
-			html:        `<em>emphasis</em>`,
-			shouldMatch: false,
-			description: "should NOT match em tags",
-		},
-		{
-			name:        "strong tag",
-			html:        `<strong>bold</strong>`,
-			shouldMatch: false,
-			description: "should NOT match strong tags",
-		},
-		{
-			name:        "i tag",
-			html:        `<i>italic</i>`,
-			shouldMatch: false,
-			description: "should NOT match i tags",
-		},
-		{
-			name:        "b tag",
-			html:        `<b>bold</b>`,
-			shouldMatch: false,
-			description: "should NOT match b tags",
-		},
-		{
-			name:        "plain text",
-			html:        `just plain text`,
-			shouldMatch: false,
-			description: "should NOT match plain text",
-		},
-		{
-			name:        "empty string",
-			html:        ``,
-			shouldMatch: false,
-			description: "should NOT match empty string",
-		},
-	}
-
-	for _, tc := range testCases {
-		t.Run(tc.name, func(t *testing.T) {
-			result := divToPElementsRegexp.MatchString(tc.html)
-			if result != tc.shouldMatch {
-				t.Errorf("%s\nHTML: %s\nExpected match: %v, Got: %v", tc.description, tc.html, tc.shouldMatch, result)
-			}
-		})
-	}
-}
-
 func TestTransformMisusedDivsIntoParagraphsEdgeCases(t *testing.T) {
 	t.Run("document with no divs", func(t *testing.T) {
 		html := `<html><body><p>No divs here</p><span>Just other elements</span></body></html>`