hai 10 meses · 6eeccae7cd
--- a/internal/reader/readability/readability.go
+++ b/internal/reader/readability/readability.go
@@ -27,8 +27,8 @@ var (
 
				 	maybeCandidate    = [...]string{"and", "article", "body", "column", "main", "shadow"}
			
 
				 	unlikelyCandidate = [...]string{"banner", "breadcrumbs", "combx", "comment", "community", "cover-wrap", "disqus", "extra", "foot", "header", "legends", "menu", "modal", "related", "remark", "replies", "rss", "shoutbox", "sidebar", "skyscraper", "social", "sponsor", "supplemental", "ad-break", "agegate", "pagination", "pager", "popup", "yom-remote"}
			
 
				 
			
 
				-	positive = [...]string{"article", "blog", "body", "content", "entry", "h-entry", "hentry", "main", "page", "pagination", "post", "story", "text"}
			
 
				-	negative = [...]string{"author", "banner", "byline", "com-", "combx", "comment", "contact", "dateline", "foot", "hid", "masthead", "media", "meta", "modal", "outbrain", "promo", "related", "scroll", "share", "shopping", "shoutbox", "sidebar", "skyscraper", "sponsor", "tags", "tool", "widget", "writtenby"}
			
 
				+	positiveKeywords = [...]string{"article", "blog", "body", "content", "entry", "h-entry", "hentry", "main", "page", "pagination", "post", "story", "text"}
			
 
				+	negativeKeywords = [...]string{"author", "banner", "byline", "com-", "combx", "comment", "contact", "dateline", "foot", "hid", "masthead", "media", "meta", "modal", "outbrain", "promo", "related", "scroll", "share", "shopping", "shoutbox", "sidebar", "skyscraper", "sponsor", "tags", "tool", "widget", "writtenby"}
			
 
				 )
			
 
				 
			
 
				 type candidate struct {
			
@@ -37,23 +37,31 @@ type candidate struct {
 
				 }
			
 
				 
			
 
				 func (c *candidate) Node() *html.Node {
			
 
				+	if c.selection.Length() == 0 {
			
 
				+		return nil
			
 
				+	}
			
 
				 	return c.selection.Get(0)
			
 
				 }
			
 
				 
			
 
				 func (c *candidate) String() string {
			
 
				+	node := c.Node()
			
 
				+	if node == nil {
			
 
				+		return fmt.Sprintf("empty => %f", c.score)
			
 
				+	}
			
 
				+
			
 
				 	id, _ := c.selection.Attr("id")
			
 
				 	class, _ := c.selection.Attr("class")
			
 
				 
			
 
				 	switch {
			
 
				 	case id != "" && class != "":
			
 
				-		return fmt.Sprintf("%s#%s.%s => %f", c.Node().DataAtom, id, class, c.score)
			
 
				+		return fmt.Sprintf("%s#%s.%s => %f", node.DataAtom, id, class, c.score)
			
 
				 	case id != "":
			
 
				-		return fmt.Sprintf("%s#%s => %f", c.Node().DataAtom, id, c.score)
			
 
				+		return fmt.Sprintf("%s#%s => %f", node.DataAtom, id, c.score)
			
 
				 	case class != "":
			
 
				-		return fmt.Sprintf("%s.%s => %f", c.Node().DataAtom, class, c.score)
			
 
				+		return fmt.Sprintf("%s.%s => %f", node.DataAtom, class, c.score)
			
 
				 	}
			
 
				 
			
 
				-	return fmt.Sprintf("%s => %f", c.Node().DataAtom, c.score)
			
 
				+	return fmt.Sprintf("%s => %f", node.DataAtom, c.score)
			
 
				 }
			
 
				 
			
 
				 // candidateList maps an HTML node to the candidate associated with it.
				 type candidateList map[*html.Node]*candidate
			
@@ -111,7 +119,8 @@ func getArticle(topCandidate *candidate, candidates candidateList) string {
 
				 		tag := "div"
			
 
				 		node := s.Get(0)
			
 
				 
			
 
				-		if node == topCandidate.Node() {
			
 
				+		topNode := topCandidate.Node()
			
 
				+		if topNode != nil && node == topNode {
			
 
				 			append = true
			
 
				 		} else if c, ok := candidates[node]; ok && c.score >= siblingScoreThreshold {
			
 
				 			append = true
			
@@ -147,14 +156,14 @@ func shouldRemoveCandidate(str string) bool {
 
				 	str = strings.ToLower(str)
			
 
				 
			
 
				 	// Those candidates have no false-positives, no need to check against `maybeCandidate`
			
 
				-	for _, strong := range strongCandidates {
			
 
				-		if strings.Contains(str, strong) {
			
 
				+	for _, strongCandidate := range strongCandidates {
			
 
				+		if strings.Contains(str, strongCandidate) {
			
 
				 			return true
			
 
				 		}
			
 
				 	}
			
 
				 
			
 
				-	for _, unlikely := range unlikelyCandidate {
			
 
				-		if strings.Contains(str, unlikely) {
			
 
				+	for _, unlikelyCandidate := range unlikelyCandidate {
			
 
				+		if strings.Contains(str, unlikelyCandidate) {
			
 
				 			// Do we have a false positive?
			
 
				 			for _, maybe := range maybeCandidate {
			
 
				 				if strings.Contains(str, maybe) {
			
@@ -268,6 +277,11 @@ func getCandidates(document *goquery.Document) candidateList {
 
				 func scoreNode(s *goquery.Selection) *candidate {
			
 
				 	c := &candidate{selection: s, score: 0}
			
 
				 
			
 
				+	// Check if selection is empty to avoid panic
			
 
				+	if s.Length() == 0 {
			
 
				+		return c
			
 
				+	}
			
 
				+
			
 
				 	switch s.Get(0).DataAtom.String() {
			
 
				 	case "div":
			
 
				 		c.score += 5
			
@@ -314,13 +328,13 @@ func getClassWeight(s *goquery.Selection) float32 {
 
				 
			
 
				 func getWeight(s string) int {
			
 
				 	s = strings.ToLower(s)
			
 
				-	for _, pos := range negative {
			
 
				-		if strings.Contains(s, pos) {
			
 
				+	for _, keyword := range negativeKeywords {
			
 
				+		if strings.Contains(s, keyword) {
			
 
				 			return -25
			
 
				 		}
			
 
				 	}
			
 
				-	for _, pos := range positive {
			
 
				-		if strings.Contains(s, pos) {
			
 
				+	for _, keyword := range positiveKeywords {
			
 
				+		if strings.Contains(s, keyword) {
			
 
				 			return +25
			
 
				 		}
			
 
				 	}
			
--- a/internal/reader/readability/readability_test.go
+++ b/internal/reader/readability/readability_test.go
@@ -11,8 +11,60 @@ import (
 
				 	"testing"
			
 
				 
			
 
				 	"github.com/PuerkitoBio/goquery"
			
 
				+	"golang.org/x/net/html"
			
 
				 )
			
 
				 
			
 
				+func BenchmarkExtractContent(b *testing.B) {
			
 
				+	var testCases = map[string][]byte{
			
 
				+		"miniflux_github.html":    {},
			
 
				+		"miniflux_wikipedia.html": {},
			
 
				+	}
			
 
				+	for filename := range testCases {
			
 
				+		data, err := os.ReadFile("testdata/" + filename)
			
 
				+		if err != nil {
			
 
				+			b.Fatalf(`Unable to read file %q: %v`, filename, err)
			
 
				+		}
			
 
				+		testCases[filename] = data
			
 
				+	}
			
 
				+	for range b.N {
			
 
				+		for _, v := range testCases {
			
 
				+			ExtractContent(bytes.NewReader(v))
			
 
				+		}
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+func BenchmarkGetWeight(b *testing.B) {
			
 
				+	testCases := []string{
			
 
				+		"p-3 color-bg-accent-emphasis color-fg-on-emphasis show-on-focus js-skip-to-content",
			
 
				+		"d-flex flex-column mb-3",
			
 
				+		"AppHeader-search-control AppHeader-search-control-overflow",
			
 
				+		"Button Button--iconOnly Button--invisible Button--medium mr-1 px-2 py-0 d-flex flex-items-center rounded-1 color-fg-muted",
			
 
				+		"sr-only",
			
 
				+		"validation-12753bbc-b4d1-4e10-bec6-92e585d1699d",
			
 
				+	}
			
 
				+	for range b.N {
			
 
				+		for _, v := range testCases {
			
 
				+			getWeight(v)
			
 
				+		}
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+func BenchmarkTransformMisusedDivsIntoParagraphs(b *testing.B) {
			
 
				+	html := `<html><body>
			
 
				+		<div>Simple text content</div>
			
 
				+		<div>More <span>inline</span> content</div>
			
 
				+		<div><a href="#">Link content</a></div>
			
 
				+		<div><p>Paragraph content</p></div>
			
 
				+		<div>Another simple text</div>
			
 
				+	</body></html>`
			
 
				+
			
 
				+	b.ResetTimer()
			
 
				+	for i := 0; i < b.N; i++ {
			
 
				+		doc, _ := goquery.NewDocumentFromReader(strings.NewReader(html))
			
 
				+		transformMisusedDivsIntoParagraphs(doc)
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				 func TestBaseURL(t *testing.T) {
			
 
				 	html := `
			
 
				 		<html>
			
@@ -189,25 +241,6 @@ func TestNestedSpanInCodeBlock(t *testing.T) {
 
				 	}
			
 
				 }
			
 
				 
			
 
				-func BenchmarkExtractContent(b *testing.B) {
			
 
				-	var testCases = map[string][]byte{
			
 
				-		"miniflux_github.html":    {},
			
 
				-		"miniflux_wikipedia.html": {},
			
 
				-	}
			
 
				-	for filename := range testCases {
			
 
				-		data, err := os.ReadFile("testdata/" + filename)
			
 
				-		if err != nil {
			
 
				-			b.Fatalf(`Unable to read file %q: %v`, filename, err)
			
 
				-		}
			
 
				-		testCases[filename] = data
			
 
				-	}
			
 
				-	for range b.N {
			
 
				-		for _, v := range testCases {
			
 
				-			ExtractContent(bytes.NewReader(v))
			
 
				-		}
			
 
				-	}
			
 
				-}
			
 
				-
			
 
				 func TestGetClassWeight(t *testing.T) {
			
 
				 	testCases := []struct {
			
 
				 		name     string
			
@@ -1315,18 +1348,1134 @@ func TestContainsSentence(t *testing.T) {
 
				 	}
			
 
				 }
			
 
				 
			
 
				-func BenchmarkGetWeight(b *testing.B) {
			
 
				-	testCases := []string{
			
 
				-		"p-3 color-bg-accent-emphasis color-fg-on-emphasis show-on-focus js-skip-to-content",
			
 
				-		"d-flex flex-column mb-3",
			
 
				-		"AppHeader-search-control AppHeader-search-control-overflow",
			
 
				-		"Button Button--iconOnly Button--invisible Button--medium mr-1 px-2 py-0 d-flex flex-items-center rounded-1 color-fg-muted",
			
 
				-		"sr-only",
			
 
				-		"validation-12753bbc-b4d1-4e10-bec6-92e585d1699d",
			
 
				+func TestScoreNode(t *testing.T) {
			
 
				+	testCases := []struct {
			
 
				+		name          string
			
 
				+		html          string
			
 
				+		expectedScore float32
			
 
				+		expectedTag   string
			
 
				+	}{
			
 
				+		{
			
 
				+			name:          "div element with no class or id",
			
 
				+			html:          `<div>Some content</div>`,
			
 
				+			expectedScore: 5,
			
 
				+			expectedTag:   "div",
			
 
				+		},
			
 
				+		{
			
 
				+			name:          "pre element with no class or id",
			
 
				+			html:          `<pre>Some code</pre>`,
			
 
				+			expectedScore: 3,
			
 
				+			expectedTag:   "pre",
			
 
				+		},
			
 
				+		{
			
 
				+			name:          "td element with no class or id",
			
 
				+			html:          `<table><tr><td>Table cell</td></tr></table>`,
			
 
				+			expectedScore: 3,
			
 
				+			expectedTag:   "td",
			
 
				+		},
			
 
				+		{
			
 
				+			name:          "blockquote element with no class or id",
			
 
				+			html:          `<blockquote>Quote</blockquote>`,
			
 
				+			expectedScore: 3,
			
 
				+			expectedTag:   "blockquote",
			
 
				+		},
			
 
				+		{
			
 
				+			name:          "img element with no class or id",
			
 
				+			html:          `<img src="test.jpg" alt="test">`,
			
 
				+			expectedScore: 3,
			
 
				+			expectedTag:   "img",
			
 
				+		},
			
 
				+		{
			
 
				+			name:          "ol element with no class or id",
			
 
				+			html:          `<ol><li>Item</li></ol>`,
			
 
				+			expectedScore: -3,
			
 
				+			expectedTag:   "ol",
			
 
				+		},
			
 
				+		{
			
 
				+			name:          "ul element with no class or id",
			
 
				+			html:          `<ul><li>Item</li></ul>`,
			
 
				+			expectedScore: -3,
			
 
				+			expectedTag:   "ul",
			
 
				+		},
			
 
				+		{
			
 
				+			name:          "address element with no class or id",
			
 
				+			html:          `<address>Contact info</address>`,
			
 
				+			expectedScore: -3,
			
 
				+			expectedTag:   "address",
			
 
				+		},
			
 
				+		{
			
 
				+			name:          "dl element with no class or id",
			
 
				+			html:          `<dl><dt>Term</dt><dd>Definition</dd></dl>`,
			
 
				+			expectedScore: -3,
			
 
				+			expectedTag:   "dl",
			
 
				+		},
			
 
				+		{
			
 
				+			name:          "dd element with no class or id",
			
 
				+			html:          `<dd>Definition</dd>`,
			
 
				+			expectedScore: -3,
			
 
				+			expectedTag:   "dd",
			
 
				+		},
			
 
				+		{
			
 
				+			name:          "dt element with no class or id",
			
 
				+			html:          `<dt>Term</dt>`,
			
 
				+			expectedScore: -3,
			
 
				+			expectedTag:   "dt",
			
 
				+		},
			
 
				+		{
			
 
				+			name:          "li element with no class or id",
			
 
				+			html:          `<li>List item</li>`,
			
 
				+			expectedScore: -3,
			
 
				+			expectedTag:   "li",
			
 
				+		},
			
 
				+		{
			
 
				+			name:          "form element with no class or id",
			
 
				+			html:          `<form>Form content</form>`,
			
 
				+			expectedScore: -3,
			
 
				+			expectedTag:   "form",
			
 
				+		},
			
 
				+		{
			
 
				+			name:          "h1 element with no class or id",
			
 
				+			html:          `<h1>Heading</h1>`,
			
 
				+			expectedScore: -5,
			
 
				+			expectedTag:   "h1",
			
 
				+		},
			
 
				+		{
			
 
				+			name:          "h2 element with no class or id",
			
 
				+			html:          `<h2>Heading</h2>`,
			
 
				+			expectedScore: -5,
			
 
				+			expectedTag:   "h2",
			
 
				+		},
			
 
				+		{
			
 
				+			name:          "h3 element with no class or id",
			
 
				+			html:          `<h3>Heading</h3>`,
			
 
				+			expectedScore: -5,
			
 
				+			expectedTag:   "h3",
			
 
				+		},
			
 
				+		{
			
 
				+			name:          "h4 element with no class or id",
			
 
				+			html:          `<h4>Heading</h4>`,
			
 
				+			expectedScore: -5,
			
 
				+			expectedTag:   "h4",
			
 
				+		},
			
 
				+		{
			
 
				+			name:          "h5 element with no class or id",
			
 
				+			html:          `<h5>Heading</h5>`,
			
 
				+			expectedScore: -5,
			
 
				+			expectedTag:   "h5",
			
 
				+		},
			
 
				+		{
			
 
				+			name:          "h6 element with no class or id",
			
 
				+			html:          `<h6>Heading</h6>`,
			
 
				+			expectedScore: -5,
			
 
				+			expectedTag:   "h6",
			
 
				+		},
			
 
				+		{
			
 
				+			name:          "th element with no class or id",
			
 
				+			html:          `<table><tr><th>Header cell</th></tr></table>`,
			
 
				+			expectedScore: -5,
			
 
				+			expectedTag:   "th",
			
 
				+		},
			
 
				+		{
			
 
				+			name:          "p element with no class or id (default case)",
			
 
				+			html:          `<p>Paragraph content</p>`,
			
 
				+			expectedScore: 0,
			
 
				+			expectedTag:   "p",
			
 
				+		},
			
 
				+		{
			
 
				+			name:          "span element with no class or id (default case)",
			
 
				+			html:          `<span>Span content</span>`,
			
 
				+			expectedScore: 0,
			
 
				+			expectedTag:   "span",
			
 
				+		},
			
 
				 	}
			
 
				-	for range b.N {
			
 
				-		for _, v := range testCases {
			
 
				-			getWeight(v)
			
 
				+
			
 
				+	for _, tc := range testCases {
			
 
				+		t.Run(tc.name, func(t *testing.T) {
			
 
				+			doc, err := goquery.NewDocumentFromReader(strings.NewReader(tc.html))
			
 
				+			if err != nil {
			
 
				+				t.Fatal(err)
			
 
				+			}
			
 
				+
			
 
				+			selection := doc.Find(tc.expectedTag)
			
 
				+			if selection.Length() == 0 {
			
 
				+				t.Fatalf("Could not find element with tag %s", tc.expectedTag)
			
 
				+			}
			
 
				+
			
 
				+			candidate := scoreNode(selection)
			
 
				+
			
 
				+			if candidate.score != tc.expectedScore {
			
 
				+				t.Errorf("Expected score %f, got %f", tc.expectedScore, candidate.score)
			
 
				+			}
			
 
				+
			
 
				+			if candidate.selection != selection {
			
 
				+				t.Error("Expected selection to be preserved in candidate")
			
 
				+			}
			
 
				+
			
 
				+			if candidate.Node() == nil {
			
 
				+				t.Errorf("Expected valid node, got nil")
			
 
				+			} else if candidate.Node().Data != tc.expectedTag {
			
 
				+				t.Errorf("Expected node tag %s, got %s", tc.expectedTag, candidate.Node().Data)
			
 
				+			}
			
 
				+		})
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+func TestScoreNodeWithClassWeights(t *testing.T) {
			
 
				+	testCases := []struct {
			
 
				+		name          string
			
 
				+		html          string
			
 
				+		expectedScore float32
			
 
				+		description   string
			
 
				+	}{
			
 
				+		{
			
 
				+			name:          "div with positive class",
			
 
				+			html:          `<div class="content">Content</div>`,
			
 
				+			expectedScore: 30, // 5 (div) + 25 (positive class)
			
 
				+			description:   "div base score + positive class weight",
			
 
				+		},
			
 
				+		{
			
 
				+			name:          "div with negative class",
			
 
				+			html:          `<div class="comment">Content</div>`,
			
 
				+			expectedScore: -20, // 5 (div) + (-25) (negative class)
			
 
				+			description:   "div base score + negative class weight",
			
 
				+		},
			
 
				+		{
			
 
				+			name:          "div with positive id",
			
 
				+			html:          `<div id="main">Content</div>`,
			
 
				+			expectedScore: 30, // 5 (div) + 25 (positive id)
			
 
				+			description:   "div base score + positive id weight",
			
 
				+		},
			
 
				+		{
			
 
				+			name:          "div with negative id",
			
 
				+			html:          `<div id="sidebar">Content</div>`,
			
 
				+			expectedScore: -20, // 5 (div) + (-25) (negative id)
			
 
				+			description:   "div base score + negative id weight",
			
 
				+		},
			
 
				+		{
			
 
				+			name:          "div with both positive class and id",
			
 
				+			html:          `<div class="content" id="main">Content</div>`,
			
 
				+			expectedScore: 55, // 5 (div) + 25 (positive class) + 25 (positive id)
			
 
				+			description:   "div base score + positive class weight + positive id weight",
			
 
				+		},
			
 
				+		{
			
 
				+			name:          "div with both negative class and id",
			
 
				+			html:          `<div class="comment" id="sidebar">Content</div>`,
			
 
				+			expectedScore: -45, // 5 (div) + (-25) (negative class) + (-25) (negative id)
			
 
				+			description:   "div base score + negative class weight + negative id weight",
			
 
				+		},
			
 
				+		{
			
 
				+			name:          "div with mixed class and id weights",
			
 
				+			html:          `<div class="content" id="sidebar">Content</div>`,
			
 
				+			expectedScore: 5, // 5 (div) + 25 (positive class) + (-25) (negative id)
			
 
				+			description:   "div base score + positive class weight + negative id weight",
			
 
				+		},
			
 
				+		{
			
 
				+			name:          "h1 with positive class (should still be negative overall)",
			
 
				+			html:          `<h1 class="content">Heading</h1>`,
			
 
				+			expectedScore: 20, // -5 (h1) + 25 (positive class)
			
 
				+			description:   "h1 base score + positive class weight",
			
 
				+		},
			
 
				+		{
			
 
				+			name:          "ul with negative class (more negative)",
			
 
				+			html:          `<ul class="comment">List</ul>`,
			
 
				+			expectedScore: -28, // -3 (ul) + (-25) (negative class)
			
 
				+			description:   "ul base score + negative class weight",
			
 
				+		},
			
 
				+		{
			
 
				+			name:          "p with neutral class/id (no weight change)",
			
 
				+			html:          `<p class="normal" id="regular">Paragraph</p>`,
			
 
				+			expectedScore: 0, // 0 (p) + 0 (neutral class) + 0 (neutral id)
			
 
				+			description:   "p base score with neutral class and id",
			
 
				+		},
			
 
				+	}
			
 
				+
			
 
				+	for _, tc := range testCases {
			
 
				+		t.Run(tc.name, func(t *testing.T) {
			
 
				+			doc, err := goquery.NewDocumentFromReader(strings.NewReader(tc.html))
			
 
				+			if err != nil {
			
 
				+				t.Fatal(err)
			
 
				+			}
			
 
				+
			
 
				+			// Find the first non-html/body element
			
 
				+			selection := doc.Find("div, h1, h2, h3, h4, h5, h6, ul, ol, p, pre, blockquote, img, td, th, address, dl, dd, dt, li, form, span").First()
			
 
				+			if selection.Length() == 0 {
			
 
				+				t.Fatal("Could not find element")
			
 
				+			}
			
 
				+
			
 
				+			candidate := scoreNode(selection)
			
 
				+
			
 
				+			if candidate.score != tc.expectedScore {
			
 
				+				t.Errorf("%s: Expected score %f, got %f", tc.description, tc.expectedScore, candidate.score)
			
 
				+			}
			
 
				+		})
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+func TestScoreNodeEdgeCases(t *testing.T) {
			
 
				+	t.Run("empty selection", func(t *testing.T) {
			
 
				+		doc, err := goquery.NewDocumentFromReader(strings.NewReader(`<div></div>`))
			
 
				+		if err != nil {
			
 
				+			t.Fatal(err)
			
 
				+		}
			
 
				+
			
 
				+		// Create empty selection
			
 
				+		emptySelection := doc.Find("nonexistent")
			
 
				+		if emptySelection.Length() != 0 {
			
 
				+			t.Fatal("Expected empty selection")
			
 
				+		}
			
 
				+
			
 
				+		// scoreNode should handle empty selection gracefully
			
 
				+		candidate := scoreNode(emptySelection)
			
 
				+		if candidate == nil {
			
 
				+			t.Error("Expected non-nil candidate even for empty selection")
			
 
				+		}
			
 
				+
			
 
				+		// Should have score 0 and empty selection
			
 
				+		if candidate != nil && candidate.score != 0 {
			
 
				+			t.Errorf("Expected score 0 for empty selection, got %f", candidate.score)
			
 
				+		}
			
 
				+
			
 
				+		if candidate.selection.Length() != 0 {
			
 
				+			t.Error("Expected candidate to preserve empty selection")
			
 
				+		}
			
 
				+
			
 
				+		// Node() should return nil for empty selection
			
 
				+		if candidate.Node() != nil {
			
 
				+			t.Error("Expected Node() to return nil for empty selection")
			
 
				+		}
			
 
				+
			
 
				+		// String() should handle empty selection gracefully
			
 
				+		str := candidate.String()
			
 
				+		expected := "empty => 0.000000"
			
 
				+		if str != expected {
			
 
				+			t.Errorf("Expected String() to return %q, got %q", expected, str)
			
 
				+		}
			
 
				+	})
			
 
				+
			
 
				+	t.Run("multiple elements in selection", func(t *testing.T) {
			
 
				+		html := `<div>
			
 
				+			<p class="article">First paragraph</p>
			
 
				+			<p class="sidebar">Second paragraph</p>
			
 
				+		</div>`
			
 
				+
			
 
				+		doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
			
 
				+		if err != nil {
			
 
				+			t.Fatal(err)
			
 
				+		}
			
 
				+
			
 
				+		// Select all p elements
			
 
				+		selection := doc.Find("p")
			
 
				+		if selection.Length() != 2 {
			
 
				+			t.Fatalf("Expected 2 p elements, got %d", selection.Length())
			
 
				+		}
			
 
				+
			
 
				+		// scoreNode should only consider the first element in the selection
			
 
				+		candidate := scoreNode(selection)
			
 
				+
			
 
				+		// Should score based on first p element (class="article")
			
 
				+		expectedScore := float32(25) // 0 (p) + 25 (positive class)
			
 
				+		if candidate.score != expectedScore {
			
 
				+			t.Errorf("Expected score %f, got %f", expectedScore, candidate.score)
			
 
				+		}
			
 
				+
			
 
				+		if candidate.Node() == nil {
			
 
				+			t.Error("Expected valid node, got nil")
			
 
				+		} else if candidate.Node().Data != "p" {
			
 
				+			t.Errorf("Expected node tag p, got %s", candidate.Node().Data)
			
 
				+		}
			
 
				+	})
			
 
				+
			
 
				+	t.Run("nested elements", func(t *testing.T) {
			
 
				+		html := `<div class="article">
			
 
				+			<p class="content">
			
 
				+				<span class="highlight">Text</span>
			
 
				+			</p>
			
 
				+		</div>`
			
 
				+
			
 
				+		doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
			
 
				+		if err != nil {
			
 
				+			t.Fatal(err)
			
 
				+		}
			
 
				+
			
 
				+		// Test scoring each level
			
 
				+		divSelection := doc.Find("div")
			
 
				+		divCandidate := scoreNode(divSelection)
			
 
				+		expectedDivScore := float32(30) // 5 (div) + 25 (positive class)
			
 
				+		if divCandidate.score != expectedDivScore {
			
 
				+			t.Errorf("Div score: expected %f, got %f", expectedDivScore, divCandidate.score)
			
 
				+		}
			
 
				+
			
 
				+		pSelection := doc.Find("p")
			
 
				+		pCandidate := scoreNode(pSelection)
			
 
				+		expectedPScore := float32(25) // 0 (p) + 25 (positive class)
			
 
				+		if pCandidate.score != expectedPScore {
			
 
				+			t.Errorf("P score: expected %f, got %f", expectedPScore, pCandidate.score)
			
 
				+		}
			
 
				+
			
 
				+		spanSelection := doc.Find("span")
			
 
				+		spanCandidate := scoreNode(spanSelection)
			
 
				+		expectedSpanScore := float32(0) // 0 (span) + 0 (neutral class)
			
 
				+		if spanCandidate.score != expectedSpanScore {
			
 
				+			t.Errorf("Span score: expected %f, got %f", expectedSpanScore, spanCandidate.score)
			
 
				+		}
			
 
				+	})
			
 
				+}
			
 
				+
			
 
				+func TestTransformMisusedDivsIntoParagraphs(t *testing.T) {
			
 
				+	testCases := []struct {
			
 
				+		name        string
			
 
				+		input       string
			
 
				+		expected    string
			
 
				+		description string
			
 
				+	}{
			
 
				+		{
			
 
				+			name:        "div with only text should become paragraph",
			
 
				+			input:       `<div>Simple text content</div>`,
			
 
				+			expected:    `<p>Simple text content</p>`,
			
 
				+			description: "div containing only text should be converted to p",
			
 
				+		},
			
 
				+		{
			
 
				+			name:        "div with inline elements should become paragraph",
			
 
				+			input:       `<div>Text with <span>inline</span> and <em>emphasis</em></div>`,
			
 
				+			expected:    `<p>Text with <span>inline</span> and <em>emphasis</em></p>`,
			
 
				+			description: "div with inline elements should be converted to p",
			
 
				+		},
			
 
				+		{
			
 
				+			name:        "div with strong and other inline elements",
			
 
				+			input:       `<div>Some <strong>bold</strong> and <i>italic</i> text</div>`,
			
 
				+			expected:    `<p>Some <strong>bold</strong> and <i>italic</i> text</p>`,
			
 
				+			description: "div with inline formatting should be converted to p",
			
 
				+		},
			
 
				+		{
			
 
				+			name:        "div with anchor tag should NOT become paragraph",
			
 
				+			input:       `<div>Text with <a href="#">link</a></div>`,
			
 
				+			expected:    `<div>Text with <a href="#">link</a></div>`,
			
 
				+			description: "div containing anchor tag should remain div (matches regex)",
			
 
				+		},
			
 
				+		{
			
 
				+			name:        "div with paragraph should NOT become paragraph",
			
 
				+			input:       `<div><p>Nested paragraph</p></div>`,
			
 
				+			expected:    `<div><p>Nested paragraph</p></div>`,
			
 
				+			description: "div containing p tag should remain div",
			
 
				+		},
			
 
				+		{
			
 
				+			name:        "div with blockquote should NOT become paragraph",
			
 
				+			input:       `<div><blockquote>Quote</blockquote></div>`,
			
 
				+			expected:    `<div><blockquote>Quote</blockquote></div>`,
			
 
				+			description: "div containing blockquote should remain div",
			
 
				+		},
			
 
				+		{
			
 
				+			name:        "div with nested div should NOT become paragraph",
			
 
				+			input:       `<div><div>Nested div</div></div>`,
			
 
				+			expected:    `<div><p>Nested div</p></div>`,
			
 
				+			description: "outer div has nested div (matches regex), inner div has text only (gets converted)",
			
 
				+		},
			
 
				+		{
			
 
				+			name:        "div with img should NOT become paragraph",
			
 
				+			input:       `<div><img src="test.jpg" alt="test"></div>`,
			
 
				+			expected:    `<div><img src="test.jpg" alt="test"/></div>`,
			
 
				+			description: "div containing img should remain div",
			
 
				+		},
			
 
				+		{
			
 
				+			name:        "div with ol should NOT become paragraph",
			
 
				+			input:       `<div><ol><li>Item</li></ol></div>`,
			
 
				+			expected:    `<div><ol><li>Item</li></ol></div>`,
			
 
				+			description: "div containing ol should remain div",
			
 
				+		},
			
 
				+		{
			
 
				+			name:        "div with ul should NOT become paragraph",
			
 
				+			input:       `<div><ul><li>Item</li></ul></div>`,
			
 
				+			expected:    `<div><ul><li>Item</li></ul></div>`,
			
 
				+			description: "div containing ul should remain div",
			
 
				+		},
			
 
				+		{
			
 
				+			name:        "div with pre should NOT become paragraph",
			
 
				+			input:       `<div><pre>Code block</pre></div>`,
			
 
				+			expected:    `<div><pre>Code block</pre></div>`,
			
 
				+			description: "div containing pre should remain div",
			
 
				+		},
			
 
				+		{
			
 
				+			name:        "div with table should NOT become paragraph",
			
 
				+			input:       `<div><table><tr><td>Cell</td></tr></table></div>`,
			
 
				+			expected:    `<div><table><tbody><tr><td>Cell</td></tr></tbody></table></div>`,
			
 
				+			description: "div containing table should remain div (note: GoQuery adds tbody)",
			
 
				+		},
			
 
				+		{
			
 
				+			name:        "div with dl should NOT become paragraph",
			
 
				+			input:       `<div><dl><dt>Term</dt><dd>Definition</dd></dl></div>`,
			
 
				+			expected:    `<div><dl><dt>Term</dt><dd>Definition</dd></dl></div>`,
			
 
				+			description: "div containing dl should remain div",
			
 
				+		},
			
 
				+		{
			
 
				+			name:        "empty div should become paragraph",
			
 
				+			input:       `<div></div>`,
			
 
				+			expected:    `<p></p>`,
			
 
				+			description: "empty div should be converted to p",
			
 
				+		},
			
 
				+		{
			
 
				+			name:        "div with only whitespace should become paragraph",
			
 
				+			input:       `<div>   </div>`,
			
 
				+			expected:    `<p>   </p>`,
			
 
				+			description: "div with only whitespace should be converted to p",
			
 
				+		},
			
 
				+		{
			
 
				+			name:        "div with self-closing anchor tag should NOT become paragraph",
			
 
				+			input:       `<div>Text <a/> more text</div>`,
			
 
				+			expected:    `<div>Text <a> more text</a></div>`,
			
 
				+			description: "div with self-closing anchor should remain div (note: GoQuery normalizes self-closing tags)",
			
 
				+		},
			
 
				+		{
			
 
				+			name:        "case insensitive matching - uppercase A",
			
 
				+			input:       `<div>Text with <A href="#">link</A></div>`,
			
 
				+			expected:    `<div>Text with <a href="#">link</a></div>`,
			
 
				+			description: "regex should be case insensitive (note: GoQuery normalizes case)",
			
 
				+		},
			
 
				+		{
			
 
				+			name:        "case insensitive matching - uppercase IMG",
			
 
				+			input:       `<div><IMG src="test.jpg"></div>`,
			
 
				+			expected:    `<div><img src="test.jpg"/></div>`,
			
 
				+			description: "regex should be case insensitive (note: GoQuery normalizes case)",
			
 
				+		},
			
 
				+		{
			
 
				+			name:        "multiple divs transformation",
			
 
				+			input:       `<div>Text only</div><div><p>Has paragraph</p></div><div>More text</div>`,
			
 
				+			expected:    `<p>Text only</p><div><p>Has paragraph</p></div><p>More text</p>`,
			
 
				+			description: "should transform multiple divs appropriately",
			
 
				+		},
			
 
				+		{
			
 
				+			name:        "nested divs where inner gets transformed",
			
 
				+			input:       `<div><div>Inner text only</div><p>Paragraph</p></div>`,
			
 
				+			expected:    `<div><p>Inner text only</p><p>Paragraph</p></div>`,
			
 
				+			description: "inner div should be transformed even if outer div isn't",
			
 
				+		},
			
 
				+	}
			
 
				+
			
 
				+	for _, tc := range testCases {
			
 
				+		t.Run(tc.name, func(t *testing.T) {
			
 
				+			// Wrap input in a basic HTML structure
			
 
				+			html := fmt.Sprintf(`<html><body>%s</body></html>`, tc.input)
			
 
				+
			
 
				+			doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
			
 
				+			if err != nil {
			
 
				+				t.Fatalf("Failed to parse HTML: %v", err)
			
 
				+			}
			
 
				+
			
 
				+			// Apply the transformation
			
 
				+			transformMisusedDivsIntoParagraphs(doc)
			
 
				+
			
 
				+			// Extract the body content
			
 
				+			bodyHtml, err := doc.Find("body").Html()
			
 
				+			if err != nil {
			
 
				+				t.Fatalf("Failed to get body HTML: %v", err)
			
 
				+			}
			
 
				+
			
 
				+			// Clean up whitespace for comparison
			
 
				+			result := strings.TrimSpace(bodyHtml)
			
 
				+			expected := strings.TrimSpace(tc.expected)
			
 
				+
			
 
				+			if result != expected {
			
 
				+				t.Errorf("%s\nExpected: %s\nGot:      %s", tc.description, expected, result)
			
 
				+			}
			
 
				+		})
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+func TestTransformMisusedDivsIntoParagraphsRegexPattern(t *testing.T) {
			
 
				+	// Test the regex pattern directly to ensure it matches the expected elements
			
 
				+	testCases := []struct {
			
 
				+		name        string
			
 
				+		html        string
			
 
				+		shouldMatch bool
			
 
				+		description string
			
 
				+	}{
			
 
				+		{
			
 
				+			name:        "anchor tag",
			
 
				+			html:        `<a href="#">link</a>`,
			
 
				+			shouldMatch: true,
			
 
				+			description: "should match anchor tags",
			
 
				+		},
			
 
				+		{
			
 
				+			name:        "blockquote tag",
			
 
				+			html:        `<blockquote>quote</blockquote>`,
			
 
				+			shouldMatch: true,
			
 
				+			description: "should match blockquote tags",
			
 
				+		},
			
 
				+		{
			
 
				+			name:        "dl tag",
			
 
				+			html:        `<dl><dt>term</dt></dl>`,
			
 
				+			shouldMatch: true,
			
 
				+			description: "should match dl tags",
			
 
				+		},
			
 
				+		{
			
 
				+			name:        "div tag",
			
 
				+			html:        `<div>content</div>`,
			
 
				+			shouldMatch: true,
			
 
				+			description: "should match div tags",
			
 
				+		},
			
 
				+		{
			
 
				+			name:        "img tag",
			
 
				+			html:        `<img src="test.jpg">`,
			
 
				+			shouldMatch: true,
			
 
				+			description: "should match img tags",
			
 
				+		},
			
 
				+		{
			
 
				+			name:        "ol tag",
			
 
				+			html:        `<ol><li>item</li></ol>`,
			
 
				+			shouldMatch: true,
			
 
				+			description: "should match ol tags",
			
 
				+		},
			
 
				+		{
			
 
				+			name:        "p tag",
			
 
				+			html:        `<p>paragraph</p>`,
			
 
				+			shouldMatch: true,
			
 
				+			description: "should match p tags",
			
 
				+		},
			
 
				+		{
			
 
				+			name:        "pre tag",
			
 
				+			html:        `<pre>code</pre>`,
			
 
				+			shouldMatch: true,
			
 
				+			description: "should match pre tags",
			
 
				+		},
			
 
				+		{
			
 
				+			name:        "table tag",
			
 
				+			html:        `<table><tr></tr></table>`,
			
 
				+			shouldMatch: true,
			
 
				+			description: "should match table tags",
			
 
				+		},
			
 
				+		{
			
 
				+			name:        "ul tag",
			
 
				+			html:        `<ul><li>item</li></ul>`,
			
 
				+			shouldMatch: true,
			
 
				+			description: "should match ul tags",
			
 
				+		},
			
 
				+		{
			
 
				+			name:        "self-closing anchor",
			
 
				+			html:        `<a/>`,
			
 
				+			shouldMatch: true,
			
 
				+			description: "should match self-closing anchor tags",
			
 
				+		},
			
 
				+		{
			
 
				+			name:        "tag with attributes",
			
 
				+			html:        `<a href="#" class="link">text</a>`,
			
 
				+			shouldMatch: true,
			
 
				+			description: "should match tags with attributes",
			
 
				+		},
			
 
				+		{
			
 
				+			name:        "uppercase tags",
			
 
				+			html:        `<A href="#">link</A>`,
			
 
				+			shouldMatch: true,
			
 
				+			description: "should be case insensitive",
			
 
				+		},
			
 
				+		{
			
 
				+			name:        "mixed case tags",
			
 
				+			html:        `<Img src="test.jpg">`,
			
 
				+			shouldMatch: true,
			
 
				+			description: "should match mixed case tags",
			
 
				+		},
			
 
				+		{
			
 
				+			name:        "span tag",
			
 
				+			html:        `<span>text</span>`,
			
 
				+			shouldMatch: false,
			
 
				+			description: "should NOT match span tags",
			
 
				+		},
			
 
				+		{
			
 
				+			name:        "em tag",
			
 
				+			html:        `<em>emphasis</em>`,
			
 
				+			shouldMatch: false,
			
 
				+			description: "should NOT match em tags",
			
 
				+		},
			
 
				+		{
			
 
				+			name:        "strong tag",
			
 
				+			html:        `<strong>bold</strong>`,
			
 
				+			shouldMatch: false,
			
 
				+			description: "should NOT match strong tags",
			
 
				+		},
			
 
				+		{
			
 
				+			name:        "i tag",
			
 
				+			html:        `<i>italic</i>`,
			
 
				+			shouldMatch: false,
			
 
				+			description: "should NOT match i tags",
			
 
				+		},
			
 
				+		{
			
 
				+			name:        "b tag",
			
 
				+			html:        `<b>bold</b>`,
			
 
				+			shouldMatch: false,
			
 
				+			description: "should NOT match b tags",
			
 
				+		},
			
 
				+		{
			
 
				+			name:        "plain text",
			
 
				+			html:        `just plain text`,
			
 
				+			shouldMatch: false,
			
 
				+			description: "should NOT match plain text",
			
 
				+		},
			
 
				+		{
			
 
				+			name:        "empty string",
			
 
				+			html:        ``,
			
 
				+			shouldMatch: false,
			
 
				+			description: "should NOT match empty string",
			
 
				+		},
			
 
				+	}
			
 
				+
			
 
				+	for _, tc := range testCases {
			
 
				+		t.Run(tc.name, func(t *testing.T) {
			
 
				+			result := divToPElementsRegexp.MatchString(tc.html)
			
 
				+			if result != tc.shouldMatch {
			
 
				+				t.Errorf("%s\nHTML: %s\nExpected match: %v, Got: %v", tc.description, tc.html, tc.shouldMatch, result)
			
 
				+			}
			
 
				+		})
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+func TestTransformMisusedDivsIntoParagraphsEdgeCases(t *testing.T) {
			
 
				+	t.Run("document with no divs", func(t *testing.T) {
			
 
				+		html := `<html><body><p>No divs here</p><span>Just other elements</span></body></html>`
			
 
				+
			
 
				+		doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
			
 
				+		if err != nil {
			
 
				+			t.Fatal(err)
			
 
				+		}
			
 
				+
			
 
				+		// Should not panic or cause issues
			
 
				+		transformMisusedDivsIntoParagraphs(doc)
			
 
				+
			
 
				+		bodyHtml, _ := doc.Find("body").Html()
			
 
				+		expected := `<p>No divs here</p><span>Just other elements</span>`
			
 
				+
			
 
				+		if strings.TrimSpace(bodyHtml) != expected {
			
 
				+			t.Errorf("Expected no changes to document without divs")
			
 
				+		}
			
 
				+	})
			
 
				+
			
 
				+	t.Run("empty document", func(t *testing.T) {
			
 
				+		html := `<html><body></body></html>`
			
 
				+
			
 
				+		doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
			
 
				+		if err != nil {
			
 
				+			t.Fatal(err)
			
 
				+		}
			
 
				+
			
 
				+		// Should not panic with empty document
			
 
				+		transformMisusedDivsIntoParagraphs(doc)
			
 
				+
			
 
				+		bodyHtml, _ := doc.Find("body").Html()
			
 
				+		if strings.TrimSpace(bodyHtml) != "" {
			
 
				+			t.Errorf("Expected empty body to remain empty")
			
 
				 		}
			
 
				+	})
			
 
				+
			
 
				+	t.Run("deeply nested divs", func(t *testing.T) {
			
 
				+		html := `<html><body><div><div><div>Deep text</div></div></div></body></html>`
			
 
				+
			
 
				+		doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
			
 
				+		if err != nil {
			
 
				+			t.Fatal(err)
			
 
				+		}
			
 
				+
			
 
				+		transformMisusedDivsIntoParagraphs(doc)
			
 
				+
			
 
				+		bodyHtml, _ := doc.Find("body").Html()
			
 
				+		// The outer divs contain other divs (matches regex), so they remain divs
			
 
				+		// Only the innermost div with just text gets converted to p
			
 
				+		expected := `<div><div><p>Deep text</p></div></div>`
			
 
				+
			
 
				+		if strings.TrimSpace(bodyHtml) != expected {
			
 
				+			t.Errorf("Expected nested div transformation\nGot: %s\nExpected: %s", strings.TrimSpace(bodyHtml), expected)
			
 
				+		}
			
 
				+	})
			
 
				+
			
 
				+	t.Run("complex mixed content", func(t *testing.T) {
			
 
				+		html := `<html><body>
			
 
				+			<div>Text only div</div>
			
 
				+			<div><a href="#">Link div</a></div>
			
 
				+			<div><span>Inline</span> text</div>
			
 
				+			<div><p>Block element</p></div>
			
 
				+		</body></html>`
			
 
				+
			
 
				+		doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
			
 
				+		if err != nil {
			
 
				+			t.Fatal(err)
			
 
				+		}
			
 
				+
			
 
				+		transformMisusedDivsIntoParagraphs(doc)
			
 
				+
			
 
				+		// Count paragraphs and divs
			
 
				+		pCount := doc.Find("p").Length()
			
 
				+		divCount := doc.Find("div").Length()
			
 
				+
			
 
				+		// Should have 3 paragraphs (original p + 2 converted divs) and 2 divs (link div + block element div)
			
 
				+		expectedPCount := 3
			
 
				+		expectedDivCount := 2
			
 
				+
			
 
				+		if pCount != expectedPCount {
			
 
				+			t.Errorf("Expected %d paragraphs, got %d", expectedPCount, pCount)
			
 
				+		}
			
 
				+		if divCount != expectedDivCount {
			
 
				+			t.Errorf("Expected %d divs, got %d", expectedDivCount, divCount)
			
 
				+		}
			
 
				+	})
			
 
				+}
			
 
				+
			
 
				+func TestCandidateString(t *testing.T) {
			
 
				+	testCases := []struct {
			
 
				+		name     string
			
 
				+		html     string
			
 
				+		expected string
			
 
				+		setup    func(*goquery.Document) *candidate
			
 
				+	}{
			
 
				+		{
			
 
				+			name:     "empty candidate",
			
 
				+			html:     `<div></div>`,
			
 
				+			expected: "empty => 0.000000",
			
 
				+			setup: func(doc *goquery.Document) *candidate {
			
 
				+				emptySelection := doc.Find("nonexistent")
			
 
				+				return &candidate{selection: emptySelection, score: 0}
			
 
				+			},
			
 
				+		},
			
 
				+		{
			
 
				+			name:     "candidate with no class or id",
			
 
				+			html:     `<div>Content</div>`,
			
 
				+			expected: "div => 5.000000",
			
 
				+			setup: func(doc *goquery.Document) *candidate {
			
 
				+				selection := doc.Find("div")
			
 
				+				return scoreNode(selection)
			
 
				+			},
			
 
				+		},
			
 
				+		{
			
 
				+			name:     "candidate with class only",
			
 
				+			html:     `<div class="content">Content</div>`,
			
 
				+			expected: "div.content => 30.000000",
			
 
				+			setup: func(doc *goquery.Document) *candidate {
			
 
				+				selection := doc.Find("div")
			
 
				+				return scoreNode(selection)
			
 
				+			},
			
 
				+		},
			
 
				+		{
			
 
				+			name:     "candidate with id only",
			
 
				+			html:     `<div id="main">Content</div>`,
			
 
				+			expected: "div#main => 30.000000",
			
 
				+			setup: func(doc *goquery.Document) *candidate {
			
 
				+				selection := doc.Find("div")
			
 
				+				return scoreNode(selection)
			
 
				+			},
			
 
				+		},
			
 
				+		{
			
 
				+			name:     "candidate with both class and id",
			
 
				+			html:     `<div class="content" id="main">Content</div>`,
			
 
				+			expected: "div#main.content => 55.000000",
			
 
				+			setup: func(doc *goquery.Document) *candidate {
			
 
				+				selection := doc.Find("div")
			
 
				+				return scoreNode(selection)
			
 
				+			},
			
 
				+		},
			
 
				+		{
			
 
				+			name:     "candidate with multiple classes",
			
 
				+			html:     `<div class="article main content">Content</div>`,
			
 
				+			expected: "div.article main content => 30.000000",
			
 
				+			setup: func(doc *goquery.Document) *candidate {
			
 
				+				selection := doc.Find("div")
			
 
				+				return scoreNode(selection)
			
 
				+			},
			
 
				+		},
			
 
				+		{
			
 
				+			name:     "paragraph candidate with negative class",
			
 
				+			html:     `<p class="comment">Comment text</p>`,
			
 
				+			expected: "p.comment => -25.000000",
			
 
				+			setup: func(doc *goquery.Document) *candidate {
			
 
				+				selection := doc.Find("p")
			
 
				+				return scoreNode(selection)
			
 
				+			},
			
 
				+		},
			
 
				+		{
			
 
				+			name:     "heading candidate with positive id",
			
 
				+			html:     `<h1 id="main">Heading</h1>`,
			
 
				+			expected: "h1#main => 20.000000",
			
 
				+			setup: func(doc *goquery.Document) *candidate {
			
 
				+				selection := doc.Find("h1")
			
 
				+				return scoreNode(selection)
			
 
				+			},
			
 
				+		},
			
 
				+		{
			
 
				+			name:     "candidate with special characters in class",
			
 
				+			html:     `<div class="my-class_name">Content</div>`,
			
 
				+			expected: "div.my-class_name => 5.000000",
			
 
				+			setup: func(doc *goquery.Document) *candidate {
			
 
				+				selection := doc.Find("div")
			
 
				+				return scoreNode(selection)
			
 
				+			},
			
 
				+		},
			
 
				+		{
			
 
				+			name:     "candidate with empty class attribute",
			
 
				+			html:     `<div class="">Content</div>`,
			
 
				+			expected: "div => 5.000000",
			
 
				+			setup: func(doc *goquery.Document) *candidate {
			
 
				+				selection := doc.Find("div")
			
 
				+				return scoreNode(selection)
			
 
				+			},
			
 
				+		},
			
 
				+		{
			
 
				+			name:     "candidate with empty id attribute",
			
 
				+			html:     `<div id="">Content</div>`,
			
 
				+			expected: "div => 5.000000",
			
 
				+			setup: func(doc *goquery.Document) *candidate {
			
 
				+				selection := doc.Find("div")
			
 
				+				return scoreNode(selection)
			
 
				+			},
			
 
				+		},
			
 
				+		{
			
 
				+			name:     "custom score candidate",
			
 
				+			html:     `<span>Content</span>`,
			
 
				+			expected: "span => 42.500000",
			
 
				+			setup: func(doc *goquery.Document) *candidate {
			
 
				+				selection := doc.Find("span")
			
 
				+				c := scoreNode(selection)
			
 
				+				c.score = 42.5 // Override score for testing
			
 
				+				return c
			
 
				+			},
			
 
				+		},
			
 
				+	}
			
 
				+
			
 
				+	for _, tc := range testCases {
			
 
				+		t.Run(tc.name, func(t *testing.T) {
			
 
				+			doc, err := goquery.NewDocumentFromReader(strings.NewReader(tc.html))
			
 
				+			if err != nil {
			
 
				+				t.Fatalf("Failed to parse HTML: %v", err)
			
 
				+			}
			
 
				+
			
 
				+			candidate := tc.setup(doc)
			
 
				+			result := candidate.String()
			
 
				+
			
 
				+			if result != tc.expected {
			
 
				+				t.Errorf("Expected: %s, Got: %s", tc.expected, result)
			
 
				+			}
			
 
				+		})
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+func TestCandidateListString(t *testing.T) {
			
 
				+	testCases := []struct {
			
 
				+		name     string
			
 
				+		html     string
			
 
				+		expected string
			
 
				+		setup    func(*goquery.Document) candidateList
			
 
				+	}{
			
 
				+		{
			
 
				+			name:     "empty candidate list",
			
 
				+			html:     `<div></div>`,
			
 
				+			expected: "",
			
 
				+			setup: func(doc *goquery.Document) candidateList {
			
 
				+				return make(candidateList)
			
 
				+			},
			
 
				+		},
			
 
				+		{
			
 
				+			name:     "single candidate",
			
 
				+			html:     `<div class="content">Content</div>`,
			
 
				+			expected: "div.content => 30.000000",
			
 
				+			setup: func(doc *goquery.Document) candidateList {
			
 
				+				candidates := make(candidateList)
			
 
				+				selection := doc.Find("div")
			
 
				+				candidate := scoreNode(selection)
			
 
				+				candidates[selection.Get(0)] = candidate
			
 
				+				return candidates
			
 
				+			},
			
 
				+		},
			
 
				+		{
			
 
				+			name: "multiple candidates",
			
 
				+			html: `<div class="content">Content</div><p class="text">Paragraph</p><h1 id="main">Title</h1>`,
			
 
				+			setup: func(doc *goquery.Document) candidateList {
			
 
				+				candidates := make(candidateList)
			
 
				+
			
 
				+				divSelection := doc.Find("div")
			
 
				+				divCandidate := scoreNode(divSelection)
			
 
				+				candidates[divSelection.Get(0)] = divCandidate
			
 
				+
			
 
				+				pSelection := doc.Find("p")
			
 
				+				pCandidate := scoreNode(pSelection)
			
 
				+				candidates[pSelection.Get(0)] = pCandidate
			
 
				+
			
 
				+				h1Selection := doc.Find("h1")
			
 
				+				h1Candidate := scoreNode(h1Selection)
			
 
				+				candidates[h1Selection.Get(0)] = h1Candidate
			
 
				+
			
 
				+				return candidates
			
 
				+			},
			
 
				+		},
			
 
				+		{
			
 
				+			name: "candidates with mixed scores",
			
 
				+			html: `<div class="comment">Comment</div><p class="content">Good content</p>`,
			
 
				+			setup: func(doc *goquery.Document) candidateList {
			
 
				+				candidates := make(candidateList)
			
 
				+
			
 
				+				divSelection := doc.Find("div")
			
 
				+				divCandidate := scoreNode(divSelection)
			
 
				+				candidates[divSelection.Get(0)] = divCandidate
			
 
				+
			
 
				+				pSelection := doc.Find("p")
			
 
				+				pCandidate := scoreNode(pSelection)
			
 
				+				candidates[pSelection.Get(0)] = pCandidate
			
 
				+
			
 
				+				return candidates
			
 
				+			},
			
 
				+		},
			
 
				+		{
			
 
				+			name: "candidate with empty selection",
			
 
				+			html: `<div>Test</div>`,
			
 
				+			setup: func(doc *goquery.Document) candidateList {
			
 
				+				candidates := make(candidateList)
			
 
				+
			
 
				+				// Add a regular candidate
			
 
				+				divSelection := doc.Find("div")
			
 
				+				divCandidate := scoreNode(divSelection)
			
 
				+				candidates[divSelection.Get(0)] = divCandidate
			
 
				+
			
 
				+				// Add a candidate with empty selection (this is artificial but tests the edge case)
			
 
				+				emptySelection := doc.Find("nonexistent")
			
 
				+				emptyCandidate := &candidate{selection: emptySelection, score: 0}
			
 
				+				// We can't use emptySelection.Get(0) as key since it would panic,
			
 
				+				// so we'll create a dummy node for this test
			
 
				+				dummyNode := &html.Node{Type: html.ElementNode, Data: "dummy"}
			
 
				+				candidates[dummyNode] = emptyCandidate
			
 
				+
			
 
				+				return candidates
			
 
				+			},
			
 
				+		},
			
 
				+	}
			
 
				+
			
 
				+	for _, tc := range testCases {
			
 
				+		t.Run(tc.name, func(t *testing.T) {
			
 
				+			doc, err := goquery.NewDocumentFromReader(strings.NewReader(tc.html))
			
 
				+			if err != nil {
			
 
				+				t.Fatalf("Failed to parse HTML: %v", err)
			
 
				+			}
			
 
				+
			
 
				+			candidates := tc.setup(doc)
			
 
				+			result := candidates.String()
			
 
				+
			
 
				+			if tc.name == "empty candidate list" {
			
 
				+				if result != tc.expected {
			
 
				+					t.Errorf("Expected: %s, Got: %s", tc.expected, result)
			
 
				+				}
			
 
				+				return
			
 
				+			}
			
 
				+
			
 
				+			// For multiple candidates, we need to check that all expected parts are present
			
 
				+			// since map iteration order is not guaranteed
			
 
				+			switch tc.name {
			
 
				+			case "multiple candidates":
			
 
				+				expectedParts := []string{"div.content => 30.000000", "p.text => 25.000000", "h1#main => 20.000000"}
			
 
				+				for _, part := range expectedParts {
			
 
				+					if !strings.Contains(result, part) {
			
 
				+						t.Errorf("Expected result to contain: %s, Got: %s", part, result)
			
 
				+					}
			
 
				+				}
			
 
				+				// Check that it's comma-separated
			
 
				+				if !strings.Contains(result, ", ") {
			
 
				+					t.Errorf("Expected comma-separated format, Got: %s", result)
			
 
				+				}
			
 
				+			case "candidates with mixed scores":
			
 
				+				expectedParts := []string{"div.comment => -20.000000", "p.content => 25.000000"}
			
 
				+				for _, part := range expectedParts {
			
 
				+					if !strings.Contains(result, part) {
			
 
				+						t.Errorf("Expected result to contain: %s, Got: %s", part, result)
			
 
				+					}
			
 
				+				}
			
 
				+			case "candidate with empty selection":
			
 
				+				// Should contain both the regular candidate and the empty one
			
 
				+				if !strings.Contains(result, "div => 5.000000") {
			
 
				+					t.Errorf("Expected result to contain div candidate, Got: %s", result)
			
 
				+				}
			
 
				+				if !strings.Contains(result, "empty => 0.000000") {
			
 
				+					t.Errorf("Expected result to contain empty candidate, Got: %s", result)
			
 
				+				}
			
 
				+			default:
			
 
				+				// Single candidate test cases
			
 
				+				if result != tc.expected {
			
 
				+					t.Errorf("Expected: %s, Got: %s", tc.expected, result)
			
 
				+				}
			
 
				+			}
			
 
				+		})
			
 
				 	}
			
 
				 }
			
 
				+
			
 
				+func TestCandidateStringEdgeCases(t *testing.T) {
			
 
				+	t.Run("candidate with nil node but valid selection", func(t *testing.T) {
			
 
				+		// This tests the case where Node() returns nil but selection exists
			
 
				+		html := `<div>Test</div>`
			
 
				+		doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
			
 
				+		if err != nil {
			
 
				+			t.Fatal(err)
			
 
				+		}
			
 
				+
			
 
				+		emptySelection := doc.Find("nonexistent")
			
 
				+		candidate := &candidate{
			
 
				+			selection: emptySelection,
			
 
				+			score:     10.5,
			
 
				+		}
			
 
				+
			
 
				+		result := candidate.String()
			
 
				+		expected := "empty => 10.500000"
			
 
				+
			
 
				+		if result != expected {
			
 
				+			t.Errorf("Expected: %s, Got: %s", expected, result)
			
 
				+		}
			
 
				+	})
			
 
				+
			
 
				+	t.Run("candidate with zero score", func(t *testing.T) {
			
 
				+		html := `<div>Test</div>`
			
 
				+		doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
			
 
				+		if err != nil {
			
 
				+			t.Fatal(err)
			
 
				+		}
			
 
				+
			
 
				+		selection := doc.Find("div")
			
 
				+		candidate := &candidate{
			
 
				+			selection: selection,
			
 
				+			score:     0,
			
 
				+		}
			
 
				+
			
 
				+		result := candidate.String()
			
 
				+		expected := "div => 0.000000"
			
 
				+
			
 
				+		if result != expected {
			
 
				+			t.Errorf("Expected: %s, Got: %s", expected, result)
			
 
				+		}
			
 
				+	})
			
 
				+
			
 
				+	t.Run("candidate with negative score", func(t *testing.T) {
			
 
				+		html := `<h1>Test</h1>`
			
 
				+		doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
			
 
				+		if err != nil {
			
 
				+			t.Fatal(err)
			
 
				+		}
			
 
				+
			
 
				+		selection := doc.Find("h1")
			
 
				+		candidate := &candidate{
			
 
				+			selection: selection,
			
 
				+			score:     -10.5,
			
 
				+		}
			
 
				+
			
 
				+		result := candidate.String()
			
 
				+		expected := "h1 => -10.500000"
			
 
				+
			
 
				+		if result != expected {
			
 
				+			t.Errorf("Expected: %s, Got: %s", expected, result)
			
 
				+		}
			
 
				+	})
			
 
				+
			
 
				+	t.Run("candidate with very long class and id", func(t *testing.T) {
			
 
				+		html := `<div class="very-long-class-name-that-might-cause-issues" id="very-long-id-name-that-might-also-cause-formatting-issues">Test</div>`
			
 
				+		doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
			
 
				+		if err != nil {
			
 
				+			t.Fatal(err)
			
 
				+		}
			
 
				+
			
 
				+		selection := doc.Find("div")
			
 
				+		candidate := scoreNode(selection)
			
 
				+
			
 
				+		result := candidate.String()
			
 
				+		expected := "div#very-long-id-name-that-might-also-cause-formatting-issues.very-long-class-name-that-might-cause-issues => 5.000000"
			
 
				+
			
 
				+		if result != expected {
			
 
				+			t.Errorf("Expected: %s, Got: %s", expected, result)
			
 
				+		}
			
 
				+	})
			
 
				+}