Bladeren bron

test(readability): increase test coverage

Frédéric Guillot 9 maanden geleden
bovenliggende
commit
6eeccae7cd
2 gewijzigde bestanden met toevoegingen van 1208 en 45 verwijderingen
  1. 29 15
      internal/reader/readability/readability.go
  2. 1179 30
      internal/reader/readability/readability_test.go

+ 29 - 15
internal/reader/readability/readability.go

@@ -27,8 +27,8 @@ var (
 	maybeCandidate    = [...]string{"and", "article", "body", "column", "main", "shadow"}
 	unlikelyCandidate = [...]string{"banner", "breadcrumbs", "combx", "comment", "community", "cover-wrap", "disqus", "extra", "foot", "header", "legends", "menu", "modal", "related", "remark", "replies", "rss", "shoutbox", "sidebar", "skyscraper", "social", "sponsor", "supplemental", "ad-break", "agegate", "pagination", "pager", "popup", "yom-remote"}
 
-	positive = [...]string{"article", "blog", "body", "content", "entry", "h-entry", "hentry", "main", "page", "pagination", "post", "story", "text"}
-	negative = [...]string{"author", "banner", "byline", "com-", "combx", "comment", "contact", "dateline", "foot", "hid", "masthead", "media", "meta", "modal", "outbrain", "promo", "related", "scroll", "share", "shopping", "shoutbox", "sidebar", "skyscraper", "sponsor", "tags", "tool", "widget", "writtenby"}
+	positiveKeywords = [...]string{"article", "blog", "body", "content", "entry", "h-entry", "hentry", "main", "page", "pagination", "post", "story", "text"}
+	negativeKeywords = [...]string{"author", "banner", "byline", "com-", "combx", "comment", "contact", "dateline", "foot", "hid", "masthead", "media", "meta", "modal", "outbrain", "promo", "related", "scroll", "share", "shopping", "shoutbox", "sidebar", "skyscraper", "sponsor", "tags", "tool", "widget", "writtenby"}
 )
 
 type candidate struct {
@@ -37,23 +37,31 @@ type candidate struct {
 }
 
 func (c *candidate) Node() *html.Node {
+	if c.selection.Length() == 0 {
+		return nil
+	}
 	return c.selection.Get(0)
 }
 
 func (c *candidate) String() string {
+	node := c.Node()
+	if node == nil {
+		return fmt.Sprintf("empty => %f", c.score)
+	}
+
 	id, _ := c.selection.Attr("id")
 	class, _ := c.selection.Attr("class")
 
 	switch {
 	case id != "" && class != "":
-		return fmt.Sprintf("%s#%s.%s => %f", c.Node().DataAtom, id, class, c.score)
+		return fmt.Sprintf("%s#%s.%s => %f", node.DataAtom, id, class, c.score)
 	case id != "":
-		return fmt.Sprintf("%s#%s => %f", c.Node().DataAtom, id, c.score)
+		return fmt.Sprintf("%s#%s => %f", node.DataAtom, id, c.score)
 	case class != "":
-		return fmt.Sprintf("%s.%s => %f", c.Node().DataAtom, class, c.score)
+		return fmt.Sprintf("%s.%s => %f", node.DataAtom, class, c.score)
 	}
 
-	return fmt.Sprintf("%s => %f", c.Node().DataAtom, c.score)
+	return fmt.Sprintf("%s => %f", node.DataAtom, c.score)
 }
 
 type candidateList map[*html.Node]*candidate
@@ -111,7 +119,8 @@ func getArticle(topCandidate *candidate, candidates candidateList) string {
 		tag := "div"
 		node := s.Get(0)
 
-		if node == topCandidate.Node() {
+		topNode := topCandidate.Node()
+		if topNode != nil && node == topNode {
 			append = true
 		} else if c, ok := candidates[node]; ok && c.score >= siblingScoreThreshold {
 			append = true
@@ -147,14 +156,14 @@ func shouldRemoveCandidate(str string) bool {
 	str = strings.ToLower(str)
 
 	// Those candidates have no false-positives, no need to check against `maybeCandidate`
-	for _, strong := range strongCandidates {
-		if strings.Contains(str, strong) {
+	for _, strongCandidate := range strongCandidates {
+		if strings.Contains(str, strongCandidate) {
 			return true
 		}
 	}
 
-	for _, unlikely := range unlikelyCandidate {
-		if strings.Contains(str, unlikely) {
+	for _, unlikelyCandidate := range unlikelyCandidate {
+		if strings.Contains(str, unlikelyCandidate) {
 			// Do we have a false positive?
 			for _, maybe := range maybeCandidate {
 				if strings.Contains(str, maybe) {
@@ -268,6 +277,11 @@ func getCandidates(document *goquery.Document) candidateList {
 func scoreNode(s *goquery.Selection) *candidate {
 	c := &candidate{selection: s, score: 0}
 
+	// Check if selection is empty to avoid panic
+	if s.Length() == 0 {
+		return c
+	}
+
 	switch s.Get(0).DataAtom.String() {
 	case "div":
 		c.score += 5
@@ -314,13 +328,13 @@ func getClassWeight(s *goquery.Selection) float32 {
 
 func getWeight(s string) int {
 	s = strings.ToLower(s)
-	for _, pos := range negative {
-		if strings.Contains(s, pos) {
+	for _, keyword := range negativeKeywords {
+		if strings.Contains(s, keyword) {
 			return -25
 		}
 	}
-	for _, pos := range positive {
-		if strings.Contains(s, pos) {
+	for _, keyword := range positiveKeywords {
+		if strings.Contains(s, keyword) {
 			return +25
 		}
 	}

+ 1179 - 30
internal/reader/readability/readability_test.go

@@ -11,8 +11,60 @@ import (
 	"testing"
 
 	"github.com/PuerkitoBio/goquery"
+	"golang.org/x/net/html"
 )
 
+// BenchmarkExtractContent measures ExtractContent over the HTML fixtures
+// stored in the testdata directory.
+func BenchmarkExtractContent(b *testing.B) {
+	filenames := [...]string{"miniflux_github.html", "miniflux_wikipedia.html"}
+
+	// Load every fixture before the measured loop; a missing file aborts the run.
+	fixtures := make(map[string][]byte, len(filenames))
+	for _, filename := range filenames {
+		data, err := os.ReadFile("testdata/" + filename)
+		if err != nil {
+			b.Fatalf(`Unable to read file %q: %v`, filename, err)
+		}
+		fixtures[filename] = data
+	}
+
+	for range b.N {
+		for _, data := range fixtures {
+			ExtractContent(bytes.NewReader(data))
+		}
+	}
+}
+
+// BenchmarkGetWeight measures getWeight against a fixed set of attribute
+// strings.
+func BenchmarkGetWeight(b *testing.B) {
+	classNames := [...]string{
+		"p-3 color-bg-accent-emphasis color-fg-on-emphasis show-on-focus js-skip-to-content",
+		"d-flex flex-column mb-3",
+		"AppHeader-search-control AppHeader-search-control-overflow",
+		"Button Button--iconOnly Button--invisible Button--medium mr-1 px-2 py-0 d-flex flex-items-center rounded-1 color-fg-muted",
+		"sr-only",
+		"validation-12753bbc-b4d1-4e10-bec6-92e585d1699d",
+	}
+
+	for range b.N {
+		for i := range classNames {
+			getWeight(classNames[i])
+		}
+	}
+}
+
+// BenchmarkTransformMisusedDivsIntoParagraphs measures parsing a small HTML
+// document and running transformMisusedDivsIntoParagraphs on it.
+//
+// A fresh document is parsed on every iteration, presumably because the
+// transformation rewrites the document it is given; the parse cost is
+// therefore part of the measurement.
+func BenchmarkTransformMisusedDivsIntoParagraphs(b *testing.B) {
+	// Named input rather than html so the imported html package is not shadowed.
+	input := `<html><body>
+		<div>Simple text content</div>
+		<div>More <span>inline</span> content</div>
+		<div><a href="#">Link content</a></div>
+		<div><p>Paragraph content</p></div>
+		<div>Another simple text</div>
+	</body></html>`
+
+	b.ResetTimer()
+	for range b.N {
+		doc, err := goquery.NewDocumentFromReader(strings.NewReader(input))
+		if err != nil {
+			// A parse failure would otherwise hand a nil document to the
+			// function under test and panic.
+			b.Fatalf("Unable to parse HTML: %v", err)
+		}
+		transformMisusedDivsIntoParagraphs(doc)
+	}
+}
+
 func TestBaseURL(t *testing.T) {
 	html := `
 		<html>
@@ -189,25 +241,6 @@ func TestNestedSpanInCodeBlock(t *testing.T) {
 	}
 }
 
-func BenchmarkExtractContent(b *testing.B) {
-	var testCases = map[string][]byte{
-		"miniflux_github.html":    {},
-		"miniflux_wikipedia.html": {},
-	}
-	for filename := range testCases {
-		data, err := os.ReadFile("testdata/" + filename)
-		if err != nil {
-			b.Fatalf(`Unable to read file %q: %v`, filename, err)
-		}
-		testCases[filename] = data
-	}
-	for range b.N {
-		for _, v := range testCases {
-			ExtractContent(bytes.NewReader(v))
-		}
-	}
-}
-
 func TestGetClassWeight(t *testing.T) {
 	testCases := []struct {
 		name     string
@@ -1315,18 +1348,1134 @@ func TestContainsSentence(t *testing.T) {
 	}
 }
 
-func BenchmarkGetWeight(b *testing.B) {
-	testCases := []string{
-		"p-3 color-bg-accent-emphasis color-fg-on-emphasis show-on-focus js-skip-to-content",
-		"d-flex flex-column mb-3",
-		"AppHeader-search-control AppHeader-search-control-overflow",
-		"Button Button--iconOnly Button--invisible Button--medium mr-1 px-2 py-0 d-flex flex-items-center rounded-1 color-fg-muted",
-		"sr-only",
-		"validation-12753bbc-b4d1-4e10-bec6-92e585d1699d",
+func TestScoreNode(t *testing.T) {
+	testCases := []struct {
+		name          string
+		html          string
+		expectedScore float32
+		expectedTag   string
+	}{
+		{
+			name:          "div element with no class or id",
+			html:          `<div>Some content</div>`,
+			expectedScore: 5,
+			expectedTag:   "div",
+		},
+		{
+			name:          "pre element with no class or id",
+			html:          `<pre>Some code</pre>`,
+			expectedScore: 3,
+			expectedTag:   "pre",
+		},
+		{
+			name:          "td element with no class or id",
+			html:          `<table><tr><td>Table cell</td></tr></table>`,
+			expectedScore: 3,
+			expectedTag:   "td",
+		},
+		{
+			name:          "blockquote element with no class or id",
+			html:          `<blockquote>Quote</blockquote>`,
+			expectedScore: 3,
+			expectedTag:   "blockquote",
+		},
+		{
+			name:          "img element with no class or id",
+			html:          `<img src="test.jpg" alt="test">`,
+			expectedScore: 3,
+			expectedTag:   "img",
+		},
+		{
+			name:          "ol element with no class or id",
+			html:          `<ol><li>Item</li></ol>`,
+			expectedScore: -3,
+			expectedTag:   "ol",
+		},
+		{
+			name:          "ul element with no class or id",
+			html:          `<ul><li>Item</li></ul>`,
+			expectedScore: -3,
+			expectedTag:   "ul",
+		},
+		{
+			name:          "address element with no class or id",
+			html:          `<address>Contact info</address>`,
+			expectedScore: -3,
+			expectedTag:   "address",
+		},
+		{
+			name:          "dl element with no class or id",
+			html:          `<dl><dt>Term</dt><dd>Definition</dd></dl>`,
+			expectedScore: -3,
+			expectedTag:   "dl",
+		},
+		{
+			name:          "dd element with no class or id",
+			html:          `<dd>Definition</dd>`,
+			expectedScore: -3,
+			expectedTag:   "dd",
+		},
+		{
+			name:          "dt element with no class or id",
+			html:          `<dt>Term</dt>`,
+			expectedScore: -3,
+			expectedTag:   "dt",
+		},
+		{
+			name:          "li element with no class or id",
+			html:          `<li>List item</li>`,
+			expectedScore: -3,
+			expectedTag:   "li",
+		},
+		{
+			name:          "form element with no class or id",
+			html:          `<form>Form content</form>`,
+			expectedScore: -3,
+			expectedTag:   "form",
+		},
+		{
+			name:          "h1 element with no class or id",
+			html:          `<h1>Heading</h1>`,
+			expectedScore: -5,
+			expectedTag:   "h1",
+		},
+		{
+			name:          "h2 element with no class or id",
+			html:          `<h2>Heading</h2>`,
+			expectedScore: -5,
+			expectedTag:   "h2",
+		},
+		{
+			name:          "h3 element with no class or id",
+			html:          `<h3>Heading</h3>`,
+			expectedScore: -5,
+			expectedTag:   "h3",
+		},
+		{
+			name:          "h4 element with no class or id",
+			html:          `<h4>Heading</h4>`,
+			expectedScore: -5,
+			expectedTag:   "h4",
+		},
+		{
+			name:          "h5 element with no class or id",
+			html:          `<h5>Heading</h5>`,
+			expectedScore: -5,
+			expectedTag:   "h5",
+		},
+		{
+			name:          "h6 element with no class or id",
+			html:          `<h6>Heading</h6>`,
+			expectedScore: -5,
+			expectedTag:   "h6",
+		},
+		{
+			name:          "th element with no class or id",
+			html:          `<table><tr><th>Header cell</th></tr></table>`,
+			expectedScore: -5,
+			expectedTag:   "th",
+		},
+		{
+			name:          "p element with no class or id (default case)",
+			html:          `<p>Paragraph content</p>`,
+			expectedScore: 0,
+			expectedTag:   "p",
+		},
+		{
+			name:          "span element with no class or id (default case)",
+			html:          `<span>Span content</span>`,
+			expectedScore: 0,
+			expectedTag:   "span",
+		},
 	}
-	for range b.N {
-		for _, v := range testCases {
-			getWeight(v)
+
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			doc, err := goquery.NewDocumentFromReader(strings.NewReader(tc.html))
+			if err != nil {
+				t.Fatal(err)
+			}
+
+			selection := doc.Find(tc.expectedTag)
+			if selection.Length() == 0 {
+				t.Fatalf("Could not find element with tag %s", tc.expectedTag)
+			}
+
+			candidate := scoreNode(selection)
+
+			if candidate.score != tc.expectedScore {
+				t.Errorf("Expected score %f, got %f", tc.expectedScore, candidate.score)
+			}
+
+			if candidate.selection != selection {
+				t.Error("Expected selection to be preserved in candidate")
+			}
+
+			if candidate.Node() == nil {
+				t.Errorf("Expected valid node, got nil")
+			} else if candidate.Node().Data != tc.expectedTag {
+				t.Errorf("Expected node tag %s, got %s", tc.expectedTag, candidate.Node().Data)
+			}
+		})
+	}
+}
+
+// TestScoreNodeWithClassWeights checks that scoreNode adds the class and id
+// keyword weights to the base score of the element's tag. Each case parses a
+// one-element snippet, scores the first matching element and compares the
+// result with the sum spelled out in the case's trailing comment.
+func TestScoreNodeWithClassWeights(t *testing.T) {
+	testCases := []struct {
+		name          string
+		html          string
+		expectedScore float32
+		description   string
+	}{
+		{
+			name:          "div with positive class",
+			html:          `<div class="content">Content</div>`,
+			expectedScore: 30, // 5 (div) + 25 (positive class)
+			description:   "div base score + positive class weight",
+		},
+		{
+			name:          "div with negative class",
+			html:          `<div class="comment">Content</div>`,
+			expectedScore: -20, // 5 (div) + (-25) (negative class)
+			description:   "div base score + negative class weight",
+		},
+		{
+			name:          "div with positive id",
+			html:          `<div id="main">Content</div>`,
+			expectedScore: 30, // 5 (div) + 25 (positive id)
+			description:   "div base score + positive id weight",
+		},
+		{
+			name:          "div with negative id",
+			html:          `<div id="sidebar">Content</div>`,
+			expectedScore: -20, // 5 (div) + (-25) (negative id)
+			description:   "div base score + negative id weight",
+		},
+		{
+			name:          "div with both positive class and id",
+			html:          `<div class="content" id="main">Content</div>`,
+			expectedScore: 55, // 5 (div) + 25 (positive class) + 25 (positive id)
+			description:   "div base score + positive class weight + positive id weight",
+		},
+		{
+			name:          "div with both negative class and id",
+			html:          `<div class="comment" id="sidebar">Content</div>`,
+			expectedScore: -45, // 5 (div) + (-25) (negative class) + (-25) (negative id)
+			description:   "div base score + negative class weight + negative id weight",
+		},
+		{
+			name:          "div with mixed class and id weights",
+			html:          `<div class="content" id="sidebar">Content</div>`,
+			expectedScore: 5, // 5 (div) + 25 (positive class) + (-25) (negative id)
+			description:   "div base score + positive class weight + negative id weight",
+		},
+		{
+			// The positive class weight outweighs the negative h1 base score,
+			// so the total is positive.
+			name:          "h1 with positive class (positive class outweighs negative base score)",
+			html:          `<h1 class="content">Heading</h1>`,
+			expectedScore: 20, // -5 (h1) + 25 (positive class)
+			description:   "h1 base score + positive class weight",
+		},
+		{
+			name:          "ul with negative class (more negative)",
+			html:          `<ul class="comment">List</ul>`,
+			expectedScore: -28, // -3 (ul) + (-25) (negative class)
+			description:   "ul base score + negative class weight",
+		},
+		{
+			name:          "p with neutral class/id (no weight change)",
+			html:          `<p class="normal" id="regular">Paragraph</p>`,
+			expectedScore: 0, // 0 (p) + 0 (neutral class) + 0 (neutral id)
+			description:   "p base score with neutral class and id",
+		},
+	}
+
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			doc, err := goquery.NewDocumentFromReader(strings.NewReader(tc.html))
+			if err != nil {
+				t.Fatal(err)
+			}
+
+			// Find the first non-html/body element
+			selection := doc.Find("div, h1, h2, h3, h4, h5, h6, ul, ol, p, pre, blockquote, img, td, th, address, dl, dd, dt, li, form, span").First()
+			if selection.Length() == 0 {
+				t.Fatal("Could not find element")
+			}
+
+			candidate := scoreNode(selection)
+
+			if candidate.score != tc.expectedScore {
+				t.Errorf("%s: Expected score %f, got %f", tc.description, tc.expectedScore, candidate.score)
+			}
+		})
+	}
+}
+
+// TestScoreNodeEdgeCases covers scoreNode inputs outside the single-element
+// happy path: an empty selection, a selection holding several elements, and
+// elements nested inside one another.
+func TestScoreNodeEdgeCases(t *testing.T) {
+	t.Run("empty selection", func(t *testing.T) {
+		doc, err := goquery.NewDocumentFromReader(strings.NewReader(`<div></div>`))
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		// Create empty selection
+		emptySelection := doc.Find("nonexistent")
+		if emptySelection.Length() != 0 {
+			t.Fatal("Expected empty selection")
+		}
+
+		// scoreNode should handle empty selection gracefully
+		candidate := scoreNode(emptySelection)
+		if candidate == nil {
+			// Stop here: every check below dereferences candidate, so
+			// continuing would panic instead of reporting the failure.
+			t.Fatal("Expected non-nil candidate even for empty selection")
+		}
+
+		// Should have score 0 and empty selection
+		if candidate.score != 0 {
+			t.Errorf("Expected score 0 for empty selection, got %f", candidate.score)
+		}
+
+		if candidate.selection.Length() != 0 {
+			t.Error("Expected candidate to preserve empty selection")
+		}
+
+		// Node() should return nil for empty selection
+		if candidate.Node() != nil {
+			t.Error("Expected Node() to return nil for empty selection")
+		}
+
+		// String() should handle empty selection gracefully
+		str := candidate.String()
+		expected := "empty => 0.000000"
+		if str != expected {
+			t.Errorf("Expected String() to return %q, got %q", expected, str)
+		}
+	})
+
+	t.Run("multiple elements in selection", func(t *testing.T) {
+		// Named input rather than html so the imported html package is not shadowed.
+		input := `<div>
+			<p class="article">First paragraph</p>
+			<p class="sidebar">Second paragraph</p>
+		</div>`
+
+		doc, err := goquery.NewDocumentFromReader(strings.NewReader(input))
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		// Select all p elements
+		selection := doc.Find("p")
+		if selection.Length() != 2 {
+			t.Fatalf("Expected 2 p elements, got %d", selection.Length())
+		}
+
+		// scoreNode should only consider the first element in the selection
+		candidate := scoreNode(selection)
+
+		// Should score based on first p element (class="article")
+		expectedScore := float32(25) // 0 (p) + 25 (positive class)
+		if candidate.score != expectedScore {
+			t.Errorf("Expected score %f, got %f", expectedScore, candidate.score)
+		}
+
+		if candidate.Node() == nil {
+			t.Error("Expected valid node, got nil")
+		} else if candidate.Node().Data != "p" {
+			t.Errorf("Expected node tag p, got %s", candidate.Node().Data)
+		}
+	})
+
+	t.Run("nested elements", func(t *testing.T) {
+		input := `<div class="article">
+			<p class="content">
+				<span class="highlight">Text</span>
+			</p>
+		</div>`
+
+		doc, err := goquery.NewDocumentFromReader(strings.NewReader(input))
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		// Test scoring each level
+		divSelection := doc.Find("div")
+		divCandidate := scoreNode(divSelection)
+		expectedDivScore := float32(30) // 5 (div) + 25 (positive class)
+		if divCandidate.score != expectedDivScore {
+			t.Errorf("Div score: expected %f, got %f", expectedDivScore, divCandidate.score)
+		}
+
+		pSelection := doc.Find("p")
+		pCandidate := scoreNode(pSelection)
+		expectedPScore := float32(25) // 0 (p) + 25 (positive class)
+		if pCandidate.score != expectedPScore {
+			t.Errorf("P score: expected %f, got %f", expectedPScore, pCandidate.score)
+		}
+
+		spanSelection := doc.Find("span")
+		spanCandidate := scoreNode(spanSelection)
+		expectedSpanScore := float32(0) // 0 (span) + 0 (neutral class)
+		if spanCandidate.score != expectedSpanScore {
+			t.Errorf("Span score: expected %f, got %f", expectedSpanScore, spanCandidate.score)
+		}
+	})
+}
+
+// TestTransformMisusedDivsIntoParagraphs is a table-driven test for
+// transformMisusedDivsIntoParagraphs. Each input fragment is wrapped in
+// <html><body>…</body></html>, parsed, transformed, and the whitespace-trimmed
+// inner HTML of <body> is compared with the expected fragment. Expected values
+// are written in the serialised form the parser produces (lower-cased tags,
+// inserted <tbody>, "/>" on void elements), not in the form of the input.
+func TestTransformMisusedDivsIntoParagraphs(t *testing.T) {
+	testCases := []struct {
+		name        string
+		input       string
+		expected    string
+		description string
+	}{
+		{
+			name:        "div with only text should become paragraph",
+			input:       `<div>Simple text content</div>`,
+			expected:    `<p>Simple text content</p>`,
+			description: "div containing only text should be converted to p",
+		},
+		{
+			name:        "div with inline elements should become paragraph",
+			input:       `<div>Text with <span>inline</span> and <em>emphasis</em></div>`,
+			expected:    `<p>Text with <span>inline</span> and <em>emphasis</em></p>`,
+			description: "div with inline elements should be converted to p",
+		},
+		{
+			name:        "div with strong and other inline elements",
+			input:       `<div>Some <strong>bold</strong> and <i>italic</i> text</div>`,
+			expected:    `<p>Some <strong>bold</strong> and <i>italic</i> text</p>`,
+			description: "div with inline formatting should be converted to p",
+		},
+		{
+			name:        "div with anchor tag should NOT become paragraph",
+			input:       `<div>Text with <a href="#">link</a></div>`,
+			expected:    `<div>Text with <a href="#">link</a></div>`,
+			description: "div containing anchor tag should remain div (matches regex)",
+		},
+		{
+			name:        "div with paragraph should NOT become paragraph",
+			input:       `<div><p>Nested paragraph</p></div>`,
+			expected:    `<div><p>Nested paragraph</p></div>`,
+			description: "div containing p tag should remain div",
+		},
+		{
+			name:        "div with blockquote should NOT become paragraph",
+			input:       `<div><blockquote>Quote</blockquote></div>`,
+			expected:    `<div><blockquote>Quote</blockquote></div>`,
+			description: "div containing blockquote should remain div",
+		},
+		{
+			name:        "div with nested div should NOT become paragraph",
+			input:       `<div><div>Nested div</div></div>`,
+			expected:    `<div><p>Nested div</p></div>`,
+			description: "outer div has nested div (matches regex), inner div has text only (gets converted)",
+		},
+		{
+			name:        "div with img should NOT become paragraph",
+			input:       `<div><img src="test.jpg" alt="test"></div>`,
+			expected:    `<div><img src="test.jpg" alt="test"/></div>`,
+			description: "div containing img should remain div",
+		},
+		{
+			name:        "div with ol should NOT become paragraph",
+			input:       `<div><ol><li>Item</li></ol></div>`,
+			expected:    `<div><ol><li>Item</li></ol></div>`,
+			description: "div containing ol should remain div",
+		},
+		{
+			name:        "div with ul should NOT become paragraph",
+			input:       `<div><ul><li>Item</li></ul></div>`,
+			expected:    `<div><ul><li>Item</li></ul></div>`,
+			description: "div containing ul should remain div",
+		},
+		{
+			name:        "div with pre should NOT become paragraph",
+			input:       `<div><pre>Code block</pre></div>`,
+			expected:    `<div><pre>Code block</pre></div>`,
+			description: "div containing pre should remain div",
+		},
+		{
+			name:        "div with table should NOT become paragraph",
+			input:       `<div><table><tr><td>Cell</td></tr></table></div>`,
+			expected:    `<div><table><tbody><tr><td>Cell</td></tr></tbody></table></div>`,
+			description: "div containing table should remain div (note: GoQuery adds tbody)",
+		},
+		{
+			name:        "div with dl should NOT become paragraph",
+			input:       `<div><dl><dt>Term</dt><dd>Definition</dd></dl></div>`,
+			expected:    `<div><dl><dt>Term</dt><dd>Definition</dd></dl></div>`,
+			description: "div containing dl should remain div",
+		},
+		{
+			name:        "empty div should become paragraph",
+			input:       `<div></div>`,
+			expected:    `<p></p>`,
+			description: "empty div should be converted to p",
+		},
+		{
+			name:        "div with only whitespace should become paragraph",
+			input:       `<div>   </div>`,
+			expected:    `<p>   </p>`,
+			description: "div with only whitespace should be converted to p",
+		},
+		{
+			name:        "div with self-closing anchor tag should NOT become paragraph",
+			input:       `<div>Text <a/> more text</div>`,
+			expected:    `<div>Text <a> more text</a></div>`,
+			description: "div with self-closing anchor should remain div (note: GoQuery normalizes self-closing tags)",
+		},
+		{
+			name:        "case insensitive matching - uppercase A",
+			input:       `<div>Text with <A href="#">link</A></div>`,
+			expected:    `<div>Text with <a href="#">link</a></div>`,
+			description: "regex should be case insensitive (note: GoQuery normalizes case)",
+		},
+		{
+			name:        "case insensitive matching - uppercase IMG",
+			input:       `<div><IMG src="test.jpg"></div>`,
+			expected:    `<div><img src="test.jpg"/></div>`,
+			description: "regex should be case insensitive (note: GoQuery normalizes case)",
+		},
+		{
+			name:        "multiple divs transformation",
+			input:       `<div>Text only</div><div><p>Has paragraph</p></div><div>More text</div>`,
+			expected:    `<p>Text only</p><div><p>Has paragraph</p></div><p>More text</p>`,
+			description: "should transform multiple divs appropriately",
+		},
+		{
+			name:        "nested divs where inner gets transformed",
+			input:       `<div><div>Inner text only</div><p>Paragraph</p></div>`,
+			expected:    `<div><p>Inner text only</p><p>Paragraph</p></div>`,
+			description: "inner div should be transformed even if outer div isn't",
+		},
+	}
+
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			// Wrap input in a basic HTML structure
+			html := fmt.Sprintf(`<html><body>%s</body></html>`, tc.input)
+
+			doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
+			if err != nil {
+				t.Fatalf("Failed to parse HTML: %v", err)
+			}
+
+			// Apply the transformation
+			transformMisusedDivsIntoParagraphs(doc)
+
+			// Extract the body content
+			bodyHtml, err := doc.Find("body").Html()
+			if err != nil {
+				t.Fatalf("Failed to get body HTML: %v", err)
+			}
+
+			// Clean up whitespace for comparison
+			result := strings.TrimSpace(bodyHtml)
+			expected := strings.TrimSpace(tc.expected)
+
+			if result != expected {
+				t.Errorf("%s\nExpected: %s\nGot:      %s", tc.description, expected, result)
+			}
+		})
+	}
+}
+
+// TestTransformMisusedDivsIntoParagraphsRegexPattern calls
+// divToPElementsRegexp.MatchString on raw HTML snippets, without parsing them.
+// The cases expect a match for a, blockquote, dl, div, img, ol, p, pre, table
+// and ul tags (including self-closing, attributed, upper-case and mixed-case
+// forms) and no match for span, em, strong, i, b, plain text or the empty
+// string.
+func TestTransformMisusedDivsIntoParagraphsRegexPattern(t *testing.T) {
+	// Test the regex pattern directly to ensure it matches the expected elements
+	testCases := []struct {
+		name        string
+		html        string
+		shouldMatch bool
+		description string
+	}{
+		{
+			name:        "anchor tag",
+			html:        `<a href="#">link</a>`,
+			shouldMatch: true,
+			description: "should match anchor tags",
+		},
+		{
+			name:        "blockquote tag",
+			html:        `<blockquote>quote</blockquote>`,
+			shouldMatch: true,
+			description: "should match blockquote tags",
+		},
+		{
+			name:        "dl tag",
+			html:        `<dl><dt>term</dt></dl>`,
+			shouldMatch: true,
+			description: "should match dl tags",
+		},
+		{
+			name:        "div tag",
+			html:        `<div>content</div>`,
+			shouldMatch: true,
+			description: "should match div tags",
+		},
+		{
+			name:        "img tag",
+			html:        `<img src="test.jpg">`,
+			shouldMatch: true,
+			description: "should match img tags",
+		},
+		{
+			name:        "ol tag",
+			html:        `<ol><li>item</li></ol>`,
+			shouldMatch: true,
+			description: "should match ol tags",
+		},
+		{
+			name:        "p tag",
+			html:        `<p>paragraph</p>`,
+			shouldMatch: true,
+			description: "should match p tags",
+		},
+		{
+			name:        "pre tag",
+			html:        `<pre>code</pre>`,
+			shouldMatch: true,
+			description: "should match pre tags",
+		},
+		{
+			name:        "table tag",
+			html:        `<table><tr></tr></table>`,
+			shouldMatch: true,
+			description: "should match table tags",
+		},
+		{
+			name:        "ul tag",
+			html:        `<ul><li>item</li></ul>`,
+			shouldMatch: true,
+			description: "should match ul tags",
+		},
+		{
+			name:        "self-closing anchor",
+			html:        `<a/>`,
+			shouldMatch: true,
+			description: "should match self-closing anchor tags",
+		},
+		{
+			name:        "tag with attributes",
+			html:        `<a href="#" class="link">text</a>`,
+			shouldMatch: true,
+			description: "should match tags with attributes",
+		},
+		{
+			name:        "uppercase tags",
+			html:        `<A href="#">link</A>`,
+			shouldMatch: true,
+			description: "should be case insensitive",
+		},
+		{
+			name:        "mixed case tags",
+			html:        `<Img src="test.jpg">`,
+			shouldMatch: true,
+			description: "should match mixed case tags",
+		},
+		{
+			name:        "span tag",
+			html:        `<span>text</span>`,
+			shouldMatch: false,
+			description: "should NOT match span tags",
+		},
+		{
+			name:        "em tag",
+			html:        `<em>emphasis</em>`,
+			shouldMatch: false,
+			description: "should NOT match em tags",
+		},
+		{
+			name:        "strong tag",
+			html:        `<strong>bold</strong>`,
+			shouldMatch: false,
+			description: "should NOT match strong tags",
+		},
+		{
+			name:        "i tag",
+			html:        `<i>italic</i>`,
+			shouldMatch: false,
+			description: "should NOT match i tags",
+		},
+		{
+			name:        "b tag",
+			html:        `<b>bold</b>`,
+			shouldMatch: false,
+			description: "should NOT match b tags",
+		},
+		{
+			name:        "plain text",
+			html:        `just plain text`,
+			shouldMatch: false,
+			description: "should NOT match plain text",
+		},
+		{
+			name:        "empty string",
+			html:        ``,
+			shouldMatch: false,
+			description: "should NOT match empty string",
+		},
+	}
+
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			result := divToPElementsRegexp.MatchString(tc.html)
+			if result != tc.shouldMatch {
+				t.Errorf("%s\nHTML: %s\nExpected match: %v, Got: %v", tc.description, tc.html, tc.shouldMatch, result)
+			}
+		})
+	}
+}
+
+func TestTransformMisusedDivsIntoParagraphsEdgeCases(t *testing.T) {
+	t.Run("document with no divs", func(t *testing.T) {
+		html := `<html><body><p>No divs here</p><span>Just other elements</span></body></html>`
+
+		doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		// Should not panic or cause issues
+		transformMisusedDivsIntoParagraphs(doc)
+
+		bodyHtml, _ := doc.Find("body").Html()
+		expected := `<p>No divs here</p><span>Just other elements</span>`
+
+		if strings.TrimSpace(bodyHtml) != expected {
+			t.Errorf("Expected no changes to document without divs")
+		}
+	})
+
+	t.Run("empty document", func(t *testing.T) {
+		html := `<html><body></body></html>`
+
+		doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		// Should not panic with empty document
+		transformMisusedDivsIntoParagraphs(doc)
+
+		bodyHtml, _ := doc.Find("body").Html()
+		if strings.TrimSpace(bodyHtml) != "" {
+			t.Errorf("Expected empty body to remain empty")
 		}
+	})
+
+	t.Run("deeply nested divs", func(t *testing.T) {
+		html := `<html><body><div><div><div>Deep text</div></div></div></body></html>`
+
+		doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		transformMisusedDivsIntoParagraphs(doc)
+
+		bodyHtml, _ := doc.Find("body").Html()
+		// The outer divs contain other divs (matches regex), so they remain divs
+		// Only the innermost div with just text gets converted to p
+		expected := `<div><div><p>Deep text</p></div></div>`
+
+		if strings.TrimSpace(bodyHtml) != expected {
+			t.Errorf("Expected nested div transformation\nGot: %s\nExpected: %s", strings.TrimSpace(bodyHtml), expected)
+		}
+	})
+
+	t.Run("complex mixed content", func(t *testing.T) {
+		html := `<html><body>
+			<div>Text only div</div>
+			<div><a href="#">Link div</a></div>
+			<div><span>Inline</span> text</div>
+			<div><p>Block element</p></div>
+		</body></html>`
+
+		doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		transformMisusedDivsIntoParagraphs(doc)
+
+		// Count paragraphs and divs
+		pCount := doc.Find("p").Length()
+		divCount := doc.Find("div").Length()
+
+		// Should have 3 paragraphs (original p + 2 converted divs) and 2 divs (link div + block element div)
+		expectedPCount := 3
+		expectedDivCount := 2
+
+		if pCount != expectedPCount {
+			t.Errorf("Expected %d paragraphs, got %d", expectedPCount, pCount)
+		}
+		if divCount != expectedDivCount {
+			t.Errorf("Expected %d divs, got %d", expectedDivCount, divCount)
+		}
+	})
+}
+
+// TestCandidateString checks the text returned by candidate.String() for a
+// range of tag, class and id combinations. Every case parses its own HTML
+// snippet and builds the candidate under test through its setup function.
+func TestCandidateString(t *testing.T) {
+	// scored builds a setup function that passes whatever the selector
+	// matches in the parsed document to scoreNode.
+	scored := func(selector string) func(*goquery.Document) *candidate {
+		return func(doc *goquery.Document) *candidate {
+			return scoreNode(doc.Find(selector))
+		}
+	}
+
+	testCases := []struct {
+		name     string
+		html     string
+		expected string
+		setup    func(*goquery.Document) *candidate
+	}{
+		{
+			name:     "empty candidate",
+			html:     `<div></div>`,
+			expected: "empty => 0.000000",
+			setup: func(doc *goquery.Document) *candidate {
+				// The selector matches nothing, so the candidate wraps an empty selection.
+				return &candidate{selection: doc.Find("nonexistent"), score: 0}
+			},
+		},
+		{
+			name:     "candidate with no class or id",
+			html:     `<div>Content</div>`,
+			expected: "div => 5.000000",
+			setup:    scored("div"),
+		},
+		{
+			name:     "candidate with class only",
+			html:     `<div class="content">Content</div>`,
+			expected: "div.content => 30.000000",
+			setup:    scored("div"),
+		},
+		{
+			name:     "candidate with id only",
+			html:     `<div id="main">Content</div>`,
+			expected: "div#main => 30.000000",
+			setup:    scored("div"),
+		},
+		{
+			name:     "candidate with both class and id",
+			html:     `<div class="content" id="main">Content</div>`,
+			expected: "div#main.content => 55.000000",
+			setup:    scored("div"),
+		},
+		{
+			name:     "candidate with multiple classes",
+			html:     `<div class="article main content">Content</div>`,
+			expected: "div.article main content => 30.000000",
+			setup:    scored("div"),
+		},
+		{
+			name:     "paragraph candidate with negative class",
+			html:     `<p class="comment">Comment text</p>`,
+			expected: "p.comment => -25.000000",
+			setup:    scored("p"),
+		},
+		{
+			name:     "heading candidate with positive id",
+			html:     `<h1 id="main">Heading</h1>`,
+			expected: "h1#main => 20.000000",
+			setup:    scored("h1"),
+		},
+		{
+			name:     "candidate with special characters in class",
+			html:     `<div class="my-class_name">Content</div>`,
+			expected: "div.my-class_name => 5.000000",
+			setup:    scored("div"),
+		},
+		{
+			name:     "candidate with empty class attribute",
+			html:     `<div class="">Content</div>`,
+			expected: "div => 5.000000",
+			setup:    scored("div"),
+		},
+		{
+			name:     "candidate with empty id attribute",
+			html:     `<div id="">Content</div>`,
+			expected: "div => 5.000000",
+			setup:    scored("div"),
+		},
+		{
+			name:     "custom score candidate",
+			html:     `<span>Content</span>`,
+			expected: "span => 42.500000",
+			setup: func(doc *goquery.Document) *candidate {
+				c := scoreNode(doc.Find("span"))
+				c.score = 42.5 // Override score for testing
+				return c
+			},
+		},
+	}
+
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			doc, err := goquery.NewDocumentFromReader(strings.NewReader(tc.html))
+			if err != nil {
+				t.Fatalf("Failed to parse HTML: %v", err)
+			}
+
+			if got := tc.setup(doc).String(); got != tc.expected {
+				t.Errorf("Expected: %s, Got: %s", tc.expected, got)
+			}
+		})
+	}
+}
+
+// TestCandidateListString checks the text returned by candidateList.String().
+//
+// A candidateList is built with make() and indexed by node, so it is ranged
+// over in unspecified order. Cases that hold more than one candidate therefore
+// list the fragments that must appear (expectedParts) instead of one exact
+// string. The expectations live in the table itself rather than being chosen
+// by matching on the case name, so renaming a case cannot silently change
+// what is asserted.
+func TestCandidateListString(t *testing.T) {
+	// addScored passes the nodes matched by selector to scoreNode and stores
+	// the resulting candidate in list, keyed by the first matched node.
+	addScored := func(list candidateList, doc *goquery.Document, selector string) {
+		selection := doc.Find(selector)
+		scored := scoreNode(selection)
+		list[selection.Get(0)] = scored
+	}
+
+	testCases := []struct {
+		name string
+		html string
+		// expected is compared for exact equality when expectedParts is empty.
+		expected string
+		// expectedParts lists fragments that must each occur in the output.
+		// When it holds more than one entry, the output must also contain
+		// the ", " separator.
+		expectedParts []string
+		setup         func(*goquery.Document) candidateList
+	}{
+		{
+			name:     "empty candidate list",
+			html:     `<div></div>`,
+			expected: "",
+			setup: func(_ *goquery.Document) candidateList {
+				return make(candidateList)
+			},
+		},
+		{
+			name:     "single candidate",
+			html:     `<div class="content">Content</div>`,
+			expected: "div.content => 30.000000",
+			setup: func(doc *goquery.Document) candidateList {
+				candidates := make(candidateList)
+				addScored(candidates, doc, "div")
+				return candidates
+			},
+		},
+		{
+			name:          "multiple candidates",
+			html:          `<div class="content">Content</div><p class="text">Paragraph</p><h1 id="main">Title</h1>`,
+			expectedParts: []string{"div.content => 30.000000", "p.text => 25.000000", "h1#main => 20.000000"},
+			setup: func(doc *goquery.Document) candidateList {
+				candidates := make(candidateList)
+				addScored(candidates, doc, "div")
+				addScored(candidates, doc, "p")
+				addScored(candidates, doc, "h1")
+				return candidates
+			},
+		},
+		{
+			name:          "candidates with mixed scores",
+			html:          `<div class="comment">Comment</div><p class="content">Good content</p>`,
+			expectedParts: []string{"div.comment => -20.000000", "p.content => 25.000000"},
+			setup: func(doc *goquery.Document) candidateList {
+				candidates := make(candidateList)
+				addScored(candidates, doc, "div")
+				addScored(candidates, doc, "p")
+				return candidates
+			},
+		},
+		{
+			name:          "candidate with empty selection",
+			html:          `<div>Test</div>`,
+			expectedParts: []string{"div => 5.000000", "empty => 0.000000"},
+			setup: func(doc *goquery.Document) candidateList {
+				candidates := make(candidateList)
+
+				// Add a regular candidate
+				addScored(candidates, doc, "div")
+
+				// Add a candidate with empty selection (this is artificial but tests the edge case)
+				emptyCandidate := &candidate{selection: doc.Find("nonexistent"), score: 0}
+				// An empty selection has no first node to use as the map key,
+				// so a standalone dummy node is used instead.
+				dummyNode := &html.Node{Type: html.ElementNode, Data: "dummy"}
+				candidates[dummyNode] = emptyCandidate
+
+				return candidates
+			},
+		},
+	}
+
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			doc, err := goquery.NewDocumentFromReader(strings.NewReader(tc.html))
+			if err != nil {
+				t.Fatalf("Failed to parse HTML: %v", err)
+			}
+
+			result := tc.setup(doc).String()
+
+			// Zero or one candidate: the output is deterministic, compare it exactly.
+			if len(tc.expectedParts) == 0 {
+				if result != tc.expected {
+					t.Errorf("Expected: %s, Got: %s", tc.expected, result)
+				}
+				return
+			}
+
+			// Several candidates: order is not guaranteed, so only check that
+			// every expected fragment is present.
+			for _, part := range tc.expectedParts {
+				if !strings.Contains(result, part) {
+					t.Errorf("Expected result to contain: %s, Got: %s", part, result)
+				}
+			}
+			// Check that it's comma-separated
+			if len(tc.expectedParts) > 1 && !strings.Contains(result, ", ") {
+				t.Errorf("Expected comma-separated format, Got: %s", result)
+			}
+		})
 	}
 }
+
+// TestCandidateStringEdgeCases exercises candidate.String() on candidates
+// built by hand (empty selection, zero score, negative score) and on one
+// scored element carrying unusually long class and id attributes.
+func TestCandidateStringEdgeCases(t *testing.T) {
+	edgeCases := []struct {
+		name     string
+		markup   string
+		selector string
+		expected string
+		build    func(*goquery.Selection) *candidate
+	}{
+		{
+			// The selector matches nothing, so the candidate wraps an empty
+			// selection and has no underlying node.
+			name:     "candidate with nil node but valid selection",
+			markup:   `<div>Test</div>`,
+			selector: "nonexistent",
+			expected: "empty => 10.500000",
+			build: func(sel *goquery.Selection) *candidate {
+				return &candidate{selection: sel, score: 10.5}
+			},
+		},
+		{
+			name:     "candidate with zero score",
+			markup:   `<div>Test</div>`,
+			selector: "div",
+			expected: "div => 0.000000",
+			build: func(sel *goquery.Selection) *candidate {
+				return &candidate{selection: sel, score: 0}
+			},
+		},
+		{
+			name:     "candidate with negative score",
+			markup:   `<h1>Test</h1>`,
+			selector: "h1",
+			expected: "h1 => -10.500000",
+			build: func(sel *goquery.Selection) *candidate {
+				return &candidate{selection: sel, score: -10.5}
+			},
+		},
+		{
+			name:     "candidate with very long class and id",
+			markup:   `<div class="very-long-class-name-that-might-cause-issues" id="very-long-id-name-that-might-also-cause-formatting-issues">Test</div>`,
+			selector: "div",
+			expected: "div#very-long-id-name-that-might-also-cause-formatting-issues.very-long-class-name-that-might-cause-issues => 5.000000",
+			build: func(sel *goquery.Selection) *candidate {
+				return scoreNode(sel)
+			},
+		},
+	}
+
+	for _, ec := range edgeCases {
+		t.Run(ec.name, func(t *testing.T) {
+			doc, err := goquery.NewDocumentFromReader(strings.NewReader(ec.markup))
+			if err != nil {
+				t.Fatal(err)
+			}
+
+			got := ec.build(doc.Find(ec.selector)).String()
+			if got != ec.expected {
+				t.Errorf("Expected: %s, Got: %s", ec.expected, got)
+			}
+		})
+	}
+}