|
|
@@ -11,8 +11,60 @@ import (
|
|
|
"testing"
|
|
|
|
|
|
"github.com/PuerkitoBio/goquery"
|
|
|
+ "golang.org/x/net/html"
|
|
|
)
|
|
|
|
|
|
+// BenchmarkExtractContent measures ExtractContent on the HTML fixtures stored
+// under testdata/. Each fixture is read from disk once, up front, and every
+// benchmark iteration feeds all of them to ExtractContent.
+func BenchmarkExtractContent(b *testing.B) {
+	// The map starts with empty values; the loop below fills in the file contents.
+	var testCases = map[string][]byte{
+		"miniflux_github.html":    {},
+		"miniflux_wikipedia.html": {},
+	}
+	for filename := range testCases {
+		data, err := os.ReadFile("testdata/" + filename)
+		if err != nil {
+			b.Fatalf(`Unable to read file %q: %v`, filename, err)
+		}
+		testCases[filename] = data
+	}
+
+	// Keep the fixture loading above out of the measured time.
+	b.ResetTimer()
+	for range b.N {
+		for _, v := range testCases {
+			ExtractContent(bytes.NewReader(v))
+		}
+	}
+}
|
|
|
+
|
|
|
+// BenchmarkGetWeight calls getWeight on a fixed list of sample strings during
+// every benchmark iteration.
+func BenchmarkGetWeight(b *testing.B) {
+	var samples = []string{
+		"p-3 color-bg-accent-emphasis color-fg-on-emphasis show-on-focus js-skip-to-content",
+		"d-flex flex-column mb-3",
+		"AppHeader-search-control AppHeader-search-control-overflow",
+		"Button Button--iconOnly Button--invisible Button--medium mr-1 px-2 py-0 d-flex flex-items-center rounded-1 color-fg-muted",
+		"sr-only",
+		"validation-12753bbc-b4d1-4e10-bec6-92e585d1699d",
+	}
+
+	for range b.N {
+		for i := range samples {
+			getWeight(samples[i])
+		}
+	}
+}
|
|
|
+
|
|
|
+// BenchmarkTransformMisusedDivsIntoParagraphs measures parsing a small HTML
+// page with goquery and running transformMisusedDivsIntoParagraphs on the
+// resulting document.
+func BenchmarkTransformMisusedDivsIntoParagraphs(b *testing.B) {
+	// Named "page" rather than "html" so the local does not shadow the
+	// golang.org/x/net/html package imported by this file.
+	page := `<html><body>
+ <div>Simple text content</div>
+ <div>More <span>inline</span> content</div>
+ <div><a href="#">Link content</a></div>
+ <div><p>Paragraph content</p></div>
+ <div>Another simple text</div>
+ </body></html>`
+
+	b.ResetTimer()
+	for range b.N {
+		// Parsing happens inside the loop so that every iteration hands a
+		// freshly parsed document to the transformation.
+		doc, err := goquery.NewDocumentFromReader(strings.NewReader(page))
+		if err != nil {
+			b.Fatalf("Unable to parse HTML: %v", err)
+		}
+		transformMisusedDivsIntoParagraphs(doc)
+	}
+}
|
|
|
+
|
|
|
func TestBaseURL(t *testing.T) {
|
|
|
html := `
|
|
|
<html>
|
|
|
@@ -189,25 +241,6 @@ func TestNestedSpanInCodeBlock(t *testing.T) {
|
|
|
}
|
|
|
}
|
|
|
|
|
|
-func BenchmarkExtractContent(b *testing.B) {
|
|
|
- var testCases = map[string][]byte{
|
|
|
- "miniflux_github.html": {},
|
|
|
- "miniflux_wikipedia.html": {},
|
|
|
- }
|
|
|
- for filename := range testCases {
|
|
|
- data, err := os.ReadFile("testdata/" + filename)
|
|
|
- if err != nil {
|
|
|
- b.Fatalf(`Unable to read file %q: %v`, filename, err)
|
|
|
- }
|
|
|
- testCases[filename] = data
|
|
|
- }
|
|
|
- for range b.N {
|
|
|
- for _, v := range testCases {
|
|
|
- ExtractContent(bytes.NewReader(v))
|
|
|
- }
|
|
|
- }
|
|
|
-}
|
|
|
-
|
|
|
func TestGetClassWeight(t *testing.T) {
|
|
|
testCases := []struct {
|
|
|
name string
|
|
|
@@ -1315,18 +1348,1134 @@ func TestContainsSentence(t *testing.T) {
|
|
|
}
|
|
|
}
|
|
|
|
|
|
-func BenchmarkGetWeight(b *testing.B) {
|
|
|
- testCases := []string{
|
|
|
- "p-3 color-bg-accent-emphasis color-fg-on-emphasis show-on-focus js-skip-to-content",
|
|
|
- "d-flex flex-column mb-3",
|
|
|
- "AppHeader-search-control AppHeader-search-control-overflow",
|
|
|
- "Button Button--iconOnly Button--invisible Button--medium mr-1 px-2 py-0 d-flex flex-items-center rounded-1 color-fg-muted",
|
|
|
- "sr-only",
|
|
|
- "validation-12753bbc-b4d1-4e10-bec6-92e585d1699d",
|
|
|
+// TestScoreNode checks the score scoreNode gives to elements that carry no
+// class or id attribute, with one sub-test per tag name. Each sub-test also
+// verifies that the returned candidate keeps the selection it was built from
+// and exposes a node with the expected tag.
+func TestScoreNode(t *testing.T) {
+	testCases := []struct {
+		name      string
+		markup    string
+		wantScore float32
+		wantTag   string
+	}{
+		{name: "div element with no class or id", markup: `<div>Some content</div>`, wantScore: 5, wantTag: "div"},
+		{name: "pre element with no class or id", markup: `<pre>Some code</pre>`, wantScore: 3, wantTag: "pre"},
+		{name: "td element with no class or id", markup: `<table><tr><td>Table cell</td></tr></table>`, wantScore: 3, wantTag: "td"},
+		{name: "blockquote element with no class or id", markup: `<blockquote>Quote</blockquote>`, wantScore: 3, wantTag: "blockquote"},
+		{name: "img element with no class or id", markup: `<img src="test.jpg" alt="test">`, wantScore: 3, wantTag: "img"},
+		{name: "ol element with no class or id", markup: `<ol><li>Item</li></ol>`, wantScore: -3, wantTag: "ol"},
+		{name: "ul element with no class or id", markup: `<ul><li>Item</li></ul>`, wantScore: -3, wantTag: "ul"},
+		{name: "address element with no class or id", markup: `<address>Contact info</address>`, wantScore: -3, wantTag: "address"},
+		{name: "dl element with no class or id", markup: `<dl><dt>Term</dt><dd>Definition</dd></dl>`, wantScore: -3, wantTag: "dl"},
+		{name: "dd element with no class or id", markup: `<dd>Definition</dd>`, wantScore: -3, wantTag: "dd"},
+		{name: "dt element with no class or id", markup: `<dt>Term</dt>`, wantScore: -3, wantTag: "dt"},
+		{name: "li element with no class or id", markup: `<li>List item</li>`, wantScore: -3, wantTag: "li"},
+		{name: "form element with no class or id", markup: `<form>Form content</form>`, wantScore: -3, wantTag: "form"},
+		{name: "h1 element with no class or id", markup: `<h1>Heading</h1>`, wantScore: -5, wantTag: "h1"},
+		{name: "h2 element with no class or id", markup: `<h2>Heading</h2>`, wantScore: -5, wantTag: "h2"},
+		{name: "h3 element with no class or id", markup: `<h3>Heading</h3>`, wantScore: -5, wantTag: "h3"},
+		{name: "h4 element with no class or id", markup: `<h4>Heading</h4>`, wantScore: -5, wantTag: "h4"},
+		{name: "h5 element with no class or id", markup: `<h5>Heading</h5>`, wantScore: -5, wantTag: "h5"},
+		{name: "h6 element with no class or id", markup: `<h6>Heading</h6>`, wantScore: -5, wantTag: "h6"},
+		{name: "th element with no class or id", markup: `<table><tr><th>Header cell</th></tr></table>`, wantScore: -5, wantTag: "th"},
+		{name: "p element with no class or id (default case)", markup: `<p>Paragraph content</p>`, wantScore: 0, wantTag: "p"},
+		{name: "span element with no class or id (default case)", markup: `<span>Span content</span>`, wantScore: 0, wantTag: "span"},
 	}
-	for range b.N {
-		for _, v := range testCases {
-			getWeight(v)
+
+	for _, tt := range testCases {
+		t.Run(tt.name, func(t *testing.T) {
+			doc, err := goquery.NewDocumentFromReader(strings.NewReader(tt.markup))
+			if err != nil {
+				t.Fatal(err)
+			}
+
+			sel := doc.Find(tt.wantTag)
+			if sel.Length() == 0 {
+				t.Fatalf("Could not find element with tag %s", tt.wantTag)
+			}
+
+			got := scoreNode(sel)
+			if got.score != tt.wantScore {
+				t.Errorf("Expected score %f, got %f", tt.wantScore, got.score)
+			}
+			if got.selection != sel {
+				t.Error("Expected selection to be preserved in candidate")
+			}
+
+			// A nil node and a node with the wrong tag are reported separately.
+			switch node := got.Node(); {
+			case node == nil:
+				t.Errorf("Expected valid node, got nil")
+			case node.Data != tt.wantTag:
+				t.Errorf("Expected node tag %s, got %s", tt.wantTag, node.Data)
+			}
+		})
+	}
+}
|
|
|
+
|
|
|
+// TestScoreNodeWithClassWeights checks that scoreNode adds the weight of an
+// element's class and id attributes to the base score of its tag. The inline
+// comment on each expectedScore spells out the expected sum.
+func TestScoreNodeWithClassWeights(t *testing.T) {
+	testCases := []struct {
+		name          string
+		html          string
+		expectedScore float32
+		description   string
+	}{
+		{
+			name:          "div with positive class",
+			html:          `<div class="content">Content</div>`,
+			expectedScore: 30, // 5 (div) + 25 (positive class)
+			description:   "div base score + positive class weight",
+		},
+		{
+			name:          "div with negative class",
+			html:          `<div class="comment">Content</div>`,
+			expectedScore: -20, // 5 (div) + (-25) (negative class)
+			description:   "div base score + negative class weight",
+		},
+		{
+			name:          "div with positive id",
+			html:          `<div id="main">Content</div>`,
+			expectedScore: 30, // 5 (div) + 25 (positive id)
+			description:   "div base score + positive id weight",
+		},
+		{
+			name:          "div with negative id",
+			html:          `<div id="sidebar">Content</div>`,
+			expectedScore: -20, // 5 (div) + (-25) (negative id)
+			description:   "div base score + negative id weight",
+		},
+		{
+			name:          "div with both positive class and id",
+			html:          `<div class="content" id="main">Content</div>`,
+			expectedScore: 55, // 5 (div) + 25 (positive class) + 25 (positive id)
+			description:   "div base score + positive class weight + positive id weight",
+		},
+		{
+			name:          "div with both negative class and id",
+			html:          `<div class="comment" id="sidebar">Content</div>`,
+			expectedScore: -45, // 5 (div) + (-25) (negative class) + (-25) (negative id)
+			description:   "div base score + negative class weight + negative id weight",
+		},
+		{
+			name:          "div with mixed class and id weights",
+			html:          `<div class="content" id="sidebar">Content</div>`,
+			expectedScore: 5, // 5 (div) + 25 (positive class) + (-25) (negative id)
+			description:   "div base score + positive class weight + negative id weight",
+		},
+		{
+			// The positive class outweighs the negative h1 base score, so the
+			// total is positive.
+			name:          "h1 with positive class (positive overall)",
+			html:          `<h1 class="content">Heading</h1>`,
+			expectedScore: 20, // -5 (h1) + 25 (positive class)
+			description:   "h1 base score + positive class weight",
+		},
+		{
+			name:          "ul with negative class (more negative)",
+			html:          `<ul class="comment">List</ul>`,
+			expectedScore: -28, // -3 (ul) + (-25) (negative class)
+			description:   "ul base score + negative class weight",
+		},
+		{
+			name:          "p with neutral class/id (no weight change)",
+			html:          `<p class="normal" id="regular">Paragraph</p>`,
+			expectedScore: 0, // 0 (p) + 0 (neutral class) + 0 (neutral id)
+			description:   "p base score with neutral class and id",
+		},
+	}
+
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			doc, err := goquery.NewDocumentFromReader(strings.NewReader(tc.html))
+			if err != nil {
+				t.Fatal(err)
+			}
+
+			// Find the first non-html/body element
+			selection := doc.Find("div, h1, h2, h3, h4, h5, h6, ul, ol, p, pre, blockquote, img, td, th, address, dl, dd, dt, li, form, span").First()
+			if selection.Length() == 0 {
+				t.Fatal("Could not find element")
+			}
+
+			got := scoreNode(selection)
+
+			if got.score != tc.expectedScore {
+				t.Errorf("%s: Expected score %f, got %f", tc.description, tc.expectedScore, got.score)
+			}
+		})
+	}
+}
|
|
|
+
|
|
|
+// TestScoreNodeEdgeCases covers scoreNode inputs beyond the simple
+// single-element case: an empty selection, a selection holding several
+// elements, and elements nested inside one another.
+func TestScoreNodeEdgeCases(t *testing.T) {
+	t.Run("empty selection", func(t *testing.T) {
+		doc, err := goquery.NewDocumentFromReader(strings.NewReader(`<div></div>`))
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		// Create empty selection
+		emptySelection := doc.Find("nonexistent")
+		if emptySelection.Length() != 0 {
+			t.Fatal("Expected empty selection")
+		}
+
+		// scoreNode should handle empty selection gracefully
+		candidate := scoreNode(emptySelection)
+		if candidate == nil {
+			// Fatal rather than Error: every check below dereferences
+			// candidate and would panic on a nil pointer.
+			t.Fatal("Expected non-nil candidate even for empty selection")
+		}
+
+		// Should have score 0 and empty selection
+		if candidate.score != 0 {
+			t.Errorf("Expected score 0 for empty selection, got %f", candidate.score)
+		}
+
+		if candidate.selection.Length() != 0 {
+			t.Error("Expected candidate to preserve empty selection")
+		}
+
+		// Node() should return nil for empty selection
+		if candidate.Node() != nil {
+			t.Error("Expected Node() to return nil for empty selection")
+		}
+
+		// String() should handle empty selection gracefully
+		str := candidate.String()
+		expected := "empty => 0.000000"
+		if str != expected {
+			t.Errorf("Expected String() to return %q, got %q", expected, str)
+		}
+	})
+
+	t.Run("multiple elements in selection", func(t *testing.T) {
+		// "markup" instead of "html" so the local does not shadow the
+		// golang.org/x/net/html package imported by this file.
+		markup := `<div>
+ <p class="article">First paragraph</p>
+ <p class="sidebar">Second paragraph</p>
+ </div>`
+
+		doc, err := goquery.NewDocumentFromReader(strings.NewReader(markup))
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		// Select all p elements
+		selection := doc.Find("p")
+		if selection.Length() != 2 {
+			t.Fatalf("Expected 2 p elements, got %d", selection.Length())
+		}
+
+		// scoreNode should only consider the first element in the selection
+		candidate := scoreNode(selection)
+
+		// Should score based on first p element (class="article")
+		expectedScore := float32(25) // 0 (p) + 25 (positive class)
+		if candidate.score != expectedScore {
+			t.Errorf("Expected score %f, got %f", expectedScore, candidate.score)
+		}
+
+		if candidate.Node() == nil {
+			t.Error("Expected valid node, got nil")
+		} else if candidate.Node().Data != "p" {
+			t.Errorf("Expected node tag p, got %s", candidate.Node().Data)
+		}
+	})
+
+	t.Run("nested elements", func(t *testing.T) {
+		markup := `<div class="article">
+ <p class="content">
+ <span class="highlight">Text</span>
+ </p>
+ </div>`
+
+		doc, err := goquery.NewDocumentFromReader(strings.NewReader(markup))
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		// Test scoring each level
+		divSelection := doc.Find("div")
+		divCandidate := scoreNode(divSelection)
+		expectedDivScore := float32(30) // 5 (div) + 25 (positive class)
+		if divCandidate.score != expectedDivScore {
+			t.Errorf("Div score: expected %f, got %f", expectedDivScore, divCandidate.score)
+		}
+
+		pSelection := doc.Find("p")
+		pCandidate := scoreNode(pSelection)
+		expectedPScore := float32(25) // 0 (p) + 25 (positive class)
+		if pCandidate.score != expectedPScore {
+			t.Errorf("P score: expected %f, got %f", expectedPScore, pCandidate.score)
+		}
+
+		spanSelection := doc.Find("span")
+		spanCandidate := scoreNode(spanSelection)
+		expectedSpanScore := float32(0) // 0 (span) + 0 (neutral class)
+		if spanCandidate.score != expectedSpanScore {
+			t.Errorf("Span score: expected %f, got %f", expectedSpanScore, spanCandidate.score)
+		}
+	})
+}
|
|
|
+
|
|
|
+// TestTransformMisusedDivsIntoParagraphs wraps each HTML fragment in a minimal
+// page, runs transformMisusedDivsIntoParagraphs on the parsed document, and
+// compares the serialized body with the expected markup.
+func TestTransformMisusedDivsIntoParagraphs(t *testing.T) {
+	testCases := []struct {
+		name string
+		src  string
+		want string
+		note string
+	}{
+		{
+			name: "div with only text should become paragraph",
+			src:  `<div>Simple text content</div>`,
+			want: `<p>Simple text content</p>`,
+			note: "div containing only text should be converted to p",
+		},
+		{
+			name: "div with inline elements should become paragraph",
+			src:  `<div>Text with <span>inline</span> and <em>emphasis</em></div>`,
+			want: `<p>Text with <span>inline</span> and <em>emphasis</em></p>`,
+			note: "div with inline elements should be converted to p",
+		},
+		{
+			name: "div with strong and other inline elements",
+			src:  `<div>Some <strong>bold</strong> and <i>italic</i> text</div>`,
+			want: `<p>Some <strong>bold</strong> and <i>italic</i> text</p>`,
+			note: "div with inline formatting should be converted to p",
+		},
+		{
+			name: "div with anchor tag should NOT become paragraph",
+			src:  `<div>Text with <a href="#">link</a></div>`,
+			want: `<div>Text with <a href="#">link</a></div>`,
+			note: "div containing anchor tag should remain div (matches regex)",
+		},
+		{
+			name: "div with paragraph should NOT become paragraph",
+			src:  `<div><p>Nested paragraph</p></div>`,
+			want: `<div><p>Nested paragraph</p></div>`,
+			note: "div containing p tag should remain div",
+		},
+		{
+			name: "div with blockquote should NOT become paragraph",
+			src:  `<div><blockquote>Quote</blockquote></div>`,
+			want: `<div><blockquote>Quote</blockquote></div>`,
+			note: "div containing blockquote should remain div",
+		},
+		{
+			name: "div with nested div should NOT become paragraph",
+			src:  `<div><div>Nested div</div></div>`,
+			want: `<div><p>Nested div</p></div>`,
+			note: "outer div has nested div (matches regex), inner div has text only (gets converted)",
+		},
+		{
+			name: "div with img should NOT become paragraph",
+			src:  `<div><img src="test.jpg" alt="test"></div>`,
+			want: `<div><img src="test.jpg" alt="test"/></div>`,
+			note: "div containing img should remain div",
+		},
+		{
+			name: "div with ol should NOT become paragraph",
+			src:  `<div><ol><li>Item</li></ol></div>`,
+			want: `<div><ol><li>Item</li></ol></div>`,
+			note: "div containing ol should remain div",
+		},
+		{
+			name: "div with ul should NOT become paragraph",
+			src:  `<div><ul><li>Item</li></ul></div>`,
+			want: `<div><ul><li>Item</li></ul></div>`,
+			note: "div containing ul should remain div",
+		},
+		{
+			name: "div with pre should NOT become paragraph",
+			src:  `<div><pre>Code block</pre></div>`,
+			want: `<div><pre>Code block</pre></div>`,
+			note: "div containing pre should remain div",
+		},
+		{
+			name: "div with table should NOT become paragraph",
+			src:  `<div><table><tr><td>Cell</td></tr></table></div>`,
+			want: `<div><table><tbody><tr><td>Cell</td></tr></tbody></table></div>`,
+			note: "div containing table should remain div (note: GoQuery adds tbody)",
+		},
+		{
+			name: "div with dl should NOT become paragraph",
+			src:  `<div><dl><dt>Term</dt><dd>Definition</dd></dl></div>`,
+			want: `<div><dl><dt>Term</dt><dd>Definition</dd></dl></div>`,
+			note: "div containing dl should remain div",
+		},
+		{
+			name: "empty div should become paragraph",
+			src:  `<div></div>`,
+			want: `<p></p>`,
+			note: "empty div should be converted to p",
+		},
+		{
+			name: "div with only whitespace should become paragraph",
+			src:  `<div> </div>`,
+			want: `<p> </p>`,
+			note: "div with only whitespace should be converted to p",
+		},
+		{
+			name: "div with self-closing anchor tag should NOT become paragraph",
+			src:  `<div>Text <a/> more text</div>`,
+			want: `<div>Text <a> more text</a></div>`,
+			note: "div with self-closing anchor should remain div (note: GoQuery normalizes self-closing tags)",
+		},
+		{
+			name: "case insensitive matching - uppercase A",
+			src:  `<div>Text with <A href="#">link</A></div>`,
+			want: `<div>Text with <a href="#">link</a></div>`,
+			note: "regex should be case insensitive (note: GoQuery normalizes case)",
+		},
+		{
+			name: "case insensitive matching - uppercase IMG",
+			src:  `<div><IMG src="test.jpg"></div>`,
+			want: `<div><img src="test.jpg"/></div>`,
+			note: "regex should be case insensitive (note: GoQuery normalizes case)",
+		},
+		{
+			name: "multiple divs transformation",
+			src:  `<div>Text only</div><div><p>Has paragraph</p></div><div>More text</div>`,
+			want: `<p>Text only</p><div><p>Has paragraph</p></div><p>More text</p>`,
+			note: "should transform multiple divs appropriately",
+		},
+		{
+			name: "nested divs where inner gets transformed",
+			src:  `<div><div>Inner text only</div><p>Paragraph</p></div>`,
+			want: `<div><p>Inner text only</p><p>Paragraph</p></div>`,
+			note: "inner div should be transformed even if outer div isn't",
+		},
+	}
+
+	for _, tt := range testCases {
+		t.Run(tt.name, func(t *testing.T) {
+			// Embed the fragment in a minimal page before parsing it.
+			page := fmt.Sprintf(`<html><body>%s</body></html>`, tt.src)
+			doc, err := goquery.NewDocumentFromReader(strings.NewReader(page))
+			if err != nil {
+				t.Fatalf("Failed to parse HTML: %v", err)
+			}
+
+			transformMisusedDivsIntoParagraphs(doc)
+
+			// Serialize what ended up inside <body>.
+			bodyHTML, err := doc.Find("body").Html()
+			if err != nil {
+				t.Fatalf("Failed to get body HTML: %v", err)
+			}
+
+			// Surrounding whitespace is trimmed on both sides before comparing.
+			got, want := strings.TrimSpace(bodyHTML), strings.TrimSpace(tt.want)
+			if got != want {
+				t.Errorf("%s\nExpected: %s\nGot: %s", tt.note, want, got)
+			}
+		})
+	}
+}
|
|
|
+
|
|
|
+// TestTransformMisusedDivsIntoParagraphsRegexPattern applies
+// divToPElementsRegexp directly to small HTML fragments and checks, for each
+// one, whether the pattern is expected to match.
+func TestTransformMisusedDivsIntoParagraphsRegexPattern(t *testing.T) {
+	testCases := []struct {
+		name      string
+		fragment  string
+		wantMatch bool
+		note      string
+	}{
+		{name: "anchor tag", fragment: `<a href="#">link</a>`, wantMatch: true, note: "should match anchor tags"},
+		{name: "blockquote tag", fragment: `<blockquote>quote</blockquote>`, wantMatch: true, note: "should match blockquote tags"},
+		{name: "dl tag", fragment: `<dl><dt>term</dt></dl>`, wantMatch: true, note: "should match dl tags"},
+		{name: "div tag", fragment: `<div>content</div>`, wantMatch: true, note: "should match div tags"},
+		{name: "img tag", fragment: `<img src="test.jpg">`, wantMatch: true, note: "should match img tags"},
+		{name: "ol tag", fragment: `<ol><li>item</li></ol>`, wantMatch: true, note: "should match ol tags"},
+		{name: "p tag", fragment: `<p>paragraph</p>`, wantMatch: true, note: "should match p tags"},
+		{name: "pre tag", fragment: `<pre>code</pre>`, wantMatch: true, note: "should match pre tags"},
+		{name: "table tag", fragment: `<table><tr></tr></table>`, wantMatch: true, note: "should match table tags"},
+		{name: "ul tag", fragment: `<ul><li>item</li></ul>`, wantMatch: true, note: "should match ul tags"},
+		{name: "self-closing anchor", fragment: `<a/>`, wantMatch: true, note: "should match self-closing anchor tags"},
+		{name: "tag with attributes", fragment: `<a href="#" class="link">text</a>`, wantMatch: true, note: "should match tags with attributes"},
+		{name: "uppercase tags", fragment: `<A href="#">link</A>`, wantMatch: true, note: "should be case insensitive"},
+		{name: "mixed case tags", fragment: `<Img src="test.jpg">`, wantMatch: true, note: "should match mixed case tags"},
+		{name: "span tag", fragment: `<span>text</span>`, wantMatch: false, note: "should NOT match span tags"},
+		{name: "em tag", fragment: `<em>emphasis</em>`, wantMatch: false, note: "should NOT match em tags"},
+		{name: "strong tag", fragment: `<strong>bold</strong>`, wantMatch: false, note: "should NOT match strong tags"},
+		{name: "i tag", fragment: `<i>italic</i>`, wantMatch: false, note: "should NOT match i tags"},
+		{name: "b tag", fragment: `<b>bold</b>`, wantMatch: false, note: "should NOT match b tags"},
+		{name: "plain text", fragment: `just plain text`, wantMatch: false, note: "should NOT match plain text"},
+		{name: "empty string", fragment: ``, wantMatch: false, note: "should NOT match empty string"},
+	}
+
+	for _, tt := range testCases {
+		t.Run(tt.name, func(t *testing.T) {
+			if got := divToPElementsRegexp.MatchString(tt.fragment); got != tt.wantMatch {
+				t.Errorf("%s\nHTML: %s\nExpected match: %v, Got: %v", tt.note, tt.fragment, tt.wantMatch, got)
+			}
+		})
+	}
+}
|
|
|
+
|
|
|
+// TestTransformMisusedDivsIntoParagraphsEdgeCases runs
+// transformMisusedDivsIntoParagraphs on a document without divs, an empty
+// document, deeply nested divs, and a page mixing several kinds of div.
+// Locals are named "markup" rather than "html" so they do not shadow the
+// golang.org/x/net/html package imported by this file.
+func TestTransformMisusedDivsIntoParagraphsEdgeCases(t *testing.T) {
+	t.Run("document with no divs", func(t *testing.T) {
+		markup := `<html><body><p>No divs here</p><span>Just other elements</span></body></html>`
+
+		doc, err := goquery.NewDocumentFromReader(strings.NewReader(markup))
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		// Should not panic or cause issues
+		transformMisusedDivsIntoParagraphs(doc)
+
+		bodyHTML, err := doc.Find("body").Html()
+		if err != nil {
+			t.Fatalf("Failed to get body HTML: %v", err)
+		}
+		expected := `<p>No divs here</p><span>Just other elements</span>`
+
+		if strings.TrimSpace(bodyHTML) != expected {
+			t.Errorf("Expected no changes to document without divs")
+		}
+	})
+
+	t.Run("empty document", func(t *testing.T) {
+		markup := `<html><body></body></html>`
+
+		doc, err := goquery.NewDocumentFromReader(strings.NewReader(markup))
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		// Should not panic with empty document
+		transformMisusedDivsIntoParagraphs(doc)
+
+		bodyHTML, err := doc.Find("body").Html()
+		if err != nil {
+			t.Fatalf("Failed to get body HTML: %v", err)
+		}
+		if strings.TrimSpace(bodyHTML) != "" {
+			t.Errorf("Expected empty body to remain empty")
 		}
+	})
+
+	t.Run("deeply nested divs", func(t *testing.T) {
+		markup := `<html><body><div><div><div>Deep text</div></div></div></body></html>`
+
+		doc, err := goquery.NewDocumentFromReader(strings.NewReader(markup))
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		transformMisusedDivsIntoParagraphs(doc)
+
+		bodyHTML, err := doc.Find("body").Html()
+		if err != nil {
+			t.Fatalf("Failed to get body HTML: %v", err)
+		}
+		// The outer divs contain other divs (matches regex), so they remain divs
+		// Only the innermost div with just text gets converted to p
+		expected := `<div><div><p>Deep text</p></div></div>`
+
+		if strings.TrimSpace(bodyHTML) != expected {
+			t.Errorf("Expected nested div transformation\nGot: %s\nExpected: %s", strings.TrimSpace(bodyHTML), expected)
+		}
+	})
+
+	t.Run("complex mixed content", func(t *testing.T) {
+		markup := `<html><body>
+ <div>Text only div</div>
+ <div><a href="#">Link div</a></div>
+ <div><span>Inline</span> text</div>
+ <div><p>Block element</p></div>
+ </body></html>`
+
+		doc, err := goquery.NewDocumentFromReader(strings.NewReader(markup))
+		if err != nil {
+			t.Fatal(err)
+		}
+
+		transformMisusedDivsIntoParagraphs(doc)
+
+		// Count paragraphs and divs
+		pCount := doc.Find("p").Length()
+		divCount := doc.Find("div").Length()
+
+		// Should have 3 paragraphs (original p + 2 converted divs) and 2 divs (link div + block element div)
+		expectedPCount := 3
+		expectedDivCount := 2
+
+		if pCount != expectedPCount {
+			t.Errorf("Expected %d paragraphs, got %d", expectedPCount, pCount)
+		}
+		if divCount != expectedDivCount {
+			t.Errorf("Expected %d divs, got %d", expectedDivCount, divCount)
+		}
+	})
+}
|
|
|
+
|
|
|
+func TestCandidateString(t *testing.T) {
|
|
|
+ testCases := []struct {
|
|
|
+ name string
|
|
|
+ html string
|
|
|
+ expected string
|
|
|
+ setup func(*goquery.Document) *candidate
|
|
|
+ }{
|
|
|
+ {
|
|
|
+ name: "empty candidate",
|
|
|
+ html: `<div></div>`,
|
|
|
+ expected: "empty => 0.000000",
|
|
|
+ setup: func(doc *goquery.Document) *candidate {
|
|
|
+ emptySelection := doc.Find("nonexistent")
|
|
|
+ return &candidate{selection: emptySelection, score: 0}
|
|
|
+ },
|
|
|
+ },
|
|
|
+ {
|
|
|
+ name: "candidate with no class or id",
|
|
|
+ html: `<div>Content</div>`,
|
|
|
+ expected: "div => 5.000000",
|
|
|
+ setup: func(doc *goquery.Document) *candidate {
|
|
|
+ selection := doc.Find("div")
|
|
|
+ return scoreNode(selection)
|
|
|
+ },
|
|
|
+ },
|
|
|
+ {
|
|
|
+ name: "candidate with class only",
|
|
|
+ html: `<div class="content">Content</div>`,
|
|
|
+ expected: "div.content => 30.000000",
|
|
|
+ setup: func(doc *goquery.Document) *candidate {
|
|
|
+ selection := doc.Find("div")
|
|
|
+ return scoreNode(selection)
|
|
|
+ },
|
|
|
+ },
|
|
|
+ {
|
|
|
+ name: "candidate with id only",
|
|
|
+ html: `<div id="main">Content</div>`,
|
|
|
+ expected: "div#main => 30.000000",
|
|
|
+ setup: func(doc *goquery.Document) *candidate {
|
|
|
+ selection := doc.Find("div")
|
|
|
+ return scoreNode(selection)
|
|
|
+ },
|
|
|
+ },
|
|
|
+ {
|
|
|
+ name: "candidate with both class and id",
|
|
|
+ html: `<div class="content" id="main">Content</div>`,
|
|
|
+ expected: "div#main.content => 55.000000",
|
|
|
+ setup: func(doc *goquery.Document) *candidate {
|
|
|
+ selection := doc.Find("div")
|
|
|
+ return scoreNode(selection)
|
|
|
+ },
|
|
|
+ },
|
|
|
+ {
|
|
|
+ name: "candidate with multiple classes",
|
|
|
+ html: `<div class="article main content">Content</div>`,
|
|
|
+ expected: "div.article main content => 30.000000",
|
|
|
+ setup: func(doc *goquery.Document) *candidate {
|
|
|
+ selection := doc.Find("div")
|
|
|
+ return scoreNode(selection)
|
|
|
+ },
|
|
|
+ },
|
|
|
+ {
|
|
|
+ name: "paragraph candidate with negative class",
|
|
|
+ html: `<p class="comment">Comment text</p>`,
|
|
|
+ expected: "p.comment => -25.000000",
|
|
|
+ setup: func(doc *goquery.Document) *candidate {
|
|
|
+ selection := doc.Find("p")
|
|
|
+ return scoreNode(selection)
|
|
|
+ },
|
|
|
+ },
|
|
|
+ {
|
|
|
+ name: "heading candidate with positive id",
|
|
|
+ html: `<h1 id="main">Heading</h1>`,
|
|
|
+ expected: "h1#main => 20.000000",
|
|
|
+ setup: func(doc *goquery.Document) *candidate {
|
|
|
+ selection := doc.Find("h1")
|
|
|
+ return scoreNode(selection)
|
|
|
+ },
|
|
|
+ },
|
|
|
+ {
|
|
|
+ name: "candidate with special characters in class",
|
|
|
+ html: `<div class="my-class_name">Content</div>`,
|
|
|
+ expected: "div.my-class_name => 5.000000",
|
|
|
+ setup: func(doc *goquery.Document) *candidate {
|
|
|
+ selection := doc.Find("div")
|
|
|
+ return scoreNode(selection)
|
|
|
+ },
|
|
|
+ },
|
|
|
+ {
|
|
|
+ name: "candidate with empty class attribute",
|
|
|
+ html: `<div class="">Content</div>`,
|
|
|
+ expected: "div => 5.000000",
|
|
|
+ setup: func(doc *goquery.Document) *candidate {
|
|
|
+ selection := doc.Find("div")
|
|
|
+ return scoreNode(selection)
|
|
|
+ },
|
|
|
+ },
|
|
|
+ {
|
|
|
+ name: "candidate with empty id attribute",
|
|
|
+ html: `<div id="">Content</div>`,
|
|
|
+ expected: "div => 5.000000",
|
|
|
+ setup: func(doc *goquery.Document) *candidate {
|
|
|
+ selection := doc.Find("div")
|
|
|
+ return scoreNode(selection)
|
|
|
+ },
|
|
|
+ },
|
|
|
+ {
|
|
|
+ name: "custom score candidate",
|
|
|
+ html: `<span>Content</span>`,
|
|
|
+ expected: "span => 42.500000",
|
|
|
+ setup: func(doc *goquery.Document) *candidate {
|
|
|
+ selection := doc.Find("span")
|
|
|
+ c := scoreNode(selection)
|
|
|
+ c.score = 42.5 // Override score for testing
|
|
|
+ return c
|
|
|
+ },
|
|
|
+ },
|
|
|
+ }
|
|
|
+
|
|
|
+ for _, tc := range testCases {
|
|
|
+ t.Run(tc.name, func(t *testing.T) {
|
|
|
+ doc, err := goquery.NewDocumentFromReader(strings.NewReader(tc.html))
|
|
|
+ if err != nil {
|
|
|
+ t.Fatalf("Failed to parse HTML: %v", err)
|
|
|
+ }
|
|
|
+
|
|
|
+ candidate := tc.setup(doc)
|
|
|
+ result := candidate.String()
|
|
|
+
|
|
|
+ if result != tc.expected {
|
|
|
+ t.Errorf("Expected: %s, Got: %s", tc.expected, result)
|
|
|
+ }
|
|
|
+ })
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
+func TestCandidateListString(t *testing.T) {
|
|
|
+ testCases := []struct {
|
|
|
+ name string
|
|
|
+ html string
|
|
|
+ expected string
|
|
|
+ setup func(*goquery.Document) candidateList
|
|
|
+ }{
|
|
|
+ {
|
|
|
+ name: "empty candidate list",
|
|
|
+ html: `<div></div>`,
|
|
|
+ expected: "",
|
|
|
+ setup: func(doc *goquery.Document) candidateList {
|
|
|
+ return make(candidateList)
|
|
|
+ },
|
|
|
+ },
|
|
|
+ {
|
|
|
+ name: "single candidate",
|
|
|
+ html: `<div class="content">Content</div>`,
|
|
|
+ expected: "div.content => 30.000000",
|
|
|
+ setup: func(doc *goquery.Document) candidateList {
|
|
|
+ candidates := make(candidateList)
|
|
|
+ selection := doc.Find("div")
|
|
|
+ candidate := scoreNode(selection)
|
|
|
+ candidates[selection.Get(0)] = candidate
|
|
|
+ return candidates
|
|
|
+ },
|
|
|
+ },
|
|
|
+ {
|
|
|
+ name: "multiple candidates",
|
|
|
+ html: `<div class="content">Content</div><p class="text">Paragraph</p><h1 id="main">Title</h1>`,
|
|
|
+ setup: func(doc *goquery.Document) candidateList {
|
|
|
+ candidates := make(candidateList)
|
|
|
+
|
|
|
+ divSelection := doc.Find("div")
|
|
|
+ divCandidate := scoreNode(divSelection)
|
|
|
+ candidates[divSelection.Get(0)] = divCandidate
|
|
|
+
|
|
|
+ pSelection := doc.Find("p")
|
|
|
+ pCandidate := scoreNode(pSelection)
|
|
|
+ candidates[pSelection.Get(0)] = pCandidate
|
|
|
+
|
|
|
+ h1Selection := doc.Find("h1")
|
|
|
+ h1Candidate := scoreNode(h1Selection)
|
|
|
+ candidates[h1Selection.Get(0)] = h1Candidate
|
|
|
+
|
|
|
+ return candidates
|
|
|
+ },
|
|
|
+ },
|
|
|
+ {
|
|
|
+ name: "candidates with mixed scores",
|
|
|
+ html: `<div class="comment">Comment</div><p class="content">Good content</p>`,
|
|
|
+ setup: func(doc *goquery.Document) candidateList {
|
|
|
+ candidates := make(candidateList)
|
|
|
+
|
|
|
+ divSelection := doc.Find("div")
|
|
|
+ divCandidate := scoreNode(divSelection)
|
|
|
+ candidates[divSelection.Get(0)] = divCandidate
|
|
|
+
|
|
|
+ pSelection := doc.Find("p")
|
|
|
+ pCandidate := scoreNode(pSelection)
|
|
|
+ candidates[pSelection.Get(0)] = pCandidate
|
|
|
+
|
|
|
+ return candidates
|
|
|
+ },
|
|
|
+ },
|
|
|
+ {
|
|
|
+ name: "candidate with empty selection",
|
|
|
+ html: `<div>Test</div>`,
|
|
|
+ setup: func(doc *goquery.Document) candidateList {
|
|
|
+ candidates := make(candidateList)
|
|
|
+
|
|
|
+ // Add a regular candidate
|
|
|
+ divSelection := doc.Find("div")
|
|
|
+ divCandidate := scoreNode(divSelection)
|
|
|
+ candidates[divSelection.Get(0)] = divCandidate
|
|
|
+
|
|
|
+ // Add a candidate with empty selection (this is artificial but tests the edge case)
|
|
|
+ emptySelection := doc.Find("nonexistent")
|
|
|
+ emptyCandidate := &candidate{selection: emptySelection, score: 0}
|
|
|
+ // We can't use emptySelection.Get(0) as key since it would panic,
|
|
|
+ // so we'll create a dummy node for this test
|
|
|
+ dummyNode := &html.Node{Type: html.ElementNode, Data: "dummy"}
|
|
|
+ candidates[dummyNode] = emptyCandidate
|
|
|
+
|
|
|
+ return candidates
|
|
|
+ },
|
|
|
+ },
|
|
|
+ }
|
|
|
+
|
|
|
+ for _, tc := range testCases {
|
|
|
+ t.Run(tc.name, func(t *testing.T) {
|
|
|
+ doc, err := goquery.NewDocumentFromReader(strings.NewReader(tc.html))
|
|
|
+ if err != nil {
|
|
|
+ t.Fatalf("Failed to parse HTML: %v", err)
|
|
|
+ }
|
|
|
+
|
|
|
+ candidates := tc.setup(doc)
|
|
|
+ result := candidates.String()
|
|
|
+
|
|
|
+ if tc.name == "empty candidate list" {
|
|
|
+ if result != tc.expected {
|
|
|
+ t.Errorf("Expected: %s, Got: %s", tc.expected, result)
|
|
|
+ }
|
|
|
+ return
|
|
|
+ }
|
|
|
+
|
|
|
+ // For multiple candidates, we need to check that all expected parts are present
|
|
|
+ // since map iteration order is not guaranteed
|
|
|
+ switch tc.name {
|
|
|
+ case "multiple candidates":
|
|
|
+ expectedParts := []string{"div.content => 30.000000", "p.text => 25.000000", "h1#main => 20.000000"}
|
|
|
+ for _, part := range expectedParts {
|
|
|
+ if !strings.Contains(result, part) {
|
|
|
+ t.Errorf("Expected result to contain: %s, Got: %s", part, result)
|
|
|
+ }
|
|
|
+ }
|
|
|
+ // Check that it's comma-separated
|
|
|
+ if !strings.Contains(result, ", ") {
|
|
|
+ t.Errorf("Expected comma-separated format, Got: %s", result)
|
|
|
+ }
|
|
|
+ case "candidates with mixed scores":
|
|
|
+ expectedParts := []string{"div.comment => -20.000000", "p.content => 25.000000"}
|
|
|
+ for _, part := range expectedParts {
|
|
|
+ if !strings.Contains(result, part) {
|
|
|
+ t.Errorf("Expected result to contain: %s, Got: %s", part, result)
|
|
|
+ }
|
|
|
+ }
|
|
|
+ case "candidate with empty selection":
|
|
|
+ // Should contain both the regular candidate and the empty one
|
|
|
+ if !strings.Contains(result, "div => 5.000000") {
|
|
|
+ t.Errorf("Expected result to contain div candidate, Got: %s", result)
|
|
|
+ }
|
|
|
+ if !strings.Contains(result, "empty => 0.000000") {
|
|
|
+ t.Errorf("Expected result to contain empty candidate, Got: %s", result)
|
|
|
+ }
|
|
|
+ default:
|
|
|
+ // Single candidate test cases
|
|
|
+ if result != tc.expected {
|
|
|
+ t.Errorf("Expected: %s, Got: %s", tc.expected, result)
|
|
|
+ }
|
|
|
+ }
|
|
|
+ })
|
|
|
}
|
|
|
}
|
|
|
+
|
|
|
+func TestCandidateStringEdgeCases(t *testing.T) {
|
|
|
+ t.Run("candidate with nil node but valid selection", func(t *testing.T) {
|
|
|
+ // This tests the case where Node() returns nil but selection exists
|
|
|
+ html := `<div>Test</div>`
|
|
|
+ doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
|
|
|
+ if err != nil {
|
|
|
+ t.Fatal(err)
|
|
|
+ }
|
|
|
+
|
|
|
+ emptySelection := doc.Find("nonexistent")
|
|
|
+ candidate := &candidate{
|
|
|
+ selection: emptySelection,
|
|
|
+ score: 10.5,
|
|
|
+ }
|
|
|
+
|
|
|
+ result := candidate.String()
|
|
|
+ expected := "empty => 10.500000"
|
|
|
+
|
|
|
+ if result != expected {
|
|
|
+ t.Errorf("Expected: %s, Got: %s", expected, result)
|
|
|
+ }
|
|
|
+ })
|
|
|
+
|
|
|
+ t.Run("candidate with zero score", func(t *testing.T) {
|
|
|
+ html := `<div>Test</div>`
|
|
|
+ doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
|
|
|
+ if err != nil {
|
|
|
+ t.Fatal(err)
|
|
|
+ }
|
|
|
+
|
|
|
+ selection := doc.Find("div")
|
|
|
+ candidate := &candidate{
|
|
|
+ selection: selection,
|
|
|
+ score: 0,
|
|
|
+ }
|
|
|
+
|
|
|
+ result := candidate.String()
|
|
|
+ expected := "div => 0.000000"
|
|
|
+
|
|
|
+ if result != expected {
|
|
|
+ t.Errorf("Expected: %s, Got: %s", expected, result)
|
|
|
+ }
|
|
|
+ })
|
|
|
+
|
|
|
+ t.Run("candidate with negative score", func(t *testing.T) {
|
|
|
+ html := `<h1>Test</h1>`
|
|
|
+ doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
|
|
|
+ if err != nil {
|
|
|
+ t.Fatal(err)
|
|
|
+ }
|
|
|
+
|
|
|
+ selection := doc.Find("h1")
|
|
|
+ candidate := &candidate{
|
|
|
+ selection: selection,
|
|
|
+ score: -10.5,
|
|
|
+ }
|
|
|
+
|
|
|
+ result := candidate.String()
|
|
|
+ expected := "h1 => -10.500000"
|
|
|
+
|
|
|
+ if result != expected {
|
|
|
+ t.Errorf("Expected: %s, Got: %s", expected, result)
|
|
|
+ }
|
|
|
+ })
|
|
|
+
|
|
|
+ t.Run("candidate with very long class and id", func(t *testing.T) {
|
|
|
+ html := `<div class="very-long-class-name-that-might-cause-issues" id="very-long-id-name-that-might-also-cause-formatting-issues">Test</div>`
|
|
|
+ doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
|
|
|
+ if err != nil {
|
|
|
+ t.Fatal(err)
|
|
|
+ }
|
|
|
+
|
|
|
+ selection := doc.Find("div")
|
|
|
+ candidate := scoreNode(selection)
|
|
|
+
|
|
|
+ result := candidate.String()
|
|
|
+ expected := "div#very-long-id-name-that-might-also-cause-formatting-issues.very-long-class-name-that-might-cause-issues => 5.000000"
|
|
|
+
|
|
|
+ if result != expected {
|
|
|
+ t.Errorf("Expected: %s, Got: %s", expected, result)
|
|
|
+ }
|
|
|
+ })
|
|
|
+}
|