Selaa lähdekoodia

Return outer HTML when scraping elements

cinput 6 vuotta sitten
vanhempi
commit
8e1ed8bef3

+ 1 - 7
reader/scraper/scraper.go

@@ -75,13 +75,7 @@ func scrapContent(page io.Reader, rules string) (string, error) {
 	document.Find(rules).Each(func(i int, s *goquery.Selection) {
 		var content string
 
-		// For some inline elements, we get the parent.
-		if s.Is("img") || s.Is("iframe") {
-			content, _ = s.Parent().Html()
-		} else {
-			content, _ = s.Html()
-		}
-
+		content, _ = goquery.OuterHtml(s)
 		contents += content
 	})
 

+ 35 - 1
reader/scraper/scraper_test.go

@@ -4,7 +4,12 @@
 
 package scraper // import "miniflux.app/reader/scraper"
 
-import "testing"
+import (
+	"bytes"
+	"io/ioutil"
+	"strings"
+	"testing"
+)
 
 func TestGetPredefinedRules(t *testing.T) {
 	if getPredefinedScraperRules("http://www.phoronix.com/") == "" {
@@ -40,3 +45,32 @@ func TestWhitelistedContentTypes(t *testing.T) {
 		}
 	}
 }
+
+func TestSelectorRules(t *testing.T) {
+	var ruleTestCases = map[string]string {
+		"img.html":	"article > img",
+		"iframe.html":	"article > iframe",
+		"p.html":	"article > p",
+	}
+
+	for filename, rule := range ruleTestCases {
+		html, err := ioutil.ReadFile("testdata/" + filename)
+		if err != nil {
+			t.Fatalf(`Unable to read file %q: %v`, filename, err)
+		}
+
+		actualResult, err := scrapContent(bytes.NewReader(html), rule)
+		if err != nil {
+			t.Fatalf(`Scraping error for %q - %q: %v`, filename, rule, err)
+		}
+
+		expectedResult, err := ioutil.ReadFile("testdata/" + filename + "-result")
+		if err != nil {
+			t.Fatalf(`Unable to read file %q: %v`, filename, err)
+		}
+
+		if actualResult != strings.TrimSpace(string(expectedResult)) {
+			t.Errorf(`Unexpected result for %q, got "%s" instead of "%s"`, rule, actualResult, expectedResult)
+		}
+	}
+}

+ 12 - 0
reader/scraper/testdata/iframe.html

@@ -0,0 +1,12 @@
+<!DOCTYPE html>
+<html lang="en-US">
+	<body>
+		<article>
+			<iframe id="1" src="about:blank"></iframe>
+			<iframe id="2" src="about:blank"></iframe>
+			<iframe id="3" src="about:blank"></iframe>
+			<iframe id="4" src="about:blank"></iframe>
+			<iframe id="5" src="about:blank"></iframe>
+		</article>
+	</body>
+</html>

+ 1 - 0
reader/scraper/testdata/iframe.html-result

@@ -0,0 +1 @@
+<iframe id="1" src="about:blank"></iframe><iframe id="2" src="about:blank"></iframe><iframe id="3" src="about:blank"></iframe><iframe id="4" src="about:blank"></iframe><iframe id="5" src="about:blank"></iframe>

+ 12 - 0
reader/scraper/testdata/img.html

@@ -0,0 +1,12 @@
+<!DOCTYPE html>
+<html lang="en-US">
+	<body>
+		<article>
+			<img id="1" src="#" alt="" />
+			<img id="2" src="#" alt="" />
+			<img id="3" src="#" alt="" />
+			<img id="4" src="#" alt="" />
+			<img id="5" src="#" alt="" />
+		</article>
+	</body>
+</html>

+ 1 - 0
reader/scraper/testdata/img.html-result

@@ -0,0 +1 @@
+<img id="1" src="#" alt=""/><img id="2" src="#" alt=""/><img id="3" src="#" alt=""/><img id="4" src="#" alt=""/><img id="5" src="#" alt=""/>

+ 10 - 0
reader/scraper/testdata/p.html

@@ -0,0 +1,10 @@
+<!DOCTYPE html>
+<html lang="en-US">
+	<body>
+		<article>
+			<p>Lorem ipsum dolor sit amet, consectetuer adipiscing ept.</p>
+			<p>Apquam tincidunt mauris eu risus.</p>
+			<p>Vestibulum auctor dapibus neque.</p>
+		</article>
+	</body>
+</html>

+ 1 - 0
reader/scraper/testdata/p.html-result

@@ -0,0 +1 @@
+<p>Lorem ipsum dolor sit amet, consectetuer adipiscing ept.</p><p>Apquam tincidunt mauris eu risus.</p><p>Vestibulum auctor dapibus neque.</p>