Parcourir la source

fix(readability): do not remove elements within code blocks

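An element such as `<span class="hljs-comment"># exit 1</span>` matches `unlikelyCandidatesRegexp` because its class name contains the string `comment`, so it was being removed from the extracted content even though it is part of a code block.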
Frédéric Guillot il y a 10 mois
Parent
commit
6d58052504

+ 5 - 0
internal/reader/readability/readability.go

@@ -162,6 +162,11 @@ func removeUnlikelyCandidates(document *goquery.Document) {
 			return
 			return
 		}
 		}
 
 
+		// Don't remove elements within code blocks (pre or code tags)
+		if s.Closest("pre, code").Length() > 0 {
+			return
+		}
+
 		if class, ok := s.Attr("class"); ok {
 		if class, ok := s.Attr("class"); ok {
 			if shouldRemove(class) {
 			if shouldRemove(class) {
 				s.Remove()
 				s.Remove()

+ 22 - 0
internal/reader/readability/readability_test.go

@@ -164,6 +164,28 @@ func TestRemoveBlacklist(t *testing.T) {
 	}
 	}
 }
 }
 
 
+func TestNestedSpanInCodeBlock(t *testing.T) {
+	html := `
+		<html>
+			<head>
+				<title>Test</title>
+			</head>
+			<body>
+				<article><p>Some content</p><pre><code class="hljs-built_in">Code block with <span class="hljs-built_in">nested span</span> <span class="hljs-comment"># exit 1</span></code></pre></article>
+			</body>
+		</html>`
+	want := `<div><div><p>Some content</p><pre><code class="hljs-built_in">Code block with <span class="hljs-built_in">nested span</span> <span class="hljs-comment"># exit 1</span></code></pre></div></div>`
+
+	_, result, err := ExtractContent(strings.NewReader(html))
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	if result != want {
+		t.Errorf(`Invalid content, got %s instead of %s`, result, want)
+	}
+}
+
 func BenchmarkExtractContent(b *testing.B) {
 func BenchmarkExtractContent(b *testing.B) {
 	var testCases = map[string][]byte{
 	var testCases = map[string][]byte{
 		"miniflux_github.html":    {},
 		"miniflux_github.html":    {},