소스 검색

fix(readability): do not remove elements within code blocks

A highlighted span such as `<span class="hljs-comment"># exit 1</span>` matches `unlikelyCandidatesRegexp` because its class name contains the string `comment`, so it was removed even though it is part of a code block.
Frédéric Guillot 9 달 전
부모
커밋
6d58052504
2개의 변경된 파일, 27개의 추가 및 0개의 삭제
  1. 5 0
      internal/reader/readability/readability.go
  2. 22 0
      internal/reader/readability/readability_test.go

+ 5 - 0
internal/reader/readability/readability.go

@@ -162,6 +162,11 @@ func removeUnlikelyCandidates(document *goquery.Document) {
 			return
 		}
 
+		// Don't remove elements within code blocks (pre or code tags)
+		if s.Closest("pre, code").Length() > 0 {
+			return
+		}
+
 		if class, ok := s.Attr("class"); ok {
 			if shouldRemove(class) {
 				s.Remove()

+ 22 - 0
internal/reader/readability/readability_test.go

@@ -164,6 +164,28 @@ func TestRemoveBlacklist(t *testing.T) {
 	}
 }
 
+func TestNestedSpanInCodeBlock(t *testing.T) { // Regression test: spans nested inside <pre><code> must survive content extraction.
+	html := `
+		<html>
+			<head>
+				<title>Test</title>
+			</head>
+			<body>
+				<article><p>Some content</p><pre><code class="hljs-built_in">Code block with <span class="hljs-built_in">nested span</span> <span class="hljs-comment"># exit 1</span></code></pre></article>
+			</body>
+		</html>`
+	want := `<div><div><p>Some content</p><pre><code class="hljs-built_in">Code block with <span class="hljs-built_in">nested span</span> <span class="hljs-comment"># exit 1</span></code></pre></div></div>`
+
+	_, result, err := ExtractContent(strings.NewReader(html)) // The first return value is not needed by this test.
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	if result != want { // Exact comparison: the hljs-comment span (class contains "comment") must be kept verbatim.
+		t.Errorf(`Invalid content, got %s instead of %s`, result, want)
+	}
+}
+
 func BenchmarkExtractContent(b *testing.B) {
 	var testCases = map[string][]byte{
 		"miniflux_github.html":    {},