Browse Source

Refactor internal/reader/readability/testdata

- Use chained strings.Contains instead of a regex for
  blacklistCandidatesRegexp, as this is a bit faster
- Simplify a Find.Each.Remove to Find.Remove
- Don't concatenate id and class for removeUnlikelyCandidates, as it makes no
  sense to match on overlaps. It might also marginally improve performance, as
  each regex now runs on the two strings separately, instead of on their concatenation.
- Add a small benchmark
jvoisin 1 year ago
parent
commit
2df59b4865

+ 21 - 15
internal/reader/readability/readability.go

@@ -23,7 +23,6 @@ const (
 var (
 	divToPElementsRegexp = regexp.MustCompile(`(?i)<(a|blockquote|dl|div|img|ol|p|pre|table|ul)`)
 
-	blacklistCandidatesRegexp  = regexp.MustCompile(`popupbody|-ad|g-plus`)
 	okMaybeItsACandidateRegexp = regexp.MustCompile(`and|article|body|column|main|shadow`)
 	unlikelyCandidatesRegexp   = regexp.MustCompile(`banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|foot|header|legends|menu|modal|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote`)
 
@@ -81,9 +80,7 @@ func ExtractContent(page io.Reader) (baseURL string, extractedContent string, er
 		}
 	}
 
-	document.Find("script,style").Each(func(i int, s *goquery.Selection) {
-		s.Remove()
-	})
+	document.Find("script,style").Remove()
 
 	transformMisusedDivsIntoParagraphs(document)
 	removeUnlikelyCandidates(document)
@@ -150,18 +147,29 @@ func getArticle(topCandidate *candidate, candidates candidateList) string {
 }
 
 func removeUnlikelyCandidates(document *goquery.Document) {
+	var shouldRemove = func(str string) bool {
+		str = strings.ToLower(str)
+		if strings.Contains(str, "popupbody") || strings.Contains(str, "-ad") || strings.Contains(str, "g-plus") {
+			return true
+		} else if unlikelyCandidatesRegexp.MatchString(str) && !okMaybeItsACandidateRegexp.MatchString(str) {
+			return true
+		}
+		return false
+	}
+
 	document.Find("*").Each(func(i int, s *goquery.Selection) {
 		if s.Length() == 0 || s.Get(0).Data == "html" || s.Get(0).Data == "body" {
 			return
 		}
-		class, _ := s.Attr("class")
-		id, _ := s.Attr("id")
-		str := strings.ToLower(class + id)
 
-		if blacklistCandidatesRegexp.MatchString(str) {
-			s.Remove()
-		} else if unlikelyCandidatesRegexp.MatchString(str) && !okMaybeItsACandidateRegexp.MatchString(str) {
-			s.Remove()
+		if class, ok := s.Attr("class"); ok {
+			if shouldRemove(class) {
+				s.Remove()
+			}
+		} else if id, ok := s.Attr("id"); ok {
+			if shouldRemove(id) {
+				s.Remove()
+			}
 		}
 	})
 }
@@ -279,10 +287,8 @@ func getLinkDensity(s *goquery.Selection) float32 {
 // element looks good or bad.
 func getClassWeight(s *goquery.Selection) float32 {
 	weight := 0
-	class, _ := s.Attr("class")
-	id, _ := s.Attr("id")
 
-	if class != "" {
+	if class, ok := s.Attr("class"); ok {
 		class = strings.ToLower(class)
 		if negativeRegexp.MatchString(class) {
 			weight -= 25
@@ -291,7 +297,7 @@ func getClassWeight(s *goquery.Selection) float32 {
 		}
 	}
 
-	if id != "" {
+	if id, ok := s.Attr("id"); ok {
 		id = strings.ToLower(id)
 		if negativeRegexp.MatchString(id) {
 			weight -= 25

+ 21 - 0
internal/reader/readability/readability_test.go

@@ -4,6 +4,8 @@
 package readability // import "miniflux.app/v2/internal/reader/readability"
 
 import (
+	"bytes"
+	"os"
 	"strings"
 	"testing"
 )
@@ -161,3 +163,22 @@ func TestRemoveBlacklist(t *testing.T) {
 		t.Errorf(`Invalid content, got %s instead of %s`, content, want)
 	}
 }
+
+func BenchmarkExtractContent(b *testing.B) {
+	var testCases = map[string][]byte{
+		"miniflux_github.html":    {},
+		"miniflux_wikipedia.html": {},
+	}
+	for filename := range testCases {
+		data, err := os.ReadFile("testdata/" + filename)
+		if err != nil {
+			b.Fatalf(`Unable to read file %q: %v`, filename, err)
+		}
+		testCases[filename] = data
+	}
+	for range b.N {
+		for _, v := range testCases {
+			ExtractContent(bytes.NewReader(v))
+		}
+	}
+}

+ 1 - 0
internal/reader/readability/testdata

@@ -0,0 +1 @@
+../../reader/sanitizer/testdata/