Selaa lähdekoodia

Refactor internal/reader/readability/testdata

- Use chained strings.Contains instead of a regex for
  blacklistCandidatesRegexp, as this is a bit faster
- Simplify a Find.Each.Remove to Find.Remove
- Don't concatenate id and class for removeUnlikelyCandidates, as it makes no
  sense to match on overlaps. It might also marginally improve performances, as
  regex now have to run on two strings separately, instead of both.
- Add a small benchmark
jvoisin 1 vuosi sitten
vanhempi
commit
2df59b4865

+ 21 - 15
internal/reader/readability/readability.go

@@ -23,7 +23,6 @@ const (
 var (
 	divToPElementsRegexp = regexp.MustCompile(`(?i)<(a|blockquote|dl|div|img|ol|p|pre|table|ul)`)
 
-	blacklistCandidatesRegexp  = regexp.MustCompile(`popupbody|-ad|g-plus`)
 	okMaybeItsACandidateRegexp = regexp.MustCompile(`and|article|body|column|main|shadow`)
 	unlikelyCandidatesRegexp   = regexp.MustCompile(`banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|foot|header|legends|menu|modal|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote`)
 
@@ -81,9 +80,7 @@ func ExtractContent(page io.Reader) (baseURL string, extractedContent string, er
 		}
 	}
 
-	document.Find("script,style").Each(func(i int, s *goquery.Selection) {
-		s.Remove()
-	})
+	document.Find("script,style").Remove()
 
 	transformMisusedDivsIntoParagraphs(document)
 	removeUnlikelyCandidates(document)
@@ -150,18 +147,29 @@ func getArticle(topCandidate *candidate, candidates candidateList) string {
 }
 
 func removeUnlikelyCandidates(document *goquery.Document) {
+	var shouldRemove = func(str string) bool {
+		str = strings.ToLower(str)
+		if strings.Contains(str, "popupbody") || strings.Contains(str, "-ad") || strings.Contains(str, "g-plus") {
+			return true
+		} else if unlikelyCandidatesRegexp.MatchString(str) && !okMaybeItsACandidateRegexp.MatchString(str) {
+			return true
+		}
+		return false
+	}
+
 	document.Find("*").Each(func(i int, s *goquery.Selection) {
 		if s.Length() == 0 || s.Get(0).Data == "html" || s.Get(0).Data == "body" {
 			return
 		}
-		class, _ := s.Attr("class")
-		id, _ := s.Attr("id")
-		str := strings.ToLower(class + id)
 
-		if blacklistCandidatesRegexp.MatchString(str) {
-			s.Remove()
-		} else if unlikelyCandidatesRegexp.MatchString(str) && !okMaybeItsACandidateRegexp.MatchString(str) {
-			s.Remove()
+		if class, ok := s.Attr("class"); ok {
+			if shouldRemove(class) {
+				s.Remove()
+			}
+		} else if id, ok := s.Attr("id"); ok {
+			if shouldRemove(id) {
+				s.Remove()
+			}
 		}
 	})
 }
@@ -279,10 +287,8 @@ func getLinkDensity(s *goquery.Selection) float32 {
 // element looks good or bad.
 func getClassWeight(s *goquery.Selection) float32 {
 	weight := 0
-	class, _ := s.Attr("class")
-	id, _ := s.Attr("id")
 
-	if class != "" {
+	if class, ok := s.Attr("class"); ok {
 		class = strings.ToLower(class)
 		if negativeRegexp.MatchString(class) {
 			weight -= 25
@@ -291,7 +297,7 @@ func getClassWeight(s *goquery.Selection) float32 {
 		}
 	}
 
-	if id != "" {
+	if id, ok := s.Attr("id"); ok {
 		id = strings.ToLower(id)
 		if negativeRegexp.MatchString(id) {
 			weight -= 25

+ 21 - 0
internal/reader/readability/readability_test.go

@@ -4,6 +4,8 @@
 package readability // import "miniflux.app/v2/internal/reader/readability"
 
 import (
+	"bytes"
+	"os"
 	"strings"
 	"testing"
 )
@@ -161,3 +163,22 @@ func TestRemoveBlacklist(t *testing.T) {
 		t.Errorf(`Invalid content, got %s instead of %s`, content, want)
 	}
 }
+
+func BenchmarkExtractContent(b *testing.B) {
+	var testCases = map[string][]byte{
+		"miniflux_github.html":    {},
+		"miniflux_wikipedia.html": {},
+	}
+	for filename := range testCases {
+		data, err := os.ReadFile("testdata/" + filename)
+		if err != nil {
+			b.Fatalf(`Unable to read file %q: %v`, filename, err)
+		}
+		testCases[filename] = data
+	}
+	for range b.N {
+		for _, v := range testCases {
+			ExtractContent(bytes.NewReader(v))
+		}
+	}
+}

+ 1 - 0
internal/reader/readability/testdata

@@ -0,0 +1 @@
+../../reader/sanitizer/testdata/