Browse Source

Refactor internal/reader/readability/testdata

- Use chained strings.Contains instead of a regex for
  blacklistCandidatesRegexp, as this is a bit faster
- Simplify a Find.Each.Remove to Find.Remove
- Don't concatenate id and class for removeUnlikelyCandidates, as it makes no
  sense to match on overlaps. It might also marginally improve performance, as
  each regex now runs on the two strings separately, instead of on their concatenation.
- Add a small benchmark
jvoisin 1 year ago
parent
commit
2df59b4865

+ 21 - 15
internal/reader/readability/readability.go

@@ -23,7 +23,6 @@ const (
 var (
 	divToPElementsRegexp = regexp.MustCompile(`(?i)<(a|blockquote|dl|div|img|ol|p|pre|table|ul)`)
 
-	blacklistCandidatesRegexp  = regexp.MustCompile(`popupbody|-ad|g-plus`)
 	okMaybeItsACandidateRegexp = regexp.MustCompile(`and|article|body|column|main|shadow`)
 	unlikelyCandidatesRegexp   = regexp.MustCompile(`banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|foot|header|legends|menu|modal|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote`)
 
@@ -81,9 +80,7 @@ func ExtractContent(page io.Reader) (baseURL string, extractedContent string, er
 		}
 	}
 
-	document.Find("script,style").Each(func(i int, s *goquery.Selection) {
-		s.Remove()
-	})
+	document.Find("script,style").Remove()
 
 	transformMisusedDivsIntoParagraphs(document)
 	removeUnlikelyCandidates(document)
@@ -150,18 +147,29 @@ func getArticle(topCandidate *candidate, candidates candidateList) string {
 }
 
 func removeUnlikelyCandidates(document *goquery.Document) {
+	var shouldRemove = func(str string) bool {
+		str = strings.ToLower(str)
+		if strings.Contains(str, "popupbody") || strings.Contains(str, "-ad") || strings.Contains(str, "g-plus") {
+			return true
+		} else if unlikelyCandidatesRegexp.MatchString(str) && !okMaybeItsACandidateRegexp.MatchString(str) {
+			return true
+		}
+		return false
+	}
+
 	document.Find("*").Each(func(i int, s *goquery.Selection) {
 		if s.Length() == 0 || s.Get(0).Data == "html" || s.Get(0).Data == "body" {
 			return
 		}
-		class, _ := s.Attr("class")
-		id, _ := s.Attr("id")
-		str := strings.ToLower(class + id)
 
-		if blacklistCandidatesRegexp.MatchString(str) {
-			s.Remove()
-		} else if unlikelyCandidatesRegexp.MatchString(str) && !okMaybeItsACandidateRegexp.MatchString(str) {
-			s.Remove()
+		if class, ok := s.Attr("class"); ok {
+			if shouldRemove(class) {
+				s.Remove()
+			}
+		} else if id, ok := s.Attr("id"); ok {
+			if shouldRemove(id) {
+				s.Remove()
+			}
 		}
 	})
 }
@@ -279,10 +287,8 @@ func getLinkDensity(s *goquery.Selection) float32 {
 // element looks good or bad.
 func getClassWeight(s *goquery.Selection) float32 {
 	weight := 0
-	class, _ := s.Attr("class")
-	id, _ := s.Attr("id")
 
-	if class != "" {
+	if class, ok := s.Attr("class"); ok {
 		class = strings.ToLower(class)
 		if negativeRegexp.MatchString(class) {
 			weight -= 25
@@ -291,7 +297,7 @@ func getClassWeight(s *goquery.Selection) float32 {
 		}
 	}
 
-	if id != "" {
+	if id, ok := s.Attr("id"); ok {
 		id = strings.ToLower(id)
 		if negativeRegexp.MatchString(id) {
 			weight -= 25

+ 21 - 0
internal/reader/readability/readability_test.go

@@ -4,6 +4,8 @@
 package readability // import "miniflux.app/v2/internal/reader/readability"
 
 import (
+	"bytes"
+	"os"
 	"strings"
 	"testing"
 )
@@ -161,3 +163,22 @@ func TestRemoveBlacklist(t *testing.T) {
 		t.Errorf(`Invalid content, got %s instead of %s`, content, want)
 	}
 }
+
+func BenchmarkExtractContent(b *testing.B) {
+	var testCases = map[string][]byte{
+		"miniflux_github.html":    {},
+		"miniflux_wikipedia.html": {},
+	}
+	for filename := range testCases {
+		data, err := os.ReadFile("testdata/" + filename)
+		if err != nil {
+			b.Fatalf(`Unable to read file %q: %v`, filename, err)
+		}
+		testCases[filename] = data
+	}
+	for range b.N {
+		for _, v := range testCases {
+			ExtractContent(bytes.NewReader(v))
+		}
+	}
+}

+ 1 - 0
internal/reader/readability/testdata

@@ -0,0 +1 @@
+../../reader/sanitizer/testdata/