Преглед на файлове

Refactor internal/reader/readability/testdata

- Use chained strings.Contains instead of a regex for
  blacklistCandidatesRegexp, as this is a bit faster
- Simplify a Find.Each.Remove to Find.Remove
- Don't concatenate id and class for removeUnlikelyCandidates, as it makes no
  sense to match on overlaps. It might also marginally improve performances, as
  regex now have to run on two strings separately, instead of both.
- Add a small benchmark
jvoisin преди 1 година
родител
ревизия
2df59b4865
променени са 3 файла, в които са добавени 43 реда и са изтрити 15 реда
  1. 21 15
      internal/reader/readability/readability.go
  2. 21 0
      internal/reader/readability/readability_test.go
  3. 1 0
      internal/reader/readability/testdata

+ 21 - 15
internal/reader/readability/readability.go

@@ -23,7 +23,6 @@ const (
 var (
 	divToPElementsRegexp = regexp.MustCompile(`(?i)<(a|blockquote|dl|div|img|ol|p|pre|table|ul)`)
 
-	blacklistCandidatesRegexp  = regexp.MustCompile(`popupbody|-ad|g-plus`)
 	okMaybeItsACandidateRegexp = regexp.MustCompile(`and|article|body|column|main|shadow`)
 	unlikelyCandidatesRegexp   = regexp.MustCompile(`banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|foot|header|legends|menu|modal|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote`)
 
@@ -81,9 +80,7 @@ func ExtractContent(page io.Reader) (baseURL string, extractedContent string, er
 		}
 	}
 
-	document.Find("script,style").Each(func(i int, s *goquery.Selection) {
-		s.Remove()
-	})
+	document.Find("script,style").Remove()
 
 	transformMisusedDivsIntoParagraphs(document)
 	removeUnlikelyCandidates(document)
@@ -150,18 +147,29 @@ func getArticle(topCandidate *candidate, candidates candidateList) string {
 }
 
 func removeUnlikelyCandidates(document *goquery.Document) {
+	var shouldRemove = func(str string) bool {
+		str = strings.ToLower(str)
+		if strings.Contains(str, "popupbody") || strings.Contains(str, "-ad") || strings.Contains(str, "g-plus") {
+			return true
+		} else if unlikelyCandidatesRegexp.MatchString(str) && !okMaybeItsACandidateRegexp.MatchString(str) {
+			return true
+		}
+		return false
+	}
+
 	document.Find("*").Each(func(i int, s *goquery.Selection) {
 		if s.Length() == 0 || s.Get(0).Data == "html" || s.Get(0).Data == "body" {
 			return
 		}
-		class, _ := s.Attr("class")
-		id, _ := s.Attr("id")
-		str := strings.ToLower(class + id)
 
-		if blacklistCandidatesRegexp.MatchString(str) {
-			s.Remove()
-		} else if unlikelyCandidatesRegexp.MatchString(str) && !okMaybeItsACandidateRegexp.MatchString(str) {
-			s.Remove()
+		if class, ok := s.Attr("class"); ok {
+			if shouldRemove(class) {
+				s.Remove()
+			}
+		} else if id, ok := s.Attr("id"); ok {
+			if shouldRemove(id) {
+				s.Remove()
+			}
 		}
 	})
 }
@@ -279,10 +287,8 @@ func getLinkDensity(s *goquery.Selection) float32 {
 // element looks good or bad.
 func getClassWeight(s *goquery.Selection) float32 {
 	weight := 0
-	class, _ := s.Attr("class")
-	id, _ := s.Attr("id")
 
-	if class != "" {
+	if class, ok := s.Attr("class"); ok {
 		class = strings.ToLower(class)
 		if negativeRegexp.MatchString(class) {
 			weight -= 25
@@ -291,7 +297,7 @@ func getClassWeight(s *goquery.Selection) float32 {
 		}
 	}
 
-	if id != "" {
+	if id, ok := s.Attr("id"); ok {
 		id = strings.ToLower(id)
 		if negativeRegexp.MatchString(id) {
 			weight -= 25

+ 21 - 0
internal/reader/readability/readability_test.go

@@ -4,6 +4,8 @@
 package readability // import "miniflux.app/v2/internal/reader/readability"
 
 import (
+	"bytes"
+	"os"
 	"strings"
 	"testing"
 )
@@ -161,3 +163,22 @@ func TestRemoveBlacklist(t *testing.T) {
 		t.Errorf(`Invalid content, got %s instead of %s`, content, want)
 	}
 }
+
+func BenchmarkExtractContent(b *testing.B) {
+	var testCases = map[string][]byte{
+		"miniflux_github.html":    {},
+		"miniflux_wikipedia.html": {},
+	}
+	for filename := range testCases {
+		data, err := os.ReadFile("testdata/" + filename)
+		if err != nil {
+			b.Fatalf(`Unable to read file %q: %v`, filename, err)
+		}
+		testCases[filename] = data
+	}
+	for range b.N {
+		for _, v := range testCases {
+			ExtractContent(bytes.NewReader(v))
+		}
+	}
+}

+ 1 - 0
internal/reader/readability/testdata

@@ -0,0 +1 @@
+../../reader/sanitizer/testdata/