Parcourir la source

perf(readability): simplify removeUnlikelyCandidates

- Use an iterator instead of generating a whole slice when iterating on the selection.
- Using an iterator allows us to use a for-loop construct instead of a lambda,
  which is a bit clearer.
- Do the filtering in Find()'s selector instead of in the loop. This doesn't
  matter much now that we're using an iterator, but it makes the code a bit
  more obvious/simpler, and likely reduces the number of iterations a bit.
jvoisin il y a 9 mois
Parent
commit
1de9cf4241
1 fichiers modifiés avec 10 ajouts et 6 suppressions
  1. 10 6
      internal/reader/readability/readability.go

+ 10 - 6
internal/reader/readability/readability.go

@@ -208,14 +208,18 @@ func shouldRemoveCandidate(str string) bool {
 }
 
 func removeUnlikelyCandidates(document *goquery.Document) {
-	document.Find("*").Each(func(i int, s *goquery.Selection) {
-		if s.Length() == 0 || s.Get(0).Data == "html" || s.Get(0).Data == "body" {
-			return
+	// Only select tags with either a class or an id attribute,
+	// and never the html nor body tags, as we don't want to ever remove them.
+	selector := "[class]:not(body,html)" + "," + "[id]:not(body,html)"
+
+	for _, s := range document.Find(selector).EachIter() {
+		if s.Length() == 0 {
+			continue
 		}
 
 		// Don't remove elements within code blocks (pre or code tags)
-		if s.Closest("pre, code").Length() > 0 {
-			return
+		if s.Closest("pre,code").Length() > 0 {
+			continue
 		}
 
 		if class, ok := s.Attr("class"); ok && shouldRemoveCandidate(class) {
@@ -223,7 +227,7 @@ func removeUnlikelyCandidates(document *goquery.Document) {
 		} else if id, ok := s.Attr("id"); ok && shouldRemoveCandidate(id) {
 			s.Remove()
 		}
-	})
+	}
 }
 
 func getTopCandidate(document *goquery.Document, candidates candidateList) *candidate {