Browse Source

perf(readability): Simplify removeUnlikelyCandidates

- Use an array of strings instead of a regex, as was done in ef13756b1a7a7ba30fd34174a5367381fd8b4849
- Extract the `shouldRemove` function from `removeUnlikelyCandidates`, as there
  is no reason to have it there instead of being a proper standalone function.
- Improve a condition where the goquery selection would have its `id`
  attribute left unchecked if a `class` attribute was present, regardless
  of whether `class` was a candidate for removal.
- Add some comments
jvoisin 9 months ago
parent
commit
c064891314
1 changed file with 28 additions and 16 deletions
  1. 28 16
      internal/reader/readability/readability.go

+ 28 - 16
internal/reader/readability/readability.go

@@ -23,8 +23,9 @@ const (
 var (
 	divToPElementsRegexp = regexp.MustCompile(`(?i)<(?:a|blockquote|dl|div|img|ol|p|pre|table|ul)[ />]`)
 
-	okMaybeItsACandidateRegexp = regexp.MustCompile(`and|article|body|column|main|shadow`)
-	unlikelyCandidatesRegexp   = regexp.MustCompile(`banner|breadcrumbs|combx|comment|community|cover-wrap|disqus|extra|foot|header|legends|menu|modal|related|remark|replies|rss|shoutbox|sidebar|skyscraper|social|sponsor|supplemental|ad-break|agegate|pagination|pager|popup|yom-remote`)
+	strongCandidates  = [...]string{"popupbody", "-ad", "g-plus"}
+	maybeCandidate    = [...]string{"and", "article", "body", "column", "main", "shadow"}
+	unlikelyCandidate = [...]string{"banner", "breadcrumbs", "combx", "comment", "community", "cover-wrap", "disqus", "extra", "foot", "header", "legends", "menu", "modal", "related", "remark", "replies", "rss", "shoutbox", "sidebar", "skyscraper", "social", "sponsor", "supplemental", "ad-break", "agegate", "pagination", "pager", "popup", "yom-remote"}
 
 	negativeRegexp = regexp.MustCompile(`hid|banner|combx|comment|com-|contact|foot|masthead|media|meta|modal|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget|byline|author|dateline|writtenby`)
 	positiveRegexp = regexp.MustCompile(`article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story`)
@@ -145,18 +146,33 @@ func getArticle(topCandidate *candidate, candidates candidateList) string {
 	output.WriteString("</div>")
 	return output.String()
 }
+func shouldRemoveCandidate(str string) bool {
+	str = strings.ToLower(str)
 
-func removeUnlikelyCandidates(document *goquery.Document) {
-	var shouldRemove = func(str string) bool {
-		str = strings.ToLower(str)
-		if strings.Contains(str, "popupbody") || strings.Contains(str, "-ad") || strings.Contains(str, "g-plus") {
+	// Those candidates have no false-positives, no need to check against `maybeCandidate`
+	for _, strong := range strongCandidates {
+		if strings.Contains(str, strong) {
 			return true
-		} else if unlikelyCandidatesRegexp.MatchString(str) && !okMaybeItsACandidateRegexp.MatchString(str) {
+		}
+	}
+
+	for _, unlikely := range unlikelyCandidate {
+		if strings.Contains(str, unlikely) {
+			// Do we have a false positive?
+			for _, maybe := range maybeCandidate {
+				if strings.Contains(str, maybe) {
+					return false
+				}
+			}
+
+			// Nope, it's a true positive!
 			return true
 		}
-		return false
 	}
+	return false
+}
 
+func removeUnlikelyCandidates(document *goquery.Document) {
 	document.Find("*").Each(func(i int, s *goquery.Selection) {
 		if s.Length() == 0 || s.Get(0).Data == "html" || s.Get(0).Data == "body" {
 			return
@@ -167,14 +183,10 @@ func removeUnlikelyCandidates(document *goquery.Document) {
 			return
 		}
 
-		if class, ok := s.Attr("class"); ok {
-			if shouldRemove(class) {
-				s.Remove()
-			}
-		} else if id, ok := s.Attr("id"); ok {
-			if shouldRemove(id) {
-				s.Remove()
-			}
+		if class, ok := s.Attr("class"); ok && shouldRemoveCandidate(class) {
+			s.Remove()
+		} else if id, ok := s.Attr("id"); ok && shouldRemoveCandidate(id) {
+			s.Remove()
 		}
 	})
 }