浏览代码

refactor(readability): add a getSelectionLength function

When we're only interested in the length of contained Text, there is no need to
materialize it fully to then call len() on the result: we can simply iterate
over the text element and sum their length instead.
jvoisin 9 月之前
父节点
当前提交
8a98926674
共有 1 个文件被更改,包括 29 次插入5 次删除
  1. 29 5
      internal/reader/readability/readability.go

+ 29 - 5
internal/reader/readability/readability.go

@@ -107,6 +107,28 @@ func ExtractContent(page io.Reader) (baseURL string, extractedContent string, er
 	return baseURL, extractedContent, nil
 }
 
+func getSelectionLength(s *goquery.Selection) int {
+	var getLengthOfTextContent func(*html.Node) int
+	getLengthOfTextContent = func(n *html.Node) int {
+		total := 0
+		if n.Type == html.TextNode {
+			total += len(n.Data)
+		}
+		if n.FirstChild != nil {
+			for c := n.FirstChild; c != nil; c = c.NextSibling {
+				total += getLengthOfTextContent(c)
+			}
+		}
+		return total
+	}
+
+	sum := 0
+	for _, n := range s.Nodes {
+		sum += getLengthOfTextContent(n)
+	}
+	return sum
+}
+
 // Now that we have the top candidate, look through its siblings for content that might also be related.
 // Things like preambles, content split by ads that we removed, etc.
 func getArticle(topCandidate *candidate, candidates candidateList) string {
@@ -127,8 +149,7 @@ func getArticle(topCandidate *candidate, candidates candidateList) string {
 		} else if s.Is("p") {
 			tag = node.Data
 			linkDensity := getLinkDensity(s)
-			content := s.Text()
-			contentLength := len(content)
+			contentLength := getSelectionLength(s)
 
 			if contentLength >= 80 {
 				if linkDensity < .25 {
@@ -136,6 +157,8 @@ func getArticle(topCandidate *candidate, candidates candidateList) string {
 				}
 			} else {
 				if linkDensity == 0 {
+					// It's a small selection, so .Text doesn't impact performances too much.
+					content := s.Text()
 					if containsSentence(content) {
 						append = true
 					}
@@ -223,10 +246,10 @@ func getCandidates(document *goquery.Document) candidateList {
 	candidates := make(candidateList)
 
 	document.Find(defaultTagsToScore).Each(func(i int, s *goquery.Selection) {
-		text := s.Text()
+		textLen := getSelectionLength(s)
 
 		// If this paragraph is less than 25 characters, don't even count it.
-		if len(text) < 25 {
+		if textLen < 25 {
 			return
 		}
 
@@ -253,10 +276,11 @@ func getCandidates(document *goquery.Document) candidateList {
 		contentScore := float32(1.0)
 
 		// Add points for any commas within this paragraph.
+		text := s.Text()
 		contentScore += float32(strings.Count(text, ",") + 1)
 
 		// For every 100 characters in this paragraph, add another point. Up to 3 points.
-		contentScore += float32(min(len(text)/100.0, 3))
+		contentScore += float32(min(textLen/100.0, 3))
 
 		candidates[parentNode].score += contentScore
 		if grandParentNode != nil {