Browse Source

perf(readability): improve getLinkDensity

- There is no need to materialize all the content of a given Node when we can
  simply compute its length directly. This saves a lot of memory: on the order
  of several megabytes on my instance, with peaks of a couple dozen megabytes.
- One might object to the usage of a recursive construct, but this is a direct
  port of goquery's Text method, so this change doesn't make anything worse.
- linkLength can be computed in a similar way, but this can go in another
  commit, as it's a bit trickier: we need to get the length of every Node that
  has an `a` Node as an ancestor, without walking the whole ancestor chain
  every time.
jvoisin 9 months ago
parent
commit
2f7b2e7375

+ 21 - 3
internal/reader/readability/readability.go

@@ -300,15 +300,33 @@ func scoreNode(s *goquery.Selection) *candidate {
 // Get the density of links as a percentage of the content
 // This is the amount of text that is inside a link divided by the total text in the node.
 func getLinkDensity(s *goquery.Selection) float32 {
-	textLength := len(s.Text())
+	var getLengthOfTextContent func(*html.Node) int
+	getLengthOfTextContent = func(n *html.Node) int {
+		total := 0
+		if n.Type == html.TextNode {
+			total += len(n.Data)
+		}
+		if n.FirstChild != nil {
+			for c := n.FirstChild; c != nil; c = c.NextSibling {
+				total += getLengthOfTextContent(c)
+			}
+		}
+		return total
+	}
+
+	sum := 0
+	for _, n := range s.Nodes {
+		sum += getLengthOfTextContent(n)
+	}
 
-	if textLength == 0 {
+	if sum == 0 {
 		return 0
 	}
 
+	// TODO: use something better than materializing the HTML.
 	linkLength := len(s.Find("a").Text())
 
-	return float32(linkLength) / float32(textLength)
+	return float32(linkLength) / float32(sum)
 }
 
 // Get an elements class/id weight. Uses regular expressions to tell if this

+ 1 - 1
internal/reader/readability/readability_test.go

@@ -1274,7 +1274,7 @@ func TestGetLinkDensity(t *testing.T) {
 			// Use a small epsilon for float comparison
 			epsilon := float32(0.001)
 			if result < tc.expected-epsilon || result > tc.expected+epsilon {
-				t.Errorf("Expected link density %f, got %f", tc.expected, result)
+				t.Errorf("Expected link density %f, got %f for %s", tc.expected, result, tc.name)
 			}
 		})
 	}