Browse Source

perf(readability): significantly improve transformMisusedDivsIntoParagraphs

jvoisin 9 months ago
parent
commit
89c32d518d
1 changed files with 16 additions and 2 deletions
  1. 16 2
      internal/reader/readability/readability.go

+ 16 - 2
internal/reader/readability/readability.go

@@ -361,10 +361,24 @@ func getWeight(s string) int {
 
 func transformMisusedDivsIntoParagraphs(document *goquery.Document) {
 	document.Find("div").Each(func(i int, s *goquery.Selection) {
-		html, _ := s.Html()
-		if !divToPElementsRegexp.MatchString(html) {
+		nodes := s.Children().Nodes
+
+		if len(nodes) == 0 {
 			node := s.Get(0)
 			node.Data = "p"
+			return
+		}
+
+		for _, node := range nodes {
+			switch node.Data {
+			case "a", "blockquote", "div", "dl",
+				"img", "ol", "p", "pre",
+				"table", "ul":
+				return
+			default:
+				node := s.Get(0)
+				node.Data = "p"
+			}
 		}
 	})
 }