1 سال پیش · 6ad5ad0bb2
--- a/internal/reader/readability/readability.go
+++ b/internal/reader/readability/readability.go
@@ -4,7 +4,6 @@
 
				 package readability // import "miniflux.app/v2/internal/reader/readability"
			
 
				 
			
 
				 import (
			
 
				-	"bytes"
			
 
				 	"fmt"
			
 
				 	"io"
			
 
				 	"log/slog"
			
@@ -23,7 +22,6 @@ const (
 
				 
			
 
				 var (
			
 
				 	divToPElementsRegexp = regexp.MustCompile(`(?i)<(a|blockquote|dl|div|img|ol|p|pre|table|ul)`)
			
 
				-	sentenceRegexp       = regexp.MustCompile(`\.( |$)`)
			
 
				 
			
 
				 	blacklistCandidatesRegexp  = regexp.MustCompile(`popupbody|-ad|g-plus`)
			
 
				 	okMaybeItsACandidateRegexp = regexp.MustCompile(`and|article|body|column|main|shadow`)
			
@@ -84,7 +82,7 @@ func ExtractContent(page io.Reader) (baseURL string, extractedContent string, er
 
				 	}
			
 
				 
			
 
				 	document.Find("script,style").Each(func(i int, s *goquery.Selection) {
			
 
				-		removeNodes(s)
			
 
				+		s.Remove()
			
 
				 	})
			
 
				 
			
 
				 	transformMisusedDivsIntoParagraphs(document)
			
@@ -106,7 +104,8 @@ func ExtractContent(page io.Reader) (baseURL string, extractedContent string, er
 
				 // Now that we have the top candidate, look through its siblings for content that might also be related.
			
 
				 // Things like preambles, content split by ads that we removed, etc.
			
 
				 func getArticle(topCandidate *candidate, candidates candidateList) string {
			
 
				-	output := bytes.NewBufferString("<div>")
			
 
				+	var output strings.Builder
			
 
				+	output.WriteString("<div>")
			
 
				 	siblingScoreThreshold := max(10, topCandidate.score*.2)
			
 
				 
			
 
				 	topCandidate.selection.Siblings().Union(topCandidate.selection).Each(func(i int, s *goquery.Selection) {
			
@@ -124,10 +123,14 @@ func getArticle(topCandidate *candidate, candidates candidateList) string {
 
				 			content := s.Text()
			
 
				 			contentLength := len(content)
			
 
				 
			
 
				-			if contentLength >= 80 && linkDensity < .25 {
			
 
				-				append = true
			
 
				-			} else if contentLength < 80 && linkDensity == 0 && sentenceRegexp.MatchString(content) {
			
 
				-				append = true
			
 
				+			if contentLength >= 80 {
			
 
				+				if linkDensity < .25 {
			
 
				+					append = true
			
 
				+				}
			
 
				+			} else {
			
 
				+				if linkDensity == 0 && containsSentence(content) {
			
 
				+					append = true
			
 
				+				}
			
 
				 			}
			
 
				 		}
			
 
				 
			
@@ -138,7 +141,7 @@ func getArticle(topCandidate *candidate, candidates candidateList) string {
 
				 			}
			
 
				 
			
 
				 			html, _ := s.Html()
			
 
				-			fmt.Fprintf(output, "<%s>%s</%s>", tag, html, tag)
			
 
				+			output.WriteString("<" + tag + ">" + html + "</" + tag + ">")
			
 
				 		}
			
 
				 	})
			
 
				 
			
@@ -156,9 +159,9 @@ func removeUnlikelyCandidates(document *goquery.Document) {
 
				 		str := strings.ToLower(class + id)
			
 
				 
			
 
				 		if blacklistCandidatesRegexp.MatchString(str) {
			
 
				-			removeNodes(s)
			
 
				+			s.Remove()
			
 
				 		} else if unlikelyCandidatesRegexp.MatchString(str) && !okMaybeItsACandidateRegexp.MatchString(str) {
			
 
				-			removeNodes(s)
			
 
				+			s.Remove()
			
 
				 		}
			
 
				 	})
			
 
				 }
			
@@ -222,7 +225,7 @@ func getCandidates(document *goquery.Document) candidateList {
 
				 		contentScore += float32(strings.Count(text, ",") + 1)
			
 
				 
			
 
				 		// For every 100 characters in this paragraph, add another point. Up to 3 points.
			
 
				-		contentScore += float32(min(int(len(text)/100.0), 3))
			
 
				+		contentScore += float32(min(len(text)/100.0, 3))
			
 
				 
			
 
				 		candidates[parentNode].score += contentScore
			
 
				 		if grandParentNode != nil {
			
@@ -261,13 +264,14 @@ func scoreNode(s *goquery.Selection) *candidate {
 
				 // Get the density of links as a percentage of the content
			
 
				 // This is the amount of text that is inside a link divided by the total text in the node.
			
 
				 func getLinkDensity(s *goquery.Selection) float32 {
			
 
				-	linkLength := len(s.Find("a").Text())
			
 
				 	textLength := len(s.Text())
			
 
				 
			
 
				 	if textLength == 0 {
			
 
				 		return 0
			
 
				 	}
			
 
				 
			
 
				+	linkLength := len(s.Find("a").Text())
			
 
				+
			
 
				 	return float32(linkLength) / float32(textLength)
			
 
				 }
			
 
				 
			
@@ -278,25 +282,20 @@ func getClassWeight(s *goquery.Selection) float32 {
 
				 	class, _ := s.Attr("class")
			
 
				 	id, _ := s.Attr("id")
			
 
				 
			
 
				-	class = strings.ToLower(class)
			
 
				-	id = strings.ToLower(id)
			
 
				-
			
 
				 	if class != "" {
			
 
				+		class = strings.ToLower(class)
			
 
				 		if negativeRegexp.MatchString(class) {
			
 
				 			weight -= 25
			
 
				-		}
			
 
				-
			
 
				-		if positiveRegexp.MatchString(class) {
			
 
				+		} else if positiveRegexp.MatchString(class) {
			
 
				 			weight += 25
			
 
				 		}
			
 
				 	}
			
 
				 
			
 
				 	if id != "" {
			
 
				+		id = strings.ToLower(id)
			
 
				 		if negativeRegexp.MatchString(id) {
			
 
				 			weight -= 25
			
 
				-		}
			
 
				-
			
 
				-		if positiveRegexp.MatchString(id) {
			
 
				+		} else if positiveRegexp.MatchString(id) {
			
 
				 			weight += 25
			
 
				 		}
			
 
				 	}
			
@@ -314,11 +313,6 @@ func transformMisusedDivsIntoParagraphs(document *goquery.Document) {
 
				 	})
			
 
				 }
			
 
				 
			
 
				-func removeNodes(s *goquery.Selection) {
			
 
				-	s.Each(func(i int, s *goquery.Selection) {
			
 
				-		parent := s.Parent()
			
 
				-		if parent.Length() > 0 {
			
 
				-			parent.Get(0).RemoveChild(s.Get(0))
			
 
				-		}
			
 
				-	})
			
 
				+func containsSentence(content string) bool {
			
 
				+	return strings.HasSuffix(content, ".") || strings.Contains(content, ". ")
			
 
				 }
			
--- a/internal/reader/readability/readability_test.go
+++ b/internal/reader/readability/readability_test.go
@@ -100,3 +100,64 @@ func TestWithoutBaseURL(t *testing.T) {
 
				 		t.Errorf(`Unexpected base URL, got %q instead of ""`, baseURL)
			
 
				 	}
			
 
				 }
			
 
				+
			
 
				+func TestRemoveStyleScript(t *testing.T) {
			
 
				+	html := `
			
 
				+		<html>
			
 
				+			<head>
			
 
				+				<title>Test</title>
			
 
				+				    <script src="tololo.js"></script>
			
 
				+			</head>
			
 
				+			<body>
			
 
				+				<script src="tololo.js"></script>
			
 
				+				<style>
			
 
				+			  		h1 {color:red;}
			
 
				+			  		p {color:blue;}
			
 
				+				</style>
			
 
				+				<article>Some content</article>
			
 
				+			</body>
			
 
				+		</html>`
			
 
				+	want := `<div><div><article>Somecontent</article></div></div>`
			
 
				+
			
 
				+	_, content, err := ExtractContent(strings.NewReader(html))
			
 
				+	if err != nil {
			
 
				+		t.Fatal(err)
			
 
				+	}
			
 
				+
			
 
				+	content = strings.ReplaceAll(content, "\n", "")
			
 
				+	content = strings.ReplaceAll(content, " ", "")
			
 
				+	content = strings.ReplaceAll(content, "\t", "")
			
 
				+
			
 
				+	if content != want {
			
 
				+		t.Errorf(`Invalid content, got %s instead of %s`, content, want)
			
 
				+	}
			
 
				+}
			
 
				+
			
 
				+func TestRemoveBlacklist(t *testing.T) {
			
 
				+	html := `
			
 
				+		<html>
			
 
				+			<head>
			
 
				+				<title>Test</title>
			
 
				+			</head>
			
 
				+			<body>
			
 
				+				<article class="super-ad">Some content</article>
			
 
				+				<article class="g-plus-crap">Some other thing</article>
			
 
				+				<article class="stuff popupbody">And more</article>
			
 
				+				<article class="legit">Valid!</article>
			
 
				+			</body>
			
 
				+		</html>`
			
 
				+	want := `<div><div><articleclass="legit">Valid!</article></div></div>`
			
 
				+
			
 
				+	_, content, err := ExtractContent(strings.NewReader(html))
			
 
				+	if err != nil {
			
 
				+		t.Fatal(err)
			
 
				+	}
			
 
				+
			
 
				+	content = strings.ReplaceAll(content, "\n", "")
			
 
				+	content = strings.ReplaceAll(content, " ", "")
			
 
				+	content = strings.ReplaceAll(content, "\t", "")
			
 
				+
			
 
				+	if content != want {
			
 
				+		t.Errorf(`Invalid content, got %s instead of %s`, content, want)
			
 
				+	}
			
 
				+}