|
|
@@ -107,6 +107,28 @@ func ExtractContent(page io.Reader) (baseURL string, extractedContent string, er
|
|
|
return baseURL, extractedContent, nil
|
|
|
}
|
|
|
|
|
|
+func getSelectionLength(s *goquery.Selection) int {
|
|
|
+ var getLengthOfTextContent func(*html.Node) int
|
|
|
+ getLengthOfTextContent = func(n *html.Node) int {
|
|
|
+ total := 0
|
|
|
+ if n.Type == html.TextNode {
|
|
|
+ total += len(n.Data)
|
|
|
+ }
|
|
|
+ if n.FirstChild != nil {
|
|
|
+ for c := n.FirstChild; c != nil; c = c.NextSibling {
|
|
|
+ total += getLengthOfTextContent(c)
|
|
|
+ }
|
|
|
+ }
|
|
|
+ return total
|
|
|
+ }
|
|
|
+
|
|
|
+ sum := 0
|
|
|
+ for _, n := range s.Nodes {
|
|
|
+ sum += getLengthOfTextContent(n)
|
|
|
+ }
|
|
|
+ return sum
|
|
|
+}
|
|
|
+
|
|
|
// Now that we have the top candidate, look through its siblings for content that might also be related.
|
|
|
// Things like preambles, content split by ads that we removed, etc.
|
|
|
func getArticle(topCandidate *candidate, candidates candidateList) string {
|
|
|
@@ -127,8 +149,7 @@ func getArticle(topCandidate *candidate, candidates candidateList) string {
|
|
|
} else if s.Is("p") {
|
|
|
tag = node.Data
|
|
|
linkDensity := getLinkDensity(s)
|
|
|
- content := s.Text()
|
|
|
- contentLength := len(content)
|
|
|
+ contentLength := getSelectionLength(s)
|
|
|
|
|
|
if contentLength >= 80 {
|
|
|
if linkDensity < .25 {
|
|
|
@@ -136,6 +157,8 @@ func getArticle(topCandidate *candidate, candidates candidateList) string {
|
|
|
}
|
|
|
} else {
|
|
|
if linkDensity == 0 {
|
|
|
+ // It's a small selection, so .Text doesn't impact performances too much.
|
|
|
+ content := s.Text()
|
|
|
if containsSentence(content) {
|
|
|
append = true
|
|
|
}
|
|
|
@@ -223,10 +246,10 @@ func getCandidates(document *goquery.Document) candidateList {
|
|
|
candidates := make(candidateList)
|
|
|
|
|
|
document.Find(defaultTagsToScore).Each(func(i int, s *goquery.Selection) {
|
|
|
- text := s.Text()
|
|
|
+ textLen := getSelectionLength(s)
|
|
|
|
|
|
// If this paragraph is less than 25 characters, don't even count it.
|
|
|
- if len(text) < 25 {
|
|
|
+ if textLen < 25 {
|
|
|
return
|
|
|
}
|
|
|
|
|
|
@@ -253,10 +276,11 @@ func getCandidates(document *goquery.Document) candidateList {
|
|
|
contentScore := float32(1.0)
|
|
|
|
|
|
// Add points for any commas within this paragraph.
|
|
|
+ text := s.Text()
|
|
|
contentScore += float32(strings.Count(text, ",") + 1)
|
|
|
|
|
|
// For every 100 characters in this paragraph, add another point. Up to 3 points.
|
|
|
- contentScore += float32(min(len(text)/100.0, 3))
|
|
|
+ contentScore += float32(min(textLen/100.0, 3))
|
|
|
|
|
|
candidates[parentNode].score += contentScore
|
|
|
if grandParentNode != nil {
|