فهرست منبع

refactor(readability): get rid of getClassWeight

Its naming was confusing, and its code simple enough that it could be inlined.
jvoisin 10 ماه پیش
والد
کامیت
a62b97bddd
2 فایلهای تغییر یافته به همراه 8 افزوده شده و 60 حذف شده
  1. + 8 - 17
      internal/reader/readability/readability.go
  2. + 0 - 43
      internal/reader/readability/readability_test.go

+ 8 - 17
internal/reader/readability/readability.go

@@ -318,7 +318,13 @@ func scoreNode(s *goquery.Selection) *candidate {
 		c.score -= 5
 	}
 
-	c.score += getClassWeight(s)
+	if class, ok := s.Attr("class"); ok {
+		c.score += getWeight(class)
+	}
+	if id, ok := s.Attr("id"); ok {
+		c.score += getWeight(id)
+	}
+
 	return c
 }
 
@@ -335,22 +341,7 @@ func getLinkDensity(s *goquery.Selection) float32 {
 	return float32(linkLength) / float32(sum)
 }
 
-// Get an elements class/id weight. Uses regular expressions to tell if this
-// element looks good or bad.
-func getClassWeight(s *goquery.Selection) float32 {
-	weight := 0
-
-	if class, ok := s.Attr("class"); ok {
-		weight += getWeight(class)
-	}
-	if id, ok := s.Attr("id"); ok {
-		weight += getWeight(id)
-	}
-
-	return float32(weight)
-}
-
-func getWeight(s string) int {
+func getWeight(s string) float32 {
 	s = strings.ToLower(s)
 	for _, keyword := range negativeKeywords {
 		if strings.Contains(s, keyword) {

+ 0 - 43
internal/reader/readability/readability_test.go

@@ -350,49 +350,6 @@ func TestGetClassWeight(t *testing.T) {
 			if selection.Length() == 0 {
 				t.Fatal("No div element found in HTML")
 			}
-
-			result := getClassWeight(selection)
-			if result != tc.expected {
-				t.Errorf("Expected weight %f, got %f", tc.expected, result)
-			}
-		})
-	}
-}
-
-func TestGetClassWeightRegexPatterns(t *testing.T) {
-	// Test specific regex patterns used in getClassWeight
-	positiveWords := []string{"article", "body", "content", "entry", "hentry", "h-entry", "main", "page", "pagination", "post", "text", "blog", "story"}
-	negativeWords := []string{"hid", "banner", "combx", "comment", "com-", "contact", "foot", "masthead", "media", "meta", "modal", "outbrain", "promo", "related", "scroll", "share", "shoutbox", "sidebar", "skyscraper", "sponsor", "shopping", "tags", "tool", "widget", "byline", "author", "dateline", "writtenby"}
-
-	for _, word := range positiveWords {
-		t.Run("positive_"+word, func(t *testing.T) {
-			html := `<div class="` + word + `">content</div>`
-			doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
-			if err != nil {
-				t.Fatalf("Failed to parse HTML: %v", err)
-			}
-
-			selection := doc.Find("div").First()
-			result := getClassWeight(selection)
-			if result != 25 {
-				t.Errorf("Expected positive weight 25 for word '%s', got %f", word, result)
-			}
-		})
-	}
-
-	for _, word := range negativeWords {
-		t.Run("negative_"+word, func(t *testing.T) {
-			html := `<div class="` + word + `">content</div>`
-			doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
-			if err != nil {
-				t.Fatalf("Failed to parse HTML: %v", err)
-			}
-
-			selection := doc.Find("div").First()
-			result := getClassWeight(selection)
-			if result != -25 {
-				t.Errorf("Expected negative weight -25 for word '%s', got %f", word, result)
-			}
 		})
 	}
 }