Pārlūkot izejas kodu

refactor(readability): get rid of getClassWeight

Its naming was confusing, and its code was simple enough to be inlined.
jvoisin 8 mēneši atpakaļ
vecāks
revīzija
a62b97bddd

+ 8 - 17
internal/reader/readability/readability.go

@@ -318,7 +318,13 @@ func scoreNode(s *goquery.Selection) *candidate {
 		c.score -= 5
 	}
 
-	c.score += getClassWeight(s)
+	if class, ok := s.Attr("class"); ok {
+		c.score += getWeight(class)
+	}
+	if id, ok := s.Attr("id"); ok {
+		c.score += getWeight(id)
+	}
+
 	return c
 }
 
@@ -335,22 +341,7 @@ func getLinkDensity(s *goquery.Selection) float32 {
 	return float32(linkLength) / float32(sum)
 }
 
-// Get an elements class/id weight. Uses regular expressions to tell if this
-// element looks good or bad.
-func getClassWeight(s *goquery.Selection) float32 {
-	weight := 0
-
-	if class, ok := s.Attr("class"); ok {
-		weight += getWeight(class)
-	}
-	if id, ok := s.Attr("id"); ok {
-		weight += getWeight(id)
-	}
-
-	return float32(weight)
-}
-
-func getWeight(s string) int {
+func getWeight(s string) float32 {
 	s = strings.ToLower(s)
 	for _, keyword := range negativeKeywords {
 		if strings.Contains(s, keyword) {

+ 0 - 43
internal/reader/readability/readability_test.go

@@ -350,49 +350,6 @@ func TestGetClassWeight(t *testing.T) {
 			if selection.Length() == 0 {
 				t.Fatal("No div element found in HTML")
 			}
-
-			result := getClassWeight(selection)
-			if result != tc.expected {
-				t.Errorf("Expected weight %f, got %f", tc.expected, result)
-			}
-		})
-	}
-}
-
-func TestGetClassWeightRegexPatterns(t *testing.T) {
-	// Test specific regex patterns used in getClassWeight
-	positiveWords := []string{"article", "body", "content", "entry", "hentry", "h-entry", "main", "page", "pagination", "post", "text", "blog", "story"}
-	negativeWords := []string{"hid", "banner", "combx", "comment", "com-", "contact", "foot", "masthead", "media", "meta", "modal", "outbrain", "promo", "related", "scroll", "share", "shoutbox", "sidebar", "skyscraper", "sponsor", "shopping", "tags", "tool", "widget", "byline", "author", "dateline", "writtenby"}
-
-	for _, word := range positiveWords {
-		t.Run("positive_"+word, func(t *testing.T) {
-			html := `<div class="` + word + `">content</div>`
-			doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
-			if err != nil {
-				t.Fatalf("Failed to parse HTML: %v", err)
-			}
-
-			selection := doc.Find("div").First()
-			result := getClassWeight(selection)
-			if result != 25 {
-				t.Errorf("Expected positive weight 25 for word '%s', got %f", word, result)
-			}
-		})
-	}
-
-	for _, word := range negativeWords {
-		t.Run("negative_"+word, func(t *testing.T) {
-			html := `<div class="` + word + `">content</div>`
-			doc, err := goquery.NewDocumentFromReader(strings.NewReader(html))
-			if err != nil {
-				t.Fatalf("Failed to parse HTML: %v", err)
-			}
-
-			selection := doc.Find("div").First()
-			result := getClassWeight(selection)
-			if result != -25 {
-				t.Errorf("Expected negative weight -25 for word '%s', got %f", word, result)
-			}
 		})
 	}
 }