Pārlūkot izejas kodu

perf(readability): improve getClassWeight speed

Before

```console
$ go test -bench=.
goos: linux
goarch: arm64
pkg: miniflux.app/v2/internal/reader/readability
BenchmarkExtractContent-8   	     34	 86102474 ns/op
BenchmarkGetWeight-8        	  10573	    103045 ns/op
PASS
ok  	miniflux.app/v2/internal/reader/readability	5.409s
```

After

```console
$ go test -bench=.
goos: linux
goarch: arm64
pkg: miniflux.app/v2/internal/reader/readability
BenchmarkExtractContent-8   	     56	 83130924 ns/op
BenchmarkGetWeight-8        	 246541	     5241 ns/op
PASS
ok  	miniflux.app/v2/internal/reader/readability	6.026s
```

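This should make ProcessFeedEntries marginally faster, while saving
some memory: the class/id weighting now uses plain substring checks
over fixed keyword lists instead of regular-expression matching, which
makes getWeight itself roughly 20x faster in the benchmark above.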
jvoisin 9 mēneši atpakaļ
vecāks
revīzija
aed99e65c1

+ 19 - 15
internal/reader/readability/readability.go

@@ -27,8 +27,8 @@ var (
 	maybeCandidate    = [...]string{"and", "article", "body", "column", "main", "shadow"}
 	unlikelyCandidate = [...]string{"banner", "breadcrumbs", "combx", "comment", "community", "cover-wrap", "disqus", "extra", "foot", "header", "legends", "menu", "modal", "related", "remark", "replies", "rss", "shoutbox", "sidebar", "skyscraper", "social", "sponsor", "supplemental", "ad-break", "agegate", "pagination", "pager", "popup", "yom-remote"}
 
-	negativeRegexp = regexp.MustCompile(`hid|banner|combx|comment|com-|contact|foot|masthead|media|meta|modal|outbrain|promo|related|scroll|share|shoutbox|sidebar|skyscraper|sponsor|shopping|tags|tool|widget|byline|author|dateline|writtenby`)
-	positiveRegexp = regexp.MustCompile(`article|body|content|entry|hentry|h-entry|main|page|pagination|post|text|blog|story`)
+	positive = [...]string{"article", "blog", "body", "content", "entry", "h-entry", "hentry", "main", "page", "pagination", "post", "story", "text"}
+	negative = [...]string{"author", "banner", "byline", "com-", "combx", "comment", "contact", "dateline", "foot", "hid", "masthead", "media", "meta", "modal", "outbrain", "promo", "related", "scroll", "share", "shopping", "shoutbox", "sidebar", "skyscraper", "sponsor", "tags", "tool", "widget", "writtenby"}
 )
 
 type candidate struct {
@@ -303,26 +303,30 @@ func getClassWeight(s *goquery.Selection) float32 {
 	weight := 0
 
 	if class, ok := s.Attr("class"); ok {
-		class = strings.ToLower(class)
-		if negativeRegexp.MatchString(class) {
-			weight -= 25
-		} else if positiveRegexp.MatchString(class) {
-			weight += 25
-		}
+		weight += getWeight(class)
 	}
-
 	if id, ok := s.Attr("id"); ok {
-		id = strings.ToLower(id)
-		if negativeRegexp.MatchString(id) {
-			weight -= 25
-		} else if positiveRegexp.MatchString(id) {
-			weight += 25
-		}
+		weight += getWeight(id)
 	}
 
 	return float32(weight)
 }
 
+func getWeight(s string) int {
+	s = strings.ToLower(s)
+	for _, pos := range negative {
+		if strings.Contains(s, pos) {
+			return -25
+		}
+	}
+	for _, pos := range positive {
+		if strings.Contains(s, pos) {
+			return +25
+		}
+	}
+	return 0
+}
+
 func transformMisusedDivsIntoParagraphs(document *goquery.Document) {
 	document.Find("div").Each(func(i int, s *goquery.Selection) {
 		html, _ := s.Html()

+ 16 - 0
internal/reader/readability/readability_test.go

@@ -1314,3 +1314,19 @@ func TestContainsSentence(t *testing.T) {
 		})
 	}
 }
+
+func BenchmarkGetWeight(b *testing.B) {
+	testCases := []string{
+		"p-3 color-bg-accent-emphasis color-fg-on-emphasis show-on-focus js-skip-to-content",
+		"d-flex flex-column mb-3",
+		"AppHeader-search-control AppHeader-search-control-overflow",
+		"Button Button--iconOnly Button--invisible Button--medium mr-1 px-2 py-0 d-flex flex-items-center rounded-1 color-fg-muted",
+		"sr-only",
+		"validation-12753bbc-b4d1-4e10-bec6-92e585d1699d",
+	}
+	for range b.N {
+		for _, v := range testCases {
+			getWeight(v)
+		}
+	}
+}