فهرست منبع

Add rewrite rule to remove dom elements

Lukas Dietrich 4 سال پیش
والد
کامیت
93596c1218
3 فایلهای تغییر یافته به همراه 112 افزوده شده و 38 حذف شده
  1. 12 0
      reader/rewrite/rewrite_functions.go
  2. 73 37
      reader/rewrite/rewriter.go
  3. 27 1
      reader/rewrite/rewriter_test.go

+ 12 - 0
reader/rewrite/rewrite_functions.go

@@ -229,3 +229,15 @@ func replaceCustom(entryContent string, searchTerm string, replaceTerm string) s
 	}
 	return entryContent
 }
+
+// removeCustom deletes every element matched by the CSS selector from
+// entryContent and returns the remaining markup.
+//
+// The content is parsed as an HTML document, the matching nodes are removed,
+// and the inner HTML of the first <body> element is serialized back. If the
+// content cannot be parsed or the result cannot be rendered, entryContent is
+// returned unchanged.
+func removeCustom(entryContent string, selector string) string {
+	doc, err := goquery.NewDocumentFromReader(strings.NewReader(entryContent))
+	if err != nil {
+		return entryContent
+	}
+
+	doc.Find(selector).Remove()
+
+	output, err := doc.Find("body").First().Html()
+	if err != nil {
+		// Keep the original content instead of replacing the entry with an
+		// empty string when the modified document cannot be rendered.
+		return entryContent
+	}
+	return output
+}

+ 73 - 37
reader/rewrite/rewriter.go

@@ -5,14 +5,18 @@
 package rewrite // import "miniflux.app/reader/rewrite"
 
 import (
-	"regexp"
+	"strconv"
 	"strings"
+	"text/scanner"
 
 	"miniflux.app/logger"
 	"miniflux.app/url"
 )
 
-var customReplaceRuleRegex = regexp.MustCompile(`replace\("(.*)"\|"(.*)"\)`)
+// rule is a single rewrite rule: a name plus its string arguments, in the
+// order they were written.
+type rule struct {
+	name string   // rule identifier, e.g. "replace" or "remove"
+	args []string // unquoted string arguments; nil when the rule has none
+}
 
 // Rewriter modify item contents with a set of rewriting rules.
 func Rewriter(entryURL, entryContent, customRewriteRules string) string {
@@ -21,46 +25,78 @@ func Rewriter(entryURL, entryContent, customRewriteRules string) string {
 		rulesList = customRewriteRules
 	}
 
-	rules := strings.Split(rulesList, ",")
-	rules = append(rules, "add_pdf_download_link")
+	rules := parseRules(rulesList)
+	rules = append(rules, rule{name: "add_pdf_download_link"})
 
 	logger.Debug(`[Rewrite] Applying rules %v for %q`, rules, entryURL)
 
 	for _, rule := range rules {
-		rule := strings.TrimSpace(rule)
-		switch rule {
-		case "add_image_title":
-			entryContent = addImageTitle(entryURL, entryContent)
-		case "add_mailto_subject":
-			entryContent = addMailtoSubject(entryURL, entryContent)
-		case "add_dynamic_image":
-			entryContent = addDynamicImage(entryURL, entryContent)
-		case "add_youtube_video":
-			entryContent = addYoutubeVideo(entryURL, entryContent)
-		case "add_invidious_video":
-			entryContent = addInvidiousVideo(entryURL, entryContent)
-		case "add_youtube_video_using_invidious_player":
-			entryContent = addYoutubeVideoUsingInvidiousPlayer(entryURL, entryContent)
-		case "add_pdf_download_link":
-			entryContent = addPDFLink(entryURL, entryContent)
-		case "nl2br":
-			entryContent = replaceLineFeeds(entryContent)
-		case "convert_text_link", "convert_text_links":
-			entryContent = replaceTextLinks(entryContent)
-		case "fix_medium_images":
-			entryContent = fixMediumImages(entryURL, entryContent)
-		case "use_noscript_figure_images":
-			entryContent = useNoScriptImages(entryURL, entryContent)
-		default:
-			if strings.Contains(rule, "replace") {
-				// Format: replace("search-term"|"replace-term")
-				args := customReplaceRuleRegex.FindStringSubmatch(rule)
-				if len(args) >= 3 {
-					entryContent = replaceCustom(entryContent, args[1], args[2])
-				} else {
-					logger.Debug("[Rewrite] Cannot find search and replace terms for replace rule %s", rule)
-				}
+		entryContent = applyRule(entryURL, entryContent, rule)
+	}
+
+	return entryContent
+}
+
+// parseRules tokenizes a rule list such as
+//
+//	add_dynamic_image,replace("search"|"replacement"),remove(".selector")
+//
+// Every identifier starts a new rule and every double-quoted string is
+// appended to the arguments of the most recent rule. All other characters
+// (commas, parentheses, pipes, ...) only act as separators. A string that
+// appears before any identifier is dropped because there is no rule to attach
+// it to.
+func parseRules(rulesText string) []rule {
+	var rules []rule
+
+	var scan scanner.Scanner
+	scan.Init(strings.NewReader(rulesText))
+	// Init resets Mode to scanner.GoTokens, so the restricted mode has to be
+	// set afterwards. Otherwise Go comments and char literals are still
+	// recognized and an unquoted "//" would swallow the rest of the rule list.
+	scan.Mode = scanner.ScanIdents | scanner.ScanStrings
+
+	for {
+		switch scan.Scan() {
+		case scanner.Ident:
+			rules = append(rules, rule{name: scan.TokenText()})
+
+		case scanner.String:
+			if len(rules) == 0 {
+				continue
+			}
+
+			raw := scan.TokenText()
+			arg, err := strconv.Unquote(raw)
+			if err != nil {
+				// The token is not a valid Go string literal (for example it
+				// contains a regex escape such as \d). Use the raw text
+				// between the quotes instead of silently passing "".
+				arg = strings.TrimSuffix(strings.TrimPrefix(raw, `"`), `"`)
+			}
+
+			last := &rules[len(rules)-1]
+			last.args = append(last.args, arg)
+
+		case scanner.EOF:
+			return rules
+		}
+	}
+}
+
+func applyRule(entryURL, entryContent string, rule rule) string {
+	switch rule.name {
+	case "add_image_title":
+		entryContent = addImageTitle(entryURL, entryContent)
+	case "add_mailto_subject":
+		entryContent = addMailtoSubject(entryURL, entryContent)
+	case "add_dynamic_image":
+		entryContent = addDynamicImage(entryURL, entryContent)
+	case "add_youtube_video":
+		entryContent = addYoutubeVideo(entryURL, entryContent)
+	case "add_invidious_video":
+		entryContent = addInvidiousVideo(entryURL, entryContent)
+	case "add_youtube_video_using_invidious_player":
+		entryContent = addYoutubeVideoUsingInvidiousPlayer(entryURL, entryContent)
+	case "add_pdf_download_link":
+		entryContent = addPDFLink(entryURL, entryContent)
+	case "nl2br":
+		entryContent = replaceLineFeeds(entryContent)
+	case "convert_text_link", "convert_text_links":
+		entryContent = replaceTextLinks(entryContent)
+	case "fix_medium_images":
+		entryContent = fixMediumImages(entryURL, entryContent)
+	case "use_noscript_figure_images":
+		entryContent = useNoScriptImages(entryURL, entryContent)
+	case "replace":
+		// Format: replace("search-term"|"replace-term")
+		if len(rule.args) >= 2 {
+			entryContent = replaceCustom(entryContent, rule.args[0], rule.args[1])
+		} else {
+			logger.Debug("[Rewrite] Cannot find search and replace terms for replace rule %s", rule)
+		}
+	case "remove":
+		// Format: remove("#selector > .element, .another")
+		if len(rule.args) >= 1 {
+			entryContent = removeCustom(entryContent, rule.args[0])
+		} else {
+			logger.Debug("[Rewrite] Cannot find selector for remove rule %s", rule)
 		}
 	}
 

+ 27 - 1
reader/rewrite/rewriter_test.go

@@ -5,10 +5,26 @@
 package rewrite // import "miniflux.app/reader/rewrite"
 
 import (
+	"reflect"
 	"strings"
 	"testing"
 )
 
+// TestParseRules checks that a comma-separated rule list is split into rule
+// names and their quoted arguments.
+func TestParseRules(t *testing.T) {
+	const input = `add_dynamic_image,replace("article/(.*).svg"|"article/$1.png"),remove(".spam, .ads:not(.keep)")`
+
+	want := []rule{
+		{name: "add_dynamic_image"},
+		{name: "replace", args: []string{"article/(.*).svg", "article/$1.png"}},
+		{name: "remove", args: []string{".spam, .ads:not(.keep)"}},
+	}
+
+	if got := parseRules(input); !reflect.DeepEqual(want, got) {
+		t.Errorf(`Parsed rules do not match expected rules: got %v instead of %v`, got, want)
+	}
+}
+
 func TestReplaceTextLinks(t *testing.T) {
 	scenarios := map[string]string{
 		`This is a link to example.org`:                                              `This is a link to example.org`,
@@ -234,7 +250,17 @@ func TestRewriteNoScriptImageWithNoScriptTag(t *testing.T) {
+// TestRewriteReplaceCustom runs Rewriter with a custom
+// replace("search"|"replacement") rule and checks that only the image under
+// article/ has its .svg extension rewritten to .png.
 func TestRewriteReplaceCustom(t *testing.T) {
 	content := `<img src="http://example.org/logo.svg"><img src="https://example.org/article/picture.svg">`
 	expected := `<img src="http://example.org/logo.svg"><img src="https://example.org/article/picture.png">`
-	output := Rewriter("https://example.org/artcle", content, `replace("article/(.*).svg"|"article/$1.png")`)
+	output := Rewriter("https://example.org/article", content, `replace("article/(.*).svg"|"article/$1.png")`)
+
+	if expected != output {
+		t.Errorf(`Not expected output: %s`, output)
+	}
+}
+
+func TestRewriteRemoveCustom(t *testing.T) {
+	content := `<div>Lorem Ipsum <span class="spam">I dont want to see this</span><span class="ads keep">Super important info</span></div>`
+	expected := `<div>Lorem Ipsum <span class="ads keep">Super important info</span></div>`
+	output := Rewriter("https://example.org/article", content, `remove(".spam, .ads:not(.keep)")`)
 
 	if expected != output {
 		t.Errorf(`Not expected output: %s`, output)