Pārlūkot izejas kodu

Improve content scraper

Frédéric Guillot 8 gadi atpakaļ
vecāks
revīzija
c6d9eb3614

+ 8 - 0
reader/rewrite/rewrite_functions.go

@@ -5,6 +5,7 @@
 package rewrite
 
 import (
+	"fmt"
 	"regexp"
 	"strings"
 
@@ -38,3 +39,10 @@ func addYoutubeVideo(entryURL, entryContent string) string {
 	}
 	return entryContent
 }
+
+func addPDFLink(entryURL, entryContent string) string {
+	if strings.HasSuffix(entryURL, ".pdf") {
+		return fmt.Sprintf(`<a href="%s">PDF</a><br>%s`, entryURL, entryContent)
+	}
+	return entryContent
+}

+ 4 - 0
reader/rewrite/rewriter.go

@@ -18,12 +18,16 @@ func Rewriter(entryURL, entryContent, customRewriteRules string) string {
 	}
 
 	rules := strings.Split(rulesList, ",")
+	rules = append(rules, "add_pdf_download_link")
+
 	for _, rule := range rules {
 		switch strings.TrimSpace(rule) {
 		case "add_image_title":
 			entryContent = addImageTitle(entryURL, entryContent)
 		case "add_youtube_video":
 			entryContent = addYoutubeVideo(entryURL, entryContent)
+		case "add_pdf_download_link":
+			entryContent = addPDFLink(entryURL, entryContent)
 		}
 	}
 

+ 10 - 0
reader/rewrite/rewriter_test.go

@@ -58,3 +58,13 @@ func TestRewriteWithXkcdAndNoImage(t *testing.T) {
 		t.Errorf(`Not expected output: got "%s" instead of "%s"`, output, expected)
 	}
 }
+
+func TestRewriteWithPDFLink(t *testing.T) {
+	description := "test"
+	output := Rewriter("https://example.org/document.pdf", description, ``)
+	expected := `<a href="https://example.org/document.pdf">PDF</a><br>test`
+
+	if expected != output {
+		t.Errorf(`Not expected output: got "%s" instead of "%s"`, output, expected)
+	}
+}

+ 6 - 0
reader/scraper/rules.go

@@ -7,10 +7,16 @@ package scraper
 // List of predefined scraper rules (alphabetically sorted)
 // domain => CSS selectors
 var predefinedRules = map[string]string{
+	"github.com":        "article.entry-content",
+	"igen.fr":           "section.corps",
 	"lemonde.fr":        "div#articleBody",
 	"lesjoiesducode.fr": ".blog-post-content img",
 	"linux.com":         "div.content, div[property]",
+	"medium.com":        ".section-content",
 	"opensource.com":    "div[property]",
+	"osnews.com":        "div.newscontent1",
 	"phoronix.com":      "div.content",
 	"techcrunch.com":    "div.article-entry",
+	"theregister.co.uk": "#body",
+	"wired.com":         "main figure, article",
 }

+ 3 - 0
reader/scraper/scraper.go

@@ -33,6 +33,9 @@ func Fetch(websiteURL, rules string) (string, error) {
 		return "", err
 	}
 
+	// The entry URL could be a redirect somewhere else.
+	websiteURL = response.EffectiveURL
+
 	if rules == "" {
 		rules = getPredefinedScraperRules(websiteURL)
 	}

Failā izmaiņas netiks attēlotas, jo tās ir par lielu
+ 1 - 1
server/static/css.go


+ 5 - 0
server/static/css/common.css

@@ -568,6 +568,11 @@ a.button {
     max-width: 100%;
 }
 
+.entry-content figure {
+    margin-top: 15px;
+    margin-bottom: 15px;
+}
+
 .entry-content figure img {
     border: 1px solid #000;
 }

Daži faili netika attēloti, jo izmaiņu fails ir pārāk liels