Browse Source

Add rewrite rule to fix Medium.com images

Frédéric Guillot 5 years ago
parent
commit
31435ef83e

+ 1 - 1
reader/readability/readability.go

@@ -76,7 +76,7 @@ func ExtractContent(page io.Reader) (string, error) {
 		return "", err
 	}
 
-	document.Find("script,style,noscript").Each(func(i int, s *goquery.Selection) {
+	document.Find("script,style").Each(func(i int, s *goquery.Selection) {
 		removeNodes(s)
 	})
 

+ 15 - 0
reader/rewrite/rewrite_functions.go

@@ -139,6 +139,21 @@ func addDynamicImage(entryURL, entryContent string) string {
 	return entryContent
 }
 
+// fixMediumImages replaces every "figure.paragraph-image" element found in
+// entryContent with the markup held in that figure's <noscript> child, and
+// returns the inner HTML of the resulting <body>.
+//
+// The text content of the <noscript> element is handed to ReplaceWithHtml, so
+// it is treated as markup (presumably the full-resolution <img> fallback that
+// Medium ships for non-JS clients; confirm against live pages).
+//
+// entryURL is not used by this rule. If entryContent cannot be parsed, or the
+// rewritten document cannot be serialized, entryContent is returned unchanged.
+func fixMediumImages(entryURL, entryContent string) string {
+	doc, err := goquery.NewDocumentFromReader(strings.NewReader(entryContent))
+	if err != nil {
+		return entryContent
+	}
+
+	doc.Find("figure.paragraph-image").Each(func(i int, paragraphImage *goquery.Selection) {
+		noscriptElement := paragraphImage.Find("noscript")
+		// Without a <noscript> fallback, Text() is empty and the replacement
+		// would delete the whole figure; leave such figures untouched.
+		if noscriptElement.Length() == 0 {
+			return
+		}
+		paragraphImage.ReplaceWithHtml(noscriptElement.Text())
+	})
+
+	output, err := doc.Find("body").First().Html()
+	if err != nil {
+		// Same fallback as the parse failure above: keep the original content.
+		return entryContent
+	}
+	return output
+}
+
 func addYoutubeVideo(entryURL, entryContent string) string {
 	matches := youtubeRegex.FindStringSubmatch(entryURL)
 

+ 2 - 0
reader/rewrite/rewriter.go

@@ -43,6 +43,8 @@ func Rewriter(entryURL, entryContent, customRewriteRules string) string {
 			entryContent = replaceLineFeeds(entryContent)
 		case "convert_text_link", "convert_text_links":
 			entryContent = replaceTextLinks(entryContent)
+		case "fix_medium_images":
+			entryContent = fixMediumImages(entryURL, entryContent)
 		}
 	}
 

+ 33 - 1
reader/rewrite/rewriter_test.go

@@ -4,7 +4,10 @@
 
 package rewrite // import "miniflux.app/reader/rewrite"
 
-import "testing"
+import (
+	"strings"
+	"testing"
+)
 
 func TestReplaceTextLinks(t *testing.T) {
 	scenarios := map[string]string{
@@ -176,3 +179,32 @@ func TestConvertTextLinkRewriteRule(t *testing.T) {
 		t.Errorf(`Not expected output: got %q instead of %q`, output, expected)
 	}
 }
+
+// TestMediumImage checks that the "fix_medium_images" rewrite rule replaces a
+// Medium paragraph-image figure with the <img> markup stored in its
+// <noscript> element.
+func TestMediumImage(t *testing.T) {
+	content := `
+		<figure class="ht hu hv hw hx hy cy cz paragraph-image">
+			<div class="hz ia ib ic aj">
+				<div class="cy cz hs">
+					<div class="ii s ib ij">
+						<div class="ik il s">
+							<div class="id ie t u v if aj bk ig ih">
+								<img alt="Image for post" class="t u v if aj im in io" src="https://miro.medium.com/max/60/1*ephLSqSzQYLvb7faDwzRbw.jpeg?q=20" width="1280" height="720"/>
+							</div>
+							<img alt="Image for post" class="id ie t u v if aj c" width="1280" height="720"/>
+							<noscript>
+								<img alt="Image for post" class="t u v if aj" src="https://miro.medium.com/max/2560/1*ephLSqSzQYLvb7faDwzRbw.jpeg" width="1280" height="720" srcSet="https://miro.medium.com/max/552/1*ephLSqSzQYLvb7faDwzRbw.jpeg 276w, https://miro.medium.com/max/1104/1*ephLSqSzQYLvb7faDwzRbw.jpeg 552w, https://miro.medium.com/max/1280/1*ephLSqSzQYLvb7faDwzRbw.jpeg 640w, https://miro.medium.com/max/1400/1*ephLSqSzQYLvb7faDwzRbw.jpeg 700w" sizes="700px"/>
+							</noscript>
+						</div>
+					</div>
+				</div>
+			</div>
+		</figure>
+	`
+	expected := `<img alt="Image for post" class="t u v if aj" src="https://miro.medium.com/max/2560/1*ephLSqSzQYLvb7faDwzRbw.jpeg" width="1280" height="720" srcset="https://miro.medium.com/max/552/1*ephLSqSzQYLvb7faDwzRbw.jpeg 276w, https://miro.medium.com/max/1104/1*ephLSqSzQYLvb7faDwzRbw.jpeg 552w, https://miro.medium.com/max/1280/1*ephLSqSzQYLvb7faDwzRbw.jpeg 640w, https://miro.medium.com/max/1400/1*ephLSqSzQYLvb7faDwzRbw.jpeg 700w" sizes="700px"/>`
+	output := Rewriter("https://example.org/article", content, "fix_medium_images")
+	// The rewritten body keeps the whitespace that surrounded the figure.
+	output = strings.TrimSpace(output)
+
+	if expected != output {
+		t.Errorf(`Not expected output: got %q instead of %q`, output, expected)
+	}
+}

+ 1 - 0
reader/rewrite/rules.go

@@ -30,4 +30,5 @@ var predefinedRules = map[string]string{
 	"invidio.us":             "add_invidious_video",
 	"xkcd.com":               "add_image_title",
 	"framatube.org":          "nl2br,convert_text_link",
+	"medium.com":             "fix_medium_images",
 }

+ 37 - 38
reader/scraper/rules.go

@@ -7,43 +7,42 @@ package scraper // import "miniflux.app/reader/scraper"
 // List of predefined scraper rules (alphabetically sorted)
 // domain => CSS selectors
 var predefinedRules = map[string]string{
-	"bbc.co.uk":           "div.vxp-column--single, div.story-body__inner, ul.gallery-images__list",
-	"cbc.ca":              ".story-content",
-	"darkreading.com":     "#article-main:not(header)",
-	"developpez.com":      "div[itemprop=articleBody]",
-	"dilbert.com":         "span.comic-title-name, img.img-comic",
+	"bbc.co.uk":            "div.vxp-column--single, div.story-body__inner, ul.gallery-images__list",
+	"cbc.ca":               ".story-content",
+	"darkreading.com":      "#article-main:not(header)",
+	"developpez.com":       "div[itemprop=articleBody]",
+	"dilbert.com":          "span.comic-title-name, img.img-comic",
 	"financialsamurai.com": "article",
-	"francetvinfo.fr":     ".text",
-	"github.com":          "article.entry-content",
-	"heise.de":            "header .article-content__lead, header .article-image, div.article-layout__content.article-content",
-	"igen.fr":             "section.corps",
-	"ing.dk":              "section.body",
-	"lapresse.ca":         ".amorce, .entry",
-	"lemonde.fr":          "article",
-	"lepoint.fr":          ".art-text",
-	"lesjoiesducode.fr":   ".blog-post-content img",
-	"lesnumeriques.com":   ".text",
-	"linux.com":           "div.content, div[property]",
-	"medium.com":          ".section-content",
-	"mac4ever.com":        "div[itemprop=articleBody]",
-	"monwindows.com":      ".blog-post-body",
-	"npr.org":             "#storytext",
-	"oneindia.com":        ".io-article-body",
-	"opensource.com":      "div[property]",
-	"osnews.com":          "div.newscontent1",
-	"phoronix.com":        "div.content",
-	"pseudo-sciences.org": "#art_main",
-	"raywenderlich.com":   "article",
-	"slate.fr":            ".field-items",
-	"techcrunch.com":      "div.article-entry",
-	"theoatmeal.com":      "div#comic",
-	"theregister.co.uk":   "#body",
-	"turnoff.us":          "article.post-content",
-	"universfreebox.com":  "#corps_corps",
-	"version2.dk":         "section.body",
-	"wdwnt.com":           "div.entry-content",
-	"wired.com":           "main figure, article",
-	"zeit.de":             ".summary, .article-body",
-	"zdnet.com":           "div.storyBody",
-	"openingsource.org":   "article.suxing-popup-gallery",
+	"francetvinfo.fr":      ".text",
+	"github.com":           "article.entry-content",
+	"heise.de":             "header .article-content__lead, header .article-image, div.article-layout__content.article-content",
+	"igen.fr":              "section.corps",
+	"ing.dk":               "section.body",
+	"lapresse.ca":          ".amorce, .entry",
+	"lemonde.fr":           "article",
+	"lepoint.fr":           ".art-text",
+	"lesjoiesducode.fr":    ".blog-post-content img",
+	"lesnumeriques.com":    ".text",
+	"linux.com":            "div.content, div[property]",
+	"mac4ever.com":         "div[itemprop=articleBody]",
+	"monwindows.com":       ".blog-post-body",
+	"npr.org":              "#storytext",
+	"oneindia.com":         ".io-article-body",
+	"opensource.com":       "div[property]",
+	"osnews.com":           "div.newscontent1",
+	"phoronix.com":         "div.content",
+	"pseudo-sciences.org":  "#art_main",
+	"raywenderlich.com":    "article",
+	"slate.fr":             ".field-items",
+	"techcrunch.com":       "div.article-entry",
+	"theoatmeal.com":       "div#comic",
+	"theregister.co.uk":    "#body",
+	"turnoff.us":           "article.post-content",
+	"universfreebox.com":   "#corps_corps",
+	"version2.dk":          "section.body",
+	"wdwnt.com":            "div.entry-content",
+	"wired.com":            "main figure, article",
+	"zeit.de":              ".summary, .article-body",
+	"zdnet.com":            "div.storyBody",
+	"openingsource.org":    "article.suxing-popup-gallery",
 }