5 years ago · 31435ef83e
--- a/reader/readability/readability.go
+++ b/reader/readability/readability.go
@@ -76,7 +76,7 @@ func ExtractContent(page io.Reader) (string, error) {
 
				 		return "", err
			
 
				 	}
			
 
				 
			
 
				-	document.Find("script,style,noscript").Each(func(i int, s *goquery.Selection) {
			
 
				+	document.Find("script,style").Each(func(i int, s *goquery.Selection) {
			
 
				 		removeNodes(s)
			
 
				 	})
			
 
				 
			
--- a/reader/rewrite/rewrite_functions.go
+++ b/reader/rewrite/rewrite_functions.go
@@ -139,6 +139,21 @@ func addDynamicImage(entryURL, entryContent string) string {
 
				 	return entryContent
			
 
				 }
			
 
				 
			
 
				+func fixMediumImages(entryURL, entryContent string) string {
			
 
				+	doc, err := goquery.NewDocumentFromReader(strings.NewReader(entryContent))
			
 
				+	if err != nil {
			
 
				+		return entryContent
			
 
				+	}
			
 
				+
			
 
				+	doc.Find("figure.paragraph-image").Each(func(i int, paragraphImage *goquery.Selection) {
			
 
				+		noscriptElement := paragraphImage.Find("noscript")
			
 
				+		paragraphImage.ReplaceWithHtml(noscriptElement.Text())
			
 
				+	})
			
 
				+
			
 
				+	output, _ := doc.Find("body").First().Html()
			
 
				+	return output
			
 
				+}
			
 
				+
			
 
				 func addYoutubeVideo(entryURL, entryContent string) string {
			
 
				 	matches := youtubeRegex.FindStringSubmatch(entryURL)
			
 
				 
			
--- a/reader/rewrite/rewriter.go
+++ b/reader/rewrite/rewriter.go
@@ -43,6 +43,8 @@ func Rewriter(entryURL, entryContent, customRewriteRules string) string {
 
				 			entryContent = replaceLineFeeds(entryContent)
			
 
				 		case "convert_text_link", "convert_text_links":
			
 
				 			entryContent = replaceTextLinks(entryContent)
			
 
				+		case "fix_medium_images":
			
 
				+			entryContent = fixMediumImages(entryURL, entryContent)
			
 
				 		}
			
 
				 	}
			
 
				 
			
--- a/reader/rewrite/rewriter_test.go
+++ b/reader/rewrite/rewriter_test.go
@@ -4,7 +4,10 @@
 
				 
			
 
				 package rewrite // import "miniflux.app/reader/rewrite"
			
 
				 
			
 
				-import "testing"
			
 
				+import (
			
 
				+	"strings"
			
 
				+	"testing"
			
 
				+)
			
 
				 
			
 
				 func TestReplaceTextLinks(t *testing.T) {
			
 
				 	scenarios := map[string]string{
			
@@ -176,3 +179,32 @@ func TestConvertTextLinkRewriteRule(t *testing.T) {
 
				 		t.Errorf(`Not expected output: got %q instead of %q`, output, expected)
			
 
				 	}
			
 
				 }
			
 
				+
			
 
				+func TestMediumImage(t *testing.T) {
			
 
				+	content := `
			
 
				+		<figure class="ht hu hv hw hx hy cy cz paragraph-image">
			
 
				+			<div class="hz ia ib ic aj">
			
 
				+				<div class="cy cz hs">
			
 
				+					<div class="ii s ib ij">
			
 
				+						<div class="ik il s">
			
 
				+							<div class="id ie t u v if aj bk ig ih">
			
 
				+								<img alt="Image for post" class="t u v if aj im in io" src="https://miro.medium.com/max/60/1*ephLSqSzQYLvb7faDwzRbw.jpeg?q=20" width="1280" height="720"/>
			
 
				+							</div>
			
 
				+							<img alt="Image for post" class="id ie t u v if aj c" width="1280" height="720"/>
			
 
				+							<noscript>
			
 
				+								<img alt="Image for post" class="t u v if aj" src="https://miro.medium.com/max/2560/1*ephLSqSzQYLvb7faDwzRbw.jpeg" width="1280" height="720" srcSet="https://miro.medium.com/max/552/1*ephLSqSzQYLvb7faDwzRbw.jpeg 276w, https://miro.medium.com/max/1104/1*ephLSqSzQYLvb7faDwzRbw.jpeg 552w, https://miro.medium.com/max/1280/1*ephLSqSzQYLvb7faDwzRbw.jpeg 640w, https://miro.medium.com/max/1400/1*ephLSqSzQYLvb7faDwzRbw.jpeg 700w" sizes="700px"/>
			
 
				+							</noscript>
			
 
				+						</div>
			
 
				+					</div>
			
 
				+				</div>
			
 
				+			</div>
			
 
				+		</figure>
			
 
				+	`
			
 
				+	expected := `<img alt="Image for post" class="t u v if aj" src="https://miro.medium.com/max/2560/1*ephLSqSzQYLvb7faDwzRbw.jpeg" width="1280" height="720" srcset="https://miro.medium.com/max/552/1*ephLSqSzQYLvb7faDwzRbw.jpeg 276w, https://miro.medium.com/max/1104/1*ephLSqSzQYLvb7faDwzRbw.jpeg 552w, https://miro.medium.com/max/1280/1*ephLSqSzQYLvb7faDwzRbw.jpeg 640w, https://miro.medium.com/max/1400/1*ephLSqSzQYLvb7faDwzRbw.jpeg 700w" sizes="700px"/>`
			
 
				+	output := Rewriter("https://example.org/article", content, "fix_medium_images")
			
 
				+	output = strings.TrimSpace(output)
			
 
				+
			
 
				+	if expected != output {
			
 
				+		t.Errorf(`Not expected output: %s`, output)
			
 
				+	}
			
 
				+}
			
--- a/reader/rewrite/rules.go
+++ b/reader/rewrite/rules.go
@@ -30,4 +30,5 @@ var predefinedRules = map[string]string{
 
				 	"invidio.us":             "add_invidious_video",
			
 
				 	"xkcd.com":               "add_image_title",
			
 
				 	"framatube.org":          "nl2br,convert_text_link",
			
 
				+	"medium.com":             "fix_medium_images",
			
 
				 }
			
--- a/reader/scraper/rules.go
+++ b/reader/scraper/rules.go
@@ -7,43 +7,42 @@ package scraper // import "miniflux.app/reader/scraper"
 
				 // List of predefined scraper rules (alphabetically sorted)
			
 
				 // domain => CSS selectors
			
 
				 var predefinedRules = map[string]string{
			
 
				-	"bbc.co.uk":           "div.vxp-column--single, div.story-body__inner, ul.gallery-images__list",
			
 
				-	"cbc.ca":              ".story-content",
			
 
				-	"darkreading.com":     "#article-main:not(header)",
			
 
				-	"developpez.com":      "div[itemprop=articleBody]",
			
 
				-	"dilbert.com":         "span.comic-title-name, img.img-comic",
			
 
				+	"bbc.co.uk":            "div.vxp-column--single, div.story-body__inner, ul.gallery-images__list",
			
 
				+	"cbc.ca":               ".story-content",
			
 
				+	"darkreading.com":      "#article-main:not(header)",
			
 
				+	"developpez.com":       "div[itemprop=articleBody]",
			
 
				+	"dilbert.com":          "span.comic-title-name, img.img-comic",
			
 
				 	"financialsamurai.com": "article",
			
 
				-	"francetvinfo.fr":     ".text",
			
 
				-	"github.com":          "article.entry-content",
			
 
				-	"heise.de":            "header .article-content__lead, header .article-image, div.article-layout__content.article-content",
			
 
				-	"igen.fr":             "section.corps",
			
 
				-	"ing.dk":              "section.body",
			
 
				-	"lapresse.ca":         ".amorce, .entry",
			
 
				-	"lemonde.fr":          "article",
			
 
				-	"lepoint.fr":          ".art-text",
			
 
				-	"lesjoiesducode.fr":   ".blog-post-content img",
			
 
				-	"lesnumeriques.com":   ".text",
			
 
				-	"linux.com":           "div.content, div[property]",
			
 
				-	"medium.com":          ".section-content",
			
 
				-	"mac4ever.com":        "div[itemprop=articleBody]",
			
 
				-	"monwindows.com":      ".blog-post-body",
			
 
				-	"npr.org":             "#storytext",
			
 
				-	"oneindia.com":        ".io-article-body",
			
 
				-	"opensource.com":      "div[property]",
			
 
				-	"osnews.com":          "div.newscontent1",
			
 
				-	"phoronix.com":        "div.content",
			
 
				-	"pseudo-sciences.org": "#art_main",
			
 
				-	"raywenderlich.com":   "article",
			
 
				-	"slate.fr":            ".field-items",
			
 
				-	"techcrunch.com":      "div.article-entry",
			
 
				-	"theoatmeal.com":      "div#comic",
			
 
				-	"theregister.co.uk":   "#body",
			
 
				-	"turnoff.us":          "article.post-content",
			
 
				-	"universfreebox.com":  "#corps_corps",
			
 
				-	"version2.dk":         "section.body",
			
 
				-	"wdwnt.com":           "div.entry-content",
			
 
				-	"wired.com":           "main figure, article",
			
 
				-	"zeit.de":             ".summary, .article-body",
			
 
				-	"zdnet.com":           "div.storyBody",
			
 
				-	"openingsource.org":   "article.suxing-popup-gallery",
			
 
				+	"francetvinfo.fr":      ".text",
			
 
				+	"github.com":           "article.entry-content",
			
 
				+	"heise.de":             "header .article-content__lead, header .article-image, div.article-layout__content.article-content",
			
 
				+	"igen.fr":              "section.corps",
			
 
				+	"ing.dk":               "section.body",
			
 
				+	"lapresse.ca":          ".amorce, .entry",
			
 
				+	"lemonde.fr":           "article",
			
 
				+	"lepoint.fr":           ".art-text",
			
 
				+	"lesjoiesducode.fr":    ".blog-post-content img",
			
 
				+	"lesnumeriques.com":    ".text",
			
 
				+	"linux.com":            "div.content, div[property]",
			
 
				+	"mac4ever.com":         "div[itemprop=articleBody]",
			
 
				+	"monwindows.com":       ".blog-post-body",
			
 
				+	"npr.org":              "#storytext",
			
 
				+	"oneindia.com":         ".io-article-body",
			
 
				+	"opensource.com":       "div[property]",
			
 
				+	"osnews.com":           "div.newscontent1",
			
 
				+	"phoronix.com":         "div.content",
			
 
				+	"pseudo-sciences.org":  "#art_main",
			
 
				+	"raywenderlich.com":    "article",
			
 
				+	"slate.fr":             ".field-items",
			
 
				+	"techcrunch.com":       "div.article-entry",
			
 
				+	"theoatmeal.com":       "div#comic",
			
 
				+	"theregister.co.uk":    "#body",
			
 
				+	"turnoff.us":           "article.post-content",
			
 
				+	"universfreebox.com":   "#corps_corps",
			
 
				+	"version2.dk":          "section.body",
			
 
				+	"wdwnt.com":            "div.entry-content",
			
 
				+	"wired.com":            "main figure, article",
			
 
				+	"zeit.de":              ".summary, .article-body",
			
 
				+	"zdnet.com":            "div.storyBody",
			
 
				+	"openingsource.org":    "article.suxing-popup-gallery",
			
 
				 }