Просмотр исходного кода

New `add_dynamic_image` rewriter for JavaScript-loaded images.

Searches tags for various `data-*` attributes and sets `img` tag `src` attribute appropriately. Falls back to searching `noscript` for `img` tags.

Includes unit tests.
dzaikos 7 лет назад
Родитель
Коммит
6d25e02cb5
3 изменённых файла: 107 добавлений и 0 удалений
  1. 63 0
      reader/rewrite/rewrite_functions.go
  2. 2 0
      reader/rewrite/rewriter.go
  3. 42 0
      reader/rewrite/rewriter_test.go

+ 63 - 0
reader/rewrite/rewrite_functions.go

@@ -14,6 +14,7 @@ import (
 
 var (
+	// youtubeRegex captures everything that follows `v=` in a
+	// youtube.com/watch URL.
 	youtubeRegex = regexp.MustCompile(`youtube\.com/watch\?v=(.*)`)
+	// imgRegex matches a single `<img ...>` tag in raw text; addDynamicImage
+	// applies it to the text content of noscript elements.
+	imgRegex = regexp.MustCompile(`<img [^>]+>`)
 )
 
 func addImageTitle(entryURL, entryContent string) string {
@@ -40,6 +41,68 @@ func addImageTitle(entryURL, entryContent string) string {
 	return entryContent
 }
 
+// addDynamicImage rewrites entry content so that JavaScript-loaded (lazy)
+// images carry a usable src attribute.
+//
+// Every img or div element is checked for the candidate data-* attributes
+// listed below. The first non-empty candidate found becomes the src of the
+// img; a div is replaced by a new img that keeps the div's alt attribute.
+// When no element was rewritten that way, each noscript element whose text
+// contains exactly one <img ...> tag is replaced by that tag.
+//
+// entryURL is not used; it is accepted so the function has the same
+// signature as the other rewrite functions in this file.
+//
+// The original entryContent is returned unchanged when it cannot be parsed,
+// when nothing was rewritten, or when the modified document cannot be
+// rendered.
+func addDynamicImage(entryURL, entryContent string) string {
+	doc, err := goquery.NewDocumentFromReader(strings.NewReader(entryContent))
+	if err != nil {
+		return entryContent
+	}
+
+	// Ordered most preferred to least preferred.
+	candidateAttrs := []string{
+		"data-src",
+		"data-original",
+		"data-orig",
+		"data-url",
+		"data-orig-file",
+		"data-large-file",
+		"data-medium-file",
+		"data-2000src",
+		"data-1000src",
+		"data-800src",
+		"data-655src",
+		"data-500src",
+		"data-380src",
+	}
+
+	changed := false
+
+	doc.Find("img,div").Each(func(i int, img *goquery.Selection) {
+		for _, candidateAttr := range candidateAttrs {
+			srcAttr, found := img.Attr(candidateAttr)
+			// An empty candidate would blank out src and suppress the
+			// noscript fallback, so keep looking at less preferred ones.
+			if !found || srcAttr == "" {
+				continue
+			}
+
+			changed = true
+
+			if img.Is("img") {
+				img.SetAttr("src", srcAttr)
+			} else {
+				// Insert an empty img and set its attributes through the
+				// DOM API instead of concatenating markup: the values come
+				// from feed content and a quote character in them must not
+				// be able to break out of the attribute.
+				altAttr := img.AttrOr("alt", "")
+				img.BeforeHtml("<img/>")
+				img.Prev().SetAttr("src", srcAttr).SetAttr("alt", altAttr)
+				img.Remove()
+			}
+
+			break
+		}
+	})
+
+	if !changed {
+		doc.Find("noscript").Each(func(i int, noscript *goquery.Selection) {
+			// Ask for at most two matches: exactly one match means the
+			// noscript fallback is a single, unambiguous image.
+			matches := imgRegex.FindAllString(noscript.Text(), 2)
+
+			if len(matches) == 1 {
+				changed = true
+
+				noscript.ReplaceWithHtml(matches[0])
+			}
+		})
+	}
+
+	if !changed {
+		return entryContent
+	}
+
+	output, err := doc.Find("body").First().Html()
+	if err != nil {
+		// Rendering failed: keep the original content rather than
+		// returning an empty entry.
+		return entryContent
+	}
+
+	return output
+}
+
 func addYoutubeVideo(entryURL, entryContent string) string {
 	matches := youtubeRegex.FindStringSubmatch(entryURL)
 

+ 2 - 0
reader/rewrite/rewriter.go

@@ -24,6 +24,8 @@ func Rewriter(entryURL, entryContent, customRewriteRules string) string {
 		switch strings.TrimSpace(rule) {
 		case "add_image_title":
 			entryContent = addImageTitle(entryURL, entryContent)
+		case "add_dynamic_image":
+			entryContent = addDynamicImage(entryURL, entryContent)
 		case "add_youtube_video":
 			entryContent = addYoutubeVideo(entryURL, entryContent)
 		case "add_pdf_download_link":

+ 42 - 0
reader/rewrite/rewriter_test.go

@@ -40,6 +40,7 @@ func TestRewriteWithXkcdLink(t *testing.T) {
 		t.Errorf(`Not expected output: got "%s" instead of "%s"`, output, expected)
 	}
 }
+
 func TestRewriteWithXkcdLinkAndImageNoTitle(t *testing.T) {
 	description := `<img src="https://imgs.xkcd.com/comics/thermostat.png" alt="Your problem is so terrible, I worry that, if I help you, I risk drawing the attention of whatever god of technology inflicted it on you." />`
 	output := Rewriter("https://xkcd.com/1912/", description, ``)
@@ -48,6 +49,7 @@ func TestRewriteWithXkcdLinkAndImageNoTitle(t *testing.T) {
 		t.Errorf(`Not expected output: got "%s" instead of "%s"`, output, expected)
 	}
 }
+
 func TestRewriteWithXkcdLinkAndNoImage(t *testing.T) {
 	description := "test"
 	output := Rewriter("https://xkcd.com/1912/", description, ``)
@@ -76,3 +78,43 @@ func TestRewriteWithPDFLink(t *testing.T) {
 		t.Errorf(`Not expected output: got "%s" instead of "%s"`, output, expected)
 	}
 }
+
+func TestRewriteWithNoLazyImage(t *testing.T) {
+	description := `<img src="https://example.org/image.jpg" alt="Image"><noscript><p>Some text</p></noscript>`
+	output := Rewriter("https://example.org/article", description, "add_dynamic_image")
+	expected := description
+
+	if expected != output {
+		t.Errorf(`Not expected output: got "%s" instead of "%s"`, output, expected)
+	}
+}
+
+func TestRewriteWithLazyImage(t *testing.T) {
+	description := `<img src="" data-url="https://example.org/image.jpg" alt="Image"><noscript><img src="https://example.org/fallback.jpg" alt="Fallback"></noscript>`
+	output := Rewriter("https://example.org/article", description, "add_dynamic_image")
+	expected := `<img src="https://example.org/image.jpg" data-url="https://example.org/image.jpg" alt="Image"/><noscript><img src="https://example.org/fallback.jpg" alt="Fallback"></noscript>`
+
+	if expected != output {
+		t.Errorf(`Not expected output: got "%s" instead of "%s"`, output, expected)
+	}
+}
+
+func TestRewriteWithLazyDivImage(t *testing.T) {
+	description := `<div data-url="https://example.org/image.jpg" alt="Image"></div><noscript><img src="https://example.org/fallback.jpg" alt="Fallback"></noscript>`
+	output := Rewriter("https://example.org/article", description, "add_dynamic_image")
+	expected := `<img src="https://example.org/image.jpg" alt="Image"/><noscript><img src="https://example.org/fallback.jpg" alt="Fallback"></noscript>`
+
+	if expected != output {
+		t.Errorf(`Not expected output: got "%s" instead of "%s"`, output, expected)
+	}
+}
+
+func TestRewriteWithUnknownLazyNoScriptImage(t *testing.T) {
+	description := `<img src="" data-non-candidate="https://example.org/image.jpg" alt="Image"><noscript><img src="https://example.org/fallback.jpg" alt="Fallback"></noscript>`
+	output := Rewriter("https://example.org/article", description, "add_dynamic_image")
+	expected := `<img src="" data-non-candidate="https://example.org/image.jpg" alt="Image"/><img src="https://example.org/fallback.jpg" alt="Fallback"/>`
+
+	if expected != output {
+		t.Errorf(`Not expected output: got "%s" instead of "%s"`, output, expected)
+	}
+}