Browse Source

sanitizer: handle image URLs in srcset attribute with comma

Frédéric Guillot 3 years ago
parent
commit
806a069785

+ 14 - 34
proxy/image_proxy.go

@@ -5,18 +5,16 @@
 package proxy // import "miniflux.app/proxy"
 
 import (
-	"regexp"
 	"strings"
 
 	"miniflux.app/config"
+	"miniflux.app/reader/sanitizer"
 	"miniflux.app/url"
 
 	"github.com/PuerkitoBio/goquery"
 	"github.com/gorilla/mux"
 )
 
-var regexSplitSrcset = regexp.MustCompile(`,\s+`)
-
 // ImageProxyRewriter replaces image URLs with internal proxy URLs.
 func ImageProxyRewriter(router *mux.Router, data string) string {
 	proxyImages := config.Opts.ProxyImages()
@@ -30,24 +28,20 @@ func ImageProxyRewriter(router *mux.Router, data string) string {
 	}
 
 	doc.Find("img").Each(func(i int, img *goquery.Selection) {
-		if srcAttr, ok := img.Attr("src"); ok {
-			if !isDataURL(srcAttr) && (proxyImages == "all" || !url.IsHTTPS(srcAttr)) {
-				img.SetAttr("src", ProxifyURL(router, srcAttr))
+		if srcAttrValue, ok := img.Attr("src"); ok {
+			if !isDataURL(srcAttrValue) && (proxyImages == "all" || !url.IsHTTPS(srcAttrValue)) {
+				img.SetAttr("src", ProxifyURL(router, srcAttrValue))
 			}
 		}
 
-		if srcsetAttr, ok := img.Attr("srcset"); ok {
-			if proxyImages == "all" || !url.IsHTTPS(srcsetAttr) {
-				proxifySourceSet(img, router, srcsetAttr)
-			}
+		if srcsetAttrValue, ok := img.Attr("srcset"); ok {
+			proxifySourceSet(img, router, proxyImages, srcsetAttrValue)
 		}
 	})
 
 	doc.Find("picture source").Each(func(i int, sourceElement *goquery.Selection) {
-		if srcsetAttr, ok := sourceElement.Attr("srcset"); ok {
-			if proxyImages == "all" || !url.IsHTTPS(srcsetAttr) {
-				proxifySourceSet(sourceElement, router, srcsetAttr)
-			}
+		if srcsetAttrValue, ok := sourceElement.Attr("srcset"); ok {
+			proxifySourceSet(sourceElement, router, proxyImages, srcsetAttrValue)
 		}
 	})
 
@@ -59,30 +53,16 @@ func ImageProxyRewriter(router *mux.Router, data string) string {
 	return output
 }
 
-func proxifySourceSet(element *goquery.Selection, router *mux.Router, attributeValue string) {
-	var proxifiedSources []string
+func proxifySourceSet(element *goquery.Selection, router *mux.Router, proxyImages, srcsetAttrValue string) {
+	imageCandidates := sanitizer.ParseSrcSetAttribute(srcsetAttrValue)
 
-	for _, source := range regexSplitSrcset.Split(attributeValue, -1) {
-		parts := strings.Split(strings.TrimSpace(source), " ")
-		nbParts := len(parts)
-
-		if nbParts > 0 {
-			rewrittenSource := parts[0]
-			if !isDataURL(rewrittenSource) {
-				rewrittenSource = ProxifyURL(router, rewrittenSource)
-			}
-
-			if nbParts > 1 {
-				rewrittenSource += " " + parts[1]
-			}
-
-			proxifiedSources = append(proxifiedSources, rewrittenSource)
+	for _, imageCandidate := range imageCandidates {
+		if !isDataURL(imageCandidate.ImageURL) && (proxyImages == "all" || !url.IsHTTPS(imageCandidate.ImageURL)) {
+			imageCandidate.ImageURL = ProxifyURL(router, imageCandidate.ImageURL)
 		}
 	}
 
-	if len(proxifiedSources) > 0 {
-		element.SetAttr("srcset", strings.Join(proxifiedSources, ", "))
-	}
+	element.SetAttr("srcset", imageCandidates.String())
 }
 
 func isDataURL(s string) bool {

+ 25 - 2
proxy/image_proxy_test.go

@@ -234,8 +234,31 @@ func TestProxyFilterWithPictureSource(t *testing.T) {
 	r := mux.NewRouter()
 	r.HandleFunc("/proxy/{encodedURL}", func(w http.ResponseWriter, r *http.Request) {}).Name("proxy")
 
-	input := `<picture><source srcset="http://website/folder/image2.png 656w,   http://website/folder/image3.png 360w"></picture>`
-	expected := `<picture><source srcset="/proxy/aHR0cDovL3dlYnNpdGUvZm9sZGVyL2ltYWdlMi5wbmc= 656w, /proxy/aHR0cDovL3dlYnNpdGUvZm9sZGVyL2ltYWdlMy5wbmc= 360w"/></picture>`
+	input := `<picture><source srcset="http://website/folder/image2.png 656w,   http://website/folder/image3.png 360w, https://website/some,image.png 2x"></picture>`
+	expected := `<picture><source srcset="/proxy/aHR0cDovL3dlYnNpdGUvZm9sZGVyL2ltYWdlMi5wbmc= 656w, /proxy/aHR0cDovL3dlYnNpdGUvZm9sZGVyL2ltYWdlMy5wbmc= 360w, /proxy/aHR0cHM6Ly93ZWJzaXRlL3NvbWUsaW1hZ2UucG5n 2x"/></picture>`
+	output := ImageProxyRewriter(r, input)
+
+	if expected != output {
+		t.Errorf(`Not expected output: got %s`, output)
+	}
+}
+
+func TestProxyFilterOnlyNonHTTPWithPictureSource(t *testing.T) {
+	os.Clearenv()
+	os.Setenv("PROXY_IMAGES", "https")
+
+	var err error
+	parser := config.NewParser()
+	config.Opts, err = parser.ParseEnvironmentVariables()
+	if err != nil {
+		t.Fatalf(`Parsing failure: %v`, err)
+	}
+
+	r := mux.NewRouter()
+	r.HandleFunc("/proxy/{encodedURL}", func(w http.ResponseWriter, r *http.Request) {}).Name("proxy")
+
+	input := `<picture><source srcset="http://website/folder/image2.png 656w, https://website/some,image.png 2x"></picture>`
+	expected := `<picture><source srcset="/proxy/aHR0cDovL3dlYnNpdGUvZm9sZGVyL2ltYWdlMi5wbmc= 656w, https://website/some,image.png 2x"/></picture>`
 	output := ImageProxyRewriter(r, input)
 
 	if expected != output {

+ 6 - 42
reader/sanitizer/sanitizer.go

@@ -20,7 +20,6 @@ import (
 
 var (
 	youtubeEmbedRegex = regexp.MustCompile(`//www\.youtube\.com/embed/(.*)`)
-	splitSrcsetRegex  = regexp.MustCompile(`,\s?`)
 )
 
 // Sanitize returns safe HTML.
@@ -447,52 +446,17 @@ func isBlockedTag(tagName string) bool {
 	return false
 }
 
-/*
-
-One or more strings separated by commas, indicating possible image sources for the user agent to use.
-
-Each string is composed of:
-- A URL to an image
-- Optionally, whitespace followed by one of:
-- A width descriptor (a positive integer directly followed by w). The width descriptor is divided by the source size given in the sizes attribute to calculate the effective pixel density.
-- A pixel density descriptor (a positive floating point number directly followed by x).
-
-*/
 func sanitizeSrcsetAttr(baseURL, value string) string {
-	var sanitizedSources []string
-	rawSources := splitSrcsetRegex.Split(value, -1)
-	for _, rawSource := range rawSources {
-		parts := strings.Split(strings.TrimSpace(rawSource), " ")
-		nbParts := len(parts)
-
-		if nbParts > 0 {
-			sanitizedSource, err := url.AbsoluteURL(baseURL, parts[0])
-			if err != nil {
-				continue
-			}
-
-			if nbParts == 2 && isValidWidthOrDensityDescriptor(parts[1]) {
-				sanitizedSource += " " + parts[1]
-			}
+	imageCandidates := ParseSrcSetAttribute(value)
 
-			sanitizedSources = append(sanitizedSources, sanitizedSource)
+	for _, imageCandidate := range imageCandidates {
+		absoluteURL, err := url.AbsoluteURL(baseURL, imageCandidate.ImageURL)
+		if err == nil {
+			imageCandidate.ImageURL = absoluteURL
 		}
 	}
-	return strings.Join(sanitizedSources, ", ")
-}
-
-func isValidWidthOrDensityDescriptor(value string) bool {
-	if value == "" {
-		return false
-	}
-
-	lastChar := value[len(value)-1:]
-	if lastChar != "w" && lastChar != "x" {
-		return false
-	}
 
-	_, err := strconv.ParseFloat(value[0:len(value)-1], 32)
-	return err == nil
+	return imageCandidates.String()
 }
 
 func isValidDataAttribute(value string) bool {

+ 0 - 10
reader/sanitizer/sanitizer_test.go

@@ -85,16 +85,6 @@ func TestMediumImgWithSrcset(t *testing.T) {
 	}
 }
 
-func TestEconomistImgWithSrcset(t *testing.T) {
-	input := `<img loading="lazy" src="https://www.economist.com/img/b/608/634/90/sites/default/files/images/print-edition/20211009_WWC585.png" srcSet="https://www.economist.com/img/b/200/209/90/sites/default/files/images/print-edition/20211009_WWC585.png 200w,https://www.economist.com/img/b/300/313/90/sites/default/files/images/print-edition/20211009_WWC585.png 300w,https://www.economist.com/img/b/400/417/90/sites/default/files/images/print-edition/20211009_WWC585.png 400w,https://www.economist.com/img/b/600/626/90/sites/default/files/images/print-edition/20211009_WWC585.png 600w,https://www.economist.com/img/b/640/667/90/sites/default/files/images/print-edition/20211009_WWC585.png 640w,https://www.economist.com/img/b/800/834/90/sites/default/files/images/print-edition/20211009_WWC585.png 800w,https://www.economist.com/img/b/1000/1043/90/sites/default/files/images/print-edition/20211009_WWC585.png 1000w,https://www.economist.com/img/b/1280/1335/90/sites/default/files/images/print-edition/20211009_WWC585.png 1280w" sizes="300px" alt=""/>`
-	expected := `<img src="https://www.economist.com/img/b/608/634/90/sites/default/files/images/print-edition/20211009_WWC585.png" srcset="https://www.economist.com/img/b/200/209/90/sites/default/files/images/print-edition/20211009_WWC585.png 200w, https://www.economist.com/img/b/300/313/90/sites/default/files/images/print-edition/20211009_WWC585.png 300w, https://www.economist.com/img/b/400/417/90/sites/default/files/images/print-edition/20211009_WWC585.png 400w, https://www.economist.com/img/b/600/626/90/sites/default/files/images/print-edition/20211009_WWC585.png 600w, https://www.economist.com/img/b/640/667/90/sites/default/files/images/print-edition/20211009_WWC585.png 640w, https://www.economist.com/img/b/800/834/90/sites/default/files/images/print-edition/20211009_WWC585.png 800w, https://www.economist.com/img/b/1000/1043/90/sites/default/files/images/print-edition/20211009_WWC585.png 1000w, https://www.economist.com/img/b/1280/1335/90/sites/default/files/images/print-edition/20211009_WWC585.png 1280w" sizes="300px" alt="" loading="lazy"/>`
-	output := Sanitize("http://example.org/", input)
-
-	if output != expected {
-		t.Errorf(`Wrong output: %s`, output)
-	}
-}
-
 func TestSelfClosingTags(t *testing.T) {
 	input := `<p>This <br> is a <strong>text</strong> <br/>with an image: <img src="http://example.org/" alt="Test" loading="lazy"/>.</p>`
 	output := Sanitize("http://example.org/", input)

+ 82 - 0
reader/sanitizer/srcset.go

@@ -0,0 +1,82 @@
+// Copyright 2022 Frédéric Guillot. All rights reserved.
+// Use of this source code is governed by the Apache 2.0
+// license that can be found in the LICENSE file.
+
+package sanitizer
+
+import (
+	"fmt"
+	"strconv"
+	"strings"
+)
+
+type ImageCandidate struct {
+	ImageURL   string
+	Descriptor string
+}
+
+type ImageCandidates []*ImageCandidate
+
+func (c ImageCandidates) String() string {
+	var htmlCandidates []string
+
+	for _, imageCandidate := range c {
+		var htmlCandidate string
+		if imageCandidate.Descriptor != "" {
+			htmlCandidate = fmt.Sprintf(`%s %s`, imageCandidate.ImageURL, imageCandidate.Descriptor)
+		} else {
+			htmlCandidate = imageCandidate.ImageURL
+		}
+
+		htmlCandidates = append(htmlCandidates, htmlCandidate)
+	}
+
+	return strings.Join(htmlCandidates, ", ")
+}
+
+// ParseSrcSetAttribute returns the list of image candidates from the set.
+// https://html.spec.whatwg.org/#parse-a-srcset-attribute
+func ParseSrcSetAttribute(attributeValue string) (imageCandidates ImageCandidates) {
+	unparsedCandidates := strings.Split(attributeValue, ", ")
+
+	for _, unparsedCandidate := range unparsedCandidates {
+		if candidate, err := parseImageCandidate(unparsedCandidate); err == nil {
+			imageCandidates = append(imageCandidates, candidate)
+		}
+	}
+
+	return imageCandidates
+}
+
+func parseImageCandidate(input string) (*ImageCandidate, error) {
+	input = strings.TrimSpace(input)
+	parts := strings.Split(strings.TrimSpace(input), " ")
+	nbParts := len(parts)
+
+	if nbParts > 2 || nbParts == 0 {
+		return nil, fmt.Errorf(`srcset: invalid number of descriptors`)
+	}
+
+	if nbParts == 2 {
+		if !isValidWidthOrDensityDescriptor(parts[1]) {
+			return nil, fmt.Errorf(`srcset: invalid descriptor`)
+		}
+		return &ImageCandidate{ImageURL: parts[0], Descriptor: parts[1]}, nil
+	}
+
+	return &ImageCandidate{ImageURL: parts[0]}, nil
+}
+
+func isValidWidthOrDensityDescriptor(value string) bool {
+	if value == "" {
+		return false
+	}
+
+	lastChar := value[len(value)-1:]
+	if lastChar != "w" && lastChar != "x" {
+		return false
+	}
+
+	_, err := strconv.ParseFloat(value[0:len(value)-1], 32)
+	return err == nil
+}

+ 85 - 0
reader/sanitizer/srcset_test.go

@@ -0,0 +1,85 @@
+// Copyright 2022 Frédéric Guillot. All rights reserved.
+// Use of this source code is governed by the Apache 2.0
+// license that can be found in the LICENSE file.
+
+package sanitizer
+
+import "testing"
+
+func TestParseSrcSetAttributeWithRelativeURLs(t *testing.T) {
+	input := `example-320w.jpg, example-480w.jpg 1.5x,   example-640,w.jpg 2x, example-640w.jpg 640w`
+	candidates := ParseSrcSetAttribute(input)
+
+	if len(candidates) != 4 {
+		t.Error(`Incorrect number of candidates`)
+	}
+
+	if candidates.String() != `example-320w.jpg, example-480w.jpg 1.5x, example-640,w.jpg 2x, example-640w.jpg 640w` {
+		t.Errorf(`Unexpected string output`)
+	}
+}
+
+func TestParseSrcSetAttributeWithAbsoluteURLs(t *testing.T) {
+	input := `http://example.org/example-320w.jpg 320w, http://example.org/example-480w.jpg 1.5x`
+	candidates := ParseSrcSetAttribute(input)
+
+	if len(candidates) != 2 {
+		t.Error(`Incorrect number of candidates`)
+	}
+
+	if candidates.String() != `http://example.org/example-320w.jpg 320w, http://example.org/example-480w.jpg 1.5x` {
+		t.Errorf(`Unexpected string output`)
+	}
+}
+
+func TestParseSrcSetAttributeWithOneCandidate(t *testing.T) {
+	input := `http://example.org/example-320w.jpg`
+	candidates := ParseSrcSetAttribute(input)
+
+	if len(candidates) != 1 {
+		t.Error(`Incorrect number of candidates`)
+	}
+
+	if candidates.String() != `http://example.org/example-320w.jpg` {
+		t.Errorf(`Unexpected string output`)
+	}
+}
+
+func TestParseSrcSetAttributeWithCommaURL(t *testing.T) {
+	input := `http://example.org/example,a:b/d.jpg , example-480w.jpg 1.5x`
+	candidates := ParseSrcSetAttribute(input)
+
+	if len(candidates) != 2 {
+		t.Error(`Incorrect number of candidates`)
+	}
+
+	if candidates.String() != `http://example.org/example,a:b/d.jpg, example-480w.jpg 1.5x` {
+		t.Errorf(`Unexpected string output`)
+	}
+}
+
+func TestParseSrcSetAttributeWithIncorrectDescriptor(t *testing.T) {
+	input := `http://example.org/example-320w.jpg test`
+	candidates := ParseSrcSetAttribute(input)
+
+	if len(candidates) != 0 {
+		t.Error(`Incorrect number of candidates`)
+	}
+
+	if candidates.String() != `` {
+		t.Errorf(`Unexpected string output`)
+	}
+}
+
+func TestParseSrcSetAttributeWithTooManyDescriptors(t *testing.T) {
+	input := `http://example.org/example-320w.jpg 10w 1x`
+	candidates := ParseSrcSetAttribute(input)
+
+	if len(candidates) != 0 {
+		t.Error(`Incorrect number of candidates`)
+	}
+
+	if candidates.String() != `` {
+		t.Errorf(`Unexpected string output`)
+	}
+}