Browse Source

refactor(sanitizer): enforce isBlockedResource() on srcset URLs

Frédéric Guillot 2 months ago
parent
commit
5a8431b144
2 changed files with 73 additions and 15 deletions
  1. 24 5
      internal/reader/sanitizer/sanitizer.go
  2. 49 10
      internal/reader/sanitizer/sanitizer_test.go

+ 24 - 5
internal/reader/sanitizer/sanitizer.go

@@ -368,10 +368,16 @@ func sanitizeAttributes(parsedBaseUrl *url.URL, tagName string, attributes []htm
 				}
 			case "srcset":
 				value = sanitizeSrcsetAttr(parsedBaseUrl, value)
+				if value == "" {
+					continue
+				}
 			}
 		case "source":
 			if attribute.Key == "srcset" {
 				value = sanitizeSrcsetAttr(parsedBaseUrl, value)
+				if value == "" {
+					continue
+				}
 			}
 		}
 
@@ -581,15 +587,28 @@ func isBlockedTag(tagName string) bool {
 }
 
 func sanitizeSrcsetAttr(parsedBaseURL *url.URL, value string) string {
-	imageCandidates := ParseSrcSetAttribute(value)
+	candidates := ParseSrcSetAttribute(value)
+	if len(candidates) == 0 {
+		return ""
+	}
 
-	for _, imageCandidate := range imageCandidates {
-		if absoluteURL, err := urllib.ResolveToAbsoluteURLWithParsedBaseURL(parsedBaseURL, imageCandidate.ImageURL); err == nil {
-			imageCandidate.ImageURL = absoluteURL
+	sanitizedCandidates := make([]*imageCandidate, 0, len(candidates))
+
+	for _, imageCandidate := range candidates {
+		absoluteURL, err := urllib.ResolveToAbsoluteURLWithParsedBaseURL(parsedBaseURL, imageCandidate.ImageURL)
+		if err != nil {
+			continue
+		}
+
+		if !hasValidURIScheme(absoluteURL) || isBlockedResource(absoluteURL) {
+			continue
 		}
+
+		imageCandidate.ImageURL = absoluteURL
+		sanitizedCandidates = append(sanitizedCandidates, imageCandidate)
 	}
 
-	return imageCandidates.String()
+	return imageCandidates(sanitizedCandidates).String()
 }
 
 func isValidDataAttribute(value string) bool {

+ 49 - 10
internal/reader/sanitizer/sanitizer_test.go

@@ -131,6 +131,16 @@ func TestImgSanitization(t *testing.T) {
 			input:    `<img srcset="example-320w.jpg, example-480w.jpg 1.5x,   example-640w.jpg 2x, example-640w.jpg 640w" alt="Example">`,
 			expected: `<img srcset="http://example.org/example-320w.jpg, http://example.org/example-480w.jpg 1.5x, http://example.org/example-640w.jpg 2x, http://example.org/example-640w.jpg 640w" alt="Example" loading="lazy">`,
 		},
+		{
+			name:     "srcset-attribute-with-blocked-candidate",
+			input:    `<img srcset="https://stats.wordpress.com/tracker.png 1x, /example-640w.jpg 2x" src="/example-640w.jpg" alt="Example">`,
+			expected: `<img srcset="http://example.org/example-640w.jpg 2x" src="http://example.org/example-640w.jpg" alt="Example" loading="lazy">`,
+		},
+		{
+			name:     "srcset-attribute-all-candidates-invalid",
+			input:    `<img srcset="javascript:alert(1) 1x, data:text/plain;base64,SGVsbG8= 2x" alt="Example">`,
+			expected: ``,
+		},
 		{
 			name:     "fetchpriority-high",
 			input:    `<img src="https://example.org/image.png" fetchpriority="high">`,
@@ -203,16 +213,6 @@ func TestNonImgWithDecodingAttribute(t *testing.T) {
 	}
 }
 
-func TestSourceWithSrcsetAndMedia(t *testing.T) {
-	input := `<picture><source media="(min-width: 800px)" srcset="elva-800w.jpg"></picture>`
-	expected := `<picture><source media="(min-width: 800px)" srcset="http://example.org/elva-800w.jpg"></picture>`
-	output := sanitizeHTMLWithDefaultOptions("http://example.org/", input)
-
-	if output != expected {
-		t.Errorf(`Wrong output: %s`, output)
-	}
-}
-
 func TestMediumImgWithSrcset(t *testing.T) {
 	input := `<img alt="Image for post" class="t u v ef aj" src="https://miro.medium.com/max/5460/1*aJ9JibWDqO81qMfNtqgqrw.jpeg" srcset="https://miro.medium.com/max/552/1*aJ9JibWDqO81qMfNtqgqrw.jpeg 276w, https://miro.medium.com/max/1000/1*aJ9JibWDqO81qMfNtqgqrw.jpeg 500w" sizes="500px" width="2730" height="3407">`
 	expected := `<img alt="Image for post" src="https://miro.medium.com/max/5460/1*aJ9JibWDqO81qMfNtqgqrw.jpeg" srcset="https://miro.medium.com/max/552/1*aJ9JibWDqO81qMfNtqgqrw.jpeg 276w, https://miro.medium.com/max/1000/1*aJ9JibWDqO81qMfNtqgqrw.jpeg 500w" sizes="500px" width="2730" height="3407" loading="lazy">`
@@ -306,6 +306,45 @@ func TestInvalidTag(t *testing.T) {
 	}
 }
 
+func TestSourceSanitization(t *testing.T) {
+	baseURL := "http://example.org/"
+	testCases := []struct {
+		name     string
+		input    string
+		expected string
+	}{
+		{
+			name:     "srcset-and-media",
+			input:    `<picture><source media="(min-width: 800px)" srcset="elva-800w.jpg"></picture>`,
+			expected: `<picture><source media="(min-width: 800px)" srcset="http://example.org/elva-800w.jpg"></picture>`,
+		},
+		{
+			name:     "src-attribute",
+			input:    `<picture><source src="video.mp4" type="video/mp4"></picture>`,
+			expected: `<picture><source src="http://example.org/video.mp4" type="video/mp4"></picture>`,
+		},
+		{
+			name:     "srcset-with-blocked-candidate",
+			input:    `<picture><source srcset="https://stats.wordpress.com/tracker.png 1x, /elva-800w.jpg 2x"></picture>`,
+			expected: `<picture><source srcset="http://example.org/elva-800w.jpg 2x"></picture>`,
+		},
+		{
+			name:     "srcset-all-invalid",
+			input:    `<picture><source srcset="javascript:alert(1) 1x, data:text/plain;base64,SGVsbG8= 2x"></picture>`,
+			expected: `<picture></picture>`,
+		},
+	}
+
+	for _, tc := range testCases {
+		t.Run(tc.name, func(t *testing.T) {
+			output := sanitizeHTMLWithDefaultOptions(baseURL, tc.input)
+			if output != tc.expected {
+				t.Errorf(`Wrong output for input %q: expected %q, got %q`, tc.input, tc.expected, output)
+			}
+		})
+	}
+}
+
 func TestVideoTag(t *testing.T) {
 	input := `<p>My valid <video src="videofile.webm" autoplay poster="posterimage.jpg">fallback</video>.</p>`
 	expected := `<p>My valid <video src="http://example.org/videofile.webm" poster="http://example.org/posterimage.jpg" controls>fallback</video>.</p>`