Browse Source

refactor(sanitizer): remove workaround that strip large img width attribute

The browser will do the right thing by default even if the image width
is larger than Miniflux's layout.
Frédéric Guillot 2 months ago
parent
commit
89638b448a
2 changed files with 38 additions and 59 deletions
  1. 37 48
      internal/reader/sanitizer/sanitizer.go
  2. 1 11
      internal/reader/sanitizer/sanitizer_test.go

+ 37 - 48
internal/reader/sanitizer/sanitizer.go

@@ -199,10 +199,44 @@ var (
 	}
 )
 
+// SanitizerOptions holds options for the HTML sanitizer.
 type SanitizerOptions struct {
 	OpenLinksInNewTab bool
 }
 
+// SanitizeHTML takes raw HTML input and removes any disallowed tags and attributes.
+func SanitizeHTML(baseURL, rawHTML string, sanitizerOptions *SanitizerOptions) string {
+	var buffer strings.Builder
+
+	// Educated guess about how big the sanitized HTML will be,
+	// to reduce the amount of buffer re-allocations in this function.
+	estimatedRatio := len(rawHTML) * 3 / 4
+	buffer.Grow(estimatedRatio)
+
+	// We need to surround `rawHTML` with body tags so that html.Parse
+	// will consider it a valid html document.
+	doc, err := html.Parse(strings.NewReader("<body>" + rawHTML + "</body>"))
+	if err != nil {
+		return ""
+	}
+
+	/* The structure of `doc` is always:
+	<html>
+	<head>...</head>
+	<body>..</body>
+	</html>
+	*/
+	body := doc.FirstChild.FirstChild.NextSibling
+
+	// Errors are a non-issue, so they're handled in filterAndRenderHTML
+	parsedBaseUrl, _ := url.Parse(baseURL)
+	for c := body.FirstChild; c != nil; c = c.NextSibling {
+		filterAndRenderHTML(&buffer, c, parsedBaseUrl, sanitizerOptions)
+	}
+
+	return buffer.String()
+}
+
 func isHidden(n *html.Node) bool {
 	for _, attr := range n.Attr {
 		if strings.ToLower(attr.Key) == "hidden" {
@@ -293,40 +327,10 @@ func filterAndRenderHTML(buf *strings.Builder, n *html.Node, parsedBaseUrl *url.
 	}
 }
 
-func SanitizeHTML(baseURL, rawHTML string, sanitizerOptions *SanitizerOptions) string {
-	var buffer strings.Builder
-
-	// Educated guess about how big the sanitized HTML will be,
-	// to reduce the amount of buffer re-allocations in this function.
-	estimatedRatio := len(rawHTML) * 3 / 4
-	buffer.Grow(estimatedRatio)
-
-	// We need to surround `rawHTML` with body tags so that html.Parse
-	// will consider it a valid html document.
-	doc, err := html.Parse(strings.NewReader("<body>" + rawHTML + "</body>"))
-	if err != nil {
-		return ""
-	}
-
-	/* The structure of `doc` is always:
-	<html>
-	<head>...</head>
-	<body>..</body>
-	*/
-	body := doc.FirstChild.FirstChild.NextSibling
-
-	// Errors are a non-issue, so they're handled in filterAndRenderHTML
-	parsedBaseUrl, _ := url.Parse(baseURL)
-	for c := body.FirstChild; c != nil; c = c.NextSibling {
-		filterAndRenderHTML(&buffer, c, parsedBaseUrl, sanitizerOptions)
-	}
-
-	return buffer.String()
-}
-
 func sanitizeAttributes(parsedBaseUrl *url.URL, tagName string, attributes []html.Attribute, sanitizerOptions *SanitizerOptions) ([]string, string) {
 	htmlAttrs := make([]string, 0, len(attributes))
 	attrNames := make([]string, 0, len(attributes))
+
 	var err error
 	var isAnchorLink bool
 	var isYouTubeEmbed bool
@@ -359,11 +363,6 @@ func sanitizeAttributes(parsedBaseUrl *url.URL, tagName string, attributes []htm
 				if !isPositiveInteger(value) {
 					continue
 				}
-
-				// Discard width and height attributes when width is larger than Miniflux layout (750px)
-				if imgWidth := getIntegerAttributeValue("width", attributes); imgWidth > 750 {
-					continue
-				}
 			case "srcset":
 				value = sanitizeSrcsetAttr(parsedBaseUrl, value)
 			}
@@ -474,8 +473,8 @@ func isExternalResourceAttribute(attribute string) bool {
 	}
 }
 
-func isPixelTracker(tag string, attributes []html.Attribute) bool {
-	if tag != "img" {
+func isPixelTracker(tagName string, attributes []html.Attribute) bool {
+	if tagName != "img" {
 		return false
 	}
 	hasHeight := false
@@ -615,16 +614,6 @@ func isPositiveInteger(value string) bool {
 	return false
 }
 
-func getIntegerAttributeValue(name string, attributes []html.Attribute) int {
-	for _, attribute := range attributes {
-		if attribute.Key == name {
-			number, _ := strconv.Atoi(attribute.Val)
-			return number
-		}
-	}
-	return 0
-}
-
 func isValidFetchPriorityValue(value string) bool {
 	switch value {
 	case "high", "low", "auto":

+ 1 - 11
internal/reader/sanitizer/sanitizer_test.go

@@ -79,16 +79,6 @@ func TestImgWithWidthAndHeightAttribute(t *testing.T) {
 	}
 }
 
-func TestImgWithWidthAttributeLargerThanMinifluxLayout(t *testing.T) {
-	input := `<img src="https://example.org/image.png" width="1200" height="675">`
-	expected := `<img src="https://example.org/image.png" loading="lazy">`
-	output := sanitizeHTMLWithDefaultOptions("http://example.org/", input)
-
-	if output != expected {
-		t.Errorf(`Wrong output: %s`, output)
-	}
-}
-
 func TestImgWithIncorrectWidthAndHeightAttribute(t *testing.T) {
 	input := `<img src="https://example.org/image.png" width="10px" height="20px">`
 	expected := `<img src="https://example.org/image.png" loading="lazy">`
@@ -295,7 +285,7 @@ func TestSourceWithSrcsetAndMedia(t *testing.T) {
 
 func TestMediumImgWithSrcset(t *testing.T) {
 	input := `<img alt="Image for post" class="t u v ef aj" src="https://miro.medium.com/max/5460/1*aJ9JibWDqO81qMfNtqgqrw.jpeg" srcset="https://miro.medium.com/max/552/1*aJ9JibWDqO81qMfNtqgqrw.jpeg 276w, https://miro.medium.com/max/1000/1*aJ9JibWDqO81qMfNtqgqrw.jpeg 500w" sizes="500px" width="2730" height="3407">`
-	expected := `<img alt="Image for post" src="https://miro.medium.com/max/5460/1*aJ9JibWDqO81qMfNtqgqrw.jpeg" srcset="https://miro.medium.com/max/552/1*aJ9JibWDqO81qMfNtqgqrw.jpeg 276w, https://miro.medium.com/max/1000/1*aJ9JibWDqO81qMfNtqgqrw.jpeg 500w" sizes="500px" loading="lazy">`
+	expected := `<img alt="Image for post" src="https://miro.medium.com/max/5460/1*aJ9JibWDqO81qMfNtqgqrw.jpeg" srcset="https://miro.medium.com/max/552/1*aJ9JibWDqO81qMfNtqgqrw.jpeg 276w, https://miro.medium.com/max/1000/1*aJ9JibWDqO81qMfNtqgqrw.jpeg 500w" sizes="500px" width="2730" height="3407" loading="lazy">`
 	output := sanitizeHTMLWithDefaultOptions("http://example.org/", input)
 
 	if output != expected {