Преглед на файлове

refactor(sanitizer): rewrite srcset parser according to HTML specs

The new implementation follows the WebKit HTMLSrcsetParser approach. It
correctly handles various edge cases that the previous string-splitting
method could not parse.
Frédéric Guillot преди 2 месеца
родител
ревизия
3aea68e725
променени са 2 файла, в които са добавени 394 реда и са изтрити 84 реда
  1. 254 24
      internal/reader/sanitizer/srcset.go
  2. 140 60
      internal/reader/sanitizer/srcset_test.go

+ 254 - 24
internal/reader/sanitizer/srcset.go

@@ -4,7 +4,7 @@
 package sanitizer
 
 import (
-	"errors"
+	"math"
 	"strconv"
 	"strings"
 )
@@ -34,44 +34,274 @@ func (c imageCandidates) String() string {
 }
 
 // ParseSrcSetAttribute returns the list of image candidates from the set.
+// Parsing behavior follows the WebKit HTMLSrcsetParser implementation.
 // https://html.spec.whatwg.org/#parse-a-srcset-attribute
-func ParseSrcSetAttribute(attributeValue string) (imageCandidates imageCandidates) {
-	for _, unparsedCandidate := range strings.Split(attributeValue, ", ") {
-		if candidate, err := parseImageCandidate(unparsedCandidate); err == nil {
-			imageCandidates = append(imageCandidates, candidate)
+func ParseSrcSetAttribute(attributeValue string) (candidates imageCandidates) {
+	if attributeValue == "" {
+		return nil
+	}
+
+	position := 0
+	for position < len(attributeValue) {
+		position = skipWhileHTMLSpaceOrComma(attributeValue, position)
+		if position >= len(attributeValue) {
+			break
 		}
+
+		urlStart := position
+		position = skipUntilASCIIWhitespace(attributeValue, position)
+		imageURL := attributeValue[urlStart:position]
+		if imageURL == "" {
+			continue
+		}
+
+		var result descriptorParsingResult
+		if imageURL[len(imageURL)-1] == ',' {
+			imageURL = strings.TrimRight(imageURL, ",")
+			if imageURL == "" {
+				continue
+			}
+		} else {
+			position = skipWhileASCIIWhitespace(attributeValue, position)
+			descriptorTokens, newPosition := tokenizeDescriptors(attributeValue, position)
+			position = newPosition
+			if !parseDescriptors(descriptorTokens, &result) {
+				continue
+			}
+		}
+
+		candidates = append(candidates, &imageCandidate{
+			ImageURL:   imageURL,
+			Descriptor: serializeDescriptor(result),
+		})
 	}
 
-	return imageCandidates
+	return candidates
+}
+
+type descriptorParsingResult struct {
+	density        float64
+	resourceWidth  int
+	resourceHeight int
+	hasDensity     bool
+	hasWidth       bool
+	hasHeight      bool
 }
 
-func parseImageCandidate(input string) (*imageCandidate, error) {
-	parts := strings.Split(strings.TrimSpace(input), " ")
-	nbParts := len(parts)
+func (r *descriptorParsingResult) setDensity(value float64) {
+	r.density = value
+	r.hasDensity = true
+}
 
-	switch nbParts {
-	case 1:
-		return &imageCandidate{ImageURL: parts[0]}, nil
-	case 2:
-		if !isValidWidthOrDensityDescriptor(parts[1]) {
-			return nil, errors.New(`srcset: invalid descriptor`)
+func (r *descriptorParsingResult) setResourceWidth(value int) {
+	r.resourceWidth = value
+	r.hasWidth = true
+}
+
+func (r *descriptorParsingResult) setResourceHeight(value int) {
+	r.resourceHeight = value
+	r.hasHeight = true
+}
+
+func serializeDescriptor(result descriptorParsingResult) string {
+	if result.hasDensity {
+		return formatFloat(result.density) + "x"
+	}
+	if result.hasWidth {
+		return strconv.Itoa(result.resourceWidth) + "w"
+	}
+	return ""
+}
+
+func parseDescriptors(descriptors []string, result *descriptorParsingResult) bool {
+	for _, descriptor := range descriptors {
+		if descriptor == "" {
+			continue
+		}
+		lastIndex := len(descriptor) - 1
+		descriptorChar := descriptor[lastIndex]
+		value := descriptor[:lastIndex]
+
+		switch descriptorChar {
+		case 'x':
+			if result.hasDensity || result.hasHeight || result.hasWidth {
+				return false
+			}
+			density, ok := parseValidHTMLFloatingPointNumber(value)
+			if !ok || density < 0 {
+				return false
+			}
+			result.setDensity(density)
+		case 'w':
+			if result.hasDensity || result.hasWidth {
+				return false
+			}
+			width, ok := parseValidHTMLNonNegativeInteger(value)
+			if !ok || width <= 0 {
+				return false
+			}
+			result.setResourceWidth(width)
+		case 'h':
+			if result.hasDensity || result.hasHeight {
+				return false
+			}
+			height, ok := parseValidHTMLNonNegativeInteger(value)
+			if !ok || height <= 0 {
+				return false
+			}
+			result.setResourceHeight(height)
+		default:
+			return false
 		}
-		return &imageCandidate{ImageURL: parts[0], Descriptor: parts[1]}, nil
-	default:
-		return nil, errors.New(`srcset: invalid number of descriptors`)
 	}
+
+	return !result.hasHeight || result.hasWidth
 }
 
-func isValidWidthOrDensityDescriptor(value string) bool {
+type descriptorTokenizerState int
+
+const (
+	descriptorStateInitial descriptorTokenizerState = iota
+	descriptorStateInParenthesis
+	descriptorStateAfterToken
+)
+
+func tokenizeDescriptors(input string, start int) (tokens []string, newPosition int) {
+	state := descriptorStateInitial
+	currentStart := start
+	currentSet := true
+	position := start
+
+	appendDescriptorAndReset := func(position int) {
+		if currentSet && position > currentStart {
+			tokens = append(tokens, input[currentStart:position])
+		}
+		currentSet = false
+	}
+
+	appendCharacter := func(position int) {
+		if !currentSet {
+			currentStart = position
+			currentSet = true
+		}
+	}
+
+	for {
+		if position >= len(input) {
+			if state != descriptorStateAfterToken {
+				appendDescriptorAndReset(position)
+			}
+			return tokens, position
+		}
+
+		character := input[position]
+		switch state {
+		case descriptorStateInitial:
+			switch {
+			case isComma(character):
+				appendDescriptorAndReset(position)
+				position++
+				return tokens, position
+			case isASCIIWhitespace(character):
+				appendDescriptorAndReset(position)
+				currentStart = position + 1
+				currentSet = true
+				state = descriptorStateAfterToken
+			case character == '(':
+				appendCharacter(position)
+				state = descriptorStateInParenthesis
+			default:
+				appendCharacter(position)
+			}
+		case descriptorStateInParenthesis:
+			if character == ')' {
+				appendCharacter(position)
+				state = descriptorStateInitial
+			} else {
+				appendCharacter(position)
+			}
+		case descriptorStateAfterToken:
+			if !isASCIIWhitespace(character) {
+				state = descriptorStateInitial
+				currentStart = position
+				currentSet = true
+				position--
+			}
+		}
+
+		position++
+	}
+}
+
+func parseValidHTMLNonNegativeInteger(value string) (int, bool) {
 	if value == "" {
-		return false
+		return 0, false
+	}
+
+	for i := 0; i < len(value); i++ {
+		if value[i] < '0' || value[i] > '9' {
+			return 0, false
+		}
+	}
+
+	parsed, err := strconv.Atoi(value)
+	if err != nil {
+		return 0, false
+	}
+
+	return parsed, true
+}
+
+func parseValidHTMLFloatingPointNumber(value string) (float64, bool) {
+	if value == "" {
+		return 0, false
+	}
+	if value[0] == '+' || value[len(value)-1] == '.' {
+		return 0, false
 	}
 
-	lastChar := value[len(value)-1:]
-	if lastChar != "w" && lastChar != "x" {
+	parsed, err := strconv.ParseFloat(value, 64)
+	if err != nil || math.IsNaN(parsed) || math.IsInf(parsed, 0) {
+		return 0, false
+	}
+
+	return parsed, true
+}
+
+func formatFloat(value float64) string {
+	return strconv.FormatFloat(value, 'g', -1, 64)
+}
+
+func skipWhileHTMLSpaceOrComma(value string, position int) int {
+	for position < len(value) && (isASCIIWhitespace(value[position]) || isComma(value[position])) {
+		position++
+	}
+	return position
+}
+
+func skipWhileASCIIWhitespace(value string, position int) int {
+	for position < len(value) && isASCIIWhitespace(value[position]) {
+		position++
+	}
+	return position
+}
+
+func skipUntilASCIIWhitespace(value string, position int) int {
+	for position < len(value) && !isASCIIWhitespace(value[position]) {
+		position++
+	}
+	return position
+}
+
+func isASCIIWhitespace(character byte) bool {
+	switch character {
+	case '\t', '\n', '\f', '\r', ' ':
+		return true
+	default:
 		return false
 	}
+}
 
-	_, err := strconv.ParseFloat(value[0:len(value)-1], 32)
-	return err == nil
+func isComma(character byte) bool {
+	return character == ','
 }

+ 140 - 60
internal/reader/sanitizer/srcset_test.go

@@ -5,80 +5,160 @@ package sanitizer
 
 import "testing"
 
-func TestParseSrcSetAttributeWithRelativeURLs(t *testing.T) {
-	input := `example-320w.jpg, example-480w.jpg 1.5x,   example-640,w.jpg 2x, example-640w.jpg 640w`
-	candidates := ParseSrcSetAttribute(input)
-
-	if len(candidates) != 4 {
-		t.Error(`Incorrect number of candidates`)
-	}
-
-	if candidates.String() != `example-320w.jpg, example-480w.jpg 1.5x, example-640,w.jpg 2x, example-640w.jpg 640w` {
-		t.Errorf(`Unexpected string output`)
-	}
-}
-
-func TestParseSrcSetAttributeWithAbsoluteURLs(t *testing.T) {
-	input := `http://example.org/example-320w.jpg 320w, http://example.org/example-480w.jpg 1.5x`
-	candidates := ParseSrcSetAttribute(input)
-
-	if len(candidates) != 2 {
-		t.Error(`Incorrect number of candidates`)
-	}
-
-	if candidates.String() != `http://example.org/example-320w.jpg 320w, http://example.org/example-480w.jpg 1.5x` {
-		t.Errorf(`Unexpected string output`)
-	}
-}
-
-func TestParseSrcSetAttributeWithOneCandidate(t *testing.T) {
-	input := `http://example.org/example-320w.jpg`
-	candidates := ParseSrcSetAttribute(input)
-
-	if len(candidates) != 1 {
-		t.Error(`Incorrect number of candidates`)
-	}
-
-	if candidates.String() != `http://example.org/example-320w.jpg` {
-		t.Errorf(`Unexpected string output`)
-	}
-}
+func assertCandidates(t *testing.T, input string, expectedCount int, expectedString string) {
+	t.Helper()
 
-func TestParseSrcSetAttributeWithCommaURL(t *testing.T) {
-	input := `http://example.org/example,a:b/d.jpg , example-480w.jpg 1.5x`
 	candidates := ParseSrcSetAttribute(input)
 
-	if len(candidates) != 2 {
-		t.Error(`Incorrect number of candidates`)
+	if len(candidates) != expectedCount {
+		t.Fatalf("Incorrect number of candidates for input %q: got %d, want %d", input, len(candidates), expectedCount)
 	}
 
-	if candidates.String() != `http://example.org/example,a:b/d.jpg, example-480w.jpg 1.5x` {
-		t.Errorf(`Unexpected string output`)
+	if output := candidates.String(); output != expectedString {
+		t.Fatalf("Unexpected string output for input %q. Got %q, want %q", input, output, expectedString)
 	}
 }
 
-func TestParseSrcSetAttributeWithIncorrectDescriptor(t *testing.T) {
-	input := `http://example.org/example-320w.jpg test`
-	candidates := ParseSrcSetAttribute(input)
-
-	if len(candidates) != 0 {
-		t.Error(`Incorrect number of candidates`)
+func TestParseSrcSetAttributeValidCandidates(t *testing.T) {
+	testCases := []struct {
+		name           string
+		input          string
+		expectedCount  int
+		expectedString string
+	}{
+		{
+			name:           "relative urls",
+			input:          `example-320w.jpg, example-480w.jpg 1.5x,   example-640,w.jpg 2x, example-640w.jpg 640w`,
+			expectedCount:  4,
+			expectedString: `example-320w.jpg, example-480w.jpg 1.5x, example-640,w.jpg 2x, example-640w.jpg 640w`,
+		},
+		{
+			name:           "relative urls no space after comma",
+			input:          `a.png 1x,b.png 2x`,
+			expectedCount:  2,
+			expectedString: `a.png 1x, b.png 2x`,
+		},
+		{
+			name:           "relative urls extra spaces",
+			input:          `  a.png   1x  ,   b.png    2x   `,
+			expectedCount:  2,
+			expectedString: `a.png 1x, b.png 2x`,
+		},
+		{
+			name:           "absolute urls",
+			input:          `http://example.org/example-320w.jpg 320w, http://example.org/example-480w.jpg 1.5x`,
+			expectedCount:  2,
+			expectedString: `http://example.org/example-320w.jpg 320w, http://example.org/example-480w.jpg 1.5x`,
+		},
+		{
+			name:           "absolute urls no space after comma",
+			input:          `http://example.org/example-320w.jpg 320w,http://example.org/example-480w.jpg 1.5x`,
+			expectedCount:  2,
+			expectedString: `http://example.org/example-320w.jpg 320w, http://example.org/example-480w.jpg 1.5x`,
+		},
+		{
+			name:           "absolute urls trailing comma",
+			input:          `http://example.org/example-320w.jpg 320w, http://example.org/example-480w.jpg 1.5x, `,
+			expectedCount:  2,
+			expectedString: `http://example.org/example-320w.jpg 320w, http://example.org/example-480w.jpg 1.5x`,
+		},
+		{
+			name:           "one candidate",
+			input:          `http://example.org/example-320w.jpg`,
+			expectedCount:  1,
+			expectedString: `http://example.org/example-320w.jpg`,
+		},
+		{
+			name:           "comma inside url",
+			input:          `http://example.org/example,a:b/d.jpg , example-480w.jpg 1.5x`,
+			expectedCount:  2,
+			expectedString: `http://example.org/example,a:b/d.jpg, example-480w.jpg 1.5x`,
+		},
+		{
+			name:           "width and height descriptor",
+			input:          `http://example.org/example-320w.jpg 320w 240h`,
+			expectedCount:  1,
+			expectedString: `http://example.org/example-320w.jpg 320w`,
+		},
+		{
+			name:           "data url",
+			input:          `data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAUA 1x, image@2x.png 2x`,
+			expectedCount:  2,
+			expectedString: `data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAUA 1x, image@2x.png 2x`,
+		},
+		{
+			name:           "url with parentheses",
+			input:          `http://example.org/example(1).jpg 1x`,
+			expectedCount:  1,
+			expectedString: `http://example.org/example(1).jpg 1x`,
+		},
 	}
 
-	if candidates.String() != `` {
-		t.Errorf(`Unexpected string output`)
+	for _, testCase := range testCases {
+		t.Run(testCase.name, func(t *testing.T) {
+			assertCandidates(t, testCase.input, testCase.expectedCount, testCase.expectedString)
+		})
 	}
 }
 
-func TestParseSrcSetAttributeWithTooManyDescriptors(t *testing.T) {
-	input := `http://example.org/example-320w.jpg 10w 1x`
-	candidates := ParseSrcSetAttribute(input)
-
-	if len(candidates) != 0 {
-		t.Error(`Incorrect number of candidates`)
+func TestParseSrcSetAttributeInvalidCandidates(t *testing.T) {
+	testCases := []struct {
+		name  string
+		input string
+	}{
+		{
+			name:  "empty input",
+			input: ``,
+		},
+		{
+			name:  "incorrect descriptor",
+			input: `http://example.org/example-320w.jpg test`,
+		},
+		{
+			name:  "too many descriptors",
+			input: `http://example.org/example-320w.jpg 10w 1x`,
+		},
+		{
+			name:  "height descriptor only",
+			input: `http://example.org/example-320w.jpg 20h`,
+		},
+		{
+			name:  "invalid density descriptor +",
+			input: `http://example.org/example-320w.jpg +1x`,
+		},
+		{
+			name:  "invalid density descriptor dot",
+			input: `http://example.org/example-320w.jpg 1.x`,
+		},
+		{
+			name:  "invalid density descriptor -",
+			input: `http://example.org/example-320w.jpg -1x`,
+		},
+		{
+			name:  "invalid width descriptor zero",
+			input: `http://example.org/example-320w.jpg 0w`,
+		},
+		{
+			name:  "invalid width descriptor negative",
+			input: `http://example.org/example-320w.jpg -10w`,
+		},
+		{
+			name:  "invalid width descriptor float",
+			input: `http://example.org/example-320w.jpg 10.5w`,
+		},
+		{
+			name:  "descriptor with parentheses",
+			input: `http://example.org/example-320w.jpg (1x)`,
+		},
+		{
+			name:  "descriptor with parentheses and comma",
+			input: `http://example.org/example-320w.jpg calc(1x, 2x)`,
+		},
 	}
 
-	if candidates.String() != `` {
-		t.Errorf(`Unexpected string output`)
+	for _, testCase := range testCases {
+		t.Run(testCase.name, func(t *testing.T) {
+			assertCandidates(t, testCase.input, 0, "")
+		})
 	}
 }