| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307 |
- // SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
- // SPDX-License-Identifier: Apache-2.0
- package sanitizer
- import (
- "math"
- "strconv"
- "strings"
- )
// imageCandidate is a single entry of a srcset attribute: an image URL and
// its optional descriptor (for example "2x" or "640w").
type imageCandidate struct {
	ImageURL   string
	Descriptor string
}

// imageCandidates is an ordered list of srcset entries.
type imageCandidates []*imageCandidate

// String renders the candidates back into srcset syntax. Each entry is
// written as "URL" or "URL DESCRIPTOR" and entries are separated by ", ".
func (c imageCandidates) String() string {
	var out strings.Builder
	for index, candidate := range c {
		if index > 0 {
			out.WriteString(", ")
		}
		out.WriteString(candidate.ImageURL)
		// The descriptor is optional; emit the separating space only when present.
		if candidate.Descriptor != "" {
			out.WriteByte(' ')
			out.WriteString(candidate.Descriptor)
		}
	}
	return out.String()
}
- // ParseSrcSetAttribute returns the list of image candidates from the set.
- // Parsing behavior follows the WebKit HTMLSrcsetParser implementation.
- // https://html.spec.whatwg.org/#parse-a-srcset-attribute
- func ParseSrcSetAttribute(attributeValue string) (candidates imageCandidates) {
- if attributeValue == "" {
- return nil
- }
- var position uint = 0
- for position < uint(len(attributeValue)) {
- position = skipWhileHTMLSpaceOrComma(attributeValue, position)
- if position >= uint(len(attributeValue)) {
- break
- }
- urlStart := position
- position = skipUntilASCIIWhitespace(attributeValue, position)
- imageURL := attributeValue[urlStart:position]
- if imageURL == "" {
- continue
- }
- var result descriptorParsingResult
- if imageURL[len(imageURL)-1] == ',' {
- imageURL = strings.TrimRight(imageURL, ",")
- if imageURL == "" {
- continue
- }
- } else {
- position = skipWhileASCIIWhitespace(attributeValue, position)
- descriptorTokens, newPosition := tokenizeDescriptors(attributeValue, position)
- position = newPosition
- if !parseDescriptors(descriptorTokens, &result) {
- continue
- }
- }
- candidates = append(candidates, &imageCandidate{
- ImageURL: imageURL,
- Descriptor: serializeDescriptor(result),
- })
- }
- return candidates
- }
// descriptorParsingResult accumulates the descriptors parsed for one image
// candidate. Each has* flag records whether the matching value was set.
type descriptorParsingResult struct {
	density        float64
	resourceWidth  uint
	resourceHeight uint
	hasDensity     bool
	hasWidth       bool
	hasHeight      bool
}

// setDensity stores the pixel density and marks it as present.
func (r *descriptorParsingResult) setDensity(value float64) {
	r.density, r.hasDensity = value, true
}

// setResourceWidth stores the width and marks it as present.
func (r *descriptorParsingResult) setResourceWidth(value uint) {
	r.resourceWidth, r.hasWidth = value, true
}

// setResourceHeight stores the height and marks it as present.
func (r *descriptorParsingResult) setResourceHeight(value uint) {
	r.resourceHeight, r.hasHeight = value, true
}
- func serializeDescriptor(result descriptorParsingResult) string {
- if result.hasDensity {
- return formatFloat(result.density) + "x"
- }
- if result.hasWidth {
- return strconv.FormatUint(uint64(result.resourceWidth), 10) + "w"
- }
- return ""
- }
- func parseDescriptors(descriptors []string, result *descriptorParsingResult) bool {
- for _, descriptor := range descriptors {
- if descriptor == "" {
- continue
- }
- lastIndex := len(descriptor) - 1
- descriptorChar := descriptor[lastIndex]
- value := descriptor[:lastIndex]
- switch descriptorChar {
- case 'x':
- if result.hasDensity || result.hasHeight || result.hasWidth {
- return false
- }
- density, ok := parseValidHTMLFloatingPointNumber(value)
- if !ok || density < 0 {
- return false
- }
- result.setDensity(density)
- case 'w':
- if result.hasDensity || result.hasWidth {
- return false
- }
- width, ok := parseValidHTMLNonNegativeInteger(value)
- if !ok || width <= 0 {
- return false
- }
- result.setResourceWidth(width)
- case 'h':
- if result.hasDensity || result.hasHeight {
- return false
- }
- height, ok := parseValidHTMLNonNegativeInteger(value)
- if !ok || height <= 0 {
- return false
- }
- result.setResourceHeight(height)
- default:
- return false
- }
- }
- return !result.hasHeight || result.hasWidth
- }
- type descriptorTokenizerState int
- const (
- descriptorStateInitial descriptorTokenizerState = iota
- descriptorStateInParenthesis
- descriptorStateAfterToken
- )
- func tokenizeDescriptors(input string, start uint) (tokens []string, newPosition uint) {
- state := descriptorStateInitial
- currentStart := start
- currentSet := true
- position := start
- appendDescriptorAndReset := func(position uint) {
- if currentSet && position > currentStart {
- tokens = append(tokens, input[currentStart:position])
- }
- currentSet = false
- }
- appendCharacter := func(position uint) {
- if !currentSet {
- currentStart = position
- currentSet = true
- }
- }
- for {
- if position >= uint(len(input)) {
- if state != descriptorStateAfterToken {
- appendDescriptorAndReset(position)
- }
- return tokens, position
- }
- character := input[position]
- switch state {
- case descriptorStateInitial:
- switch {
- case isComma(character):
- appendDescriptorAndReset(position)
- position++
- return tokens, position
- case isASCIIWhitespace(character):
- appendDescriptorAndReset(position)
- currentStart = position + 1
- currentSet = true
- state = descriptorStateAfterToken
- case character == '(':
- appendCharacter(position)
- state = descriptorStateInParenthesis
- default:
- appendCharacter(position)
- }
- case descriptorStateInParenthesis:
- if character == ')' {
- appendCharacter(position)
- state = descriptorStateInitial
- } else {
- appendCharacter(position)
- }
- case descriptorStateAfterToken:
- if !isASCIIWhitespace(character) {
- state = descriptorStateInitial
- currentStart = position
- currentSet = true
- position--
- }
- }
- position++
- }
- }
// parseValidHTMLNonNegativeInteger parses value as a run of ASCII digits.
// It reports false for an empty string, for any non-digit byte (including a
// sign), and for numbers that do not fit in a uint.
func parseValidHTMLNonNegativeInteger(value string) (uint, bool) {
	if len(value) == 0 {
		return 0, false
	}

	// strconv.ParseUint alone is not used for validation; only plain digits
	// are let through to it.
	for _, digit := range []byte(value) {
		if digit < '0' || digit > '9' {
			return 0, false
		}
	}

	number, err := strconv.ParseUint(value, 10, 0)
	if err != nil {
		return 0, false
	}
	return uint(number), true
}
// parseValidHTMLFloatingPointNumber parses value as an HTML floating-point
// number and reports whether it is valid.
//
// It rejects the empty string, a leading '+', a trailing '.', any byte other
// than ASCII digits, '.', '-', '+', 'e' and 'E', and anything that does not
// parse to a finite float64. Negative numbers are accepted here; callers
// that need a non-negative value must check the sign themselves.
func parseValidHTMLFloatingPointNumber(value string) (float64, bool) {
	if value == "" {
		return 0, false
	}

	// strconv.ParseFloat tolerates a leading '+' and a trailing '.', neither
	// of which is allowed by the HTML rules.
	if value[0] == '+' || value[len(value)-1] == '.' {
		return 0, false
	}

	// strconv.ParseFloat follows Go literal syntax, so it would also accept
	// hexadecimal floats ("0x1p4"), underscore digit separators ("1_0") and
	// the "Inf"/"NaN" spellings. Restrict the alphabet to what a decimal
	// floating-point number can contain before handing the string over.
	for i := 0; i < len(value); i++ {
		switch c := value[i]; {
		case c >= '0' && c <= '9':
		case c == '.', c == '-', c == '+', c == 'e', c == 'E':
		default:
			return 0, false
		}
	}

	parsed, err := strconv.ParseFloat(value, 64)
	if err != nil || math.IsNaN(parsed) || math.IsInf(parsed, 0) {
		return 0, false
	}
	return parsed, true
}
// formatFloat renders value using the shortest decimal representation that
// parses back to the same float64 ('g' format, so large or tiny magnitudes
// use exponent notation).
func formatFloat(value float64) string {
	const (
		format    = 'g'
		precision = -1 // smallest number of digits that round-trips
		bitSize   = 64
	)
	return strconv.FormatFloat(value, format, precision, bitSize)
}
- func skipWhileHTMLSpaceOrComma(value string, position uint) uint {
- for position < uint(len(value)) && (isASCIIWhitespace(value[position]) || isComma(value[position])) {
- position++
- }
- return position
- }
- func skipWhileASCIIWhitespace(value string, position uint) uint {
- for position < uint(len(value)) && isASCIIWhitespace(value[position]) {
- position++
- }
- return position
- }
- func skipUntilASCIIWhitespace(value string, position uint) uint {
- for position < uint(len(value)) && !isASCIIWhitespace(value[position]) {
- position++
- }
- return position
- }
// isASCIIWhitespace reports whether character is a tab, line feed, form
// feed, carriage return or space.
func isASCIIWhitespace(character byte) bool {
	return character == ' ' ||
		character == '\t' ||
		character == '\n' ||
		character == '\f' ||
		character == '\r'
}
// isComma reports whether character is a U+002C COMMA.
func isComma(character byte) bool {
	const comma byte = ','
	return character == comma
}
|