Przeglądaj źródła

refactor(sanitizer): reorder non-public functions alphabetically

Frédéric Guillot 2 miesiące temu
rodzic
commit
4d27f66ff9
1 zmieniony plik z 202 dodaniami i 202 usunięciami
  1. 202 202
      internal/reader/sanitizer/sanitizer.go

+ 202 - 202
internal/reader/sanitizer/sanitizer.go

@@ -237,42 +237,22 @@ func SanitizeHTML(baseURL, rawHTML string, sanitizerOptions *SanitizerOptions) s
 	return buffer.String()
 }
 
-func isHidden(n *html.Node) bool {
-	for _, attr := range n.Attr {
-		if attr.Key == "hidden" {
-			return true
-		}
-	}
-	return false
-}
+func findAllowedIframeSourceDomain(iframeSourceURL string) (string, bool) {
+	iframeSourceDomain := urllib.DomainWithoutWWW(iframeSourceURL)
 
-func shouldIgnoreTag(n *html.Node, tag string) bool {
-	if isPixelTracker(tag, n.Attr) {
-		return true
-	}
-	if isBlockedTag(tag) {
-		return true
-	}
-	if isHidden(n) {
-		return true
+	if _, ok := iframeAllowList[iframeSourceDomain]; ok {
+		return iframeSourceDomain, true
 	}
 
-	return false
-}
-
-func isSelfContainedTag(tag string) bool {
-	switch tag {
-	case "area", "base", "br", "col", "embed", "hr", "img", "input",
-		"link", "meta", "param", "source", "track", "wbr":
-		return true
+	if ytDomain := config.Opts.YouTubeEmbedDomain(); ytDomain != "" && iframeSourceDomain == strings.TrimPrefix(ytDomain, "www.") {
+		return iframeSourceDomain, true
 	}
-	return false
-}
 
-func filterAndRenderHTMLChildren(buf *strings.Builder, n *html.Node, parsedBaseUrl *url.URL, sanitizerOptions *SanitizerOptions) {
-	for c := n.FirstChild; c != nil; c = c.NextSibling {
-		filterAndRenderHTML(buf, c, parsedBaseUrl, sanitizerOptions)
+	if invidiousInstance := config.Opts.InvidiousInstance(); invidiousInstance != "" && iframeSourceDomain == strings.TrimPrefix(invidiousInstance, "www.") {
+		return iframeSourceDomain, true
 	}
+
+	return "", false
 }
 
 func filterAndRenderHTML(buf *strings.Builder, n *html.Node, parsedBaseUrl *url.URL, sanitizerOptions *SanitizerOptions) {
@@ -325,112 +305,10 @@ func filterAndRenderHTML(buf *strings.Builder, n *html.Node, parsedBaseUrl *url.
 	}
 }
 
-func sanitizeAttributes(parsedBaseUrl *url.URL, tagName string, attributes []html.Attribute, sanitizerOptions *SanitizerOptions) ([]string, string) {
-	htmlAttrs := make([]string, 0, len(attributes))
-	attrNames := make([]string, 0, len(attributes))
-
-	var isAnchorLink bool
-	var isYouTubeEmbed bool
-
-	allowedAttributes, ok := allowedHTMLTagsAndAttributes[tagName]
-	if !ok {
-		// This should never happen, as the tag was validated in the caller of `sanitizeAttributes`
-		return []string{}, ""
-	}
-
-	for _, attribute := range attributes {
-		if !slices.Contains(allowedAttributes, attribute.Key) {
-			continue
-		}
-
-		value := attribute.Val
-
-		switch tagName {
-		case "math":
-			if attribute.Key == "xmlns" {
-				if value != "http://www.w3.org/1998/Math/MathML" {
-					value = "http://www.w3.org/1998/Math/MathML"
-				}
-			}
-		case "img":
-			switch attribute.Key {
-			case "fetchpriority":
-				if !isValidFetchPriorityValue(value) {
-					continue
-				}
-			case "decoding":
-				if !isValidDecodingValue(value) {
-					continue
-				}
-			case "width", "height":
-				if !isPositiveInteger(value) {
-					continue
-				}
-			case "srcset":
-				value = sanitizeSrcsetAttr(parsedBaseUrl, value)
-				if value == "" {
-					continue
-				}
-			}
-		case "source":
-			if attribute.Key == "srcset" {
-				value = sanitizeSrcsetAttr(parsedBaseUrl, value)
-				if value == "" {
-					continue
-				}
-			}
-		}
-
-		if isExternalResourceAttribute(attribute.Key) {
-			switch {
-			case tagName == "iframe":
-				iframeSourceDomain, trustedIframeDomain := findAllowedIframeSourceDomain(attribute.Val)
-				if !trustedIframeDomain {
-					continue
-				}
-
-				value = rewriteIframeURL(attribute.Val)
-
-				if iframeSourceDomain == "youtube.com" || iframeSourceDomain == "youtube-nocookie.com" {
-					isYouTubeEmbed = true
-				}
-			case tagName == "img" && attribute.Key == "src" && isValidDataAttribute(attribute.Val):
-				value = attribute.Val
-			case tagName == "a" && attribute.Key == "href" && strings.HasPrefix(attribute.Val, "#"):
-				value = attribute.Val
-				isAnchorLink = true
-			default:
-				var err error
-				value, err = urllib.ResolveToAbsoluteURLWithParsedBaseURL(parsedBaseUrl, value)
-				if err != nil {
-					continue
-				}
-
-				if !hasValidURIScheme(value) || isBlockedResource(value) {
-					continue
-				}
-
-				// TODO use feedURL instead of baseURL twice.
-				parsedValueUrl, _ := url.Parse(value)
-				if cleanedURL, err := urlcleaner.RemoveTrackingParameters(parsedBaseUrl, parsedBaseUrl, parsedValueUrl); err == nil {
-					value = cleanedURL
-				}
-			}
-		}
-
-		attrNames = append(attrNames, attribute.Key)
-		htmlAttrs = append(htmlAttrs, attribute.Key+`="`+html.EscapeString(value)+`"`)
-	}
-
-	if !isAnchorLink {
-		extraAttrNames, extraHTMLAttributes := getExtraAttributes(tagName, isYouTubeEmbed, sanitizerOptions)
-		if len(extraAttrNames) > 0 {
-			attrNames = append(attrNames, extraAttrNames...)
-			htmlAttrs = append(htmlAttrs, extraHTMLAttributes...)
-		}
+func filterAndRenderHTMLChildren(buf *strings.Builder, n *html.Node, parsedBaseUrl *url.URL, sanitizerOptions *SanitizerOptions) {
+	for c := n.FirstChild; c != nil; c = c.NextSibling {
+		filterAndRenderHTML(buf, c, parsedBaseUrl, sanitizerOptions)
 	}
-
-	return attrNames, strings.Join(htmlAttrs, " ")
 }
 
 func getExtraAttributes(tagName string, isYouTubeEmbed bool, sanitizerOptions *SanitizerOptions) ([]string, []string) {
@@ -467,6 +345,50 @@ func getExtraAttributes(tagName string, isYouTubeEmbed bool, sanitizerOptions *S
 	}
 }
 
+func hasRequiredAttributes(tagName string, attributes []string) bool {
+	switch tagName {
+	case "a":
+		return slices.Contains(attributes, "href")
+	case "iframe":
+		return slices.Contains(attributes, "src")
+	case "source", "img":
+		for _, attribute := range attributes {
+			if attribute == "src" || attribute == "srcset" {
+				return true
+			}
+		}
+		return false
+	default:
+		return true
+	}
+}
+
+func hasValidURIScheme(absoluteURL string) bool {
+	for _, scheme := range validURISchemes {
+		if strings.HasPrefix(absoluteURL, scheme) {
+			return true
+		}
+	}
+	return false
+}
+
+func isBlockedResource(absoluteURL string) bool {
+	for _, blockedURL := range blockedResourceURLSubstrings {
+		if strings.Contains(absoluteURL, blockedURL) {
+			return true
+		}
+	}
+	return false
+}
+
+func isBlockedTag(tagName string) bool {
+	switch tagName {
+	case "noscript", "script", "style":
+		return true
+	}
+	return false
+}
+
 func isExternalResourceAttribute(attribute string) bool {
 	switch attribute {
 	case "src", "href", "poster", "cite":
@@ -476,6 +398,15 @@ func isExternalResourceAttribute(attribute string) bool {
 	}
 }
 
+func isHidden(n *html.Node) bool {
+	for _, attr := range n.Attr {
+		if attr.Key == "hidden" {
+			return true
+		}
+	}
+	return false
+}
+
 func isPixelTracker(tagName string, attributes []html.Attribute) bool {
 	if tagName != "img" {
 		return false
@@ -497,58 +428,48 @@ func isPixelTracker(tagName string, attributes []html.Attribute) bool {
 	return hasHeight && hasWidth
 }
 
-func hasRequiredAttributes(tagName string, attributes []string) bool {
-	switch tagName {
-	case "a":
-		return slices.Contains(attributes, "href")
-	case "iframe":
-		return slices.Contains(attributes, "src")
-	case "source", "img":
-		for _, attribute := range attributes {
-			if attribute == "src" || attribute == "srcset" {
-				return true
-			}
-		}
+func isPositiveInteger(value string) bool {
+	if value == "" {
 		return false
-	default:
-		return true
 	}
+	if number, err := strconv.Atoi(value); err == nil {
+		return number > 0
+	}
+	return false
 }
 
-func hasValidURIScheme(absoluteURL string) bool {
-	for _, scheme := range validURISchemes {
-		if strings.HasPrefix(absoluteURL, scheme) {
-			return true
-		}
+func isSelfContainedTag(tag string) bool {
+	switch tag {
+	case "area", "base", "br", "col", "embed", "hr", "img", "input",
+		"link", "meta", "param", "source", "track", "wbr":
+		return true
 	}
 	return false
 }
 
-func isBlockedResource(absoluteURL string) bool {
-	for _, blockedURL := range blockedResourceURLSubstrings {
-		if strings.Contains(absoluteURL, blockedURL) {
+func isValidDataAttribute(value string) bool {
+	for _, prefix := range dataAttributeAllowedPrefixes {
+		if strings.HasPrefix(value, prefix) {
 			return true
 		}
 	}
 	return false
 }
 
-func findAllowedIframeSourceDomain(iframeSourceURL string) (string, bool) {
-	iframeSourceDomain := urllib.DomainWithoutWWW(iframeSourceURL)
-
-	if _, ok := iframeAllowList[iframeSourceDomain]; ok {
-		return iframeSourceDomain, true
-	}
-
-	if ytDomain := config.Opts.YouTubeEmbedDomain(); ytDomain != "" && iframeSourceDomain == strings.TrimPrefix(ytDomain, "www.") {
-		return iframeSourceDomain, true
+func isValidDecodingValue(value string) bool {
+	switch value {
+	case "sync", "async", "auto":
+		return true
 	}
+	return false
+}
 
-	if invidiousInstance := config.Opts.InvidiousInstance(); invidiousInstance != "" && iframeSourceDomain == strings.TrimPrefix(invidiousInstance, "www.") {
-		return iframeSourceDomain, true
+func isValidFetchPriorityValue(value string) bool {
+	switch value {
+	case "high", "low", "auto":
+		return true
 	}
-
-	return "", false
+	return false
 }
 
 func rewriteIframeURL(link string) string {
@@ -578,12 +499,112 @@ func rewriteIframeURL(link string) string {
 	return link
 }
 
-func isBlockedTag(tagName string) bool {
-	switch tagName {
-	case "noscript", "script", "style":
-		return true
+func sanitizeAttributes(parsedBaseUrl *url.URL, tagName string, attributes []html.Attribute, sanitizerOptions *SanitizerOptions) ([]string, string) {
+	htmlAttrs := make([]string, 0, len(attributes))
+	attrNames := make([]string, 0, len(attributes))
+
+	var isAnchorLink bool
+	var isYouTubeEmbed bool
+
+	allowedAttributes, ok := allowedHTMLTagsAndAttributes[tagName]
+	if !ok {
+		// This should never happen, as the tag was validated in the caller of `sanitizeAttributes`
+		return []string{}, ""
 	}
-	return false
+
+	for _, attribute := range attributes {
+		if !slices.Contains(allowedAttributes, attribute.Key) {
+			continue
+		}
+
+		value := attribute.Val
+
+		switch tagName {
+		case "math":
+			if attribute.Key == "xmlns" {
+				if value != "http://www.w3.org/1998/Math/MathML" {
+					value = "http://www.w3.org/1998/Math/MathML"
+				}
+			}
+		case "img":
+			switch attribute.Key {
+			case "fetchpriority":
+				if !isValidFetchPriorityValue(value) {
+					continue
+				}
+			case "decoding":
+				if !isValidDecodingValue(value) {
+					continue
+				}
+			case "width", "height":
+				if !isPositiveInteger(value) {
+					continue
+				}
+			case "srcset":
+				value = sanitizeSrcsetAttr(parsedBaseUrl, value)
+				if value == "" {
+					continue
+				}
+			}
+		case "source":
+			if attribute.Key == "srcset" {
+				value = sanitizeSrcsetAttr(parsedBaseUrl, value)
+				if value == "" {
+					continue
+				}
+			}
+		}
+
+		if isExternalResourceAttribute(attribute.Key) {
+			switch {
+			case tagName == "iframe":
+				iframeSourceDomain, trustedIframeDomain := findAllowedIframeSourceDomain(attribute.Val)
+				if !trustedIframeDomain {
+					continue
+				}
+
+				value = rewriteIframeURL(attribute.Val)
+
+				if iframeSourceDomain == "youtube.com" || iframeSourceDomain == "youtube-nocookie.com" {
+					isYouTubeEmbed = true
+				}
+			case tagName == "img" && attribute.Key == "src" && isValidDataAttribute(attribute.Val):
+				value = attribute.Val
+			case tagName == "a" && attribute.Key == "href" && strings.HasPrefix(attribute.Val, "#"):
+				value = attribute.Val
+				isAnchorLink = true
+			default:
+				var err error
+				value, err = urllib.ResolveToAbsoluteURLWithParsedBaseURL(parsedBaseUrl, value)
+				if err != nil {
+					continue
+				}
+
+				if !hasValidURIScheme(value) || isBlockedResource(value) {
+					continue
+				}
+
+				// TODO use feedURL instead of baseURL twice.
+				parsedValueUrl, _ := url.Parse(value)
+				if cleanedURL, err := urlcleaner.RemoveTrackingParameters(parsedBaseUrl, parsedBaseUrl, parsedValueUrl); err == nil {
+					value = cleanedURL
+				}
+			}
+		}
+
+		attrNames = append(attrNames, attribute.Key)
+		htmlAttrs = append(htmlAttrs, attribute.Key+`="`+html.EscapeString(value)+`"`)
+	}
+
+	if !isAnchorLink {
+		extraAttrNames, extraHTMLAttributes := getExtraAttributes(tagName, isYouTubeEmbed, sanitizerOptions)
+		if len(extraAttrNames) > 0 {
+			attrNames = append(attrNames, extraAttrNames...)
+			htmlAttrs = append(htmlAttrs, extraHTMLAttributes...)
+		}
+	}
+
+	return attrNames, strings.Join(htmlAttrs, " ")
 }
 
 func sanitizeSrcsetAttr(parsedBaseURL *url.URL, value string) string {
@@ -611,37 +632,16 @@ func sanitizeSrcsetAttr(parsedBaseURL *url.URL, value string) string {
 	return imageCandidates(sanitizedCandidates).String()
 }
 
-func isValidDataAttribute(value string) bool {
-	for _, prefix := range dataAttributeAllowedPrefixes {
-		if strings.HasPrefix(value, prefix) {
-			return true
-		}
-	}
-	return false
-}
-
-func isPositiveInteger(value string) bool {
-	if value == "" {
-		return false
-	}
-	if number, err := strconv.Atoi(value); err == nil {
-		return number > 0
+func shouldIgnoreTag(n *html.Node, tag string) bool {
+	if isPixelTracker(tag, n.Attr) {
+		return true
 	}
-	return false
-}
-
-func isValidFetchPriorityValue(value string) bool {
-	switch value {
-	case "high", "low", "auto":
+	if isBlockedTag(tag) {
 		return true
 	}
-	return false
-}
-
-func isValidDecodingValue(value string) bool {
-	switch value {
-	case "sync", "async", "auto":
+	if isHidden(n) {
 		return true
 	}
+
 	return false
 }