|
|
@@ -237,42 +237,22 @@ func SanitizeHTML(baseURL, rawHTML string, sanitizerOptions *SanitizerOptions) s
|
|
|
return buffer.String()
|
|
|
}
|
|
|
|
|
|
-func isHidden(n *html.Node) bool {
|
|
|
- for _, attr := range n.Attr {
|
|
|
- if attr.Key == "hidden" {
|
|
|
- return true
|
|
|
- }
|
|
|
- }
|
|
|
- return false
|
|
|
-}
|
|
|
+func findAllowedIframeSourceDomain(iframeSourceURL string) (string, bool) {
|
|
|
+ iframeSourceDomain := urllib.DomainWithoutWWW(iframeSourceURL)
|
|
|
|
|
|
-func shouldIgnoreTag(n *html.Node, tag string) bool {
|
|
|
- if isPixelTracker(tag, n.Attr) {
|
|
|
- return true
|
|
|
- }
|
|
|
- if isBlockedTag(tag) {
|
|
|
- return true
|
|
|
- }
|
|
|
- if isHidden(n) {
|
|
|
- return true
|
|
|
+ if _, ok := iframeAllowList[iframeSourceDomain]; ok {
|
|
|
+ return iframeSourceDomain, true
|
|
|
}
|
|
|
|
|
|
- return false
|
|
|
-}
|
|
|
-
|
|
|
-func isSelfContainedTag(tag string) bool {
|
|
|
- switch tag {
|
|
|
- case "area", "base", "br", "col", "embed", "hr", "img", "input",
|
|
|
- "link", "meta", "param", "source", "track", "wbr":
|
|
|
- return true
|
|
|
+ if ytDomain := config.Opts.YouTubeEmbedDomain(); ytDomain != "" && iframeSourceDomain == strings.TrimPrefix(ytDomain, "www.") {
|
|
|
+ return iframeSourceDomain, true
|
|
|
}
|
|
|
- return false
|
|
|
-}
|
|
|
|
|
|
-func filterAndRenderHTMLChildren(buf *strings.Builder, n *html.Node, parsedBaseUrl *url.URL, sanitizerOptions *SanitizerOptions) {
|
|
|
- for c := n.FirstChild; c != nil; c = c.NextSibling {
|
|
|
- filterAndRenderHTML(buf, c, parsedBaseUrl, sanitizerOptions)
|
|
|
+ if invidiousInstance := config.Opts.InvidiousInstance(); invidiousInstance != "" && iframeSourceDomain == strings.TrimPrefix(invidiousInstance, "www.") {
|
|
|
+ return iframeSourceDomain, true
|
|
|
}
|
|
|
+
|
|
|
+ return "", false
|
|
|
}
|
|
|
|
|
|
func filterAndRenderHTML(buf *strings.Builder, n *html.Node, parsedBaseUrl *url.URL, sanitizerOptions *SanitizerOptions) {
|
|
|
@@ -325,112 +305,10 @@ func filterAndRenderHTML(buf *strings.Builder, n *html.Node, parsedBaseUrl *url.
|
|
|
}
|
|
|
}
|
|
|
|
|
|
-func sanitizeAttributes(parsedBaseUrl *url.URL, tagName string, attributes []html.Attribute, sanitizerOptions *SanitizerOptions) ([]string, string) {
|
|
|
- htmlAttrs := make([]string, 0, len(attributes))
|
|
|
- attrNames := make([]string, 0, len(attributes))
|
|
|
-
|
|
|
- var isAnchorLink bool
|
|
|
- var isYouTubeEmbed bool
|
|
|
-
|
|
|
- allowedAttributes, ok := allowedHTMLTagsAndAttributes[tagName]
|
|
|
- if !ok {
|
|
|
- // This should never happen, as the tag was validated in the caller of `sanitizeAttributes`
|
|
|
- return []string{}, ""
|
|
|
- }
|
|
|
-
|
|
|
- for _, attribute := range attributes {
|
|
|
- if !slices.Contains(allowedAttributes, attribute.Key) {
|
|
|
- continue
|
|
|
- }
|
|
|
-
|
|
|
- value := attribute.Val
|
|
|
-
|
|
|
- switch tagName {
|
|
|
- case "math":
|
|
|
- if attribute.Key == "xmlns" {
|
|
|
- if value != "http://www.w3.org/1998/Math/MathML" {
|
|
|
- value = "http://www.w3.org/1998/Math/MathML"
|
|
|
- }
|
|
|
- }
|
|
|
- case "img":
|
|
|
- switch attribute.Key {
|
|
|
- case "fetchpriority":
|
|
|
- if !isValidFetchPriorityValue(value) {
|
|
|
- continue
|
|
|
- }
|
|
|
- case "decoding":
|
|
|
- if !isValidDecodingValue(value) {
|
|
|
- continue
|
|
|
- }
|
|
|
- case "width", "height":
|
|
|
- if !isPositiveInteger(value) {
|
|
|
- continue
|
|
|
- }
|
|
|
- case "srcset":
|
|
|
- value = sanitizeSrcsetAttr(parsedBaseUrl, value)
|
|
|
- if value == "" {
|
|
|
- continue
|
|
|
- }
|
|
|
- }
|
|
|
- case "source":
|
|
|
- if attribute.Key == "srcset" {
|
|
|
- value = sanitizeSrcsetAttr(parsedBaseUrl, value)
|
|
|
- if value == "" {
|
|
|
- continue
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
-
|
|
|
- if isExternalResourceAttribute(attribute.Key) {
|
|
|
- switch {
|
|
|
- case tagName == "iframe":
|
|
|
- iframeSourceDomain, trustedIframeDomain := findAllowedIframeSourceDomain(attribute.Val)
|
|
|
- if !trustedIframeDomain {
|
|
|
- continue
|
|
|
- }
|
|
|
-
|
|
|
- value = rewriteIframeURL(attribute.Val)
|
|
|
-
|
|
|
- if iframeSourceDomain == "youtube.com" || iframeSourceDomain == "youtube-nocookie.com" {
|
|
|
- isYouTubeEmbed = true
|
|
|
- }
|
|
|
- case tagName == "img" && attribute.Key == "src" && isValidDataAttribute(attribute.Val):
|
|
|
- value = attribute.Val
|
|
|
- case tagName == "a" && attribute.Key == "href" && strings.HasPrefix(attribute.Val, "#"):
|
|
|
- value = attribute.Val
|
|
|
- isAnchorLink = true
|
|
|
- default:
|
|
|
- var err error
|
|
|
- value, err = urllib.ResolveToAbsoluteURLWithParsedBaseURL(parsedBaseUrl, value)
|
|
|
- if err != nil {
|
|
|
- continue
|
|
|
- }
|
|
|
-
|
|
|
- if !hasValidURIScheme(value) || isBlockedResource(value) {
|
|
|
- continue
|
|
|
- }
|
|
|
-
|
|
|
- // TODO use feedURL instead of baseURL twice.
|
|
|
- parsedValueUrl, _ := url.Parse(value)
|
|
|
- if cleanedURL, err := urlcleaner.RemoveTrackingParameters(parsedBaseUrl, parsedBaseUrl, parsedValueUrl); err == nil {
|
|
|
- value = cleanedURL
|
|
|
- }
|
|
|
- }
|
|
|
- }
|
|
|
-
|
|
|
- attrNames = append(attrNames, attribute.Key)
|
|
|
- htmlAttrs = append(htmlAttrs, attribute.Key+`="`+html.EscapeString(value)+`"`)
|
|
|
- }
|
|
|
-
|
|
|
- if !isAnchorLink {
|
|
|
- extraAttrNames, extraHTMLAttributes := getExtraAttributes(tagName, isYouTubeEmbed, sanitizerOptions)
|
|
|
- if len(extraAttrNames) > 0 {
|
|
|
- attrNames = append(attrNames, extraAttrNames...)
|
|
|
- htmlAttrs = append(htmlAttrs, extraHTMLAttributes...)
|
|
|
- }
|
|
|
+func filterAndRenderHTMLChildren(buf *strings.Builder, n *html.Node, parsedBaseUrl *url.URL, sanitizerOptions *SanitizerOptions) {
|
|
|
+ for c := n.FirstChild; c != nil; c = c.NextSibling {
|
|
|
+ filterAndRenderHTML(buf, c, parsedBaseUrl, sanitizerOptions)
|
|
|
}
|
|
|
-
|
|
|
- return attrNames, strings.Join(htmlAttrs, " ")
|
|
|
}
|
|
|
|
|
|
func getExtraAttributes(tagName string, isYouTubeEmbed bool, sanitizerOptions *SanitizerOptions) ([]string, []string) {
|
|
|
@@ -467,6 +345,50 @@ func getExtraAttributes(tagName string, isYouTubeEmbed bool, sanitizerOptions *S
|
|
|
}
|
|
|
}
|
|
|
|
|
|
+func hasRequiredAttributes(tagName string, attributes []string) bool {
|
|
|
+ switch tagName {
|
|
|
+ case "a":
|
|
|
+ return slices.Contains(attributes, "href")
|
|
|
+ case "iframe":
|
|
|
+ return slices.Contains(attributes, "src")
|
|
|
+ case "source", "img":
|
|
|
+ for _, attribute := range attributes {
|
|
|
+ if attribute == "src" || attribute == "srcset" {
|
|
|
+ return true
|
|
|
+ }
|
|
|
+ }
|
|
|
+ return false
|
|
|
+ default:
|
|
|
+ return true
|
|
|
+ }
|
|
|
+}
|
|
|
+
|
|
|
+func hasValidURIScheme(absoluteURL string) bool {
|
|
|
+ for _, scheme := range validURISchemes {
|
|
|
+ if strings.HasPrefix(absoluteURL, scheme) {
|
|
|
+ return true
|
|
|
+ }
|
|
|
+ }
|
|
|
+ return false
|
|
|
+}
|
|
|
+
|
|
|
+func isBlockedResource(absoluteURL string) bool {
|
|
|
+ for _, blockedURL := range blockedResourceURLSubstrings {
|
|
|
+ if strings.Contains(absoluteURL, blockedURL) {
|
|
|
+ return true
|
|
|
+ }
|
|
|
+ }
|
|
|
+ return false
|
|
|
+}
|
|
|
+
|
|
|
+func isBlockedTag(tagName string) bool {
|
|
|
+ switch tagName {
|
|
|
+ case "noscript", "script", "style":
|
|
|
+ return true
|
|
|
+ }
|
|
|
+ return false
|
|
|
+}
|
|
|
+
|
|
|
func isExternalResourceAttribute(attribute string) bool {
|
|
|
switch attribute {
|
|
|
case "src", "href", "poster", "cite":
|
|
|
@@ -476,6 +398,15 @@ func isExternalResourceAttribute(attribute string) bool {
|
|
|
}
|
|
|
}
|
|
|
|
|
|
+func isHidden(n *html.Node) bool {
|
|
|
+ for _, attr := range n.Attr {
|
|
|
+ if attr.Key == "hidden" {
|
|
|
+ return true
|
|
|
+ }
|
|
|
+ }
|
|
|
+ return false
|
|
|
+}
|
|
|
+
|
|
|
func isPixelTracker(tagName string, attributes []html.Attribute) bool {
|
|
|
if tagName != "img" {
|
|
|
return false
|
|
|
@@ -497,58 +428,48 @@ func isPixelTracker(tagName string, attributes []html.Attribute) bool {
|
|
|
return hasHeight && hasWidth
|
|
|
}
|
|
|
|
|
|
-func hasRequiredAttributes(tagName string, attributes []string) bool {
|
|
|
- switch tagName {
|
|
|
- case "a":
|
|
|
- return slices.Contains(attributes, "href")
|
|
|
- case "iframe":
|
|
|
- return slices.Contains(attributes, "src")
|
|
|
- case "source", "img":
|
|
|
- for _, attribute := range attributes {
|
|
|
- if attribute == "src" || attribute == "srcset" {
|
|
|
- return true
|
|
|
- }
|
|
|
- }
|
|
|
+func isPositiveInteger(value string) bool {
|
|
|
+ if value == "" {
|
|
|
return false
|
|
|
- default:
|
|
|
- return true
|
|
|
}
|
|
|
+ if number, err := strconv.Atoi(value); err == nil {
|
|
|
+ return number > 0
|
|
|
+ }
|
|
|
+ return false
|
|
|
}
|
|
|
|
|
|
-func hasValidURIScheme(absoluteURL string) bool {
|
|
|
- for _, scheme := range validURISchemes {
|
|
|
- if strings.HasPrefix(absoluteURL, scheme) {
|
|
|
- return true
|
|
|
- }
|
|
|
+func isSelfContainedTag(tag string) bool {
|
|
|
+ switch tag {
|
|
|
+ case "area", "base", "br", "col", "embed", "hr", "img", "input",
|
|
|
+ "link", "meta", "param", "source", "track", "wbr":
|
|
|
+ return true
|
|
|
}
|
|
|
return false
|
|
|
}
|
|
|
|
|
|
-func isBlockedResource(absoluteURL string) bool {
|
|
|
- for _, blockedURL := range blockedResourceURLSubstrings {
|
|
|
- if strings.Contains(absoluteURL, blockedURL) {
|
|
|
+func isValidDataAttribute(value string) bool {
|
|
|
+ for _, prefix := range dataAttributeAllowedPrefixes {
|
|
|
+ if strings.HasPrefix(value, prefix) {
|
|
|
return true
|
|
|
}
|
|
|
}
|
|
|
return false
|
|
|
}
|
|
|
|
|
|
-func findAllowedIframeSourceDomain(iframeSourceURL string) (string, bool) {
|
|
|
- iframeSourceDomain := urllib.DomainWithoutWWW(iframeSourceURL)
|
|
|
-
|
|
|
- if _, ok := iframeAllowList[iframeSourceDomain]; ok {
|
|
|
- return iframeSourceDomain, true
|
|
|
- }
|
|
|
-
|
|
|
- if ytDomain := config.Opts.YouTubeEmbedDomain(); ytDomain != "" && iframeSourceDomain == strings.TrimPrefix(ytDomain, "www.") {
|
|
|
- return iframeSourceDomain, true
|
|
|
+func isValidDecodingValue(value string) bool {
|
|
|
+ switch value {
|
|
|
+ case "sync", "async", "auto":
|
|
|
+ return true
|
|
|
}
|
|
|
+ return false
|
|
|
+}
|
|
|
|
|
|
- if invidiousInstance := config.Opts.InvidiousInstance(); invidiousInstance != "" && iframeSourceDomain == strings.TrimPrefix(invidiousInstance, "www.") {
|
|
|
- return iframeSourceDomain, true
|
|
|
+func isValidFetchPriorityValue(value string) bool {
|
|
|
+ switch value {
|
|
|
+ case "high", "low", "auto":
|
|
|
+ return true
|
|
|
}
|
|
|
-
|
|
|
- return "", false
|
|
|
+ return false
|
|
|
}
|
|
|
|
|
|
func rewriteIframeURL(link string) string {
|
|
|
@@ -578,12 +499,112 @@ func rewriteIframeURL(link string) string {
|
|
|
return link
|
|
|
}
|
|
|
|
|
|
-func isBlockedTag(tagName string) bool {
|
|
|
- switch tagName {
|
|
|
- case "noscript", "script", "style":
|
|
|
- return true
|
|
|
+func sanitizeAttributes(parsedBaseUrl *url.URL, tagName string, attributes []html.Attribute, sanitizerOptions *SanitizerOptions) ([]string, string) {
|
|
|
+ htmlAttrs := make([]string, 0, len(attributes))
|
|
|
+ attrNames := make([]string, 0, len(attributes))
|
|
|
+
|
|
|
+ var isAnchorLink bool
|
|
|
+ var isYouTubeEmbed bool
|
|
|
+
|
|
|
+ allowedAttributes, ok := allowedHTMLTagsAndAttributes[tagName]
|
|
|
+ if !ok {
|
|
|
+ // This should never happen, as the tag was validated in the caller of `sanitizeAttributes`
|
|
|
+ return []string{}, ""
|
|
|
}
|
|
|
- return false
|
|
|
+
|
|
|
+ for _, attribute := range attributes {
|
|
|
+ if !slices.Contains(allowedAttributes, attribute.Key) {
|
|
|
+ continue
|
|
|
+ }
|
|
|
+
|
|
|
+ value := attribute.Val
|
|
|
+
|
|
|
+ switch tagName {
|
|
|
+ case "math":
|
|
|
+ if attribute.Key == "xmlns" {
|
|
|
+ if value != "http://www.w3.org/1998/Math/MathML" {
|
|
|
+ value = "http://www.w3.org/1998/Math/MathML"
|
|
|
+ }
|
|
|
+ }
|
|
|
+ case "img":
|
|
|
+ switch attribute.Key {
|
|
|
+ case "fetchpriority":
|
|
|
+ if !isValidFetchPriorityValue(value) {
|
|
|
+ continue
|
|
|
+ }
|
|
|
+ case "decoding":
|
|
|
+ if !isValidDecodingValue(value) {
|
|
|
+ continue
|
|
|
+ }
|
|
|
+ case "width", "height":
|
|
|
+ if !isPositiveInteger(value) {
|
|
|
+ continue
|
|
|
+ }
|
|
|
+ case "srcset":
|
|
|
+ value = sanitizeSrcsetAttr(parsedBaseUrl, value)
|
|
|
+ if value == "" {
|
|
|
+ continue
|
|
|
+ }
|
|
|
+ }
|
|
|
+ case "source":
|
|
|
+ if attribute.Key == "srcset" {
|
|
|
+ value = sanitizeSrcsetAttr(parsedBaseUrl, value)
|
|
|
+ if value == "" {
|
|
|
+ continue
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ if isExternalResourceAttribute(attribute.Key) {
|
|
|
+ switch {
|
|
|
+ case tagName == "iframe":
|
|
|
+ iframeSourceDomain, trustedIframeDomain := findAllowedIframeSourceDomain(attribute.Val)
|
|
|
+ if !trustedIframeDomain {
|
|
|
+ continue
|
|
|
+ }
|
|
|
+
|
|
|
+ value = rewriteIframeURL(attribute.Val)
|
|
|
+
|
|
|
+ if iframeSourceDomain == "youtube.com" || iframeSourceDomain == "youtube-nocookie.com" {
|
|
|
+ isYouTubeEmbed = true
|
|
|
+ }
|
|
|
+ case tagName == "img" && attribute.Key == "src" && isValidDataAttribute(attribute.Val):
|
|
|
+ value = attribute.Val
|
|
|
+ case tagName == "a" && attribute.Key == "href" && strings.HasPrefix(attribute.Val, "#"):
|
|
|
+ value = attribute.Val
|
|
|
+ isAnchorLink = true
|
|
|
+ default:
|
|
|
+ var err error
|
|
|
+ value, err = urllib.ResolveToAbsoluteURLWithParsedBaseURL(parsedBaseUrl, value)
|
|
|
+ if err != nil {
|
|
|
+ continue
|
|
|
+ }
|
|
|
+
|
|
|
+ if !hasValidURIScheme(value) || isBlockedResource(value) {
|
|
|
+ continue
|
|
|
+ }
|
|
|
+
|
|
|
+ // TODO use feedURL instead of baseURL twice.
|
|
|
+ parsedValueUrl, _ := url.Parse(value)
|
|
|
+ if cleanedURL, err := urlcleaner.RemoveTrackingParameters(parsedBaseUrl, parsedBaseUrl, parsedValueUrl); err == nil {
|
|
|
+ value = cleanedURL
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ attrNames = append(attrNames, attribute.Key)
|
|
|
+ htmlAttrs = append(htmlAttrs, attribute.Key+`="`+html.EscapeString(value)+`"`)
|
|
|
+ }
|
|
|
+
|
|
|
+ if !isAnchorLink {
|
|
|
+ extraAttrNames, extraHTMLAttributes := getExtraAttributes(tagName, isYouTubeEmbed, sanitizerOptions)
|
|
|
+ if len(extraAttrNames) > 0 {
|
|
|
+ attrNames = append(attrNames, extraAttrNames...)
|
|
|
+ htmlAttrs = append(htmlAttrs, extraHTMLAttributes...)
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ return attrNames, strings.Join(htmlAttrs, " ")
|
|
|
}
|
|
|
|
|
|
func sanitizeSrcsetAttr(parsedBaseURL *url.URL, value string) string {
|
|
|
@@ -611,37 +632,16 @@ func sanitizeSrcsetAttr(parsedBaseURL *url.URL, value string) string {
|
|
|
return imageCandidates(sanitizedCandidates).String()
|
|
|
}
|
|
|
|
|
|
-func isValidDataAttribute(value string) bool {
|
|
|
- for _, prefix := range dataAttributeAllowedPrefixes {
|
|
|
- if strings.HasPrefix(value, prefix) {
|
|
|
- return true
|
|
|
- }
|
|
|
- }
|
|
|
- return false
|
|
|
-}
|
|
|
-
|
|
|
-func isPositiveInteger(value string) bool {
|
|
|
- if value == "" {
|
|
|
- return false
|
|
|
- }
|
|
|
- if number, err := strconv.Atoi(value); err == nil {
|
|
|
- return number > 0
|
|
|
+func shouldIgnoreTag(n *html.Node, tag string) bool {
|
|
|
+ if isPixelTracker(tag, n.Attr) {
|
|
|
+ return true
|
|
|
}
|
|
|
- return false
|
|
|
-}
|
|
|
-
|
|
|
-func isValidFetchPriorityValue(value string) bool {
|
|
|
- switch value {
|
|
|
- case "high", "low", "auto":
|
|
|
+ if isBlockedTag(tag) {
|
|
|
return true
|
|
|
}
|
|
|
- return false
|
|
|
-}
|
|
|
-
|
|
|
-func isValidDecodingValue(value string) bool {
|
|
|
- switch value {
|
|
|
- case "sync", "async", "auto":
|
|
|
+ if isHidden(n) {
|
|
|
return true
|
|
|
}
|
|
|
+
|
|
|
return false
|
|
|
}
|