| 123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663 |
- // SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
- // SPDX-License-Identifier: Apache-2.0
- package sanitizer // import "miniflux.app/v2/internal/reader/sanitizer"
- import (
- "errors"
- "net/url"
- "slices"
- "strconv"
- "strings"
- "miniflux.app/v2/internal/config"
- "miniflux.app/v2/internal/reader/urlcleaner"
- "miniflux.app/v2/internal/urllib"
- "golang.org/x/net/html"
- )
// maxDepth is the maximum allowed depth for nested HTML tags, the same limit as WebKit.
const maxDepth = 512
// allowedHTMLTagsAndAttributes maps every tag name the sanitizer keeps to the
// attribute names that may survive on that tag. A tag that is absent from the
// map is not rendered; an empty list means the tag is kept without any of its
// original attributes.
var allowedHTMLTagsAndAttributes = map[string][]string{
	"a":          {"href", "title", "id"},
	"abbr":       {"title"},
	"acronym":    {"title"},
	"aside":      {},
	"audio":      {"src"},
	"blockquote": {},
	"b":          {},
	"br":         {},
	"caption":    {},
	"cite":       {},
	"code":       {},
	"dd":         {"id"},
	"del":        {},
	"dfn":        {},
	"dl":         {"id"},
	"dt":         {"id"},
	"em":         {},
	"figcaption": {},
	"figure":     {},
	"h1":         {"id"},
	"h2":         {"id"},
	"h3":         {"id"},
	"h4":         {"id"},
	"h5":         {"id"},
	"h6":         {"id"},
	"hr":         {},
	"i":          {},
	"iframe":     {"width", "height", "frameborder", "src", "allowfullscreen"},
	"img":        {"alt", "title", "src", "srcset", "sizes", "width", "height", "fetchpriority", "decoding"},
	"ins":        {},
	"kbd":        {},
	"li":         {"id"},
	"ol":         {"id"},
	"p":          {},
	"picture":    {},
	"pre":        {},
	"q":          {"cite"},
	"rp":         {},
	"rt":         {},
	"rtc":        {},
	"ruby":       {},
	"s":          {},
	"small":      {},
	"samp":       {},
	"source":     {"src", "type", "srcset", "sizes", "media"},
	"strong":     {},
	"sub":        {},
	"sup":        {"id"},
	"table":      {},
	"td":         {"rowspan", "colspan"},
	"tfoot":      {},
	"th":         {"rowspan", "colspan"},
	"thead":      {},
	"time":       {"datetime"},
	"tr":         {},
	"u":          {},
	"ul":         {"id"},
	"var":        {},
	"video":      {"poster", "height", "width", "src"},
	"wbr":        {},

	// MathML: https://w3c.github.io/mathml-core/ and https://developer.mozilla.org/en-US/docs/Web/MathML/Reference/Element
	"annotation":     {},
	"annotation-xml": {},
	"maction":        {},
	"math":           {"xmlns"},
	"merror":         {},
	"mfrac":          {},
	"mi":             {},
	"mmultiscripts":  {},
	"mn":             {},
	"mo":             {},
	"mover":          {},
	"mpadded":        {},
	"mphantom":       {},
	"mprescripts":    {},
	"mroot":          {},
	"mrow":           {},
	"ms":             {},
	"mspace":         {},
	"msqrt":          {},
	"mstyle":         {},
	"msub":           {},
	"msubsup":        {},
	"msup":           {},
	"mtable":         {},
	"mtd":            {},
	"mtext":          {},
	"mtr":            {},
	"munder":         {},
	"munderover":     {},
	"semantics":      {},
}

// iframeAllowList is the set of domains an iframe source may point to.
var iframeAllowList = map[string]struct{}{
	"bandcamp.com":         {},
	"cdn.embedly.com":      {},
	"dailymotion.com":      {},
	"open.spotify.com":     {},
	"player.bilibili.com":  {},
	"player.twitch.tv":     {},
	"player.vimeo.com":     {},
	"soundcloud.com":       {},
	"vk.com":               {},
	"w.soundcloud.com":     {},
	"youtube-nocookie.com": {},
	"youtube.com":          {},
}

// blockedResourceURLSubstrings lists the substrings that mark a URL as a
// blocked resource wherever they appear in it.
var blockedResourceURLSubstrings = []string{
	"api.flattr.com",
	"www.facebook.com/sharer.php",
	"feeds.feedburner.com",
	"feedsportal.com",
	"linkedin.com/shareArticle",
	"pinterest.com/pin/create/button/",
	"stats.wordpress.com",
	"twitter.com/intent/tweet",
	"twitter.com/share",
	"x.com/intent/tweet",
	"x.com/share",
}

// validURISchemes lists the accepted URI scheme prefixes, each including its
// trailing colon.
//
// See https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml
var validURISchemes = []string{
	// The most common schemes come first.
	"https:",
	"http:",

	// Then the rest.
	"apt:",
	"bitcoin:",
	"callto:",
	"dav:",
	"davs:",
	"ed2k:",
	"facetime:",
	"feed:",
	"ftp:",
	"geo:",
	"git:",
	"gopher:",
	"irc:",
	"irc6:",
	"ircs:",
	"itms-apps:",
	"itms:",
	"magnet:",
	"mailto:",
	"news:",
	"nntp:",
	"rtmp:",
	"sftp:",
	"sip:",
	"sips:",
	"skype:",
	"spotify:",
	"ssh:",
	"steam:",
	"svn:",
	"svn+ssh:",
	"tel:",
	"webcal:",
	"xmpp:",

	// iOS Apps
	"opener:", // https://www.opener.link
	"hack:",   // https://apps.apple.com/it/app/hack-for-hacker-news-reader/id1464477788?l=en-GB
}

// dataAttributeAllowedPrefixes lists the prefixes a "data:" URL must start
// with to be accepted as an inline image.
var dataAttributeAllowedPrefixes = []string{
	"data:image/avif",
	"data:image/apng",
	"data:image/png",
	"data:image/svg",
	"data:image/svg+xml",
	"data:image/jpg",
	"data:image/jpeg",
	"data:image/gif",
	"data:image/webp",
}
// SanitizerOptions groups the settings that tune the output of the HTML sanitizer.
type SanitizerOptions struct {
	// OpenLinksInNewTab, when true, adds target="_blank" to rendered links
	// that are not same-page fragment links.
	OpenLinksInNewTab bool
}
- // SanitizeHTML takes raw HTML input and removes any disallowed tags and attributes.
- func SanitizeHTML(baseURL, rawHTML string, sanitizerOptions *SanitizerOptions) string {
- var buffer strings.Builder
- // Educated guess about how big the sanitized HTML will be,
- // to reduce the amount of buffer re-allocations in this function.
- estimatedRatio := len(rawHTML) * 3 / 4
- buffer.Grow(estimatedRatio)
- // We need to surround `rawHTML` with body tags so that html.Parse
- // will consider it a valid html document.
- doc, err := html.Parse(strings.NewReader("<body>" + rawHTML + "</body>"))
- if err != nil {
- return ""
- }
- /* The structure of `doc` is always:
- <html>
- <head>...</head>
- <body>..</body>
- </html>
- */
- body := doc.FirstChild.FirstChild.NextSibling
- // Errors are a non-issue, so they're handled in filterAndRenderHTML
- parsedBaseUrl, _ := url.Parse(baseURL)
- for c := body.FirstChild; c != nil; c = c.NextSibling {
- // -2 because of `<html><body>…`
- if err := filterAndRenderHTML(&buffer, c, parsedBaseUrl, sanitizerOptions, maxDepth-2); err != nil {
- return ""
- }
- }
- return buffer.String()
- }
- func findAllowedIframeSourceDomain(iframeSourceURL string) (string, bool) {
- iframeSourceDomain := urllib.DomainWithoutWWW(iframeSourceURL)
- if _, ok := iframeAllowList[iframeSourceDomain]; ok {
- return iframeSourceDomain, true
- }
- if ytDomain := config.Opts.YouTubeEmbedDomain(); ytDomain != "" && iframeSourceDomain == strings.TrimPrefix(ytDomain, "www.") {
- return iframeSourceDomain, true
- }
- if invidiousInstance := config.Opts.InvidiousInstance(); invidiousInstance != "" && iframeSourceDomain == strings.TrimPrefix(invidiousInstance, "www.") {
- return iframeSourceDomain, true
- }
- return "", false
- }
- func filterAndRenderHTML(buf *strings.Builder, n *html.Node, parsedBaseUrl *url.URL, sanitizerOptions *SanitizerOptions, depth uint) error {
- if n == nil {
- return nil
- }
- if depth == 0 {
- return errors.New("maximum nested tags limit reached")
- }
- switch n.Type {
- case html.TextNode:
- buf.WriteString(html.EscapeString(n.Data))
- case html.ElementNode:
- tag := strings.ToLower(n.Data)
- if shouldIgnoreTag(n, tag) {
- return nil
- }
- _, ok := allowedHTMLTagsAndAttributes[tag]
- if !ok {
- // The tag isn't allowed, but we're still interested in its content
- return filterAndRenderHTMLChildren(buf, n, parsedBaseUrl, sanitizerOptions, depth-1)
- }
- htmlAttributes, hasAllRequiredAttributes := sanitizeAttributes(parsedBaseUrl, tag, n.Attr, sanitizerOptions)
- if !hasAllRequiredAttributes {
- // The tag doesn't have every required attributes but we're still interested in its content
- return filterAndRenderHTMLChildren(buf, n, parsedBaseUrl, sanitizerOptions, depth-1)
- }
- buf.WriteByte('<')
- buf.WriteString(n.Data)
- if htmlAttributes != "" {
- buf.WriteByte(' ')
- buf.WriteString(htmlAttributes)
- }
- buf.WriteByte('>')
- if isSelfContainedTag(tag) {
- return nil
- }
- if tag != "iframe" {
- // iframes aren't allowed to have child nodes.
- filterAndRenderHTMLChildren(buf, n, parsedBaseUrl, sanitizerOptions, depth-1)
- }
- buf.WriteString("</")
- buf.WriteString(n.Data)
- buf.WriteByte('>')
- default:
- }
- return nil
- }
- func filterAndRenderHTMLChildren(buf *strings.Builder, n *html.Node, parsedBaseUrl *url.URL, sanitizerOptions *SanitizerOptions, depth uint) error {
- for c := n.FirstChild; c != nil; c = c.NextSibling {
- if err := filterAndRenderHTML(buf, c, parsedBaseUrl, sanitizerOptions, depth); err != nil {
- return err
- }
- }
- return nil
- }
- func hasRequiredAttributes(s *mandatoryAttributesStruct, tagName string) bool {
- switch tagName {
- case "a":
- return s.href
- case "iframe":
- return s.src
- case "source", "img":
- return s.src || s.srcset
- }
- return true
- }
- func hasValidURIScheme(absoluteURL string) bool {
- for _, scheme := range validURISchemes {
- if strings.HasPrefix(absoluteURL, scheme) {
- return true
- }
- }
- return false
- }
- func isBlockedResource(absoluteURL string) bool {
- for _, blockedURL := range blockedResourceURLSubstrings {
- if strings.Contains(absoluteURL, blockedURL) {
- return true
- }
- }
- return false
- }
// isBlockedTag reports whether tagName is "noscript", "script" or "style".
// The comparison is case-sensitive.
func isBlockedTag(tagName string) bool {
	return tagName == "noscript" || tagName == "script" || tagName == "style"
}
// isExternalResourceAttribute reports whether attribute is one of "src",
// "href", "poster" or "cite".
func isExternalResourceAttribute(attribute string) bool {
	if attribute == "src" || attribute == "href" {
		return true
	}
	return attribute == "poster" || attribute == "cite"
}
- func isHidden(n *html.Node) bool {
- for _, attr := range n.Attr {
- if attr.Key == "hidden" {
- return true
- }
- }
- return false
- }
- func isPixelTracker(tagName string, attributes []html.Attribute) bool {
- if tagName != "img" {
- return false
- }
- hasHeight := false
- hasWidth := false
- for _, attribute := range attributes {
- if attribute.Val == "1" || attribute.Val == "0" {
- switch attribute.Key {
- case "height":
- hasHeight = true
- case "width":
- hasWidth = true
- }
- }
- }
- return hasHeight && hasWidth
- }
// isPositiveInteger reports whether value is accepted by strconv.Atoi and
// denotes a number strictly greater than zero. An empty string is rejected
// because strconv.Atoi fails on it.
func isPositiveInteger(value string) bool {
	number, err := strconv.Atoi(value)
	return err == nil && number > 0
}
// isSelfContainedTag reports whether tag is one of the tags that are rendered
// without content and without an end tag.
func isSelfContainedTag(tag string) bool {
	switch tag {
	case "area", "base", "br", "col", "embed", "hr", "img":
		return true
	case "input", "link", "meta", "param", "source", "track", "wbr":
		return true
	default:
		return false
	}
}
- func isValidDataAttribute(value string) bool {
- for _, prefix := range dataAttributeAllowedPrefixes {
- if strings.HasPrefix(value, prefix) {
- return true
- }
- }
- return false
- }
// isValidDecodingValue reports whether value is "sync", "async" or "auto".
// The comparison is case-sensitive.
func isValidDecodingValue(value string) bool {
	return value == "sync" || value == "async" || value == "auto"
}
// isValidFetchPriorityValue reports whether value is "high", "low" or "auto".
// The comparison is case-sensitive.
func isValidFetchPriorityValue(value string) bool {
	return value == "high" || value == "low" || value == "auto"
}
- func rewriteIframeURL(link string) string {
- u, err := url.Parse(link)
- if err != nil {
- return link
- }
- switch strings.TrimPrefix(u.Hostname(), "www.") {
- case "youtube.com":
- if pathWithoutEmbed, ok := strings.CutPrefix(u.Path, "/embed/"); ok {
- if len(u.RawQuery) > 0 {
- return config.Opts.YouTubeEmbedUrlOverride() + pathWithoutEmbed + "?" + u.RawQuery
- }
- return config.Opts.YouTubeEmbedUrlOverride() + pathWithoutEmbed
- }
- case "player.vimeo.com":
- // See https://help.vimeo.com/hc/en-us/articles/12426260232977-About-Player-parameters
- if strings.HasPrefix(u.Path, "/video/") {
- if len(u.RawQuery) > 0 {
- return link + "&dnt=1"
- }
- return link + "?dnt=1"
- }
- }
- return link
- }
// mandatoryAttributesStruct records which of the attributes that some tags
// require have been seen on an element. The field order is href, src, srcset.
type mandatoryAttributesStruct struct {
	href, src, srcset bool
}
- func trackAttributes(s *mandatoryAttributesStruct, attributeName string) {
- switch attributeName {
- case "href":
- s.href = true
- case "src":
- s.src = true
- case "srcset":
- s.srcset = true
- }
- }
- func sanitizeAttributes(parsedBaseUrl *url.URL, tagName string, attributes []html.Attribute, sanitizerOptions *SanitizerOptions) (string, bool) {
- htmlAttrs := make([]string, 0, len(attributes))
- // Keep track of mandatory attributes for some tags
- mandatoryAttributes := mandatoryAttributesStruct{false, false, false}
- var isAnchorLink bool
- var isYouTubeEmbed bool
- // We know the element is present, as the tag was validated in the caller of `sanitizeAttributes`
- allowedAttributes := allowedHTMLTagsAndAttributes[tagName]
- for _, attribute := range attributes {
- if !slices.Contains(allowedAttributes, attribute.Key) {
- continue
- }
- value := attribute.Val
- switch tagName {
- case "math":
- if attribute.Key == "xmlns" {
- if value != "http://www.w3.org/1998/Math/MathML" {
- value = "http://www.w3.org/1998/Math/MathML"
- }
- }
- case "img":
- switch attribute.Key {
- case "fetchpriority":
- if !isValidFetchPriorityValue(value) {
- continue
- }
- case "decoding":
- if !isValidDecodingValue(value) {
- continue
- }
- case "width", "height":
- if !isPositiveInteger(value) {
- continue
- }
- case "srcset":
- value = sanitizeSrcsetAttr(parsedBaseUrl, value)
- if value == "" {
- continue
- }
- }
- case "source":
- if attribute.Key == "srcset" {
- value = sanitizeSrcsetAttr(parsedBaseUrl, value)
- if value == "" {
- continue
- }
- }
- }
- if isExternalResourceAttribute(attribute.Key) {
- switch {
- case tagName == "iframe":
- iframeSourceDomain, trustedIframeDomain := findAllowedIframeSourceDomain(attribute.Val)
- if !trustedIframeDomain {
- return "", false
- }
- value = rewriteIframeURL(attribute.Val)
- if iframeSourceDomain == "youtube.com" || iframeSourceDomain == "youtube-nocookie.com" {
- isYouTubeEmbed = true
- }
- case tagName == "img" && attribute.Key == "src" && isValidDataAttribute(attribute.Val):
- value = attribute.Val
- case tagName == "a" && attribute.Key == "href" && strings.HasPrefix(attribute.Val, "#"):
- value = attribute.Val
- isAnchorLink = true
- default:
- if isBlockedResource(value) {
- return "", false
- }
- var err error
- value, err = urllib.ResolveToAbsoluteURLWithParsedBaseURL(parsedBaseUrl, value)
- if err != nil {
- continue
- }
- if !hasValidURIScheme(value) {
- continue
- }
- // TODO use feedURL instead of baseURL twice.
- parsedValueUrl, _ := url.Parse(value)
- if cleanedURL, err := urlcleaner.RemoveTrackingParameters(parsedBaseUrl, parsedBaseUrl, parsedValueUrl); err == nil {
- value = cleanedURL
- }
- }
- }
- trackAttributes(&mandatoryAttributes, attribute.Key)
- htmlAttrs = append(htmlAttrs, attribute.Key+`="`+html.EscapeString(value)+`"`)
- }
- if !hasRequiredAttributes(&mandatoryAttributes, tagName) {
- return "", false
- }
- if !isAnchorLink {
- switch tagName {
- case "a":
- htmlAttrs = append(htmlAttrs, `rel="noopener noreferrer"`, `referrerpolicy="no-referrer"`)
- if sanitizerOptions.OpenLinksInNewTab {
- htmlAttrs = append(htmlAttrs, `target="_blank"`)
- }
- case "video", "audio":
- htmlAttrs = append(htmlAttrs, "controls")
- case "iframe":
- htmlAttrs = append(htmlAttrs, `sandbox="allow-scripts allow-same-origin allow-popups allow-popups-to-escape-sandbox"`, `loading="lazy"`)
- // Note: the referrerpolicy seems to be required to avoid YouTube error 153 video player configuration error
- // See https://developers.google.com/youtube/terms/required-minimum-functionality#embedded-player-api-client-identity
- if isYouTubeEmbed {
- htmlAttrs = append(htmlAttrs, `referrerpolicy="strict-origin-when-cross-origin"`)
- }
- case "img":
- htmlAttrs = append(htmlAttrs, `loading="lazy"`)
- }
- }
- return strings.Join(htmlAttrs, " "), true
- }
- func sanitizeSrcsetAttr(parsedBaseURL *url.URL, value string) string {
- candidates := ParseSrcSetAttribute(value)
- if len(candidates) == 0 {
- return ""
- }
- sanitizedCandidates := make([]*imageCandidate, 0, len(candidates))
- for _, imageCandidate := range candidates {
- absoluteURL, err := urllib.ResolveToAbsoluteURLWithParsedBaseURL(parsedBaseURL, imageCandidate.ImageURL)
- if err != nil {
- continue
- }
- if !hasValidURIScheme(absoluteURL) || isBlockedResource(absoluteURL) {
- continue
- }
- imageCandidate.ImageURL = absoluteURL
- sanitizedCandidates = append(sanitizedCandidates, imageCandidate)
- }
- return imageCandidates(sanitizedCandidates).String()
- }
- func shouldIgnoreTag(n *html.Node, tag string) bool {
- if isPixelTracker(tag, n.Attr) {
- return true
- }
- if isBlockedTag(tag) {
- return true
- }
- if isHidden(n) {
- return true
- }
- return false
- }
|