sanitizer.go 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624
  1. // SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
  2. // SPDX-License-Identifier: Apache-2.0
  3. package sanitizer // import "miniflux.app/v2/internal/reader/sanitizer"
  4. import (
  5. "io"
  6. "net/url"
  7. "slices"
  8. "strconv"
  9. "strings"
  10. "miniflux.app/v2/internal/config"
  11. "miniflux.app/v2/internal/reader/urlcleaner"
  12. "miniflux.app/v2/internal/urllib"
  13. "golang.org/x/net/html"
  14. )
  15. var (
  16. allowedHTMLTagsAndAttributes = map[string][]string{
  17. "a": {"href", "title", "id"},
  18. "abbr": {"title"},
  19. "acronym": {"title"},
  20. "aside": {},
  21. "audio": {"src"},
  22. "blockquote": {},
  23. "b": {},
  24. "br": {},
  25. "caption": {},
  26. "cite": {},
  27. "code": {},
  28. "dd": {"id"},
  29. "del": {},
  30. "dfn": {},
  31. "dl": {"id"},
  32. "dt": {"id"},
  33. "em": {},
  34. "figcaption": {},
  35. "figure": {},
  36. "h1": {"id"},
  37. "h2": {"id"},
  38. "h3": {"id"},
  39. "h4": {"id"},
  40. "h5": {"id"},
  41. "h6": {"id"},
  42. "hr": {},
  43. "i": {},
  44. "iframe": {"width", "height", "frameborder", "src", "allowfullscreen"},
  45. "img": {"alt", "title", "src", "srcset", "sizes", "width", "height", "fetchpriority", "decoding"},
  46. "ins": {},
  47. "kbd": {},
  48. "li": {"id"},
  49. "ol": {"id"},
  50. "p": {},
  51. "picture": {},
  52. "pre": {},
  53. "q": {"cite"},
  54. "rp": {},
  55. "rt": {},
  56. "rtc": {},
  57. "ruby": {},
  58. "s": {},
  59. "small": {},
  60. "samp": {},
  61. "source": {"src", "type", "srcset", "sizes", "media"},
  62. "strong": {},
  63. "sub": {},
  64. "sup": {"id"},
  65. "table": {},
  66. "td": {"rowspan", "colspan"},
  67. "tfoot": {},
  68. "th": {"rowspan", "colspan"},
  69. "thead": {},
  70. "time": {"datetime"},
  71. "tr": {},
  72. "u": {},
  73. "ul": {"id"},
  74. "var": {},
  75. "video": {"poster", "height", "width", "src"},
  76. "wbr": {},
  77. // MathML: https://w3c.github.io/mathml-core/ and https://developer.mozilla.org/en-US/docs/Web/MathML/Reference/Element
  78. "annotation": {},
  79. "annotation-xml": {},
  80. "maction": {},
  81. "math": {"xmlns"},
  82. "merror": {},
  83. "mfrac": {},
  84. "mi": {},
  85. "mmultiscripts": {},
  86. "mn": {},
  87. "mo": {},
  88. "mover": {},
  89. "mpadded": {},
  90. "mphantom": {},
  91. "mprescripts": {},
  92. "mroot": {},
  93. "mrow": {},
  94. "ms": {},
  95. "mspace": {},
  96. "msqrt": {},
  97. "mstyle": {},
  98. "msub": {},
  99. "msubsup": {},
  100. "msup": {},
  101. "mtable": {},
  102. "mtd": {},
  103. "mtext": {},
  104. "mtr": {},
  105. "munder": {},
  106. "munderover": {},
  107. "semantics": {},
  108. }
  109. iframeAllowList = map[string]struct{}{
  110. "bandcamp.com": {},
  111. "cdn.embedly.com": {},
  112. "dailymotion.com": {},
  113. "framatube.org": {},
  114. "open.spotify.com": {},
  115. "player.bilibili.com": {},
  116. "player.twitch.tv": {},
  117. "player.vimeo.com": {},
  118. "soundcloud.com": {},
  119. "vk.com": {},
  120. "w.soundcloud.com": {},
  121. "youtube-nocookie.com": {},
  122. "youtube.com": {},
  123. }
  124. blockedResourceURLSubstrings = []string{
  125. "api.flattr.com",
  126. "www.facebook.com/sharer.php",
  127. "feeds.feedburner.com",
  128. "feedsportal.com",
  129. "linkedin.com/shareArticle",
  130. "pinterest.com/pin/create/button/",
  131. "stats.wordpress.com",
  132. "twitter.com/intent/tweet",
  133. "twitter.com/share",
  134. "x.com/intent/tweet",
  135. "x.com/share",
  136. }
  137. dataAttributeAllowedPrefixes = []string{
  138. "data:image/avif",
  139. "data:image/apng",
  140. "data:image/png",
  141. "data:image/svg",
  142. "data:image/svg+xml",
  143. "data:image/jpg",
  144. "data:image/jpeg",
  145. "data:image/gif",
  146. "data:image/webp",
  147. }
  148. )
  149. // SanitizerOptions holds options for the HTML sanitizer.
  150. type SanitizerOptions struct {
  151. OpenLinksInNewTab bool
  152. }
  153. // SanitizeHTML takes raw HTML input and removes any disallowed tags and attributes.
  154. func SanitizeHTML(baseURL, rawHTML string, sanitizerOptions *SanitizerOptions) string {
  155. var buffer strings.Builder
  156. // Educated guess about how big the sanitized HTML will be,
  157. // to reduce the amount of buffer re-allocations in this function.
  158. estimatedRatio := len(rawHTML) * 3 / 4
  159. buffer.Grow(estimatedRatio)
  160. // We need to surround `rawHTML` with body tags so that html.Parse
  161. // will consider it a valid html document.
  162. doc, err := html.Parse(io.MultiReader(
  163. strings.NewReader("<body>"),
  164. strings.NewReader(rawHTML),
  165. strings.NewReader("</body>"),
  166. ))
  167. if err != nil {
  168. return ""
  169. }
  170. /* The structure of `doc` is always:
  171. <html>
  172. <head>...</head>
  173. <body>..</body>
  174. </html>
  175. */
  176. body := doc.FirstChild.FirstChild.NextSibling
  177. // Errors are a non-issue, so they're handled in filterAndRenderHTML
  178. parsedBaseUrl, _ := url.Parse(baseURL)
  179. for c := body.FirstChild; c != nil; c = c.NextSibling {
  180. if err := filterAndRenderHTML(&buffer, c, parsedBaseUrl, sanitizerOptions); err != nil {
  181. return ""
  182. }
  183. }
  184. return buffer.String()
  185. }
  186. func findAllowedIframeSourceDomain(iframeSourceURL string) (string, bool) {
  187. iframeSourceDomain := urllib.DomainWithoutWWW(iframeSourceURL)
  188. if _, ok := iframeAllowList[iframeSourceDomain]; ok {
  189. return iframeSourceDomain, true
  190. }
  191. if ytDomain := config.Opts.YouTubeEmbedDomain(); ytDomain != "" && iframeSourceDomain == strings.TrimPrefix(ytDomain, "www.") {
  192. return iframeSourceDomain, true
  193. }
  194. if invidiousInstance := config.Opts.InvidiousInstance(); invidiousInstance != "" && iframeSourceDomain == strings.TrimPrefix(invidiousInstance, "www.") {
  195. return iframeSourceDomain, true
  196. }
  197. return "", false
  198. }
  199. func filterAndRenderHTML(buf *strings.Builder, n *html.Node, parsedBaseUrl *url.URL, sanitizerOptions *SanitizerOptions) error {
  200. if n == nil {
  201. return nil
  202. }
  203. switch n.Type {
  204. case html.TextNode:
  205. buf.WriteString(html.EscapeString(n.Data))
  206. case html.ElementNode:
  207. tag := n.Data
  208. if shouldIgnoreTag(n, tag) {
  209. return nil
  210. }
  211. _, ok := allowedHTMLTagsAndAttributes[tag]
  212. if !ok {
  213. // The tag isn't allowed, but we're still interested in its content
  214. return filterAndRenderHTMLChildren(buf, n, parsedBaseUrl, sanitizerOptions)
  215. }
  216. htmlAttributes, hasAllRequiredAttributes := sanitizeAttributes(parsedBaseUrl, tag, n.Attr, sanitizerOptions)
  217. if !hasAllRequiredAttributes {
  218. if tag == "iframe" {
  219. // A blocked iframe should not have its inner content rendered.
  220. return nil
  221. }
  222. // The tag doesn't have every required attributes but we're still interested in its content
  223. return filterAndRenderHTMLChildren(buf, n, parsedBaseUrl, sanitizerOptions)
  224. }
  225. buf.WriteByte('<')
  226. buf.WriteString(n.Data)
  227. if htmlAttributes != "" {
  228. buf.WriteByte(' ')
  229. buf.WriteString(htmlAttributes)
  230. }
  231. buf.WriteByte('>')
  232. if isSelfContainedTag(tag) {
  233. return nil
  234. }
  235. if tag != "iframe" {
  236. // iframes aren't allowed to have child nodes.
  237. filterAndRenderHTMLChildren(buf, n, parsedBaseUrl, sanitizerOptions)
  238. }
  239. buf.WriteString("</")
  240. buf.WriteString(n.Data)
  241. buf.WriteByte('>')
  242. default:
  243. }
  244. return nil
  245. }
  246. func filterAndRenderHTMLChildren(buf *strings.Builder, n *html.Node, parsedBaseUrl *url.URL, sanitizerOptions *SanitizerOptions) error {
  247. for c := n.FirstChild; c != nil; c = c.NextSibling {
  248. if err := filterAndRenderHTML(buf, c, parsedBaseUrl, sanitizerOptions); err != nil {
  249. return err
  250. }
  251. }
  252. return nil
  253. }
  254. func hasRequiredAttributes(s *mandatoryAttributesStruct, tagName string) bool {
  255. switch tagName {
  256. case "a":
  257. return s.href
  258. case "iframe":
  259. return s.src
  260. case "source", "img":
  261. return s.src || s.srcset
  262. }
  263. return true
  264. }
  265. func isBlockedResource(absoluteURL string) bool {
  266. for _, blockedURL := range blockedResourceURLSubstrings {
  267. if strings.Contains(absoluteURL, blockedURL) {
  268. return true
  269. }
  270. }
  271. return false
  272. }
  273. func isBlockedTag(tagName string) bool {
  274. switch tagName {
  275. case "noscript", "script", "style":
  276. return true
  277. }
  278. return false
  279. }
  280. func isExternalResourceAttribute(attribute string) bool {
  281. switch attribute {
  282. case "src", "href", "poster", "cite":
  283. return true
  284. default:
  285. return false
  286. }
  287. }
  288. func isHidden(n *html.Node) bool {
  289. for _, attr := range n.Attr {
  290. if attr.Key == "hidden" {
  291. return true
  292. }
  293. }
  294. return false
  295. }
  296. func isPixelTracker(tagName string, attributes []html.Attribute) bool {
  297. if tagName != "img" {
  298. return false
  299. }
  300. hasHeight := false
  301. hasWidth := false
  302. for _, attribute := range attributes {
  303. if attribute.Val == "1" || attribute.Val == "0" {
  304. switch attribute.Key {
  305. case "height":
  306. hasHeight = true
  307. case "width":
  308. hasWidth = true
  309. }
  310. }
  311. }
  312. return hasHeight && hasWidth
  313. }
  314. func isPositiveInteger(value string) bool {
  315. if value == "" {
  316. return false
  317. }
  318. if number, err := strconv.Atoi(value); err == nil {
  319. return number > 0
  320. }
  321. return false
  322. }
  323. func isSelfContainedTag(tag string) bool {
  324. switch tag {
  325. case "area", "base", "br", "col", "embed", "hr", "img", "input",
  326. "link", "meta", "param", "source", "track", "wbr":
  327. return true
  328. }
  329. return false
  330. }
  331. func isValidDataAttribute(value string) bool {
  332. for _, prefix := range dataAttributeAllowedPrefixes {
  333. if strings.HasPrefix(value, prefix) {
  334. return true
  335. }
  336. }
  337. return false
  338. }
  339. func isValidDecodingValue(value string) bool {
  340. switch value {
  341. case "sync", "async", "auto":
  342. return true
  343. }
  344. return false
  345. }
  346. func isValidFetchPriorityValue(value string) bool {
  347. switch value {
  348. case "high", "low", "auto":
  349. return true
  350. }
  351. return false
  352. }
  353. func rewriteIframeURL(link string) string {
  354. u, err := url.Parse(link)
  355. if err != nil {
  356. return link
  357. }
  358. switch strings.TrimPrefix(u.Hostname(), "www.") {
  359. case "youtube.com":
  360. if pathWithoutEmbed, ok := strings.CutPrefix(u.Path, "/embed/"); ok {
  361. if len(u.RawQuery) > 0 {
  362. return config.Opts.YouTubeEmbedUrlOverride() + pathWithoutEmbed + "?" + u.RawQuery
  363. }
  364. return config.Opts.YouTubeEmbedUrlOverride() + pathWithoutEmbed
  365. }
  366. case "player.vimeo.com":
  367. // See https://help.vimeo.com/hc/en-us/articles/12426260232977-About-Player-parameters
  368. if strings.HasPrefix(u.Path, "/video/") {
  369. if len(u.RawQuery) > 0 {
  370. return link + "&dnt=1"
  371. }
  372. return link + "?dnt=1"
  373. }
  374. }
  375. return link
  376. }
  377. type mandatoryAttributesStruct struct {
  378. href bool
  379. src bool
  380. srcset bool
  381. }
  382. func trackAttributes(s *mandatoryAttributesStruct, attributeName string) {
  383. switch attributeName {
  384. case "href":
  385. s.href = true
  386. case "src":
  387. s.src = true
  388. case "srcset":
  389. s.srcset = true
  390. }
  391. }
  392. func sanitizeAttributes(parsedBaseUrl *url.URL, tagName string, attributes []html.Attribute, sanitizerOptions *SanitizerOptions) (string, bool) {
  393. var htmlAttrs strings.Builder
  394. // Rough estimate: most attributes are short; ~24 bytes (key + ="value") is
  395. // a reasonable starting point. Avoids early grows for typical elements.
  396. htmlAttrs.Grow(len(attributes) * 24)
  397. // writeAttr appends key="value" to htmlAttrs, prefixing with a single
  398. // space when not the first written attribute. value is HTML-escaped.
  399. writeAttr := func(key, value string) {
  400. htmlAttrs.WriteByte(' ')
  401. htmlAttrs.WriteString(key)
  402. htmlAttrs.WriteString(`="`)
  403. htmlAttrs.WriteString(html.EscapeString(value))
  404. htmlAttrs.WriteByte('"')
  405. }
  406. // Keep track of mandatory attributes for some tags
  407. mandatoryAttributes := mandatoryAttributesStruct{false, false, false}
  408. var isAnchorLink bool
  409. var isYouTubeEmbed bool
  410. // We know the element is present, as the tag was validated in the caller of `sanitizeAttributes`
  411. allowedAttributes := allowedHTMLTagsAndAttributes[tagName]
  412. for _, attribute := range attributes {
  413. if !slices.Contains(allowedAttributes, attribute.Key) {
  414. continue
  415. }
  416. value := attribute.Val
  417. switch tagName {
  418. case "math":
  419. if attribute.Key == "xmlns" {
  420. value = "http://www.w3.org/1998/Math/MathML"
  421. }
  422. case "img":
  423. switch attribute.Key {
  424. case "fetchpriority":
  425. if !isValidFetchPriorityValue(value) {
  426. continue
  427. }
  428. case "decoding":
  429. if !isValidDecodingValue(value) {
  430. continue
  431. }
  432. case "width", "height":
  433. if !isPositiveInteger(value) {
  434. continue
  435. }
  436. case "srcset":
  437. value = sanitizeSrcsetAttr(parsedBaseUrl, value)
  438. if value == "" {
  439. continue
  440. }
  441. }
  442. case "source":
  443. if attribute.Key == "srcset" {
  444. value = sanitizeSrcsetAttr(parsedBaseUrl, value)
  445. if value == "" {
  446. continue
  447. }
  448. }
  449. }
  450. if isExternalResourceAttribute(attribute.Key) {
  451. switch {
  452. case tagName == "iframe":
  453. iframeSourceDomain, trustedIframeDomain := findAllowedIframeSourceDomain(attribute.Val)
  454. if !trustedIframeDomain {
  455. return "", false
  456. }
  457. value = rewriteIframeURL(attribute.Val)
  458. if iframeSourceDomain == "youtube.com" || iframeSourceDomain == "youtube-nocookie.com" {
  459. isYouTubeEmbed = true
  460. }
  461. case tagName == "img" && attribute.Key == "src" && isValidDataAttribute(attribute.Val):
  462. value = attribute.Val
  463. case tagName == "a" && attribute.Key == "href" && strings.HasPrefix(attribute.Val, "#"):
  464. value = attribute.Val
  465. isAnchorLink = true
  466. default:
  467. if isBlockedResource(value) {
  468. return "", false
  469. }
  470. var err error
  471. value, err = urllib.ResolveToAbsoluteURLWithParsedBaseURL(parsedBaseUrl, value)
  472. if err != nil {
  473. continue
  474. }
  475. if !HasValidURIScheme(value) {
  476. continue
  477. }
  478. // Skip the parse + RemoveTrackingParameters round trip when there
  479. // is no query string to clean, which is common for <img>.
  480. if strings.IndexByte(value, '?') >= 0 {
  481. parsedValueUrl, _ := url.Parse(value)
  482. // TODO use feedURL instead of baseURL twice.
  483. if cleanedURL, err := urlcleaner.RemoveTrackingParameters(parsedBaseUrl, parsedBaseUrl, parsedValueUrl); err == nil {
  484. value = cleanedURL
  485. }
  486. }
  487. }
  488. }
  489. trackAttributes(&mandatoryAttributes, attribute.Key)
  490. writeAttr(attribute.Key, value)
  491. }
  492. if !hasRequiredAttributes(&mandatoryAttributes, tagName) {
  493. return "", false
  494. }
  495. if !isAnchorLink {
  496. switch tagName {
  497. case "a":
  498. writeAttr("rel", "noopener noreferrer")
  499. writeAttr("referrerpolicy", "no-referrer")
  500. if sanitizerOptions.OpenLinksInNewTab {
  501. writeAttr("target", "_blank")
  502. }
  503. case "video", "audio":
  504. htmlAttrs.WriteString(" controls")
  505. case "iframe":
  506. writeAttr("sandbox", "allow-scripts allow-same-origin allow-popups allow-popups-to-escape-sandbox")
  507. writeAttr("loading", "lazy")
  508. // Note: the referrerpolicy seems to be required to avoid YouTube error 153 video player configuration error
  509. // See https://developers.google.com/youtube/terms/required-minimum-functionality#embedded-player-api-client-identity
  510. if isYouTubeEmbed {
  511. writeAttr("referrerpolicy", "strict-origin-when-cross-origin")
  512. }
  513. case "img":
  514. writeAttr("loading", "lazy")
  515. }
  516. }
  517. return strings.TrimLeft(htmlAttrs.String(), " "), true
  518. }
  519. func sanitizeSrcsetAttr(parsedBaseURL *url.URL, value string) string {
  520. candidates := ParseSrcSetAttribute(value)
  521. if len(candidates) == 0 {
  522. return ""
  523. }
  524. sanitizedCandidates := make([]*imageCandidate, 0, len(candidates))
  525. for _, imageCandidate := range candidates {
  526. absoluteURL, err := urllib.ResolveToAbsoluteURLWithParsedBaseURL(parsedBaseURL, imageCandidate.ImageURL)
  527. if err != nil {
  528. continue
  529. }
  530. if !HasValidURIScheme(absoluteURL) || isBlockedResource(absoluteURL) {
  531. continue
  532. }
  533. imageCandidate.ImageURL = absoluteURL
  534. sanitizedCandidates = append(sanitizedCandidates, imageCandidate)
  535. }
  536. return imageCandidates(sanitizedCandidates).String()
  537. }
  538. func shouldIgnoreTag(n *html.Node, tag string) bool {
  539. if isPixelTracker(tag, n.Attr) {
  540. return true
  541. }
  542. if isBlockedTag(tag) {
  543. return true
  544. }
  545. if isHidden(n) {
  546. return true
  547. }
  548. return false
  549. }