sanitizer.go 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645646647648649650651652653654655656657658659660661662663664665666667668
  1. // SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
  2. // SPDX-License-Identifier: Apache-2.0
  3. package sanitizer // import "miniflux.app/v2/internal/reader/sanitizer"
  4. import (
  5. "errors"
  6. "net/url"
  7. "slices"
  8. "strconv"
  9. "strings"
  10. "miniflux.app/v2/internal/config"
  11. "miniflux.app/v2/internal/reader/urlcleaner"
  12. "miniflux.app/v2/internal/urllib"
  13. "golang.org/x/net/html"
  14. )
const (
	// maxDepth is the maximum allowed depth for nested HTML tags, the same as WebKit.
	// Rendering aborts with an error once this many levels have been descended.
	maxDepth = 512
)
var (
	// allowedHTMLTagsAndAttributes maps every tag that survives sanitization
	// to the list of attributes that may be kept on it. Tags missing from this
	// map are dropped, but their children are still rendered.
	allowedHTMLTagsAndAttributes = map[string][]string{
		"a":          {"href", "title", "id"},
		"abbr":       {"title"},
		"acronym":    {"title"},
		"aside":      {},
		"audio":      {"src"},
		"blockquote": {},
		"b":          {},
		"br":         {},
		"caption":    {},
		"cite":       {},
		"code":       {},
		"dd":         {"id"},
		"del":        {},
		"dfn":        {},
		"dl":         {"id"},
		"dt":         {"id"},
		"em":         {},
		"figcaption": {},
		"figure":     {},
		"h1":         {"id"},
		"h2":         {"id"},
		"h3":         {"id"},
		"h4":         {"id"},
		"h5":         {"id"},
		"h6":         {"id"},
		"hr":         {},
		"i":          {},
		"iframe":     {"width", "height", "frameborder", "src", "allowfullscreen"},
		"img":        {"alt", "title", "src", "srcset", "sizes", "width", "height", "fetchpriority", "decoding"},
		"ins":        {},
		"kbd":        {},
		"li":         {"id"},
		"ol":         {"id"},
		"p":          {},
		"picture":    {},
		"pre":        {},
		"q":          {"cite"},
		"rp":         {},
		"rt":         {},
		"rtc":        {},
		"ruby":       {},
		"s":          {},
		"small":      {},
		"samp":       {},
		"source":     {"src", "type", "srcset", "sizes", "media"},
		"strong":     {},
		"sub":        {},
		"sup":        {"id"},
		"table":      {},
		"td":         {"rowspan", "colspan"},
		"tfoot":      {},
		"th":         {"rowspan", "colspan"},
		"thead":      {},
		"time":       {"datetime"},
		"tr":         {},
		"u":          {},
		"ul":         {"id"},
		"var":        {},
		"video":      {"poster", "height", "width", "src"},
		"wbr":        {},

		// MathML: https://w3c.github.io/mathml-core/ and https://developer.mozilla.org/en-US/docs/Web/MathML/Reference/Element
		"annotation":     {},
		"annotation-xml": {},
		"maction":        {},
		"math":           {"xmlns"},
		"merror":         {},
		"mfrac":          {},
		"mi":             {},
		"mmultiscripts":  {},
		"mn":             {},
		"mo":             {},
		"mover":          {},
		"mpadded":        {},
		"mphantom":       {},
		"mprescripts":    {},
		"mroot":          {},
		"mrow":           {},
		"ms":             {},
		"mspace":         {},
		"msqrt":          {},
		"mstyle":         {},
		"msub":           {},
		"msubsup":        {},
		"msup":           {},
		"mtable":         {},
		"mtd":            {},
		"mtext":          {},
		"mtr":            {},
		"munder":         {},
		"munderover":     {},
		"semantics":      {},
	}

	// iframeAllowList is the set of domains from which iframes may be embedded.
	// It is looked up with the value returned by urllib.DomainWithoutWWW for the
	// iframe source URL.
	iframeAllowList = map[string]struct{}{
		"bandcamp.com":         {},
		"cdn.embedly.com":      {},
		"dailymotion.com":      {},
		"framatube.org":        {},
		"open.spotify.com":     {},
		"player.bilibili.com":  {},
		"player.twitch.tv":     {},
		"player.vimeo.com":     {},
		"soundcloud.com":       {},
		"vk.com":               {},
		"w.soundcloud.com":     {},
		"youtube-nocookie.com": {},
		"youtube.com":          {},
	}

	// blockedResourceURLSubstrings lists URL fragments of unwanted resources
	// (share buttons, statistics endpoints, ...). A URL is considered blocked
	// when it contains any of these substrings.
	blockedResourceURLSubstrings = []string{
		"api.flattr.com",
		"www.facebook.com/sharer.php",
		"feeds.feedburner.com",
		"feedsportal.com",
		"linkedin.com/shareArticle",
		"pinterest.com/pin/create/button/",
		"stats.wordpress.com",
		"twitter.com/intent/tweet",
		"twitter.com/share",
		"x.com/intent/tweet",
		"x.com/share",
	}

	// validURISchemes lists the scheme prefixes (including the trailing colon)
	// that an absolute URL must start with to be kept.
	// See https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml
	validURISchemes = []string{
		// Most common schemes on top.
		"https:",
		"http:",

		// Then the rest.
		"apt:",
		"bitcoin:",
		"callto:",
		"dav:",
		"davs:",
		"ed2k:",
		"facetime:",
		"feed:",
		"ftp:",
		"geo:",
		"git:",
		"gopher:",
		"irc:",
		"irc6:",
		"ircs:",
		"itms-apps:",
		"itms:",
		"magnet:",
		"mailto:",
		"news:",
		"nntp:",
		"rtmp:",
		"sftp:",
		"sip:",
		"sips:",
		"skype:",
		"spotify:",
		"ssh:",
		"steam:",
		"svn:",
		"svn+ssh:",
		"tel:",
		"webcal:",
		"xmpp:",

		// iOS Apps
		"opener:", // https://www.opener.link
		"hack:",   // https://apps.apple.com/it/app/hack-for-hacker-news-reader/id1464477788?l=en-GB
	}

	// dataAttributeAllowedPrefixes lists the data: URL prefixes that are kept
	// unchanged when they appear in the src attribute of an img tag.
	dataAttributeAllowedPrefixes = []string{
		"data:image/avif",
		"data:image/apng",
		"data:image/png",
		"data:image/svg",
		"data:image/svg+xml",
		"data:image/jpg",
		"data:image/jpeg",
		"data:image/gif",
		"data:image/webp",
	}
)
// SanitizerOptions holds options for the HTML sanitizer.
type SanitizerOptions struct {
	// OpenLinksInNewTab, when true, adds target="_blank" to sanitized links,
	// except in-page anchors whose href starts with "#".
	OpenLinksInNewTab bool
}
  200. // SanitizeHTML takes raw HTML input and removes any disallowed tags and attributes.
  201. func SanitizeHTML(baseURL, rawHTML string, sanitizerOptions *SanitizerOptions) string {
  202. var buffer strings.Builder
  203. // Educated guess about how big the sanitized HTML will be,
  204. // to reduce the amount of buffer re-allocations in this function.
  205. estimatedRatio := len(rawHTML) * 3 / 4
  206. buffer.Grow(estimatedRatio)
  207. // We need to surround `rawHTML` with body tags so that html.Parse
  208. // will consider it a valid html document.
  209. doc, err := html.Parse(strings.NewReader("<body>" + rawHTML + "</body>"))
  210. if err != nil {
  211. return ""
  212. }
  213. /* The structure of `doc` is always:
  214. <html>
  215. <head>...</head>
  216. <body>..</body>
  217. </html>
  218. */
  219. body := doc.FirstChild.FirstChild.NextSibling
  220. // Errors are a non-issue, so they're handled in filterAndRenderHTML
  221. parsedBaseUrl, _ := url.Parse(baseURL)
  222. for c := body.FirstChild; c != nil; c = c.NextSibling {
  223. // -2 because of `<html><body>…`
  224. if err := filterAndRenderHTML(&buffer, c, parsedBaseUrl, sanitizerOptions, maxDepth-2); err != nil {
  225. return ""
  226. }
  227. }
  228. return buffer.String()
  229. }
  230. func findAllowedIframeSourceDomain(iframeSourceURL string) (string, bool) {
  231. iframeSourceDomain := urllib.DomainWithoutWWW(iframeSourceURL)
  232. if _, ok := iframeAllowList[iframeSourceDomain]; ok {
  233. return iframeSourceDomain, true
  234. }
  235. if ytDomain := config.Opts.YouTubeEmbedDomain(); ytDomain != "" && iframeSourceDomain == strings.TrimPrefix(ytDomain, "www.") {
  236. return iframeSourceDomain, true
  237. }
  238. if invidiousInstance := config.Opts.InvidiousInstance(); invidiousInstance != "" && iframeSourceDomain == strings.TrimPrefix(invidiousInstance, "www.") {
  239. return iframeSourceDomain, true
  240. }
  241. return "", false
  242. }
  243. func filterAndRenderHTML(buf *strings.Builder, n *html.Node, parsedBaseUrl *url.URL, sanitizerOptions *SanitizerOptions, depth uint) error {
  244. if n == nil {
  245. return nil
  246. }
  247. if depth == 0 {
  248. return errors.New("maximum nested tags limit reached")
  249. }
  250. switch n.Type {
  251. case html.TextNode:
  252. buf.WriteString(html.EscapeString(n.Data))
  253. case html.ElementNode:
  254. tag := n.Data
  255. if shouldIgnoreTag(n, tag) {
  256. return nil
  257. }
  258. _, ok := allowedHTMLTagsAndAttributes[tag]
  259. if !ok {
  260. // The tag isn't allowed, but we're still interested in its content
  261. return filterAndRenderHTMLChildren(buf, n, parsedBaseUrl, sanitizerOptions, depth-1)
  262. }
  263. htmlAttributes, hasAllRequiredAttributes := sanitizeAttributes(parsedBaseUrl, tag, n.Attr, sanitizerOptions)
  264. if !hasAllRequiredAttributes {
  265. if tag == "iframe" {
  266. // A blocked iframe should not have its inner content rendered.
  267. return nil
  268. }
  269. // The tag doesn't have every required attributes but we're still interested in its content
  270. return filterAndRenderHTMLChildren(buf, n, parsedBaseUrl, sanitizerOptions, depth-1)
  271. }
  272. buf.WriteByte('<')
  273. buf.WriteString(n.Data)
  274. if htmlAttributes != "" {
  275. buf.WriteByte(' ')
  276. buf.WriteString(htmlAttributes)
  277. }
  278. buf.WriteByte('>')
  279. if isSelfContainedTag(tag) {
  280. return nil
  281. }
  282. if tag != "iframe" {
  283. // iframes aren't allowed to have child nodes.
  284. filterAndRenderHTMLChildren(buf, n, parsedBaseUrl, sanitizerOptions, depth-1)
  285. }
  286. buf.WriteString("</")
  287. buf.WriteString(n.Data)
  288. buf.WriteByte('>')
  289. default:
  290. }
  291. return nil
  292. }
  293. func filterAndRenderHTMLChildren(buf *strings.Builder, n *html.Node, parsedBaseUrl *url.URL, sanitizerOptions *SanitizerOptions, depth uint) error {
  294. for c := n.FirstChild; c != nil; c = c.NextSibling {
  295. if err := filterAndRenderHTML(buf, c, parsedBaseUrl, sanitizerOptions, depth); err != nil {
  296. return err
  297. }
  298. }
  299. return nil
  300. }
  301. func hasRequiredAttributes(s *mandatoryAttributesStruct, tagName string) bool {
  302. switch tagName {
  303. case "a":
  304. return s.href
  305. case "iframe":
  306. return s.src
  307. case "source", "img":
  308. return s.src || s.srcset
  309. }
  310. return true
  311. }
  312. func hasValidURIScheme(absoluteURL string) bool {
  313. for _, scheme := range validURISchemes {
  314. if strings.HasPrefix(absoluteURL, scheme) {
  315. return true
  316. }
  317. }
  318. return false
  319. }
  320. func isBlockedResource(absoluteURL string) bool {
  321. for _, blockedURL := range blockedResourceURLSubstrings {
  322. if strings.Contains(absoluteURL, blockedURL) {
  323. return true
  324. }
  325. }
  326. return false
  327. }
  328. func isBlockedTag(tagName string) bool {
  329. switch tagName {
  330. case "noscript", "script", "style":
  331. return true
  332. }
  333. return false
  334. }
  335. func isExternalResourceAttribute(attribute string) bool {
  336. switch attribute {
  337. case "src", "href", "poster", "cite":
  338. return true
  339. default:
  340. return false
  341. }
  342. }
  343. func isHidden(n *html.Node) bool {
  344. for _, attr := range n.Attr {
  345. if attr.Key == "hidden" {
  346. return true
  347. }
  348. }
  349. return false
  350. }
  351. func isPixelTracker(tagName string, attributes []html.Attribute) bool {
  352. if tagName != "img" {
  353. return false
  354. }
  355. hasHeight := false
  356. hasWidth := false
  357. for _, attribute := range attributes {
  358. if attribute.Val == "1" || attribute.Val == "0" {
  359. switch attribute.Key {
  360. case "height":
  361. hasHeight = true
  362. case "width":
  363. hasWidth = true
  364. }
  365. }
  366. }
  367. return hasHeight && hasWidth
  368. }
  369. func isPositiveInteger(value string) bool {
  370. if value == "" {
  371. return false
  372. }
  373. if number, err := strconv.Atoi(value); err == nil {
  374. return number > 0
  375. }
  376. return false
  377. }
  378. func isSelfContainedTag(tag string) bool {
  379. switch tag {
  380. case "area", "base", "br", "col", "embed", "hr", "img", "input",
  381. "link", "meta", "param", "source", "track", "wbr":
  382. return true
  383. }
  384. return false
  385. }
  386. func isValidDataAttribute(value string) bool {
  387. for _, prefix := range dataAttributeAllowedPrefixes {
  388. if strings.HasPrefix(value, prefix) {
  389. return true
  390. }
  391. }
  392. return false
  393. }
  394. func isValidDecodingValue(value string) bool {
  395. switch value {
  396. case "sync", "async", "auto":
  397. return true
  398. }
  399. return false
  400. }
  401. func isValidFetchPriorityValue(value string) bool {
  402. switch value {
  403. case "high", "low", "auto":
  404. return true
  405. }
  406. return false
  407. }
  408. func rewriteIframeURL(link string) string {
  409. u, err := url.Parse(link)
  410. if err != nil {
  411. return link
  412. }
  413. switch strings.TrimPrefix(u.Hostname(), "www.") {
  414. case "youtube.com":
  415. if pathWithoutEmbed, ok := strings.CutPrefix(u.Path, "/embed/"); ok {
  416. if len(u.RawQuery) > 0 {
  417. return config.Opts.YouTubeEmbedUrlOverride() + pathWithoutEmbed + "?" + u.RawQuery
  418. }
  419. return config.Opts.YouTubeEmbedUrlOverride() + pathWithoutEmbed
  420. }
  421. case "player.vimeo.com":
  422. // See https://help.vimeo.com/hc/en-us/articles/12426260232977-About-Player-parameters
  423. if strings.HasPrefix(u.Path, "/video/") {
  424. if len(u.RawQuery) > 0 {
  425. return link + "&dnt=1"
  426. }
  427. return link + "?dnt=1"
  428. }
  429. }
  430. return link
  431. }
// mandatoryAttributesStruct records which of the attributes that some tags
// require were kept while sanitizing a tag. It is filled in by
// trackAttributes and evaluated by hasRequiredAttributes.
type mandatoryAttributesStruct struct {
	href   bool // an "href" attribute was kept
	src    bool // a "src" attribute was kept
	srcset bool // a "srcset" attribute was kept
}
  437. func trackAttributes(s *mandatoryAttributesStruct, attributeName string) {
  438. switch attributeName {
  439. case "href":
  440. s.href = true
  441. case "src":
  442. s.src = true
  443. case "srcset":
  444. s.srcset = true
  445. }
  446. }
  447. func sanitizeAttributes(parsedBaseUrl *url.URL, tagName string, attributes []html.Attribute, sanitizerOptions *SanitizerOptions) (string, bool) {
  448. htmlAttrs := make([]string, 0, len(attributes))
  449. // Keep track of mandatory attributes for some tags
  450. mandatoryAttributes := mandatoryAttributesStruct{false, false, false}
  451. var isAnchorLink bool
  452. var isYouTubeEmbed bool
  453. // We know the element is present, as the tag was validated in the caller of `sanitizeAttributes`
  454. allowedAttributes := allowedHTMLTagsAndAttributes[tagName]
  455. for _, attribute := range attributes {
  456. if !slices.Contains(allowedAttributes, attribute.Key) {
  457. continue
  458. }
  459. value := attribute.Val
  460. switch tagName {
  461. case "math":
  462. if attribute.Key == "xmlns" {
  463. if value != "http://www.w3.org/1998/Math/MathML" {
  464. value = "http://www.w3.org/1998/Math/MathML"
  465. }
  466. }
  467. case "img":
  468. switch attribute.Key {
  469. case "fetchpriority":
  470. if !isValidFetchPriorityValue(value) {
  471. continue
  472. }
  473. case "decoding":
  474. if !isValidDecodingValue(value) {
  475. continue
  476. }
  477. case "width", "height":
  478. if !isPositiveInteger(value) {
  479. continue
  480. }
  481. case "srcset":
  482. value = sanitizeSrcsetAttr(parsedBaseUrl, value)
  483. if value == "" {
  484. continue
  485. }
  486. }
  487. case "source":
  488. if attribute.Key == "srcset" {
  489. value = sanitizeSrcsetAttr(parsedBaseUrl, value)
  490. if value == "" {
  491. continue
  492. }
  493. }
  494. }
  495. if isExternalResourceAttribute(attribute.Key) {
  496. switch {
  497. case tagName == "iframe":
  498. iframeSourceDomain, trustedIframeDomain := findAllowedIframeSourceDomain(attribute.Val)
  499. if !trustedIframeDomain {
  500. return "", false
  501. }
  502. value = rewriteIframeURL(attribute.Val)
  503. if iframeSourceDomain == "youtube.com" || iframeSourceDomain == "youtube-nocookie.com" {
  504. isYouTubeEmbed = true
  505. }
  506. case tagName == "img" && attribute.Key == "src" && isValidDataAttribute(attribute.Val):
  507. value = attribute.Val
  508. case tagName == "a" && attribute.Key == "href" && strings.HasPrefix(attribute.Val, "#"):
  509. value = attribute.Val
  510. isAnchorLink = true
  511. default:
  512. if isBlockedResource(value) {
  513. return "", false
  514. }
  515. var err error
  516. value, err = urllib.ResolveToAbsoluteURLWithParsedBaseURL(parsedBaseUrl, value)
  517. if err != nil {
  518. continue
  519. }
  520. if !hasValidURIScheme(value) {
  521. continue
  522. }
  523. // TODO use feedURL instead of baseURL twice.
  524. parsedValueUrl, _ := url.Parse(value)
  525. if cleanedURL, err := urlcleaner.RemoveTrackingParameters(parsedBaseUrl, parsedBaseUrl, parsedValueUrl); err == nil {
  526. value = cleanedURL
  527. }
  528. }
  529. }
  530. trackAttributes(&mandatoryAttributes, attribute.Key)
  531. htmlAttrs = append(htmlAttrs, attribute.Key+`="`+html.EscapeString(value)+`"`)
  532. }
  533. if !hasRequiredAttributes(&mandatoryAttributes, tagName) {
  534. return "", false
  535. }
  536. if !isAnchorLink {
  537. switch tagName {
  538. case "a":
  539. htmlAttrs = append(htmlAttrs, `rel="noopener noreferrer"`, `referrerpolicy="no-referrer"`)
  540. if sanitizerOptions.OpenLinksInNewTab {
  541. htmlAttrs = append(htmlAttrs, `target="_blank"`)
  542. }
  543. case "video", "audio":
  544. htmlAttrs = append(htmlAttrs, "controls")
  545. case "iframe":
  546. htmlAttrs = append(htmlAttrs, `sandbox="allow-scripts allow-same-origin allow-popups allow-popups-to-escape-sandbox"`, `loading="lazy"`)
  547. // Note: the referrerpolicy seems to be required to avoid YouTube error 153 video player configuration error
  548. // See https://developers.google.com/youtube/terms/required-minimum-functionality#embedded-player-api-client-identity
  549. if isYouTubeEmbed {
  550. htmlAttrs = append(htmlAttrs, `referrerpolicy="strict-origin-when-cross-origin"`)
  551. }
  552. case "img":
  553. htmlAttrs = append(htmlAttrs, `loading="lazy"`)
  554. }
  555. }
  556. return strings.Join(htmlAttrs, " "), true
  557. }
  558. func sanitizeSrcsetAttr(parsedBaseURL *url.URL, value string) string {
  559. candidates := ParseSrcSetAttribute(value)
  560. if len(candidates) == 0 {
  561. return ""
  562. }
  563. sanitizedCandidates := make([]*imageCandidate, 0, len(candidates))
  564. for _, imageCandidate := range candidates {
  565. absoluteURL, err := urllib.ResolveToAbsoluteURLWithParsedBaseURL(parsedBaseURL, imageCandidate.ImageURL)
  566. if err != nil {
  567. continue
  568. }
  569. if !hasValidURIScheme(absoluteURL) || isBlockedResource(absoluteURL) {
  570. continue
  571. }
  572. imageCandidate.ImageURL = absoluteURL
  573. sanitizedCandidates = append(sanitizedCandidates, imageCandidate)
  574. }
  575. return imageCandidates(sanitizedCandidates).String()
  576. }
  577. func shouldIgnoreTag(n *html.Node, tag string) bool {
  578. if isPixelTracker(tag, n.Attr) {
  579. return true
  580. }
  581. if isBlockedTag(tag) {
  582. return true
  583. }
  584. if isHidden(n) {
  585. return true
  586. }
  587. return false
  588. }