sanitizer.go 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631
  1. // SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
  2. // SPDX-License-Identifier: Apache-2.0
  3. package sanitizer // import "miniflux.app/v2/internal/reader/sanitizer"
  4. import (
  5. "net/url"
  6. "slices"
  7. "strconv"
  8. "strings"
  9. "miniflux.app/v2/internal/config"
  10. "miniflux.app/v2/internal/reader/urlcleaner"
  11. "miniflux.app/v2/internal/urllib"
  12. "golang.org/x/net/html"
  13. )
  var (
  	// allowedHTMLTagsAndAttributes maps every tag the sanitizer is willing to
  	// render to the attribute names that may be kept on that tag. A tag missing
  	// from this map is not rendered itself, although its children are still
  	// processed (see filterAndRenderHTML).
  	allowedHTMLTagsAndAttributes = map[string][]string{
  		"a":          {"href", "title", "id"},
  		"abbr":       {"title"},
  		"acronym":    {"title"},
  		"aside":      {},
  		"audio":      {"src"},
  		"blockquote": {},
  		"b":          {},
  		"br":         {},
  		"caption":    {},
  		"cite":       {},
  		"code":       {},
  		"dd":         {"id"},
  		"del":        {},
  		"dfn":        {},
  		"dl":         {"id"},
  		"dt":         {"id"},
  		"em":         {},
  		"figcaption": {},
  		"figure":     {},
  		"h1":         {"id"},
  		"h2":         {"id"},
  		"h3":         {"id"},
  		"h4":         {"id"},
  		"h5":         {"id"},
  		"h6":         {"id"},
  		"hr":         {},
  		"i":          {},
  		"iframe":     {"width", "height", "frameborder", "src", "allowfullscreen"},
  		"img":        {"alt", "title", "src", "srcset", "sizes", "width", "height", "fetchpriority", "decoding"},
  		"ins":        {},
  		"kbd":        {},
  		"li":         {"id"},
  		"ol":         {"id"},
  		"p":          {},
  		"picture":    {},
  		"pre":        {},
  		"q":          {"cite"},
  		"rp":         {},
  		"rt":         {},
  		"rtc":        {},
  		"ruby":       {},
  		"s":          {},
  		"small":      {},
  		"samp":       {},
  		"source":     {"src", "type", "srcset", "sizes", "media"},
  		"strong":     {},
  		"sub":        {},
  		"sup":        {"id"},
  		"table":      {},
  		"td":         {"rowspan", "colspan"},
  		"tfoot":      {},
  		"th":         {"rowspan", "colspan"},
  		"thead":      {},
  		"time":       {"datetime"},
  		"tr":         {},
  		"u":          {},
  		"ul":         {"id"},
  		"var":        {},
  		"video":      {"poster", "height", "width", "src"},
  		"wbr":        {},

  		// MathML: https://w3c.github.io/mathml-core/ and https://developer.mozilla.org/en-US/docs/Web/MathML/Reference/Element
  		"annotation":     {},
  		"annotation-xml": {},
  		"maction":        {},
  		"math":           {"xmlns"},
  		"merror":         {},
  		"mfrac":          {},
  		"mi":             {},
  		"mmultiscripts":  {},
  		"mn":             {},
  		"mo":             {},
  		"mover":          {},
  		"mpadded":        {},
  		"mphantom":       {},
  		"mprescripts":    {},
  		"mroot":          {},
  		"mrow":           {},
  		"ms":             {},
  		"mspace":         {},
  		"msqrt":          {},
  		"mstyle":         {},
  		"msub":           {},
  		"msubsup":        {},
  		"msup":           {},
  		"mtable":         {},
  		"mtd":            {},
  		"mtext":          {},
  		"mtr":            {},
  		"munder":         {},
  		"munderover":     {},
  		"semantics":      {},
  	}

  	// iframeAllowList is the set of domains (compared after the leading "www."
  	// has been removed) whose pages may be embedded through an iframe.
  	iframeAllowList = map[string]struct{}{
  		"bandcamp.com":         {},
  		"cdn.embedly.com":      {},
  		"dailymotion.com":      {},
  		"open.spotify.com":     {},
  		"player.bilibili.com":  {},
  		"player.twitch.tv":     {},
  		"player.vimeo.com":     {},
  		"soundcloud.com":       {},
  		"vk.com":               {},
  		"w.soundcloud.com":     {},
  		"youtube-nocookie.com": {},
  		"youtube.com":          {},
  	}

  	// blockedResourceURLSubstrings lists fragments that, when found anywhere in
  	// a resolved resource URL, cause the attribute carrying that URL to be dropped.
  	blockedResourceURLSubstrings = []string{
  		"api.flattr.com",
  		"feeds.feedburner.com",
  		"feedsportal.com",
  		"pinterest.com/pin/create/button/",
  		"stats.wordpress.com",
  		"twitter.com/intent/tweet",
  		"twitter.com/share",
  		"facebook.com/sharer.php",
  		"linkedin.com/shareArticle",
  	}

  	// validURISchemes lists the URL prefixes accepted by hasValidURIScheme.
  	// See https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml
  	validURISchemes = []string{
  		// Most common schemes on top.
  		"https:",
  		"http:",

  		// Then the rest.
  		"apt:",
  		"bitcoin:",
  		"callto:",
  		"dav:",
  		"davs:",
  		"ed2k:",
  		"facetime:",
  		"feed:",
  		"ftp:",
  		"geo:",
  		"git:",
  		"gopher:",
  		"irc:",
  		"irc6:",
  		"ircs:",
  		"itms-apps:",
  		"itms:",
  		"magnet:",
  		"mailto:",
  		"news:",
  		"nntp:",
  		"rtmp:",
  		"sftp:",
  		"sip:",
  		"sips:",
  		"skype:",
  		"spotify:",
  		"ssh:",
  		"steam:",
  		"svn:",
  		"svn+ssh:",
  		"tel:",
  		"webcal:",
  		"xmpp:",

  		// iOS Apps
  		"opener:", // https://www.opener.link
  		"hack:",   // https://apps.apple.com/it/app/hack-for-hacker-news-reader/id1464477788?l=en-GB
  	}

  	// dataAttributeAllowedPrefixes lists the data-URL prefixes that are kept
  	// verbatim when they appear in an <img src="...">. Matching is a plain
  	// prefix test, so "data:image/svg" also covers "data:image/svg+xml".
  	dataAttributeAllowedPrefixes = []string{
  		"data:image/avif",
  		"data:image/apng",
  		"data:image/png",
  		"data:image/svg",
  		"data:image/svg+xml",
  		"data:image/jpg",
  		"data:image/jpeg",
  		"data:image/gif",
  		"data:image/webp",
  	}
  )
  // SanitizerOptions holds options for the HTML sanitizer.
  type SanitizerOptions struct {
  	// OpenLinksInNewTab, when true, adds target="_blank" to rendered <a> tags
  	// whose href is not a same-page "#fragment" link.
  	OpenLinksInNewTab bool
  }
  193. // SanitizeHTML takes raw HTML input and removes any disallowed tags and attributes.
  194. func SanitizeHTML(baseURL, rawHTML string, sanitizerOptions *SanitizerOptions) string {
  195. var buffer strings.Builder
  196. // Educated guess about how big the sanitized HTML will be,
  197. // to reduce the amount of buffer re-allocations in this function.
  198. estimatedRatio := len(rawHTML) * 3 / 4
  199. buffer.Grow(estimatedRatio)
  200. // We need to surround `rawHTML` with body tags so that html.Parse
  201. // will consider it a valid html document.
  202. doc, err := html.Parse(strings.NewReader("<body>" + rawHTML + "</body>"))
  203. if err != nil {
  204. return ""
  205. }
  206. /* The structure of `doc` is always:
  207. <html>
  208. <head>...</head>
  209. <body>..</body>
  210. </html>
  211. */
  212. body := doc.FirstChild.FirstChild.NextSibling
  213. // Errors are a non-issue, so they're handled in filterAndRenderHTML
  214. parsedBaseUrl, _ := url.Parse(baseURL)
  215. for c := body.FirstChild; c != nil; c = c.NextSibling {
  216. filterAndRenderHTML(&buffer, c, parsedBaseUrl, sanitizerOptions)
  217. }
  218. return buffer.String()
  219. }
  220. func isHidden(n *html.Node) bool {
  221. for _, attr := range n.Attr {
  222. if strings.ToLower(attr.Key) == "hidden" {
  223. return true
  224. }
  225. }
  226. return false
  227. }
  228. func shouldIgnoreTag(n *html.Node, tag string) bool {
  229. if isPixelTracker(tag, n.Attr) {
  230. return true
  231. }
  232. if isBlockedTag(tag) {
  233. return true
  234. }
  235. if isHidden(n) {
  236. return true
  237. }
  238. return false
  239. }
  // isSelfContainedTag reports whether tag is one of the elements that are
  // rendered without children and without a closing tag.
  func isSelfContainedTag(tag string) bool {
  	selfContained := [...]string{
  		"area", "base", "br", "col", "embed", "hr", "img", "input",
  		"link", "meta", "param", "source", "track", "wbr",
  	}
  	for _, name := range selfContained {
  		if name == tag {
  			return true
  		}
  	}
  	return false
  }
  248. func filterAndRenderHTMLChildren(buf *strings.Builder, n *html.Node, parsedBaseUrl *url.URL, sanitizerOptions *SanitizerOptions) {
  249. for c := n.FirstChild; c != nil; c = c.NextSibling {
  250. filterAndRenderHTML(buf, c, parsedBaseUrl, sanitizerOptions)
  251. }
  252. }
  253. func filterAndRenderHTML(buf *strings.Builder, n *html.Node, parsedBaseUrl *url.URL, sanitizerOptions *SanitizerOptions) {
  254. if n == nil {
  255. return
  256. }
  257. switch n.Type {
  258. case html.TextNode:
  259. buf.WriteString(html.EscapeString(n.Data))
  260. case html.ElementNode:
  261. tag := strings.ToLower(n.Data)
  262. if shouldIgnoreTag(n, tag) {
  263. return
  264. }
  265. _, ok := allowedHTMLTagsAndAttributes[tag]
  266. if !ok {
  267. // The tag isn't allowed, but we're still interested in its content
  268. filterAndRenderHTMLChildren(buf, n, parsedBaseUrl, sanitizerOptions)
  269. return
  270. }
  271. attrNames, htmlAttributes := sanitizeAttributes(parsedBaseUrl, tag, n.Attr, sanitizerOptions)
  272. if !hasRequiredAttributes(tag, attrNames) {
  273. // The tag doesn't have every required attributes but we're still interested in its content
  274. filterAndRenderHTMLChildren(buf, n, parsedBaseUrl, sanitizerOptions)
  275. return
  276. }
  277. buf.WriteString("<")
  278. buf.WriteString(n.Data)
  279. if len(attrNames) > 0 {
  280. buf.WriteString(" " + htmlAttributes)
  281. }
  282. buf.WriteString(">")
  283. if isSelfContainedTag(tag) {
  284. return
  285. }
  286. if tag != "iframe" {
  287. // iframes aren't allowed to have child nodes.
  288. filterAndRenderHTMLChildren(buf, n, parsedBaseUrl, sanitizerOptions)
  289. }
  290. buf.WriteString("</")
  291. buf.WriteString(n.Data)
  292. buf.WriteString(">")
  293. case html.DocumentNode:
  294. filterAndRenderHTMLChildren(buf, n, parsedBaseUrl, sanitizerOptions)
  295. default:
  296. }
  297. }
  298. func sanitizeAttributes(parsedBaseUrl *url.URL, tagName string, attributes []html.Attribute, sanitizerOptions *SanitizerOptions) ([]string, string) {
  299. htmlAttrs := make([]string, 0, len(attributes))
  300. attrNames := make([]string, 0, len(attributes))
  301. var err error
  302. var isAnchorLink bool
  303. var isYouTubeEmbed bool
  304. for _, attribute := range attributes {
  305. if !isValidAttribute(tagName, attribute.Key) {
  306. continue
  307. }
  308. value := attribute.Val
  309. switch tagName {
  310. case "math":
  311. if attribute.Key == "xmlns" {
  312. if value != "http://www.w3.org/1998/Math/MathML" {
  313. value = "http://www.w3.org/1998/Math/MathML"
  314. }
  315. }
  316. case "img":
  317. switch attribute.Key {
  318. case "fetchpriority":
  319. if !isValidFetchPriorityValue(value) {
  320. continue
  321. }
  322. case "decoding":
  323. if !isValidDecodingValue(value) {
  324. continue
  325. }
  326. case "width", "height":
  327. if !isPositiveInteger(value) {
  328. continue
  329. }
  330. case "srcset":
  331. value = sanitizeSrcsetAttr(parsedBaseUrl, value)
  332. }
  333. case "source":
  334. if attribute.Key == "srcset" {
  335. value = sanitizeSrcsetAttr(parsedBaseUrl, value)
  336. }
  337. }
  338. if isExternalResourceAttribute(attribute.Key) {
  339. switch {
  340. case tagName == "iframe":
  341. iframeSourceDomain, trustedIframeDomain := findAllowedIframeSourceDomain(attribute.Val)
  342. if !trustedIframeDomain {
  343. continue
  344. }
  345. value = rewriteIframeURL(attribute.Val)
  346. if iframeSourceDomain == "youtube.com" || iframeSourceDomain == "youtube-nocookie.com" {
  347. isYouTubeEmbed = true
  348. }
  349. case tagName == "img" && attribute.Key == "src" && isValidDataAttribute(attribute.Val):
  350. value = attribute.Val
  351. case tagName == "a" && attribute.Key == "href" && strings.HasPrefix(attribute.Val, "#"):
  352. value = attribute.Val
  353. isAnchorLink = true
  354. default:
  355. value, err = urllib.ResolveToAbsoluteURLWithParsedBaseURL(parsedBaseUrl, value)
  356. if err != nil {
  357. continue
  358. }
  359. if !hasValidURIScheme(value) || isBlockedResource(value) {
  360. continue
  361. }
  362. // TODO use feedURL instead of baseURL twice.
  363. parsedValueUrl, _ := url.Parse(value)
  364. if cleanedURL, err := urlcleaner.RemoveTrackingParameters(parsedBaseUrl, parsedBaseUrl, parsedValueUrl); err == nil {
  365. value = cleanedURL
  366. }
  367. }
  368. }
  369. attrNames = append(attrNames, attribute.Key)
  370. htmlAttrs = append(htmlAttrs, attribute.Key+`="`+html.EscapeString(value)+`"`)
  371. }
  372. if !isAnchorLink {
  373. extraAttrNames, extraHTMLAttributes := getExtraAttributes(tagName, isYouTubeEmbed, sanitizerOptions)
  374. if len(extraAttrNames) > 0 {
  375. attrNames = append(attrNames, extraAttrNames...)
  376. htmlAttrs = append(htmlAttrs, extraHTMLAttributes...)
  377. }
  378. }
  379. return attrNames, strings.Join(htmlAttrs, " ")
  380. }
  381. func getExtraAttributes(tagName string, isYouTubeEmbed bool, sanitizerOptions *SanitizerOptions) ([]string, []string) {
  382. switch tagName {
  383. case "a":
  384. attributeNames := []string{"rel", "referrerpolicy"}
  385. htmlAttributes := []string{`rel="noopener noreferrer"`, `referrerpolicy="no-referrer"`}
  386. if sanitizerOptions.OpenLinksInNewTab {
  387. attributeNames = append(attributeNames, "target")
  388. htmlAttributes = append(htmlAttributes, `target="_blank"`)
  389. }
  390. return attributeNames, htmlAttributes
  391. case "video", "audio":
  392. return []string{"controls"}, []string{"controls"}
  393. case "iframe":
  394. extraAttrNames := []string{}
  395. extraHTMLAttributes := []string{}
  396. // Note: the referrerpolicy seems to be required to avoid YouTube error 153 video player configuration error
  397. // See https://developers.google.com/youtube/terms/required-minimum-functionality#embedded-player-api-client-identity
  398. if isYouTubeEmbed {
  399. extraAttrNames = append(extraAttrNames, "referrerpolicy")
  400. extraHTMLAttributes = append(extraHTMLAttributes, `referrerpolicy="strict-origin-when-cross-origin"`)
  401. }
  402. extraAttrNames = append(extraAttrNames, "sandbox", "loading")
  403. extraHTMLAttributes = append(extraHTMLAttributes, `sandbox="allow-scripts allow-same-origin allow-popups allow-popups-to-escape-sandbox"`, `loading="lazy"`)
  404. return extraAttrNames, extraHTMLAttributes
  405. case "img":
  406. return []string{"loading"}, []string{`loading="lazy"`}
  407. default:
  408. return nil, nil
  409. }
  410. }
  411. func isValidAttribute(tagName, attributeName string) bool {
  412. if attributes, ok := allowedHTMLTagsAndAttributes[tagName]; ok {
  413. return slices.Contains(attributes, attributeName)
  414. }
  415. return false
  416. }
  // isExternalResourceAttribute reports whether the attribute is one of those
  // treated as carrying a URL: src, href, poster or cite.
  func isExternalResourceAttribute(attribute string) bool {
  	return attribute == "src" ||
  		attribute == "href" ||
  		attribute == "poster" ||
  		attribute == "cite"
  }
  425. func isPixelTracker(tagName string, attributes []html.Attribute) bool {
  426. if tagName != "img" {
  427. return false
  428. }
  429. hasHeight := false
  430. hasWidth := false
  431. for _, attribute := range attributes {
  432. if attribute.Val == "1" || attribute.Val == "0" {
  433. switch attribute.Key {
  434. case "height":
  435. hasHeight = true
  436. case "width":
  437. hasWidth = true
  438. }
  439. }
  440. }
  441. return hasHeight && hasWidth
  442. }
  // hasRequiredAttributes reports whether the sanitized attribute names include
  // what tagName needs in order to be rendered:
  //
  //   - "a" needs href;
  //   - "iframe" needs src;
  //   - "source" and "img" need src or srcset;
  //   - every other tag has no requirement.
  func hasRequiredAttributes(tagName string, attributes []string) bool {
  	if tagName == "a" {
  		return slices.Contains(attributes, "href")
  	}
  	if tagName == "iframe" {
  		return slices.Contains(attributes, "src")
  	}
  	if tagName == "source" || tagName == "img" {
  		return slices.Contains(attributes, "src") || slices.Contains(attributes, "srcset")
  	}
  	return true
  }
  460. func hasValidURIScheme(absoluteURL string) bool {
  461. for _, scheme := range validURISchemes {
  462. if strings.HasPrefix(absoluteURL, scheme) {
  463. return true
  464. }
  465. }
  466. return false
  467. }
  468. func isBlockedResource(absoluteURL string) bool {
  469. for _, blockedURL := range blockedResourceURLSubstrings {
  470. if strings.Contains(absoluteURL, blockedURL) {
  471. return true
  472. }
  473. }
  474. return false
  475. }
  476. func findAllowedIframeSourceDomain(iframeSourceURL string) (string, bool) {
  477. iframeSourceDomain := urllib.DomainWithoutWWW(iframeSourceURL)
  478. if _, ok := iframeAllowList[iframeSourceDomain]; ok {
  479. return iframeSourceDomain, true
  480. }
  481. if ytDomain := config.Opts.YouTubeEmbedDomain(); ytDomain != "" && iframeSourceDomain == strings.TrimPrefix(ytDomain, "www.") {
  482. return iframeSourceDomain, true
  483. }
  484. if invidiousInstance := config.Opts.InvidiousInstance(); invidiousInstance != "" && iframeSourceDomain == strings.TrimPrefix(invidiousInstance, "www.") {
  485. return iframeSourceDomain, true
  486. }
  487. return "", false
  488. }
  489. func rewriteIframeURL(link string) string {
  490. u, err := url.Parse(link)
  491. if err != nil {
  492. return link
  493. }
  494. switch strings.TrimPrefix(u.Hostname(), "www.") {
  495. case "youtube.com":
  496. if pathWithoutEmbed, ok := strings.CutPrefix(u.Path, "/embed/"); ok {
  497. if len(u.RawQuery) > 0 {
  498. return config.Opts.YouTubeEmbedUrlOverride() + pathWithoutEmbed + "?" + u.RawQuery
  499. }
  500. return config.Opts.YouTubeEmbedUrlOverride() + pathWithoutEmbed
  501. }
  502. case "player.vimeo.com":
  503. // See https://help.vimeo.com/hc/en-us/articles/12426260232977-About-Player-parameters
  504. if strings.HasPrefix(u.Path, "/video/") {
  505. if len(u.RawQuery) > 0 {
  506. return link + "&dnt=1"
  507. }
  508. return link + "?dnt=1"
  509. }
  510. }
  511. return link
  512. }
  // isBlockedTag reports whether the tag is one whose whole subtree is
  // discarded: noscript, script or style.
  func isBlockedTag(tagName string) bool {
  	return tagName == "noscript" || tagName == "script" || tagName == "style"
  }
  520. func sanitizeSrcsetAttr(parsedBaseURL *url.URL, value string) string {
  521. imageCandidates := ParseSrcSetAttribute(value)
  522. for _, imageCandidate := range imageCandidates {
  523. if absoluteURL, err := urllib.ResolveToAbsoluteURLWithParsedBaseURL(parsedBaseURL, imageCandidate.ImageURL); err == nil {
  524. imageCandidate.ImageURL = absoluteURL
  525. }
  526. }
  527. return imageCandidates.String()
  528. }
  529. func isValidDataAttribute(value string) bool {
  530. for _, prefix := range dataAttributeAllowedPrefixes {
  531. if strings.HasPrefix(value, prefix) {
  532. return true
  533. }
  534. }
  535. return false
  536. }
  // isPositiveInteger reports whether value is a base-10 integer, as accepted
  // by strconv.Atoi, that is strictly greater than zero.
  func isPositiveInteger(value string) bool {
  	// Atoi already rejects the empty string, so no separate check is needed.
  	number, err := strconv.Atoi(value)
  	return err == nil && number > 0
  }
  // isValidFetchPriorityValue reports whether value is one of the accepted
  // fetchpriority keywords: "high", "low" or "auto" (exact, lower-case match).
  func isValidFetchPriorityValue(value string) bool {
  	return value == "high" || value == "low" || value == "auto"
  }
  // isValidDecodingValue reports whether value is one of the accepted decoding
  // keywords: "sync", "async" or "auto" (exact, lower-case match).
  func isValidDecodingValue(value string) bool {
  	return value == "sync" || value == "async" || value == "auto"
  }