sanitizer.go 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591592593594595596597598599600601602603604605606607608609610611612613614615616617618619620621622623624625626627628629630631632633634635636637638639640641642643644645
  1. // SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
  2. // SPDX-License-Identifier: Apache-2.0
  3. package sanitizer // import "miniflux.app/v2/internal/reader/sanitizer"
  4. import (
  5. "io"
  6. "net/url"
  7. "slices"
  8. "strconv"
  9. "strings"
  10. "miniflux.app/v2/internal/config"
  11. "miniflux.app/v2/internal/reader/urlcleaner"
  12. "miniflux.app/v2/internal/urllib"
  13. "golang.org/x/net/html"
  14. )
var (
	// allowedHTMLTagsAndAttributes maps every tag name the sanitizer keeps to
	// the attribute names allowed on that tag. It backs isValidTag and
	// isValidAttribute; a tag that is not a key here is never written out.
	allowedHTMLTagsAndAttributes = map[string][]string{
		"a":          {"href", "title", "id"},
		"abbr":       {"title"},
		"acronym":    {"title"},
		"aside":      {},
		"audio":      {"src"},
		"blockquote": {},
		"b":          {},
		"br":         {},
		"caption":    {},
		"cite":       {},
		"code":       {},
		"dd":         {"id"},
		"del":        {},
		"dfn":        {},
		"dl":         {"id"},
		"dt":         {"id"},
		"em":         {},
		"figcaption": {},
		"figure":     {},
		"h1":         {"id"},
		"h2":         {"id"},
		"h3":         {"id"},
		"h4":         {"id"},
		"h5":         {"id"},
		"h6":         {"id"},
		"hr":         {},
		"i":          {},
		"iframe":     {"width", "height", "frameborder", "src", "allowfullscreen"},
		"img":        {"alt", "title", "src", "srcset", "sizes", "width", "height", "fetchpriority", "decoding"},
		"ins":        {},
		"kbd":        {},
		"li":         {"id"},
		"ol":         {"id"},
		"p":          {},
		"picture":    {},
		"pre":        {},
		"q":          {"cite"},
		"rp":         {},
		"rt":         {},
		"rtc":        {},
		"ruby":       {},
		"s":          {},
		"small":      {},
		"samp":       {},
		"source":     {"src", "type", "srcset", "sizes", "media"},
		"strong":     {},
		"sub":        {},
		"sup":        {"id"},
		"table":      {},
		"td":         {"rowspan", "colspan"},
		"tfoot":      {},
		"th":         {"rowspan", "colspan"},
		"thead":      {},
		"time":       {"datetime"},
		"tr":         {},
		"u":          {},
		"ul":         {"id"},
		"var":        {},
		"video":      {"poster", "height", "width", "src"},
		"wbr":        {},

		// MathML: https://w3c.github.io/mathml-core/ and https://developer.mozilla.org/en-US/docs/Web/MathML/Reference/Element
		"annotation":     {},
		"annotation-xml": {},
		"maction":        {},
		"math":           {"xmlns"},
		"merror":         {},
		"mfrac":          {},
		"mi":             {},
		"mmultiscripts":  {},
		"mn":             {},
		"mo":             {},
		"mover":          {},
		"mpadded":        {},
		"mphantom":       {},
		"mprescripts":    {},
		"mroot":          {},
		"mrow":           {},
		"ms":             {},
		"mspace":         {},
		"msqrt":          {},
		"mstyle":         {},
		"msub":           {},
		"msubsup":        {},
		"msup":           {},
		"mtable":         {},
		"mtd":            {},
		"mtext":          {},
		"mtr":            {},
		"munder":         {},
		"munderover":     {},
		"semantics":      {},
	}

	// iframeAllowList is the set of domains an iframe "src" may point to.
	// findAllowedIframeSourceDomain looks up the value returned by
	// urllib.DomainWithoutWWW in this set.
	iframeAllowList = map[string]struct{}{
		"bandcamp.com":         {},
		"cdn.embedly.com":      {},
		"dailymotion.com":      {},
		"open.spotify.com":     {},
		"player.bilibili.com":  {},
		"player.twitch.tv":     {},
		"player.vimeo.com":     {},
		"soundcloud.com":       {},
		"vk.com":               {},
		"w.soundcloud.com":     {},
		"youtube-nocookie.com": {},
		"youtube.com":          {},
	}

	// blockedResourceURLSubstrings lists substrings that cause a resource URL
	// to be rejected; isBlockedResource matches them with strings.Contains.
	blockedResourceURLSubstrings = []string{
		"api.flattr.com",
		"feeds.feedburner.com",
		"feedsportal.com",
		"pinterest.com/pin/create/button/",
		"stats.wordpress.com",
		"twitter.com/intent/tweet",
		"twitter.com/share",
		"facebook.com/sharer.php",
		"linkedin.com/shareArticle",
	}

	// validURISchemes lists the URL prefixes accepted by hasValidURIScheme,
	// which tests them in order with strings.HasPrefix.
	// See https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml
	validURISchemes = []string{
		// Most common schemes on top.
		"https:",
		"http:",

		// Then the rest.
		"apt:",
		"bitcoin:",
		"callto:",
		"dav:",
		"davs:",
		"ed2k:",
		"facetime:",
		"feed:",
		"ftp:",
		"geo:",
		"git:",
		"gopher:",
		"irc:",
		"irc6:",
		"ircs:",
		"itms-apps:",
		"itms:",
		"magnet:",
		"mailto:",
		"news:",
		"nntp:",
		"rtmp:",
		"sftp:",
		"sip:",
		"sips:",
		"skype:",
		"spotify:",
		"ssh:",
		"steam:",
		"svn:",
		"svn+ssh:",
		"tel:",
		"webcal:",
		"xmpp:",

		// iOS Apps
		"opener:", // https://www.opener.link
		"hack:",   // https://apps.apple.com/it/app/hack-for-hacker-news-reader/id1464477788?l=en-GB
	}

	// dataAttributeAllowedPrefixes lists the "data:" URL prefixes that
	// isValidDataAttribute accepts; sanitizeAttributes consults it for the
	// "src" attribute of img tags.
	dataAttributeAllowedPrefixes = []string{
		"data:image/avif",
		"data:image/apng",
		"data:image/png",
		"data:image/svg",
		"data:image/svg+xml",
		"data:image/jpg",
		"data:image/jpeg",
		"data:image/gif",
		"data:image/webp",
	}
)
  190. type SanitizerOptions struct {
  191. OpenLinksInNewTab bool
  192. }
  193. func SanitizeHTML(baseURL, rawHTML string, sanitizerOptions *SanitizerOptions) string {
  194. var tagStack []string
  195. var parentTag string
  196. var blockedStack []string
  197. var buffer strings.Builder
  198. // Educated guess about how big the sanitized HTML will be,
  199. // to reduce the amount of buffer re-allocations in this function.
  200. estimatedRatio := len(rawHTML) * 3 / 4
  201. buffer.Grow(estimatedRatio)
  202. // Errors are a non-issue, so they're handled later in the function.
  203. parsedBaseUrl, _ := url.Parse(baseURL)
  204. tokenizer := html.NewTokenizer(strings.NewReader(rawHTML))
  205. for {
  206. if tokenizer.Next() == html.ErrorToken {
  207. err := tokenizer.Err()
  208. if err == io.EOF {
  209. return buffer.String()
  210. }
  211. return ""
  212. }
  213. token := tokenizer.Token()
  214. // Note: MathML elements are not fully supported by golang.org/x/net/html.
  215. // See https://github.com/golang/net/blob/master/html/atom/gen.go
  216. // and https://github.com/golang/net/blob/master/html/atom/table.go
  217. tagName := token.Data
  218. if tagName == "" {
  219. continue
  220. }
  221. switch token.Type {
  222. case html.TextToken:
  223. if len(blockedStack) > 0 {
  224. continue
  225. }
  226. // An iframe element never has fallback content.
  227. // See https://www.w3.org/TR/2010/WD-html5-20101019/the-iframe-element.html#the-iframe-element
  228. if parentTag == "iframe" {
  229. continue
  230. }
  231. buffer.WriteString(token.String())
  232. case html.StartTagToken:
  233. parentTag = tagName
  234. if isPixelTracker(tagName, token.Attr) {
  235. continue
  236. }
  237. if isBlockedTag(tagName) || slices.ContainsFunc(token.Attr, func(attr html.Attribute) bool { return attr.Key == "hidden" }) {
  238. blockedStack = append(blockedStack, tagName)
  239. continue
  240. }
  241. if len(blockedStack) == 0 && isValidTag(tagName) {
  242. attrNames, htmlAttributes := sanitizeAttributes(parsedBaseUrl, tagName, token.Attr, sanitizerOptions)
  243. if hasRequiredAttributes(tagName, attrNames) {
  244. if len(attrNames) > 0 {
  245. // Rewrite the start tag with allowed attributes.
  246. buffer.WriteString("<" + tagName + " " + htmlAttributes + ">")
  247. } else {
  248. // Rewrite the start tag without any attributes.
  249. buffer.WriteString("<" + tagName + ">")
  250. }
  251. tagStack = append(tagStack, tagName)
  252. }
  253. }
  254. case html.EndTagToken:
  255. if len(blockedStack) == 0 {
  256. if isValidTag(tagName) && slices.Contains(tagStack, tagName) {
  257. buffer.WriteString("</" + tagName + ">")
  258. }
  259. } else {
  260. if blockedStack[len(blockedStack)-1] == tagName {
  261. blockedStack = blockedStack[:len(blockedStack)-1]
  262. }
  263. }
  264. case html.SelfClosingTagToken:
  265. if isPixelTracker(tagName, token.Attr) {
  266. continue
  267. }
  268. if len(blockedStack) == 0 && isValidTag(tagName) {
  269. attrNames, htmlAttributes := sanitizeAttributes(parsedBaseUrl, tagName, token.Attr, sanitizerOptions)
  270. if hasRequiredAttributes(tagName, attrNames) {
  271. if len(attrNames) > 0 {
  272. buffer.WriteString("<" + tagName + " " + htmlAttributes + "/>")
  273. } else {
  274. buffer.WriteString("<" + tagName + "/>")
  275. }
  276. }
  277. }
  278. }
  279. }
  280. }
  281. func sanitizeAttributes(parsedBaseUrl *url.URL, tagName string, attributes []html.Attribute, sanitizerOptions *SanitizerOptions) ([]string, string) {
  282. htmlAttrs := make([]string, 0, len(attributes))
  283. attrNames := make([]string, 0, len(attributes))
  284. var err error
  285. var isAnchorLink bool
  286. var isYouTubeEmbed bool
  287. for _, attribute := range attributes {
  288. if !isValidAttribute(tagName, attribute.Key) {
  289. continue
  290. }
  291. value := attribute.Val
  292. switch tagName {
  293. case "math":
  294. if attribute.Key == "xmlns" {
  295. if value != "http://www.w3.org/1998/Math/MathML" {
  296. value = "http://www.w3.org/1998/Math/MathML"
  297. }
  298. }
  299. case "img":
  300. switch attribute.Key {
  301. case "fetchpriority":
  302. if !isValidFetchPriorityValue(value) {
  303. continue
  304. }
  305. case "decoding":
  306. if !isValidDecodingValue(value) {
  307. continue
  308. }
  309. case "width", "height":
  310. if !isPositiveInteger(value) {
  311. continue
  312. }
  313. // Discard width and height attributes when width is larger than Miniflux layout (750px)
  314. if imgWidth := getIntegerAttributeValue("width", attributes); imgWidth > 750 {
  315. continue
  316. }
  317. case "srcset":
  318. value = sanitizeSrcsetAttr(parsedBaseUrl, value)
  319. }
  320. case "source":
  321. if attribute.Key == "srcset" {
  322. value = sanitizeSrcsetAttr(parsedBaseUrl, value)
  323. }
  324. }
  325. if isExternalResourceAttribute(attribute.Key) {
  326. switch {
  327. case tagName == "iframe":
  328. iframeSourceDomain, trustedIframeDomain := findAllowedIframeSourceDomain(attribute.Val)
  329. if !trustedIframeDomain {
  330. continue
  331. }
  332. value = rewriteIframeURL(attribute.Val)
  333. if iframeSourceDomain == "youtube.com" || iframeSourceDomain == "youtube-nocookie.com" {
  334. isYouTubeEmbed = true
  335. }
  336. case tagName == "img" && attribute.Key == "src" && isValidDataAttribute(attribute.Val):
  337. value = attribute.Val
  338. case tagName == "a" && attribute.Key == "href" && strings.HasPrefix(attribute.Val, "#"):
  339. value = attribute.Val
  340. isAnchorLink = true
  341. default:
  342. value, err = absoluteURLParsedBase(parsedBaseUrl, value)
  343. if err != nil {
  344. continue
  345. }
  346. if !hasValidURIScheme(value) || isBlockedResource(value) {
  347. continue
  348. }
  349. // TODO use feedURL instead of baseURL twice.
  350. parsedValueUrl, _ := url.Parse(value)
  351. if cleanedURL, err := urlcleaner.RemoveTrackingParameters(parsedBaseUrl, parsedBaseUrl, parsedValueUrl); err == nil {
  352. value = cleanedURL
  353. }
  354. }
  355. }
  356. attrNames = append(attrNames, attribute.Key)
  357. htmlAttrs = append(htmlAttrs, attribute.Key+`="`+html.EscapeString(value)+`"`)
  358. }
  359. if !isAnchorLink {
  360. extraAttrNames, extraHTMLAttributes := getExtraAttributes(tagName, isYouTubeEmbed, sanitizerOptions)
  361. if len(extraAttrNames) > 0 {
  362. attrNames = append(attrNames, extraAttrNames...)
  363. htmlAttrs = append(htmlAttrs, extraHTMLAttributes...)
  364. }
  365. }
  366. return attrNames, strings.Join(htmlAttrs, " ")
  367. }
  368. func getExtraAttributes(tagName string, isYouTubeEmbed bool, sanitizerOptions *SanitizerOptions) ([]string, []string) {
  369. switch tagName {
  370. case "a":
  371. attributeNames := []string{"rel", "referrerpolicy"}
  372. htmlAttributes := []string{`rel="noopener noreferrer"`, `referrerpolicy="no-referrer"`}
  373. if sanitizerOptions.OpenLinksInNewTab {
  374. attributeNames = append(attributeNames, "target")
  375. htmlAttributes = append(htmlAttributes, `target="_blank"`)
  376. }
  377. return attributeNames, htmlAttributes
  378. case "video", "audio":
  379. return []string{"controls"}, []string{"controls"}
  380. case "iframe":
  381. extraAttrNames := []string{}
  382. extraHTMLAttributes := []string{}
  383. // Note: the referrerpolicy seems to be required to avoid YouTube error 153 video player configuration error
  384. // See https://developers.google.com/youtube/terms/required-minimum-functionality#embedded-player-api-client-identity
  385. if isYouTubeEmbed {
  386. extraAttrNames = append(extraAttrNames, "referrerpolicy")
  387. extraHTMLAttributes = append(extraHTMLAttributes, `referrerpolicy="strict-origin-when-cross-origin"`)
  388. }
  389. extraAttrNames = append(extraAttrNames, "sandbox", "loading")
  390. extraHTMLAttributes = append(extraHTMLAttributes, `sandbox="allow-scripts allow-same-origin allow-popups allow-popups-to-escape-sandbox"`, `loading="lazy"`)
  391. return extraAttrNames, extraHTMLAttributes
  392. case "img":
  393. return []string{"loading"}, []string{`loading="lazy"`}
  394. default:
  395. return nil, nil
  396. }
  397. }
  398. func isValidTag(tagName string) bool {
  399. _, ok := allowedHTMLTagsAndAttributes[tagName]
  400. return ok
  401. }
  402. func isValidAttribute(tagName, attributeName string) bool {
  403. if attributes, ok := allowedHTMLTagsAndAttributes[tagName]; ok {
  404. return slices.Contains(attributes, attributeName)
  405. }
  406. return false
  407. }
  408. func isExternalResourceAttribute(attribute string) bool {
  409. switch attribute {
  410. case "src", "href", "poster", "cite":
  411. return true
  412. default:
  413. return false
  414. }
  415. }
  416. func isPixelTracker(tagName string, attributes []html.Attribute) bool {
  417. if tagName != "img" {
  418. return false
  419. }
  420. hasHeight := false
  421. hasWidth := false
  422. for _, attribute := range attributes {
  423. if attribute.Val == "1" || attribute.Val == "0" {
  424. switch attribute.Key {
  425. case "height":
  426. hasHeight = true
  427. case "width":
  428. hasWidth = true
  429. }
  430. }
  431. }
  432. return hasHeight && hasWidth
  433. }
  434. func hasRequiredAttributes(tagName string, attributes []string) bool {
  435. switch tagName {
  436. case "a":
  437. return slices.Contains(attributes, "href")
  438. case "iframe":
  439. return slices.Contains(attributes, "src")
  440. case "source", "img":
  441. for _, attribute := range attributes {
  442. if attribute == "src" || attribute == "srcset" {
  443. return true
  444. }
  445. }
  446. return false
  447. default:
  448. return true
  449. }
  450. }
  451. func hasValidURIScheme(absoluteURL string) bool {
  452. for _, scheme := range validURISchemes {
  453. if strings.HasPrefix(absoluteURL, scheme) {
  454. return true
  455. }
  456. }
  457. return false
  458. }
  459. func isBlockedResource(absoluteURL string) bool {
  460. for _, blockedURL := range blockedResourceURLSubstrings {
  461. if strings.Contains(absoluteURL, blockedURL) {
  462. return true
  463. }
  464. }
  465. return false
  466. }
  467. func findAllowedIframeSourceDomain(iframeSourceURL string) (string, bool) {
  468. iframeSourceDomain := urllib.DomainWithoutWWW(iframeSourceURL)
  469. if _, ok := iframeAllowList[iframeSourceDomain]; ok {
  470. return iframeSourceDomain, true
  471. }
  472. if ytDomain := config.Opts.YouTubeEmbedDomain(); ytDomain != "" && iframeSourceDomain == strings.TrimPrefix(ytDomain, "www.") {
  473. return iframeSourceDomain, true
  474. }
  475. if invidiousInstance := config.Opts.InvidiousInstance(); invidiousInstance != "" && iframeSourceDomain == strings.TrimPrefix(invidiousInstance, "www.") {
  476. return iframeSourceDomain, true
  477. }
  478. return "", false
  479. }
  480. func rewriteIframeURL(link string) string {
  481. u, err := url.Parse(link)
  482. if err != nil {
  483. return link
  484. }
  485. switch strings.TrimPrefix(u.Hostname(), "www.") {
  486. case "youtube.com":
  487. if pathWithoutEmbed, ok := strings.CutPrefix(u.Path, "/embed/"); ok {
  488. if len(u.RawQuery) > 0 {
  489. return config.Opts.YouTubeEmbedUrlOverride() + pathWithoutEmbed + "?" + u.RawQuery
  490. }
  491. return config.Opts.YouTubeEmbedUrlOverride() + pathWithoutEmbed
  492. }
  493. case "player.vimeo.com":
  494. // See https://help.vimeo.com/hc/en-us/articles/12426260232977-About-Player-parameters
  495. if strings.HasPrefix(u.Path, "/video/") {
  496. if len(u.RawQuery) > 0 {
  497. return link + "&dnt=1"
  498. }
  499. return link + "?dnt=1"
  500. }
  501. }
  502. return link
  503. }
  504. func isBlockedTag(tagName string) bool {
  505. switch tagName {
  506. case "noscript", "script", "style":
  507. return true
  508. }
  509. return false
  510. }
  511. func sanitizeSrcsetAttr(parsedBaseURL *url.URL, value string) string {
  512. imageCandidates := ParseSrcSetAttribute(value)
  513. for _, imageCandidate := range imageCandidates {
  514. if absoluteURL, err := absoluteURLParsedBase(parsedBaseURL, imageCandidate.ImageURL); err == nil {
  515. imageCandidate.ImageURL = absoluteURL
  516. }
  517. }
  518. return imageCandidates.String()
  519. }
  520. func isValidDataAttribute(value string) bool {
  521. for _, prefix := range dataAttributeAllowedPrefixes {
  522. if strings.HasPrefix(value, prefix) {
  523. return true
  524. }
  525. }
  526. return false
  527. }
  528. func isPositiveInteger(value string) bool {
  529. if value == "" {
  530. return false
  531. }
  532. if number, err := strconv.Atoi(value); err == nil {
  533. return number > 0
  534. }
  535. return false
  536. }
  537. func getIntegerAttributeValue(name string, attributes []html.Attribute) int {
  538. for _, attribute := range attributes {
  539. if attribute.Key == name {
  540. number, _ := strconv.Atoi(attribute.Val)
  541. return number
  542. }
  543. }
  544. return 0
  545. }
  546. func isValidFetchPriorityValue(value string) bool {
  547. switch value {
  548. case "high", "low", "auto":
  549. return true
  550. }
  551. return false
  552. }
  553. func isValidDecodingValue(value string) bool {
  554. switch value {
  555. case "sync", "async", "auto":
  556. return true
  557. }
  558. return false
  559. }
  560. // absoluteURLParsedBase is used instead of urllib.AbsoluteURL to avoid parsing baseURL over and over.
  561. func absoluteURLParsedBase(parsedBaseURL *url.URL, input string) (string, error) {
  562. absURL, u, err := urllib.GetAbsoluteURL(input)
  563. if err != nil {
  564. return "", err
  565. }
  566. if absURL != "" {
  567. return absURL, nil
  568. }
  569. if parsedBaseURL == nil {
  570. return "", nil
  571. }
  572. return parsedBaseURL.ResolveReference(u).String(), nil
  573. }