sanitizer.go 15 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574575576577578579580581582583584585586587588589590591
  1. // SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
  2. // SPDX-License-Identifier: Apache-2.0
  3. package sanitizer // import "miniflux.app/v2/internal/reader/sanitizer"
  4. import (
  5. "io"
  6. "net/url"
  7. "slices"
  8. "strconv"
  9. "strings"
  10. "miniflux.app/v2/internal/config"
  11. "miniflux.app/v2/internal/reader/urlcleaner"
  12. "miniflux.app/v2/internal/urllib"
  13. "golang.org/x/net/html"
  14. )
// Package-level lookup tables that drive the sanitizer's allow/deny decisions.
var (
	// allowedHTMLTagsAndAttributes maps every tag name the sanitizer keeps to
	// the set of attribute names that may survive on that tag. It is consulted
	// by isValidTag (key lookup) and isValidAttribute (nested lookup); a tag
	// mapped to an empty set is kept but loses all of its original attributes.
	allowedHTMLTagsAndAttributes = map[string]map[string]struct{}{
		"a": {"href": {}, "title": {}, "id": {}},
		"abbr": {"title": {}},
		"acronym": {"title": {}},
		"aside": {},
		"audio": {"src": {}},
		"blockquote": {},
		"b": {},
		"br": {},
		"caption": {},
		"cite": {},
		"code": {},
		"dd": {"id": {}},
		"del": {},
		"dfn": {},
		"dl": {"id": {}},
		"dt": {"id": {}},
		"em": {},
		"figcaption": {},
		"figure": {},
		"h1": {"id": {}},
		"h2": {"id": {}},
		"h3": {"id": {}},
		"h4": {"id": {}},
		"h5": {"id": {}},
		"h6": {"id": {}},
		"hr": {},
		"iframe": {"width": {}, "height": {}, "frameborder": {}, "src": {}, "allowfullscreen": {}},
		"img": {"alt": {}, "title": {}, "src": {}, "srcset": {}, "sizes": {}, "width": {}, "height": {}, "fetchpriority": {}, "decoding": {}},
		"ins": {},
		"kbd": {},
		"li": {"id": {}},
		"ol": {"id": {}},
		"p": {},
		"picture": {},
		"pre": {},
		"q": {"cite": {}},
		"rp": {},
		"rt": {},
		"rtc": {},
		"ruby": {},
		"s": {},
		"samp": {},
		"source": {"src": {}, "type": {}, "srcset": {}, "sizes": {}, "media": {}},
		"strong": {},
		"sub": {},
		"sup": {"id": {}},
		"table": {},
		"td": {"rowspan": {}, "colspan": {}},
		"tfoot": {},
		"th": {"rowspan": {}, "colspan": {}},
		"thead": {},
		"time": {"datetime": {}},
		"tr": {},
		"u": {},
		"ul": {"id": {}},
		"var": {},
		"video": {"poster": {}, "height": {}, "width": {}, "src": {}},
		"wbr": {},
		// MathML: https://w3c.github.io/mathml-core/ and https://developer.mozilla.org/en-US/docs/Web/MathML/Reference/Element
		"annotation": {},
		"annotation-xml": {},
		"maction": {},
		"math": {"xmlns": {}},
		"merror": {},
		"mfrac": {},
		"mi": {},
		"mmultiscripts": {},
		"mn": {},
		"mo": {},
		"mover": {},
		"mpadded": {},
		"mphantom": {},
		"mprescripts": {},
		"mroot": {},
		"mrow": {},
		"ms": {},
		"mspace": {},
		"msqrt": {},
		"mstyle": {},
		"msub": {},
		"msubsup": {},
		"msup": {},
		"mtable": {},
		"mtd": {},
		"mtext": {},
		"mtr": {},
		"munder": {},
		"munderover": {},
		"semantics": {},
	}
	// iframeAllowList is the set of domains whose iframes are kept. It is
	// looked up by isValidIframeSource using the value returned by
	// urllib.DomainWithoutWWW for the iframe's src.
	iframeAllowList = map[string]struct{}{
		"bandcamp.com": {},
		"cdn.embedly.com": {},
		"dailymotion.com": {},
		"open.spotify.com": {},
		"player.bilibili.com": {},
		"player.twitch.tv": {},
		"player.vimeo.com": {},
		"soundcloud.com": {},
		"vk.com": {},
		"w.soundcloud.com": {},
		"youtube-nocookie.com": {},
		"youtube.com": {},
	}
	// blockedResourceURLSubstrings lists URL fragments that cause an attribute
	// to be dropped: isBlockedResource rejects any absolute URL that contains
	// one of these strings anywhere in it.
	blockedResourceURLSubstrings = []string{
		"api.flattr.com",
		"feeds.feedburner.com",
		"feedsportal.com",
		"pinterest.com/pin/create/button/",
		"stats.wordpress.com",
		"twitter.com/intent/tweet",
		"twitter.com/share",
		"facebook.com/sharer.php",
		"linkedin.com/shareArticle",
	}
	// validURISchemes is the set of URI schemes accepted by hasValidURIScheme.
	// Keys are lowercase because the scheme is lower-cased before the lookup.
	validURISchemes = map[string]struct{}{
		"apt": {},
		"bitcoin": {},
		"callto": {},
		"dav": {},
		"davs": {},
		"ed2k": {},
		"facetime": {},
		"feed": {},
		"ftp": {},
		"geo": {},
		"git": {},
		"gopher": {},
		"http": {},
		"https": {},
		"irc": {},
		"irc6": {},
		"ircs": {},
		"itms-apps": {},
		"itms": {},
		"magnet": {},
		"mailto": {},
		"news": {},
		"nntp": {},
		"rtmp": {},
		"sftp": {},
		"sip": {},
		"sips": {},
		"skype": {},
		"spotify": {},
		"ssh": {},
		"steam": {},
		"svn": {},
		"svn+ssh": {},
		"tel": {},
		"webcal": {},
		"xmpp": {},
		// iOS Apps
		"opener": {}, // https://www.opener.link
		"hack": {}, // https://apps.apple.com/it/app/hack-for-hacker-news-reader/id1464477788?l=en-GB
	}
	// dataAttributeAllowedPrefixes lists the "data:" URL prefixes that
	// isValidDataAttribute accepts; sanitizeAttributes uses it to let inline
	// images through on <img src>. Matching is by prefix, so "data:image/svg"
	// already covers "data:image/svg+xml".
	dataAttributeAllowedPrefixes = []string{
		"data:image/avif",
		"data:image/apng",
		"data:image/png",
		"data:image/svg",
		"data:image/svg+xml",
		"data:image/jpg",
		"data:image/jpeg",
		"data:image/gif",
		"data:image/webp",
	}
)
// SanitizerOptions carries the caller-tunable settings accepted by SanitizeHTML.
type SanitizerOptions struct {
	// OpenLinksInNewTab, when true, makes the sanitizer add target="_blank" to
	// the <a> elements it emits (same-page "#" anchor links get no extra
	// attributes at all).
	OpenLinksInNewTab bool
}
  188. func SanitizeHTMLWithDefaultOptions(baseURL, rawHTML string) string {
  189. return SanitizeHTML(baseURL, rawHTML, &SanitizerOptions{
  190. OpenLinksInNewTab: true,
  191. })
  192. }
  193. func SanitizeHTML(baseURL, rawHTML string, sanitizerOptions *SanitizerOptions) string {
  194. var buffer strings.Builder
  195. var tagStack []string
  196. var parentTag string
  197. var blockedStack []string
  198. // Errors are a non-issue, so they're handled later in the function.
  199. parsedBaseUrl, _ := url.Parse(baseURL)
  200. tokenizer := html.NewTokenizer(strings.NewReader(rawHTML))
  201. for {
  202. if tokenizer.Next() == html.ErrorToken {
  203. err := tokenizer.Err()
  204. if err == io.EOF {
  205. return buffer.String()
  206. }
  207. return ""
  208. }
  209. token := tokenizer.Token()
  210. // Note: MathML elements are not fully supported by golang.org/x/net/html.
  211. // See https://github.com/golang/net/blob/master/html/atom/gen.go
  212. // and https://github.com/golang/net/blob/master/html/atom/table.go
  213. tagName := token.Data
  214. if tagName == "" {
  215. continue
  216. }
  217. switch token.Type {
  218. case html.TextToken:
  219. if len(blockedStack) > 0 {
  220. continue
  221. }
  222. // An iframe element never has fallback content.
  223. // See https://www.w3.org/TR/2010/WD-html5-20101019/the-iframe-element.html#the-iframe-element
  224. if parentTag == "iframe" {
  225. continue
  226. }
  227. buffer.WriteString(token.String())
  228. case html.StartTagToken:
  229. parentTag = tagName
  230. if isPixelTracker(tagName, token.Attr) {
  231. continue
  232. }
  233. if isBlockedTag(tagName) || slices.ContainsFunc(token.Attr, func(attr html.Attribute) bool { return attr.Key == "hidden" }) {
  234. blockedStack = append(blockedStack, tagName)
  235. continue
  236. }
  237. if len(blockedStack) == 0 && isValidTag(tagName) {
  238. attrNames, htmlAttributes := sanitizeAttributes(parsedBaseUrl, baseURL, tagName, token.Attr, sanitizerOptions)
  239. if hasRequiredAttributes(tagName, attrNames) {
  240. if len(attrNames) > 0 {
  241. // Rewrite the start tag with allowed attributes.
  242. buffer.WriteString("<" + tagName + " " + htmlAttributes + ">")
  243. } else {
  244. // Rewrite the start tag without any attributes.
  245. buffer.WriteString("<" + tagName + ">")
  246. }
  247. tagStack = append(tagStack, tagName)
  248. }
  249. }
  250. case html.EndTagToken:
  251. if len(blockedStack) == 0 {
  252. if isValidTag(tagName) && slices.Contains(tagStack, tagName) {
  253. buffer.WriteString("</" + tagName + ">")
  254. }
  255. } else {
  256. if blockedStack[len(blockedStack)-1] == tagName {
  257. blockedStack = blockedStack[:len(blockedStack)-1]
  258. }
  259. }
  260. case html.SelfClosingTagToken:
  261. if isPixelTracker(tagName, token.Attr) {
  262. continue
  263. }
  264. if len(blockedStack) == 0 && isValidTag(tagName) {
  265. attrNames, htmlAttributes := sanitizeAttributes(parsedBaseUrl, baseURL, tagName, token.Attr, sanitizerOptions)
  266. if hasRequiredAttributes(tagName, attrNames) {
  267. if len(attrNames) > 0 {
  268. buffer.WriteString("<" + tagName + " " + htmlAttributes + "/>")
  269. } else {
  270. buffer.WriteString("<" + tagName + "/>")
  271. }
  272. }
  273. }
  274. }
  275. }
  276. }
  277. func sanitizeAttributes(parsedBaseUrl *url.URL, baseURL, tagName string, attributes []html.Attribute, sanitizerOptions *SanitizerOptions) ([]string, string) {
  278. var htmlAttrs, attrNames []string
  279. var err error
  280. var isImageLargerThanLayout bool
  281. var isAnchorLink bool
  282. if tagName == "img" {
  283. imgWidth := getIntegerAttributeValue("width", attributes)
  284. isImageLargerThanLayout = imgWidth > 750
  285. }
  286. for _, attribute := range attributes {
  287. value := attribute.Val
  288. if !isValidAttribute(tagName, attribute.Key) {
  289. continue
  290. }
  291. if tagName == "math" && attribute.Key == "xmlns" && value != "http://www.w3.org/1998/Math/MathML" {
  292. value = "http://www.w3.org/1998/Math/MathML"
  293. }
  294. if tagName == "img" && attribute.Key == "fetchpriority" {
  295. if !isValidFetchPriorityValue(value) {
  296. continue
  297. }
  298. }
  299. if tagName == "img" && attribute.Key == "decoding" {
  300. if !isValidDecodingValue(value) {
  301. continue
  302. }
  303. }
  304. if (tagName == "img" || tagName == "source") && attribute.Key == "srcset" {
  305. value = sanitizeSrcsetAttr(baseURL, value)
  306. }
  307. if tagName == "img" && (attribute.Key == "width" || attribute.Key == "height") {
  308. if isImageLargerThanLayout || !isPositiveInteger(value) {
  309. continue
  310. }
  311. }
  312. if isExternalResourceAttribute(attribute.Key) {
  313. switch {
  314. case tagName == "iframe":
  315. if !isValidIframeSource(attribute.Val) {
  316. continue
  317. }
  318. value = rewriteIframeURL(attribute.Val)
  319. case tagName == "img" && attribute.Key == "src" && isValidDataAttribute(attribute.Val):
  320. value = attribute.Val
  321. case tagName == "a" && attribute.Key == "href" && strings.HasPrefix(attribute.Val, "#"):
  322. value = attribute.Val
  323. isAnchorLink = true
  324. default:
  325. value, err = urllib.AbsoluteURL(baseURL, value)
  326. if err != nil {
  327. continue
  328. }
  329. if !hasValidURIScheme(value) || isBlockedResource(value) {
  330. continue
  331. }
  332. // TODO use feedURL instead of baseURL twice.
  333. parsedValueUrl, _ := url.Parse(value)
  334. if cleanedURL, err := urlcleaner.RemoveTrackingParameters(parsedBaseUrl, parsedBaseUrl, parsedValueUrl); err == nil {
  335. value = cleanedURL
  336. }
  337. }
  338. }
  339. attrNames = append(attrNames, attribute.Key)
  340. htmlAttrs = append(htmlAttrs, attribute.Key+`="`+html.EscapeString(value)+`"`)
  341. }
  342. if !isAnchorLink {
  343. extraAttrNames, extraHTMLAttributes := getExtraAttributes(tagName, sanitizerOptions)
  344. if len(extraAttrNames) > 0 {
  345. attrNames = append(attrNames, extraAttrNames...)
  346. htmlAttrs = append(htmlAttrs, extraHTMLAttributes...)
  347. }
  348. }
  349. return attrNames, strings.Join(htmlAttrs, " ")
  350. }
  351. func getExtraAttributes(tagName string, sanitizerOptions *SanitizerOptions) ([]string, []string) {
  352. switch tagName {
  353. case "a":
  354. attributeNames := []string{"rel", "referrerpolicy"}
  355. htmlAttributes := []string{`rel="noopener noreferrer"`, `referrerpolicy="no-referrer"`}
  356. if sanitizerOptions.OpenLinksInNewTab {
  357. attributeNames = append(attributeNames, "target")
  358. htmlAttributes = append(htmlAttributes, `target="_blank"`)
  359. }
  360. return attributeNames, htmlAttributes
  361. case "video", "audio":
  362. return []string{"controls"}, []string{"controls"}
  363. case "iframe":
  364. return []string{"sandbox", "loading"}, []string{`sandbox="allow-scripts allow-same-origin allow-popups allow-popups-to-escape-sandbox"`, `loading="lazy"`}
  365. case "img":
  366. return []string{"loading"}, []string{`loading="lazy"`}
  367. default:
  368. return nil, nil
  369. }
  370. }
  371. func isValidTag(tagName string) bool {
  372. _, ok := allowedHTMLTagsAndAttributes[tagName]
  373. return ok
  374. }
  375. func isValidAttribute(tagName, attributeName string) bool {
  376. if attributes, ok := allowedHTMLTagsAndAttributes[tagName]; ok {
  377. _, allowed := attributes[attributeName]
  378. return allowed
  379. }
  380. return false
  381. }
  382. func isExternalResourceAttribute(attribute string) bool {
  383. switch attribute {
  384. case "src", "href", "poster", "cite":
  385. return true
  386. default:
  387. return false
  388. }
  389. }
  390. func isPixelTracker(tagName string, attributes []html.Attribute) bool {
  391. if tagName != "img" {
  392. return false
  393. }
  394. hasHeight := false
  395. hasWidth := false
  396. for _, attribute := range attributes {
  397. if attribute.Val == "1" || attribute.Val == "0" {
  398. switch attribute.Key {
  399. case "height":
  400. hasHeight = true
  401. case "width":
  402. hasWidth = true
  403. }
  404. }
  405. }
  406. return hasHeight && hasWidth
  407. }
  408. func hasRequiredAttributes(tagName string, attributes []string) bool {
  409. switch tagName {
  410. case "a":
  411. return slices.Contains(attributes, "href")
  412. case "iframe":
  413. return slices.Contains(attributes, "src")
  414. case "source", "img":
  415. return slices.Contains(attributes, "src") || slices.Contains(attributes, "srcset")
  416. default:
  417. return true
  418. }
  419. }
  420. // See https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml
  421. func hasValidURIScheme(absoluteURL string) bool {
  422. colonIndex := strings.IndexByte(absoluteURL, ':')
  423. // Scheme must exist (colonIndex > 0). An empty scheme (e.g. ":foo") is not allowed.
  424. if colonIndex <= 0 {
  425. return false
  426. }
  427. scheme := absoluteURL[:colonIndex]
  428. _, ok := validURISchemes[strings.ToLower(scheme)]
  429. return ok
  430. }
  431. func isBlockedResource(absoluteURL string) bool {
  432. return slices.ContainsFunc(blockedResourceURLSubstrings, func(element string) bool {
  433. return strings.Contains(absoluteURL, element)
  434. })
  435. }
  436. func isValidIframeSource(iframeSourceURL string) bool {
  437. iframeSourceDomain := urllib.DomainWithoutWWW(iframeSourceURL)
  438. if _, ok := iframeAllowList[iframeSourceDomain]; ok {
  439. return true
  440. }
  441. if ytDomain := config.Opts.YouTubeEmbedDomain(); ytDomain != "" && iframeSourceDomain == strings.TrimPrefix(ytDomain, "www.") {
  442. return true
  443. }
  444. if invidiousInstance := config.Opts.InvidiousInstance(); invidiousInstance != "" && iframeSourceDomain == strings.TrimPrefix(invidiousInstance, "www.") {
  445. return true
  446. }
  447. return false
  448. }
  449. func rewriteIframeURL(link string) string {
  450. u, err := url.Parse(link)
  451. if err != nil {
  452. return link
  453. }
  454. switch strings.TrimPrefix(u.Hostname(), "www.") {
  455. case "youtube.com":
  456. if strings.HasPrefix(u.Path, "/embed/") {
  457. if len(u.RawQuery) > 0 {
  458. return config.Opts.YouTubeEmbedUrlOverride() + strings.TrimPrefix(u.Path, "/embed/") + "?" + u.RawQuery
  459. }
  460. return config.Opts.YouTubeEmbedUrlOverride() + strings.TrimPrefix(u.Path, "/embed/")
  461. }
  462. case "player.vimeo.com":
  463. // See https://help.vimeo.com/hc/en-us/articles/12426260232977-About-Player-parameters
  464. if strings.HasPrefix(u.Path, "/video/") {
  465. if len(u.RawQuery) > 0 {
  466. return link + "&dnt=1"
  467. }
  468. return link + "?dnt=1"
  469. }
  470. }
  471. return link
  472. }
  473. func isBlockedTag(tagName string) bool {
  474. switch tagName {
  475. case "noscript", "script", "style":
  476. return true
  477. default:
  478. return false
  479. }
  480. }
  481. func sanitizeSrcsetAttr(baseURL, value string) string {
  482. imageCandidates := ParseSrcSetAttribute(value)
  483. for _, imageCandidate := range imageCandidates {
  484. if absoluteURL, err := urllib.AbsoluteURL(baseURL, imageCandidate.ImageURL); err == nil {
  485. imageCandidate.ImageURL = absoluteURL
  486. }
  487. }
  488. return imageCandidates.String()
  489. }
  490. func isValidDataAttribute(value string) bool {
  491. for _, prefix := range dataAttributeAllowedPrefixes {
  492. if strings.HasPrefix(value, prefix) {
  493. return true
  494. }
  495. }
  496. return false
  497. }
  498. func isPositiveInteger(value string) bool {
  499. if value == "" {
  500. return false
  501. }
  502. if number, err := strconv.Atoi(value); err == nil {
  503. return number > 0
  504. }
  505. return false
  506. }
  507. func getIntegerAttributeValue(name string, attributes []html.Attribute) int {
  508. for _, attribute := range attributes {
  509. if attribute.Key == name {
  510. number, _ := strconv.Atoi(attribute.Val)
  511. return number
  512. }
  513. }
  514. return 0
  515. }
  516. func isValidFetchPriorityValue(value string) bool {
  517. allowedValues := []string{"high", "low", "auto"}
  518. return slices.Contains(allowedValues, value)
  519. }
  520. func isValidDecodingValue(value string) bool {
  521. allowedValues := []string{"sync", "async", "auto"}
  522. return slices.Contains(allowedValues, value)
  523. }