sanitizer.go 14 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571572573574
  1. // SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
  2. // SPDX-License-Identifier: Apache-2.0
  3. package sanitizer // import "miniflux.app/v2/internal/reader/sanitizer"
  4. import (
  5. "io"
  6. "net/url"
  7. "slices"
  8. "strconv"
  9. "strings"
  10. "miniflux.app/v2/internal/config"
  11. "miniflux.app/v2/internal/reader/urlcleaner"
  12. "miniflux.app/v2/internal/urllib"
  13. "golang.org/x/net/html"
  14. )
  15. var (
  16. tagAllowList = map[string][]string{
  17. "a": {"href", "title", "id"},
  18. "abbr": {"title"},
  19. "acronym": {"title"},
  20. "aside": {},
  21. "audio": {"src"},
  22. "blockquote": {},
  23. "b": {},
  24. "br": {},
  25. "caption": {},
  26. "cite": {},
  27. "code": {},
  28. "dd": {"id"},
  29. "del": {},
  30. "dfn": {},
  31. "dl": {"id"},
  32. "dt": {"id"},
  33. "em": {},
  34. "figcaption": {},
  35. "figure": {},
  36. "h1": {"id"},
  37. "h2": {"id"},
  38. "h3": {"id"},
  39. "h4": {"id"},
  40. "h5": {"id"},
  41. "h6": {"id"},
  42. "hr": {},
  43. "iframe": {"width", "height", "frameborder", "src", "allowfullscreen"},
  44. "img": {"alt", "title", "src", "srcset", "sizes", "width", "height", "fetchpriority", "decoding"},
  45. "ins": {},
  46. "kbd": {},
  47. "li": {"id"},
  48. "ol": {"id"},
  49. "p": {},
  50. "picture": {},
  51. "pre": {},
  52. "q": {"cite"},
  53. "rp": {},
  54. "rt": {},
  55. "rtc": {},
  56. "ruby": {},
  57. "s": {},
  58. "samp": {},
  59. "source": {"src", "type", "srcset", "sizes", "media"},
  60. "strong": {},
  61. "sub": {},
  62. "sup": {"id"},
  63. "table": {},
  64. "td": {"rowspan", "colspan"},
  65. "tfoot": {},
  66. "th": {"rowspan", "colspan"},
  67. "thead": {},
  68. "time": {"datetime"},
  69. "tr": {},
  70. "u": {},
  71. "ul": {"id"},
  72. "var": {},
  73. "video": {"poster", "height", "width", "src"},
  74. "wbr": {},
  75. // MathML: https://w3c.github.io/mathml-core/ and https://developer.mozilla.org/en-US/docs/Web/MathML/Reference/Element
  76. "annotation": {},
  77. "annotation-xml": {},
  78. "maction": {},
  79. "math": {"xmlns"},
  80. "merror": {},
  81. "mfrac": {},
  82. "mi": {},
  83. "mmultiscripts": {},
  84. "mn": {},
  85. "mo": {},
  86. "mover": {},
  87. "mpadded": {},
  88. "mphantom": {},
  89. "mprescripts": {},
  90. "mroot": {},
  91. "mrow": {},
  92. "ms": {},
  93. "mspace": {},
  94. "msqrt": {},
  95. "mstyle": {},
  96. "msub": {},
  97. "msubsup": {},
  98. "msup": {},
  99. "mtable": {},
  100. "mtd": {},
  101. "mtext": {},
  102. "mtr": {},
  103. "munder": {},
  104. "munderover": {},
  105. "semantics": {},
  106. }
  107. )
  108. type SanitizerOptions struct {
  109. OpenLinksInNewTab bool
  110. }
  111. func SanitizeHTMLWithDefaultOptions(baseURL, rawHTML string) string {
  112. return SanitizeHTML(baseURL, rawHTML, &SanitizerOptions{
  113. OpenLinksInNewTab: true,
  114. })
  115. }
  116. func SanitizeHTML(baseURL, rawHTML string, sanitizerOptions *SanitizerOptions) string {
  117. var buffer strings.Builder
  118. var tagStack []string
  119. var parentTag string
  120. var blockedStack []string
  121. tokenizer := html.NewTokenizer(strings.NewReader(rawHTML))
  122. for {
  123. if tokenizer.Next() == html.ErrorToken {
  124. err := tokenizer.Err()
  125. if err == io.EOF {
  126. return buffer.String()
  127. }
  128. return ""
  129. }
  130. token := tokenizer.Token()
  131. // Note: MathML elements are not fully supported by golang.org/x/net/html.
  132. // See https://github.com/golang/net/blob/master/html/atom/gen.go
  133. // and https://github.com/golang/net/blob/master/html/atom/table.go
  134. tagName := token.Data
  135. if tagName == "" {
  136. continue
  137. }
  138. switch token.Type {
  139. case html.TextToken:
  140. if len(blockedStack) > 0 {
  141. continue
  142. }
  143. // An iframe element never has fallback content.
  144. // See https://www.w3.org/TR/2010/WD-html5-20101019/the-iframe-element.html#the-iframe-element
  145. if parentTag == "iframe" {
  146. continue
  147. }
  148. buffer.WriteString(token.String())
  149. case html.StartTagToken:
  150. parentTag = tagName
  151. if isPixelTracker(tagName, token.Attr) {
  152. continue
  153. }
  154. if isBlockedTag(tagName) || slices.ContainsFunc(token.Attr, func(attr html.Attribute) bool { return attr.Key == "hidden" }) {
  155. blockedStack = append(blockedStack, tagName)
  156. continue
  157. }
  158. if len(blockedStack) == 0 && isValidTag(tagName) {
  159. attrNames, htmlAttributes := sanitizeAttributes(baseURL, tagName, token.Attr, sanitizerOptions)
  160. if hasRequiredAttributes(tagName, attrNames) {
  161. if len(attrNames) > 0 {
  162. // Rewrite the start tag with allowed attributes.
  163. buffer.WriteString("<" + tagName + " " + htmlAttributes + ">")
  164. } else {
  165. // Rewrite the start tag without any attributes.
  166. buffer.WriteString("<" + tagName + ">")
  167. }
  168. tagStack = append(tagStack, tagName)
  169. }
  170. }
  171. case html.EndTagToken:
  172. if len(blockedStack) == 0 {
  173. if isValidTag(tagName) && slices.Contains(tagStack, tagName) {
  174. buffer.WriteString("</" + tagName + ">")
  175. }
  176. } else {
  177. if blockedStack[len(blockedStack)-1] == tagName {
  178. blockedStack = blockedStack[:len(blockedStack)-1]
  179. }
  180. }
  181. case html.SelfClosingTagToken:
  182. if isPixelTracker(tagName, token.Attr) {
  183. continue
  184. }
  185. if len(blockedStack) == 0 && isValidTag(tagName) {
  186. attrNames, htmlAttributes := sanitizeAttributes(baseURL, tagName, token.Attr, sanitizerOptions)
  187. if hasRequiredAttributes(tagName, attrNames) {
  188. if len(attrNames) > 0 {
  189. buffer.WriteString("<" + tagName + " " + htmlAttributes + "/>")
  190. } else {
  191. buffer.WriteString("<" + tagName + "/>")
  192. }
  193. }
  194. }
  195. }
  196. }
  197. }
  198. func sanitizeAttributes(baseURL, tagName string, attributes []html.Attribute, sanitizerOptions *SanitizerOptions) ([]string, string) {
  199. var htmlAttrs, attrNames []string
  200. var err error
  201. var isImageLargerThanLayout bool
  202. var isAnchorLink bool
  203. if tagName == "img" {
  204. imgWidth := getIntegerAttributeValue("width", attributes)
  205. isImageLargerThanLayout = imgWidth > 750
  206. }
  207. parsedBaseUrl, _ := url.Parse(baseURL)
  208. for _, attribute := range attributes {
  209. value := attribute.Val
  210. if !isValidAttribute(tagName, attribute.Key) {
  211. continue
  212. }
  213. if tagName == "math" && attribute.Key == "xmlns" && value != "http://www.w3.org/1998/Math/MathML" {
  214. value = "http://www.w3.org/1998/Math/MathML"
  215. }
  216. if tagName == "img" && attribute.Key == "fetchpriority" {
  217. if !isValidFetchPriorityValue(value) {
  218. continue
  219. }
  220. }
  221. if tagName == "img" && attribute.Key == "decoding" {
  222. if !isValidDecodingValue(value) {
  223. continue
  224. }
  225. }
  226. if (tagName == "img" || tagName == "source") && attribute.Key == "srcset" {
  227. value = sanitizeSrcsetAttr(baseURL, value)
  228. }
  229. if tagName == "img" && (attribute.Key == "width" || attribute.Key == "height") {
  230. if isImageLargerThanLayout || !isPositiveInteger(value) {
  231. continue
  232. }
  233. }
  234. if isExternalResourceAttribute(attribute.Key) {
  235. switch {
  236. case tagName == "iframe":
  237. if !isValidIframeSource(baseURL, attribute.Val) {
  238. continue
  239. }
  240. value = rewriteIframeURL(attribute.Val)
  241. case tagName == "img" && attribute.Key == "src" && isValidDataAttribute(attribute.Val):
  242. value = attribute.Val
  243. case tagName == "a" && attribute.Key == "href" && strings.HasPrefix(attribute.Val, "#"):
  244. value = attribute.Val
  245. isAnchorLink = true
  246. default:
  247. value, err = urllib.AbsoluteURL(baseURL, value)
  248. if err != nil {
  249. continue
  250. }
  251. if !hasValidURIScheme(value) || isBlockedResource(value) {
  252. continue
  253. }
  254. // TODO use feedURL instead of baseURL twice.
  255. parsedValueUrl, _ := url.Parse(value)
  256. if cleanedURL, err := urlcleaner.RemoveTrackingParameters(parsedBaseUrl, parsedBaseUrl, parsedValueUrl); err == nil {
  257. value = cleanedURL
  258. }
  259. }
  260. }
  261. attrNames = append(attrNames, attribute.Key)
  262. htmlAttrs = append(htmlAttrs, attribute.Key+`="`+html.EscapeString(value)+`"`)
  263. }
  264. if !isAnchorLink {
  265. extraAttrNames, extraHTMLAttributes := getExtraAttributes(tagName, sanitizerOptions)
  266. if len(extraAttrNames) > 0 {
  267. attrNames = append(attrNames, extraAttrNames...)
  268. htmlAttrs = append(htmlAttrs, extraHTMLAttributes...)
  269. }
  270. }
  271. return attrNames, strings.Join(htmlAttrs, " ")
  272. }
  273. func getExtraAttributes(tagName string, sanitizerOptions *SanitizerOptions) ([]string, []string) {
  274. switch tagName {
  275. case "a":
  276. attributeNames := []string{"rel", "referrerpolicy"}
  277. htmlAttributes := []string{`rel="noopener noreferrer"`, `referrerpolicy="no-referrer"`}
  278. if sanitizerOptions.OpenLinksInNewTab {
  279. attributeNames = append(attributeNames, "target")
  280. htmlAttributes = append(htmlAttributes, `target="_blank"`)
  281. }
  282. return attributeNames, htmlAttributes
  283. case "video", "audio":
  284. return []string{"controls"}, []string{"controls"}
  285. case "iframe":
  286. return []string{"sandbox", "loading"}, []string{`sandbox="allow-scripts allow-same-origin allow-popups allow-popups-to-escape-sandbox"`, `loading="lazy"`}
  287. case "img":
  288. return []string{"loading"}, []string{`loading="lazy"`}
  289. default:
  290. return nil, nil
  291. }
  292. }
  293. func isValidTag(tagName string) bool {
  294. _, ok := tagAllowList[tagName]
  295. return ok
  296. }
  297. func isValidAttribute(tagName, attributeName string) bool {
  298. if attributes, ok := tagAllowList[tagName]; ok {
  299. return slices.Contains(attributes, attributeName)
  300. }
  301. return false
  302. }
  303. func isExternalResourceAttribute(attribute string) bool {
  304. switch attribute {
  305. case "src", "href", "poster", "cite":
  306. return true
  307. default:
  308. return false
  309. }
  310. }
  311. func isPixelTracker(tagName string, attributes []html.Attribute) bool {
  312. if tagName != "img" {
  313. return false
  314. }
  315. hasHeight := false
  316. hasWidth := false
  317. for _, attribute := range attributes {
  318. if attribute.Val == "1" {
  319. switch attribute.Key {
  320. case "height":
  321. hasHeight = true
  322. case "width":
  323. hasWidth = true
  324. }
  325. }
  326. }
  327. return hasHeight && hasWidth
  328. }
  329. func hasRequiredAttributes(tagName string, attributes []string) bool {
  330. switch tagName {
  331. case "a":
  332. return slices.Contains(attributes, "href")
  333. case "iframe":
  334. return slices.Contains(attributes, "src")
  335. case "source", "img":
  336. return slices.Contains(attributes, "src") || slices.Contains(attributes, "srcset")
  337. default:
  338. return true
  339. }
  340. }
  341. // See https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml
  342. func hasValidURIScheme(src string) bool {
  343. whitelist := []string{
  344. "apt:",
  345. "bitcoin:",
  346. "callto:",
  347. "dav:",
  348. "davs:",
  349. "ed2k://",
  350. "facetime://",
  351. "feed:",
  352. "ftp://",
  353. "geo:",
  354. "gopher://",
  355. "git://",
  356. "http://",
  357. "https://",
  358. "irc://",
  359. "irc6://",
  360. "ircs://",
  361. "itms://",
  362. "itms-apps://",
  363. "magnet:",
  364. "mailto:",
  365. "news:",
  366. "nntp:",
  367. "rtmp://",
  368. "sip:",
  369. "sips:",
  370. "skype:",
  371. "spotify:",
  372. "ssh://",
  373. "sftp://",
  374. "steam://",
  375. "svn://",
  376. "svn+ssh://",
  377. "tel:",
  378. "webcal://",
  379. "xmpp:",
  380. // iOS Apps
  381. "opener://", // https://www.opener.link
  382. "hack://", // https://apps.apple.com/it/app/hack-for-hacker-news-reader/id1464477788?l=en-GB
  383. }
  384. return slices.ContainsFunc(whitelist, func(prefix string) bool {
  385. return strings.HasPrefix(src, prefix)
  386. })
  387. }
  388. func isBlockedResource(src string) bool {
  389. blacklist := []string{
  390. "feedsportal.com",
  391. "api.flattr.com",
  392. "stats.wordpress.com",
  393. "twitter.com/share",
  394. "feeds.feedburner.com",
  395. }
  396. return slices.ContainsFunc(blacklist, func(element string) bool {
  397. return strings.Contains(src, element)
  398. })
  399. }
  400. func isValidIframeSource(baseURL, src string) bool {
  401. whitelist := []string{
  402. "bandcamp.com",
  403. "cdn.embedly.com",
  404. "player.bilibili.com",
  405. "player.twitch.tv",
  406. "player.vimeo.com",
  407. "soundcloud.com",
  408. "vk.com",
  409. "w.soundcloud.com",
  410. "dailymotion.com",
  411. "youtube-nocookie.com",
  412. "youtube.com",
  413. "open.spotify.com",
  414. }
  415. domain := urllib.Domain(src)
  416. // allow iframe from same origin
  417. if urllib.Domain(baseURL) == domain {
  418. return true
  419. }
  420. // allow iframe from custom invidious instance
  421. if config.Opts.InvidiousInstance() == domain {
  422. return true
  423. }
  424. return slices.Contains(whitelist, strings.TrimPrefix(domain, "www."))
  425. }
  426. func rewriteIframeURL(link string) string {
  427. u, err := url.Parse(link)
  428. if err != nil {
  429. return link
  430. }
  431. switch strings.TrimPrefix(u.Hostname(), "www.") {
  432. case "youtube.com":
  433. if strings.HasPrefix(u.Path, "/embed/") {
  434. if len(u.RawQuery) > 0 {
  435. return config.Opts.YouTubeEmbedUrlOverride() + strings.TrimPrefix(u.Path, "/embed/") + "?" + u.RawQuery
  436. }
  437. return config.Opts.YouTubeEmbedUrlOverride() + strings.TrimPrefix(u.Path, "/embed/")
  438. }
  439. case "player.vimeo.com":
  440. // See https://help.vimeo.com/hc/en-us/articles/12426260232977-About-Player-parameters
  441. if strings.HasPrefix(u.Path, "/video/") {
  442. if len(u.RawQuery) > 0 {
  443. return link + "&dnt=1"
  444. }
  445. return link + "?dnt=1"
  446. }
  447. }
  448. return link
  449. }
  450. func isBlockedTag(tagName string) bool {
  451. blacklist := []string{
  452. "noscript",
  453. "script",
  454. "style",
  455. }
  456. return slices.Contains(blacklist, tagName)
  457. }
  458. func sanitizeSrcsetAttr(baseURL, value string) string {
  459. imageCandidates := ParseSrcSetAttribute(value)
  460. for _, imageCandidate := range imageCandidates {
  461. if absoluteURL, err := urllib.AbsoluteURL(baseURL, imageCandidate.ImageURL); err == nil {
  462. imageCandidate.ImageURL = absoluteURL
  463. }
  464. }
  465. return imageCandidates.String()
  466. }
  467. func isValidDataAttribute(value string) bool {
  468. var dataAttributeAllowList = []string{
  469. "data:image/avif",
  470. "data:image/apng",
  471. "data:image/png",
  472. "data:image/svg",
  473. "data:image/svg+xml",
  474. "data:image/jpg",
  475. "data:image/jpeg",
  476. "data:image/gif",
  477. "data:image/webp",
  478. }
  479. return slices.ContainsFunc(dataAttributeAllowList, func(prefix string) bool {
  480. return strings.HasPrefix(value, prefix)
  481. })
  482. }
  483. func isPositiveInteger(value string) bool {
  484. if value == "" {
  485. return false
  486. }
  487. if number, err := strconv.Atoi(value); err == nil {
  488. return number > 0
  489. }
  490. return false
  491. }
  492. func getIntegerAttributeValue(name string, attributes []html.Attribute) int {
  493. for _, attribute := range attributes {
  494. if attribute.Key == name {
  495. number, _ := strconv.Atoi(attribute.Val)
  496. return number
  497. }
  498. }
  499. return 0
  500. }
  501. func isValidFetchPriorityValue(value string) bool {
  502. allowedValues := []string{"high", "low", "auto"}
  503. return slices.Contains(allowedValues, value)
  504. }
  505. func isValidDecodingValue(value string) bool {
  506. allowedValues := []string{"sync", "async", "auto"}
  507. return slices.Contains(allowedValues, value)
  508. }