sanitizer.go 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485
  1. // SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
  2. // SPDX-License-Identifier: Apache-2.0
  3. package sanitizer // import "miniflux.app/v2/internal/reader/sanitizer"
  4. import (
  5. "io"
  6. "net/url"
  7. "slices"
  8. "strconv"
  9. "strings"
  10. "miniflux.app/v2/internal/config"
  11. "miniflux.app/v2/internal/reader/urlcleaner"
  12. "miniflux.app/v2/internal/urllib"
  13. "golang.org/x/net/html"
  14. )
  15. var (
  16. tagAllowList = map[string][]string{
  17. "a": {"href", "title", "id"},
  18. "abbr": {"title"},
  19. "acronym": {"title"},
  20. "aside": {},
  21. "audio": {"src"},
  22. "blockquote": {},
  23. "b": {},
  24. "br": {},
  25. "caption": {},
  26. "cite": {},
  27. "code": {},
  28. "dd": {"id"},
  29. "del": {},
  30. "dfn": {},
  31. "dl": {"id"},
  32. "dt": {"id"},
  33. "em": {},
  34. "figcaption": {},
  35. "figure": {},
  36. "h1": {"id"},
  37. "h2": {"id"},
  38. "h3": {"id"},
  39. "h4": {"id"},
  40. "h5": {"id"},
  41. "h6": {"id"},
  42. "hr": {},
  43. "iframe": {"width", "height", "frameborder", "src", "allowfullscreen"},
  44. "img": {"alt", "title", "src", "srcset", "sizes", "width", "height"},
  45. "ins": {},
  46. "kbd": {},
  47. "li": {"id"},
  48. "ol": {"id"},
  49. "p": {},
  50. "picture": {},
  51. "pre": {},
  52. "q": {"cite"},
  53. "rp": {},
  54. "rt": {},
  55. "rtc": {},
  56. "ruby": {},
  57. "s": {},
  58. "samp": {},
  59. "source": {"src", "type", "srcset", "sizes", "media"},
  60. "strong": {},
  61. "sub": {},
  62. "sup": {"id"},
  63. "table": {},
  64. "td": {"rowspan", "colspan"},
  65. "tfoot": {},
  66. "th": {"rowspan", "colspan"},
  67. "thead": {},
  68. "time": {"datetime"},
  69. "tr": {},
  70. "u": {},
  71. "ul": {"id"},
  72. "var": {},
  73. "video": {"poster", "height", "width", "src"},
  74. "wbr": {},
  75. }
  76. )
  77. // Sanitize returns safe HTML.
  78. func Sanitize(baseURL, input string) string {
  79. var buffer strings.Builder
  80. var tagStack []string
  81. var parentTag string
  82. var blockedStack []string
  83. tokenizer := html.NewTokenizer(strings.NewReader(input))
  84. for {
  85. if tokenizer.Next() == html.ErrorToken {
  86. err := tokenizer.Err()
  87. if err == io.EOF {
  88. return buffer.String()
  89. }
  90. return ""
  91. }
  92. token := tokenizer.Token()
  93. tagName := token.DataAtom.String()
  94. switch token.Type {
  95. case html.TextToken:
  96. if len(blockedStack) > 0 {
  97. continue
  98. }
  99. // An iframe element never has fallback content.
  100. // See https://www.w3.org/TR/2010/WD-html5-20101019/the-iframe-element.html#the-iframe-element
  101. if parentTag == "iframe" {
  102. continue
  103. }
  104. buffer.WriteString(token.String())
  105. case html.StartTagToken:
  106. parentTag = tagName
  107. if isPixelTracker(tagName, token.Attr) {
  108. continue
  109. }
  110. if isBlockedTag(tagName) || slices.ContainsFunc(token.Attr, func(attr html.Attribute) bool { return attr.Key == "hidden" }) {
  111. blockedStack = append(blockedStack, tagName)
  112. continue
  113. }
  114. if len(blockedStack) == 0 && isValidTag(tagName) {
  115. attrNames, htmlAttributes := sanitizeAttributes(baseURL, tagName, token.Attr)
  116. if hasRequiredAttributes(tagName, attrNames) {
  117. if len(attrNames) > 0 {
  118. // Rewrite the start tag with allowed attributes.
  119. buffer.WriteString("<" + tagName + " " + htmlAttributes + ">")
  120. } else {
  121. // Rewrite the start tag without any attributes.
  122. buffer.WriteString("<" + tagName + ">")
  123. }
  124. tagStack = append(tagStack, tagName)
  125. }
  126. }
  127. case html.EndTagToken:
  128. if len(blockedStack) == 0 {
  129. if isValidTag(tagName) && slices.Contains(tagStack, tagName) {
  130. buffer.WriteString("</" + tagName + ">")
  131. }
  132. } else {
  133. if blockedStack[len(blockedStack)-1] == tagName {
  134. blockedStack = blockedStack[:len(blockedStack)-1]
  135. }
  136. }
  137. case html.SelfClosingTagToken:
  138. if isPixelTracker(tagName, token.Attr) {
  139. continue
  140. }
  141. if len(blockedStack) == 0 && isValidTag(tagName) {
  142. attrNames, htmlAttributes := sanitizeAttributes(baseURL, tagName, token.Attr)
  143. if hasRequiredAttributes(tagName, attrNames) {
  144. if len(attrNames) > 0 {
  145. buffer.WriteString("<" + tagName + " " + htmlAttributes + "/>")
  146. } else {
  147. buffer.WriteString("<" + tagName + "/>")
  148. }
  149. }
  150. }
  151. }
  152. }
  153. }
  154. func sanitizeAttributes(baseURL, tagName string, attributes []html.Attribute) ([]string, string) {
  155. var htmlAttrs, attrNames []string
  156. var err error
  157. var isImageLargerThanLayout bool
  158. var isAnchorLink bool
  159. if tagName == "img" {
  160. imgWidth := getIntegerAttributeValue("width", attributes)
  161. isImageLargerThanLayout = imgWidth > 750
  162. }
  163. for _, attribute := range attributes {
  164. value := attribute.Val
  165. if !isValidAttribute(tagName, attribute.Key) {
  166. continue
  167. }
  168. if (tagName == "img" || tagName == "source") && attribute.Key == "srcset" {
  169. value = sanitizeSrcsetAttr(baseURL, value)
  170. }
  171. if tagName == "img" && (attribute.Key == "width" || attribute.Key == "height") {
  172. if isImageLargerThanLayout || !isPositiveInteger(value) {
  173. continue
  174. }
  175. }
  176. if isExternalResourceAttribute(attribute.Key) {
  177. switch {
  178. case tagName == "iframe":
  179. if !isValidIframeSource(baseURL, attribute.Val) {
  180. continue
  181. }
  182. value = rewriteIframeURL(attribute.Val)
  183. case tagName == "img" && attribute.Key == "src" && isValidDataAttribute(attribute.Val):
  184. value = attribute.Val
  185. case tagName == "a" && attribute.Key == "href" && strings.HasPrefix(attribute.Val, "#"):
  186. value = attribute.Val
  187. isAnchorLink = true
  188. default:
  189. value, err = urllib.AbsoluteURL(baseURL, value)
  190. if err != nil {
  191. continue
  192. }
  193. if !hasValidURIScheme(value) || isBlockedResource(value) {
  194. continue
  195. }
  196. if cleanedURL, err := urlcleaner.RemoveTrackingParameters(value); err == nil {
  197. value = cleanedURL
  198. }
  199. }
  200. }
  201. attrNames = append(attrNames, attribute.Key)
  202. htmlAttrs = append(htmlAttrs, attribute.Key+`="`+html.EscapeString(value)+`"`)
  203. }
  204. if !isAnchorLink {
  205. extraAttrNames, extraHTMLAttributes := getExtraAttributes(tagName)
  206. if len(extraAttrNames) > 0 {
  207. attrNames = append(attrNames, extraAttrNames...)
  208. htmlAttrs = append(htmlAttrs, extraHTMLAttributes...)
  209. }
  210. }
  211. return attrNames, strings.Join(htmlAttrs, " ")
  212. }
  213. func getExtraAttributes(tagName string) ([]string, []string) {
  214. switch tagName {
  215. case "a":
  216. return []string{"rel", "target", "referrerpolicy"}, []string{`rel="noopener noreferrer"`, `target="_blank"`, `referrerpolicy="no-referrer"`}
  217. case "video", "audio":
  218. return []string{"controls"}, []string{"controls"}
  219. case "iframe":
  220. return []string{"sandbox", "loading"}, []string{`sandbox="allow-scripts allow-same-origin allow-popups allow-popups-to-escape-sandbox"`, `loading="lazy"`}
  221. case "img":
  222. return []string{"loading"}, []string{`loading="lazy"`}
  223. default:
  224. return nil, nil
  225. }
  226. }
  227. func isValidTag(tagName string) bool {
  228. _, ok := tagAllowList[tagName]
  229. return ok
  230. }
  231. func isValidAttribute(tagName, attributeName string) bool {
  232. if attributes, ok := tagAllowList[tagName]; ok {
  233. return slices.Contains(attributes, attributeName)
  234. }
  235. return false
  236. }
  237. func isExternalResourceAttribute(attribute string) bool {
  238. switch attribute {
  239. case "src", "href", "poster", "cite":
  240. return true
  241. default:
  242. return false
  243. }
  244. }
  245. func isPixelTracker(tagName string, attributes []html.Attribute) bool {
  246. if tagName != "img" {
  247. return false
  248. }
  249. hasHeight := false
  250. hasWidth := false
  251. for _, attribute := range attributes {
  252. if attribute.Val == "1" {
  253. switch attribute.Key {
  254. case "height":
  255. hasHeight = true
  256. case "width":
  257. hasWidth = true
  258. }
  259. }
  260. }
  261. return hasHeight && hasWidth
  262. }
  263. func hasRequiredAttributes(tagName string, attributes []string) bool {
  264. switch tagName {
  265. case "a":
  266. return slices.Contains(attributes, "href")
  267. case "iframe":
  268. return slices.Contains(attributes, "src")
  269. case "source", "img":
  270. return slices.Contains(attributes, "src") || slices.Contains(attributes, "srcset")
  271. default:
  272. return true
  273. }
  274. }
  275. // See https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml
  276. func hasValidURIScheme(src string) bool {
  277. whitelist := []string{
  278. "apt:",
  279. "bitcoin:",
  280. "callto:",
  281. "dav:",
  282. "davs:",
  283. "ed2k://",
  284. "facetime://",
  285. "feed:",
  286. "ftp://",
  287. "geo:",
  288. "gopher://",
  289. "git://",
  290. "http://",
  291. "https://",
  292. "irc://",
  293. "irc6://",
  294. "ircs://",
  295. "itms://",
  296. "itms-apps://",
  297. "magnet:",
  298. "mailto:",
  299. "news:",
  300. "nntp:",
  301. "rtmp://",
  302. "sip:",
  303. "sips:",
  304. "skype:",
  305. "spotify:",
  306. "ssh://",
  307. "sftp://",
  308. "steam://",
  309. "svn://",
  310. "svn+ssh://",
  311. "tel:",
  312. "webcal://",
  313. "xmpp:",
  314. // iOS Apps
  315. "opener://", // https://www.opener.link
  316. "hack://", // https://apps.apple.com/it/app/hack-for-hacker-news-reader/id1464477788?l=en-GB
  317. }
  318. return slices.ContainsFunc(whitelist, func(prefix string) bool {
  319. return strings.HasPrefix(src, prefix)
  320. })
  321. }
  322. func isBlockedResource(src string) bool {
  323. blacklist := []string{
  324. "feedsportal.com",
  325. "api.flattr.com",
  326. "stats.wordpress.com",
  327. "twitter.com/share",
  328. "feeds.feedburner.com",
  329. }
  330. return slices.ContainsFunc(blacklist, func(element string) bool {
  331. return strings.Contains(src, element)
  332. })
  333. }
  334. func isValidIframeSource(baseURL, src string) bool {
  335. whitelist := []string{
  336. "bandcamp.com",
  337. "cdn.embedly.com",
  338. "player.bilibili.com",
  339. "player.twitch.tv",
  340. "player.vimeo.com",
  341. "soundcloud.com",
  342. "vk.com",
  343. "w.soundcloud.com",
  344. "dailymotion.com",
  345. "youtube-nocookie.com",
  346. "youtube.com",
  347. }
  348. domain := urllib.Domain(src)
  349. // allow iframe from same origin
  350. if urllib.Domain(baseURL) == domain {
  351. return true
  352. }
  353. // allow iframe from custom invidious instance
  354. if config.Opts.InvidiousInstance() == domain {
  355. return true
  356. }
  357. return slices.Contains(whitelist, strings.TrimPrefix(domain, "www."))
  358. }
  359. func rewriteIframeURL(link string) string {
  360. u, err := url.Parse(link)
  361. if err != nil {
  362. return link
  363. }
  364. switch strings.TrimPrefix(u.Hostname(), "www.") {
  365. case "youtube.com":
  366. if strings.HasPrefix(u.Path, "/embed/") {
  367. if len(u.RawQuery) > 0 {
  368. return config.Opts.YouTubeEmbedUrlOverride() + strings.TrimPrefix(u.Path, "/embed/") + "?" + u.RawQuery
  369. }
  370. return config.Opts.YouTubeEmbedUrlOverride() + strings.TrimPrefix(u.Path, "/embed/")
  371. }
  372. case "player.vimeo.com":
  373. // See https://help.vimeo.com/hc/en-us/articles/12426260232977-About-Player-parameters
  374. if strings.HasPrefix(u.Path, "/video/") {
  375. if len(u.RawQuery) > 0 {
  376. return link + "&dnt=1"
  377. }
  378. return link + "?dnt=1"
  379. }
  380. }
  381. return link
  382. }
  383. func isBlockedTag(tagName string) bool {
  384. blacklist := []string{
  385. "noscript",
  386. "script",
  387. "style",
  388. }
  389. return slices.Contains(blacklist, tagName)
  390. }
  391. func sanitizeSrcsetAttr(baseURL, value string) string {
  392. imageCandidates := ParseSrcSetAttribute(value)
  393. for _, imageCandidate := range imageCandidates {
  394. if absoluteURL, err := urllib.AbsoluteURL(baseURL, imageCandidate.ImageURL); err == nil {
  395. imageCandidate.ImageURL = absoluteURL
  396. }
  397. }
  398. return imageCandidates.String()
  399. }
  400. func isValidDataAttribute(value string) bool {
  401. var dataAttributeAllowList = []string{
  402. "data:image/avif",
  403. "data:image/apng",
  404. "data:image/png",
  405. "data:image/svg",
  406. "data:image/svg+xml",
  407. "data:image/jpg",
  408. "data:image/jpeg",
  409. "data:image/gif",
  410. "data:image/webp",
  411. }
  412. return slices.ContainsFunc(dataAttributeAllowList, func(prefix string) bool {
  413. return strings.HasPrefix(value, prefix)
  414. })
  415. }
  416. func isPositiveInteger(value string) bool {
  417. if number, err := strconv.Atoi(value); err == nil {
  418. return number > 0
  419. }
  420. return false
  421. }
  422. func getIntegerAttributeValue(name string, attributes []html.Attribute) int {
  423. for _, attribute := range attributes {
  424. if attribute.Key == name {
  425. number, _ := strconv.Atoi(attribute.Val)
  426. return number
  427. }
  428. }
  429. return 0
  430. }