sanitizer.go 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508
  1. // Copyright 2017 Frédéric Guillot. All rights reserved.
  2. // Use of this source code is governed by the Apache 2.0
  3. // license that can be found in the LICENSE file.
  4. package sanitizer // import "miniflux.app/reader/sanitizer"
  5. import (
  6. "bytes"
  7. "fmt"
  8. "io"
  9. "regexp"
  10. "strconv"
  11. "strings"
  12. "miniflux.app/url"
  13. "golang.org/x/net/html"
  14. )
  15. var (
  16. youtubeEmbedRegex = regexp.MustCompile(`//www\.youtube\.com/embed/(.*)`)
  17. splitSrcsetRegex = regexp.MustCompile(`,\s?`)
  18. )
  19. // Sanitize returns safe HTML.
  20. func Sanitize(baseURL, input string) string {
  21. var buffer bytes.Buffer
  22. var tagStack []string
  23. var parentTag string
  24. blacklistedTagDepth := 0
  25. tokenizer := html.NewTokenizer(bytes.NewBufferString(input))
  26. for {
  27. if tokenizer.Next() == html.ErrorToken {
  28. err := tokenizer.Err()
  29. if err == io.EOF {
  30. return buffer.String()
  31. }
  32. return ""
  33. }
  34. token := tokenizer.Token()
  35. switch token.Type {
  36. case html.TextToken:
  37. if blacklistedTagDepth > 0 {
  38. continue
  39. }
  40. // An iframe element never has fallback content.
  41. // See https://www.w3.org/TR/2010/WD-html5-20101019/the-iframe-element.html#the-iframe-element
  42. if parentTag == "iframe" {
  43. continue
  44. }
  45. buffer.WriteString(html.EscapeString(token.Data))
  46. case html.StartTagToken:
  47. tagName := token.DataAtom.String()
  48. parentTag = tagName
  49. if !isPixelTracker(tagName, token.Attr) && isValidTag(tagName) {
  50. attrNames, htmlAttributes := sanitizeAttributes(baseURL, tagName, token.Attr)
  51. if hasRequiredAttributes(tagName, attrNames) {
  52. if len(attrNames) > 0 {
  53. buffer.WriteString("<" + tagName + " " + htmlAttributes + ">")
  54. } else {
  55. buffer.WriteString("<" + tagName + ">")
  56. }
  57. tagStack = append(tagStack, tagName)
  58. }
  59. } else if isBlockedTag(tagName) {
  60. blacklistedTagDepth++
  61. }
  62. case html.EndTagToken:
  63. tagName := token.DataAtom.String()
  64. if isValidTag(tagName) && inList(tagName, tagStack) {
  65. buffer.WriteString(fmt.Sprintf("</%s>", tagName))
  66. } else if isBlockedTag(tagName) {
  67. blacklistedTagDepth--
  68. }
  69. case html.SelfClosingTagToken:
  70. tagName := token.DataAtom.String()
  71. if !isPixelTracker(tagName, token.Attr) && isValidTag(tagName) {
  72. attrNames, htmlAttributes := sanitizeAttributes(baseURL, tagName, token.Attr)
  73. if hasRequiredAttributes(tagName, attrNames) {
  74. if len(attrNames) > 0 {
  75. buffer.WriteString("<" + tagName + " " + htmlAttributes + "/>")
  76. } else {
  77. buffer.WriteString("<" + tagName + "/>")
  78. }
  79. }
  80. }
  81. }
  82. }
  83. }
  84. func sanitizeAttributes(baseURL, tagName string, attributes []html.Attribute) ([]string, string) {
  85. var htmlAttrs, attrNames []string
  86. var err error
  87. for _, attribute := range attributes {
  88. value := attribute.Val
  89. if !isValidAttribute(tagName, attribute.Key) {
  90. continue
  91. }
  92. if (tagName == "img" || tagName == "source") && attribute.Key == "srcset" {
  93. value = sanitizeSrcsetAttr(baseURL, value)
  94. }
  95. if isExternalResourceAttribute(attribute.Key) {
  96. if tagName == "iframe" {
  97. if isValidIframeSource(baseURL, attribute.Val) {
  98. value = rewriteIframeURL(attribute.Val)
  99. } else {
  100. continue
  101. }
  102. } else if tagName == "img" && attribute.Key == "src" && isValidDataAttribute(attribute.Val) {
  103. value = attribute.Val
  104. } else {
  105. value, err = url.AbsoluteURL(baseURL, value)
  106. if err != nil {
  107. continue
  108. }
  109. if !hasValidURIScheme(value) || isBlockedResource(value) {
  110. continue
  111. }
  112. }
  113. }
  114. attrNames = append(attrNames, attribute.Key)
  115. htmlAttrs = append(htmlAttrs, fmt.Sprintf(`%s="%s"`, attribute.Key, html.EscapeString(value)))
  116. }
  117. extraAttrNames, extraHTMLAttributes := getExtraAttributes(tagName)
  118. if len(extraAttrNames) > 0 {
  119. attrNames = append(attrNames, extraAttrNames...)
  120. htmlAttrs = append(htmlAttrs, extraHTMLAttributes...)
  121. }
  122. return attrNames, strings.Join(htmlAttrs, " ")
  123. }
  124. func getExtraAttributes(tagName string) ([]string, []string) {
  125. switch tagName {
  126. case "a":
  127. return []string{"rel", "target", "referrerpolicy"}, []string{`rel="noopener noreferrer"`, `target="_blank"`, `referrerpolicy="no-referrer"`}
  128. case "video", "audio":
  129. return []string{"controls"}, []string{"controls"}
  130. case "iframe":
  131. return []string{"sandbox", "loading"}, []string{`sandbox="allow-scripts allow-same-origin allow-popups"`, `loading="lazy"`}
  132. case "img":
  133. return []string{"loading"}, []string{`loading="lazy"`}
  134. default:
  135. return nil, nil
  136. }
  137. }
  138. func isValidTag(tagName string) bool {
  139. for element := range getTagAllowList() {
  140. if tagName == element {
  141. return true
  142. }
  143. }
  144. return false
  145. }
  146. func isValidAttribute(tagName, attributeName string) bool {
  147. for element, attributes := range getTagAllowList() {
  148. if tagName == element {
  149. if inList(attributeName, attributes) {
  150. return true
  151. }
  152. }
  153. }
  154. return false
  155. }
  156. func isExternalResourceAttribute(attribute string) bool {
  157. switch attribute {
  158. case "src", "href", "poster", "cite":
  159. return true
  160. default:
  161. return false
  162. }
  163. }
  164. func isPixelTracker(tagName string, attributes []html.Attribute) bool {
  165. if tagName == "img" {
  166. hasHeight := false
  167. hasWidth := false
  168. for _, attribute := range attributes {
  169. if attribute.Key == "height" && attribute.Val == "1" {
  170. hasHeight = true
  171. }
  172. if attribute.Key == "width" && attribute.Val == "1" {
  173. hasWidth = true
  174. }
  175. }
  176. return hasHeight && hasWidth
  177. }
  178. return false
  179. }
  180. func hasRequiredAttributes(tagName string, attributes []string) bool {
  181. elements := make(map[string][]string)
  182. elements["a"] = []string{"href"}
  183. elements["iframe"] = []string{"src"}
  184. elements["img"] = []string{"src"}
  185. elements["source"] = []string{"src", "srcset"}
  186. for element, attrs := range elements {
  187. if tagName == element {
  188. for _, attribute := range attributes {
  189. for _, attr := range attrs {
  190. if attr == attribute {
  191. return true
  192. }
  193. }
  194. }
  195. return false
  196. }
  197. }
  198. return true
  199. }
  200. // See https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml
  201. func hasValidURIScheme(src string) bool {
  202. whitelist := []string{
  203. "apt:",
  204. "bitcoin:",
  205. "callto:",
  206. "dav:",
  207. "davs:",
  208. "ed2k://",
  209. "facetime://",
  210. "feed:",
  211. "ftp://",
  212. "geo:",
  213. "gopher://",
  214. "git://",
  215. "http://",
  216. "https://",
  217. "irc://",
  218. "irc6://",
  219. "ircs://",
  220. "itms://",
  221. "itms-apps://",
  222. "magnet:",
  223. "mailto:",
  224. "news:",
  225. "nntp:",
  226. "rtmp://",
  227. "sip:",
  228. "sips:",
  229. "skype:",
  230. "spotify:",
  231. "ssh://",
  232. "sftp://",
  233. "steam://",
  234. "svn://",
  235. "svn+ssh://",
  236. "tel:",
  237. "webcal://",
  238. "xmpp:",
  239. }
  240. for _, prefix := range whitelist {
  241. if strings.HasPrefix(src, prefix) {
  242. return true
  243. }
  244. }
  245. return false
  246. }
  247. func isBlockedResource(src string) bool {
  248. blacklist := []string{
  249. "feedsportal.com",
  250. "api.flattr.com",
  251. "stats.wordpress.com",
  252. "plus.google.com/share",
  253. "twitter.com/share",
  254. "feeds.feedburner.com",
  255. }
  256. for _, element := range blacklist {
  257. if strings.Contains(src, element) {
  258. return true
  259. }
  260. }
  261. return false
  262. }
  263. func isValidIframeSource(baseURL, src string) bool {
  264. whitelist := []string{
  265. "https://invidio.us",
  266. "//www.youtube.com",
  267. "http://www.youtube.com",
  268. "https://www.youtube.com",
  269. "https://www.youtube-nocookie.com",
  270. "http://player.vimeo.com",
  271. "https://player.vimeo.com",
  272. "http://www.dailymotion.com",
  273. "https://www.dailymotion.com",
  274. "http://vk.com",
  275. "https://vk.com",
  276. "http://soundcloud.com",
  277. "https://soundcloud.com",
  278. "http://w.soundcloud.com",
  279. "https://w.soundcloud.com",
  280. "http://bandcamp.com",
  281. "https://bandcamp.com",
  282. "https://cdn.embedly.com",
  283. "https://player.bilibili.com",
  284. }
  285. // allow iframe from same origin
  286. if url.Domain(baseURL) == url.Domain(src) {
  287. return true
  288. }
  289. for _, prefix := range whitelist {
  290. if strings.HasPrefix(src, prefix) {
  291. return true
  292. }
  293. }
  294. return false
  295. }
  296. func getTagAllowList() map[string][]string {
  297. whitelist := make(map[string][]string)
  298. whitelist["img"] = []string{"alt", "title", "src", "srcset", "sizes"}
  299. whitelist["picture"] = []string{}
  300. whitelist["audio"] = []string{"src"}
  301. whitelist["video"] = []string{"poster", "height", "width", "src"}
  302. whitelist["source"] = []string{"src", "type", "srcset", "sizes", "media"}
  303. whitelist["dt"] = []string{}
  304. whitelist["dd"] = []string{}
  305. whitelist["dl"] = []string{}
  306. whitelist["table"] = []string{}
  307. whitelist["caption"] = []string{}
  308. whitelist["thead"] = []string{}
  309. whitelist["tfooter"] = []string{}
  310. whitelist["tr"] = []string{}
  311. whitelist["td"] = []string{"rowspan", "colspan"}
  312. whitelist["th"] = []string{"rowspan", "colspan"}
  313. whitelist["h1"] = []string{}
  314. whitelist["h2"] = []string{}
  315. whitelist["h3"] = []string{}
  316. whitelist["h4"] = []string{}
  317. whitelist["h5"] = []string{}
  318. whitelist["h6"] = []string{}
  319. whitelist["strong"] = []string{}
  320. whitelist["em"] = []string{}
  321. whitelist["code"] = []string{}
  322. whitelist["pre"] = []string{}
  323. whitelist["blockquote"] = []string{}
  324. whitelist["q"] = []string{"cite"}
  325. whitelist["p"] = []string{}
  326. whitelist["ul"] = []string{}
  327. whitelist["li"] = []string{}
  328. whitelist["ol"] = []string{}
  329. whitelist["br"] = []string{}
  330. whitelist["del"] = []string{}
  331. whitelist["a"] = []string{"href", "title"}
  332. whitelist["figure"] = []string{}
  333. whitelist["figcaption"] = []string{}
  334. whitelist["cite"] = []string{}
  335. whitelist["time"] = []string{"datetime"}
  336. whitelist["abbr"] = []string{"title"}
  337. whitelist["acronym"] = []string{"title"}
  338. whitelist["wbr"] = []string{}
  339. whitelist["dfn"] = []string{}
  340. whitelist["sub"] = []string{}
  341. whitelist["sup"] = []string{}
  342. whitelist["var"] = []string{}
  343. whitelist["samp"] = []string{}
  344. whitelist["s"] = []string{}
  345. whitelist["del"] = []string{}
  346. whitelist["ins"] = []string{}
  347. whitelist["kbd"] = []string{}
  348. whitelist["rp"] = []string{}
  349. whitelist["rt"] = []string{}
  350. whitelist["rtc"] = []string{}
  351. whitelist["ruby"] = []string{}
  352. whitelist["iframe"] = []string{"width", "height", "frameborder", "src", "allowfullscreen"}
  353. return whitelist
  354. }
  355. func inList(needle string, haystack []string) bool {
  356. for _, element := range haystack {
  357. if element == needle {
  358. return true
  359. }
  360. }
  361. return false
  362. }
  363. func rewriteIframeURL(link string) string {
  364. matches := youtubeEmbedRegex.FindStringSubmatch(link)
  365. if len(matches) == 2 {
  366. return `https://www.youtube-nocookie.com/embed/` + matches[1]
  367. }
  368. return link
  369. }
  370. func isBlockedTag(tagName string) bool {
  371. blacklist := []string{
  372. "noscript",
  373. "script",
  374. "style",
  375. }
  376. for _, element := range blacklist {
  377. if element == tagName {
  378. return true
  379. }
  380. }
  381. return false
  382. }
  383. /*
  384. One or more strings separated by commas, indicating possible image sources for the user agent to use.
  385. Each string is composed of:
  386. - A URL to an image
  387. - Optionally, whitespace followed by one of:
  388. - A width descriptor (a positive integer directly followed by w). The width descriptor is divided by the source size given in the sizes attribute to calculate the effective pixel density.
  389. - A pixel density descriptor (a positive floating point number directly followed by x).
  390. */
  391. func sanitizeSrcsetAttr(baseURL, value string) string {
  392. var sanitizedSources []string
  393. rawSources := splitSrcsetRegex.Split(value, -1)
  394. for _, rawSource := range rawSources {
  395. parts := strings.Split(strings.TrimSpace(rawSource), " ")
  396. nbParts := len(parts)
  397. if nbParts > 0 {
  398. sanitizedSource, err := url.AbsoluteURL(baseURL, parts[0])
  399. if err != nil {
  400. continue
  401. }
  402. if nbParts == 2 && isValidWidthOrDensityDescriptor(parts[1]) {
  403. sanitizedSource += " " + parts[1]
  404. }
  405. sanitizedSources = append(sanitizedSources, sanitizedSource)
  406. }
  407. }
  408. return strings.Join(sanitizedSources, ", ")
  409. }
  410. func isValidWidthOrDensityDescriptor(value string) bool {
  411. if value == "" {
  412. return false
  413. }
  414. lastChar := value[len(value)-1:]
  415. if lastChar != "w" && lastChar != "x" {
  416. return false
  417. }
  418. _, err := strconv.ParseFloat(value[0:len(value)-1], 32)
  419. return err == nil
  420. }
  421. func isValidDataAttribute(value string) bool {
  422. var dataAttributeAllowList = []string{
  423. "data:image/avif",
  424. "data:image/apng",
  425. "data:image/png",
  426. "data:image/svg",
  427. "data:image/svg+xml",
  428. "data:image/jpg",
  429. "data:image/jpeg",
  430. "data:image/gif",
  431. "data:image/webp",
  432. }
  433. for _, prefix := range dataAttributeAllowList {
  434. if strings.HasPrefix(value, prefix) {
  435. return true
  436. }
  437. }
  438. return false
  439. }