sanitizer.go 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482
  1. // Copyright 2017 Frédéric Guillot. All rights reserved.
  2. // Use of this source code is governed by the Apache 2.0
  3. // license that can be found in the LICENSE file.
  4. package sanitizer // import "miniflux.app/reader/sanitizer"
  5. import (
  6. "bytes"
  7. "fmt"
  8. "io"
  9. "regexp"
  10. "strconv"
  11. "strings"
  12. "miniflux.app/url"
  13. "golang.org/x/net/html"
  14. )
  15. var (
  16. youtubeEmbedRegex = regexp.MustCompile(`//www\.youtube\.com/embed/(.*)`)
  17. splitSrcsetRegex = regexp.MustCompile(`,\s+`)
  18. )
  19. // Sanitize returns safe HTML.
  20. func Sanitize(baseURL, input string) string {
  21. tokenizer := html.NewTokenizer(bytes.NewBufferString(input))
  22. var buffer bytes.Buffer
  23. var tagStack []string
  24. blacklistedTagDepth := 0
  25. for {
  26. if tokenizer.Next() == html.ErrorToken {
  27. err := tokenizer.Err()
  28. if err == io.EOF {
  29. return buffer.String()
  30. }
  31. return ""
  32. }
  33. token := tokenizer.Token()
  34. switch token.Type {
  35. case html.TextToken:
  36. if blacklistedTagDepth > 0 {
  37. continue
  38. }
  39. buffer.WriteString(html.EscapeString(token.Data))
  40. case html.StartTagToken:
  41. tagName := token.DataAtom.String()
  42. if !isPixelTracker(tagName, token.Attr) && isValidTag(tagName) {
  43. attrNames, htmlAttributes := sanitizeAttributes(baseURL, tagName, token.Attr)
  44. if hasRequiredAttributes(tagName, attrNames) {
  45. if len(attrNames) > 0 {
  46. buffer.WriteString("<" + tagName + " " + htmlAttributes + ">")
  47. } else {
  48. buffer.WriteString("<" + tagName + ">")
  49. }
  50. tagStack = append(tagStack, tagName)
  51. }
  52. } else if isBlockedTag(tagName) {
  53. blacklistedTagDepth++
  54. }
  55. case html.EndTagToken:
  56. tagName := token.DataAtom.String()
  57. if isValidTag(tagName) && inList(tagName, tagStack) {
  58. buffer.WriteString(fmt.Sprintf("</%s>", tagName))
  59. } else if isBlockedTag(tagName) {
  60. blacklistedTagDepth--
  61. }
  62. case html.SelfClosingTagToken:
  63. tagName := token.DataAtom.String()
  64. if !isPixelTracker(tagName, token.Attr) && isValidTag(tagName) {
  65. attrNames, htmlAttributes := sanitizeAttributes(baseURL, tagName, token.Attr)
  66. if hasRequiredAttributes(tagName, attrNames) {
  67. if len(attrNames) > 0 {
  68. buffer.WriteString("<" + tagName + " " + htmlAttributes + "/>")
  69. } else {
  70. buffer.WriteString("<" + tagName + "/>")
  71. }
  72. }
  73. }
  74. }
  75. }
  76. }
  77. func sanitizeAttributes(baseURL, tagName string, attributes []html.Attribute) ([]string, string) {
  78. var htmlAttrs, attrNames []string
  79. var err error
  80. for _, attribute := range attributes {
  81. value := attribute.Val
  82. if !isValidAttribute(tagName, attribute.Key) {
  83. continue
  84. }
  85. if (tagName == "img" || tagName == "source") && attribute.Key == "srcset" {
  86. value = sanitizeSrcsetAttr(baseURL, value)
  87. }
  88. if isExternalResourceAttribute(attribute.Key) {
  89. if tagName == "iframe" {
  90. if isValidIframeSource(baseURL, attribute.Val) {
  91. value = rewriteIframeURL(attribute.Val)
  92. } else {
  93. continue
  94. }
  95. } else if tagName == "img" && attribute.Key == "src" && strings.HasPrefix(attribute.Val, "data:") {
  96. value = attribute.Val
  97. } else {
  98. value, err = url.AbsoluteURL(baseURL, value)
  99. if err != nil {
  100. continue
  101. }
  102. if !hasValidURIScheme(value) || isBlockedResource(value) {
  103. continue
  104. }
  105. }
  106. }
  107. attrNames = append(attrNames, attribute.Key)
  108. htmlAttrs = append(htmlAttrs, fmt.Sprintf(`%s="%s"`, attribute.Key, html.EscapeString(value)))
  109. }
  110. extraAttrNames, extraHTMLAttributes := getExtraAttributes(tagName)
  111. if len(extraAttrNames) > 0 {
  112. attrNames = append(attrNames, extraAttrNames...)
  113. htmlAttrs = append(htmlAttrs, extraHTMLAttributes...)
  114. }
  115. return attrNames, strings.Join(htmlAttrs, " ")
  116. }
  117. func getExtraAttributes(tagName string) ([]string, []string) {
  118. switch tagName {
  119. case "a":
  120. return []string{"rel", "target", "referrerpolicy"}, []string{`rel="noopener noreferrer"`, `target="_blank"`, `referrerpolicy="no-referrer"`}
  121. case "video", "audio":
  122. return []string{"controls"}, []string{"controls"}
  123. case "iframe":
  124. return []string{"sandbox", "loading"}, []string{`sandbox="allow-scripts allow-same-origin allow-popups"`, `loading="lazy"`}
  125. case "img":
  126. return []string{"loading"}, []string{`loading="lazy"`}
  127. default:
  128. return nil, nil
  129. }
  130. }
  131. func isValidTag(tagName string) bool {
  132. for element := range getTagAllowList() {
  133. if tagName == element {
  134. return true
  135. }
  136. }
  137. return false
  138. }
  139. func isValidAttribute(tagName, attributeName string) bool {
  140. for element, attributes := range getTagAllowList() {
  141. if tagName == element {
  142. if inList(attributeName, attributes) {
  143. return true
  144. }
  145. }
  146. }
  147. return false
  148. }
  149. func isExternalResourceAttribute(attribute string) bool {
  150. switch attribute {
  151. case "src", "href", "poster", "cite":
  152. return true
  153. default:
  154. return false
  155. }
  156. }
  157. func isPixelTracker(tagName string, attributes []html.Attribute) bool {
  158. if tagName == "img" {
  159. hasHeight := false
  160. hasWidth := false
  161. for _, attribute := range attributes {
  162. if attribute.Key == "height" && attribute.Val == "1" {
  163. hasHeight = true
  164. }
  165. if attribute.Key == "width" && attribute.Val == "1" {
  166. hasWidth = true
  167. }
  168. }
  169. return hasHeight && hasWidth
  170. }
  171. return false
  172. }
  173. func hasRequiredAttributes(tagName string, attributes []string) bool {
  174. elements := make(map[string][]string)
  175. elements["a"] = []string{"href"}
  176. elements["iframe"] = []string{"src"}
  177. elements["img"] = []string{"src"}
  178. elements["source"] = []string{"src", "srcset"}
  179. for element, attrs := range elements {
  180. if tagName == element {
  181. for _, attribute := range attributes {
  182. for _, attr := range attrs {
  183. if attr == attribute {
  184. return true
  185. }
  186. }
  187. }
  188. return false
  189. }
  190. }
  191. return true
  192. }
  193. // See https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml
  194. func hasValidURIScheme(src string) bool {
  195. whitelist := []string{
  196. "apt:",
  197. "bitcoin:",
  198. "callto:",
  199. "dav:",
  200. "davs:",
  201. "ed2k://",
  202. "facetime://",
  203. "feed:",
  204. "ftp://",
  205. "geo:",
  206. "gopher://",
  207. "git://",
  208. "http://",
  209. "https://",
  210. "irc://",
  211. "irc6://",
  212. "ircs://",
  213. "itms://",
  214. "itms-apps://",
  215. "magnet:",
  216. "mailto:",
  217. "news:",
  218. "nntp:",
  219. "rtmp://",
  220. "sip:",
  221. "sips:",
  222. "skype:",
  223. "spotify:",
  224. "ssh://",
  225. "sftp://",
  226. "steam://",
  227. "svn://",
  228. "svn+ssh://",
  229. "tel:",
  230. "webcal://",
  231. "xmpp:",
  232. }
  233. for _, prefix := range whitelist {
  234. if strings.HasPrefix(src, prefix) {
  235. return true
  236. }
  237. }
  238. return false
  239. }
  240. func isBlockedResource(src string) bool {
  241. blacklist := []string{
  242. "feedsportal.com",
  243. "api.flattr.com",
  244. "stats.wordpress.com",
  245. "plus.google.com/share",
  246. "twitter.com/share",
  247. "feeds.feedburner.com",
  248. }
  249. for _, element := range blacklist {
  250. if strings.Contains(src, element) {
  251. return true
  252. }
  253. }
  254. return false
  255. }
  256. func isValidIframeSource(baseURL, src string) bool {
  257. whitelist := []string{
  258. "https://invidio.us",
  259. "//www.youtube.com",
  260. "http://www.youtube.com",
  261. "https://www.youtube.com",
  262. "https://www.youtube-nocookie.com",
  263. "http://player.vimeo.com",
  264. "https://player.vimeo.com",
  265. "http://www.dailymotion.com",
  266. "https://www.dailymotion.com",
  267. "http://vk.com",
  268. "https://vk.com",
  269. "http://soundcloud.com",
  270. "https://soundcloud.com",
  271. "http://w.soundcloud.com",
  272. "https://w.soundcloud.com",
  273. "http://bandcamp.com",
  274. "https://bandcamp.com",
  275. "https://cdn.embedly.com",
  276. }
  277. // allow iframe from same origin
  278. if url.Domain(baseURL) == url.Domain(src) {
  279. return true
  280. }
  281. for _, prefix := range whitelist {
  282. if strings.HasPrefix(src, prefix) {
  283. return true
  284. }
  285. }
  286. return false
  287. }
  288. func getTagAllowList() map[string][]string {
  289. whitelist := make(map[string][]string)
  290. whitelist["img"] = []string{"alt", "title", "src", "srcset", "sizes"}
  291. whitelist["picture"] = []string{}
  292. whitelist["audio"] = []string{"src"}
  293. whitelist["video"] = []string{"poster", "height", "width", "src"}
  294. whitelist["source"] = []string{"src", "type", "srcset", "sizes", "media"}
  295. whitelist["dt"] = []string{}
  296. whitelist["dd"] = []string{}
  297. whitelist["dl"] = []string{}
  298. whitelist["table"] = []string{}
  299. whitelist["caption"] = []string{}
  300. whitelist["thead"] = []string{}
  301. whitelist["tfooter"] = []string{}
  302. whitelist["tr"] = []string{}
  303. whitelist["td"] = []string{"rowspan", "colspan"}
  304. whitelist["th"] = []string{"rowspan", "colspan"}
  305. whitelist["h1"] = []string{}
  306. whitelist["h2"] = []string{}
  307. whitelist["h3"] = []string{}
  308. whitelist["h4"] = []string{}
  309. whitelist["h5"] = []string{}
  310. whitelist["h6"] = []string{}
  311. whitelist["strong"] = []string{}
  312. whitelist["em"] = []string{}
  313. whitelist["code"] = []string{}
  314. whitelist["pre"] = []string{}
  315. whitelist["blockquote"] = []string{}
  316. whitelist["q"] = []string{"cite"}
  317. whitelist["p"] = []string{}
  318. whitelist["ul"] = []string{}
  319. whitelist["li"] = []string{}
  320. whitelist["ol"] = []string{}
  321. whitelist["br"] = []string{}
  322. whitelist["del"] = []string{}
  323. whitelist["a"] = []string{"href", "title"}
  324. whitelist["figure"] = []string{}
  325. whitelist["figcaption"] = []string{}
  326. whitelist["cite"] = []string{}
  327. whitelist["time"] = []string{"datetime"}
  328. whitelist["abbr"] = []string{"title"}
  329. whitelist["acronym"] = []string{"title"}
  330. whitelist["wbr"] = []string{}
  331. whitelist["dfn"] = []string{}
  332. whitelist["sub"] = []string{}
  333. whitelist["sup"] = []string{}
  334. whitelist["var"] = []string{}
  335. whitelist["samp"] = []string{}
  336. whitelist["s"] = []string{}
  337. whitelist["del"] = []string{}
  338. whitelist["ins"] = []string{}
  339. whitelist["kbd"] = []string{}
  340. whitelist["rp"] = []string{}
  341. whitelist["rt"] = []string{}
  342. whitelist["rtc"] = []string{}
  343. whitelist["ruby"] = []string{}
  344. whitelist["iframe"] = []string{"width", "height", "frameborder", "src", "allowfullscreen"}
  345. return whitelist
  346. }
  347. func inList(needle string, haystack []string) bool {
  348. for _, element := range haystack {
  349. if element == needle {
  350. return true
  351. }
  352. }
  353. return false
  354. }
  355. func rewriteIframeURL(link string) string {
  356. matches := youtubeEmbedRegex.FindStringSubmatch(link)
  357. if len(matches) == 2 {
  358. return `https://www.youtube-nocookie.com/embed/` + matches[1]
  359. }
  360. return link
  361. }
  362. func isBlockedTag(tagName string) bool {
  363. blacklist := []string{
  364. "noscript",
  365. "script",
  366. "style",
  367. }
  368. for _, element := range blacklist {
  369. if element == tagName {
  370. return true
  371. }
  372. }
  373. return false
  374. }
  375. /*
  376. One or more strings separated by commas, indicating possible image sources for the user agent to use.
  377. Each string is composed of:
  378. - A URL to an image
  379. - Optionally, whitespace followed by one of:
  380. - A width descriptor (a positive integer directly followed by w). The width descriptor is divided by the source size given in the sizes attribute to calculate the effective pixel density.
  381. - A pixel density descriptor (a positive floating point number directly followed by x).
  382. */
  383. func sanitizeSrcsetAttr(baseURL, value string) string {
  384. var sanitizedSources []string
  385. rawSources := splitSrcsetRegex.Split(value, -1)
  386. for _, rawSource := range rawSources {
  387. parts := strings.Split(strings.TrimSpace(rawSource), " ")
  388. nbParts := len(parts)
  389. if nbParts > 0 {
  390. sanitizedSource := parts[0]
  391. if !strings.HasPrefix(parts[0], "data:") {
  392. var err error
  393. sanitizedSource, err = url.AbsoluteURL(baseURL, parts[0])
  394. if err != nil {
  395. continue
  396. }
  397. }
  398. if nbParts == 2 && isValidWidthOrDensityDescriptor(parts[1]) {
  399. sanitizedSource += " " + parts[1]
  400. }
  401. sanitizedSources = append(sanitizedSources, sanitizedSource)
  402. }
  403. }
  404. return strings.Join(sanitizedSources, ", ")
  405. }
  406. func isValidWidthOrDensityDescriptor(value string) bool {
  407. if value == "" {
  408. return false
  409. }
  410. lastChar := value[len(value)-1:]
  411. if lastChar != "w" && lastChar != "x" {
  412. return false
  413. }
  414. _, err := strconv.ParseFloat(value[0:len(value)-1], 32)
  415. return err == nil
  416. }