sanitizer.go 9.5 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417
  1. // Copyright 2017 Frédéric Guillot. All rights reserved.
  2. // Use of this source code is governed by the Apache 2.0
  3. // license that can be found in the LICENSE file.
  4. package sanitizer // import "miniflux.app/reader/sanitizer"
  5. import (
  6. "bytes"
  7. "fmt"
  8. "io"
  9. "regexp"
  10. "strings"
  11. "miniflux.app/url"
  12. "golang.org/x/net/html"
  13. )
  // youtubeEmbedRegex captures, in its only group, everything that follows
  // "//www.youtube.com/embed/" in a URL.
  var youtubeEmbedRegex = regexp.MustCompile(`//www\.youtube\.com/embed/(.*)`)
  17. // Sanitize returns safe HTML.
  18. func Sanitize(baseURL, input string) string {
  19. tokenizer := html.NewTokenizer(bytes.NewBufferString(input))
  20. var buffer bytes.Buffer
  21. var tagStack []string
  22. blacklistedTagDepth := 0
  23. for {
  24. if tokenizer.Next() == html.ErrorToken {
  25. err := tokenizer.Err()
  26. if err == io.EOF {
  27. return buffer.String()
  28. }
  29. return ""
  30. }
  31. token := tokenizer.Token()
  32. switch token.Type {
  33. case html.TextToken:
  34. if blacklistedTagDepth > 0 {
  35. continue
  36. }
  37. buffer.WriteString(html.EscapeString(token.Data))
  38. case html.StartTagToken:
  39. tagName := token.DataAtom.String()
  40. if !isPixelTracker(tagName, token.Attr) && isValidTag(tagName) {
  41. attrNames, htmlAttributes := sanitizeAttributes(baseURL, tagName, token.Attr)
  42. if hasRequiredAttributes(tagName, attrNames) {
  43. if len(attrNames) > 0 {
  44. buffer.WriteString("<" + tagName + " " + htmlAttributes + ">")
  45. } else {
  46. buffer.WriteString("<" + tagName + ">")
  47. }
  48. tagStack = append(tagStack, tagName)
  49. }
  50. } else if isBlacklistedTag(tagName) {
  51. blacklistedTagDepth++
  52. }
  53. case html.EndTagToken:
  54. tagName := token.DataAtom.String()
  55. if isValidTag(tagName) && inList(tagName, tagStack) {
  56. buffer.WriteString(fmt.Sprintf("</%s>", tagName))
  57. } else if isBlacklistedTag(tagName) {
  58. blacklistedTagDepth--
  59. }
  60. case html.SelfClosingTagToken:
  61. tagName := token.DataAtom.String()
  62. if !isPixelTracker(tagName, token.Attr) && isValidTag(tagName) {
  63. attrNames, htmlAttributes := sanitizeAttributes(baseURL, tagName, token.Attr)
  64. if hasRequiredAttributes(tagName, attrNames) {
  65. if len(attrNames) > 0 {
  66. buffer.WriteString("<" + tagName + " " + htmlAttributes + "/>")
  67. } else {
  68. buffer.WriteString("<" + tagName + "/>")
  69. }
  70. }
  71. }
  72. }
  73. }
  74. }
  75. func sanitizeAttributes(baseURL, tagName string, attributes []html.Attribute) ([]string, string) {
  76. var htmlAttrs, attrNames []string
  77. var err error
  78. for _, attribute := range attributes {
  79. value := attribute.Val
  80. if !isValidAttribute(tagName, attribute.Key) {
  81. continue
  82. }
  83. if isExternalResourceAttribute(attribute.Key) {
  84. if tagName == "iframe" {
  85. if isValidIframeSource(attribute.Val) {
  86. value = rewriteIframeURL(attribute.Val)
  87. } else {
  88. continue
  89. }
  90. } else {
  91. value, err = url.AbsoluteURL(baseURL, value)
  92. if err != nil {
  93. continue
  94. }
  95. if !hasValidScheme(value) || isBlacklistedResource(value) {
  96. continue
  97. }
  98. }
  99. }
  100. attrNames = append(attrNames, attribute.Key)
  101. htmlAttrs = append(htmlAttrs, fmt.Sprintf(`%s="%s"`, attribute.Key, html.EscapeString(value)))
  102. }
  103. extraAttrNames, extraHTMLAttributes := getExtraAttributes(tagName)
  104. if len(extraAttrNames) > 0 {
  105. attrNames = append(attrNames, extraAttrNames...)
  106. htmlAttrs = append(htmlAttrs, extraHTMLAttributes...)
  107. }
  108. return attrNames, strings.Join(htmlAttrs, " ")
  109. }
  // getExtraAttributes returns the attributes that are always added to tagName:
  // first the attribute names, then the matching serialized attributes.
  // Both results are nil for tags that receive no extra attribute.
  func getExtraAttributes(tagName string) ([]string, []string) {
  	if tagName == "a" {
  		names := []string{"rel", "target", "referrerpolicy"}
  		rendered := []string{`rel="noopener noreferrer"`, `target="_blank"`, `referrerpolicy="no-referrer"`}
  		return names, rendered
  	}

  	if tagName == "video" || tagName == "audio" {
  		return []string{"controls"}, []string{"controls"}
  	}

  	if tagName == "iframe" {
  		names := []string{"sandbox", "loading"}
  		rendered := []string{`sandbox="allow-scripts allow-same-origin allow-popups"`, `loading="lazy"`}
  		return names, rendered
  	}

  	if tagName == "img" {
  		return []string{"loading"}, []string{`loading="lazy"`}
  	}

  	return nil, nil
  }
  124. func isValidTag(tagName string) bool {
  125. for element := range getTagWhitelist() {
  126. if tagName == element {
  127. return true
  128. }
  129. }
  130. return false
  131. }
  132. func isValidAttribute(tagName, attributeName string) bool {
  133. for element, attributes := range getTagWhitelist() {
  134. if tagName == element {
  135. if inList(attributeName, attributes) {
  136. return true
  137. }
  138. }
  139. }
  140. return false
  141. }
  // isExternalResourceAttribute reports whether attribute is one of
  // "src", "href", "poster" or "cite".
  func isExternalResourceAttribute(attribute string) bool {
  	return attribute == "src" ||
  		attribute == "href" ||
  		attribute == "poster" ||
  		attribute == "cite"
  }
  150. func isPixelTracker(tagName string, attributes []html.Attribute) bool {
  151. if tagName == "img" {
  152. hasHeight := false
  153. hasWidth := false
  154. for _, attribute := range attributes {
  155. if attribute.Key == "height" && attribute.Val == "1" {
  156. hasHeight = true
  157. }
  158. if attribute.Key == "width" && attribute.Val == "1" {
  159. hasWidth = true
  160. }
  161. }
  162. return hasHeight && hasWidth
  163. }
  164. return false
  165. }
  // hasRequiredAttributes reports whether attributes contains the attribute
  // that tagName cannot be emitted without: "href" for a, "src" for iframe,
  // img and source. Tags with no such requirement always pass.
  func hasRequiredAttributes(tagName string, attributes []string) bool {
  	// A switch avoids allocating and scanning a map on every call.
  	var required string
  	switch tagName {
  	case "a":
  		required = "href"
  	case "iframe", "img", "source":
  		required = "src"
  	default:
  		return true
  	}

  	for _, attribute := range attributes {
  		if attribute == required {
  			return true
  		}
  	}

  	return false
  }
  // hasValidScheme reports whether src starts with one of the allowed URI
  // schemes.
  //
  // Entries end with ":" rather than "://" because several of these schemes
  // (mailto, tel, magnet, sms, geo, xmpp, ...) are written without an
  // authority part, e.g. "mailto:user@example.org"; requiring "//" rejected
  // every such link. The comparison is case-sensitive.
  func hasValidScheme(src string) bool {
  	// See https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml
  	whitelist := []string{
  		"apt:",
  		"bitcoin:",
  		"callto:",
  		"ed2k:",
  		"facetime:",
  		"feed:",
  		"ftp:",
  		"geo:",
  		"gopher:",
  		"git:",
  		"http:",
  		"https:",
  		"irc:",
  		"irc6:",
  		"ircs:",
  		"itms:",
  		"jabber:",
  		"magnet:",
  		"mailto:",
  		"maps:",
  		"news:",
  		"nfs:",
  		"nntp:",
  		"rtmp:",
  		"sip:",
  		"sips:",
  		"skype:",
  		"smb:",
  		"sms:",
  		"spotify:",
  		"ssh:",
  		"sftp:",
  		"steam:",
  		"svn:",
  		"tel:",
  		"webcal:",
  		"xmpp:",
  	}

  	for _, prefix := range whitelist {
  		if strings.HasPrefix(src, prefix) {
  			return true
  		}
  	}

  	return false
  }
  // isBlacklistedResource reports whether src contains, anywhere in the
  // string, one of the blocked host or path fragments.
  func isBlacklistedResource(src string) bool {
  	blocked := [...]string{
  		"feedsportal.com",
  		"api.flattr.com",
  		"stats.wordpress.com",
  		"plus.google.com/share",
  		"twitter.com/share",
  		"feeds.feedburner.com",
  	}

  	for i := 0; i < len(blocked); i++ {
  		if strings.Contains(src, blocked[i]) {
  			return true
  		}
  	}

  	return false
  }
  // isValidIframeSource reports whether src points at one of the allowed
  // iframe origins.
  //
  // A whitelist entry must be followed by the end of the string or by "/",
  // "?" or "#". A bare prefix match would also accept look-alike hosts such as
  // "https://www.youtube.com.evil.example/" or userinfo tricks such as
  // "https://vk.com@evil.example/". URLs with an explicit port are therefore
  // not accepted.
  func isValidIframeSource(src string) bool {
  	whitelist := []string{
  		"//www.youtube.com",
  		"http://www.youtube.com",
  		"https://www.youtube.com",
  		"https://www.youtube-nocookie.com",
  		"http://player.vimeo.com",
  		"https://player.vimeo.com",
  		"http://www.dailymotion.com",
  		"https://www.dailymotion.com",
  		"http://vk.com",
  		"https://vk.com",
  		"http://soundcloud.com",
  		"https://soundcloud.com",
  		"http://w.soundcloud.com",
  		"https://w.soundcloud.com",
  		"http://bandcamp.com",
  		"https://bandcamp.com",
  		"https://cdn.embedly.com",
  	}

  	for _, origin := range whitelist {
  		if !strings.HasPrefix(src, origin) {
  			continue
  		}

  		// The host must end exactly where the whitelist entry ends.
  		rest := src[len(origin):]
  		if rest == "" || rest[0] == '/' || rest[0] == '?' || rest[0] == '#' {
  			return true
  		}
  	}

  	return false
  }
  // getTagWhitelist returns a freshly built map from each allowed tag name to
  // the attribute names allowed on that tag. Tags mapped to an empty list are
  // allowed without any attribute of their own.
  func getTagWhitelist() map[string][]string {
  	return map[string][]string{
  		"img":        {"alt", "title", "src"},
  		"audio":      {"src"},
  		"video":      {"poster", "height", "width", "src"},
  		"source":     {"src", "type"},
  		"dt":         {},
  		"dd":         {},
  		"dl":         {},
  		"table":      {},
  		"caption":    {},
  		"thead":      {},
  		// The table footer element is "tfoot"; the previous "tfooter" key is
  		// not an HTML tag name, so footers were always stripped.
  		"tfoot":      {},
  		"tr":         {},
  		"td":         {"rowspan", "colspan"},
  		"th":         {"rowspan", "colspan"},
  		"h1":         {},
  		"h2":         {},
  		"h3":         {},
  		"h4":         {},
  		"h5":         {},
  		"h6":         {},
  		"strong":     {},
  		"em":         {},
  		"code":       {},
  		"pre":        {},
  		"blockquote": {},
  		"q":          {"cite"},
  		"p":          {},
  		"ul":         {},
  		"li":         {},
  		"ol":         {},
  		"br":         {},
  		"del":        {},
  		"a":          {"href", "title"},
  		"figure":     {},
  		"figcaption": {},
  		"cite":       {},
  		"time":       {"datetime"},
  		"abbr":       {"title"},
  		"acronym":    {"title"},
  		"wbr":        {},
  		"dfn":        {},
  		"sub":        {},
  		"sup":        {},
  		"var":        {},
  		"samp":       {},
  		"s":          {},
  		"ins":        {},
  		"kbd":        {},
  		"rp":         {},
  		"rt":         {},
  		"rtc":        {},
  		"ruby":       {},
  		"iframe":     {"width", "height", "frameborder", "src", "allowfullscreen"},
  	}
  }
  // inList reports whether needle is equal to one of the elements of haystack.
  func inList(needle string, haystack []string) bool {
  	for i := 0; i < len(haystack); i++ {
  		if haystack[i] == needle {
  			return true
  		}
  	}

  	return false
  }
  343. func rewriteIframeURL(link string) string {
  344. matches := youtubeEmbedRegex.FindStringSubmatch(link)
  345. if len(matches) == 2 {
  346. return `https://www.youtube-nocookie.com/embed/` + matches[1]
  347. }
  348. return link
  349. }
  // isBlacklistedTag reports whether tagName is "noscript", "script" or "style".
  // Blacklisted tags remove the tag and all descendants.
  func isBlacklistedTag(tagName string) bool {
  	switch tagName {
  	case "noscript", "script", "style":
  		return true
  	}

  	return false
  }
  363. }