sanitizer.go 8.2 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361
  1. // Copyright 2017 Frédéric Guillot. All rights reserved.
  2. // Use of this source code is governed by the Apache 2.0
  3. // license that can be found in the LICENSE file.
  4. package sanitizer
  5. import (
  6. "bytes"
  7. "fmt"
  8. "io"
  9. "strings"
  10. "github.com/miniflux/miniflux/url"
  11. "golang.org/x/net/html"
  12. )
  13. // Sanitize returns safe HTML.
  14. func Sanitize(baseURL, input string) string {
  15. tokenizer := html.NewTokenizer(bytes.NewBufferString(input))
  16. var buffer bytes.Buffer
  17. var tagStack []string
  18. for {
  19. if tokenizer.Next() == html.ErrorToken {
  20. err := tokenizer.Err()
  21. if err == io.EOF {
  22. return buffer.String()
  23. }
  24. return ""
  25. }
  26. token := tokenizer.Token()
  27. switch token.Type {
  28. case html.TextToken:
  29. buffer.WriteString(html.EscapeString(token.Data))
  30. case html.StartTagToken:
  31. tagName := token.DataAtom.String()
  32. if !isPixelTracker(tagName, token.Attr) && isValidTag(tagName) {
  33. attrNames, htmlAttributes := sanitizeAttributes(baseURL, tagName, token.Attr)
  34. if hasRequiredAttributes(tagName, attrNames) {
  35. if len(attrNames) > 0 {
  36. buffer.WriteString("<" + tagName + " " + htmlAttributes + ">")
  37. } else {
  38. buffer.WriteString("<" + tagName + ">")
  39. }
  40. tagStack = append(tagStack, tagName)
  41. }
  42. }
  43. case html.EndTagToken:
  44. tagName := token.DataAtom.String()
  45. if isValidTag(tagName) && inList(tagName, tagStack) {
  46. buffer.WriteString(fmt.Sprintf("</%s>", tagName))
  47. }
  48. case html.SelfClosingTagToken:
  49. tagName := token.DataAtom.String()
  50. if !isPixelTracker(tagName, token.Attr) && isValidTag(tagName) {
  51. attrNames, htmlAttributes := sanitizeAttributes(baseURL, tagName, token.Attr)
  52. if hasRequiredAttributes(tagName, attrNames) {
  53. if len(attrNames) > 0 {
  54. buffer.WriteString("<" + tagName + " " + htmlAttributes + "/>")
  55. } else {
  56. buffer.WriteString("<" + tagName + "/>")
  57. }
  58. }
  59. }
  60. }
  61. }
  62. }
  63. func sanitizeAttributes(baseURL, tagName string, attributes []html.Attribute) ([]string, string) {
  64. var htmlAttrs, attrNames []string
  65. var err error
  66. for _, attribute := range attributes {
  67. value := attribute.Val
  68. if !isValidAttribute(tagName, attribute.Key) {
  69. continue
  70. }
  71. if isExternalResourceAttribute(attribute.Key) {
  72. if tagName == "iframe" && !isValidIframeSource(attribute.Val) {
  73. continue
  74. } else {
  75. value, err = url.AbsoluteURL(baseURL, value)
  76. if err != nil {
  77. continue
  78. }
  79. if !hasValidScheme(value) || isBlacklistedResource(value) {
  80. continue
  81. }
  82. }
  83. }
  84. attrNames = append(attrNames, attribute.Key)
  85. htmlAttrs = append(htmlAttrs, fmt.Sprintf(`%s="%s"`, attribute.Key, html.EscapeString(value)))
  86. }
  87. extraAttrNames, extraHTMLAttributes := getExtraAttributes(tagName)
  88. if len(extraAttrNames) > 0 {
  89. attrNames = append(attrNames, extraAttrNames...)
  90. htmlAttrs = append(htmlAttrs, extraHTMLAttributes...)
  91. }
  92. return attrNames, strings.Join(htmlAttrs, " ")
  93. }
  // getExtraAttributes returns the attributes that are always added to a tag,
  // as two parallel slices: the attribute names and their rendered HTML form.
  // "a" gets rel, target and referrerpolicy; "video" and "audio" get controls.
  // Any other tag yields nil, nil.
  func getExtraAttributes(tagName string) ([]string, []string) {
  	switch tagName {
  	case "a":
  		names := []string{"rel", "target", "referrerpolicy"}
  		rendered := []string{`rel="noopener noreferrer"`, `target="_blank"`, `referrerpolicy="no-referrer"`}
  		return names, rendered
  	case "video", "audio":
  		return []string{"controls"}, []string{"controls"}
  	default:
  		return nil, nil
  	}
  }
  103. func isValidTag(tagName string) bool {
  104. for element := range getTagWhitelist() {
  105. if tagName == element {
  106. return true
  107. }
  108. }
  109. return false
  110. }
  111. func isValidAttribute(tagName, attributeName string) bool {
  112. for element, attributes := range getTagWhitelist() {
  113. if tagName == element {
  114. if inList(attributeName, attributes) {
  115. return true
  116. }
  117. }
  118. }
  119. return false
  120. }
  // isExternalResourceAttribute reports whether attribute is one of "src",
  // "href", "poster" or "cite". The comparison is exact and case-sensitive.
  func isExternalResourceAttribute(attribute string) bool {
  	return attribute == "src" ||
  		attribute == "href" ||
  		attribute == "poster" ||
  		attribute == "cite"
  }
  129. func isPixelTracker(tagName string, attributes []html.Attribute) bool {
  130. if tagName == "img" {
  131. hasHeight := false
  132. hasWidth := false
  133. for _, attribute := range attributes {
  134. if attribute.Key == "height" && attribute.Val == "1" {
  135. hasHeight = true
  136. }
  137. if attribute.Key == "width" && attribute.Val == "1" {
  138. hasWidth = true
  139. }
  140. }
  141. return hasHeight && hasWidth
  142. }
  143. return false
  144. }
  // hasRequiredAttributes reports whether attributes contains the attribute
  // that tagName cannot be rendered without: "href" for "a", and "src" for
  // "iframe", "img" and "source". Tags without such a requirement always
  // yield true.
  //
  // The requirement is chosen with a switch rather than by building a map and
  // iterating over it on every call.
  func hasRequiredAttributes(tagName string, attributes []string) bool {
  	var required string
  	switch tagName {
  	case "a":
  		required = "href"
  	case "iframe", "img", "source":
  		required = "src"
  	default:
  		return true
  	}

  	for _, attribute := range attributes {
  		if attribute == required {
  			return true
  		}
  	}
  	return false
  }
  // hasValidScheme reports whether src starts with one of the allowed URI
  // schemes. The match is a case-sensitive prefix test.
  //
  // Schemes whose URIs carry no authority component (for example
  // "mailto:user@example.org", "tel:+15551234567" or "magnet:?xt=...") are
  // listed with a bare "scheme:" prefix; requiring "://" for them rejected
  // every real-world link of that kind. The bare prefix still accepts the
  // "scheme://" spelling, so anything accepted before is still accepted.
  func hasValidScheme(src string) bool {
  	// See https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml
  	whitelist := []string{
  		"apt:",
  		"bitcoin:",
  		"callto:",
  		"ed2k://",
  		"facetime://",
  		"feed://",
  		"ftp://",
  		"geo:",
  		"gopher://",
  		"git://",
  		"http://",
  		"https://",
  		"irc://",
  		"irc6://",
  		"ircs://",
  		"itms://",
  		"jabber://",
  		"magnet:",
  		"mailto:",
  		"maps://",
  		"news:",
  		"nfs://",
  		"nntp://",
  		"rtmp://",
  		"sip:",
  		"sips:",
  		"skype:",
  		"smb://",
  		"sms:",
  		"spotify:",
  		"ssh://",
  		"sftp://",
  		"steam://",
  		"svn://",
  		"tel:",
  		"webcal://",
  		"xmpp:",
  	}

  	for _, prefix := range whitelist {
  		if strings.HasPrefix(src, prefix) {
  			return true
  		}
  	}
  	return false
  }
  // isBlacklistedResource reports whether src contains, anywhere in the
  // string, one of the blocked host or host/path fragments listed below.
  func isBlacklistedResource(src string) bool {
  	for _, fragment := range [...]string{
  		"feedsportal.com",
  		"api.flattr.com",
  		"stats.wordpress.com",
  		"plus.google.com/share",
  		"twitter.com/share",
  		"feeds.feedburner.com",
  	} {
  		if strings.Contains(src, fragment) {
  			return true
  		}
  	}
  	return false
  }
  // isValidIframeSource reports whether src points at one of the allowed
  // iframe origins.
  //
  // A bare prefix match is not enough: "https://www.youtube.com.evil.example/"
  // and "https://www.youtube.com@evil.example/" both start with an allowed
  // origin while pointing at another host. The origin must therefore be
  // followed by the end of the string or by the start of a path, query or
  // fragment.
  func isValidIframeSource(src string) bool {
  	whitelist := []string{
  		"http://www.youtube.com",
  		"https://www.youtube.com",
  		"http://player.vimeo.com",
  		"https://player.vimeo.com",
  		"http://www.dailymotion.com",
  		"https://www.dailymotion.com",
  		"http://vk.com",
  		"https://vk.com",
  	}

  	for _, origin := range whitelist {
  		if !strings.HasPrefix(src, origin) {
  			continue
  		}

  		rest := src[len(origin):]
  		if rest == "" {
  			return true
  		}

  		// Anything else right after the host (".", "@", ":", "-", ...)
  		// would extend or replace the host name.
  		switch rest[0] {
  		case '/', '?', '#':
  			return true
  		}
  	}
  	return false
  }
  // getTagWhitelist returns a freshly built map from each allowed tag name to
  // the attribute names allowed on that tag. A tag mapped to an empty list is
  // allowed but may not keep any of its own attributes.
  //
  // The table section elements are spelled "tbody" and "tfoot"; the former
  // "tfooter" key named no HTML element, so table footers were never kept.
  func getTagWhitelist() map[string][]string {
  	return map[string][]string{
  		"img":        {"alt", "title", "src"},
  		"audio":      {"src"},
  		"video":      {"poster", "height", "width", "src"},
  		"source":     {"src", "type"},
  		"dt":         {},
  		"dd":         {},
  		"dl":         {},
  		"table":      {},
  		"caption":    {},
  		"thead":      {},
  		"tbody":      {},
  		"tfoot":      {},
  		"tr":         {},
  		"td":         {"rowspan", "colspan"},
  		"th":         {"rowspan", "colspan"},
  		"h1":         {},
  		"h2":         {},
  		"h3":         {},
  		"h4":         {},
  		"h5":         {},
  		"h6":         {},
  		"strong":     {},
  		"em":         {},
  		"code":       {},
  		"pre":        {},
  		"blockquote": {},
  		"q":          {"cite"},
  		"p":          {},
  		"ul":         {},
  		"li":         {},
  		"ol":         {},
  		"br":         {},
  		"del":        {},
  		"a":          {"href", "title"},
  		"figure":     {},
  		"figcaption": {},
  		"cite":       {},
  		"time":       {"datetime"},
  		"abbr":       {"title"},
  		"acronym":    {"title"},
  		"wbr":        {},
  		"dfn":        {},
  		"sub":        {},
  		"sup":        {},
  		"var":        {},
  		"samp":       {},
  		"s":          {},
  		"ins":        {},
  		"kbd":        {},
  		"rp":         {},
  		"rt":         {},
  		"rtc":        {},
  		"ruby":       {},
  		"iframe":     {"width", "height", "frameborder", "src", "allowfullscreen"},
  	}
  }
  // inList reports whether haystack contains an element equal to needle.
  // A nil or empty haystack yields false.
  func inList(needle string, haystack []string) bool {
  	for i := range haystack {
  		if needle == haystack[i] {
  			return true
  		}
  	}
  	return false
  }