sanitizer.go 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477
  1. // SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
  2. // SPDX-License-Identifier: Apache-2.0
  3. package sanitizer // import "miniflux.app/v2/internal/reader/sanitizer"
  4. import (
  5. "fmt"
  6. "io"
  7. "regexp"
  8. "slices"
  9. "strconv"
  10. "strings"
  11. "miniflux.app/v2/internal/config"
  12. "miniflux.app/v2/internal/urllib"
  13. "golang.org/x/net/html"
  14. )
  15. var (
  16. youtubeEmbedRegex = regexp.MustCompile(`//(?:www\.)?youtube\.com/embed/(.+)$`)
  17. tagAllowList = map[string][]string{
  18. "a": {"href", "title", "id"},
  19. "abbr": {"title"},
  20. "acronym": {"title"},
  21. "audio": {"src"},
  22. "blockquote": {},
  23. "br": {},
  24. "caption": {},
  25. "cite": {},
  26. "code": {},
  27. "dd": {"id"},
  28. "del": {},
  29. "dfn": {},
  30. "dl": {"id"},
  31. "dt": {"id"},
  32. "em": {},
  33. "figcaption": {},
  34. "figure": {},
  35. "h1": {"id"},
  36. "h2": {"id"},
  37. "h3": {"id"},
  38. "h4": {"id"},
  39. "h5": {"id"},
  40. "h6": {"id"},
  41. "iframe": {"width", "height", "frameborder", "src", "allowfullscreen"},
  42. "img": {"alt", "title", "src", "srcset", "sizes", "width", "height"},
  43. "ins": {},
  44. "kbd": {},
  45. "li": {"id"},
  46. "ol": {"id"},
  47. "p": {},
  48. "picture": {},
  49. "pre": {},
  50. "q": {"cite"},
  51. "rp": {},
  52. "rt": {},
  53. "rtc": {},
  54. "ruby": {},
  55. "s": {},
  56. "samp": {},
  57. "source": {"src", "type", "srcset", "sizes", "media"},
  58. "strong": {},
  59. "sub": {},
  60. "sup": {"id"},
  61. "table": {},
  62. "td": {"rowspan", "colspan"},
  63. "tfooter": {},
  64. "th": {"rowspan", "colspan"},
  65. "thead": {},
  66. "time": {"datetime"},
  67. "tr": {},
  68. "ul": {"id"},
  69. "var": {},
  70. "video": {"poster", "height", "width", "src"},
  71. "wbr": {},
  72. }
  73. )
  74. // Sanitize returns safe HTML.
  75. func Sanitize(baseURL, input string) string {
  76. var buffer strings.Builder
  77. var tagStack []string
  78. var parentTag string
  79. var blockedStack []string
  80. tokenizer := html.NewTokenizer(strings.NewReader(input))
  81. for {
  82. if tokenizer.Next() == html.ErrorToken {
  83. err := tokenizer.Err()
  84. if err == io.EOF {
  85. return buffer.String()
  86. }
  87. return ""
  88. }
  89. token := tokenizer.Token()
  90. switch token.Type {
  91. case html.TextToken:
  92. if len(blockedStack) > 0 {
  93. continue
  94. }
  95. // An iframe element never has fallback content.
  96. // See https://www.w3.org/TR/2010/WD-html5-20101019/the-iframe-element.html#the-iframe-element
  97. if parentTag == "iframe" {
  98. continue
  99. }
  100. buffer.WriteString(html.EscapeString(token.Data))
  101. case html.StartTagToken:
  102. tagName := token.DataAtom.String()
  103. parentTag = tagName
  104. if isPixelTracker(tagName, token.Attr) {
  105. continue
  106. }
  107. if isBlockedTag(tagName) || slices.ContainsFunc(token.Attr, func(attr html.Attribute) bool { return attr.Key == "hidden" }) {
  108. blockedStack = append(blockedStack, tagName)
  109. } else if len(blockedStack) == 0 && isValidTag(tagName) {
  110. attrNames, htmlAttributes := sanitizeAttributes(baseURL, tagName, token.Attr)
  111. if hasRequiredAttributes(tagName, attrNames) {
  112. if len(attrNames) > 0 {
  113. buffer.WriteString("<" + tagName + " " + htmlAttributes + ">")
  114. } else {
  115. buffer.WriteString("<" + tagName + ">")
  116. }
  117. tagStack = append(tagStack, tagName)
  118. }
  119. }
  120. case html.EndTagToken:
  121. tagName := token.DataAtom.String()
  122. if len(blockedStack) > 0 && blockedStack[len(blockedStack)-1] == tagName {
  123. blockedStack = blockedStack[:len(blockedStack)-1]
  124. } else if len(blockedStack) == 0 && isValidTag(tagName) && slices.Contains(tagStack, tagName) {
  125. buffer.WriteString("</" + tagName + ">")
  126. }
  127. case html.SelfClosingTagToken:
  128. tagName := token.DataAtom.String()
  129. if isPixelTracker(tagName, token.Attr) {
  130. continue
  131. }
  132. if isValidTag(tagName) && len(blockedStack) == 0 {
  133. attrNames, htmlAttributes := sanitizeAttributes(baseURL, tagName, token.Attr)
  134. if hasRequiredAttributes(tagName, attrNames) {
  135. if len(attrNames) > 0 {
  136. buffer.WriteString("<" + tagName + " " + htmlAttributes + "/>")
  137. } else {
  138. buffer.WriteString("<" + tagName + "/>")
  139. }
  140. }
  141. }
  142. }
  143. }
  144. }
  145. func sanitizeAttributes(baseURL, tagName string, attributes []html.Attribute) ([]string, string) {
  146. var htmlAttrs, attrNames []string
  147. var err error
  148. var isImageLargerThanLayout bool
  149. var isAnchorLink bool
  150. if tagName == "img" {
  151. imgWidth := getIntegerAttributeValue("width", attributes)
  152. isImageLargerThanLayout = imgWidth > 750
  153. }
  154. for _, attribute := range attributes {
  155. value := attribute.Val
  156. if !isValidAttribute(tagName, attribute.Key) {
  157. continue
  158. }
  159. if (tagName == "img" || tagName == "source") && attribute.Key == "srcset" {
  160. value = sanitizeSrcsetAttr(baseURL, value)
  161. }
  162. if tagName == "img" && (attribute.Key == "width" || attribute.Key == "height") {
  163. if !isPositiveInteger(value) {
  164. continue
  165. }
  166. if isImageLargerThanLayout {
  167. continue
  168. }
  169. }
  170. if isExternalResourceAttribute(attribute.Key) {
  171. switch {
  172. case tagName == "iframe":
  173. if !isValidIframeSource(baseURL, attribute.Val) {
  174. continue
  175. }
  176. value = rewriteIframeURL(attribute.Val)
  177. case tagName == "img" && attribute.Key == "src" && isValidDataAttribute(attribute.Val):
  178. value = attribute.Val
  179. case isAnchor("a", attribute):
  180. value = attribute.Val
  181. isAnchorLink = true
  182. default:
  183. value, err = urllib.AbsoluteURL(baseURL, value)
  184. if err != nil {
  185. continue
  186. }
  187. if !hasValidURIScheme(value) || isBlockedResource(value) {
  188. continue
  189. }
  190. }
  191. }
  192. attrNames = append(attrNames, attribute.Key)
  193. htmlAttrs = append(htmlAttrs, fmt.Sprintf(`%s=%q`, attribute.Key, html.EscapeString(value)))
  194. }
  195. if !isAnchorLink {
  196. extraAttrNames, extraHTMLAttributes := getExtraAttributes(tagName)
  197. if len(extraAttrNames) > 0 {
  198. attrNames = append(attrNames, extraAttrNames...)
  199. htmlAttrs = append(htmlAttrs, extraHTMLAttributes...)
  200. }
  201. }
  202. return attrNames, strings.Join(htmlAttrs, " ")
  203. }
  204. func getExtraAttributes(tagName string) ([]string, []string) {
  205. switch tagName {
  206. case "a":
  207. return []string{"rel", "target", "referrerpolicy"}, []string{`rel="noopener noreferrer"`, `target="_blank"`, `referrerpolicy="no-referrer"`}
  208. case "video", "audio":
  209. return []string{"controls"}, []string{"controls"}
  210. case "iframe":
  211. return []string{"sandbox", "loading"}, []string{`sandbox="allow-scripts allow-same-origin allow-popups allow-popups-to-escape-sandbox"`, `loading="lazy"`}
  212. case "img":
  213. return []string{"loading"}, []string{`loading="lazy"`}
  214. default:
  215. return nil, nil
  216. }
  217. }
  218. func isValidTag(tagName string) bool {
  219. if _, ok := tagAllowList[tagName]; ok {
  220. return true
  221. }
  222. return false
  223. }
  224. func isValidAttribute(tagName, attributeName string) bool {
  225. if attributes, ok := tagAllowList[tagName]; ok {
  226. return slices.Contains(attributes, attributeName)
  227. }
  228. return false
  229. }
  230. func isExternalResourceAttribute(attribute string) bool {
  231. switch attribute {
  232. case "src", "href", "poster", "cite":
  233. return true
  234. default:
  235. return false
  236. }
  237. }
  238. func isPixelTracker(tagName string, attributes []html.Attribute) bool {
  239. if tagName != "img" {
  240. return false
  241. }
  242. hasHeight := false
  243. hasWidth := false
  244. for _, attribute := range attributes {
  245. if attribute.Val == "1" {
  246. if attribute.Key == "height" {
  247. hasHeight = true
  248. } else if attribute.Key == "width" {
  249. hasWidth = true
  250. }
  251. }
  252. }
  253. return hasHeight && hasWidth
  254. }
  255. func hasRequiredAttributes(tagName string, attributes []string) bool {
  256. elements := map[string][]string{
  257. "a": {"href"},
  258. "iframe": {"src"},
  259. "img": {"src"},
  260. "source": {"src", "srcset"},
  261. }
  262. if attrs, ok := elements[tagName]; ok {
  263. for _, attribute := range attributes {
  264. if slices.Contains(attrs, attribute) {
  265. return true
  266. }
  267. }
  268. return false
  269. }
  270. return true
  271. }
  272. // See https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml
  273. func hasValidURIScheme(src string) bool {
  274. whitelist := []string{
  275. "apt:",
  276. "bitcoin:",
  277. "callto:",
  278. "dav:",
  279. "davs:",
  280. "ed2k://",
  281. "facetime://",
  282. "feed:",
  283. "ftp://",
  284. "geo:",
  285. "gopher://",
  286. "git://",
  287. "http://",
  288. "https://",
  289. "irc://",
  290. "irc6://",
  291. "ircs://",
  292. "itms://",
  293. "itms-apps://",
  294. "magnet:",
  295. "mailto:",
  296. "news:",
  297. "nntp:",
  298. "rtmp://",
  299. "sip:",
  300. "sips:",
  301. "skype:",
  302. "spotify:",
  303. "ssh://",
  304. "sftp://",
  305. "steam://",
  306. "svn://",
  307. "svn+ssh://",
  308. "tel:",
  309. "webcal://",
  310. "xmpp:",
  311. // iOS Apps
  312. "opener://", // https://www.opener.link
  313. "hack://", // https://apps.apple.com/it/app/hack-for-hacker-news-reader/id1464477788?l=en-GB
  314. }
  315. return slices.ContainsFunc(whitelist, func(prefix string) bool {
  316. return strings.HasPrefix(src, prefix)
  317. })
  318. }
  319. func isBlockedResource(src string) bool {
  320. blacklist := []string{
  321. "feedsportal.com",
  322. "api.flattr.com",
  323. "stats.wordpress.com",
  324. "plus.google.com/share",
  325. "twitter.com/share",
  326. "feeds.feedburner.com",
  327. }
  328. return slices.ContainsFunc(blacklist, func(element string) bool {
  329. return strings.Contains(src, element)
  330. })
  331. }
  332. func isValidIframeSource(baseURL, src string) bool {
  333. whitelist := []string{
  334. "bandcamp.com",
  335. "cdn.embedly.com",
  336. "player.bilibili.com",
  337. "player.twitch.tv",
  338. "player.vimeo.com",
  339. "soundcloud.com",
  340. "vk.com",
  341. "w.soundcloud.com",
  342. "dailymotion.com",
  343. "youtube-nocookie.com",
  344. "youtube.com",
  345. }
  346. domain := urllib.Domain(src)
  347. // allow iframe from same origin
  348. if urllib.Domain(baseURL) == domain {
  349. return true
  350. }
  351. // allow iframe from custom invidious instance
  352. if config.Opts != nil && config.Opts.InvidiousInstance() == domain {
  353. return true
  354. }
  355. return slices.Contains(whitelist, strings.TrimPrefix(domain, "www."))
  356. }
  357. func rewriteIframeURL(link string) string {
  358. matches := youtubeEmbedRegex.FindStringSubmatch(link)
  359. if len(matches) == 2 {
  360. return config.Opts.YouTubeEmbedUrlOverride() + matches[1]
  361. }
  362. return link
  363. }
  364. func isBlockedTag(tagName string) bool {
  365. blacklist := []string{
  366. "noscript",
  367. "script",
  368. "style",
  369. }
  370. return slices.Contains(blacklist, tagName)
  371. }
  372. func sanitizeSrcsetAttr(baseURL, value string) string {
  373. imageCandidates := ParseSrcSetAttribute(value)
  374. for _, imageCandidate := range imageCandidates {
  375. absoluteURL, err := urllib.AbsoluteURL(baseURL, imageCandidate.ImageURL)
  376. if err == nil {
  377. imageCandidate.ImageURL = absoluteURL
  378. }
  379. }
  380. return imageCandidates.String()
  381. }
  382. func isValidDataAttribute(value string) bool {
  383. var dataAttributeAllowList = []string{
  384. "data:image/avif",
  385. "data:image/apng",
  386. "data:image/png",
  387. "data:image/svg",
  388. "data:image/svg+xml",
  389. "data:image/jpg",
  390. "data:image/jpeg",
  391. "data:image/gif",
  392. "data:image/webp",
  393. }
  394. return slices.ContainsFunc(dataAttributeAllowList, func(prefix string) bool {
  395. return strings.HasPrefix(value, prefix)
  396. })
  397. }
  398. func isAnchor(tagName string, attribute html.Attribute) bool {
  399. return tagName == "a" && attribute.Key == "href" && strings.HasPrefix(attribute.Val, "#")
  400. }
  401. func isPositiveInteger(value string) bool {
  402. if number, err := strconv.Atoi(value); err == nil {
  403. return number > 0
  404. }
  405. return false
  406. }
  407. func getAttributeValue(name string, attributes []html.Attribute) string {
  408. for _, attribute := range attributes {
  409. if attribute.Key == name {
  410. return attribute.Val
  411. }
  412. }
  413. return ""
  414. }
  415. func getIntegerAttributeValue(name string, attributes []html.Attribute) int {
  416. number, _ := strconv.Atoi(getAttributeValue(name, attributes))
  417. return number
  418. }