sanitizer.go 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483
  1. // SPDX-FileCopyrightText: Copyright The Miniflux Authors. All rights reserved.
  2. // SPDX-License-Identifier: Apache-2.0
  3. package sanitizer // import "miniflux.app/v2/internal/reader/sanitizer"
  4. import (
  5. "fmt"
  6. "io"
  7. "regexp"
  8. "slices"
  9. "strconv"
  10. "strings"
  11. "miniflux.app/v2/internal/config"
  12. "miniflux.app/v2/internal/reader/urlcleaner"
  13. "miniflux.app/v2/internal/urllib"
  14. "golang.org/x/net/html"
  15. )
  // Package-level configuration used by the sanitizer.
  var (
  	// youtubeEmbedRegex captures everything that follows "/embed/" in a
  	// YouTube embed URL (with or without the "www." prefix).
  	youtubeEmbedRegex = regexp.MustCompile(`//(?:www\.)?youtube\.com/embed/(.+)$`)

  	// tagAllowList maps every HTML element that may be kept to the list of
  	// attributes that are preserved on that element. An empty list means the
  	// element is kept without any of its original attributes.
  	tagAllowList = map[string][]string{
  		"a":          {"href", "title", "id"},
  		"abbr":       {"title"},
  		"acronym":    {"title"},
  		"aside":      {},
  		"audio":      {"src"},
  		"blockquote": {},
  		"br":         {},
  		"caption":    {},
  		"cite":       {},
  		"code":       {},
  		"dd":         {"id"},
  		"del":        {},
  		"dfn":        {},
  		"dl":         {"id"},
  		"dt":         {"id"},
  		"em":         {},
  		"figcaption": {},
  		"figure":     {},
  		"h1":         {"id"},
  		"h2":         {"id"},
  		"h3":         {"id"},
  		"h4":         {"id"},
  		"h5":         {"id"},
  		"h6":         {"id"},
  		"iframe":     {"width", "height", "frameborder", "src", "allowfullscreen"},
  		"img":        {"alt", "title", "src", "srcset", "sizes", "width", "height"},
  		"ins":        {},
  		"kbd":        {},
  		"li":         {"id"},
  		"ol":         {"id"},
  		"p":          {},
  		"picture":    {},
  		"pre":        {},
  		"q":          {"cite"},
  		"rp":         {},
  		"rt":         {},
  		"rtc":        {},
  		"ruby":       {},
  		"s":          {},
  		"samp":       {},
  		"source":     {"src", "type", "srcset", "sizes", "media"},
  		"strong":     {},
  		"sub":        {},
  		"sup":        {"id"},
  		"table":      {},
  		"td":         {"rowspan", "colspan"},
  		// The table footer element is named "tfoot"; the previous "tfooter"
  		// key matched no HTML element, so footers were always stripped.
  		"tfoot": {},
  		"th":    {"rowspan", "colspan"},
  		"thead": {},
  		"time":  {"datetime"},
  		"tr":    {},
  		"ul":    {"id"},
  		"var":   {},
  		"video": {"poster", "height", "width", "src"},
  		"wbr":   {},
  	}
  )
  76. // Sanitize returns safe HTML.
  77. func Sanitize(baseURL, input string) string {
  78. var buffer strings.Builder
  79. var tagStack []string
  80. var parentTag string
  81. var blockedStack []string
  82. tokenizer := html.NewTokenizer(strings.NewReader(input))
  83. for {
  84. if tokenizer.Next() == html.ErrorToken {
  85. err := tokenizer.Err()
  86. if err == io.EOF {
  87. return buffer.String()
  88. }
  89. return ""
  90. }
  91. token := tokenizer.Token()
  92. switch token.Type {
  93. case html.TextToken:
  94. if len(blockedStack) > 0 {
  95. continue
  96. }
  97. // An iframe element never has fallback content.
  98. // See https://www.w3.org/TR/2010/WD-html5-20101019/the-iframe-element.html#the-iframe-element
  99. if parentTag == "iframe" {
  100. continue
  101. }
  102. buffer.WriteString(html.EscapeString(token.Data))
  103. case html.StartTagToken:
  104. tagName := token.DataAtom.String()
  105. parentTag = tagName
  106. if isPixelTracker(tagName, token.Attr) {
  107. continue
  108. }
  109. if isBlockedTag(tagName) || slices.ContainsFunc(token.Attr, func(attr html.Attribute) bool { return attr.Key == "hidden" }) {
  110. blockedStack = append(blockedStack, tagName)
  111. } else if len(blockedStack) == 0 && isValidTag(tagName) {
  112. attrNames, htmlAttributes := sanitizeAttributes(baseURL, tagName, token.Attr)
  113. if hasRequiredAttributes(tagName, attrNames) {
  114. if len(attrNames) > 0 {
  115. buffer.WriteString("<" + tagName + " " + htmlAttributes + ">")
  116. } else {
  117. buffer.WriteString("<" + tagName + ">")
  118. }
  119. tagStack = append(tagStack, tagName)
  120. }
  121. }
  122. case html.EndTagToken:
  123. tagName := token.DataAtom.String()
  124. if len(blockedStack) > 0 && blockedStack[len(blockedStack)-1] == tagName {
  125. blockedStack = blockedStack[:len(blockedStack)-1]
  126. } else if len(blockedStack) == 0 && isValidTag(tagName) && slices.Contains(tagStack, tagName) {
  127. buffer.WriteString("</" + tagName + ">")
  128. }
  129. case html.SelfClosingTagToken:
  130. tagName := token.DataAtom.String()
  131. if isPixelTracker(tagName, token.Attr) {
  132. continue
  133. }
  134. if isValidTag(tagName) && len(blockedStack) == 0 {
  135. attrNames, htmlAttributes := sanitizeAttributes(baseURL, tagName, token.Attr)
  136. if hasRequiredAttributes(tagName, attrNames) {
  137. if len(attrNames) > 0 {
  138. buffer.WriteString("<" + tagName + " " + htmlAttributes + "/>")
  139. } else {
  140. buffer.WriteString("<" + tagName + "/>")
  141. }
  142. }
  143. }
  144. }
  145. }
  146. }
  147. func sanitizeAttributes(baseURL, tagName string, attributes []html.Attribute) ([]string, string) {
  148. var htmlAttrs, attrNames []string
  149. var err error
  150. var isImageLargerThanLayout bool
  151. var isAnchorLink bool
  152. if tagName == "img" {
  153. imgWidth := getIntegerAttributeValue("width", attributes)
  154. isImageLargerThanLayout = imgWidth > 750
  155. }
  156. for _, attribute := range attributes {
  157. value := attribute.Val
  158. if !isValidAttribute(tagName, attribute.Key) {
  159. continue
  160. }
  161. if (tagName == "img" || tagName == "source") && attribute.Key == "srcset" {
  162. value = sanitizeSrcsetAttr(baseURL, value)
  163. }
  164. if tagName == "img" && (attribute.Key == "width" || attribute.Key == "height") {
  165. if !isPositiveInteger(value) {
  166. continue
  167. }
  168. if isImageLargerThanLayout {
  169. continue
  170. }
  171. }
  172. if isExternalResourceAttribute(attribute.Key) {
  173. switch {
  174. case tagName == "iframe":
  175. if !isValidIframeSource(baseURL, attribute.Val) {
  176. continue
  177. }
  178. value = rewriteIframeURL(attribute.Val)
  179. case tagName == "img" && attribute.Key == "src" && isValidDataAttribute(attribute.Val):
  180. value = attribute.Val
  181. case isAnchor("a", attribute):
  182. value = attribute.Val
  183. isAnchorLink = true
  184. default:
  185. value, err = urllib.AbsoluteURL(baseURL, value)
  186. if err != nil {
  187. continue
  188. }
  189. if !hasValidURIScheme(value) || isBlockedResource(value) {
  190. continue
  191. }
  192. if cleanedURL, err := urlcleaner.RemoveTrackingParameters(value); err == nil {
  193. value = cleanedURL
  194. }
  195. }
  196. }
  197. attrNames = append(attrNames, attribute.Key)
  198. htmlAttrs = append(htmlAttrs, fmt.Sprintf(`%s=%q`, attribute.Key, html.EscapeString(value)))
  199. }
  200. if !isAnchorLink {
  201. extraAttrNames, extraHTMLAttributes := getExtraAttributes(tagName)
  202. if len(extraAttrNames) > 0 {
  203. attrNames = append(attrNames, extraAttrNames...)
  204. htmlAttrs = append(htmlAttrs, extraHTMLAttributes...)
  205. }
  206. }
  207. return attrNames, strings.Join(htmlAttrs, " ")
  208. }
  // getExtraAttributes returns the attributes that are always appended to the
  // given tag: first the attribute names, then their serialized HTML form.
  // Both results are nil for tags that get no extra attributes.
  func getExtraAttributes(tagName string) ([]string, []string) {
  	if tagName == "a" {
  		names := []string{"rel", "target", "referrerpolicy"}
  		serialized := []string{`rel="noopener noreferrer"`, `target="_blank"`, `referrerpolicy="no-referrer"`}
  		return names, serialized
  	}

  	if tagName == "video" || tagName == "audio" {
  		return []string{"controls"}, []string{"controls"}
  	}

  	if tagName == "iframe" {
  		names := []string{"sandbox", "loading"}
  		serialized := []string{`sandbox="allow-scripts allow-same-origin allow-popups allow-popups-to-escape-sandbox"`, `loading="lazy"`}
  		return names, serialized
  	}

  	if tagName == "img" {
  		return []string{"loading"}, []string{`loading="lazy"`}
  	}

  	return nil, nil
  }
  223. func isValidTag(tagName string) bool {
  224. if _, ok := tagAllowList[tagName]; ok {
  225. return true
  226. }
  227. return false
  228. }
  229. func isValidAttribute(tagName, attributeName string) bool {
  230. if attributes, ok := tagAllowList[tagName]; ok {
  231. return slices.Contains(attributes, attributeName)
  232. }
  233. return false
  234. }
  // isExternalResourceAttribute reports whether the attribute is one of the
  // URL-carrying attributes: "src", "href", "poster" or "cite".
  func isExternalResourceAttribute(attribute string) bool {
  	return attribute == "src" ||
  		attribute == "href" ||
  		attribute == "poster" ||
  		attribute == "cite"
  }
  243. func isPixelTracker(tagName string, attributes []html.Attribute) bool {
  244. if tagName != "img" {
  245. return false
  246. }
  247. hasHeight := false
  248. hasWidth := false
  249. for _, attribute := range attributes {
  250. if attribute.Val == "1" {
  251. if attribute.Key == "height" {
  252. hasHeight = true
  253. } else if attribute.Key == "width" {
  254. hasWidth = true
  255. }
  256. }
  257. }
  258. return hasHeight && hasWidth
  259. }
  // hasRequiredAttributes reports whether the given attribute names satisfy the
  // tag's requirement. For "a", "iframe", "img" and "source" at least one of the
  // tag's required attributes must be present; every other tag has no
  // requirement.
  func hasRequiredAttributes(tagName string, attributes []string) bool {
  	var required []string

  	switch tagName {
  	case "a":
  		required = []string{"href"}
  	case "iframe", "img":
  		required = []string{"src"}
  	case "source":
  		required = []string{"src", "srcset"}
  	default:
  		return true
  	}

  	return slices.ContainsFunc(attributes, func(name string) bool {
  		return slices.Contains(required, name)
  	})
  }
  // hasValidURIScheme reports whether src starts with one of the allowed URI
  // scheme prefixes. The comparison is a case-sensitive prefix match.
  //
  // See https://www.iana.org/assignments/uri-schemes/uri-schemes.xhtml
  func hasValidURIScheme(src string) bool {
  	allowedPrefixes := []string{
  		"apt:",
  		"bitcoin:",
  		"callto:",
  		"dav:",
  		"davs:",
  		"ed2k://",
  		"facetime://",
  		"feed:",
  		"ftp://",
  		"geo:",
  		"gopher://",
  		"git://",
  		"http://",
  		"https://",
  		"irc://",
  		"irc6://",
  		"ircs://",
  		"itms://",
  		"itms-apps://",
  		"magnet:",
  		"mailto:",
  		"news:",
  		"nntp:",
  		"rtmp://",
  		"sip:",
  		"sips:",
  		"skype:",
  		"spotify:",
  		"ssh://",
  		"sftp://",
  		"steam://",
  		"svn://",
  		"svn+ssh://",
  		"tel:",
  		"webcal://",
  		"xmpp:",
  		// iOS Apps
  		"opener://", // https://www.opener.link
  		"hack://",   // https://apps.apple.com/it/app/hack-for-hacker-news-reader/id1464477788?l=en-GB
  	}

  	startsWith := func(prefix string) bool {
  		return strings.HasPrefix(src, prefix)
  	}

  	return slices.IndexFunc(allowedPrefixes, startsWith) >= 0
  }
  // isBlockedResource reports whether src contains one of the blocked
  // host/path fragments anywhere in the string.
  func isBlockedResource(src string) bool {
  	blockedFragments := []string{
  		"feedsportal.com",
  		"api.flattr.com",
  		"stats.wordpress.com",
  		"plus.google.com/share",
  		"twitter.com/share",
  		"feeds.feedburner.com",
  	}

  	foundAt := slices.IndexFunc(blockedFragments, func(fragment string) bool {
  		return strings.Contains(src, fragment)
  	})

  	return foundAt != -1
  }
  337. func isValidIframeSource(baseURL, src string) bool {
  338. whitelist := []string{
  339. "bandcamp.com",
  340. "cdn.embedly.com",
  341. "player.bilibili.com",
  342. "player.twitch.tv",
  343. "player.vimeo.com",
  344. "soundcloud.com",
  345. "vk.com",
  346. "w.soundcloud.com",
  347. "dailymotion.com",
  348. "youtube-nocookie.com",
  349. "youtube.com",
  350. }
  351. domain := urllib.Domain(src)
  352. // allow iframe from same origin
  353. if urllib.Domain(baseURL) == domain {
  354. return true
  355. }
  356. // allow iframe from custom invidious instance
  357. if config.Opts != nil && config.Opts.InvidiousInstance() == domain {
  358. return true
  359. }
  360. return slices.Contains(whitelist, strings.TrimPrefix(domain, "www."))
  361. }
  362. func rewriteIframeURL(link string) string {
  363. matches := youtubeEmbedRegex.FindStringSubmatch(link)
  364. if len(matches) == 2 {
  365. return config.Opts.YouTubeEmbedUrlOverride() + matches[1]
  366. }
  367. return link
  368. }
  // isBlockedTag reports whether the tag is "noscript", "script" or "style".
  func isBlockedTag(tagName string) bool {
  	blockedTags := []string{"noscript", "script", "style"}
  	return slices.Index(blockedTags, tagName) >= 0
  }
  377. func sanitizeSrcsetAttr(baseURL, value string) string {
  378. imageCandidates := ParseSrcSetAttribute(value)
  379. for _, imageCandidate := range imageCandidates {
  380. absoluteURL, err := urllib.AbsoluteURL(baseURL, imageCandidate.ImageURL)
  381. if err == nil {
  382. imageCandidate.ImageURL = absoluteURL
  383. }
  384. }
  385. return imageCandidates.String()
  386. }
  // isValidDataAttribute reports whether value starts with one of the allowed
  // "data:image/..." prefixes.
  func isValidDataAttribute(value string) bool {
  	allowedPrefixes := []string{
  		"data:image/avif",
  		"data:image/apng",
  		"data:image/png",
  		"data:image/svg",
  		"data:image/svg+xml",
  		"data:image/jpg",
  		"data:image/jpeg",
  		"data:image/gif",
  		"data:image/webp",
  	}

  	matchIndex := slices.IndexFunc(allowedPrefixes, func(prefix string) bool {
  		return strings.HasPrefix(value, prefix)
  	})

  	return matchIndex >= 0
  }
  403. func isAnchor(tagName string, attribute html.Attribute) bool {
  404. return tagName == "a" && attribute.Key == "href" && strings.HasPrefix(attribute.Val, "#")
  405. }
  // isPositiveInteger reports whether value parses as a base-10 integer that is
  // strictly greater than zero.
  func isPositiveInteger(value string) bool {
  	number, err := strconv.Atoi(value)
  	return err == nil && number > 0
  }
  412. func getAttributeValue(name string, attributes []html.Attribute) string {
  413. for _, attribute := range attributes {
  414. if attribute.Key == name {
  415. return attribute.Val
  416. }
  417. }
  418. return ""
  419. }
  420. func getIntegerAttributeValue(name string, attributes []html.Attribute) int {
  421. number, _ := strconv.Atoi(getAttributeValue(name, attributes))
  422. return number
  423. }