html.go 16 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467
  1. // Package html minifies HTML5 following the specifications at http://www.w3.org/TR/html5/syntax.html.
  2. package html // import "github.com/tdewolff/minify/html"
  3. import (
  4. "bytes"
  5. "io"
  6. "github.com/tdewolff/minify"
  7. "github.com/tdewolff/parse"
  8. "github.com/tdewolff/parse/buffer"
  9. "github.com/tdewolff/parse/html"
  10. )
  11. var (
  12. gtBytes = []byte(">")
  13. isBytes = []byte("=")
  14. spaceBytes = []byte(" ")
  15. doctypeBytes = []byte("<!doctype html>")
  16. jsMimeBytes = []byte("text/javascript")
  17. cssMimeBytes = []byte("text/css")
  18. htmlMimeBytes = []byte("text/html")
  19. svgMimeBytes = []byte("image/svg+xml")
  20. mathMimeBytes = []byte("application/mathml+xml")
  21. dataSchemeBytes = []byte("data:")
  22. jsSchemeBytes = []byte("javascript:")
  23. httpBytes = []byte("http")
  24. )
  25. ////////////////////////////////////////////////////////////////
  26. // DefaultMinifier is the default minifier.
  27. var DefaultMinifier = &Minifier{}
  28. // Minifier is an HTML minifier.
  29. type Minifier struct {
  30. KeepConditionalComments bool
  31. KeepDefaultAttrVals bool
  32. KeepDocumentTags bool
  33. KeepEndTags bool
  34. KeepWhitespace bool
  35. }
// Minify minifies HTML data, it reads from r and writes to w.
// It is a convenience wrapper that forwards all arguments unchanged to
// DefaultMinifier.Minify and returns its error.
func Minify(m *minify.M, w io.Writer, r io.Reader, params map[string]string) error {
	return DefaultMinifier.Minify(m, w, r, params)
}
  40. // Minify minifies HTML data, it reads from r and writes to w.
  41. func (o *Minifier) Minify(m *minify.M, w io.Writer, r io.Reader, _ map[string]string) error {
  42. var rawTagHash html.Hash
  43. var rawTagMediatype []byte
  44. omitSpace := true // if true the next leading space is omitted
  45. inPre := false
  46. defaultScriptType := jsMimeBytes
  47. defaultScriptParams := map[string]string(nil)
  48. defaultStyleType := cssMimeBytes
  49. defaultStyleParams := map[string]string(nil)
  50. defaultInlineStyleParams := map[string]string{"inline": "1"}
  51. attrMinifyBuffer := buffer.NewWriter(make([]byte, 0, 64))
  52. attrByteBuffer := make([]byte, 0, 64)
  53. l := html.NewLexer(r)
  54. defer l.Restore()
  55. tb := NewTokenBuffer(l)
  56. for {
  57. t := *tb.Shift()
  58. SWITCH:
  59. switch t.TokenType {
  60. case html.ErrorToken:
  61. if l.Err() == io.EOF {
  62. return nil
  63. }
  64. return l.Err()
  65. case html.DoctypeToken:
  66. if _, err := w.Write(doctypeBytes); err != nil {
  67. return err
  68. }
  69. case html.CommentToken:
  70. if o.KeepConditionalComments && len(t.Text) > 6 && (bytes.HasPrefix(t.Text, []byte("[if ")) || bytes.Equal(t.Text, []byte("[endif]")) || bytes.Equal(t.Text, []byte("<![endif]"))) {
  71. // [if ...] is always 7 or more characters, [endif] is only encountered for downlevel-revealed
  72. // see https://msdn.microsoft.com/en-us/library/ms537512(v=vs.85).aspx#syntax
  73. if bytes.HasPrefix(t.Data, []byte("<!--[if ")) && len(t.Data) > len("<!--[if ]><![endif]-->") { // downlevel-hidden
  74. begin := bytes.IndexByte(t.Data, '>') + 1
  75. end := len(t.Data) - len("<![endif]-->")
  76. if _, err := w.Write(t.Data[:begin]); err != nil {
  77. return err
  78. }
  79. if err := o.Minify(m, w, buffer.NewReader(t.Data[begin:end]), nil); err != nil {
  80. return err
  81. }
  82. if _, err := w.Write(t.Data[end:]); err != nil {
  83. return err
  84. }
  85. } else if _, err := w.Write(t.Data); err != nil { // downlevel-revealed or short downlevel-hidden
  86. return err
  87. }
  88. }
  89. case html.SvgToken:
  90. if err := m.MinifyMimetype(svgMimeBytes, w, buffer.NewReader(t.Data), nil); err != nil {
  91. if err != minify.ErrNotExist {
  92. return err
  93. } else if _, err := w.Write(t.Data); err != nil {
  94. return err
  95. }
  96. }
  97. case html.MathToken:
  98. if err := m.MinifyMimetype(mathMimeBytes, w, buffer.NewReader(t.Data), nil); err != nil {
  99. if err != minify.ErrNotExist {
  100. return err
  101. } else if _, err := w.Write(t.Data); err != nil {
  102. return err
  103. }
  104. }
  105. case html.TextToken:
  106. // CSS and JS minifiers for inline code
  107. if rawTagHash != 0 {
  108. if rawTagHash == html.Style || rawTagHash == html.Script || rawTagHash == html.Iframe {
  109. var mimetype []byte
  110. var params map[string]string
  111. if rawTagHash == html.Iframe {
  112. mimetype = htmlMimeBytes
  113. } else if len(rawTagMediatype) > 0 {
  114. mimetype, params = parse.Mediatype(rawTagMediatype)
  115. } else if rawTagHash == html.Script {
  116. mimetype = defaultScriptType
  117. params = defaultScriptParams
  118. } else if rawTagHash == html.Style {
  119. mimetype = defaultStyleType
  120. params = defaultStyleParams
  121. }
  122. if err := m.MinifyMimetype(mimetype, w, buffer.NewReader(t.Data), params); err != nil {
  123. if err != minify.ErrNotExist {
  124. return err
  125. } else if _, err := w.Write(t.Data); err != nil {
  126. return err
  127. }
  128. }
  129. } else if _, err := w.Write(t.Data); err != nil {
  130. return err
  131. }
  132. } else if inPre {
  133. if _, err := w.Write(t.Data); err != nil {
  134. return err
  135. }
  136. } else {
  137. t.Data = parse.ReplaceMultipleWhitespace(t.Data)
  138. // whitespace removal; trim left
  139. if omitSpace && (t.Data[0] == ' ' || t.Data[0] == '\n') {
  140. t.Data = t.Data[1:]
  141. }
  142. // whitespace removal; trim right
  143. omitSpace = false
  144. if len(t.Data) == 0 {
  145. omitSpace = true
  146. } else if t.Data[len(t.Data)-1] == ' ' || t.Data[len(t.Data)-1] == '\n' {
  147. omitSpace = true
  148. i := 0
  149. for {
  150. next := tb.Peek(i)
  151. // trim if EOF, text token with leading whitespace or block token
  152. if next.TokenType == html.ErrorToken {
  153. t.Data = t.Data[:len(t.Data)-1]
  154. omitSpace = false
  155. break
  156. } else if next.TokenType == html.TextToken {
  157. // this only happens when a comment, doctype or phrasing end tag (only for !o.KeepWhitespace) was in between
  158. // remove if the text token starts with a whitespace
  159. if len(next.Data) > 0 && parse.IsWhitespace(next.Data[0]) {
  160. t.Data = t.Data[:len(t.Data)-1]
  161. omitSpace = false
  162. }
  163. break
  164. } else if next.TokenType == html.StartTagToken || next.TokenType == html.EndTagToken {
  165. if o.KeepWhitespace {
  166. break
  167. }
  168. // remove when followed up by a block tag
  169. if next.Traits&nonPhrasingTag != 0 {
  170. t.Data = t.Data[:len(t.Data)-1]
  171. omitSpace = false
  172. break
  173. } else if next.TokenType == html.StartTagToken {
  174. break
  175. }
  176. }
  177. i++
  178. }
  179. }
  180. if _, err := w.Write(t.Data); err != nil {
  181. return err
  182. }
  183. }
  184. case html.StartTagToken, html.EndTagToken:
  185. rawTagHash = 0
  186. hasAttributes := false
  187. if t.TokenType == html.StartTagToken {
  188. if next := tb.Peek(0); next.TokenType == html.AttributeToken {
  189. hasAttributes = true
  190. }
  191. if t.Traits&rawTag != 0 {
  192. // ignore empty script and style tags
  193. if !hasAttributes && (t.Hash == html.Script || t.Hash == html.Style) {
  194. if next := tb.Peek(1); next.TokenType == html.EndTagToken {
  195. tb.Shift()
  196. tb.Shift()
  197. break
  198. }
  199. }
  200. rawTagHash = t.Hash
  201. rawTagMediatype = nil
  202. }
  203. } else if t.Hash == html.Template {
  204. omitSpace = true // EndTagToken
  205. }
  206. if t.Hash == html.Pre {
  207. inPre = t.TokenType == html.StartTagToken
  208. }
  209. // remove superfluous tags, except for html, head and body tags when KeepDocumentTags is set
  210. if !hasAttributes && (!o.KeepDocumentTags && (t.Hash == html.Html || t.Hash == html.Head || t.Hash == html.Body) || t.Hash == html.Colgroup) {
  211. break
  212. } else if t.TokenType == html.EndTagToken {
  213. if !o.KeepEndTags {
  214. if t.Hash == html.Thead || t.Hash == html.Tbody || t.Hash == html.Tfoot || t.Hash == html.Tr || t.Hash == html.Th || t.Hash == html.Td ||
  215. t.Hash == html.Optgroup || t.Hash == html.Option || t.Hash == html.Dd || t.Hash == html.Dt ||
  216. t.Hash == html.Li || t.Hash == html.Rb || t.Hash == html.Rt || t.Hash == html.Rtc || t.Hash == html.Rp {
  217. break
  218. } else if t.Hash == html.P {
  219. i := 0
  220. for {
  221. next := tb.Peek(i)
  222. i++
  223. // continue if text token is empty or whitespace
  224. if next.TokenType == html.TextToken && parse.IsAllWhitespace(next.Data) {
  225. continue
  226. }
  227. if next.TokenType == html.ErrorToken || next.TokenType == html.EndTagToken && next.Traits&keepPTag == 0 || next.TokenType == html.StartTagToken && next.Traits&omitPTag != 0 {
  228. break SWITCH // omit p end tag
  229. }
  230. break
  231. }
  232. }
  233. }
  234. if o.KeepWhitespace || t.Traits&objectTag != 0 {
  235. omitSpace = false
  236. } else if t.Traits&nonPhrasingTag != 0 {
  237. omitSpace = true // omit spaces after block elements
  238. }
  239. if len(t.Data) > 3+len(t.Text) {
  240. t.Data[2+len(t.Text)] = '>'
  241. t.Data = t.Data[:3+len(t.Text)]
  242. }
  243. if _, err := w.Write(t.Data); err != nil {
  244. return err
  245. }
  246. break
  247. }
  248. if o.KeepWhitespace || t.Traits&objectTag != 0 {
  249. omitSpace = false
  250. } else if t.Traits&nonPhrasingTag != 0 {
  251. omitSpace = true // omit spaces after block elements
  252. }
  253. if _, err := w.Write(t.Data); err != nil {
  254. return err
  255. }
  256. if hasAttributes {
  257. if t.Hash == html.Meta {
  258. attrs := tb.Attributes(html.Content, html.Http_Equiv, html.Charset, html.Name)
  259. if content := attrs[0]; content != nil {
  260. if httpEquiv := attrs[1]; httpEquiv != nil {
  261. if charset := attrs[2]; charset == nil && parse.EqualFold(httpEquiv.AttrVal, []byte("content-type")) {
  262. content.AttrVal = minify.Mediatype(content.AttrVal)
  263. if bytes.Equal(content.AttrVal, []byte("text/html;charset=utf-8")) {
  264. httpEquiv.Text = nil
  265. content.Text = []byte("charset")
  266. content.Hash = html.Charset
  267. content.AttrVal = []byte("utf-8")
  268. }
  269. } else if parse.EqualFold(httpEquiv.AttrVal, []byte("content-style-type")) {
  270. content.AttrVal = minify.Mediatype(content.AttrVal)
  271. defaultStyleType, defaultStyleParams = parse.Mediatype(content.AttrVal)
  272. if defaultStyleParams != nil {
  273. defaultInlineStyleParams = defaultStyleParams
  274. defaultInlineStyleParams["inline"] = "1"
  275. } else {
  276. defaultInlineStyleParams = map[string]string{"inline": "1"}
  277. }
  278. } else if parse.EqualFold(httpEquiv.AttrVal, []byte("content-script-type")) {
  279. content.AttrVal = minify.Mediatype(content.AttrVal)
  280. defaultScriptType, defaultScriptParams = parse.Mediatype(content.AttrVal)
  281. }
  282. }
  283. if name := attrs[3]; name != nil {
  284. if parse.EqualFold(name.AttrVal, []byte("keywords")) {
  285. content.AttrVal = bytes.Replace(content.AttrVal, []byte(", "), []byte(","), -1)
  286. } else if parse.EqualFold(name.AttrVal, []byte("viewport")) {
  287. content.AttrVal = bytes.Replace(content.AttrVal, []byte(" "), []byte(""), -1)
  288. for i := 0; i < len(content.AttrVal); i++ {
  289. if content.AttrVal[i] == '=' && i+2 < len(content.AttrVal) {
  290. i++
  291. if n := parse.Number(content.AttrVal[i:]); n > 0 {
  292. minNum := minify.Number(content.AttrVal[i:i+n], -1)
  293. if len(minNum) < n {
  294. copy(content.AttrVal[i:i+len(minNum)], minNum)
  295. copy(content.AttrVal[i+len(minNum):], content.AttrVal[i+n:])
  296. content.AttrVal = content.AttrVal[:len(content.AttrVal)+len(minNum)-n]
  297. }
  298. i += len(minNum)
  299. }
  300. i-- // mitigate for-loop increase
  301. }
  302. }
  303. }
  304. }
  305. }
  306. } else if t.Hash == html.Script {
  307. attrs := tb.Attributes(html.Src, html.Charset)
  308. if attrs[0] != nil && attrs[1] != nil {
  309. attrs[1].Text = nil
  310. }
  311. }
  312. // write attributes
  313. htmlEqualIdName := false
  314. for {
  315. attr := *tb.Shift()
  316. if attr.TokenType != html.AttributeToken {
  317. break
  318. } else if attr.Text == nil {
  319. continue // removed attribute
  320. }
  321. if t.Hash == html.A && (attr.Hash == html.Id || attr.Hash == html.Name) {
  322. if attr.Hash == html.Id {
  323. if name := tb.Attributes(html.Name)[0]; name != nil && bytes.Equal(attr.AttrVal, name.AttrVal) {
  324. htmlEqualIdName = true
  325. }
  326. } else if htmlEqualIdName {
  327. continue
  328. } else if id := tb.Attributes(html.Id)[0]; id != nil && bytes.Equal(id.AttrVal, attr.AttrVal) {
  329. continue
  330. }
  331. }
  332. val := attr.AttrVal
  333. if len(val) == 0 && (attr.Hash == html.Class ||
  334. attr.Hash == html.Dir ||
  335. attr.Hash == html.Id ||
  336. attr.Hash == html.Lang ||
  337. attr.Hash == html.Name ||
  338. attr.Hash == html.Title ||
  339. attr.Hash == html.Action && t.Hash == html.Form ||
  340. attr.Hash == html.Value && t.Hash == html.Input) {
  341. continue // omit empty attribute values
  342. }
  343. if attr.Traits&caselessAttr != 0 {
  344. val = parse.ToLower(val)
  345. if attr.Hash == html.Enctype || attr.Hash == html.Codetype || attr.Hash == html.Accept || attr.Hash == html.Type && (t.Hash == html.A || t.Hash == html.Link || t.Hash == html.Object || t.Hash == html.Param || t.Hash == html.Script || t.Hash == html.Style || t.Hash == html.Source) {
  346. val = minify.Mediatype(val)
  347. }
  348. }
  349. if rawTagHash != 0 && attr.Hash == html.Type {
  350. rawTagMediatype = parse.Copy(val)
  351. }
  352. // default attribute values can be omitted
  353. if !o.KeepDefaultAttrVals && (attr.Hash == html.Type && (t.Hash == html.Script && bytes.Equal(val, []byte("text/javascript")) ||
  354. t.Hash == html.Style && bytes.Equal(val, []byte("text/css")) ||
  355. t.Hash == html.Link && bytes.Equal(val, []byte("text/css")) ||
  356. t.Hash == html.Input && bytes.Equal(val, []byte("text")) ||
  357. t.Hash == html.Button && bytes.Equal(val, []byte("submit"))) ||
  358. attr.Hash == html.Language && t.Hash == html.Script ||
  359. attr.Hash == html.Method && bytes.Equal(val, []byte("get")) ||
  360. attr.Hash == html.Enctype && bytes.Equal(val, []byte("application/x-www-form-urlencoded")) ||
  361. attr.Hash == html.Colspan && bytes.Equal(val, []byte("1")) ||
  362. attr.Hash == html.Rowspan && bytes.Equal(val, []byte("1")) ||
  363. attr.Hash == html.Shape && bytes.Equal(val, []byte("rect")) ||
  364. attr.Hash == html.Span && bytes.Equal(val, []byte("1")) ||
  365. attr.Hash == html.Clear && bytes.Equal(val, []byte("none")) ||
  366. attr.Hash == html.Frameborder && bytes.Equal(val, []byte("1")) ||
  367. attr.Hash == html.Scrolling && bytes.Equal(val, []byte("auto")) ||
  368. attr.Hash == html.Valuetype && bytes.Equal(val, []byte("data")) ||
  369. attr.Hash == html.Media && t.Hash == html.Style && bytes.Equal(val, []byte("all"))) {
  370. continue
  371. }
  372. // CSS and JS minifiers for attribute inline code
  373. if attr.Hash == html.Style {
  374. attrMinifyBuffer.Reset()
  375. if err := m.MinifyMimetype(defaultStyleType, attrMinifyBuffer, buffer.NewReader(val), defaultInlineStyleParams); err == nil {
  376. val = attrMinifyBuffer.Bytes()
  377. } else if err != minify.ErrNotExist {
  378. return err
  379. }
  380. if len(val) == 0 {
  381. continue
  382. }
  383. } else if len(attr.Text) > 2 && attr.Text[0] == 'o' && attr.Text[1] == 'n' {
  384. if len(val) >= 11 && parse.EqualFold(val[:11], jsSchemeBytes) {
  385. val = val[11:]
  386. }
  387. attrMinifyBuffer.Reset()
  388. if err := m.MinifyMimetype(defaultScriptType, attrMinifyBuffer, buffer.NewReader(val), defaultScriptParams); err == nil {
  389. val = attrMinifyBuffer.Bytes()
  390. } else if err != minify.ErrNotExist {
  391. return err
  392. }
  393. if len(val) == 0 {
  394. continue
  395. }
  396. } else if len(val) > 5 && attr.Traits&urlAttr != 0 { // anchors are already handled
  397. if parse.EqualFold(val[:4], httpBytes) {
  398. if val[4] == ':' {
  399. if m.URL != nil && m.URL.Scheme == "http" {
  400. val = val[5:]
  401. } else {
  402. parse.ToLower(val[:4])
  403. }
  404. } else if (val[4] == 's' || val[4] == 'S') && val[5] == ':' {
  405. if m.URL != nil && m.URL.Scheme == "https" {
  406. val = val[6:]
  407. } else {
  408. parse.ToLower(val[:5])
  409. }
  410. }
  411. } else if parse.EqualFold(val[:5], dataSchemeBytes) {
  412. val = minify.DataURI(m, val)
  413. }
  414. }
  415. if _, err := w.Write(spaceBytes); err != nil {
  416. return err
  417. }
  418. if _, err := w.Write(attr.Text); err != nil {
  419. return err
  420. }
  421. if len(val) > 0 && attr.Traits&booleanAttr == 0 {
  422. if _, err := w.Write(isBytes); err != nil {
  423. return err
  424. }
  425. // no quotes if possible, else prefer single or double depending on which occurs more often in value
  426. val = html.EscapeAttrVal(&attrByteBuffer, attr.AttrVal, val)
  427. if _, err := w.Write(val); err != nil {
  428. return err
  429. }
  430. }
  431. }
  432. }
  433. if _, err := w.Write(gtBytes); err != nil {
  434. return err
  435. }
  436. }
  437. }
  438. }