lex_test.go 8.1 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283
  1. package html // import "github.com/tdewolff/parse/html"
  2. import (
  3. "bytes"
  4. "fmt"
  5. "io"
  6. "testing"
  7. "github.com/tdewolff/parse"
  8. "github.com/tdewolff/test"
  9. )
  10. type TTs []TokenType
  11. func TestTokens(t *testing.T) {
  12. var tokenTests = []struct {
  13. html string
  14. expected []TokenType
  15. }{
  16. {"<html></html>", TTs{StartTagToken, StartTagCloseToken, EndTagToken}},
  17. {"<img/>", TTs{StartTagToken, StartTagVoidToken}},
  18. {"<!-- comment -->", TTs{CommentToken}},
  19. {"<!-- comment --!>", TTs{CommentToken}},
  20. {"<p>text</p>", TTs{StartTagToken, StartTagCloseToken, TextToken, EndTagToken}},
  21. {"<input type='button'/>", TTs{StartTagToken, AttributeToken, StartTagVoidToken}},
  22. {"<input type='button' value=''/>", TTs{StartTagToken, AttributeToken, AttributeToken, StartTagVoidToken}},
  23. {"<input type='=/>' \r\n\t\f value=\"'\" name=x checked />", TTs{StartTagToken, AttributeToken, AttributeToken, AttributeToken, AttributeToken, StartTagVoidToken}},
  24. {"<!doctype>", TTs{DoctypeToken}},
  25. {"<!doctype html>", TTs{DoctypeToken}},
  26. {"<?bogus>", TTs{CommentToken}},
  27. {"</0bogus>", TTs{CommentToken}},
  28. {"<!bogus>", TTs{CommentToken}},
  29. {"< ", TTs{TextToken}},
  30. {"</", TTs{TextToken}},
  31. // raw tags
  32. {"<title><p></p></title>", TTs{StartTagToken, StartTagCloseToken, TextToken, EndTagToken}},
  33. {"<TITLE><p></p></TITLE>", TTs{StartTagToken, StartTagCloseToken, TextToken, EndTagToken}},
  34. {"<plaintext></plaintext>", TTs{StartTagToken, StartTagCloseToken, TextToken}},
  35. {"<script></script>", TTs{StartTagToken, StartTagCloseToken, EndTagToken}},
  36. {"<script>var x='</script>';</script>", TTs{StartTagToken, StartTagCloseToken, TextToken, EndTagToken, TextToken, EndTagToken}},
  37. {"<script><!--var x='</script>';--></script>", TTs{StartTagToken, StartTagCloseToken, TextToken, EndTagToken, TextToken, EndTagToken}},
  38. {"<script><!--var x='<script></script>';--></script>", TTs{StartTagToken, StartTagCloseToken, TextToken, EndTagToken}},
  39. {"<script><!--var x='<script>';--></script>", TTs{StartTagToken, StartTagCloseToken, TextToken, EndTagToken}},
  40. {"<![CDATA[ test ]]>", TTs{TextToken}},
  41. {"<svg>text</svg>", TTs{SvgToken}},
  42. {"<math>text</math>", TTs{MathToken}},
  43. {`<svg>text<x a="</svg>"></x></svg>`, TTs{SvgToken}},
  44. {"<a><svg>text</svg></a>", TTs{StartTagToken, StartTagCloseToken, SvgToken, EndTagToken}},
  45. // early endings
  46. {"<!-- comment", TTs{CommentToken}},
  47. {"<? bogus comment", TTs{CommentToken}},
  48. {"<foo", TTs{StartTagToken}},
  49. {"</foo", TTs{EndTagToken}},
  50. {"<foo x", TTs{StartTagToken, AttributeToken}},
  51. {"<foo x=", TTs{StartTagToken, AttributeToken}},
  52. {"<foo x='", TTs{StartTagToken, AttributeToken}},
  53. {"<foo x=''", TTs{StartTagToken, AttributeToken}},
  54. {"<!DOCTYPE note SYSTEM", TTs{DoctypeToken}},
  55. {"<![CDATA[ test", TTs{TextToken}},
  56. {"<script>", TTs{StartTagToken, StartTagCloseToken}},
  57. {"<script><!--", TTs{StartTagToken, StartTagCloseToken, TextToken}},
  58. {"<script><!--var x='<script></script>';-->", TTs{StartTagToken, StartTagCloseToken, TextToken}},
  59. // NULL
  60. {"foo\x00bar", TTs{TextToken}},
  61. {"<\x00foo>", TTs{TextToken}},
  62. {"<foo\x00>", TTs{StartTagToken, StartTagCloseToken}},
  63. {"</\x00bogus>", TTs{CommentToken}},
  64. {"</foo\x00>", TTs{EndTagToken}},
  65. {"<plaintext>\x00</plaintext>", TTs{StartTagToken, StartTagCloseToken, TextToken}},
  66. {"<script>\x00</script>", TTs{StartTagToken, StartTagCloseToken, TextToken, EndTagToken}},
  67. {"<!--\x00-->", TTs{CommentToken}},
  68. {"<![CDATA[\x00]]>", TTs{TextToken}},
  69. {"<!doctype\x00>", TTs{DoctypeToken}},
  70. {"<?bogus\x00>", TTs{CommentToken}},
  71. {"<?bogus\x00>", TTs{CommentToken}},
  72. // go-fuzz
  73. {"</>", TTs{TextToken}},
  74. }
  75. for _, tt := range tokenTests {
  76. t.Run(tt.html, func(t *testing.T) {
  77. l := NewLexer(bytes.NewBufferString(tt.html))
  78. i := 0
  79. for {
  80. token, _ := l.Next()
  81. if token == ErrorToken {
  82. test.T(t, l.Err(), io.EOF)
  83. test.T(t, i, len(tt.expected), "when error occurred we must be at the end")
  84. break
  85. }
  86. test.That(t, i < len(tt.expected), "index", i, "must not exceed expected token types size", len(tt.expected))
  87. if i < len(tt.expected) {
  88. test.T(t, token, tt.expected[i], "token types must match")
  89. }
  90. i++
  91. }
  92. })
  93. }
  94. test.T(t, TokenType(100).String(), "Invalid(100)")
  95. }
  96. func TestTags(t *testing.T) {
  97. var tagTests = []struct {
  98. html string
  99. expected string
  100. }{
  101. {"<foo:bar.qux-norf/>", "foo:bar.qux-norf"},
  102. {"<foo?bar/qux>", "foo?bar/qux"},
  103. {"<!DOCTYPE note SYSTEM \"Note.dtd\">", " note SYSTEM \"Note.dtd\""},
  104. {"</foo >", "foo"},
  105. // early endings
  106. {"<foo ", "foo"},
  107. }
  108. for _, tt := range tagTests {
  109. t.Run(tt.html, func(t *testing.T) {
  110. l := NewLexer(bytes.NewBufferString(tt.html))
  111. for {
  112. token, _ := l.Next()
  113. if token == ErrorToken {
  114. test.T(t, l.Err(), io.EOF)
  115. test.Fail(t, "when error occurred we must be at the end")
  116. break
  117. } else if token == StartTagToken || token == EndTagToken || token == DoctypeToken {
  118. test.String(t, string(l.Text()), tt.expected)
  119. break
  120. }
  121. }
  122. })
  123. }
  124. }
  125. func TestAttributes(t *testing.T) {
  126. var attributeTests = []struct {
  127. attr string
  128. expected []string
  129. }{
  130. {"<foo a=\"b\" />", []string{"a", "\"b\""}},
  131. {"<foo \nchecked \r\n value\r=\t'=/>\"' />", []string{"checked", "", "value", "'=/>\"'"}},
  132. {"<foo bar=\" a \n\t\r b \" />", []string{"bar", "\" a \n\t\r b \""}},
  133. {"<foo a/>", []string{"a", ""}},
  134. {"<foo /=/>", []string{"/", "/"}},
  135. // early endings
  136. {"<foo x", []string{"x", ""}},
  137. {"<foo x=", []string{"x", ""}},
  138. {"<foo x='", []string{"x", "'"}},
  139. // NULL
  140. {"<foo \x00>", []string{"\x00", ""}},
  141. {"<foo \x00=\x00>", []string{"\x00", "\x00"}},
  142. {"<foo \x00='\x00'>", []string{"\x00", "'\x00'"}},
  143. }
  144. for _, tt := range attributeTests {
  145. t.Run(tt.attr, func(t *testing.T) {
  146. l := NewLexer(bytes.NewBufferString(tt.attr))
  147. i := 0
  148. for {
  149. token, _ := l.Next()
  150. if token == ErrorToken {
  151. test.T(t, l.Err(), io.EOF)
  152. test.T(t, i, len(tt.expected), "when error occurred we must be at the end")
  153. break
  154. } else if token == AttributeToken {
  155. test.That(t, i+1 < len(tt.expected), "index", i+1, "must not exceed expected attributes size", len(tt.expected))
  156. if i+1 < len(tt.expected) {
  157. test.String(t, string(l.Text()), tt.expected[i], "attribute keys must match")
  158. test.String(t, string(l.AttrVal()), tt.expected[i+1], "attribute keys must match")
  159. i += 2
  160. }
  161. }
  162. }
  163. })
  164. }
  165. }
  166. func TestErrors(t *testing.T) {
  167. var errorTests = []struct {
  168. html string
  169. col int
  170. }{
  171. {"<svg>\x00</svg>", 6},
  172. {"<svg></svg\x00>", 11},
  173. }
  174. for _, tt := range errorTests {
  175. t.Run(tt.html, func(t *testing.T) {
  176. l := NewLexer(bytes.NewBufferString(tt.html))
  177. for {
  178. token, _ := l.Next()
  179. if token == ErrorToken {
  180. if tt.col == 0 {
  181. test.T(t, l.Err(), io.EOF)
  182. } else if perr, ok := l.Err().(*parse.Error); ok {
  183. _, col, _ := perr.Position()
  184. test.T(t, col, tt.col)
  185. } else {
  186. test.Fail(t, "bad error:", l.Err())
  187. }
  188. break
  189. }
  190. }
  191. })
  192. }
  193. }
  194. ////////////////////////////////////////////////////////////////
  195. var J int
  196. var ss = [][]byte{
  197. []byte(" style"),
  198. []byte("style"),
  199. []byte(" \r\n\tstyle"),
  200. []byte(" style"),
  201. []byte(" x"),
  202. []byte("x"),
  203. }
  204. func BenchmarkWhitespace1(b *testing.B) {
  205. for i := 0; i < b.N; i++ {
  206. for _, s := range ss {
  207. j := 0
  208. for {
  209. if c := s[j]; c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\f' {
  210. j++
  211. } else {
  212. break
  213. }
  214. }
  215. J += j
  216. }
  217. }
  218. }
  219. func BenchmarkWhitespace2(b *testing.B) {
  220. for i := 0; i < b.N; i++ {
  221. for _, s := range ss {
  222. j := 0
  223. for {
  224. if c := s[j]; c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\f' {
  225. j++
  226. continue
  227. }
  228. break
  229. }
  230. J += j
  231. }
  232. }
  233. }
  234. func BenchmarkWhitespace3(b *testing.B) {
  235. for i := 0; i < b.N; i++ {
  236. for _, s := range ss {
  237. j := 0
  238. for {
  239. if c := s[j]; c != ' ' && c != '\t' && c != '\n' && c != '\r' && c != '\f' {
  240. break
  241. }
  242. j++
  243. }
  244. J += j
  245. }
  246. }
  247. }
  248. ////////////////////////////////////////////////////////////////
  249. func ExampleNewLexer() {
  250. l := NewLexer(bytes.NewBufferString("<span class='user'>John Doe</span>"))
  251. out := ""
  252. for {
  253. tt, data := l.Next()
  254. if tt == ErrorToken {
  255. break
  256. }
  257. out += string(data)
  258. }
  259. fmt.Println(out)
  260. // Output: <span class='user'>John Doe</span>
  261. }