lex_test.go 6.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195
  1. package xml // import "github.com/tdewolff/parse/xml"
  2. import (
  3. "bytes"
  4. "fmt"
  5. "io"
  6. "testing"
  7. "github.com/tdewolff/parse"
  8. "github.com/tdewolff/test"
  9. )
  // TTs is a short alias-like name for a slice of TokenType, used to keep the
  // expected token sequences in the test tables below compact.
  10. type TTs []TokenType
  11. func TestTokens(t *testing.T) {
  12. var tokenTests = []struct {
  13. xml string
  14. expected []TokenType
  15. }{
  16. {"", TTs{}},
  17. {"<!-- comment -->", TTs{CommentToken}},
  18. {"<!-- comment \n multi \r line -->", TTs{CommentToken}},
  19. {"<foo/>", TTs{StartTagToken, StartTagCloseVoidToken}},
  20. {"<foo \t\r\n/>", TTs{StartTagToken, StartTagCloseVoidToken}},
  21. {"<foo:bar.qux-norf/>", TTs{StartTagToken, StartTagCloseVoidToken}},
  22. {"<foo></foo>", TTs{StartTagToken, StartTagCloseToken, EndTagToken}},
  23. {"<foo>text</foo>", TTs{StartTagToken, StartTagCloseToken, TextToken, EndTagToken}},
  24. {"<foo/> text", TTs{StartTagToken, StartTagCloseVoidToken, TextToken}},
  25. {"<a> <b> <c>text</c> </b> </a>", TTs{StartTagToken, StartTagCloseToken, TextToken, StartTagToken, StartTagCloseToken, TextToken, StartTagToken, StartTagCloseToken, TextToken, EndTagToken, TextToken, EndTagToken, TextToken, EndTagToken}},
  26. {"<foo a='a' b=\"b\" c=c/>", TTs{StartTagToken, AttributeToken, AttributeToken, AttributeToken, StartTagCloseVoidToken}},
  27. {"<foo a=\"\"/>", TTs{StartTagToken, AttributeToken, StartTagCloseVoidToken}},
  28. {"<foo a-b=\"\"/>", TTs{StartTagToken, AttributeToken, StartTagCloseVoidToken}},
  29. {"<foo \nchecked \r\n value\r=\t'=/>\"' />", TTs{StartTagToken, AttributeToken, AttributeToken, StartTagCloseVoidToken}},
  30. {"<?xml?>", TTs{StartTagPIToken, StartTagClosePIToken}},
  31. {"<?xml a=\"a\" ?>", TTs{StartTagPIToken, AttributeToken, StartTagClosePIToken}},
  32. {"<?xml a=a?>", TTs{StartTagPIToken, AttributeToken, StartTagClosePIToken}},
  33. {"<![CDATA[ test ]]>", TTs{CDATAToken}},
  34. {"<!DOCTYPE>", TTs{DOCTYPEToken}},
  35. {"<!DOCTYPE note SYSTEM \"Note.dtd\">", TTs{DOCTYPEToken}},
  36. {`<!DOCTYPE note [<!ENTITY nbsp "&#xA0;"><!ENTITY writer "Writer: Donald Duck."><!ENTITY copyright "Copyright:]> W3Schools.">]>`, TTs{DOCTYPEToken}},
  37. {"<!foo>", TTs{StartTagToken, StartTagCloseToken}},
  38. // early endings
  39. {"<!-- comment", TTs{CommentToken}},
  40. {"<foo", TTs{StartTagToken}},
  41. {"</foo", TTs{EndTagToken}},
  42. {"<foo x", TTs{StartTagToken, AttributeToken}},
  43. {"<foo x=", TTs{StartTagToken, AttributeToken}},
  44. {"<foo x='", TTs{StartTagToken, AttributeToken}},
  45. {"<foo x=''", TTs{StartTagToken, AttributeToken}},
  46. {"<?xml", TTs{StartTagPIToken}},
  47. {"<![CDATA[ test", TTs{CDATAToken}},
  48. {"<!DOCTYPE note SYSTEM", TTs{DOCTYPEToken}},
  49. // go fuzz
  50. {"</", TTs{EndTagToken}},
  51. {"</\n", TTs{EndTagToken}},
  52. }
  53. for _, tt := range tokenTests {
  54. t.Run(tt.xml, func(t *testing.T) {
  55. l := NewLexer(bytes.NewBufferString(tt.xml))
  56. i := 0
  57. for {
  58. token, _ := l.Next()
  59. if token == ErrorToken {
  60. test.T(t, l.Err(), io.EOF)
  61. test.T(t, i, len(tt.expected), "when error occurred we must be at the end")
  62. break
  63. }
  64. test.That(t, i < len(tt.expected), "index", i, "must not exceed expected token types size", len(tt.expected))
  65. if i < len(tt.expected) {
  66. test.T(t, token, tt.expected[i], "token types must match")
  67. }
  68. i++
  69. }
  70. })
  71. }
  72. test.T(t, TokenType(100).String(), "Invalid(100)")
  73. }
  74. func TestTags(t *testing.T) {
  75. var tagTests = []struct {
  76. xml string
  77. expected string
  78. }{
  79. {"<foo:bar.qux-norf/>", "foo:bar.qux-norf"},
  80. {"<?xml?>", "xml"},
  81. {"<foo?bar/qux>", "foo?bar/qux"},
  82. {"<!DOCTYPE note SYSTEM \"Note.dtd\">", " note SYSTEM \"Note.dtd\""},
  83. // early endings
  84. {"<foo ", "foo"},
  85. }
  86. for _, tt := range tagTests {
  87. t.Run(tt.xml, func(t *testing.T) {
  88. l := NewLexer(bytes.NewBufferString(tt.xml))
  89. for {
  90. token, _ := l.Next()
  91. if token == ErrorToken {
  92. test.T(t, l.Err(), io.EOF)
  93. test.Fail(t, "when error occurred we must be at the end")
  94. break
  95. } else if token == StartTagToken || token == StartTagPIToken || token == EndTagToken || token == DOCTYPEToken {
  96. test.String(t, string(l.Text()), tt.expected, "tags must match")
  97. break
  98. }
  99. }
  100. })
  101. }
  102. }
  103. func TestAttributes(t *testing.T) {
  104. var attributeTests = []struct {
  105. attr string
  106. expected []string
  107. }{
  108. {"<foo a=\"b\" />", []string{"a", "\"b\""}},
  109. {"<foo \nchecked \r\n value\r=\t'=/>\"' />", []string{"checked", "", "value", "'=/>\"'"}},
  110. {"<foo bar=\" a \n\t\r b \" />", []string{"bar", "\" a b \""}},
  111. {"<?xml a=b?>", []string{"a", "b"}},
  112. {"<foo /=? >", []string{"/", "?"}},
  113. // early endings
  114. {"<foo x", []string{"x", ""}},
  115. {"<foo x=", []string{"x", ""}},
  116. {"<foo x='", []string{"x", "'"}},
  117. }
  118. for _, tt := range attributeTests {
  119. t.Run(tt.attr, func(t *testing.T) {
  120. l := NewLexer(bytes.NewBufferString(tt.attr))
  121. i := 0
  122. for {
  123. token, _ := l.Next()
  124. if token == ErrorToken {
  125. test.T(t, l.Err(), io.EOF)
  126. test.T(t, i, len(tt.expected), "when error occurred we must be at the end")
  127. break
  128. } else if token == AttributeToken {
  129. test.That(t, i+1 < len(tt.expected), "index", i+1, "must not exceed expected attributes size", len(tt.expected))
  130. if i+1 < len(tt.expected) {
  131. test.String(t, string(l.Text()), tt.expected[i], "attribute keys must match")
  132. test.String(t, string(l.AttrVal()), tt.expected[i+1], "attribute keys must match")
  133. i += 2
  134. }
  135. }
  136. }
  137. })
  138. }
  139. }
  140. func TestErrors(t *testing.T) {
  141. var errorTests = []struct {
  142. xml string
  143. col int
  144. }{
  145. {"a\x00b", 2},
  146. {"<a\x00>", 3},
  147. }
  148. for _, tt := range errorTests {
  149. t.Run(tt.xml, func(t *testing.T) {
  150. l := NewLexer(bytes.NewBufferString(tt.xml))
  151. for {
  152. token, _ := l.Next()
  153. if token == ErrorToken {
  154. if tt.col == 0 {
  155. test.T(t, l.Err(), io.EOF)
  156. } else if perr, ok := l.Err().(*parse.Error); ok {
  157. _, col, _ := perr.Position()
  158. test.T(t, col, tt.col)
  159. } else {
  160. test.Fail(t, "bad error:", l.Err())
  161. }
  162. break
  163. }
  164. }
  165. })
  166. }
  167. }
  168. ////////////////////////////////////////////////////////////////
  169. func ExampleNewLexer() {
  170. l := NewLexer(bytes.NewBufferString("<span class='user'>John Doe</span>"))
  171. out := ""
  172. for {
  173. tt, data := l.Next()
  174. if tt == ErrorToken {
  175. break
  176. }
  177. out += string(data)
  178. }
  179. fmt.Println(out)
  180. // Output: <span class='user'>John Doe</span>
  181. }