lex.go 7.6 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348
  1. // Package xml is an XML1.0 lexer following the specifications at http://www.w3.org/TR/xml/.
  2. package xml // import "github.com/tdewolff/parse/xml"
  3. import (
  4. "io"
  5. "strconv"
  6. "github.com/tdewolff/parse"
  7. "github.com/tdewolff/parse/buffer"
  8. )
  9. // TokenType determines the type of token, eg. a number or a semicolon.
  10. type TokenType uint32
  11. // TokenType values.
  12. const (
  13. ErrorToken TokenType = iota // extra token when errors occur
  14. CommentToken
  15. DOCTYPEToken
  16. CDATAToken
  17. StartTagToken
  18. StartTagPIToken
  19. StartTagCloseToken
  20. StartTagCloseVoidToken
  21. StartTagClosePIToken
  22. EndTagToken
  23. AttributeToken
  24. TextToken
  25. )
  26. // String returns the string representation of a TokenType.
  27. func (tt TokenType) String() string {
  28. switch tt {
  29. case ErrorToken:
  30. return "Error"
  31. case CommentToken:
  32. return "Comment"
  33. case DOCTYPEToken:
  34. return "DOCTYPE"
  35. case CDATAToken:
  36. return "CDATA"
  37. case StartTagToken:
  38. return "StartTag"
  39. case StartTagPIToken:
  40. return "StartTagPI"
  41. case StartTagCloseToken:
  42. return "StartTagClose"
  43. case StartTagCloseVoidToken:
  44. return "StartTagCloseVoid"
  45. case StartTagClosePIToken:
  46. return "StartTagClosePI"
  47. case EndTagToken:
  48. return "EndTag"
  49. case AttributeToken:
  50. return "Attribute"
  51. case TextToken:
  52. return "Text"
  53. }
  54. return "Invalid(" + strconv.Itoa(int(tt)) + ")"
  55. }
  56. ////////////////////////////////////////////////////////////////
// Lexer is the state for the lexer.
type Lexer struct {
	r       *buffer.Lexer
	err     error  // sticky lexer-level error (e.g. unexpected null character), reported by Err
	inTag   bool   // true while lexing attributes inside a start tag or processing instruction
	text    []byte // textual content of the last token, returned by Text
	attrVal []byte // raw value of the last AttributeToken (quotes included when present), or nil
}
  65. // NewLexer returns a new Lexer for a given io.Reader.
  66. func NewLexer(r io.Reader) *Lexer {
  67. return &Lexer{
  68. r: buffer.NewLexer(r),
  69. }
  70. }
  71. // Err returns the error encountered during lexing, this is often io.EOF but also other errors can be returned.
  72. func (l *Lexer) Err() error {
  73. if l.err != nil {
  74. return l.err
  75. }
  76. return l.r.Err()
  77. }
// Restore restores the NULL byte at the end of the buffer.
// It simply delegates to the underlying buffer.Lexer.
func (l *Lexer) Restore() {
	l.r.Restore()
}
// Next returns the next Token. It returns ErrorToken when an error was encountered. Using Err() one can retrieve the error message.
func (l *Lexer) Next() (TokenType, []byte) {
	l.text = nil
	var c byte
	if l.inTag {
		// Inside a start tag or PI: lex attributes until the tag closer.
		l.attrVal = nil
		for { // before attribute name state: skip whitespace
			if c = l.r.Peek(0); c == ' ' || c == '\t' || c == '\n' || c == '\r' {
				l.r.Move(1)
				continue
			}
			break
		}
		if c == 0 {
			// NUL is either a real reader error or an unexpected null byte in
			// the input; only synthesize a lexer error for the latter.
			if l.r.Err() == nil {
				l.err = parse.NewErrorLexer("unexpected null character", l.r)
			}
			return ErrorToken, nil
		} else if c != '>' && (c != '/' && c != '?' || l.r.Peek(1) != '>') {
			// Not at a tag closer ('>', "/>" or "?>"): must be an attribute.
			return AttributeToken, l.shiftAttribute()
		}
		// At a tag closer: emit the matching close token and leave tag state.
		start := l.r.Pos()
		l.inTag = false
		if c == '/' {
			l.r.Move(2) // consume "/>"
			l.text = l.r.Lexeme()[start:]
			return StartTagCloseVoidToken, l.r.Shift()
		} else if c == '?' {
			l.r.Move(2) // consume "?>"
			l.text = l.r.Lexeme()[start:]
			return StartTagClosePIToken, l.r.Shift()
		} else {
			l.r.Move(1) // consume ">"
			l.text = l.r.Lexeme()[start:]
			return StartTagCloseToken, l.r.Shift()
		}
	}
	for {
		c = l.r.Peek(0)
		if c == '<' {
			if l.r.Pos() > 0 {
				// Flush text accumulated before this tag first.
				return TextToken, l.r.Shift()
			}
			c = l.r.Peek(1)
			if c == '/' {
				l.r.Move(2) // consume "</"
				return EndTagToken, l.shiftEndTag()
			} else if c == '!' {
				l.r.Move(2) // consume "<!"
				if l.at('-', '-') {
					l.r.Move(2) // consume "--" of "<!--"
					return CommentToken, l.shiftCommentText()
				} else if l.at('[', 'C', 'D', 'A', 'T', 'A', '[') {
					l.r.Move(7) // consume "[CDATA[" of "<![CDATA["
					return CDATAToken, l.shiftCDATAText()
				} else if l.at('D', 'O', 'C', 'T', 'Y', 'P', 'E') {
					l.r.Move(7) // consume "DOCTYPE" of "<!DOCTYPE"
					return DOCTYPEToken, l.shiftDOCTYPEText()
				}
				// Unrecognized "<!": rewind and fall through to start tag.
				l.r.Move(-2)
			} else if c == '?' {
				l.r.Move(2) // consume "<?": processing instruction
				l.inTag = true
				return StartTagPIToken, l.shiftStartTag()
			}
			l.r.Move(1) // consume "<"
			l.inTag = true
			return StartTagToken, l.shiftStartTag()
		} else if c == 0 {
			if l.r.Pos() > 0 {
				// Flush trailing text before reporting EOF/error.
				return TextToken, l.r.Shift()
			}
			if l.r.Err() == nil {
				l.err = parse.NewErrorLexer("unexpected null character", l.r)
			}
			return ErrorToken, nil
		}
		l.r.Move(1)
	}
}
// Text returns the textual representation of a token. This excludes delimiters and additional leading/trailing characters.
// NOTE(review): the slice appears to alias the lexer's internal buffer (it is
// set from l.r.Lexeme()), so it is presumably only valid until the next call
// to Next — confirm against buffer.Lexer's documentation.
func (l *Lexer) Text() []byte {
	return l.text
}
// AttrVal returns the attribute value when an AttributeToken was returned from Next.
// The value is raw: surrounding quotes are included when the value was quoted.
func (l *Lexer) AttrVal() []byte {
	return l.attrVal
}
  170. ////////////////////////////////////////////////////////////////
  171. // The following functions follow the specifications at http://www.w3.org/html/wg/drafts/html/master/syntax.html
// shiftDOCTYPEText consumes the remainder of a DOCTYPE declaration up to and
// including its terminating '>' and returns the shifted bytes. A '>' inside a
// double-quoted string or inside the internal-subset brackets [...] does not
// terminate the declaration. l.text is set to the contents after the 9-byte
// "<!DOCTYPE" prefix.
// NOTE(review): only '"' toggles the in-string state; a '>' inside a
// single-quoted literal would terminate the declaration — confirm intended.
func (l *Lexer) shiftDOCTYPEText() []byte {
	inString := false
	inBrackets := false
	for {
		c := l.r.Peek(0)
		if c == '"' {
			inString = !inString
		} else if (c == '[' || c == ']') && !inString {
			inBrackets = (c == '[')
		} else if c == '>' && !inString && !inBrackets {
			l.text = l.r.Lexeme()[9:] // skip "<!DOCTYPE"
			l.r.Move(1)               // consume '>'
			return l.r.Shift()
		} else if c == 0 {
			// EOF/NUL: return what was consumed; Err() reports the cause.
			l.text = l.r.Lexeme()[9:]
			return l.r.Shift()
		}
		l.r.Move(1)
	}
}
  192. func (l *Lexer) shiftCDATAText() []byte {
  193. for {
  194. c := l.r.Peek(0)
  195. if c == ']' && l.r.Peek(1) == ']' && l.r.Peek(2) == '>' {
  196. l.text = l.r.Lexeme()[9:]
  197. l.r.Move(3)
  198. return l.r.Shift()
  199. } else if c == 0 {
  200. l.text = l.r.Lexeme()[9:]
  201. return l.r.Shift()
  202. }
  203. l.r.Move(1)
  204. }
  205. }
  206. func (l *Lexer) shiftCommentText() []byte {
  207. for {
  208. c := l.r.Peek(0)
  209. if c == '-' && l.r.Peek(1) == '-' && l.r.Peek(2) == '>' {
  210. l.text = l.r.Lexeme()[4:]
  211. l.r.Move(3)
  212. return l.r.Shift()
  213. } else if c == 0 {
  214. return l.r.Shift()
  215. }
  216. l.r.Move(1)
  217. }
  218. }
  219. func (l *Lexer) shiftStartTag() []byte {
  220. nameStart := l.r.Pos()
  221. for {
  222. if c := l.r.Peek(0); c == ' ' || c == '>' || (c == '/' || c == '?') && l.r.Peek(1) == '>' || c == '\t' || c == '\n' || c == '\r' || c == 0 {
  223. break
  224. }
  225. l.r.Move(1)
  226. }
  227. l.text = l.r.Lexeme()[nameStart:]
  228. return l.r.Shift()
  229. }
// shiftAttribute lexes one attribute: a name, optionally followed by '=' and
// a value. It returns the shifted bytes for the whole attribute. l.text is
// set to the attribute name, and l.attrVal to the raw value (quotes included
// when present) or nil when there is no value.
func (l *Lexer) shiftAttribute() []byte {
	nameStart := l.r.Pos()
	var c byte
	for { // attribute name state
		if c = l.r.Peek(0); c == ' ' || c == '=' || c == '>' || (c == '/' || c == '?') && l.r.Peek(1) == '>' || c == '\t' || c == '\n' || c == '\r' || c == 0 {
			break
		}
		l.r.Move(1)
	}
	nameEnd := l.r.Pos()
	for { // after attribute name state: skip whitespace before a possible '='
		if c = l.r.Peek(0); c == ' ' || c == '\t' || c == '\n' || c == '\r' {
			l.r.Move(1)
			continue
		}
		break
	}
	if c == '=' {
		l.r.Move(1)
		for { // before attribute value state: skip whitespace after '='
			if c = l.r.Peek(0); c == ' ' || c == '\t' || c == '\n' || c == '\r' {
				l.r.Move(1)
				continue
			}
			break
		}
		attrPos := l.r.Pos()
		delim := c
		if delim == '"' || delim == '\'' { // attribute value single- and double-quoted state
			l.r.Move(1)
			for {
				c = l.r.Peek(0)
				if c == delim {
					l.r.Move(1)
					break
				} else if c == 0 {
					// EOF inside a quoted value: stop without a closing quote.
					break
				}
				l.r.Move(1)
				if c == '\t' || c == '\n' || c == '\r' {
					// Normalize whitespace inside quoted values to a single
					// space, mutating the buffer in place.
					l.r.Lexeme()[l.r.Pos()-1] = ' '
				}
			}
		} else { // attribute value unquoted state
			for {
				if c = l.r.Peek(0); c == ' ' || c == '>' || (c == '/' || c == '?') && l.r.Peek(1) == '>' || c == '\t' || c == '\n' || c == '\r' || c == 0 {
					break
				}
				l.r.Move(1)
			}
		}
		l.attrVal = l.r.Lexeme()[attrPos:]
	} else {
		// No '=': valueless attribute. Undo the whitespace skip so the next
		// token starts right after the name.
		l.r.Rewind(nameEnd)
		l.attrVal = nil
	}
	l.text = l.r.Lexeme()[nameStart:nameEnd]
	return l.r.Shift()
}
  289. func (l *Lexer) shiftEndTag() []byte {
  290. for {
  291. c := l.r.Peek(0)
  292. if c == '>' {
  293. l.text = l.r.Lexeme()[2:]
  294. l.r.Move(1)
  295. break
  296. } else if c == 0 {
  297. l.text = l.r.Lexeme()[2:]
  298. break
  299. }
  300. l.r.Move(1)
  301. }
  302. end := len(l.text)
  303. for end > 0 {
  304. if c := l.text[end-1]; c == ' ' || c == '\t' || c == '\n' || c == '\r' {
  305. end--
  306. continue
  307. }
  308. break
  309. }
  310. l.text = l.text[:end]
  311. return l.r.Shift()
  312. }
  313. ////////////////////////////////////////////////////////////////
  314. func (l *Lexer) at(b ...byte) bool {
  315. for i, c := range b {
  316. if l.r.Peek(i) != c {
  317. return false
  318. }
  319. }
  320. return true
  321. }