// lex.go
  1. // Package js is an ECMAScript5.1 lexer following the specifications at http://www.ecma-international.org/ecma-262/5.1/.
  2. package js // import "github.com/tdewolff/parse/js"
  3. import (
  4. "io"
  5. "strconv"
  6. "unicode"
  7. "github.com/tdewolff/parse/buffer"
  8. )
// identifierStart and identifierContinue list the Unicode range tables used to
// classify non-ASCII runes that may start, respectively continue, an identifier.
var identifierStart = []*unicode.RangeTable{unicode.Lu, unicode.Ll, unicode.Lt, unicode.Lm, unicode.Lo, unicode.Nl, unicode.Other_ID_Start}
var identifierContinue = []*unicode.RangeTable{unicode.Lu, unicode.Ll, unicode.Lt, unicode.Lm, unicode.Lo, unicode.Nl, unicode.Mn, unicode.Mc, unicode.Nd, unicode.Pc, unicode.Other_ID_Continue}
////////////////////////////////////////////////////////////////

// TokenType determines the type of token, eg. a number or a semicolon.
type TokenType uint32

// TokenType values.
const (
	ErrorToken TokenType = iota // extra token when errors occur
	UnknownToken                // extra token when no token can be matched
	WhitespaceToken             // space \t \v \f
	LineTerminatorToken         // \r \n \r\n
	CommentToken                // // and /* */ comments, and HTML-style <!-- and --> comments
	IdentifierToken             // identifiers and keywords
	PunctuatorToken /* { } ( ) [ ] . ; , < > <= >= == != === !== + - * % ++ -- << >>
	>>> & | ^ ! ~ && || ? : = += -= *= %= <<= >>= >>>= &= |= ^= / /= >= */
	NumericToken
	StringToken
	RegexpToken
	TemplateToken
)
// TokenState determines a state in which next token should be read
type TokenState uint32

// TokenState values
const (
	ExprState       TokenState = iota // an expression may follow: / starts a regular expression
	StmtParensState                   // just saw if/while/for/with, so ( opens statement parens
	SubscriptState                    // just saw a value: / is a division punctuator, not a regexp
	PropNameState                     // just saw '.', so the next identifier is a property name
)
// ParsingContext determines the context in which following token should be parsed.
// This affects parsing regular expressions and template literals.
type ParsingContext uint32

// ParsingContext values
const (
	GlobalContext     ParsingContext = iota // top level, also returned when the context stack is empty
	StmtParensContext                       // inside statement parens, e.g. if (...)
	ExprParensContext                       // inside expression parens
	BracesContext                           // inside { }
	TemplateContext                         // inside a template-literal substitution ${ }
)
  49. // String returns the string representation of a TokenType.
  50. func (tt TokenType) String() string {
  51. switch tt {
  52. case ErrorToken:
  53. return "Error"
  54. case UnknownToken:
  55. return "Unknown"
  56. case WhitespaceToken:
  57. return "Whitespace"
  58. case LineTerminatorToken:
  59. return "LineTerminator"
  60. case CommentToken:
  61. return "Comment"
  62. case IdentifierToken:
  63. return "Identifier"
  64. case PunctuatorToken:
  65. return "Punctuator"
  66. case NumericToken:
  67. return "Numeric"
  68. case StringToken:
  69. return "String"
  70. case RegexpToken:
  71. return "Regexp"
  72. case TemplateToken:
  73. return "Template"
  74. }
  75. return "Invalid(" + strconv.Itoa(int(tt)) + ")"
  76. }
////////////////////////////////////////////////////////////////

// Lexer is the state for the lexer.
type Lexer struct {
	r         *buffer.Lexer    // input buffer from which runes are peeked and shifted
	stack     []ParsingContext // contexts opened by '(' '{' and template '${', closed by ')' '}'
	state     TokenState       // determines how the next token must be interpreted
	emptyLine bool             // true while the current line holds no meaningful token yet (enables the --> HTML comment)
}
  85. // NewLexer returns a new Lexer for a given io.Reader.
  86. func NewLexer(r io.Reader) *Lexer {
  87. return &Lexer{
  88. r: buffer.NewLexer(r),
  89. stack: make([]ParsingContext, 0, 16),
  90. state: ExprState,
  91. emptyLine: true,
  92. }
  93. }
  94. func (l *Lexer) enterContext(context ParsingContext) {
  95. l.stack = append(l.stack, context)
  96. }
  97. func (l *Lexer) leaveContext() ParsingContext {
  98. ctx := GlobalContext
  99. if last := len(l.stack) - 1; last >= 0 {
  100. ctx, l.stack = l.stack[last], l.stack[:last]
  101. }
  102. return ctx
  103. }
// Err returns the error encountered during lexing, this is often io.EOF but also other errors can be returned.
func (l *Lexer) Err() error {
	return l.r.Err()
}
// Restore restores the NULL byte at the end of the buffer.
func (l *Lexer) Restore() {
	l.r.Restore()
}
// Next returns the next Token. It returns ErrorToken when an error was encountered. Using Err() one can retrieve the error message.
func (l *Lexer) Next() (TokenType, []byte) {
	tt := UnknownToken
	c := l.r.Peek(0)
	switch c {
	case '(':
		// remember whether these parens belong to a statement header
		// (if/while/for/with) so the matching ')' restores the right state
		if l.state == StmtParensState {
			l.enterContext(StmtParensContext)
		} else {
			l.enterContext(ExprParensContext)
		}
		l.state = ExprState
		l.r.Move(1)
		tt = PunctuatorToken
	case ')':
		// after statement parens an expression (and thus a regexp) may follow;
		// after expression parens a subscript or division may follow
		if l.leaveContext() == StmtParensContext {
			l.state = ExprState
		} else {
			l.state = SubscriptState
		}
		l.r.Move(1)
		tt = PunctuatorToken
	case '{':
		l.enterContext(BracesContext)
		l.state = ExprState
		l.r.Move(1)
		tt = PunctuatorToken
	case '}':
		// a '}' closing a template substitution resumes the template literal
		if l.leaveContext() == TemplateContext && l.consumeTemplateToken() {
			tt = TemplateToken
		} else {
			// will work incorrectly for objects or functions divided by something,
			// but that's an extremely rare case
			l.state = ExprState
			l.r.Move(1)
			tt = PunctuatorToken
		}
	case ']':
		l.state = SubscriptState
		l.r.Move(1)
		tt = PunctuatorToken
	case '[', ';', ',', '~', '?', ':':
		l.state = ExprState
		l.r.Move(1)
		tt = PunctuatorToken
	case '<', '>', '=', '!', '+', '-', '*', '%', '&', '|', '^':
		// '<' may start an HTML <!-- comment; '-' may start an HTML --> comment,
		// but only when the current line contains no meaningful tokens yet
		if (c == '<' || (l.emptyLine && c == '-')) && l.consumeCommentToken() {
			return CommentToken, l.r.Shift()
		} else if l.consumeLongPunctuatorToken() {
			l.state = ExprState
			tt = PunctuatorToken
		}
	case '/':
		// '/' is, in order of preference: a comment, a regexp (only when an
		// expression is expected), or a division punctuator
		if l.consumeCommentToken() {
			return CommentToken, l.r.Shift()
		} else if l.state == ExprState && l.consumeRegexpToken() {
			l.state = SubscriptState
			tt = RegexpToken
		} else if l.consumeLongPunctuatorToken() {
			l.state = ExprState
			tt = PunctuatorToken
		}
	case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '.':
		if l.consumeNumericToken() {
			tt = NumericToken
			l.state = SubscriptState
		} else if c == '.' {
			// a lone '.' is the member-access punctuator
			l.state = PropNameState
			l.r.Move(1)
			tt = PunctuatorToken
		}
	case '\'', '"':
		if l.consumeStringToken() {
			l.state = SubscriptState
			tt = StringToken
		}
	case ' ', '\t', '\v', '\f':
		l.r.Move(1)
		for l.consumeWhitespace() {
		}
		// whitespace returns early so that emptyLine is left untouched
		return WhitespaceToken, l.r.Shift()
	case '\n', '\r':
		l.r.Move(1)
		for l.consumeLineTerminator() {
		}
		tt = LineTerminatorToken
	case '`':
		if l.consumeTemplateToken() {
			tt = TemplateToken
		}
	default:
		if l.consumeIdentifierToken() {
			tt = IdentifierToken
			if l.state != PropNameState {
				// keywords influence whether a following '/' is a regexp
				switch hash := ToHash(l.r.Lexeme()); hash {
				case 0, This, False, True, Null:
					l.state = SubscriptState
				case If, While, For, With:
					l.state = StmtParensState
				default:
					// This will include keywords that can't be followed by a regexp, but only
					// by a specified char (like `switch` or `try`), but we don't check for syntax
					// errors as we don't attempt to parse a full JS grammar when streaming
					l.state = ExprState
				}
			} else {
				// property names are never keywords
				l.state = SubscriptState
			}
		} else if c >= 0xC0 {
			// non-ASCII rune: may still be Unicode whitespace or a line separator
			if l.consumeWhitespace() {
				for l.consumeWhitespace() {
				}
				return WhitespaceToken, l.r.Shift()
			} else if l.consumeLineTerminator() {
				for l.consumeLineTerminator() {
				}
				tt = LineTerminatorToken
			}
		} else if l.Err() != nil {
			return ErrorToken, nil
		}
	}
	// the line is "empty" only while nothing but line terminators were read
	l.emptyLine = tt == LineTerminatorToken
	if tt == UnknownToken {
		// skip one rune so the lexer always makes progress
		_, n := l.r.PeekRune(0)
		l.r.Move(n)
	}
	return tt, l.r.Shift()
}
  241. ////////////////////////////////////////////////////////////////
  242. /*
  243. The following functions follow the specifications at http://www.ecma-international.org/ecma-262/5.1/
  244. */
  245. func (l *Lexer) consumeWhitespace() bool {
  246. c := l.r.Peek(0)
  247. if c == ' ' || c == '\t' || c == '\v' || c == '\f' {
  248. l.r.Move(1)
  249. return true
  250. } else if c >= 0xC0 {
  251. if r, n := l.r.PeekRune(0); r == '\u00A0' || r == '\uFEFF' || unicode.Is(unicode.Zs, r) {
  252. l.r.Move(n)
  253. return true
  254. }
  255. }
  256. return false
  257. }
  258. func (l *Lexer) consumeLineTerminator() bool {
  259. c := l.r.Peek(0)
  260. if c == '\n' {
  261. l.r.Move(1)
  262. return true
  263. } else if c == '\r' {
  264. if l.r.Peek(1) == '\n' {
  265. l.r.Move(2)
  266. } else {
  267. l.r.Move(1)
  268. }
  269. return true
  270. } else if c >= 0xC0 {
  271. if r, n := l.r.PeekRune(0); r == '\u2028' || r == '\u2029' {
  272. l.r.Move(n)
  273. return true
  274. }
  275. }
  276. return false
  277. }
  278. func (l *Lexer) consumeDigit() bool {
  279. if c := l.r.Peek(0); c >= '0' && c <= '9' {
  280. l.r.Move(1)
  281. return true
  282. }
  283. return false
  284. }
  285. func (l *Lexer) consumeHexDigit() bool {
  286. if c := l.r.Peek(0); (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F') {
  287. l.r.Move(1)
  288. return true
  289. }
  290. return false
  291. }
  292. func (l *Lexer) consumeBinaryDigit() bool {
  293. if c := l.r.Peek(0); c == '0' || c == '1' {
  294. l.r.Move(1)
  295. return true
  296. }
  297. return false
  298. }
  299. func (l *Lexer) consumeOctalDigit() bool {
  300. if c := l.r.Peek(0); c >= '0' && c <= '7' {
  301. l.r.Move(1)
  302. return true
  303. }
  304. return false
  305. }
  306. func (l *Lexer) consumeUnicodeEscape() bool {
  307. if l.r.Peek(0) != '\\' || l.r.Peek(1) != 'u' {
  308. return false
  309. }
  310. mark := l.r.Pos()
  311. l.r.Move(2)
  312. if c := l.r.Peek(0); c == '{' {
  313. l.r.Move(1)
  314. if l.consumeHexDigit() {
  315. for l.consumeHexDigit() {
  316. }
  317. if c := l.r.Peek(0); c == '}' {
  318. l.r.Move(1)
  319. return true
  320. }
  321. }
  322. l.r.Rewind(mark)
  323. return false
  324. } else if !l.consumeHexDigit() || !l.consumeHexDigit() || !l.consumeHexDigit() || !l.consumeHexDigit() {
  325. l.r.Rewind(mark)
  326. return false
  327. }
  328. return true
  329. }
  330. func (l *Lexer) consumeSingleLineComment() {
  331. for {
  332. c := l.r.Peek(0)
  333. if c == '\r' || c == '\n' || c == 0 {
  334. break
  335. } else if c >= 0xC0 {
  336. if r, _ := l.r.PeekRune(0); r == '\u2028' || r == '\u2029' {
  337. break
  338. }
  339. }
  340. l.r.Move(1)
  341. }
  342. }
  343. ////////////////////////////////////////////////////////////////
// consumeCommentToken consumes a //, /* */ or HTML-style (<!-- or -->) comment
// and reports whether one was consumed. The caller checks the emptyLine
// restriction for the closing --> form before calling.
func (l *Lexer) consumeCommentToken() bool {
	c := l.r.Peek(0)
	if c == '/' {
		c = l.r.Peek(1)
		if c == '/' {
			// single line
			l.r.Move(2)
			l.consumeSingleLineComment()
		} else if c == '*' {
			// multi line
			l.r.Move(2)
			for {
				c := l.r.Peek(0)
				if c == '*' && l.r.Peek(1) == '/' {
					l.r.Move(2)
					return true
				} else if c == 0 {
					// unterminated comment: everything up to EOF is consumed
					break
				} else if l.consumeLineTerminator() {
					// a newline inside a multiline comment makes the line "empty"
					l.emptyLine = true
				} else {
					l.r.Move(1)
				}
			}
		} else {
			return false
		}
	} else if c == '<' && l.r.Peek(1) == '!' && l.r.Peek(2) == '-' && l.r.Peek(3) == '-' {
		// opening HTML-style single line comment
		l.r.Move(4)
		l.consumeSingleLineComment()
	} else if c == '-' && l.r.Peek(1) == '-' && l.r.Peek(2) == '>' {
		// closing HTML-style single line comment
		// (only if current line didn't contain any meaningful tokens)
		l.r.Move(3)
		l.consumeSingleLineComment()
	} else {
		return false
	}
	return true
}
// consumeLongPunctuatorToken consumes a possibly multi-character punctuator
// (e.g. == === += ++ => << >>> >>>=) and always reports true. The caller
// guarantees the current char is one of ! = + - * / % & | ^ < >.
func (l *Lexer) consumeLongPunctuatorToken() bool {
	c := l.r.Peek(0)
	if c == '!' || c == '=' || c == '+' || c == '-' || c == '*' || c == '/' || c == '%' || c == '&' || c == '|' || c == '^' {
		l.r.Move(1)
		if l.r.Peek(0) == '=' {
			// != == += -= *= /= %= &= |= ^=
			l.r.Move(1)
			if (c == '!' || c == '=') && l.r.Peek(0) == '=' {
				// !== ===
				l.r.Move(1)
			}
		} else if (c == '+' || c == '-' || c == '&' || c == '|') && l.r.Peek(0) == c {
			// ++ -- && ||
			l.r.Move(1)
		} else if c == '=' && l.r.Peek(0) == '>' {
			// => (arrow)
			l.r.Move(1)
		}
	} else { // c == '<' || c == '>'
		l.r.Move(1)
		if l.r.Peek(0) == c {
			// << >>
			l.r.Move(1)
			if c == '>' && l.r.Peek(0) == '>' {
				// >>>
				l.r.Move(1)
			}
		}
		if l.r.Peek(0) == '=' {
			// <= >= <<= >>= >>>=
			l.r.Move(1)
		}
	}
	return true
}
  413. func (l *Lexer) consumeIdentifierToken() bool {
  414. c := l.r.Peek(0)
  415. if (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '$' || c == '_' {
  416. l.r.Move(1)
  417. } else if c >= 0xC0 {
  418. if r, n := l.r.PeekRune(0); unicode.IsOneOf(identifierStart, r) {
  419. l.r.Move(n)
  420. } else {
  421. return false
  422. }
  423. } else if !l.consumeUnicodeEscape() {
  424. return false
  425. }
  426. for {
  427. c := l.r.Peek(0)
  428. if (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '$' || c == '_' {
  429. l.r.Move(1)
  430. } else if c >= 0xC0 {
  431. if r, n := l.r.PeekRune(0); r == '\u200C' || r == '\u200D' || unicode.IsOneOf(identifierContinue, r) {
  432. l.r.Move(n)
  433. } else {
  434. break
  435. }
  436. } else {
  437. break
  438. }
  439. }
  440. return true
  441. }
// consumeNumericToken consumes a numeric literal (decimal, hex 0x, binary 0b,
// octal 0o, fraction and exponent forms) and reports whether it did. A
// trailing '.' or 'e' that cannot complete the literal is given back so it can
// start the next token.
func (l *Lexer) consumeNumericToken() bool {
	// assume to be on 0 1 2 3 4 5 6 7 8 9 .
	mark := l.r.Pos()
	c := l.r.Peek(0)
	if c == '0' {
		l.r.Move(1)
		if l.r.Peek(0) == 'x' || l.r.Peek(0) == 'X' {
			l.r.Move(1)
			if l.consumeHexDigit() {
				for l.consumeHexDigit() {
				}
			} else {
				l.r.Move(-1) // return just the zero
			}
			return true
		} else if l.r.Peek(0) == 'b' || l.r.Peek(0) == 'B' {
			l.r.Move(1)
			if l.consumeBinaryDigit() {
				for l.consumeBinaryDigit() {
				}
			} else {
				l.r.Move(-1) // return just the zero
			}
			return true
		} else if l.r.Peek(0) == 'o' || l.r.Peek(0) == 'O' {
			l.r.Move(1)
			if l.consumeOctalDigit() {
				for l.consumeOctalDigit() {
				}
			} else {
				l.r.Move(-1) // return just the zero
			}
			return true
		}
	} else if c != '.' {
		// integer part of a decimal literal
		for l.consumeDigit() {
		}
	}
	if l.r.Peek(0) == '.' {
		l.r.Move(1)
		if l.consumeDigit() {
			// fraction digits
			for l.consumeDigit() {
			}
		} else if c != '.' {
			// . could belong to the next token
			l.r.Move(-1)
			return true
		} else {
			// started on '.' with no digits following: not a number at all
			l.r.Rewind(mark)
			return false
		}
	}
	mark = l.r.Pos()
	c = l.r.Peek(0)
	if c == 'e' || c == 'E' {
		l.r.Move(1)
		c = l.r.Peek(0)
		if c == '+' || c == '-' {
			l.r.Move(1)
		}
		if !l.consumeDigit() {
			// e could belong to the next token
			l.r.Rewind(mark)
			return true
		}
		for l.consumeDigit() {
		}
	}
	return true
}
// consumeStringToken consumes a single- or double-quoted string literal and
// reports whether it did. Escaped delimiters, escaped backslashes and line
// continuations (\ followed by a line terminator) are allowed; an unescaped
// line terminator aborts and rewinds.
func (l *Lexer) consumeStringToken() bool {
	// assume to be on ' or "
	mark := l.r.Pos()
	delim := l.r.Peek(0)
	l.r.Move(1)
	for {
		c := l.r.Peek(0)
		if c == delim {
			l.r.Move(1)
			break
		} else if c == '\\' {
			l.r.Move(1)
			if !l.consumeLineTerminator() {
				// only the delimiter and backslash need the extra skip here;
				// other escaped chars are consumed by the Move(1) below
				if c := l.r.Peek(0); c == delim || c == '\\' {
					l.r.Move(1)
				}
			}
			continue
		} else if c == '\n' || c == '\r' {
			// unescaped line terminator: not a valid string
			l.r.Rewind(mark)
			return false
		} else if c >= 0xC0 {
			if r, _ := l.r.PeekRune(0); r == '\u2028' || r == '\u2029' {
				l.r.Rewind(mark)
				return false
			}
		} else if c == 0 {
			// EOF: accept what was consumed so far
			break
		}
		l.r.Move(1)
	}
	return true
}
// consumeRegexpToken consumes a regular expression literal including its
// flags and reports whether it did. '/' inside a [...] character class does
// not terminate the literal; a line terminator aborts and rewinds.
func (l *Lexer) consumeRegexpToken() bool {
	// assume to be on / and not /*
	mark := l.r.Pos()
	l.r.Move(1)
	inClass := false
	for {
		c := l.r.Peek(0)
		if !inClass && c == '/' {
			l.r.Move(1)
			break
		} else if c == '[' {
			inClass = true
		} else if c == ']' {
			inClass = false
		} else if c == '\\' {
			l.r.Move(1)
			// an escaped line terminator is not allowed in a regexp
			if l.consumeLineTerminator() {
				l.r.Rewind(mark)
				return false
			} else if l.r.Peek(0) == 0 {
				return true
			}
		} else if l.consumeLineTerminator() {
			l.r.Rewind(mark)
			return false
		} else if c == 0 {
			// EOF: accept what was consumed so far
			return true
		}
		l.r.Move(1)
	}
	// flags: same char set as identifier continuation
	for {
		c := l.r.Peek(0)
		if (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '$' || c == '_' {
			l.r.Move(1)
		} else if c >= 0xC0 {
			if r, n := l.r.PeekRune(0); r == '\u200C' || r == '\u200D' || unicode.IsOneOf(identifierContinue, r) {
				l.r.Move(n)
			} else {
				break
			}
		} else {
			break
		}
	}
	return true
}
  592. func (l *Lexer) consumeTemplateToken() bool {
  593. // assume to be on ` or } when already within template
  594. mark := l.r.Pos()
  595. l.r.Move(1)
  596. for {
  597. c := l.r.Peek(0)
  598. if c == '`' {
  599. l.state = SubscriptState
  600. l.r.Move(1)
  601. return true
  602. } else if c == '$' && l.r.Peek(1) == '{' {
  603. l.enterContext(TemplateContext)
  604. l.state = ExprState
  605. l.r.Move(2)
  606. return true
  607. } else if c == 0 {
  608. l.r.Rewind(mark)
  609. return false
  610. }
  611. l.r.Move(1)
  612. }
  613. }