  1. // Package js is an ECMAScript5.1 lexer following the specifications at http://www.ecma-international.org/ecma-262/5.1/.
  2. package js // import "github.com/tdewolff/parse/js"
  3. import (
  4. "io"
  5. "strconv"
  6. "unicode"
  7. "github.com/tdewolff/parse/buffer"
  8. )
// identifierStart is the set of Unicode categories whose characters may begin an
// identifier (letters, letter numbers and Other_ID_Start), per the
// IdentifierStart production of ECMAScript.
var identifierStart = []*unicode.RangeTable{unicode.Lu, unicode.Ll, unicode.Lt, unicode.Lm, unicode.Lo, unicode.Nl, unicode.Other_ID_Start}

// identifierContinue additionally allows combining marks, digits and connector
// punctuation for the non-first characters of an identifier.
var identifierContinue = []*unicode.RangeTable{unicode.Lu, unicode.Ll, unicode.Lt, unicode.Lm, unicode.Lo, unicode.Nl, unicode.Mn, unicode.Mc, unicode.Nd, unicode.Pc, unicode.Other_ID_Continue}
////////////////////////////////////////////////////////////////

// TokenType determines the type of token, eg. a number or a semicolon.
type TokenType uint32

// TokenType values.
const (
	ErrorToken TokenType = iota // extra token when errors occur
	UnknownToken                // extra token when no token can be matched
	WhitespaceToken             // space \t \v \f
	LineTerminatorToken         // \r \n \r\n
	SingleLineCommentToken
	MultiLineCommentToken // token for comments with line terminators (not just any /*block*/)
	IdentifierToken
	PunctuatorToken /* { } ( ) [ ] . ; , < > <= >= == != === !== + - * % ++ -- << >>
	>>> & | ^ ! ~ && || ? : = += -= *= %= <<= >>= >>>= &= |= ^= / /= >= */
	NumericToken
	StringToken
	RegexpToken
	TemplateToken
)
// TokenState determines a state in which next token should be read
type TokenState uint32

// TokenState values
const (
	ExprState TokenState = iota // an expression may follow; here '/' starts a regexp
	StmtParensState             // at the '(' that follows if/while/for/with
	SubscriptState              // after an operand; here '/' is the division punctuator
	PropNameState               // after '.'; the next identifier is a property name, never a keyword
)
// ParsingContext determines the context in which following token should be parsed.
// This affects parsing regular expressions and template literals.
type ParsingContext uint32

// ParsingContext values
const (
	GlobalContext ParsingContext = iota // top level; also returned when the context stack is empty
	StmtParensContext                   // parentheses of an if/while/for/with statement
	ExprParensContext                   // parentheses inside an expression
	BracesContext                       // a { } block or object literal
	TemplateContext                     // inside a ${ } interpolation of a template literal
)
  50. // String returns the string representation of a TokenType.
  51. func (tt TokenType) String() string {
  52. switch tt {
  53. case ErrorToken:
  54. return "Error"
  55. case UnknownToken:
  56. return "Unknown"
  57. case WhitespaceToken:
  58. return "Whitespace"
  59. case LineTerminatorToken:
  60. return "LineTerminator"
  61. case SingleLineCommentToken:
  62. return "SingleLineComment"
  63. case MultiLineCommentToken:
  64. return "MultiLineComment"
  65. case IdentifierToken:
  66. return "Identifier"
  67. case PunctuatorToken:
  68. return "Punctuator"
  69. case NumericToken:
  70. return "Numeric"
  71. case StringToken:
  72. return "String"
  73. case RegexpToken:
  74. return "Regexp"
  75. case TemplateToken:
  76. return "Template"
  77. }
  78. return "Invalid(" + strconv.Itoa(int(tt)) + ")"
  79. }
////////////////////////////////////////////////////////////////

// Lexer is the state for the lexer.
type Lexer struct {
	r         *buffer.Lexer    // buffered input reader
	stack     []ParsingContext // open parenthesis/brace/template contexts
	state     TokenState       // how the next token is interpreted (e.g. '/' as regexp vs division)
	emptyLine bool             // whether the current line contains no meaningful token yet (for --> comments)
}
  88. // NewLexer returns a new Lexer for a given io.Reader.
  89. func NewLexer(r io.Reader) *Lexer {
  90. return &Lexer{
  91. r: buffer.NewLexer(r),
  92. stack: make([]ParsingContext, 0, 16),
  93. state: ExprState,
  94. emptyLine: true,
  95. }
  96. }
// enterContext pushes a new parsing context onto the stack.
func (l *Lexer) enterContext(context ParsingContext) {
	l.stack = append(l.stack, context)
}
  100. func (l *Lexer) leaveContext() ParsingContext {
  101. ctx := GlobalContext
  102. if last := len(l.stack) - 1; last >= 0 {
  103. ctx, l.stack = l.stack[last], l.stack[:last]
  104. }
  105. return ctx
  106. }
// Err returns the error encountered during lexing, this is often io.EOF but also other errors can be returned.
func (l *Lexer) Err() error {
	return l.r.Err()
}
// Restore restores the NULL byte at the end of the buffer.
func (l *Lexer) Restore() {
	l.r.Restore()
}
// Next returns the next Token. It returns ErrorToken when an error was encountered. Using Err() one can retrieve the error message.
func (l *Lexer) Next() (TokenType, []byte) {
	tt := UnknownToken
	c := l.r.Peek(0)
	switch c {
	case '(':
		// statement parentheses (after if/while/for/with) are tracked separately
		// so that the matching ')' knows whether a regexp may follow
		if l.state == StmtParensState {
			l.enterContext(StmtParensContext)
		} else {
			l.enterContext(ExprParensContext)
		}
		l.state = ExprState
		l.r.Move(1)
		tt = PunctuatorToken
	case ')':
		if l.leaveContext() == StmtParensContext {
			l.state = ExprState
		} else {
			l.state = SubscriptState
		}
		l.r.Move(1)
		tt = PunctuatorToken
	case '{':
		l.enterContext(BracesContext)
		l.state = ExprState
		l.r.Move(1)
		tt = PunctuatorToken
	case '}':
		// '}' may close a ${ } interpolation, in which case the remainder of
		// the template literal is consumed here
		if l.leaveContext() == TemplateContext && l.consumeTemplateToken() {
			tt = TemplateToken
		} else {
			// will work incorrectly for objects or functions divided by something,
			// but that's an extremely rare case
			l.state = ExprState
			l.r.Move(1)
			tt = PunctuatorToken
		}
	case ']':
		l.state = SubscriptState
		l.r.Move(1)
		tt = PunctuatorToken
	case '[', ';', ',', '~', '?', ':':
		l.state = ExprState
		l.r.Move(1)
		tt = PunctuatorToken
	case '<', '>', '=', '!', '+', '-', '*', '%', '&', '|', '^':
		// '<' and '-' may start legacy HTML-style comments (<!-- and -->)
		if l.consumeHTMLLikeCommentToken() {
			return SingleLineCommentToken, l.r.Shift()
		} else if l.consumeLongPunctuatorToken() {
			l.state = ExprState
			tt = PunctuatorToken
		}
	case '/':
		// '/' is ambiguous: comment, regexp (only in expression position) or
		// the division punctuator
		if tt = l.consumeCommentToken(); tt != UnknownToken {
			return tt, l.r.Shift()
		} else if l.state == ExprState && l.consumeRegexpToken() {
			l.state = SubscriptState
			tt = RegexpToken
		} else if l.consumeLongPunctuatorToken() {
			l.state = ExprState
			tt = PunctuatorToken
		}
	case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '.':
		if l.consumeNumericToken() {
			tt = NumericToken
			l.state = SubscriptState
		} else if c == '.' {
			// a lone '.' is the property-access punctuator
			l.state = PropNameState
			l.r.Move(1)
			tt = PunctuatorToken
		}
	case '\'', '"':
		if l.consumeStringToken() {
			l.state = SubscriptState
			tt = StringToken
		}
	case ' ', '\t', '\v', '\f':
		l.r.Move(1)
		for l.consumeWhitespace() {
		}
		return WhitespaceToken, l.r.Shift()
	case '\n', '\r':
		l.r.Move(1)
		for l.consumeLineTerminator() {
		}
		tt = LineTerminatorToken
	case '`':
		if l.consumeTemplateToken() {
			tt = TemplateToken
		}
	default:
		if l.consumeIdentifierToken() {
			tt = IdentifierToken
			if l.state != PropNameState {
				// keywords change what may follow; hash 0 means "not a keyword"
				switch hash := ToHash(l.r.Lexeme()); hash {
				case 0, This, False, True, Null:
					l.state = SubscriptState
				case If, While, For, With:
					l.state = StmtParensState
				default:
					// This will include keywords that can't be followed by a regexp, but only
					// by a specified char (like `switch` or `try`), but we don't check for syntax
					// errors as we don't attempt to parse a full JS grammar when streaming
					l.state = ExprState
				}
			} else {
				l.state = SubscriptState
			}
		} else if c >= 0xC0 {
			// multi-byte UTF-8: may still be whitespace or a line terminator
			if l.consumeWhitespace() {
				for l.consumeWhitespace() {
				}
				return WhitespaceToken, l.r.Shift()
			} else if l.consumeLineTerminator() {
				for l.consumeLineTerminator() {
				}
				tt = LineTerminatorToken
			}
		} else if l.Err() != nil {
			return ErrorToken, nil
		}
	}
	// track whether the current line still has no meaningful tokens (for -->)
	l.emptyLine = tt == LineTerminatorToken
	if tt == UnknownToken {
		// skip one (possibly multi-byte) unmatched character
		_, n := l.r.PeekRune(0)
		l.r.Move(n)
	}
	return tt, l.r.Shift()
}
  244. ////////////////////////////////////////////////////////////////
  245. /*
  246. The following functions follow the specifications at http://www.ecma-international.org/ecma-262/5.1/
  247. */
  248. func (l *Lexer) consumeWhitespace() bool {
  249. c := l.r.Peek(0)
  250. if c == ' ' || c == '\t' || c == '\v' || c == '\f' {
  251. l.r.Move(1)
  252. return true
  253. } else if c >= 0xC0 {
  254. if r, n := l.r.PeekRune(0); r == '\u00A0' || r == '\uFEFF' || unicode.Is(unicode.Zs, r) {
  255. l.r.Move(n)
  256. return true
  257. }
  258. }
  259. return false
  260. }
  261. func (l *Lexer) consumeLineTerminator() bool {
  262. c := l.r.Peek(0)
  263. if c == '\n' {
  264. l.r.Move(1)
  265. return true
  266. } else if c == '\r' {
  267. if l.r.Peek(1) == '\n' {
  268. l.r.Move(2)
  269. } else {
  270. l.r.Move(1)
  271. }
  272. return true
  273. } else if c >= 0xC0 {
  274. if r, n := l.r.PeekRune(0); r == '\u2028' || r == '\u2029' {
  275. l.r.Move(n)
  276. return true
  277. }
  278. }
  279. return false
  280. }
  281. func (l *Lexer) consumeDigit() bool {
  282. if c := l.r.Peek(0); c >= '0' && c <= '9' {
  283. l.r.Move(1)
  284. return true
  285. }
  286. return false
  287. }
  288. func (l *Lexer) consumeHexDigit() bool {
  289. if c := l.r.Peek(0); (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F') {
  290. l.r.Move(1)
  291. return true
  292. }
  293. return false
  294. }
  295. func (l *Lexer) consumeBinaryDigit() bool {
  296. if c := l.r.Peek(0); c == '0' || c == '1' {
  297. l.r.Move(1)
  298. return true
  299. }
  300. return false
  301. }
  302. func (l *Lexer) consumeOctalDigit() bool {
  303. if c := l.r.Peek(0); c >= '0' && c <= '7' {
  304. l.r.Move(1)
  305. return true
  306. }
  307. return false
  308. }
// consumeUnicodeEscape consumes a unicode escape sequence: either \uXXXX
// with exactly four hex digits, or \u{X...} with one or more hex digits.
// On failure the reader is rewound to the backslash and false is returned.
func (l *Lexer) consumeUnicodeEscape() bool {
	if l.r.Peek(0) != '\\' || l.r.Peek(1) != 'u' {
		return false
	}
	mark := l.r.Pos()
	l.r.Move(2)
	if c := l.r.Peek(0); c == '{' {
		// braced form \u{...}: at least one hex digit, then '}'
		l.r.Move(1)
		if l.consumeHexDigit() {
			for l.consumeHexDigit() {
			}
			if c := l.r.Peek(0); c == '}' {
				l.r.Move(1)
				return true
			}
		}
		l.r.Rewind(mark)
		return false
	} else if !l.consumeHexDigit() || !l.consumeHexDigit() || !l.consumeHexDigit() || !l.consumeHexDigit() {
		// plain form requires exactly four hex digits
		l.r.Rewind(mark)
		return false
	}
	return true
}
  333. func (l *Lexer) consumeSingleLineComment() {
  334. for {
  335. c := l.r.Peek(0)
  336. if c == '\r' || c == '\n' || c == 0 {
  337. break
  338. } else if c >= 0xC0 {
  339. if r, _ := l.r.PeekRune(0); r == '\u2028' || r == '\u2029' {
  340. break
  341. }
  342. }
  343. l.r.Move(1)
  344. }
  345. }
  346. ////////////////////////////////////////////////////////////////
  347. func (l *Lexer) consumeHTMLLikeCommentToken() bool {
  348. c := l.r.Peek(0)
  349. if c == '<' && l.r.Peek(1) == '!' && l.r.Peek(2) == '-' && l.r.Peek(3) == '-' {
  350. // opening HTML-style single line comment
  351. l.r.Move(4)
  352. l.consumeSingleLineComment()
  353. return true
  354. } else if l.emptyLine && c == '-' && l.r.Peek(1) == '-' && l.r.Peek(2) == '>' {
  355. // closing HTML-style single line comment
  356. // (only if current line didn't contain any meaningful tokens)
  357. l.r.Move(3)
  358. l.consumeSingleLineComment()
  359. return true
  360. }
  361. return false
  362. }
// consumeCommentToken consumes // and /* */ comments. It returns
// SingleLineCommentToken, MultiLineCommentToken (a block comment that
// contains at least one line terminator) or UnknownToken when the input
// does not start a comment.
func (l *Lexer) consumeCommentToken() TokenType {
	c := l.r.Peek(0)
	if c == '/' {
		c = l.r.Peek(1)
		if c == '/' {
			// single line comment
			l.r.Move(2)
			l.consumeSingleLineComment()
			return SingleLineCommentToken
		} else if c == '*' {
			// block comment (potentially multiline)
			tt := SingleLineCommentToken
			l.r.Move(2)
			for {
				c := l.r.Peek(0)
				if c == '*' && l.r.Peek(1) == '/' {
					l.r.Move(2)
					break
				} else if c == 0 {
					// unterminated block comment at EOF
					break
				} else if l.consumeLineTerminator() {
					tt = MultiLineCommentToken
					// a terminator inside the comment starts a fresh, still-empty
					// line for the purposes of --> comment detection
					l.emptyLine = true
				} else {
					l.r.Move(1)
				}
			}
			return tt
		}
	}
	return UnknownToken
}
// consumeLongPunctuatorToken consumes a possibly multi-character punctuator
// (e.g. ==, ===, +=, ++, =>, <<, >>>, >>>=). The caller guarantees the
// current character starts a punctuator, so it always returns true.
func (l *Lexer) consumeLongPunctuatorToken() bool {
	c := l.r.Peek(0)
	if c == '!' || c == '=' || c == '+' || c == '-' || c == '*' || c == '/' || c == '%' || c == '&' || c == '|' || c == '^' {
		l.r.Move(1)
		if l.r.Peek(0) == '=' {
			// == != += -= *= /= %= &= |= ^=
			l.r.Move(1)
			if (c == '!' || c == '=') && l.r.Peek(0) == '=' {
				// === !==
				l.r.Move(1)
			}
		} else if (c == '+' || c == '-' || c == '&' || c == '|') && l.r.Peek(0) == c {
			// ++ -- && ||
			l.r.Move(1)
		} else if c == '=' && l.r.Peek(0) == '>' {
			// => arrow
			l.r.Move(1)
		}
	} else { // c == '<' || c == '>'
		l.r.Move(1)
		if l.r.Peek(0) == c {
			// << >>
			l.r.Move(1)
			if c == '>' && l.r.Peek(0) == '>' {
				// >>>
				l.r.Move(1)
			}
		}
		if l.r.Peek(0) == '=' {
			// <= >= <<= >>= >>>=
			l.r.Move(1)
		}
	}
	return true
}
// consumeIdentifierToken consumes an identifier: an identifier-start
// character (ASCII letter, $, _, Unicode start character or \u escape)
// followed by any number of identifier-continue characters.
func (l *Lexer) consumeIdentifierToken() bool {
	c := l.r.Peek(0)
	if (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '$' || c == '_' {
		l.r.Move(1)
	} else if c >= 0xC0 {
		if r, n := l.r.PeekRune(0); unicode.IsOneOf(identifierStart, r) {
			l.r.Move(n)
		} else {
			return false
		}
	} else if !l.consumeUnicodeEscape() {
		return false
	}
	for {
		c := l.r.Peek(0)
		if (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '$' || c == '_' {
			l.r.Move(1)
		} else if c >= 0xC0 {
			// ZWNJ (U+200C) and ZWJ (U+200D) are explicitly allowed to continue
			// an identifier
			if r, n := l.r.PeekRune(0); r == '\u200C' || r == '\u200D' || unicode.IsOneOf(identifierContinue, r) {
				l.r.Move(n)
			} else {
				break
			}
		} else {
			break
		}
	}
	return true
}
// consumeNumericToken consumes a numeric literal: decimal with optional
// fraction and exponent, or hexadecimal 0x, binary 0b, octal 0o. A lone '.'
// is not consumed (the reader is rewound and false returned) since it is the
// property-access punctuator.
func (l *Lexer) consumeNumericToken() bool {
	// assume to be on 0 1 2 3 4 5 6 7 8 9 .
	mark := l.r.Pos()
	c := l.r.Peek(0)
	if c == '0' {
		l.r.Move(1)
		if l.r.Peek(0) == 'x' || l.r.Peek(0) == 'X' {
			l.r.Move(1)
			if l.consumeHexDigit() {
				for l.consumeHexDigit() {
				}
			} else {
				l.r.Move(-1) // return just the zero
			}
			return true
		} else if l.r.Peek(0) == 'b' || l.r.Peek(0) == 'B' {
			l.r.Move(1)
			if l.consumeBinaryDigit() {
				for l.consumeBinaryDigit() {
				}
			} else {
				l.r.Move(-1) // return just the zero
			}
			return true
		} else if l.r.Peek(0) == 'o' || l.r.Peek(0) == 'O' {
			l.r.Move(1)
			if l.consumeOctalDigit() {
				for l.consumeOctalDigit() {
				}
			} else {
				l.r.Move(-1) // return just the zero
			}
			return true
		}
	} else if c != '.' {
		// integer part
		for l.consumeDigit() {
		}
	}
	if l.r.Peek(0) == '.' {
		l.r.Move(1)
		if l.consumeDigit() {
			// fractional digits
			for l.consumeDigit() {
			}
		} else if c != '.' {
			// . could belong to the next token
			l.r.Move(-1)
			return true
		} else {
			// a lone '.' with no digits on either side is not a number
			l.r.Rewind(mark)
			return false
		}
	}
	mark = l.r.Pos()
	c = l.r.Peek(0)
	if c == 'e' || c == 'E' {
		l.r.Move(1)
		c = l.r.Peek(0)
		if c == '+' || c == '-' {
			l.r.Move(1)
		}
		if !l.consumeDigit() {
			// e could belong to the next token
			l.r.Rewind(mark)
			return true
		}
		for l.consumeDigit() {
		}
	}
	return true
}
// consumeStringToken consumes a single- or double-quoted string literal,
// including escape sequences and escaped line continuations. On an
// unescaped line terminator the reader is rewound and false is returned.
// An unterminated string at EOF is accepted as-is.
func (l *Lexer) consumeStringToken() bool {
	// assume to be on ' or "
	mark := l.r.Pos()
	delim := l.r.Peek(0)
	l.r.Move(1)
	for {
		c := l.r.Peek(0)
		if c == delim {
			l.r.Move(1)
			break
		} else if c == '\\' {
			l.r.Move(1)
			if !l.consumeLineTerminator() {
				// skip an escaped delimiter or backslash so it does not
				// terminate the string / escape the next character
				if c := l.r.Peek(0); c == delim || c == '\\' {
					l.r.Move(1)
				}
			}
			continue
		} else if c == '\n' || c == '\r' {
			l.r.Rewind(mark)
			return false
		} else if c >= 0xC0 {
			if r, _ := l.r.PeekRune(0); r == '\u2028' || r == '\u2029' {
				l.r.Rewind(mark)
				return false
			}
		} else if c == 0 {
			// EOF inside the string
			break
		}
		l.r.Move(1)
	}
	return true
}
// consumeRegexpToken consumes a regular expression literal /pattern/flags.
// Inside a character class [...] a '/' does not terminate the pattern. On a
// line terminator the reader is rewound and false is returned so the '/'
// can be retried as a punctuator.
func (l *Lexer) consumeRegexpToken() bool {
	// assume to be on / and not /*
	mark := l.r.Pos()
	l.r.Move(1)
	inClass := false
	for {
		c := l.r.Peek(0)
		if !inClass && c == '/' {
			l.r.Move(1)
			break
		} else if c == '[' {
			inClass = true
		} else if c == ']' {
			inClass = false
		} else if c == '\\' {
			l.r.Move(1)
			if l.consumeLineTerminator() {
				l.r.Rewind(mark)
				return false
			} else if l.r.Peek(0) == 0 {
				// EOF right after the backslash
				return true
			}
		} else if l.consumeLineTerminator() {
			l.r.Rewind(mark)
			return false
		} else if c == 0 {
			// unterminated regexp at EOF
			return true
		}
		l.r.Move(1)
	}
	// flags, lexed like identifier-continue characters
	for {
		c := l.r.Peek(0)
		if (c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '$' || c == '_' {
			l.r.Move(1)
		} else if c >= 0xC0 {
			if r, n := l.r.PeekRune(0); r == '\u200C' || r == '\u200D' || unicode.IsOneOf(identifierContinue, r) {
				l.r.Move(n)
			} else {
				break
			}
		} else {
			break
		}
	}
	return true
}
// consumeTemplateToken consumes a template literal part, starting at '`' or
// at the '}' closing an interpolation. It stops after the terminating '`'
// or after '${' (entering TemplateContext for the interpolation). On EOF
// the reader is rewound and false is returned.
func (l *Lexer) consumeTemplateToken() bool {
	// assume to be on ` or } when already within template
	mark := l.r.Pos()
	l.r.Move(1)
	for {
		c := l.r.Peek(0)
		if c == '`' {
			l.state = SubscriptState
			l.r.Move(1)
			return true
		} else if c == '$' && l.r.Peek(1) == '{' {
			l.enterContext(TemplateContext)
			l.state = ExprState
			l.r.Move(2)
			return true
		} else if c == '\\' {
			// skip the escaped character (e.g. \` or \$)
			l.r.Move(1)
			if c := l.r.Peek(0); c != 0 {
				l.r.Move(1)
			}
			continue
		} else if c == 0 {
			l.r.Rewind(mark)
			return false
		}
		l.r.Move(1)
	}
}
  629. }