lex.go 15 KB

12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182838485868788899091929394959697989910010110210310410510610710810911011111211311411511611711811912012112212312412512612712812913013113213313413513613713813914014114214314414514614714814915015115215315415515615715815916016116216316416516616716816917017117217317417517617717817918018118218318418518618718818919019119219319419519619719819920020120220320420520620720820921021121221321421521621721821922022122222322422522622722822923023123223323423523623723823924024124224324424524624724824925025125225325425525625725825926026126226326426526626726826927027127227327427527627727827928028128228328428528628728828929029129229329429529629729829930030130230330430530630730830931031131231331431531631731831932032132232332432532632732832933033133233333433533633733833934034134234334434534634734834935035135235335435535635735835936036136236336436536636736836937037137237337437537637737837938038138238338438538638738838939039139239339439539639739839940040140240340440540640740840941041141241341441541641741841942042142242342442542642742842943043143243343443543643743843944044144244344444544644744844945045145245345445545645745845946046146246346446546646746846947047147247347447547647747847948048148248348448548648748848949049149249349449549649749849950050150250350450550650750850951051151251351451551651751851952052152252352452552652752852953053153253353453553653753853954054154254354454554654754854955055155255355455555655755855956056156256356456556656756856957057157257357457557657757857958058158258358458558658758858959059159259359459559659759859960060160260360460560660760860961061161261361461561661761861962062162262362462562662762862963063163263363463563663763863964064164264364464564664764864965065165265365465565665765865966066166266366466566666766866967067167267367467567667767867968068168268368468568668768868969069169269369469569669769869970070170270
3704705706707708709710
  1. // Package css is a CSS3 lexer and parser following the specifications at http://www.w3.org/TR/css-syntax-3/.
  2. package css // import "github.com/tdewolff/parse/css"
  3. // TODO: \uFFFD replacement character for NULL bytes in strings for example, or atleast don't end the string early
  4. import (
  5. "bytes"
  6. "io"
  7. "strconv"
  8. "github.com/tdewolff/parse"
  9. "github.com/tdewolff/parse/buffer"
  10. )
  11. // TokenType determines the type of token, eg. a number or a semicolon.
  12. type TokenType uint32
  13. // TokenType values.
  14. const (
  15. ErrorToken TokenType = iota // extra token when errors occur
  16. IdentToken
  17. FunctionToken // rgb( rgba( ...
  18. AtKeywordToken // @abc
  19. HashToken // #abc
  20. StringToken
  21. BadStringToken
  22. URLToken
  23. BadURLToken
  24. DelimToken // any unmatched character
  25. NumberToken // 5
  26. PercentageToken // 5%
  27. DimensionToken // 5em
  28. UnicodeRangeToken // U+554A
  29. IncludeMatchToken // ~=
  30. DashMatchToken // |=
  31. PrefixMatchToken // ^=
  32. SuffixMatchToken // $=
  33. SubstringMatchToken // *=
  34. ColumnToken // ||
  35. WhitespaceToken // space \t \r \n \f
  36. CDOToken // <!--
  37. CDCToken // -->
  38. ColonToken // :
  39. SemicolonToken // ;
  40. CommaToken // ,
  41. LeftBracketToken // [
  42. RightBracketToken // ]
  43. LeftParenthesisToken // (
  44. RightParenthesisToken // )
  45. LeftBraceToken // {
  46. RightBraceToken // }
  47. CommentToken // extra token for comments
  48. EmptyToken
  49. CustomPropertyNameToken
  50. CustomPropertyValueToken
  51. )
  52. // String returns the string representation of a TokenType.
  53. func (tt TokenType) String() string {
  54. switch tt {
  55. case ErrorToken:
  56. return "Error"
  57. case IdentToken:
  58. return "Ident"
  59. case FunctionToken:
  60. return "Function"
  61. case AtKeywordToken:
  62. return "AtKeyword"
  63. case HashToken:
  64. return "Hash"
  65. case StringToken:
  66. return "String"
  67. case BadStringToken:
  68. return "BadString"
  69. case URLToken:
  70. return "URL"
  71. case BadURLToken:
  72. return "BadURL"
  73. case DelimToken:
  74. return "Delim"
  75. case NumberToken:
  76. return "Number"
  77. case PercentageToken:
  78. return "Percentage"
  79. case DimensionToken:
  80. return "Dimension"
  81. case UnicodeRangeToken:
  82. return "UnicodeRange"
  83. case IncludeMatchToken:
  84. return "IncludeMatch"
  85. case DashMatchToken:
  86. return "DashMatch"
  87. case PrefixMatchToken:
  88. return "PrefixMatch"
  89. case SuffixMatchToken:
  90. return "SuffixMatch"
  91. case SubstringMatchToken:
  92. return "SubstringMatch"
  93. case ColumnToken:
  94. return "Column"
  95. case WhitespaceToken:
  96. return "Whitespace"
  97. case CDOToken:
  98. return "CDO"
  99. case CDCToken:
  100. return "CDC"
  101. case ColonToken:
  102. return "Colon"
  103. case SemicolonToken:
  104. return "Semicolon"
  105. case CommaToken:
  106. return "Comma"
  107. case LeftBracketToken:
  108. return "LeftBracket"
  109. case RightBracketToken:
  110. return "RightBracket"
  111. case LeftParenthesisToken:
  112. return "LeftParenthesis"
  113. case RightParenthesisToken:
  114. return "RightParenthesis"
  115. case LeftBraceToken:
  116. return "LeftBrace"
  117. case RightBraceToken:
  118. return "RightBrace"
  119. case CommentToken:
  120. return "Comment"
  121. case EmptyToken:
  122. return "Empty"
  123. case CustomPropertyNameToken:
  124. return "CustomPropertyName"
  125. case CustomPropertyValueToken:
  126. return "CustomPropertyValue"
  127. }
  128. return "Invalid(" + strconv.Itoa(int(tt)) + ")"
  129. }
  130. ////////////////////////////////////////////////////////////////
// Lexer is the state for the lexer.
type Lexer struct {
	r *buffer.Lexer // buffered input providing Peek/Move/Pos/Rewind/Shift
}
  135. // NewLexer returns a new Lexer for a given io.Reader.
  136. func NewLexer(r io.Reader) *Lexer {
  137. return &Lexer{
  138. buffer.NewLexer(r),
  139. }
  140. }
// Err returns the error encountered during lexing, this is often io.EOF but also other errors can be returned.
func (l *Lexer) Err() error {
	return l.r.Err()
}
// Restore restores the NULL byte at the end of the buffer.
func (l *Lexer) Restore() {
	l.r.Restore()
}
// Next returns the next Token. It returns ErrorToken when an error was encountered. Using Err() one can retrieve the error message.
func (l *Lexer) Next() (TokenType, []byte) {
	switch l.r.Peek(0) {
	case ' ', '\t', '\n', '\r', '\f':
		// collapse a run of whitespace into a single WhitespaceToken
		l.r.Move(1)
		for l.consumeWhitespace() {
		}
		return WhitespaceToken, l.r.Shift()
	case ':':
		l.r.Move(1)
		return ColonToken, l.r.Shift()
	case ';':
		l.r.Move(1)
		return SemicolonToken, l.r.Shift()
	case ',':
		l.r.Move(1)
		return CommaToken, l.r.Shift()
	case '(', ')', '[', ']', '{', '}':
		if t := l.consumeBracket(); t != ErrorToken {
			return t, l.r.Shift()
		}
	case '#':
		if l.consumeHashToken() {
			return HashToken, l.r.Shift()
		}
	case '"', '\'':
		if t := l.consumeString(); t != ErrorToken {
			return t, l.r.Shift()
		}
	case '.', '+':
		if t := l.consumeNumeric(); t != ErrorToken {
			return t, l.r.Shift()
		}
	case '-':
		// order matters: number (-5), then identifier (-moz-...),
		// then CDC (-->), then custom property name (--var)
		if t := l.consumeNumeric(); t != ErrorToken {
			return t, l.r.Shift()
		} else if t := l.consumeIdentlike(); t != ErrorToken {
			return t, l.r.Shift()
		} else if l.consumeCDCToken() {
			return CDCToken, l.r.Shift()
		} else if l.consumeCustomVariableToken() {
			return CustomPropertyNameToken, l.r.Shift()
		}
	case '@':
		if l.consumeAtKeywordToken() {
			return AtKeywordToken, l.r.Shift()
		}
	case '$', '*', '^', '~':
		if t := l.consumeMatch(); t != ErrorToken {
			return t, l.r.Shift()
		}
	case '/':
		if l.consumeComment() {
			return CommentToken, l.r.Shift()
		}
	case '<':
		if l.consumeCDOToken() {
			return CDOToken, l.r.Shift()
		}
	case '\\':
		if t := l.consumeIdentlike(); t != ErrorToken {
			return t, l.r.Shift()
		}
	case 'u', 'U':
		// unicode-range (U+554A) takes precedence over an identifier
		if l.consumeUnicodeRangeToken() {
			return UnicodeRangeToken, l.r.Shift()
		} else if t := l.consumeIdentlike(); t != ErrorToken {
			return t, l.r.Shift()
		}
	case '|':
		if t := l.consumeMatch(); t != ErrorToken {
			return t, l.r.Shift()
		} else if l.consumeColumnToken() {
			return ColumnToken, l.r.Shift()
		}
	case 0:
		// the buffer yields a NULL byte at the end; report ErrorToken only
		// when an actual error (such as io.EOF) was recorded
		if l.Err() != nil {
			return ErrorToken, nil
		}
	default:
		if t := l.consumeNumeric(); t != ErrorToken {
			return t, l.r.Shift()
		} else if t := l.consumeIdentlike(); t != ErrorToken {
			return t, l.r.Shift()
		}
	}
	// can't be rune because consumeIdentlike consumes that as an identifier
	l.r.Move(1)
	return DelimToken, l.r.Shift()
}
  239. ////////////////////////////////////////////////////////////////
  240. /*
  241. The following functions follow the railroad diagrams in http://www.w3.org/TR/css3-syntax/
  242. */
  243. func (l *Lexer) consumeByte(c byte) bool {
  244. if l.r.Peek(0) == c {
  245. l.r.Move(1)
  246. return true
  247. }
  248. return false
  249. }
  250. func (l *Lexer) consumeComment() bool {
  251. if l.r.Peek(0) != '/' || l.r.Peek(1) != '*' {
  252. return false
  253. }
  254. l.r.Move(2)
  255. for {
  256. c := l.r.Peek(0)
  257. if c == 0 && l.Err() != nil {
  258. break
  259. } else if c == '*' && l.r.Peek(1) == '/' {
  260. l.r.Move(2)
  261. return true
  262. }
  263. l.r.Move(1)
  264. }
  265. return true
  266. }
  267. func (l *Lexer) consumeNewline() bool {
  268. c := l.r.Peek(0)
  269. if c == '\n' || c == '\f' {
  270. l.r.Move(1)
  271. return true
  272. } else if c == '\r' {
  273. if l.r.Peek(1) == '\n' {
  274. l.r.Move(2)
  275. } else {
  276. l.r.Move(1)
  277. }
  278. return true
  279. }
  280. return false
  281. }
  282. func (l *Lexer) consumeWhitespace() bool {
  283. c := l.r.Peek(0)
  284. if c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\f' {
  285. l.r.Move(1)
  286. return true
  287. }
  288. return false
  289. }
  290. func (l *Lexer) consumeDigit() bool {
  291. c := l.r.Peek(0)
  292. if c >= '0' && c <= '9' {
  293. l.r.Move(1)
  294. return true
  295. }
  296. return false
  297. }
  298. func (l *Lexer) consumeHexDigit() bool {
  299. c := l.r.Peek(0)
  300. if (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f') || (c >= 'A' && c <= 'F') {
  301. l.r.Move(1)
  302. return true
  303. }
  304. return false
  305. }
// consumeEscape consumes a backslash escape: '\' followed by 1-6 hex digits
// (plus one optional trailing whitespace), a multi-byte UTF-8 rune, or any
// other single byte. A backslash directly before a newline is not an escape;
// the position is rewound and false is returned. A backslash at EOF is
// accepted without consuming anything further.
func (l *Lexer) consumeEscape() bool {
	if l.r.Peek(0) != '\\' {
		return false
	}
	mark := l.r.Pos()
	l.r.Move(1)
	if l.consumeNewline() {
		// '\' + newline is a string continuation, not an escape
		l.r.Rewind(mark)
		return false
	} else if l.consumeHexDigit() {
		// up to 6 hex digits in total (one already consumed)
		for k := 1; k < 6; k++ {
			if !l.consumeHexDigit() {
				break
			}
		}
		// a single whitespace after the hex digits belongs to the escape
		l.consumeWhitespace()
		return true
	} else {
		c := l.r.Peek(0)
		if c >= 0xC0 {
			// leading byte of a multi-byte UTF-8 rune: consume it whole
			_, n := l.r.PeekRune(0)
			l.r.Move(n)
			return true
		} else if c == 0 && l.r.Err() != nil {
			// escape at EOF
			return true
		}
	}
	l.r.Move(1)
	return true
}
  336. func (l *Lexer) consumeIdentToken() bool {
  337. mark := l.r.Pos()
  338. if l.r.Peek(0) == '-' {
  339. l.r.Move(1)
  340. }
  341. c := l.r.Peek(0)
  342. if !((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || c == '_' || c >= 0x80) {
  343. if c != '\\' || !l.consumeEscape() {
  344. l.r.Rewind(mark)
  345. return false
  346. }
  347. } else {
  348. l.r.Move(1)
  349. }
  350. for {
  351. c := l.r.Peek(0)
  352. if !((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '_' || c == '-' || c >= 0x80) {
  353. if c != '\\' || !l.consumeEscape() {
  354. break
  355. }
  356. } else {
  357. l.r.Move(1)
  358. }
  359. }
  360. return true
  361. }
  362. // support custom variables, https://www.w3.org/TR/css-variables-1/
  363. func (l *Lexer) consumeCustomVariableToken() bool {
  364. // expect to be on a '-'
  365. l.r.Move(1)
  366. if l.r.Peek(0) != '-' {
  367. l.r.Move(-1)
  368. return false
  369. }
  370. if !l.consumeIdentToken() {
  371. l.r.Move(-1)
  372. return false
  373. }
  374. return true
  375. }
  376. func (l *Lexer) consumeAtKeywordToken() bool {
  377. // expect to be on an '@'
  378. l.r.Move(1)
  379. if !l.consumeIdentToken() {
  380. l.r.Move(-1)
  381. return false
  382. }
  383. return true
  384. }
  385. func (l *Lexer) consumeHashToken() bool {
  386. // expect to be on a '#'
  387. mark := l.r.Pos()
  388. l.r.Move(1)
  389. c := l.r.Peek(0)
  390. if !((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '_' || c == '-' || c >= 0x80) {
  391. if c != '\\' || !l.consumeEscape() {
  392. l.r.Rewind(mark)
  393. return false
  394. }
  395. } else {
  396. l.r.Move(1)
  397. }
  398. for {
  399. c := l.r.Peek(0)
  400. if !((c >= 'a' && c <= 'z') || (c >= 'A' && c <= 'Z') || (c >= '0' && c <= '9') || c == '_' || c == '-' || c >= 0x80) {
  401. if c != '\\' || !l.consumeEscape() {
  402. break
  403. }
  404. } else {
  405. l.r.Move(1)
  406. }
  407. }
  408. return true
  409. }
// consumeNumberToken consumes a number: an optional sign, an integer and/or
// fractional part, and an optional exponent. Returns false without advancing
// when no number starts at the current position. A trailing '.' or 'e' that
// has no digits after it is left unconsumed so it can start the next token.
func (l *Lexer) consumeNumberToken() bool {
	mark := l.r.Pos()
	c := l.r.Peek(0)
	if c == '+' || c == '-' {
		l.r.Move(1)
	}
	firstDigit := l.consumeDigit()
	if firstDigit {
		for l.consumeDigit() {
		}
	}
	if l.r.Peek(0) == '.' {
		l.r.Move(1)
		if l.consumeDigit() {
			for l.consumeDigit() {
			}
		} else if firstDigit {
			// . could belong to the next token
			l.r.Move(-1)
			return true
		} else {
			// neither integer nor fractional digits: not a number
			l.r.Rewind(mark)
			return false
		}
	} else if !firstDigit {
		l.r.Rewind(mark)
		return false
	}
	// optional exponent, e.g. 1e-5; mark is reused for a partial rewind
	mark = l.r.Pos()
	c = l.r.Peek(0)
	if c == 'e' || c == 'E' {
		l.r.Move(1)
		c = l.r.Peek(0)
		if c == '+' || c == '-' {
			l.r.Move(1)
		}
		if !l.consumeDigit() {
			// e could belong to next token
			l.r.Rewind(mark)
			return true
		}
		for l.consumeDigit() {
		}
	}
	return true
}
// consumeUnicodeRangeToken consumes a unicode-range token such as U+554A,
// U+0-7F or U+4??????. Returns false without advancing when no valid range
// follows the 'u+'/'U+' prefix.
func (l *Lexer) consumeUnicodeRangeToken() bool {
	c := l.r.Peek(0)
	if (c != 'u' && c != 'U') || l.r.Peek(1) != '+' {
		return false
	}
	mark := l.r.Pos()
	l.r.Move(2)
	if l.consumeHexDigit() {
		// consume up to 6 hexDigits
		k := 1
		for ; k < 6; k++ {
			if !l.consumeHexDigit() {
				break
			}
		}
		// either a minus or a question mark or the end is expected
		if l.consumeByte('-') {
			// consume another up to 6 hexDigits
			if l.consumeHexDigit() {
				for k := 1; k < 6; k++ {
					if !l.consumeHexDigit() {
						break
					}
				}
			} else {
				// '-' with no digits after it: invalid range
				l.r.Rewind(mark)
				return false
			}
		} else {
			// could be filled up to 6 characters with question marks or else regular hexDigits
			if l.consumeByte('?') {
				k++
				for ; k < 6; k++ {
					if !l.consumeByte('?') {
						l.r.Rewind(mark)
						return false
					}
				}
			}
		}
	} else {
		// no hex digits at all: require exactly 6 question marks
		for k := 0; k < 6; k++ {
			if !l.consumeByte('?') {
				l.r.Rewind(mark)
				return false
			}
		}
	}
	return true
}
  507. func (l *Lexer) consumeColumnToken() bool {
  508. if l.r.Peek(0) == '|' && l.r.Peek(1) == '|' {
  509. l.r.Move(2)
  510. return true
  511. }
  512. return false
  513. }
  514. func (l *Lexer) consumeCDOToken() bool {
  515. if l.r.Peek(0) == '<' && l.r.Peek(1) == '!' && l.r.Peek(2) == '-' && l.r.Peek(3) == '-' {
  516. l.r.Move(4)
  517. return true
  518. }
  519. return false
  520. }
  521. func (l *Lexer) consumeCDCToken() bool {
  522. if l.r.Peek(0) == '-' && l.r.Peek(1) == '-' && l.r.Peek(2) == '>' {
  523. l.r.Move(3)
  524. return true
  525. }
  526. return false
  527. }
  528. ////////////////////////////////////////////////////////////////
  529. // consumeMatch consumes any MatchToken.
  530. func (l *Lexer) consumeMatch() TokenType {
  531. if l.r.Peek(1) == '=' {
  532. switch l.r.Peek(0) {
  533. case '~':
  534. l.r.Move(2)
  535. return IncludeMatchToken
  536. case '|':
  537. l.r.Move(2)
  538. return DashMatchToken
  539. case '^':
  540. l.r.Move(2)
  541. return PrefixMatchToken
  542. case '$':
  543. l.r.Move(2)
  544. return SuffixMatchToken
  545. case '*':
  546. l.r.Move(2)
  547. return SubstringMatchToken
  548. }
  549. }
  550. return ErrorToken
  551. }
  552. // consumeBracket consumes any bracket token.
  553. func (l *Lexer) consumeBracket() TokenType {
  554. switch l.r.Peek(0) {
  555. case '(':
  556. l.r.Move(1)
  557. return LeftParenthesisToken
  558. case ')':
  559. l.r.Move(1)
  560. return RightParenthesisToken
  561. case '[':
  562. l.r.Move(1)
  563. return LeftBracketToken
  564. case ']':
  565. l.r.Move(1)
  566. return RightBracketToken
  567. case '{':
  568. l.r.Move(1)
  569. return LeftBraceToken
  570. case '}':
  571. l.r.Move(1)
  572. return RightBraceToken
  573. }
  574. return ErrorToken
  575. }
  576. // consumeNumeric consumes NumberToken, PercentageToken or DimensionToken.
  577. func (l *Lexer) consumeNumeric() TokenType {
  578. if l.consumeNumberToken() {
  579. if l.consumeByte('%') {
  580. return PercentageToken
  581. } else if l.consumeIdentToken() {
  582. return DimensionToken
  583. }
  584. return NumberToken
  585. }
  586. return ErrorToken
  587. }
  588. // consumeString consumes a string and may return BadStringToken when a newline is encountered.
  589. func (l *Lexer) consumeString() TokenType {
  590. // assume to be on " or '
  591. delim := l.r.Peek(0)
  592. l.r.Move(1)
  593. for {
  594. c := l.r.Peek(0)
  595. if c == 0 && l.Err() != nil {
  596. break
  597. } else if c == '\n' || c == '\r' || c == '\f' {
  598. l.r.Move(1)
  599. return BadStringToken
  600. } else if c == delim {
  601. l.r.Move(1)
  602. break
  603. } else if c == '\\' {
  604. if !l.consumeEscape() {
  605. l.r.Move(1)
  606. l.consumeNewline()
  607. }
  608. } else {
  609. l.r.Move(1)
  610. }
  611. }
  612. return StringToken
  613. }
// consumeUnquotedURL consumes the content of an unquoted url(...) up to the
// closing ')' or EOF. Returns false when a disallowed character (quote,
// parenthesis, space, control byte or DEL) appears without being escaped.
func (l *Lexer) consumeUnquotedURL() bool {
	for {
		c := l.r.Peek(0)
		// note: '&&' binds tighter than '||', so this reads (EOF) || (')')
		if c == 0 && l.Err() != nil || c == ')' {
			break
		} else if c == '"' || c == '\'' || c == '(' || c == '\\' || c == ' ' || c <= 0x1F || c == 0x7F {
			// a backslash may still introduce a valid escape
			if c != '\\' || !l.consumeEscape() {
				return false
			}
		} else {
			l.r.Move(1)
		}
	}
	return true
}
  629. // consumeRemnantsBadUrl consumes bytes of a BadUrlToken so that normal tokenization may continue.
  630. func (l *Lexer) consumeRemnantsBadURL() {
  631. for {
  632. if l.consumeByte(')') || l.Err() != nil {
  633. break
  634. } else if !l.consumeEscape() {
  635. l.r.Move(1)
  636. }
  637. }
  638. }
// consumeIdentlike consumes IdentToken, FunctionToken or URLToken.
func (l *Lexer) consumeIdentlike() TokenType {
	if l.consumeIdentToken() {
		if l.r.Peek(0) != '(' {
			return IdentToken
		} else if !parse.EqualFold(bytes.Replace(l.r.Lexeme(), []byte{'\\'}, nil, -1), []byte{'u', 'r', 'l'}) {
			// any function other than url(; backslashes are stripped so that
			// escaped spellings like u\rl( still match
			l.r.Move(1)
			return FunctionToken
		}
		l.r.Move(1)
		// consume url
		for l.consumeWhitespace() {
		}
		if c := l.r.Peek(0); c == '"' || c == '\'' {
			// quoted URL; a newline inside the string spoils the whole token
			if l.consumeString() == BadStringToken {
				l.consumeRemnantsBadURL()
				return BadURLToken
			}
		} else if !l.consumeUnquotedURL() && !l.consumeWhitespace() {
			l.consumeRemnantsBadURL()
			return BadURLToken
		}
		for l.consumeWhitespace() {
		}
		// a missing ')' is tolerated only at EOF
		if !l.consumeByte(')') && l.Err() != io.EOF {
			l.consumeRemnantsBadURL()
			return BadURLToken
		}
		return URLToken
	}
	return ErrorToken
}