lex.go 11 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498
  1. // Package html is an HTML5 lexer following the specifications at http://www.w3.org/TR/html5/syntax.html.
  2. package html // import "github.com/tdewolff/parse/html"
  3. import (
  4. "io"
  5. "strconv"
  6. "github.com/tdewolff/parse"
  7. "github.com/tdewolff/parse/buffer"
  8. )
  9. // TokenType determines the type of token, eg. a number or a semicolon.
  10. type TokenType uint32
  11. // TokenType values.
  12. const (
  13. ErrorToken TokenType = iota // extra token when errors occur
  14. CommentToken
  15. DoctypeToken
  16. StartTagToken
  17. StartTagCloseToken
  18. StartTagVoidToken
  19. EndTagToken
  20. AttributeToken
  21. TextToken
  22. SvgToken
  23. MathToken
  24. )
  25. // String returns the string representation of a TokenType.
  26. func (tt TokenType) String() string {
  27. switch tt {
  28. case ErrorToken:
  29. return "Error"
  30. case CommentToken:
  31. return "Comment"
  32. case DoctypeToken:
  33. return "Doctype"
  34. case StartTagToken:
  35. return "StartTag"
  36. case StartTagCloseToken:
  37. return "StartTagClose"
  38. case StartTagVoidToken:
  39. return "StartTagVoid"
  40. case EndTagToken:
  41. return "EndTag"
  42. case AttributeToken:
  43. return "Attribute"
  44. case TextToken:
  45. return "Text"
  46. case SvgToken:
  47. return "Svg"
  48. case MathToken:
  49. return "Math"
  50. }
  51. return "Invalid(" + strconv.Itoa(int(tt)) + ")"
  52. }
////////////////////////////////////////////////////////////////

// Lexer is the state for the lexer.
type Lexer struct {
	r      *buffer.Lexer
	err    error  // sticky error set by shiftXml; takes precedence over r.Err() in Err()
	rawTag Hash   // non-zero while inside a raw text element (script, style, textarea, ...)
	inTag  bool   // true between a start tag name and its closing '>' (attributes pending)
	text   []byte // payload returned by Text(): tag name, comment/doctype text, attribute name
	attrVal []byte // payload returned by AttrVal(): raw attribute value incl. quotes
}
  63. // NewLexer returns a new Lexer for a given io.Reader.
  64. func NewLexer(r io.Reader) *Lexer {
  65. return &Lexer{
  66. r: buffer.NewLexer(r),
  67. }
  68. }
  69. // Err returns the error encountered during lexing, this is often io.EOF but also other errors can be returned.
  70. func (l *Lexer) Err() error {
  71. if l.err != nil {
  72. return l.err
  73. }
  74. return l.r.Err()
  75. }
// Restore restores the NULL byte at the end of the buffer.
// It simply delegates to the underlying buffer.Lexer.
func (l *Lexer) Restore() {
	l.r.Restore()
}
// Next returns the next Token. It returns ErrorToken when an error was encountered. Using Err() one can retrieve the error message.
func (l *Lexer) Next() (TokenType, []byte) {
	l.text = nil
	var c byte
	if l.inTag {
		// Inside a start tag: emit AttributeTokens until '>' or '/>' closes it.
		l.attrVal = nil
		for { // before attribute name state
			if c = l.r.Peek(0); c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\f' {
				l.r.Move(1)
				continue
			}
			break
		}
		if c == 0 && l.r.Err() != nil {
			return ErrorToken, nil
		} else if c != '>' && (c != '/' || l.r.Peek(1) != '>') {
			// Not at the tag close yet: the next item is an attribute.
			return AttributeToken, l.shiftAttribute()
		}
		start := l.r.Pos()
		l.inTag = false
		if c == '/' {
			l.r.Move(2) // consume "/>"
			l.text = l.r.Lexeme()[start:]
			return StartTagVoidToken, l.r.Shift()
		}
		l.r.Move(1) // consume ">"
		l.text = l.r.Lexeme()[start:]
		return StartTagCloseToken, l.r.Shift()
	}
	if l.rawTag != 0 {
		// Previous start tag was a raw text element: everything up to its end tag is text.
		if rawText := l.shiftRawText(); len(rawText) > 0 {
			l.rawTag = 0
			return TextToken, rawText
		}
		l.rawTag = 0
	}
	for {
		c = l.r.Peek(0)
		if c == '<' {
			c = l.r.Peek(1)
			// "</" only starts an end tag when not immediately followed by '>' or EOF.
			isEndTag := c == '/' && l.r.Peek(2) != '>' && (l.r.Peek(2) != 0 || l.r.PeekErr(2) == nil)
			if l.r.Pos() > 0 {
				if isEndTag || 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || c == '!' || c == '?' {
					// return currently buffered texttoken so that we can return tag next iteration
					return TextToken, l.r.Shift()
				}
			} else if isEndTag {
				l.r.Move(2)
				// only endtags that are not followed by > or EOF arrive here
				if c = l.r.Peek(0); !('a' <= c && c <= 'z' || 'A' <= c && c <= 'Z') {
					// "</" not followed by a letter: parsed as a bogus comment per spec.
					return CommentToken, l.shiftBogusComment()
				}
				return EndTagToken, l.shiftEndTag()
			} else if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' {
				l.r.Move(1)
				l.inTag = true
				return l.shiftStartTag()
			} else if c == '!' {
				l.r.Move(2) // consume "<!"
				return l.readMarkup()
			} else if c == '?' {
				l.r.Move(1) // "<?" is a bogus comment
				return CommentToken, l.shiftBogusComment()
			}
		} else if c == 0 && l.r.Err() != nil {
			// EOF: flush any buffered text first, then signal the error.
			if l.r.Pos() > 0 {
				return TextToken, l.r.Shift()
			}
			return ErrorToken, nil
		}
		l.r.Move(1)
	}
}
// Text returns the textual representation of a token. This excludes delimiters and additional leading/trailing characters.
// The returned slice is only valid until the next call to Next.
func (l *Lexer) Text() []byte {
	return l.text
}
// AttrVal returns the attribute value when an AttributeToken was returned from Next.
// It is nil for value-less attributes; otherwise it includes surrounding quotes, if any.
func (l *Lexer) AttrVal() []byte {
	return l.attrVal
}
////////////////////////////////////////////////////////////////

// The following functions follow the specifications at http://www.w3.org/html/wg/drafts/html/master/syntax.html

// shiftRawText consumes the raw text content of the current l.rawTag element
// and returns it. The matching end tag itself is left in the buffer (the
// reader is rewound to just before "</name") so Next can tokenize it.
func (l *Lexer) shiftRawText() []byte {
	if l.rawTag == Plaintext {
		// plaintext has no end tag: everything up to EOF is text.
		for {
			if l.r.Peek(0) == 0 && l.r.Err() != nil {
				return l.r.Shift()
			}
			l.r.Move(1)
		}
	} else { // RCDATA, RAWTEXT and SCRIPT
		for {
			c := l.r.Peek(0)
			if c == '<' {
				if l.r.Peek(1) == '/' {
					// Candidate end tag: scan the tag name and compare against rawTag.
					mark := l.r.Pos()
					l.r.Move(2)
					for {
						if c = l.r.Peek(0); !('a' <= c && c <= 'z' || 'A' <= c && c <= 'Z') {
							break
						}
						l.r.Move(1)
					}
					if h := ToHash(parse.ToLower(parse.Copy(l.r.Lexeme()[mark+2:]))); h == l.rawTag { // copy so that ToLower doesn't change the case of the underlying slice
						// Matching end tag: rewind so "</name" stays in the buffer.
						l.r.Rewind(mark)
						return l.r.Shift()
					}
				} else if l.rawTag == Script && l.r.Peek(1) == '!' && l.r.Peek(2) == '-' && l.r.Peek(3) == '-' {
					// "<!--" inside script: script data (double) escaped handling.
					l.r.Move(4)
					inScript := false
					for {
						c := l.r.Peek(0)
						if c == '-' && l.r.Peek(1) == '-' && l.r.Peek(2) == '>' {
							l.r.Move(3) // "-->" ends the escaped section
							break
						} else if c == '<' {
							isEnd := l.r.Peek(1) == '/'
							if isEnd {
								l.r.Move(2)
							} else {
								l.r.Move(1)
							}
							mark := l.r.Pos()
							for {
								if c = l.r.Peek(0); !('a' <= c && c <= 'z' || 'A' <= c && c <= 'Z') {
									break
								}
								l.r.Move(1)
							}
							if h := ToHash(parse.ToLower(parse.Copy(l.r.Lexeme()[mark:]))); h == Script { // copy so that ToLower doesn't change the case of the underlying slice
								if !isEnd {
									// Nested "<script" inside the comment: a later "</script" closes it, not the element.
									inScript = true
								} else {
									if !inScript {
										// "</script" that really ends the element; rewind past "</".
										l.r.Rewind(mark - 2)
										return l.r.Shift()
									}
									inScript = false
								}
							}
						} else if c == 0 && l.r.Err() != nil {
							return l.r.Shift()
						} else {
							l.r.Move(1)
						}
					}
				} else {
					l.r.Move(1)
				}
			} else if c == 0 && l.r.Err() != nil {
				return l.r.Shift()
			} else {
				l.r.Move(1)
			}
		}
	}
}
// readMarkup parses a markup declaration after "<!" has already been consumed:
// comments ("<!--"), CDATA sections ("<![CDATA[") and doctypes. Anything else
// falls through to a bogus comment.
func (l *Lexer) readMarkup() (TokenType, []byte) {
	if l.at('-', '-') {
		l.r.Move(2)
		for {
			if l.r.Peek(0) == 0 && l.r.Err() != nil {
				return CommentToken, l.r.Shift()
			} else if l.at('-', '-', '>') {
				l.text = l.r.Lexeme()[4:] // strip leading "<!--"
				l.r.Move(3)
				return CommentToken, l.r.Shift()
			} else if l.at('-', '-', '!', '>') {
				// "--!>" also closes a comment per the spec's error recovery.
				l.text = l.r.Lexeme()[4:]
				l.r.Move(4)
				return CommentToken, l.r.Shift()
			}
			l.r.Move(1)
		}
	} else if l.at('[', 'C', 'D', 'A', 'T', 'A', '[') {
		l.r.Move(7)
		for {
			if l.r.Peek(0) == 0 && l.r.Err() != nil {
				return TextToken, l.r.Shift()
			} else if l.at(']', ']', '>') {
				l.r.Move(3)
				return TextToken, l.r.Shift()
			}
			l.r.Move(1)
		}
	} else {
		if l.atCaseInsensitive('d', 'o', 'c', 't', 'y', 'p', 'e') {
			l.r.Move(7)
			if l.r.Peek(0) == ' ' {
				l.r.Move(1)
			}
			for {
				if c := l.r.Peek(0); c == '>' || c == 0 && l.r.Err() != nil {
					l.text = l.r.Lexeme()[9:] // strip leading "<!doctype" (any case)
					if c == '>' {
						l.r.Move(1)
					}
					return DoctypeToken, l.r.Shift()
				}
				l.r.Move(1)
			}
		}
	}
	return CommentToken, l.shiftBogusComment()
}
  286. func (l *Lexer) shiftBogusComment() []byte {
  287. for {
  288. c := l.r.Peek(0)
  289. if c == '>' {
  290. l.text = l.r.Lexeme()[2:]
  291. l.r.Move(1)
  292. return l.r.Shift()
  293. } else if c == 0 && l.r.Err() != nil {
  294. l.text = l.r.Lexeme()[2:]
  295. return l.r.Shift()
  296. }
  297. l.r.Move(1)
  298. }
  299. }
// shiftStartTag consumes the start tag name ('<' and the first letter were
// already consumed by Next). It records the lower-cased name in l.text, arms
// raw-text mode for raw text elements, and treats <svg>/<math> as embedded XML.
func (l *Lexer) shiftStartTag() (TokenType, []byte) {
	for {
		// Stop at whitespace, '>', '/>' or EOF — the end of the tag name.
		if c := l.r.Peek(0); c == ' ' || c == '>' || c == '/' && l.r.Peek(1) == '>' || c == '\t' || c == '\n' || c == '\r' || c == '\f' || c == 0 && l.r.Err() != nil {
			break
		}
		l.r.Move(1)
	}
	l.text = parse.ToLower(l.r.Lexeme()[1:]) // strip the leading '<'
	if h := ToHash(l.text); h == Textarea || h == Title || h == Style || h == Xmp || h == Iframe || h == Script || h == Plaintext || h == Svg || h == Math {
		if h == Svg || h == Math {
			// Foreign content: lex the whole element as XML in one token.
			data := l.shiftXml(h)
			if l.err != nil {
				return ErrorToken, nil
			}
			l.inTag = false
			if h == Svg {
				return SvgToken, data
			} else {
				return MathToken, data
			}
		}
		// Raw text element: the next Next() call shifts its content verbatim.
		l.rawTag = h
	}
	return StartTagToken, l.r.Shift()
}
// shiftAttribute consumes one attribute (name, optional "=value" with optional
// quotes). The lower-cased name is stored in l.text, the raw value (including
// quotes, if any) in l.attrVal, and the shifted bytes are returned.
func (l *Lexer) shiftAttribute() []byte {
	nameStart := l.r.Pos()
	var c byte
	for { // attribute name state
		if c = l.r.Peek(0); c == ' ' || c == '=' || c == '>' || c == '/' && l.r.Peek(1) == '>' || c == '\t' || c == '\n' || c == '\r' || c == '\f' || c == 0 && l.r.Err() != nil {
			break
		}
		l.r.Move(1)
	}
	nameEnd := l.r.Pos()
	for { // after attribute name state
		if c = l.r.Peek(0); c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\f' {
			l.r.Move(1)
			continue
		}
		break
	}
	if c == '=' {
		l.r.Move(1)
		for { // before attribute value state
			if c = l.r.Peek(0); c == ' ' || c == '\t' || c == '\n' || c == '\r' || c == '\f' {
				l.r.Move(1)
				continue
			}
			break
		}
		attrPos := l.r.Pos()
		delim := c
		if delim == '"' || delim == '\'' { // attribute value single- and double-quoted state
			l.r.Move(1)
			for {
				c := l.r.Peek(0)
				if c == delim {
					l.r.Move(1)
					break
				} else if c == 0 && l.r.Err() != nil {
					break
				}
				l.r.Move(1)
			}
		} else { // attribute value unquoted state
			for {
				if c := l.r.Peek(0); c == ' ' || c == '>' || c == '\t' || c == '\n' || c == '\r' || c == '\f' || c == 0 && l.r.Err() != nil {
					break
				}
				l.r.Move(1)
			}
		}
		l.attrVal = l.r.Lexeme()[attrPos:]
	} else {
		// No '=': value-less attribute. Rewind past the whitespace we skipped.
		l.r.Rewind(nameEnd)
		l.attrVal = nil
	}
	l.text = parse.ToLower(l.r.Lexeme()[nameStart:nameEnd])
	return l.r.Shift()
}
  381. func (l *Lexer) shiftEndTag() []byte {
  382. for {
  383. c := l.r.Peek(0)
  384. if c == '>' {
  385. l.text = l.r.Lexeme()[2:]
  386. l.r.Move(1)
  387. break
  388. } else if c == 0 && l.r.Err() != nil {
  389. l.text = l.r.Lexeme()[2:]
  390. break
  391. }
  392. l.r.Move(1)
  393. }
  394. end := len(l.text)
  395. for end > 0 {
  396. if c := l.text[end-1]; c == ' ' || c == '\t' || c == '\n' || c == '\r' {
  397. end--
  398. continue
  399. }
  400. break
  401. }
  402. l.text = l.text[:end]
  403. return parse.ToLower(l.r.Shift())
  404. }
// shiftXml parses the content of a svg or math tag according to the XML 1.1 specifications, including the tag itself.
// So far we have already parsed `<svg` or `<math`.
// On an unexpected NULL byte it sets l.err and returns what was consumed.
func (l *Lexer) shiftXml(rawTag Hash) []byte {
	inQuote := false
	for {
		c := l.r.Peek(0)
		if c == '"' {
			inQuote = !inQuote // '<' inside a quoted attribute value is not a tag
			l.r.Move(1)
		} else if c == '<' && !inQuote && l.r.Peek(1) == '/' {
			// Candidate closing tag: scan the name and compare to rawTag.
			mark := l.r.Pos()
			l.r.Move(2)
			for {
				if c = l.r.Peek(0); !('a' <= c && c <= 'z' || 'A' <= c && c <= 'Z') {
					break
				}
				l.r.Move(1)
			}
			if h := ToHash(parse.ToLower(parse.Copy(l.r.Lexeme()[mark+2:]))); h == rawTag { // copy so that ToLower doesn't change the case of the underlying slice
				break
			}
		} else if c == 0 {
			if l.r.Err() == nil {
				l.err = parse.NewErrorLexer("unexpected null character", l.r)
			}
			return l.r.Shift()
		} else {
			l.r.Move(1)
		}
	}
	// Consume the remainder of the closing tag up to and including '>'.
	for {
		c := l.r.Peek(0)
		if c == '>' {
			l.r.Move(1)
			break
		} else if c == 0 {
			if l.r.Err() == nil {
				l.err = parse.NewErrorLexer("unexpected null character", l.r)
			}
			return l.r.Shift()
		}
		l.r.Move(1)
	}
	return l.r.Shift()
}
  450. ////////////////////////////////////////////////////////////////
  451. func (l *Lexer) at(b ...byte) bool {
  452. for i, c := range b {
  453. if l.r.Peek(i) != c {
  454. return false
  455. }
  456. }
  457. return true
  458. }
  459. func (l *Lexer) atCaseInsensitive(b ...byte) bool {
  460. for i, c := range b {
  461. if l.r.Peek(i) != c && (l.r.Peek(i)+('a'-'A')) != c {
  462. return false
  463. }
  464. }
  465. return true
  466. }