parse.go 7.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307
  1. // Package json is a JSON parser following the specifications at http://json.org/.
  2. package json // import "github.com/tdewolff/parse/json"
  3. import (
  4. "io"
  5. "strconv"
  6. "github.com/tdewolff/parse"
  7. "github.com/tdewolff/parse/buffer"
  8. )
  9. // GrammarType determines the type of grammar
  10. type GrammarType uint32
  11. // GrammarType values.
  12. const (
  13. ErrorGrammar GrammarType = iota // extra grammar when errors occur
  14. WhitespaceGrammar
  15. LiteralGrammar
  16. NumberGrammar
  17. StringGrammar
  18. StartObjectGrammar // {
  19. EndObjectGrammar // }
  20. StartArrayGrammar // [
  21. EndArrayGrammar // ]
  22. )
  23. // String returns the string representation of a GrammarType.
  24. func (gt GrammarType) String() string {
  25. switch gt {
  26. case ErrorGrammar:
  27. return "Error"
  28. case WhitespaceGrammar:
  29. return "Whitespace"
  30. case LiteralGrammar:
  31. return "Literal"
  32. case NumberGrammar:
  33. return "Number"
  34. case StringGrammar:
  35. return "String"
  36. case StartObjectGrammar:
  37. return "StartObject"
  38. case EndObjectGrammar:
  39. return "EndObject"
  40. case StartArrayGrammar:
  41. return "StartArray"
  42. case EndArrayGrammar:
  43. return "EndArray"
  44. }
  45. return "Invalid(" + strconv.Itoa(int(gt)) + ")"
  46. }
  47. ////////////////////////////////////////////////////////////////
  48. // State determines the current state the parser is in.
  49. type State uint32
  50. // State values.
  51. const (
  52. ValueState State = iota // extra token when errors occur
  53. ObjectKeyState
  54. ObjectValueState
  55. ArrayState
  56. )
  57. // String returns the string representation of a State.
  58. func (state State) String() string {
  59. switch state {
  60. case ValueState:
  61. return "Value"
  62. case ObjectKeyState:
  63. return "ObjectKey"
  64. case ObjectValueState:
  65. return "ObjectValue"
  66. case ArrayState:
  67. return "Array"
  68. }
  69. return "Invalid(" + strconv.Itoa(int(state)) + ")"
  70. }
  71. ////////////////////////////////////////////////////////////////
  72. // Parser is the state for the lexer.
  73. type Parser struct {
  74. r *buffer.Lexer
  75. state []State
  76. err error
  77. needComma bool
  78. }
  79. // NewParser returns a new Parser for a given io.Reader.
  80. func NewParser(r io.Reader) *Parser {
  81. return &Parser{
  82. r: buffer.NewLexer(r),
  83. state: []State{ValueState},
  84. }
  85. }
  86. // Err returns the error encountered during tokenization, this is often io.EOF but also other errors can be returned.
  87. func (p *Parser) Err() error {
  88. if p.err != nil {
  89. return p.err
  90. }
  91. return p.r.Err()
  92. }
  93. // Restore restores the NULL byte at the end of the buffer.
  94. func (p *Parser) Restore() {
  95. p.r.Restore()
  96. }
  97. // Next returns the next Grammar. It returns ErrorGrammar when an error was encountered. Using Err() one can retrieve the error message.
  98. func (p *Parser) Next() (GrammarType, []byte) {
  99. p.moveWhitespace()
  100. c := p.r.Peek(0)
  101. state := p.state[len(p.state)-1]
  102. if c == ',' {
  103. if state != ArrayState && state != ObjectKeyState {
  104. p.err = parse.NewErrorLexer("unexpected comma character outside an array or object", p.r)
  105. return ErrorGrammar, nil
  106. }
  107. p.r.Move(1)
  108. p.moveWhitespace()
  109. p.needComma = false
  110. c = p.r.Peek(0)
  111. }
  112. p.r.Skip()
  113. if p.needComma && c != '}' && c != ']' && c != 0 {
  114. p.err = parse.NewErrorLexer("expected comma character or an array or object ending", p.r)
  115. return ErrorGrammar, nil
  116. } else if c == '{' {
  117. p.state = append(p.state, ObjectKeyState)
  118. p.r.Move(1)
  119. return StartObjectGrammar, p.r.Shift()
  120. } else if c == '}' {
  121. if state != ObjectKeyState {
  122. p.err = parse.NewErrorLexer("unexpected right brace character", p.r)
  123. return ErrorGrammar, nil
  124. }
  125. p.needComma = true
  126. p.state = p.state[:len(p.state)-1]
  127. if p.state[len(p.state)-1] == ObjectValueState {
  128. p.state[len(p.state)-1] = ObjectKeyState
  129. }
  130. p.r.Move(1)
  131. return EndObjectGrammar, p.r.Shift()
  132. } else if c == '[' {
  133. p.state = append(p.state, ArrayState)
  134. p.r.Move(1)
  135. return StartArrayGrammar, p.r.Shift()
  136. } else if c == ']' {
  137. p.needComma = true
  138. if state != ArrayState {
  139. p.err = parse.NewErrorLexer("unexpected right bracket character", p.r)
  140. return ErrorGrammar, nil
  141. }
  142. p.state = p.state[:len(p.state)-1]
  143. if p.state[len(p.state)-1] == ObjectValueState {
  144. p.state[len(p.state)-1] = ObjectKeyState
  145. }
  146. p.r.Move(1)
  147. return EndArrayGrammar, p.r.Shift()
  148. } else if state == ObjectKeyState {
  149. if c != '"' || !p.consumeStringToken() {
  150. p.err = parse.NewErrorLexer("expected object key to be a quoted string", p.r)
  151. return ErrorGrammar, nil
  152. }
  153. n := p.r.Pos()
  154. p.moveWhitespace()
  155. if c := p.r.Peek(0); c != ':' {
  156. p.err = parse.NewErrorLexer("expected colon character after object key", p.r)
  157. return ErrorGrammar, nil
  158. }
  159. p.r.Move(1)
  160. p.state[len(p.state)-1] = ObjectValueState
  161. return StringGrammar, p.r.Shift()[:n]
  162. } else {
  163. p.needComma = true
  164. if state == ObjectValueState {
  165. p.state[len(p.state)-1] = ObjectKeyState
  166. }
  167. if c == '"' && p.consumeStringToken() {
  168. return StringGrammar, p.r.Shift()
  169. } else if p.consumeNumberToken() {
  170. return NumberGrammar, p.r.Shift()
  171. } else if p.consumeLiteralToken() {
  172. return LiteralGrammar, p.r.Shift()
  173. }
  174. }
  175. return ErrorGrammar, nil
  176. }
  177. // State returns the state the parser is currently in (ie. which token is expected).
  178. func (p *Parser) State() State {
  179. return p.state[len(p.state)-1]
  180. }
  181. ////////////////////////////////////////////////////////////////
  182. /*
  183. The following functions follow the specifications at http://json.org/
  184. */
  185. func (p *Parser) moveWhitespace() {
  186. for {
  187. if c := p.r.Peek(0); c != ' ' && c != '\n' && c != '\r' && c != '\t' {
  188. break
  189. }
  190. p.r.Move(1)
  191. }
  192. }
  193. func (p *Parser) consumeLiteralToken() bool {
  194. c := p.r.Peek(0)
  195. if c == 't' && p.r.Peek(1) == 'r' && p.r.Peek(2) == 'u' && p.r.Peek(3) == 'e' {
  196. p.r.Move(4)
  197. return true
  198. } else if c == 'f' && p.r.Peek(1) == 'a' && p.r.Peek(2) == 'l' && p.r.Peek(3) == 's' && p.r.Peek(4) == 'e' {
  199. p.r.Move(5)
  200. return true
  201. } else if c == 'n' && p.r.Peek(1) == 'u' && p.r.Peek(2) == 'l' && p.r.Peek(3) == 'l' {
  202. p.r.Move(4)
  203. return true
  204. }
  205. return false
  206. }
  207. func (p *Parser) consumeNumberToken() bool {
  208. mark := p.r.Pos()
  209. if p.r.Peek(0) == '-' {
  210. p.r.Move(1)
  211. }
  212. c := p.r.Peek(0)
  213. if c >= '1' && c <= '9' {
  214. p.r.Move(1)
  215. for {
  216. if c := p.r.Peek(0); c < '0' || c > '9' {
  217. break
  218. }
  219. p.r.Move(1)
  220. }
  221. } else if c != '0' {
  222. p.r.Rewind(mark)
  223. return false
  224. } else {
  225. p.r.Move(1) // 0
  226. }
  227. if c := p.r.Peek(0); c == '.' {
  228. p.r.Move(1)
  229. if c := p.r.Peek(0); c < '0' || c > '9' {
  230. p.r.Move(-1)
  231. return true
  232. }
  233. for {
  234. if c := p.r.Peek(0); c < '0' || c > '9' {
  235. break
  236. }
  237. p.r.Move(1)
  238. }
  239. }
  240. mark = p.r.Pos()
  241. if c := p.r.Peek(0); c == 'e' || c == 'E' {
  242. p.r.Move(1)
  243. if c := p.r.Peek(0); c == '+' || c == '-' {
  244. p.r.Move(1)
  245. }
  246. if c := p.r.Peek(0); c < '0' || c > '9' {
  247. p.r.Rewind(mark)
  248. return true
  249. }
  250. for {
  251. if c := p.r.Peek(0); c < '0' || c > '9' {
  252. break
  253. }
  254. p.r.Move(1)
  255. }
  256. }
  257. return true
  258. }
  259. func (p *Parser) consumeStringToken() bool {
  260. // assume to be on "
  261. p.r.Move(1)
  262. for {
  263. c := p.r.Peek(0)
  264. if c == '"' {
  265. escaped := false
  266. for i := p.r.Pos() - 1; i >= 0; i-- {
  267. if p.r.Lexeme()[i] == '\\' {
  268. escaped = !escaped
  269. } else {
  270. break
  271. }
  272. }
  273. if !escaped {
  274. p.r.Move(1)
  275. break
  276. }
  277. } else if c == 0 {
  278. return false
  279. }
  280. p.r.Move(1)
  281. }
  282. return true
  283. }