scanner.go

     1  // Copyright 2010 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package json
     6  
     7  // JSON value parser state machine.
     8  // Just about at the limit of what is reasonable to write by hand.
     9  // Some parts are a bit tedious, but overall it nicely factors out the
    10  // otherwise common code from the multiple scanning functions
    11  // in this package (Compact, Indent, checkValid, etc).
    12  //
    13  // This file starts with two simple examples using the scanner
    14  // before diving into the scanner itself.
    15  
    16  import (
    17  	"strconv"
    18  	"sync"
    19  )
    20  
    21  // Valid reports whether data is a valid JSON encoding.
    22  func Valid(data []byte) bool {
    23  	scan := newScanner()
    24  	defer freeScanner(scan)
    25  	return checkValid(data, scan) == nil
    26  }
    27  
    28  // checkValid verifies that data is valid JSON-encoded data.
    29  // scan is passed in for use by checkValid to avoid an allocation.
    30  // checkValid returns nil or a SyntaxError.
    31  func checkValid(data []byte, scan *scanner) error {
    32  	scan.reset()
    33  	for _, c := range data {
    34  		scan.bytes++
    35  		if scan.step(scan, c) == scanError {
    36  			return scan.err
    37  		}
    38  	}
    39  	if scan.eof() == scanError {
    40  		return scan.err
    41  	}
    42  	return nil
    43  }
    44  
    45  // A SyntaxError is a description of a JSON syntax error.
    46  // [Unmarshal] will return a SyntaxError if the JSON can't be parsed.
    47  type SyntaxError struct {
    48  	msg    string // description of error
    49  	Offset int64  // error occurred after reading Offset bytes
    50  }
    51  
    52  func (e *SyntaxError) Error() string { return e.msg }
    53  
    54  // A scanner is a JSON scanning state machine.
    55  // Callers call scan.reset and then pass bytes in one at a time
    56  // by calling scan.step(&scan, c) for each byte.
    57  // The return value, referred to as an opcode, tells the
    58  // caller about significant parsing events like beginning
    59  // and ending literals, objects, and arrays, so that the
    60  // caller can follow along if it wishes.
    61  // The return value scanEnd indicates that a single top-level
    62  // JSON value has been completed, *before* the byte that
    63  // just got passed in.  (The indication must be delayed in order
    64  // to recognize the end of numbers: is 123 a whole value or
    65  // the beginning of 12345e+6?).
    66  type scanner struct {
    67  	// The step is a func to be called to execute the next transition.
    68  	// Also tried using an integer constant and a single func
    69  	// with a switch, but using the func directly was 10% faster
    70  	// on a 64-bit Mac Mini, and it's nicer to read.
    71  	step func(*scanner, byte) int
    72  
    73  	// Reached end of top-level value.
    74  	endTop bool
    75  
    76  	// Stack of what we're in the middle of - array values, object keys, object values.
    77  	parseState []int
    78  
    79  	// Error that happened, if any.
    80  	err error
    81  
    82  	// total bytes consumed, updated by decoder.Decode (and deliberately
    83  	// not set to zero by scan.reset)
    84  	bytes int64
    85  }
    86  
    87  var scannerPool = sync.Pool{
    88  	New: func() any {
    89  		return &scanner{}
    90  	},
    91  }
    92  
    93  func newScanner() *scanner {
    94  	scan := scannerPool.Get().(*scanner)
    95  	// scan.reset by design doesn't set bytes to zero
    96  	scan.bytes = 0
    97  	scan.reset()
    98  	return scan
    99  }
   100  
   101  func freeScanner(scan *scanner) {
   102  	// Avoid hanging on to too much memory in extreme cases.
   103  	if len(scan.parseState) > 1024 {
   104  		scan.parseState = nil
   105  	}
   106  	scannerPool.Put(scan)
   107  }
   108  
   109  // These values are returned by the state transition functions
   110  // assigned to scanner.state and the method scanner.eof.
   111  // They give details about the current state of the scan that
   112  // callers might be interested to know about.
   113  // It is okay to ignore the return value of any particular
   114  // call to scanner.state: if one call returns scanError,
   115  // every subsequent call will return scanError too.
   116  const (
   117  	// Continue.
   118  	scanContinue     = iota // uninteresting byte
   119  	scanBeginLiteral        // end implied by next result != scanContinue
   120  	scanBeginObject         // begin object
   121  	scanObjectKey           // just finished object key (string)
   122  	scanObjectValue         // just finished non-last object value
   123  	scanEndObject           // end object (implies scanObjectValue if possible)
   124  	scanBeginArray          // begin array
   125  	scanArrayValue          // just finished array value
   126  	scanEndArray            // end array (implies scanArrayValue if possible)
   127  	scanSkipSpace           // space byte; can skip; known to be last "continue" result
   128  
   129  	// Stop.
   130  	scanEnd   // top-level value ended *before* this byte; known to be first "stop" result
   131  	scanError // hit an error, scanner.err.
   132  )
   133  
   134  // These values are stored in the parseState stack.
   135  // They give the current state of a composite value
   136  // being scanned. If the parser is inside a nested value
   137  // the parseState describes the nested state, outermost at entry 0.
   138  const (
   139  	parseObjectKey   = iota // parsing object key (before colon)
   140  	parseObjectValue        // parsing object value (after colon)
   141  	parseArrayValue         // parsing array value
   142  )
   143  
   144  // This limits the max nesting depth to prevent stack overflow.
   145  // This is permitted by https://tools.ietf.org/html/rfc7159#section-9
   146  const maxNestingDepth = 10000
   147  
   148  // reset prepares the scanner for use.
   149  // It must be called before calling s.step.
   150  func (s *scanner) reset() {
   151  	s.step = stateBeginValue
   152  	s.parseState = s.parseState[0:0]
   153  	s.err = nil
   154  	s.endTop = false
   155  }
   156  
   157  // eof tells the scanner that the end of input has been reached.
   158  // It returns a scan status just as s.step does.
   159  func (s *scanner) eof() int {
   160  	if s.err != nil {
   161  		return scanError
   162  	}
   163  	if s.endTop {
   164  		return scanEnd
   165  	}
   166  	s.step(s, ' ')
   167  	if s.endTop {
   168  		return scanEnd
   169  	}
   170  	if s.err == nil {
   171  		s.err = &SyntaxError{"unexpected end of JSON input", s.bytes}
   172  	}
   173  	return scanError
   174  }
   175  
   176  // pushParseState pushes a new parse state p onto the parse stack.
   177  // an error state is returned if maxNestingDepth was exceeded, otherwise successState is returned.
   178  func (s *scanner) pushParseState(c byte, newParseState int, successState int) int {
   179  	s.parseState = append(s.parseState, newParseState)
   180  	if len(s.parseState) <= maxNestingDepth {
   181  		return successState
   182  	}
   183  	return s.error(c, "exceeded max depth")
   184  }
   185  
   186  // popParseState pops a parse state (already obtained) off the stack
   187  // and updates s.step accordingly.
   188  func (s *scanner) popParseState() {
   189  	n := len(s.parseState) - 1
   190  	s.parseState = s.parseState[0:n]
   191  	if n == 0 {
   192  		s.step = stateEndTop
   193  		s.endTop = true
   194  	} else {
   195  		s.step = stateEndValue
   196  	}
   197  }
   198  
   199  func isSpace(c byte) bool {
   200  	return c <= ' ' && (c == ' ' || c == '\t' || c == '\r' || c == '\n')
   201  }
   202  
   203  // stateBeginValueOrEmpty is the state after reading `[`.
   204  func stateBeginValueOrEmpty(s *scanner, c byte) int {
   205  	if isSpace(c) {
   206  		return scanSkipSpace
   207  	}
   208  	if c == ']' {
   209  		return stateEndValue(s, c)
   210  	}
   211  	return stateBeginValue(s, c)
   212  }
   213  
   214  // stateBeginValue is the state at the beginning of the input.
   215  func stateBeginValue(s *scanner, c byte) int {
   216  	if isSpace(c) {
   217  		return scanSkipSpace
   218  	}
   219  	switch c {
   220  	case '{':
   221  		s.step = stateBeginStringOrEmpty
   222  		return s.pushParseState(c, parseObjectKey, scanBeginObject)
   223  	case '[':
   224  		s.step = stateBeginValueOrEmpty
   225  		return s.pushParseState(c, parseArrayValue, scanBeginArray)
   226  	case '"':
   227  		s.step = stateInString
   228  		return scanBeginLiteral
   229  	case '-':
   230  		s.step = stateNeg
   231  		return scanBeginLiteral
   232  	case '0': // beginning of 0.123
   233  		s.step = state0
   234  		return scanBeginLiteral
   235  	case 't': // beginning of true
   236  		s.step = stateT
   237  		return scanBeginLiteral
   238  	case 'f': // beginning of false
   239  		s.step = stateF
   240  		return scanBeginLiteral
   241  	case 'n': // beginning of null
   242  		s.step = stateN
   243  		return scanBeginLiteral
   244  	}
   245  	if '1' <= c && c <= '9' { // beginning of 1234.5
   246  		s.step = state1
   247  		return scanBeginLiteral
   248  	}
   249  	return s.error(c, "looking for beginning of value")
   250  }
   251  
   252  // stateBeginStringOrEmpty is the state after reading `{`.
   253  func stateBeginStringOrEmpty(s *scanner, c byte) int {
   254  	if isSpace(c) {
   255  		return scanSkipSpace
   256  	}
   257  	if c == '}' {
   258  		n := len(s.parseState)
   259  		s.parseState[n-1] = parseObjectValue
   260  		return stateEndValue(s, c)
   261  	}
   262  	return stateBeginString(s, c)
   263  }
   264  
   265  // stateBeginString is the state after reading `{"key": value,`.
   266  func stateBeginString(s *scanner, c byte) int {
   267  	if isSpace(c) {
   268  		return scanSkipSpace
   269  	}
   270  	if c == '"' {
   271  		s.step = stateInString
   272  		return scanBeginLiteral
   273  	}
   274  	return s.error(c, "looking for beginning of object key string")
   275  }
   276  
   277  // stateEndValue is the state after completing a value,
   278  // such as after reading `{}` or `true` or `["x"`.
   279  func stateEndValue(s *scanner, c byte) int {
   280  	n := len(s.parseState)
   281  	if n == 0 {
   282  		// Completed top-level before the current byte.
   283  		s.step = stateEndTop
   284  		s.endTop = true
   285  		return stateEndTop(s, c)
   286  	}
   287  	if isSpace(c) {
   288  		s.step = stateEndValue
   289  		return scanSkipSpace
   290  	}
   291  	ps := s.parseState[n-1]
   292  	switch ps {
   293  	case parseObjectKey:
   294  		if c == ':' {
   295  			s.parseState[n-1] = parseObjectValue
   296  			s.step = stateBeginValue
   297  			return scanObjectKey
   298  		}
   299  		return s.error(c, "after object key")
   300  	case parseObjectValue:
   301  		if c == ',' {
   302  			s.parseState[n-1] = parseObjectKey
   303  			s.step = stateBeginString
   304  			return scanObjectValue
   305  		}
   306  		if c == '}' {
   307  			s.popParseState()
   308  			return scanEndObject
   309  		}
   310  		return s.error(c, "after object key:value pair")
   311  	case parseArrayValue:
   312  		if c == ',' {
   313  			s.step = stateBeginValue
   314  			return scanArrayValue
   315  		}
   316  		if c == ']' {
   317  			s.popParseState()
   318  			return scanEndArray
   319  		}
   320  		return s.error(c, "after array element")
   321  	}
   322  	return s.error(c, "")
   323  }
   324  
   325  // stateEndTop is the state after finishing the top-level value,
   326  // such as after reading `{}` or `[1,2,3]`.
   327  // Only space characters should be seen now.
   328  func stateEndTop(s *scanner, c byte) int {
   329  	if !isSpace(c) {
   330  		// Complain about non-space byte on next call.
   331  		s.error(c, "after top-level value")
   332  	}
   333  	return scanEnd
   334  }
   335  
   336  // stateInString is the state after reading `"`.
   337  func stateInString(s *scanner, c byte) int {
   338  	if c == '"' {
   339  		s.step = stateEndValue
   340  		return scanContinue
   341  	}
   342  	if c == '\\' {
   343  		s.step = stateInStringEsc
   344  		return scanContinue
   345  	}
   346  	if c < 0x20 {
   347  		return s.error(c, "in string literal")
   348  	}
   349  	return scanContinue
   350  }
   351  
   352  // stateInStringEsc is the state after reading `"\` during a quoted string.
   353  func stateInStringEsc(s *scanner, c byte) int {
   354  	switch c {
   355  	case 'b', 'f', 'n', 'r', 't', '\\', '/', '"':
   356  		s.step = stateInString
   357  		return scanContinue
   358  	case 'u':
   359  		s.step = stateInStringEscU
   360  		return scanContinue
   361  	}
   362  	return s.error(c, "in string escape code")
   363  }
   364  
   365  // stateInStringEscU is the state after reading `"\u` during a quoted string.
   366  func stateInStringEscU(s *scanner, c byte) int {
   367  	if '0' <= c && c <= '9' || 'a' <= c && c <= 'f' || 'A' <= c && c <= 'F' {
   368  		s.step = stateInStringEscU1
   369  		return scanContinue
   370  	}
   371  	// numbers
   372  	return s.error(c, "in \\u hexadecimal character escape")
   373  }
   374  
   375  // stateInStringEscU1 is the state after reading `"\u1` during a quoted string.
   376  func stateInStringEscU1(s *scanner, c byte) int {
   377  	if '0' <= c && c <= '9' || 'a' <= c && c <= 'f' || 'A' <= c && c <= 'F' {
   378  		s.step = stateInStringEscU12
   379  		return scanContinue
   380  	}
   381  	// numbers
   382  	return s.error(c, "in \\u hexadecimal character escape")
   383  }
   384  
   385  // stateInStringEscU12 is the state after reading `"\u12` during a quoted string.
   386  func stateInStringEscU12(s *scanner, c byte) int {
   387  	if '0' <= c && c <= '9' || 'a' <= c && c <= 'f' || 'A' <= c && c <= 'F' {
   388  		s.step = stateInStringEscU123
   389  		return scanContinue
   390  	}
   391  	// numbers
   392  	return s.error(c, "in \\u hexadecimal character escape")
   393  }
   394  
   395  // stateInStringEscU123 is the state after reading `"\u123` during a quoted string.
   396  func stateInStringEscU123(s *scanner, c byte) int {
   397  	if '0' <= c && c <= '9' || 'a' <= c && c <= 'f' || 'A' <= c && c <= 'F' {
   398  		s.step = stateInString
   399  		return scanContinue
   400  	}
   401  	// numbers
   402  	return s.error(c, "in \\u hexadecimal character escape")
   403  }
   404  
   405  // stateNeg is the state after reading `-` during a number.
   406  func stateNeg(s *scanner, c byte) int {
   407  	if c == '0' {
   408  		s.step = state0
   409  		return scanContinue
   410  	}
   411  	if '1' <= c && c <= '9' {
   412  		s.step = state1
   413  		return scanContinue
   414  	}
   415  	return s.error(c, "in numeric literal")
   416  }
   417  
   418  // state1 is the state after reading a non-zero integer during a number,
   419  // such as after reading `1` or `100` but not `0`.
   420  func state1(s *scanner, c byte) int {
   421  	if '0' <= c && c <= '9' {
   422  		s.step = state1
   423  		return scanContinue
   424  	}
   425  	return state0(s, c)
   426  }
   427  
   428  // state0 is the state after reading `0` during a number.
   429  func state0(s *scanner, c byte) int {
   430  	if c == '.' {
   431  		s.step = stateDot
   432  		return scanContinue
   433  	}
   434  	if c == 'e' || c == 'E' {
   435  		s.step = stateE
   436  		return scanContinue
   437  	}
   438  	return stateEndValue(s, c)
   439  }
   440  
   441  // stateDot is the state after reading the integer and decimal point in a number,
   442  // such as after reading `1.`.
   443  func stateDot(s *scanner, c byte) int {
   444  	if '0' <= c && c <= '9' {
   445  		s.step = stateDot0
   446  		return scanContinue
   447  	}
   448  	return s.error(c, "after decimal point in numeric literal")
   449  }
   450  
   451  // stateDot0 is the state after reading the integer, decimal point, and subsequent
   452  // digits of a number, such as after reading `3.14`.
   453  func stateDot0(s *scanner, c byte) int {
   454  	if '0' <= c && c <= '9' {
   455  		return scanContinue
   456  	}
   457  	if c == 'e' || c == 'E' {
   458  		s.step = stateE
   459  		return scanContinue
   460  	}
   461  	return stateEndValue(s, c)
   462  }
   463  
   464  // stateE is the state after reading the mantissa and e in a number,
   465  // such as after reading `314e` or `0.314e`.
   466  func stateE(s *scanner, c byte) int {
   467  	if c == '+' || c == '-' {
   468  		s.step = stateESign
   469  		return scanContinue
   470  	}
   471  	return stateESign(s, c)
   472  }
   473  
   474  // stateESign is the state after reading the mantissa, e, and sign in a number,
   475  // such as after reading `314e-` or `0.314e+`.
   476  func stateESign(s *scanner, c byte) int {
   477  	if '0' <= c && c <= '9' {
   478  		s.step = stateE0
   479  		return scanContinue
   480  	}
   481  	return s.error(c, "in exponent of numeric literal")
   482  }
   483  
   484  // stateE0 is the state after reading the mantissa, e, optional sign,
   485  // and at least one digit of the exponent in a number,
   486  // such as after reading `314e-2` or `0.314e+1` or `3.14e0`.
   487  func stateE0(s *scanner, c byte) int {
   488  	if '0' <= c && c <= '9' {
   489  		return scanContinue
   490  	}
   491  	return stateEndValue(s, c)
   492  }
   493  
   494  // stateT is the state after reading `t`.
   495  func stateT(s *scanner, c byte) int {
   496  	if c == 'r' {
   497  		s.step = stateTr
   498  		return scanContinue
   499  	}
   500  	return s.error(c, "in literal true (expecting 'r')")
   501  }
   502  
   503  // stateTr is the state after reading `tr`.
   504  func stateTr(s *scanner, c byte) int {
   505  	if c == 'u' {
   506  		s.step = stateTru
   507  		return scanContinue
   508  	}
   509  	return s.error(c, "in literal true (expecting 'u')")
   510  }
   511  
   512  // stateTru is the state after reading `tru`.
   513  func stateTru(s *scanner, c byte) int {
   514  	if c == 'e' {
   515  		s.step = stateEndValue
   516  		return scanContinue
   517  	}
   518  	return s.error(c, "in literal true (expecting 'e')")
   519  }
   520  
   521  // stateF is the state after reading `f`.
   522  func stateF(s *scanner, c byte) int {
   523  	if c == 'a' {
   524  		s.step = stateFa
   525  		return scanContinue
   526  	}
   527  	return s.error(c, "in literal false (expecting 'a')")
   528  }
   529  
   530  // stateFa is the state after reading `fa`.
   531  func stateFa(s *scanner, c byte) int {
   532  	if c == 'l' {
   533  		s.step = stateFal
   534  		return scanContinue
   535  	}
   536  	return s.error(c, "in literal false (expecting 'l')")
   537  }
   538  
   539  // stateFal is the state after reading `fal`.
   540  func stateFal(s *scanner, c byte) int {
   541  	if c == 's' {
   542  		s.step = stateFals
   543  		return scanContinue
   544  	}
   545  	return s.error(c, "in literal false (expecting 's')")
   546  }
   547  
   548  // stateFals is the state after reading `fals`.
   549  func stateFals(s *scanner, c byte) int {
   550  	if c == 'e' {
   551  		s.step = stateEndValue
   552  		return scanContinue
   553  	}
   554  	return s.error(c, "in literal false (expecting 'e')")
   555  }
   556  
   557  // stateN is the state after reading `n`.
   558  func stateN(s *scanner, c byte) int {
   559  	if c == 'u' {
   560  		s.step = stateNu
   561  		return scanContinue
   562  	}
   563  	return s.error(c, "in literal null (expecting 'u')")
   564  }
   565  
   566  // stateNu is the state after reading `nu`.
   567  func stateNu(s *scanner, c byte) int {
   568  	if c == 'l' {
   569  		s.step = stateNul
   570  		return scanContinue
   571  	}
   572  	return s.error(c, "in literal null (expecting 'l')")
   573  }
   574  
   575  // stateNul is the state after reading `nul`.
   576  func stateNul(s *scanner, c byte) int {
   577  	if c == 'l' {
   578  		s.step = stateEndValue
   579  		return scanContinue
   580  	}
   581  	return s.error(c, "in literal null (expecting 'l')")
   582  }
   583  
   584  // stateError is the state after reaching a syntax error,
   585  // such as after reading `[1}` or `5.1.2`.
   586  func stateError(s *scanner, c byte) int {
   587  	return scanError
   588  }
   589  
   590  // error records an error and switches to the error state.
   591  func (s *scanner) error(c byte, context string) int {
   592  	s.step = stateError
   593  	s.err = &SyntaxError{"invalid character " + quoteChar(c) + " " + context, s.bytes}
   594  	return scanError
   595  }
   596  
   597  // quoteChar formats c as a quoted character literal.
   598  func quoteChar(c byte) string {
   599  	// special cases - different from quoted strings
   600  	if c == '\'' {
   601  		return `'\''`
   602  	}
   603  	if c == '"' {
   604  		return `'"'`
   605  	}
   606  
   607  	// use quoted string with different quotation marks
   608  	s := strconv.Quote(string(c))
   609  	return "'" + s[1:len(s)-1] + "'"
   610  }
   611
View as plain text