The Go Programming Language

Source file src/pkg/json/scanner.go

     1	// Copyright 2010 The Go Authors.  All rights reserved.
     2	// Use of this source code is governed by a BSD-style
     3	// license that can be found in the LICENSE file.
     4	
     5	package json
     6	
     7	// JSON value parser state machine.
     8	// Just about at the limit of what is reasonable to write by hand.
     9	// Some parts are a bit tedious, but overall it nicely factors out the
    10	// otherwise common code from the multiple scanning functions
    11	// in this package (Compact, Indent, checkValid, nextValue, etc).
    12	//
    13	// This file starts with two simple examples using the scanner
    14	// before diving into the scanner itself.
    15	
    16	import (
    17		"os"
    18		"strconv"
    19	)
    20	
    21	// checkValid verifies that data is valid JSON-encoded data.
    22	// scan is passed in for use by checkValid to avoid an allocation.
    23	func checkValid(data []byte, scan *scanner) os.Error {
    24		scan.reset()
    25		for _, c := range data {
    26			scan.bytes++
    27			if scan.step(scan, int(c)) == scanError {
    28				return scan.err
    29			}
    30		}
    31		if scan.eof() == scanError {
    32			return scan.err
    33		}
    34		return nil
    35	}
    36	
    37	// nextValue splits data after the next whole JSON value,
    38	// returning that value and the bytes that follow it as separate slices.
    39	// scan is passed in for use by nextValue to avoid an allocation.
    40	func nextValue(data []byte, scan *scanner) (value, rest []byte, err os.Error) {
    41		scan.reset()
    42		for i, c := range data {
    43			v := scan.step(scan, int(c))
    44			if v >= scanEnd {
    45				switch v {
    46				case scanError:
    47					return nil, nil, scan.err
    48				case scanEnd:
    49					return data[0:i], data[i:], nil
    50				}
    51			}
    52		}
    53		if scan.eof() == scanError {
    54			return nil, nil, scan.err
    55		}
    56		return data, nil, nil
    57	}
    58	
    59	// A SyntaxError is a description of a JSON syntax error.
    60	type SyntaxError struct {
    61		msg    string // description of error
    62		Offset int64  // error occurred after reading Offset bytes
    63	}
    64	
    65	func (e *SyntaxError) String() string { return e.msg }
    66	
    67	// A scanner is a JSON scanning state machine.
    68	// Callers call scan.reset() and then pass bytes in one at a time
    69	// by calling scan.step(&scan, c) for each byte.
    70	// The return value, referred to as an opcode, tells the
    71	// caller about significant parsing events like beginning
    72	// and ending literals, objects, and arrays, so that the
    73	// caller can follow along if it wishes.
    74	// The return value scanEnd indicates that a single top-level
    75	// JSON value has been completed, *before* the byte that
    76	// just got passed in.  (The indication must be delayed in order
    77	// to recognize the end of numbers: is 123 a whole value or
    78	// the beginning of 12345e+6?).
    79	type scanner struct {
    80		// The step is a func to be called to execute the next transition.
    81		// Also tried using an integer constant and a single func
    82		// with a switch, but using the func directly was 10% faster
    83		// on a 64-bit Mac Mini, and it's nicer to read.
    84		step func(*scanner, int) int
    85	
    86		// Stack of what we're in the middle of - array values, object keys, object values.
    87		parseState []int
    88	
    89		// Error that happened, if any.
    90		err os.Error
    91	
    92		// 1-byte redo (see undo method)
    93		redoCode  int
    94		redoState func(*scanner, int) int
    95	
    96		// total bytes consumed, updated by decoder.Decode
    97		bytes int64
    98	}
    99	
   100	// These values are returned by the state transition functions
   101	// assigned to scanner.state and the method scanner.eof.
   102	// They give details about the current state of the scan that
   103	// callers might be interested to know about.
   104	// It is okay to ignore the return value of any particular
   105	// call to scanner.state: if one call returns scanError,
   106	// every subsequent call will return scanError too.
   107	const (
   108		// Continue.
   109		scanContinue     = iota // uninteresting byte
   110		scanBeginLiteral        // end implied by next result != scanContinue
   111		scanBeginObject         // begin object
   112		scanObjectKey           // just finished object key (string)
   113		scanObjectValue         // just finished non-last object value
   114		scanEndObject           // end object (implies scanObjectValue if possible)
   115		scanBeginArray          // begin array
   116		scanArrayValue          // just finished array value
   117		scanEndArray            // end array (implies scanArrayValue if possible)
   118		scanSkipSpace           // space byte; can skip; known to be last "continue" result
   119	
   120		// Stop.
   121		scanEnd   // top-level value ended *before* this byte; known to be first "stop" result
   122		scanError // hit an error, scanner.err.
   123	)
   124	
   125	// These values are stored in the parseState stack.
   126	// They give the current state of a composite value
   127	// being scanned.  If the parser is inside a nested value
   128	// the parseState describes the nested state, outermost at entry 0.
   129	const (
   130		parseObjectKey   = iota // parsing object key (before colon)
   131		parseObjectValue        // parsing object value (after colon)
   132		parseArrayValue         // parsing array value
   133	)
   134	
   135	// reset prepares the scanner for use.
   136	// It must be called before calling s.step.
   137	func (s *scanner) reset() {
   138		s.step = stateBeginValue
   139		s.parseState = s.parseState[0:0]
   140		s.err = nil
   141	}
   142	
   143	// eof tells the scanner that the end of input has been reached.
   144	// It returns a scan status just as s.step does.
   145	func (s *scanner) eof() int {
   146		if s.err != nil {
   147			return scanError
   148		}
   149		if s.step == stateEndTop {
   150			return scanEnd
   151		}
   152		s.step(s, ' ')
   153		if s.step == stateEndTop {
   154			return scanEnd
   155		}
   156		if s.err == nil {
   157			s.err = &SyntaxError{"unexpected end of JSON input", s.bytes}
   158		}
   159		return scanError
   160	}
   161	
   162	// pushParseState pushes a new parse state p onto the parse stack.
   163	func (s *scanner) pushParseState(p int) {
   164		s.parseState = append(s.parseState, p)
   165	}
   166	
   167	// popParseState pops a parse state (already obtained) off the stack
   168	// and updates s.step accordingly.
   169	func (s *scanner) popParseState() {
   170		n := len(s.parseState) - 1
   171		s.parseState = s.parseState[0:n]
   172		if n == 0 {
   173			s.step = stateEndTop
   174		} else {
   175			s.step = stateEndValue
   176		}
   177	}
   178	
   179	func isSpace(c int) bool {
   180		return c == ' ' || c == '\t' || c == '\r' || c == '\n'
   181	}
   182	
   183	// NOTE(rsc): The various instances of
   184	//
   185	//	if c <= ' ' && (c == ' ' || c == '\t' || c == '\r' || c == '\n')
   186	//
   187	// below should all be if c <= ' ' && isSpace(c), but inlining
   188	// the checks makes a significant difference (>10%) in tight loops
   189	// such as nextValue.  These should be rewritten with the clearer
   190	// function call once 6g knows to inline the call.
   191	
   192	// stateBeginValueOrEmpty is the state after reading `[`.
   193	func stateBeginValueOrEmpty(s *scanner, c int) int {
   194		if c <= ' ' && (c == ' ' || c == '\t' || c == '\r' || c == '\n') {
   195			return scanSkipSpace
   196		}
   197		if c == ']' {
   198			return stateEndValue(s, c)
   199		}
   200		return stateBeginValue(s, c)
   201	}
   202	
   203	// stateBeginValue is the state at the beginning of the input.
   204	func stateBeginValue(s *scanner, c int) int {
   205		if c <= ' ' && (c == ' ' || c == '\t' || c == '\r' || c == '\n') {
   206			return scanSkipSpace
   207		}
   208		switch c {
   209		case '{':
   210			s.step = stateBeginStringOrEmpty
   211			s.pushParseState(parseObjectKey)
   212			return scanBeginObject
   213		case '[':
   214			s.step = stateBeginValueOrEmpty
   215			s.pushParseState(parseArrayValue)
   216			return scanBeginArray
   217		case '"':
   218			s.step = stateInString
   219			return scanBeginLiteral
   220		case '-':
   221			s.step = stateNeg
   222			return scanBeginLiteral
   223		case '0': // beginning of 0.123
   224			s.step = state0
   225			return scanBeginLiteral
   226		case 't': // beginning of true
   227			s.step = stateT
   228			return scanBeginLiteral
   229		case 'f': // beginning of false
   230			s.step = stateF
   231			return scanBeginLiteral
   232		case 'n': // beginning of null
   233			s.step = stateN
   234			return scanBeginLiteral
   235		}
   236		if '1' <= c && c <= '9' { // beginning of 1234.5
   237			s.step = state1
   238			return scanBeginLiteral
   239		}
   240		return s.error(c, "looking for beginning of value")
   241	}
   242	
   243	// stateBeginStringOrEmpty is the state after reading `{`.
   244	func stateBeginStringOrEmpty(s *scanner, c int) int {
   245		if c <= ' ' && (c == ' ' || c == '\t' || c == '\r' || c == '\n') {
   246			return scanSkipSpace
   247		}
   248		if c == '}' {
   249			n := len(s.parseState)
   250			s.parseState[n-1] = parseObjectValue
   251			return stateEndValue(s, c)
   252		}
   253		return stateBeginString(s, c)
   254	}
   255	
   256	// stateBeginString is the state after reading `{"key": value,`.
   257	func stateBeginString(s *scanner, c int) int {
   258		if c <= ' ' && (c == ' ' || c == '\t' || c == '\r' || c == '\n') {
   259			return scanSkipSpace
   260		}
   261		if c == '"' {
   262			s.step = stateInString
   263			return scanBeginLiteral
   264		}
   265		return s.error(c, "looking for beginning of object key string")
   266	}
   267	
   268	// stateEndValue is the state after completing a value,
   269	// such as after reading `{}` or `true` or `["x"`.
   270	func stateEndValue(s *scanner, c int) int {
   271		n := len(s.parseState)
   272		if n == 0 {
   273			// Completed top-level before the current byte.
   274			s.step = stateEndTop
   275			return stateEndTop(s, c)
   276		}
   277		if c <= ' ' && (c == ' ' || c == '\t' || c == '\r' || c == '\n') {
   278			s.step = stateEndValue
   279			return scanSkipSpace
   280		}
   281		ps := s.parseState[n-1]
   282		switch ps {
   283		case parseObjectKey:
   284			if c == ':' {
   285				s.parseState[n-1] = parseObjectValue
   286				s.step = stateBeginValue
   287				return scanObjectKey
   288			}
   289			return s.error(c, "after object key")
   290		case parseObjectValue:
   291			if c == ',' {
   292				s.parseState[n-1] = parseObjectKey
   293				s.step = stateBeginString
   294				return scanObjectValue
   295			}
   296			if c == '}' {
   297				s.popParseState()
   298				return scanEndObject
   299			}
   300			return s.error(c, "after object key:value pair")
   301		case parseArrayValue:
   302			if c == ',' {
   303				s.step = stateBeginValue
   304				return scanArrayValue
   305			}
   306			if c == ']' {
   307				s.popParseState()
   308				return scanEndArray
   309			}
   310			return s.error(c, "after array element")
   311		}
   312		return s.error(c, "")
   313	}
   314	
   315	// stateEndTop is the state after finishing the top-level value,
   316	// such as after reading `{}` or `[1,2,3]`.
   317	// Only space characters should be seen now.
   318	func stateEndTop(s *scanner, c int) int {
   319		if c != ' ' && c != '\t' && c != '\r' && c != '\n' {
   320			// Complain about non-space byte on next call.
   321			s.error(c, "after top-level value")
   322		}
   323		return scanEnd
   324	}
   325	
   326	// stateInString is the state after reading `"`.
   327	func stateInString(s *scanner, c int) int {
   328		if c == '"' {
   329			s.step = stateEndValue
   330			return scanContinue
   331		}
   332		if c == '\\' {
   333			s.step = stateInStringEsc
   334			return scanContinue
   335		}
   336		if c < 0x20 {
   337			return s.error(c, "in string literal")
   338		}
   339		return scanContinue
   340	}
   341	
   342	// stateInStringEsc is the state after reading `"\` during a quoted string.
   343	func stateInStringEsc(s *scanner, c int) int {
   344		switch c {
   345		case 'b', 'f', 'n', 'r', 't', '\\', '/', '"':
   346			s.step = stateInString
   347			return scanContinue
   348		}
   349		if c == 'u' {
   350			s.step = stateInStringEscU
   351			return scanContinue
   352		}
   353		return s.error(c, "in string escape code")
   354	}
   355	
   356	// stateInStringEscU is the state after reading `"\u` during a quoted string.
   357	func stateInStringEscU(s *scanner, c int) int {
   358		if '0' <= c && c <= '9' || 'a' <= c && c <= 'f' || 'A' <= c && c <= 'F' {
   359			s.step = stateInStringEscU1
   360			return scanContinue
   361		}
   362		// numbers
   363		return s.error(c, "in \\u hexadecimal character escape")
   364	}
   365	
   366	// stateInStringEscU1 is the state after reading `"\u1` during a quoted string.
   367	func stateInStringEscU1(s *scanner, c int) int {
   368		if '0' <= c && c <= '9' || 'a' <= c && c <= 'f' || 'A' <= c && c <= 'F' {
   369			s.step = stateInStringEscU12
   370			return scanContinue
   371		}
   372		// numbers
   373		return s.error(c, "in \\u hexadecimal character escape")
   374	}
   375	
   376	// stateInStringEscU12 is the state after reading `"\u12` during a quoted string.
   377	func stateInStringEscU12(s *scanner, c int) int {
   378		if '0' <= c && c <= '9' || 'a' <= c && c <= 'f' || 'A' <= c && c <= 'F' {
   379			s.step = stateInStringEscU123
   380			return scanContinue
   381		}
   382		// numbers
   383		return s.error(c, "in \\u hexadecimal character escape")
   384	}
   385	
   386	// stateInStringEscU123 is the state after reading `"\u123` during a quoted string.
   387	func stateInStringEscU123(s *scanner, c int) int {
   388		if '0' <= c && c <= '9' || 'a' <= c && c <= 'f' || 'A' <= c && c <= 'F' {
   389			s.step = stateInString
   390			return scanContinue
   391		}
   392		// numbers
   393		return s.error(c, "in \\u hexadecimal character escape")
   394	}
   395	
   396	// stateInStringEscU123 is the state after reading `-` during a number.
   397	func stateNeg(s *scanner, c int) int {
   398		if c == '0' {
   399			s.step = state0
   400			return scanContinue
   401		}
   402		if '1' <= c && c <= '9' {
   403			s.step = state1
   404			return scanContinue
   405		}
   406		return s.error(c, "in numeric literal")
   407	}
   408	
   409	// state1 is the state after reading a non-zero integer during a number,
   410	// such as after reading `1` or `100` but not `0`.
   411	func state1(s *scanner, c int) int {
   412		if '0' <= c && c <= '9' {
   413			s.step = state1
   414			return scanContinue
   415		}
   416		return state0(s, c)
   417	}
   418	
   419	// state0 is the state after reading `0` during a number.
   420	func state0(s *scanner, c int) int {
   421		if c == '.' {
   422			s.step = stateDot
   423			return scanContinue
   424		}
   425		if c == 'e' || c == 'E' {
   426			s.step = stateE
   427			return scanContinue
   428		}
   429		return stateEndValue(s, c)
   430	}
   431	
   432	// stateDot is the state after reading the integer and decimal point in a number,
   433	// such as after reading `1.`.
   434	func stateDot(s *scanner, c int) int {
   435		if '0' <= c && c <= '9' {
   436			s.step = stateDot0
   437			return scanContinue
   438		}
   439		return s.error(c, "after decimal point in numeric literal")
   440	}
   441	
   442	// stateDot0 is the state after reading the integer, decimal point, and subsequent
   443	// digits of a number, such as after reading `3.14`.
   444	func stateDot0(s *scanner, c int) int {
   445		if '0' <= c && c <= '9' {
   446			s.step = stateDot0
   447			return scanContinue
   448		}
   449		if c == 'e' || c == 'E' {
   450			s.step = stateE
   451			return scanContinue
   452		}
   453		return stateEndValue(s, c)
   454	}
   455	
   456	// stateE is the state after reading the mantissa and e in a number,
   457	// such as after reading `314e` or `0.314e`.
   458	func stateE(s *scanner, c int) int {
   459		if c == '+' {
   460			s.step = stateESign
   461			return scanContinue
   462		}
   463		if c == '-' {
   464			s.step = stateESign
   465			return scanContinue
   466		}
   467		return stateESign(s, c)
   468	}
   469	
   470	// stateESign is the state after reading the mantissa, e, and sign in a number,
   471	// such as after reading `314e-` or `0.314e+`.
   472	func stateESign(s *scanner, c int) int {
   473		if '0' <= c && c <= '9' {
   474			s.step = stateE0
   475			return scanContinue
   476		}
   477		return s.error(c, "in exponent of numeric literal")
   478	}
   479	
   480	// stateE0 is the state after reading the mantissa, e, optional sign,
   481	// and at least one digit of the exponent in a number,
   482	// such as after reading `314e-2` or `0.314e+1` or `3.14e0`.
   483	func stateE0(s *scanner, c int) int {
   484		if '0' <= c && c <= '9' {
   485			s.step = stateE0
   486			return scanContinue
   487		}
   488		return stateEndValue(s, c)
   489	}
   490	
   491	// stateT is the state after reading `t`.
   492	func stateT(s *scanner, c int) int {
   493		if c == 'r' {
   494			s.step = stateTr
   495			return scanContinue
   496		}
   497		return s.error(c, "in literal true (expecting 'r')")
   498	}
   499	
   500	// stateTr is the state after reading `tr`.
   501	func stateTr(s *scanner, c int) int {
   502		if c == 'u' {
   503			s.step = stateTru
   504			return scanContinue
   505		}
   506		return s.error(c, "in literal true (expecting 'u')")
   507	}
   508	
   509	// stateTru is the state after reading `tru`.
   510	func stateTru(s *scanner, c int) int {
   511		if c == 'e' {
   512			s.step = stateEndValue
   513			return scanContinue
   514		}
   515		return s.error(c, "in literal true (expecting 'e')")
   516	}
   517	
   518	// stateF is the state after reading `f`.
   519	func stateF(s *scanner, c int) int {
   520		if c == 'a' {
   521			s.step = stateFa
   522			return scanContinue
   523		}
   524		return s.error(c, "in literal false (expecting 'a')")
   525	}
   526	
   527	// stateFa is the state after reading `fa`.
   528	func stateFa(s *scanner, c int) int {
   529		if c == 'l' {
   530			s.step = stateFal
   531			return scanContinue
   532		}
   533		return s.error(c, "in literal false (expecting 'l')")
   534	}
   535	
   536	// stateFal is the state after reading `fal`.
   537	func stateFal(s *scanner, c int) int {
   538		if c == 's' {
   539			s.step = stateFals
   540			return scanContinue
   541		}
   542		return s.error(c, "in literal false (expecting 's')")
   543	}
   544	
   545	// stateFals is the state after reading `fals`.
   546	func stateFals(s *scanner, c int) int {
   547		if c == 'e' {
   548			s.step = stateEndValue
   549			return scanContinue
   550		}
   551		return s.error(c, "in literal false (expecting 'e')")
   552	}
   553	
   554	// stateN is the state after reading `n`.
   555	func stateN(s *scanner, c int) int {
   556		if c == 'u' {
   557			s.step = stateNu
   558			return scanContinue
   559		}
   560		return s.error(c, "in literal null (expecting 'u')")
   561	}
   562	
   563	// stateNu is the state after reading `nu`.
   564	func stateNu(s *scanner, c int) int {
   565		if c == 'l' {
   566			s.step = stateNul
   567			return scanContinue
   568		}
   569		return s.error(c, "in literal null (expecting 'l')")
   570	}
   571	
   572	// stateNul is the state after reading `nul`.
   573	func stateNul(s *scanner, c int) int {
   574		if c == 'l' {
   575			s.step = stateEndValue
   576			return scanContinue
   577		}
   578		return s.error(c, "in literal null (expecting 'l')")
   579	}
   580	
   581	// stateError is the state after reaching a syntax error,
   582	// such as after reading `[1}` or `5.1.2`.
   583	func stateError(s *scanner, c int) int {
   584		return scanError
   585	}
   586	
   587	// error records an error and switches to the error state.
   588	func (s *scanner) error(c int, context string) int {
   589		s.step = stateError
   590		s.err = &SyntaxError{"invalid character " + quoteChar(c) + " " + context, s.bytes}
   591		return scanError
   592	}
   593	
   594	// quoteChar formats c as a quoted character literal
   595	func quoteChar(c int) string {
   596		// special cases - different from quoted strings
   597		if c == '\'' {
   598			return `'\''`
   599		}
   600		if c == '"' {
   601			return `'"'`
   602		}
   603	
   604		// use quoted string with different quotation marks
   605		s := strconv.Quote(string(c))
   606		return "'" + s[1:len(s)-1] + "'"
   607	}
   608	
   609	// undo causes the scanner to return scanCode from the next state transition.
   610	// This gives callers a simple 1-byte undo mechanism.
   611	func (s *scanner) undo(scanCode int) {
   612		if s.step == stateRedo {
   613			panic("invalid use of scanner")
   614		}
   615		s.redoCode = scanCode
   616		s.redoState = s.step
   617		s.step = stateRedo
   618	}
   619	
   620	// stateRedo helps implement the scanner's 1-byte undo.
   621	func stateRedo(s *scanner, c int) int {
   622		s.step = s.redoState
   623		return s.redoCode
   624	}

release.r60.3. Except as noted, this content is licensed under a Creative Commons Attribution 3.0 License.