The Go Programming Language

Source file src/pkg/go/scanner/scanner.go

     1	// Copyright 2009 The Go Authors. All rights reserved.
     2	// Use of this source code is governed by a BSD-style
     3	// license that can be found in the LICENSE file.
     4	
     5	// Package scanner implements a scanner for Go source text. It takes a []byte
     6	// as source which can then be tokenized through repeated calls to the Scan
     7	// function. Typical use:
     8	//
     9	//	var s Scanner
    10	//	fset := token.NewFileSet()  // position information is relative to fset
    11	//	file := fset.AddFile(filename, fset.Base(), len(src))  // register file
    12	//	s.Init(file, src, nil /* no error handler */, 0)
    13	//	for {
    14	//		pos, tok, lit := s.Scan()
    15	//		if tok == token.EOF {
    16	//			break
    17	//		}
    18	//		// do something here with pos, tok, and lit
    19	//	}
    20	//
    21	package scanner
    22	
    23	import (
    24		"bytes"
    25		"fmt"
    26		"go/token"
    27		"path/filepath"
    28		"strconv"
    29		"unicode"
    30		"utf8"
    31	)
    32	
// A Scanner holds the scanner's internal state while processing
// a given text.  It can be allocated as part of another data
// structure but must be initialized via Init before use.
//
// All fields except ErrorCount are private to the scanner: Init assigns
// every one of them, and the scanning methods update the scanning state
// as characters are consumed.
//
type Scanner struct {
	// immutable state (assigned by Init)
	file *token.File  // source file handle; receives line information
	dir  string       // directory portion of file.Name()
	src  []byte       // source
	err  ErrorHandler // error reporting; or nil
	mode uint         // scanning mode (flags passed to Init)

	// scanning state
	ch         int  // current character; < 0 means end-of-file
	offset     int  // offset of the current character in src
	rdOffset   int  // reading offset (position after current character)
	lineOffset int  // offset at which the current line starts
	insertSemi bool // insert a semicolon before next newline

	// public state - ok to modify
	ErrorCount int // number of errors encountered
}
    55	
    56	// Read the next Unicode char into S.ch.
    57	// S.ch < 0 means end-of-file.
    58	//
    59	func (S *Scanner) next() {
    60		if S.rdOffset < len(S.src) {
    61			S.offset = S.rdOffset
    62			if S.ch == '\n' {
    63				S.lineOffset = S.offset
    64				S.file.AddLine(S.offset)
    65			}
    66			r, w := int(S.src[S.rdOffset]), 1
    67			switch {
    68			case r == 0:
    69				S.error(S.offset, "illegal character NUL")
    70			case r >= 0x80:
    71				// not ASCII
    72				r, w = utf8.DecodeRune(S.src[S.rdOffset:])
    73				if r == utf8.RuneError && w == 1 {
    74					S.error(S.offset, "illegal UTF-8 encoding")
    75				}
    76			}
    77			S.rdOffset += w
    78			S.ch = r
    79		} else {
    80			S.offset = len(S.src)
    81			if S.ch == '\n' {
    82				S.lineOffset = S.offset
    83				S.file.AddLine(S.offset)
    84			}
    85			S.ch = -1 // eof
    86		}
    87	}
    88	
    89	// The mode parameter to the Init function is a set of flags (or 0).
    90	// They control scanner behavior.
    91	//
    92	const (
    93		ScanComments      = 1 << iota // return comments as COMMENT tokens
    94		AllowIllegalChars             // do not report an error for illegal chars
    95		InsertSemis                   // automatically insert semicolons
    96	)
    97	
    98	// Init prepares the scanner S to tokenize the text src by setting the
    99	// scanner at the beginning of src. The scanner uses the file set file
   100	// for position information and it adds line information for each line.
   101	// It is ok to re-use the same file when re-scanning the same file as
   102	// line information which is already present is ignored. Init causes a
   103	// panic if the file size does not match the src size.
   104	//
   105	// Calls to Scan will use the error handler err if they encounter a
   106	// syntax error and err is not nil. Also, for each error encountered,
   107	// the Scanner field ErrorCount is incremented by one. The mode parameter
   108	// determines how comments, illegal characters, and semicolons are handled.
   109	//
   110	// Note that Init may call err if there is an error in the first character
   111	// of the file.
   112	//
   113	func (S *Scanner) Init(file *token.File, src []byte, err ErrorHandler, mode uint) {
   114		// Explicitly initialize all fields since a scanner may be reused.
   115		if file.Size() != len(src) {
   116			panic("file size does not match src len")
   117		}
   118		S.file = file
   119		S.dir, _ = filepath.Split(file.Name())
   120		S.src = src
   121		S.err = err
   122		S.mode = mode
   123	
   124		S.ch = ' '
   125		S.offset = 0
   126		S.rdOffset = 0
   127		S.lineOffset = 0
   128		S.insertSemi = false
   129		S.ErrorCount = 0
   130	
   131		S.next()
   132	}
   133	
   134	func (S *Scanner) error(offs int, msg string) {
   135		if S.err != nil {
   136			S.err.Error(S.file.Position(S.file.Pos(offs)), msg)
   137		}
   138		S.ErrorCount++
   139	}
   140	
// prefix is the text that a //-style comment must start with for
// interpretLineComment to treat it as a "//line filename:line" comment.
var prefix = []byte("//line ")
   142	
   143	func (S *Scanner) interpretLineComment(text []byte) {
   144		if bytes.HasPrefix(text, prefix) {
   145			// get filename and line number, if any
   146			if i := bytes.LastIndex(text, []byte{':'}); i > 0 {
   147				if line, err := strconv.Atoi(string(text[i+1:])); err == nil && line > 0 {
   148					// valid //line filename:line comment;
   149					filename := filepath.Clean(string(text[len(prefix):i]))
   150					if !filepath.IsAbs(filename) {
   151						// make filename relative to current directory
   152						filename = filepath.Join(S.dir, filename)
   153					}
   154					// update scanner position
   155					S.file.AddLineInfo(S.lineOffset, filename, line-1) // -1 since comment applies to next line
   156				}
   157			}
   158		}
   159	}
   160	
   161	func (S *Scanner) scanComment() {
   162		// initial '/' already consumed; S.ch == '/' || S.ch == '*'
   163		offs := S.offset - 1 // position of initial '/'
   164	
   165		if S.ch == '/' {
   166			//-style comment
   167			S.next()
   168			for S.ch != '\n' && S.ch >= 0 {
   169				S.next()
   170			}
   171			if offs == S.lineOffset {
   172				// comment starts at the beginning of the current line
   173				S.interpretLineComment(S.src[offs:S.offset])
   174			}
   175			return
   176		}
   177	
   178		/*-style comment */
   179		S.next()
   180		for S.ch >= 0 {
   181			ch := S.ch
   182			S.next()
   183			if ch == '*' && S.ch == '/' {
   184				S.next()
   185				return
   186			}
   187		}
   188	
   189		S.error(offs, "comment not terminated")
   190	}
   191	
// findLineEnd looks ahead from the start of a comment and reports whether
// a newline or the end of the source is reached before any non-comment
// token. It returns true if the comment is a //-style comment, if a
// /*-style comment contains a newline, or if only comments and whitespace
// separate the current position from a newline or EOF; otherwise it
// returns false.
//
// The initial '/' of the comment must already be consumed. The deferred
// function restores S.ch, S.offset, and S.rdOffset to their values upon
// entry, so the lookahead leaves the scanner positioned just after that
// initial '/'.
func (S *Scanner) findLineEnd() bool {
	// initial '/' already consumed

	// The argument S.offset - 1 is evaluated now, i.e. it is the
	// position of the initial '/' before any lookahead takes place.
	defer func(offs int) {
		// reset scanner state to where it was upon calling findLineEnd
		S.ch = '/'
		S.offset = offs
		S.rdOffset = offs + 1
		S.next() // consume initial '/' again
	}(S.offset - 1)

	// read ahead until a newline, EOF, or non-comment token is found
	for S.ch == '/' || S.ch == '*' {
		if S.ch == '/' {
			//-style comment always contains a newline
			return true
		}
		/*-style comment: look for newline */
		S.next()
		for S.ch >= 0 {
			ch := S.ch
			if ch == '\n' {
				return true
			}
			S.next()
			if ch == '*' && S.ch == '/' {
				// end of this comment
				S.next()
				break
			}
		}
		// With S.insertSemi set, skipWhitespace stops at a newline
		// rather than skipping it, so the check below can see it.
		S.skipWhitespace() // S.insertSemi is set
		if S.ch < 0 || S.ch == '\n' {
			return true
		}
		if S.ch != '/' {
			// non-comment token
			return false
		}
		S.next() // consume '/'
	}

	// the '/' was not followed by '/' or '*': not a comment
	return false
}
   235	
   236	func isLetter(ch int) bool {
   237		return 'a' <= ch && ch <= 'z' || 'A' <= ch && ch <= 'Z' || ch == '_' || ch >= 0x80 && unicode.IsLetter(ch)
   238	}
   239	
   240	func isDigit(ch int) bool {
   241		return '0' <= ch && ch <= '9' || ch >= 0x80 && unicode.IsDigit(ch)
   242	}
   243	
   244	func (S *Scanner) scanIdentifier() token.Token {
   245		offs := S.offset
   246		for isLetter(S.ch) || isDigit(S.ch) {
   247			S.next()
   248		}
   249		return token.Lookup(S.src[offs:S.offset])
   250	}
   251	
   252	func digitVal(ch int) int {
   253		switch {
   254		case '0' <= ch && ch <= '9':
   255			return ch - '0'
   256		case 'a' <= ch && ch <= 'f':
   257			return ch - 'a' + 10
   258		case 'A' <= ch && ch <= 'F':
   259			return ch - 'A' + 10
   260		}
   261		return 16 // larger than any legal digit val
   262	}
   263	
   264	func (S *Scanner) scanMantissa(base int) {
   265		for digitVal(S.ch) < base {
   266			S.next()
   267		}
   268	}
   269	
// scanNumber scans a numeric literal and returns its token kind:
// token.INT, token.FLOAT, or token.IMAG.
//
// If seenDecimalPoint is set, the literal started with a '.' that has
// already been consumed, and scanning continues with the fractional
// digits. Otherwise S.ch is the first digit of the literal.
//
// Errors are reported for a hexadecimal literal without digits and for
// an octal literal containing the digits 8 or 9.
func (S *Scanner) scanNumber(seenDecimalPoint bool) token.Token {
	// digitVal(S.ch) < 10
	tok := token.INT

	if seenDecimalPoint {
		// ".digits": only the fraction digits and an optional
		// exponent and 'i' suffix remain
		tok = token.FLOAT
		S.scanMantissa(10)
		goto exponent
	}

	if S.ch == '0' {
		// int or float
		offs := S.offset
		S.next()
		if S.ch == 'x' || S.ch == 'X' {
			// hexadecimal int
			S.next()
			S.scanMantissa(16)
			if S.offset-offs <= 2 {
				// only scanned "0x" or "0X"
				S.error(offs, "illegal hexadecimal number")
			}
		} else {
			// octal int or float
			seenDecimalDigit := false
			S.scanMantissa(8)
			if S.ch == '8' || S.ch == '9' {
				// illegal octal int or float
				seenDecimalDigit = true
				S.scanMantissa(10)
			}
			if S.ch == '.' || S.ch == 'e' || S.ch == 'E' || S.ch == 'i' {
				// a leading 0 followed by decimal digits is fine
				// in a floating-point or imaginary literal
				goto fraction
			}
			// octal int
			if seenDecimalDigit {
				S.error(offs, "illegal octal number")
			}
		}
		// hexadecimal and octal ints take no fraction, exponent, or 'i'
		goto exit
	}

	// decimal int or float
	S.scanMantissa(10)

fraction:
	if S.ch == '.' {
		tok = token.FLOAT
		S.next()
		S.scanMantissa(10)
	}

exponent:
	if S.ch == 'e' || S.ch == 'E' {
		tok = token.FLOAT
		S.next()
		if S.ch == '-' || S.ch == '+' {
			S.next()
		}
		S.scanMantissa(10)
	}

	// a trailing 'i' makes any decimal literal imaginary
	if S.ch == 'i' {
		tok = token.IMAG
		S.next()
	}

exit:
	return tok
}
   340	
   341	func (S *Scanner) scanEscape(quote int) {
   342		offs := S.offset
   343	
   344		var i, base, max uint32
   345		switch S.ch {
   346		case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', quote:
   347			S.next()
   348			return
   349		case '0', '1', '2', '3', '4', '5', '6', '7':
   350			i, base, max = 3, 8, 255
   351		case 'x':
   352			S.next()
   353			i, base, max = 2, 16, 255
   354		case 'u':
   355			S.next()
   356			i, base, max = 4, 16, unicode.MaxRune
   357		case 'U':
   358			S.next()
   359			i, base, max = 8, 16, unicode.MaxRune
   360		default:
   361			S.next() // always make progress
   362			S.error(offs, "unknown escape sequence")
   363			return
   364		}
   365	
   366		var x uint32
   367		for ; i > 0 && S.ch != quote && S.ch >= 0; i-- {
   368			d := uint32(digitVal(S.ch))
   369			if d >= base {
   370				S.error(S.offset, "illegal character in escape sequence")
   371				break
   372			}
   373			x = x*base + d
   374			S.next()
   375		}
   376		// in case of an error, consume remaining chars
   377		for ; i > 0 && S.ch != quote && S.ch >= 0; i-- {
   378			S.next()
   379		}
   380		if x > max || 0xd800 <= x && x < 0xe000 {
   381			S.error(offs, "escape sequence is invalid Unicode code point")
   382		}
   383	}
   384	
   385	func (S *Scanner) scanChar() {
   386		// '\'' opening already consumed
   387		offs := S.offset - 1
   388	
   389		n := 0
   390		for S.ch != '\'' {
   391			ch := S.ch
   392			n++
   393			S.next()
   394			if ch == '\n' || ch < 0 {
   395				S.error(offs, "character literal not terminated")
   396				n = 1
   397				break
   398			}
   399			if ch == '\\' {
   400				S.scanEscape('\'')
   401			}
   402		}
   403	
   404		S.next()
   405	
   406		if n != 1 {
   407			S.error(offs, "illegal character literal")
   408		}
   409	}
   410	
   411	func (S *Scanner) scanString() {
   412		// '"' opening already consumed
   413		offs := S.offset - 1
   414	
   415		for S.ch != '"' {
   416			ch := S.ch
   417			S.next()
   418			if ch == '\n' || ch < 0 {
   419				S.error(offs, "string not terminated")
   420				break
   421			}
   422			if ch == '\\' {
   423				S.scanEscape('"')
   424			}
   425		}
   426	
   427		S.next()
   428	}
   429	
   430	func (S *Scanner) scanRawString() {
   431		// '`' opening already consumed
   432		offs := S.offset - 1
   433	
   434		for S.ch != '`' {
   435			ch := S.ch
   436			S.next()
   437			if ch < 0 {
   438				S.error(offs, "string not terminated")
   439				break
   440			}
   441		}
   442	
   443		S.next()
   444	}
   445	
   446	func (S *Scanner) skipWhitespace() {
   447		for S.ch == ' ' || S.ch == '\t' || S.ch == '\n' && !S.insertSemi || S.ch == '\r' {
   448			S.next()
   449		}
   450	}
   451	
   452	// Helper functions for scanning multi-byte tokens such as >> += >>= .
   453	// Different routines recognize different length tok_i based on matches
   454	// of ch_i. If a token ends in '=', the result is tok1 or tok3
   455	// respectively. Otherwise, the result is tok0 if there was no other
   456	// matching character, or tok2 if the matching character was ch2.
   457	
   458	func (S *Scanner) switch2(tok0, tok1 token.Token) token.Token {
   459		if S.ch == '=' {
   460			S.next()
   461			return tok1
   462		}
   463		return tok0
   464	}
   465	
   466	func (S *Scanner) switch3(tok0, tok1 token.Token, ch2 int, tok2 token.Token) token.Token {
   467		if S.ch == '=' {
   468			S.next()
   469			return tok1
   470		}
   471		if S.ch == ch2 {
   472			S.next()
   473			return tok2
   474		}
   475		return tok0
   476	}
   477	
   478	func (S *Scanner) switch4(tok0, tok1 token.Token, ch2 int, tok2, tok3 token.Token) token.Token {
   479		if S.ch == '=' {
   480			S.next()
   481			return tok1
   482		}
   483		if S.ch == ch2 {
   484			S.next()
   485			if S.ch == '=' {
   486				S.next()
   487				return tok3
   488			}
   489			return tok2
   490		}
   491		return tok0
   492	}
   493	
// Scan scans the next token and returns the token position,
// the token, and the literal string corresponding to the
// token. The source end is indicated by token.EOF.
//
// If the returned token is token.SEMICOLON, the corresponding
// literal string is ";" if the semicolon was present in the source,
// and "\n" if the semicolon was inserted because of a newline or
// at EOF.
//
// Comments are returned as token.COMMENT tokens only if the
// ScanComments mode flag is set; otherwise they are skipped.
// Semicolons are only inserted if the InsertSemis mode flag is set.
//
// For more tolerant parsing, Scan will return a valid token if
// possible even if a syntax error was encountered. Thus, even
// if the resulting token sequence contains no illegal tokens,
// a client may not assume that no error occurred. Instead it
// must check the scanner's ErrorCount or the number of calls
// of the error handler, if there was one installed.
//
// Scan adds line information to the file added to the file
// set with Init. Token positions are relative to that file
// and thus relative to the file set.
//
func (S *Scanner) Scan() (token.Pos, token.Token, string) {
scanAgain:
	S.skipWhitespace()

	// current token start
	insertSemi := false // whether a newline after this token becomes a semicolon
	offs := S.offset
	tok := token.ILLEGAL

	// determine token value
	switch ch := S.ch; {
	case isLetter(ch):
		tok = S.scanIdentifier()
		switch tok {
		case token.IDENT, token.BREAK, token.CONTINUE, token.FALLTHROUGH, token.RETURN:
			insertSemi = true
		}
	case digitVal(ch) < 10:
		insertSemi = true
		tok = S.scanNumber(false)
	default:
		// From here on ch is the first character of the token
		// and S.ch is the character following it.
		S.next() // always make progress
		switch ch {
		case -1:
			if S.insertSemi {
				S.insertSemi = false // EOF consumed
				return S.file.Pos(offs), token.SEMICOLON, "\n"
			}
			tok = token.EOF
		case '\n':
			// we only reach here if S.insertSemi was
			// set in the first place and exited early
			// from S.skipWhitespace()
			S.insertSemi = false // newline consumed
			return S.file.Pos(offs), token.SEMICOLON, "\n"
		case '"':
			insertSemi = true
			tok = token.STRING
			S.scanString()
		case '\'':
			insertSemi = true
			tok = token.CHAR
			S.scanChar()
		case '`':
			insertSemi = true
			tok = token.STRING
			S.scanRawString()
		case ':':
			tok = S.switch2(token.COLON, token.DEFINE)
		case '.':
			if digitVal(S.ch) < 10 {
				// a number starting with a decimal point
				insertSemi = true
				tok = S.scanNumber(true)
			} else if S.ch == '.' {
				// Only "..." is a token; for ".." both periods are
				// consumed and tok remains token.ILLEGAL.
				S.next()
				if S.ch == '.' {
					S.next()
					tok = token.ELLIPSIS
				}
			} else {
				tok = token.PERIOD
			}
		case ',':
			tok = token.COMMA
		case ';':
			tok = token.SEMICOLON
		case '(':
			tok = token.LPAREN
		case ')':
			insertSemi = true
			tok = token.RPAREN
		case '[':
			tok = token.LBRACK
		case ']':
			insertSemi = true
			tok = token.RBRACK
		case '{':
			tok = token.LBRACE
		case '}':
			insertSemi = true
			tok = token.RBRACE
		case '+':
			tok = S.switch3(token.ADD, token.ADD_ASSIGN, '+', token.INC)
			if tok == token.INC {
				insertSemi = true
			}
		case '-':
			tok = S.switch3(token.SUB, token.SUB_ASSIGN, '-', token.DEC)
			if tok == token.DEC {
				insertSemi = true
			}
		case '*':
			tok = S.switch2(token.MUL, token.MUL_ASSIGN)
		case '/':
			if S.ch == '/' || S.ch == '*' {
				// comment
				if S.insertSemi && S.findLineEnd() {
					// The semicolon must be returned before the comment:
					// back up so that the next call scans the comment.
					// reset position to the beginning of the comment
					S.ch = '/'
					S.offset = offs
					S.rdOffset = offs + 1
					S.insertSemi = false // newline consumed
					return S.file.Pos(offs), token.SEMICOLON, "\n"
				}
				S.scanComment()
				if S.mode&ScanComments == 0 {
					// skip comment
					S.insertSemi = false // newline consumed
					goto scanAgain
				}
				tok = token.COMMENT
			} else {
				tok = S.switch2(token.QUO, token.QUO_ASSIGN)
			}
		case '%':
			tok = S.switch2(token.REM, token.REM_ASSIGN)
		case '^':
			tok = S.switch2(token.XOR, token.XOR_ASSIGN)
		case '<':
			if S.ch == '-' {
				S.next()
				tok = token.ARROW
			} else {
				tok = S.switch4(token.LSS, token.LEQ, '<', token.SHL, token.SHL_ASSIGN)
			}
		case '>':
			tok = S.switch4(token.GTR, token.GEQ, '>', token.SHR, token.SHR_ASSIGN)
		case '=':
			tok = S.switch2(token.ASSIGN, token.EQL)
		case '!':
			tok = S.switch2(token.NOT, token.NEQ)
		case '&':
			if S.ch == '^' {
				S.next()
				tok = S.switch2(token.AND_NOT, token.AND_NOT_ASSIGN)
			} else {
				tok = S.switch3(token.AND, token.AND_ASSIGN, '&', token.LAND)
			}
		case '|':
			tok = S.switch3(token.OR, token.OR_ASSIGN, '|', token.LOR)
		default:
			// any other character: tok remains token.ILLEGAL
			if S.mode&AllowIllegalChars == 0 {
				S.error(offs, fmt.Sprintf("illegal character %#U", ch))
			}
			insertSemi = S.insertSemi // preserve insertSemi info
		}
	}

	// Without the InsertSemis flag S.insertSemi is never set here.
	if S.mode&InsertSemis != 0 {
		S.insertSemi = insertSemi
	}

	// TODO(gri): The scanner API should change such that the literal string
	//            is only valid if an actual literal was scanned. This will
	//            permit a more efficient implementation.
	return S.file.Pos(offs), tok, string(S.src[offs:S.offset])
}

release.r60.3. Except as noted, this content is licensed under a Creative Commons Attribution 3.0 License.