1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21 package scanner
22
23 import (
24 "bytes"
25 "fmt"
26 "go/token"
27 "path/filepath"
28 "strconv"
29 "unicode"
30 "utf8"
31 )
32
33
34
35
36
37 type Scanner struct {
38
39 file *token.File
40 dir string
41 src []byte
42 err ErrorHandler
43 mode uint
44
45
46 ch int
47 offset int
48 rdOffset int
49 lineOffset int
50 insertSemi bool
51
52
53 ErrorCount int
54 }
55
56
57
58
59 func (S *Scanner) next() {
60 if S.rdOffset < len(S.src) {
61 S.offset = S.rdOffset
62 if S.ch == '\n' {
63 S.lineOffset = S.offset
64 S.file.AddLine(S.offset)
65 }
66 r, w := int(S.src[S.rdOffset]), 1
67 switch {
68 case r == 0:
69 S.error(S.offset, "illegal character NUL")
70 case r >= 0x80:
71
72 r, w = utf8.DecodeRune(S.src[S.rdOffset:])
73 if r == utf8.RuneError && w == 1 {
74 S.error(S.offset, "illegal UTF-8 encoding")
75 }
76 }
77 S.rdOffset += w
78 S.ch = r
79 } else {
80 S.offset = len(S.src)
81 if S.ch == '\n' {
82 S.lineOffset = S.offset
83 S.file.AddLine(S.offset)
84 }
85 S.ch = -1
86 }
87 }
88
89
90
91
// Mode bits accepted by Init. They are distinct bits and may be or'ed
// together.
const (
	ScanComments      = 1 << 0 // Scan returns comments as token.COMMENT instead of skipping them
	AllowIllegalChars = 1 << 1 // Scan does not report an error for illegal characters
	InsertSemis       = 1 << 2 // Scan inserts semicolons at line ends and EOF where applicable
)
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113 func (S *Scanner) Init(file *token.File, src []byte, err ErrorHandler, mode uint) {
114
115 if file.Size() != len(src) {
116 panic("file size does not match src len")
117 }
118 S.file = file
119 S.dir, _ = filepath.Split(file.Name())
120 S.src = src
121 S.err = err
122 S.mode = mode
123
124 S.ch = ' '
125 S.offset = 0
126 S.rdOffset = 0
127 S.lineOffset = 0
128 S.insertSemi = false
129 S.ErrorCount = 0
130
131 S.next()
132 }
133
134 func (S *Scanner) error(offs int, msg string) {
135 if S.err != nil {
136 S.err.Error(S.file.Position(S.file.Pos(offs)), msg)
137 }
138 S.ErrorCount++
139 }
140
141 var prefix = []byte("//line ")
142
143 func (S *Scanner) interpretLineComment(text []byte) {
144 if bytes.HasPrefix(text, prefix) {
145
146 if i := bytes.LastIndex(text, []byte{':'}); i > 0 {
147 if line, err := strconv.Atoi(string(text[i+1:])); err == nil && line > 0 {
148
149 filename := filepath.Clean(string(text[len(prefix):i]))
150 if !filepath.IsAbs(filename) {
151
152 filename = filepath.Join(S.dir, filename)
153 }
154
155 S.file.AddLineInfo(S.lineOffset, filename, line-1)
156 }
157 }
158 }
159 }
160
161 func (S *Scanner) scanComment() {
162
163 offs := S.offset - 1
164
165 if S.ch == '/' {
166
167 S.next()
168 for S.ch != '\n' && S.ch >= 0 {
169 S.next()
170 }
171 if offs == S.lineOffset {
172
173 S.interpretLineComment(S.src[offs:S.offset])
174 }
175 return
176 }
177
178
179 S.next()
180 for S.ch >= 0 {
181 ch := S.ch
182 S.next()
183 if ch == '*' && S.ch == '/' {
184 S.next()
185 return
186 }
187 }
188
189 S.error(offs, "comment not terminated")
190 }
191
192 func (S *Scanner) findLineEnd() bool {
193
194
195 defer func(offs int) {
196
197 S.ch = '/'
198 S.offset = offs
199 S.rdOffset = offs + 1
200 S.next()
201 }(S.offset - 1)
202
203
204 for S.ch == '/' || S.ch == '*' {
205 if S.ch == '/' {
206
207 return true
208 }
209
210 S.next()
211 for S.ch >= 0 {
212 ch := S.ch
213 if ch == '\n' {
214 return true
215 }
216 S.next()
217 if ch == '*' && S.ch == '/' {
218 S.next()
219 break
220 }
221 }
222 S.skipWhitespace()
223 if S.ch < 0 || S.ch == '\n' {
224 return true
225 }
226 if S.ch != '/' {
227
228 return false
229 }
230 S.next()
231 }
232
233 return false
234 }
235
236 func isLetter(ch int) bool {
237 return 'a' <= ch && ch <= 'z' || 'A' <= ch && ch <= 'Z' || ch == '_' || ch >= 0x80 && unicode.IsLetter(ch)
238 }
239
240 func isDigit(ch int) bool {
241 return '0' <= ch && ch <= '9' || ch >= 0x80 && unicode.IsDigit(ch)
242 }
243
244 func (S *Scanner) scanIdentifier() token.Token {
245 offs := S.offset
246 for isLetter(S.ch) || isDigit(S.ch) {
247 S.next()
248 }
249 return token.Lookup(S.src[offs:S.offset])
250 }
251
// digitVal returns the numeric value of the hexadecimal digit ch
// (0-9, a-f or A-F). For any other character it returns 16, which is
// larger than every legal digit value of the bases used by the scanner.
func digitVal(ch int) int {
	if '0' <= ch && ch <= '9' {
		return ch - '0'
	}
	if 'a' <= ch && ch <= 'f' {
		return 10 + ch - 'a'
	}
	if 'A' <= ch && ch <= 'F' {
		return 10 + ch - 'A'
	}
	return 16 // not a digit
}
263
264 func (S *Scanner) scanMantissa(base int) {
265 for digitVal(S.ch) < base {
266 S.next()
267 }
268 }
269
270 func (S *Scanner) scanNumber(seenDecimalPoint bool) token.Token {
271
272 tok := token.INT
273
274 if seenDecimalPoint {
275 tok = token.FLOAT
276 S.scanMantissa(10)
277 goto exponent
278 }
279
280 if S.ch == '0' {
281
282 offs := S.offset
283 S.next()
284 if S.ch == 'x' || S.ch == 'X' {
285
286 S.next()
287 S.scanMantissa(16)
288 if S.offset-offs <= 2 {
289
290 S.error(offs, "illegal hexadecimal number")
291 }
292 } else {
293
294 seenDecimalDigit := false
295 S.scanMantissa(8)
296 if S.ch == '8' || S.ch == '9' {
297
298 seenDecimalDigit = true
299 S.scanMantissa(10)
300 }
301 if S.ch == '.' || S.ch == 'e' || S.ch == 'E' || S.ch == 'i' {
302 goto fraction
303 }
304
305 if seenDecimalDigit {
306 S.error(offs, "illegal octal number")
307 }
308 }
309 goto exit
310 }
311
312
313 S.scanMantissa(10)
314
315 fraction:
316 if S.ch == '.' {
317 tok = token.FLOAT
318 S.next()
319 S.scanMantissa(10)
320 }
321
322 exponent:
323 if S.ch == 'e' || S.ch == 'E' {
324 tok = token.FLOAT
325 S.next()
326 if S.ch == '-' || S.ch == '+' {
327 S.next()
328 }
329 S.scanMantissa(10)
330 }
331
332 if S.ch == 'i' {
333 tok = token.IMAG
334 S.next()
335 }
336
337 exit:
338 return tok
339 }
340
341 func (S *Scanner) scanEscape(quote int) {
342 offs := S.offset
343
344 var i, base, max uint32
345 switch S.ch {
346 case 'a', 'b', 'f', 'n', 'r', 't', 'v', '\\', quote:
347 S.next()
348 return
349 case '0', '1', '2', '3', '4', '5', '6', '7':
350 i, base, max = 3, 8, 255
351 case 'x':
352 S.next()
353 i, base, max = 2, 16, 255
354 case 'u':
355 S.next()
356 i, base, max = 4, 16, unicode.MaxRune
357 case 'U':
358 S.next()
359 i, base, max = 8, 16, unicode.MaxRune
360 default:
361 S.next()
362 S.error(offs, "unknown escape sequence")
363 return
364 }
365
366 var x uint32
367 for ; i > 0 && S.ch != quote && S.ch >= 0; i-- {
368 d := uint32(digitVal(S.ch))
369 if d >= base {
370 S.error(S.offset, "illegal character in escape sequence")
371 break
372 }
373 x = x*base + d
374 S.next()
375 }
376
377 for ; i > 0 && S.ch != quote && S.ch >= 0; i-- {
378 S.next()
379 }
380 if x > max || 0xd800 <= x && x < 0xe000 {
381 S.error(offs, "escape sequence is invalid Unicode code point")
382 }
383 }
384
385 func (S *Scanner) scanChar() {
386
387 offs := S.offset - 1
388
389 n := 0
390 for S.ch != '\'' {
391 ch := S.ch
392 n++
393 S.next()
394 if ch == '\n' || ch < 0 {
395 S.error(offs, "character literal not terminated")
396 n = 1
397 break
398 }
399 if ch == '\\' {
400 S.scanEscape('\'')
401 }
402 }
403
404 S.next()
405
406 if n != 1 {
407 S.error(offs, "illegal character literal")
408 }
409 }
410
411 func (S *Scanner) scanString() {
412
413 offs := S.offset - 1
414
415 for S.ch != '"' {
416 ch := S.ch
417 S.next()
418 if ch == '\n' || ch < 0 {
419 S.error(offs, "string not terminated")
420 break
421 }
422 if ch == '\\' {
423 S.scanEscape('"')
424 }
425 }
426
427 S.next()
428 }
429
430 func (S *Scanner) scanRawString() {
431
432 offs := S.offset - 1
433
434 for S.ch != '`' {
435 ch := S.ch
436 S.next()
437 if ch < 0 {
438 S.error(offs, "string not terminated")
439 break
440 }
441 }
442
443 S.next()
444 }
445
446 func (S *Scanner) skipWhitespace() {
447 for S.ch == ' ' || S.ch == '\t' || S.ch == '\n' && !S.insertSemi || S.ch == '\r' {
448 S.next()
449 }
450 }
451
452
453
454
455
456
457
458 func (S *Scanner) switch2(tok0, tok1 token.Token) token.Token {
459 if S.ch == '=' {
460 S.next()
461 return tok1
462 }
463 return tok0
464 }
465
466 func (S *Scanner) switch3(tok0, tok1 token.Token, ch2 int, tok2 token.Token) token.Token {
467 if S.ch == '=' {
468 S.next()
469 return tok1
470 }
471 if S.ch == ch2 {
472 S.next()
473 return tok2
474 }
475 return tok0
476 }
477
478 func (S *Scanner) switch4(tok0, tok1 token.Token, ch2 int, tok2, tok3 token.Token) token.Token {
479 if S.ch == '=' {
480 S.next()
481 return tok1
482 }
483 if S.ch == ch2 {
484 S.next()
485 if S.ch == '=' {
486 S.next()
487 return tok3
488 }
489 return tok2
490 }
491 return tok0
492 }
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514 func (S *Scanner) Scan() (token.Pos, token.Token, string) {
515 scanAgain:
516 S.skipWhitespace()
517
518
519 insertSemi := false
520 offs := S.offset
521 tok := token.ILLEGAL
522
523
524 switch ch := S.ch; {
525 case isLetter(ch):
526 tok = S.scanIdentifier()
527 switch tok {
528 case token.IDENT, token.BREAK, token.CONTINUE, token.FALLTHROUGH, token.RETURN:
529 insertSemi = true
530 }
531 case digitVal(ch) < 10:
532 insertSemi = true
533 tok = S.scanNumber(false)
534 default:
535 S.next()
536 switch ch {
537 case -1:
538 if S.insertSemi {
539 S.insertSemi = false
540 return S.file.Pos(offs), token.SEMICOLON, "\n"
541 }
542 tok = token.EOF
543 case '\n':
544
545
546
547 S.insertSemi = false
548 return S.file.Pos(offs), token.SEMICOLON, "\n"
549 case '"':
550 insertSemi = true
551 tok = token.STRING
552 S.scanString()
553 case '\'':
554 insertSemi = true
555 tok = token.CHAR
556 S.scanChar()
557 case '`':
558 insertSemi = true
559 tok = token.STRING
560 S.scanRawString()
561 case ':':
562 tok = S.switch2(token.COLON, token.DEFINE)
563 case '.':
564 if digitVal(S.ch) < 10 {
565 insertSemi = true
566 tok = S.scanNumber(true)
567 } else if S.ch == '.' {
568 S.next()
569 if S.ch == '.' {
570 S.next()
571 tok = token.ELLIPSIS
572 }
573 } else {
574 tok = token.PERIOD
575 }
576 case ',':
577 tok = token.COMMA
578 case ';':
579 tok = token.SEMICOLON
580 case '(':
581 tok = token.LPAREN
582 case ')':
583 insertSemi = true
584 tok = token.RPAREN
585 case '[':
586 tok = token.LBRACK
587 case ']':
588 insertSemi = true
589 tok = token.RBRACK
590 case '{':
591 tok = token.LBRACE
592 case '}':
593 insertSemi = true
594 tok = token.RBRACE
595 case '+':
596 tok = S.switch3(token.ADD, token.ADD_ASSIGN, '+', token.INC)
597 if tok == token.INC {
598 insertSemi = true
599 }
600 case '-':
601 tok = S.switch3(token.SUB, token.SUB_ASSIGN, '-', token.DEC)
602 if tok == token.DEC {
603 insertSemi = true
604 }
605 case '*':
606 tok = S.switch2(token.MUL, token.MUL_ASSIGN)
607 case '/':
608 if S.ch == '/' || S.ch == '*' {
609
610 if S.insertSemi && S.findLineEnd() {
611
612 S.ch = '/'
613 S.offset = offs
614 S.rdOffset = offs + 1
615 S.insertSemi = false
616 return S.file.Pos(offs), token.SEMICOLON, "\n"
617 }
618 S.scanComment()
619 if S.mode&ScanComments == 0 {
620
621 S.insertSemi = false
622 goto scanAgain
623 }
624 tok = token.COMMENT
625 } else {
626 tok = S.switch2(token.QUO, token.QUO_ASSIGN)
627 }
628 case '%':
629 tok = S.switch2(token.REM, token.REM_ASSIGN)
630 case '^':
631 tok = S.switch2(token.XOR, token.XOR_ASSIGN)
632 case '<':
633 if S.ch == '-' {
634 S.next()
635 tok = token.ARROW
636 } else {
637 tok = S.switch4(token.LSS, token.LEQ, '<', token.SHL, token.SHL_ASSIGN)
638 }
639 case '>':
640 tok = S.switch4(token.GTR, token.GEQ, '>', token.SHR, token.SHR_ASSIGN)
641 case '=':
642 tok = S.switch2(token.ASSIGN, token.EQL)
643 case '!':
644 tok = S.switch2(token.NOT, token.NEQ)
645 case '&':
646 if S.ch == '^' {
647 S.next()
648 tok = S.switch2(token.AND_NOT, token.AND_NOT_ASSIGN)
649 } else {
650 tok = S.switch3(token.AND, token.AND_ASSIGN, '&', token.LAND)
651 }
652 case '|':
653 tok = S.switch3(token.OR, token.OR_ASSIGN, '|', token.LOR)
654 default:
655 if S.mode&AllowIllegalChars == 0 {
656 S.error(offs, fmt.Sprintf("illegal character %#U", ch))
657 }
658 insertSemi = S.insertSemi
659 }
660 }
661
662 if S.mode&InsertSemis != 0 {
663 S.insertSemi = insertSemi
664 }
665
666
667
668
669 return S.file.Pos(offs), tok, string(S.src[offs:S.offset])
670 }