Source file src/strconv/quote.go

Documentation: strconv

     1  // Copyright 2009 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  //go:generate go run makeisprint.go -output isprint.go
     6  
     7  package strconv
     8  
     9  import (
    10  	"internal/bytealg"
    11  	"unicode/utf8"
    12  )
    13  
// lowerhex and upperhex are hexadecimal digit tables: indexing either string
// with a value in the range 0-15 yields the corresponding hex digit.
const (
	lowerhex = "0123456789abcdef" // digits for the \x, \u and \U escapes emitted by the quoting code
	upperhex = "0123456789ABCDEF" // uppercase variant; not referenced by the code visible in this file
)
    18  
    19  func quoteWith(s string, quote byte, ASCIIonly, graphicOnly bool) string {
    20  	return string(appendQuotedWith(make([]byte, 0, 3*len(s)/2), s, quote, ASCIIonly, graphicOnly))
    21  }
    22  
    23  func quoteRuneWith(r rune, quote byte, ASCIIonly, graphicOnly bool) string {
    24  	return string(appendQuotedRuneWith(nil, r, quote, ASCIIonly, graphicOnly))
    25  }
    26  
    27  func appendQuotedWith(buf []byte, s string, quote byte, ASCIIonly, graphicOnly bool) []byte {
    28  	// Often called with big strings, so preallocate. If there's quoting,
    29  	// this is conservative but still helps a lot.
    30  	if cap(buf)-len(buf) < len(s) {
    31  		nBuf := make([]byte, len(buf), len(buf)+1+len(s)+1)
    32  		copy(nBuf, buf)
    33  		buf = nBuf
    34  	}
    35  	buf = append(buf, quote)
    36  	for width := 0; len(s) > 0; s = s[width:] {
    37  		r := rune(s[0])
    38  		width = 1
    39  		if r >= utf8.RuneSelf {
    40  			r, width = utf8.DecodeRuneInString(s)
    41  		}
    42  		if width == 1 && r == utf8.RuneError {
    43  			buf = append(buf, `\x`...)
    44  			buf = append(buf, lowerhex[s[0]>>4])
    45  			buf = append(buf, lowerhex[s[0]&0xF])
    46  			continue
    47  		}
    48  		buf = appendEscapedRune(buf, r, quote, ASCIIonly, graphicOnly)
    49  	}
    50  	buf = append(buf, quote)
    51  	return buf
    52  }
    53  
    54  func appendQuotedRuneWith(buf []byte, r rune, quote byte, ASCIIonly, graphicOnly bool) []byte {
    55  	buf = append(buf, quote)
    56  	if !utf8.ValidRune(r) {
    57  		r = utf8.RuneError
    58  	}
    59  	buf = appendEscapedRune(buf, r, quote, ASCIIonly, graphicOnly)
    60  	buf = append(buf, quote)
    61  	return buf
    62  }
    63  
    64  func appendEscapedRune(buf []byte, r rune, quote byte, ASCIIonly, graphicOnly bool) []byte {
    65  	var runeTmp [utf8.UTFMax]byte
    66  	if r == rune(quote) || r == '\\' { // always backslashed
    67  		buf = append(buf, '\\')
    68  		buf = append(buf, byte(r))
    69  		return buf
    70  	}
    71  	if ASCIIonly {
    72  		if r < utf8.RuneSelf && IsPrint(r) {
    73  			buf = append(buf, byte(r))
    74  			return buf
    75  		}
    76  	} else if IsPrint(r) || graphicOnly && isInGraphicList(r) {
    77  		n := utf8.EncodeRune(runeTmp[:], r)
    78  		buf = append(buf, runeTmp[:n]...)
    79  		return buf
    80  	}
    81  	switch r {
    82  	case '\a':
    83  		buf = append(buf, `\a`...)
    84  	case '\b':
    85  		buf = append(buf, `\b`...)
    86  	case '\f':
    87  		buf = append(buf, `\f`...)
    88  	case '\n':
    89  		buf = append(buf, `\n`...)
    90  	case '\r':
    91  		buf = append(buf, `\r`...)
    92  	case '\t':
    93  		buf = append(buf, `\t`...)
    94  	case '\v':
    95  		buf = append(buf, `\v`...)
    96  	default:
    97  		switch {
    98  		case r < ' ':
    99  			buf = append(buf, `\x`...)
   100  			buf = append(buf, lowerhex[byte(r)>>4])
   101  			buf = append(buf, lowerhex[byte(r)&0xF])
   102  		case r > utf8.MaxRune:
   103  			r = 0xFFFD
   104  			fallthrough
   105  		case r < 0x10000:
   106  			buf = append(buf, `\u`...)
   107  			for s := 12; s >= 0; s -= 4 {
   108  				buf = append(buf, lowerhex[r>>uint(s)&0xF])
   109  			}
   110  		default:
   111  			buf = append(buf, `\U`...)
   112  			for s := 28; s >= 0; s -= 4 {
   113  				buf = append(buf, lowerhex[r>>uint(s)&0xF])
   114  			}
   115  		}
   116  	}
   117  	return buf
   118  }
   119  
   120  // Quote returns a double-quoted Go string literal representing s. The
   121  // returned string uses Go escape sequences (\t, \n, \xFF, \u0100) for
   122  // control characters and non-printable characters as defined by
   123  // IsPrint.
   124  func Quote(s string) string {
   125  	return quoteWith(s, '"', false, false)
   126  }
   127  
   128  // AppendQuote appends a double-quoted Go string literal representing s,
   129  // as generated by Quote, to dst and returns the extended buffer.
   130  func AppendQuote(dst []byte, s string) []byte {
   131  	return appendQuotedWith(dst, s, '"', false, false)
   132  }
   133  
   134  // QuoteToASCII returns a double-quoted Go string literal representing s.
   135  // The returned string uses Go escape sequences (\t, \n, \xFF, \u0100) for
   136  // non-ASCII characters and non-printable characters as defined by IsPrint.
   137  func QuoteToASCII(s string) string {
   138  	return quoteWith(s, '"', true, false)
   139  }
   140  
   141  // AppendQuoteToASCII appends a double-quoted Go string literal representing s,
   142  // as generated by QuoteToASCII, to dst and returns the extended buffer.
   143  func AppendQuoteToASCII(dst []byte, s string) []byte {
   144  	return appendQuotedWith(dst, s, '"', true, false)
   145  }
   146  
   147  // QuoteToGraphic returns a double-quoted Go string literal representing s.
   148  // The returned string uses Go escape sequences (\t, \n, \xFF, \u0100) for
   149  // non-ASCII characters and non-printable characters as defined by IsGraphic.
   150  func QuoteToGraphic(s string) string {
   151  	return quoteWith(s, '"', false, true)
   152  }
   153  
   154  // AppendQuoteToGraphic appends a double-quoted Go string literal representing s,
   155  // as generated by QuoteToGraphic, to dst and returns the extended buffer.
   156  func AppendQuoteToGraphic(dst []byte, s string) []byte {
   157  	return appendQuotedWith(dst, s, '"', false, true)
   158  }
   159  
   160  // QuoteRune returns a single-quoted Go character literal representing the
   161  // rune. The returned string uses Go escape sequences (\t, \n, \xFF, \u0100)
   162  // for control characters and non-printable characters as defined by IsPrint.
   163  func QuoteRune(r rune) string {
   164  	return quoteRuneWith(r, '\'', false, false)
   165  }
   166  
   167  // AppendQuoteRune appends a single-quoted Go character literal representing the rune,
   168  // as generated by QuoteRune, to dst and returns the extended buffer.
   169  func AppendQuoteRune(dst []byte, r rune) []byte {
   170  	return appendQuotedRuneWith(dst, r, '\'', false, false)
   171  }
   172  
   173  // QuoteRuneToASCII returns a single-quoted Go character literal representing
   174  // the rune. The returned string uses Go escape sequences (\t, \n, \xFF,
   175  // \u0100) for non-ASCII characters and non-printable characters as defined
   176  // by IsPrint.
   177  func QuoteRuneToASCII(r rune) string {
   178  	return quoteRuneWith(r, '\'', true, false)
   179  }
   180  
   181  // AppendQuoteRuneToASCII appends a single-quoted Go character literal representing the rune,
   182  // as generated by QuoteRuneToASCII, to dst and returns the extended buffer.
   183  func AppendQuoteRuneToASCII(dst []byte, r rune) []byte {
   184  	return appendQuotedRuneWith(dst, r, '\'', true, false)
   185  }
   186  
   187  // QuoteRuneToGraphic returns a single-quoted Go character literal representing
   188  // the rune. The returned string uses Go escape sequences (\t, \n, \xFF,
   189  // \u0100) for non-ASCII characters and non-printable characters as defined
   190  // by IsGraphic.
   191  func QuoteRuneToGraphic(r rune) string {
   192  	return quoteRuneWith(r, '\'', false, true)
   193  }
   194  
   195  // AppendQuoteRuneToGraphic appends a single-quoted Go character literal representing the rune,
   196  // as generated by QuoteRuneToGraphic, to dst and returns the extended buffer.
   197  func AppendQuoteRuneToGraphic(dst []byte, r rune) []byte {
   198  	return appendQuotedRuneWith(dst, r, '\'', false, true)
   199  }
   200  
   201  // CanBackquote reports whether the string s can be represented
   202  // unchanged as a single-line backquoted string without control
   203  // characters other than tab.
   204  func CanBackquote(s string) bool {
   205  	for len(s) > 0 {
   206  		r, wid := utf8.DecodeRuneInString(s)
   207  		s = s[wid:]
   208  		if wid > 1 {
   209  			if r == '\ufeff' {
   210  				return false // BOMs are invisible and should not be quoted.
   211  			}
   212  			continue // All other multibyte runes are correctly encoded and assumed printable.
   213  		}
   214  		if r == utf8.RuneError {
   215  			return false
   216  		}
   217  		if (r < ' ' && r != '\t') || r == '`' || r == '\u007F' {
   218  			return false
   219  		}
   220  	}
   221  	return true
   222  }
   223  
   224  func unhex(b byte) (v rune, ok bool) {
   225  	c := rune(b)
   226  	switch {
   227  	case '0' <= c && c <= '9':
   228  		return c - '0', true
   229  	case 'a' <= c && c <= 'f':
   230  		return c - 'a' + 10, true
   231  	case 'A' <= c && c <= 'F':
   232  		return c - 'A' + 10, true
   233  	}
   234  	return
   235  }
   236  
   237  // UnquoteChar decodes the first character or byte in the escaped string
   238  // or character literal represented by the string s.
   239  // It returns four values:
   240  //
   241  //	1) value, the decoded Unicode code point or byte value;
   242  //	2) multibyte, a boolean indicating whether the decoded character requires a multibyte UTF-8 representation;
   243  //	3) tail, the remainder of the string after the character; and
   244  //	4) an error that will be nil if the character is syntactically valid.
   245  //
   246  // The second argument, quote, specifies the type of literal being parsed
   247  // and therefore which escaped quote character is permitted.
   248  // If set to a single quote, it permits the sequence \' and disallows unescaped '.
   249  // If set to a double quote, it permits \" and disallows unescaped ".
   250  // If set to zero, it does not permit either escape and allows both quote characters to appear unescaped.
   251  func UnquoteChar(s string, quote byte) (value rune, multibyte bool, tail string, err error) {
   252  	// easy cases
   253  	if len(s) == 0 {
   254  		err = ErrSyntax
   255  		return
   256  	}
   257  	switch c := s[0]; {
   258  	case c == quote && (quote == '\'' || quote == '"'):
   259  		err = ErrSyntax
   260  		return
   261  	case c >= utf8.RuneSelf:
   262  		r, size := utf8.DecodeRuneInString(s)
   263  		return r, true, s[size:], nil
   264  	case c != '\\':
   265  		return rune(s[0]), false, s[1:], nil
   266  	}
   267  
   268  	// hard case: c is backslash
   269  	if len(s) <= 1 {
   270  		err = ErrSyntax
   271  		return
   272  	}
   273  	c := s[1]
   274  	s = s[2:]
   275  
   276  	switch c {
   277  	case 'a':
   278  		value = '\a'
   279  	case 'b':
   280  		value = '\b'
   281  	case 'f':
   282  		value = '\f'
   283  	case 'n':
   284  		value = '\n'
   285  	case 'r':
   286  		value = '\r'
   287  	case 't':
   288  		value = '\t'
   289  	case 'v':
   290  		value = '\v'
   291  	case 'x', 'u', 'U':
   292  		n := 0
   293  		switch c {
   294  		case 'x':
   295  			n = 2
   296  		case 'u':
   297  			n = 4
   298  		case 'U':
   299  			n = 8
   300  		}
   301  		var v rune
   302  		if len(s) < n {
   303  			err = ErrSyntax
   304  			return
   305  		}
   306  		for j := 0; j < n; j++ {
   307  			x, ok := unhex(s[j])
   308  			if !ok {
   309  				err = ErrSyntax
   310  				return
   311  			}
   312  			v = v<<4 | x
   313  		}
   314  		s = s[n:]
   315  		if c == 'x' {
   316  			// single-byte string, possibly not UTF-8
   317  			value = v
   318  			break
   319  		}
   320  		if v > utf8.MaxRune {
   321  			err = ErrSyntax
   322  			return
   323  		}
   324  		value = v
   325  		multibyte = true
   326  	case '0', '1', '2', '3', '4', '5', '6', '7':
   327  		v := rune(c) - '0'
   328  		if len(s) < 2 {
   329  			err = ErrSyntax
   330  			return
   331  		}
   332  		for j := 0; j < 2; j++ { // one digit already; two more
   333  			x := rune(s[j]) - '0'
   334  			if x < 0 || x > 7 {
   335  				err = ErrSyntax
   336  				return
   337  			}
   338  			v = (v << 3) | x
   339  		}
   340  		s = s[2:]
   341  		if v > 255 {
   342  			err = ErrSyntax
   343  			return
   344  		}
   345  		value = v
   346  	case '\\':
   347  		value = '\\'
   348  	case '\'', '"':
   349  		if c != quote {
   350  			err = ErrSyntax
   351  			return
   352  		}
   353  		value = rune(c)
   354  	default:
   355  		err = ErrSyntax
   356  		return
   357  	}
   358  	tail = s
   359  	return
   360  }
   361  
   362  // Unquote interprets s as a single-quoted, double-quoted,
   363  // or backquoted Go string literal, returning the string value
   364  // that s quotes.  (If s is single-quoted, it would be a Go
   365  // character literal; Unquote returns the corresponding
   366  // one-character string.)
   367  func Unquote(s string) (string, error) {
   368  	n := len(s)
   369  	if n < 2 {
   370  		return "", ErrSyntax
   371  	}
   372  	quote := s[0]
   373  	if quote != s[n-1] {
   374  		return "", ErrSyntax
   375  	}
   376  	s = s[1 : n-1]
   377  
   378  	if quote == '`' {
   379  		if contains(s, '`') {
   380  			return "", ErrSyntax
   381  		}
   382  		if contains(s, '\r') {
   383  			// -1 because we know there is at least one \r to remove.
   384  			buf := make([]byte, 0, len(s)-1)
   385  			for i := 0; i < len(s); i++ {
   386  				if s[i] != '\r' {
   387  					buf = append(buf, s[i])
   388  				}
   389  			}
   390  			return string(buf), nil
   391  		}
   392  		return s, nil
   393  	}
   394  	if quote != '"' && quote != '\'' {
   395  		return "", ErrSyntax
   396  	}
   397  	if contains(s, '\n') {
   398  		return "", ErrSyntax
   399  	}
   400  
   401  	// Is it trivial? Avoid allocation.
   402  	if !contains(s, '\\') && !contains(s, quote) {
   403  		switch quote {
   404  		case '"':
   405  			if utf8.ValidString(s) {
   406  				return s, nil
   407  			}
   408  		case '\'':
   409  			r, size := utf8.DecodeRuneInString(s)
   410  			if size == len(s) && (r != utf8.RuneError || size != 1) {
   411  				return s, nil
   412  			}
   413  		}
   414  	}
   415  
   416  	var runeTmp [utf8.UTFMax]byte
   417  	buf := make([]byte, 0, 3*len(s)/2) // Try to avoid more allocations.
   418  	for len(s) > 0 {
   419  		c, multibyte, ss, err := UnquoteChar(s, quote)
   420  		if err != nil {
   421  			return "", err
   422  		}
   423  		s = ss
   424  		if c < utf8.RuneSelf || !multibyte {
   425  			buf = append(buf, byte(c))
   426  		} else {
   427  			n := utf8.EncodeRune(runeTmp[:], c)
   428  			buf = append(buf, runeTmp[:n]...)
   429  		}
   430  		if quote == '\'' && len(s) != 0 {
   431  			// single-quoted must be single character
   432  			return "", ErrSyntax
   433  		}
   434  	}
   435  	return string(buf), nil
   436  }
   437  
   438  // contains reports whether the string contains the byte c.
   439  func contains(s string, c byte) bool {
   440  	return bytealg.IndexByteString(s, c) != -1
   441  }
   442  
   443  // bsearch16 returns the smallest i such that a[i] >= x.
   444  // If there is no such i, bsearch16 returns len(a).
   445  func bsearch16(a []uint16, x uint16) int {
   446  	i, j := 0, len(a)
   447  	for i < j {
   448  		h := i + (j-i)/2
   449  		if a[h] < x {
   450  			i = h + 1
   451  		} else {
   452  			j = h
   453  		}
   454  	}
   455  	return i
   456  }
   457  
   458  // bsearch32 returns the smallest i such that a[i] >= x.
   459  // If there is no such i, bsearch32 returns len(a).
   460  func bsearch32(a []uint32, x uint32) int {
   461  	i, j := 0, len(a)
   462  	for i < j {
   463  		h := i + (j-i)/2
   464  		if a[h] < x {
   465  			i = h + 1
   466  		} else {
   467  			j = h
   468  		}
   469  	}
   470  	return i
   471  }
   472  
   473  // TODO: IsPrint is a local implementation of unicode.IsPrint, verified by the tests
   474  // to give the same answer. It allows this package not to depend on unicode,
   475  // and therefore not pull in all the Unicode tables. If the linker were better
   476  // at tossing unused tables, we could get rid of this implementation.
   477  // That would be nice.
   478  
   479  // IsPrint reports whether the rune is defined as printable by Go, with
   480  // the same definition as unicode.IsPrint: letters, numbers, punctuation,
   481  // symbols and ASCII space.
   482  func IsPrint(r rune) bool {
   483  	// Fast check for Latin-1
   484  	if r <= 0xFF {
   485  		if 0x20 <= r && r <= 0x7E {
   486  			// All the ASCII is printable from space through DEL-1.
   487  			return true
   488  		}
   489  		if 0xA1 <= r && r <= 0xFF {
   490  			// Similarly for ¡ through ÿ...
   491  			return r != 0xAD // ...except for the bizarre soft hyphen.
   492  		}
   493  		return false
   494  	}
   495  
   496  	// Same algorithm, either on uint16 or uint32 value.
   497  	// First, find first i such that isPrint[i] >= x.
   498  	// This is the index of either the start or end of a pair that might span x.
   499  	// The start is even (isPrint[i&^1]) and the end is odd (isPrint[i|1]).
   500  	// If we find x in a range, make sure x is not in isNotPrint list.
   501  
   502  	if 0 <= r && r < 1<<16 {
   503  		rr, isPrint, isNotPrint := uint16(r), isPrint16, isNotPrint16
   504  		i := bsearch16(isPrint, rr)
   505  		if i >= len(isPrint) || rr < isPrint[i&^1] || isPrint[i|1] < rr {
   506  			return false
   507  		}
   508  		j := bsearch16(isNotPrint, rr)
   509  		return j >= len(isNotPrint) || isNotPrint[j] != rr
   510  	}
   511  
   512  	rr, isPrint, isNotPrint := uint32(r), isPrint32, isNotPrint32
   513  	i := bsearch32(isPrint, rr)
   514  	if i >= len(isPrint) || rr < isPrint[i&^1] || isPrint[i|1] < rr {
   515  		return false
   516  	}
   517  	if r >= 0x20000 {
   518  		return true
   519  	}
   520  	r -= 0x10000
   521  	j := bsearch16(isNotPrint, uint16(r))
   522  	return j >= len(isNotPrint) || isNotPrint[j] != uint16(r)
   523  }
   524  
   525  // IsGraphic reports whether the rune is defined as a Graphic by Unicode. Such
   526  // characters include letters, marks, numbers, punctuation, symbols, and
   527  // spaces, from categories L, M, N, P, S, and Zs.
   528  func IsGraphic(r rune) bool {
   529  	if IsPrint(r) {
   530  		return true
   531  	}
   532  	return isInGraphicList(r)
   533  }
   534  
   535  // isInGraphicList reports whether the rune is in the isGraphic list. This separation
   536  // from IsGraphic allows quoteWith to avoid two calls to IsPrint.
   537  // Should be called only if IsPrint fails.
   538  func isInGraphicList(r rune) bool {
   539  	// We know r must fit in 16 bits - see makeisprint.go.
   540  	if r > 0xFFFF {
   541  		return false
   542  	}
   543  	rr := uint16(r)
   544  	i := bsearch16(isGraphic, rr)
   545  	return i < len(isGraphic) && rr == isGraphic[i]
   546  }
   547  

View as plain text