Black Lives Matter. Support the Equal Justice Initiative.

Source file src/strconv/quote.go

Documentation: strconv

     1  // Copyright 2009 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  //go:generate go run makeisprint.go -output isprint.go
     6  
     7  package strconv
     8  
     9  import (
    10  	"internal/bytealg"
    11  	"unicode/utf8"
    12  )
    13  
const (
	// lowerhex is indexed by a 4-bit value to obtain the matching lowercase
	// hexadecimal digit; the escape writers below use it for \x, \u and \U.
	lowerhex = "0123456789abcdef"
	// upperhex is the uppercase counterpart of lowerhex. It is not
	// referenced in the part of the file visible here.
	upperhex = "0123456789ABCDEF"
)
    18  
    19  func quoteWith(s string, quote byte, ASCIIonly, graphicOnly bool) string {
    20  	return string(appendQuotedWith(make([]byte, 0, 3*len(s)/2), s, quote, ASCIIonly, graphicOnly))
    21  }
    22  
    23  func quoteRuneWith(r rune, quote byte, ASCIIonly, graphicOnly bool) string {
    24  	return string(appendQuotedRuneWith(nil, r, quote, ASCIIonly, graphicOnly))
    25  }
    26  
    27  func appendQuotedWith(buf []byte, s string, quote byte, ASCIIonly, graphicOnly bool) []byte {
    28  	// Often called with big strings, so preallocate. If there's quoting,
    29  	// this is conservative but still helps a lot.
    30  	if cap(buf)-len(buf) < len(s) {
    31  		nBuf := make([]byte, len(buf), len(buf)+1+len(s)+1)
    32  		copy(nBuf, buf)
    33  		buf = nBuf
    34  	}
    35  	buf = append(buf, quote)
    36  	for width := 0; len(s) > 0; s = s[width:] {
    37  		r := rune(s[0])
    38  		width = 1
    39  		if r >= utf8.RuneSelf {
    40  			r, width = utf8.DecodeRuneInString(s)
    41  		}
    42  		if width == 1 && r == utf8.RuneError {
    43  			buf = append(buf, `\x`...)
    44  			buf = append(buf, lowerhex[s[0]>>4])
    45  			buf = append(buf, lowerhex[s[0]&0xF])
    46  			continue
    47  		}
    48  		buf = appendEscapedRune(buf, r, quote, ASCIIonly, graphicOnly)
    49  	}
    50  	buf = append(buf, quote)
    51  	return buf
    52  }
    53  
    54  func appendQuotedRuneWith(buf []byte, r rune, quote byte, ASCIIonly, graphicOnly bool) []byte {
    55  	buf = append(buf, quote)
    56  	if !utf8.ValidRune(r) {
    57  		r = utf8.RuneError
    58  	}
    59  	buf = appendEscapedRune(buf, r, quote, ASCIIonly, graphicOnly)
    60  	buf = append(buf, quote)
    61  	return buf
    62  }
    63  
    64  func appendEscapedRune(buf []byte, r rune, quote byte, ASCIIonly, graphicOnly bool) []byte {
    65  	var runeTmp [utf8.UTFMax]byte
    66  	if r == rune(quote) || r == '\\' { // always backslashed
    67  		buf = append(buf, '\\')
    68  		buf = append(buf, byte(r))
    69  		return buf
    70  	}
    71  	if ASCIIonly {
    72  		if r < utf8.RuneSelf && IsPrint(r) {
    73  			buf = append(buf, byte(r))
    74  			return buf
    75  		}
    76  	} else if IsPrint(r) || graphicOnly && isInGraphicList(r) {
    77  		n := utf8.EncodeRune(runeTmp[:], r)
    78  		buf = append(buf, runeTmp[:n]...)
    79  		return buf
    80  	}
    81  	switch r {
    82  	case '\a':
    83  		buf = append(buf, `\a`...)
    84  	case '\b':
    85  		buf = append(buf, `\b`...)
    86  	case '\f':
    87  		buf = append(buf, `\f`...)
    88  	case '\n':
    89  		buf = append(buf, `\n`...)
    90  	case '\r':
    91  		buf = append(buf, `\r`...)
    92  	case '\t':
    93  		buf = append(buf, `\t`...)
    94  	case '\v':
    95  		buf = append(buf, `\v`...)
    96  	default:
    97  		switch {
    98  		case r < ' ':
    99  			buf = append(buf, `\x`...)
   100  			buf = append(buf, lowerhex[byte(r)>>4])
   101  			buf = append(buf, lowerhex[byte(r)&0xF])
   102  		case r > utf8.MaxRune:
   103  			r = 0xFFFD
   104  			fallthrough
   105  		case r < 0x10000:
   106  			buf = append(buf, `\u`...)
   107  			for s := 12; s >= 0; s -= 4 {
   108  				buf = append(buf, lowerhex[r>>uint(s)&0xF])
   109  			}
   110  		default:
   111  			buf = append(buf, `\U`...)
   112  			for s := 28; s >= 0; s -= 4 {
   113  				buf = append(buf, lowerhex[r>>uint(s)&0xF])
   114  			}
   115  		}
   116  	}
   117  	return buf
   118  }
   119  
// Quote returns a double-quoted Go string literal representing s. The
// returned string uses Go escape sequences (\t, \n, \xFF, \u0100) for
// control characters and non-printable characters as defined by
// IsPrint.
//
// It is quoteWith with '"' as the delimiter and both the ASCII-only and
// graphic-only modes switched off.
func Quote(s string) string {
	return quoteWith(s, '"', false, false)
}
   127  
// AppendQuote appends a double-quoted Go string literal representing s,
// as generated by Quote, to dst and returns the extended buffer.
//
// The result must be used in place of dst: appendQuotedWith may allocate a
// new backing array when dst lacks spare capacity.
func AppendQuote(dst []byte, s string) []byte {
	return appendQuotedWith(dst, s, '"', false, false)
}
   133  
// QuoteToASCII returns a double-quoted Go string literal representing s.
// The returned string uses Go escape sequences (\t, \n, \xFF, \u0100) for
// non-ASCII characters and non-printable characters as defined by IsPrint.
//
// It is quoteWith with '"' as the delimiter and the ASCII-only mode
// switched on.
func QuoteToASCII(s string) string {
	return quoteWith(s, '"', true, false)
}
   140  
// AppendQuoteToASCII appends a double-quoted Go string literal representing s,
// as generated by QuoteToASCII, to dst and returns the extended buffer.
//
// The result must be used in place of dst: appendQuotedWith may allocate a
// new backing array when dst lacks spare capacity.
func AppendQuoteToASCII(dst []byte, s string) []byte {
	return appendQuotedWith(dst, s, '"', true, false)
}
   146  
// QuoteToGraphic returns a double-quoted Go string literal representing s.
// The returned string leaves Unicode graphic characters, as defined by
// IsGraphic, unchanged and uses Go escape sequences (\t, \n, \xFF, \u0100)
// for non-graphic characters.
//
// It is quoteWith with '"' as the delimiter and the graphic-only mode
// switched on.
func QuoteToGraphic(s string) string {
	return quoteWith(s, '"', false, true)
}
   154  
// AppendQuoteToGraphic appends a double-quoted Go string literal representing s,
// as generated by QuoteToGraphic, to dst and returns the extended buffer.
//
// The result must be used in place of dst: appendQuotedWith may allocate a
// new backing array when dst lacks spare capacity.
func AppendQuoteToGraphic(dst []byte, s string) []byte {
	return appendQuotedWith(dst, s, '"', false, true)
}
   160  
// QuoteRune returns a single-quoted Go character literal representing the
// rune. The returned string uses Go escape sequences (\t, \n, \xFF, \u0100)
// for control characters and non-printable characters as defined by IsPrint.
//
// It is quoteRuneWith with '\'' as the delimiter and both the ASCII-only
// and graphic-only modes switched off.
func QuoteRune(r rune) string {
	return quoteRuneWith(r, '\'', false, false)
}
   167  
// AppendQuoteRune appends a single-quoted Go character literal representing the rune,
// as generated by QuoteRune, to dst and returns the extended buffer.
//
// The result must be used in place of dst, as with the built-in append.
func AppendQuoteRune(dst []byte, r rune) []byte {
	return appendQuotedRuneWith(dst, r, '\'', false, false)
}
   173  
// QuoteRuneToASCII returns a single-quoted Go character literal representing
// the rune. The returned string uses Go escape sequences (\t, \n, \xFF,
// \u0100) for non-ASCII characters and non-printable characters as defined
// by IsPrint.
//
// It is quoteRuneWith with '\'' as the delimiter and the ASCII-only mode
// switched on.
func QuoteRuneToASCII(r rune) string {
	return quoteRuneWith(r, '\'', true, false)
}
   181  
// AppendQuoteRuneToASCII appends a single-quoted Go character literal representing the rune,
// as generated by QuoteRuneToASCII, to dst and returns the extended buffer.
//
// The result must be used in place of dst, as with the built-in append.
func AppendQuoteRuneToASCII(dst []byte, r rune) []byte {
	return appendQuotedRuneWith(dst, r, '\'', true, false)
}
   187  
// QuoteRuneToGraphic returns a single-quoted Go character literal representing
// the rune. If the rune is not a Unicode graphic character,
// as defined by IsGraphic, the returned string will use a Go escape sequence
// (\t, \n, \xFF, \u0100).
//
// It is quoteRuneWith with '\'' as the delimiter and the graphic-only mode
// switched on.
func QuoteRuneToGraphic(r rune) string {
	return quoteRuneWith(r, '\'', false, true)
}
   195  
// AppendQuoteRuneToGraphic appends a single-quoted Go character literal representing the rune,
// as generated by QuoteRuneToGraphic, to dst and returns the extended buffer.
//
// The result must be used in place of dst, as with the built-in append.
func AppendQuoteRuneToGraphic(dst []byte, r rune) []byte {
	return appendQuotedRuneWith(dst, r, '\'', false, true)
}
   201  
   202  // CanBackquote reports whether the string s can be represented
   203  // unchanged as a single-line backquoted string without control
   204  // characters other than tab.
   205  func CanBackquote(s string) bool {
   206  	for len(s) > 0 {
   207  		r, wid := utf8.DecodeRuneInString(s)
   208  		s = s[wid:]
   209  		if wid > 1 {
   210  			if r == '\ufeff' {
   211  				return false // BOMs are invisible and should not be quoted.
   212  			}
   213  			continue // All other multibyte runes are correctly encoded and assumed printable.
   214  		}
   215  		if r == utf8.RuneError {
   216  			return false
   217  		}
   218  		if (r < ' ' && r != '\t') || r == '`' || r == '\u007F' {
   219  			return false
   220  		}
   221  	}
   222  	return true
   223  }
   224  
   225  func unhex(b byte) (v rune, ok bool) {
   226  	c := rune(b)
   227  	switch {
   228  	case '0' <= c && c <= '9':
   229  		return c - '0', true
   230  	case 'a' <= c && c <= 'f':
   231  		return c - 'a' + 10, true
   232  	case 'A' <= c && c <= 'F':
   233  		return c - 'A' + 10, true
   234  	}
   235  	return
   236  }
   237  
   238  // UnquoteChar decodes the first character or byte in the escaped string
   239  // or character literal represented by the string s.
   240  // It returns four values:
   241  //
   242  //	1) value, the decoded Unicode code point or byte value;
   243  //	2) multibyte, a boolean indicating whether the decoded character requires a multibyte UTF-8 representation;
   244  //	3) tail, the remainder of the string after the character; and
   245  //	4) an error that will be nil if the character is syntactically valid.
   246  //
   247  // The second argument, quote, specifies the type of literal being parsed
   248  // and therefore which escaped quote character is permitted.
   249  // If set to a single quote, it permits the sequence \' and disallows unescaped '.
   250  // If set to a double quote, it permits \" and disallows unescaped ".
   251  // If set to zero, it does not permit either escape and allows both quote characters to appear unescaped.
   252  func UnquoteChar(s string, quote byte) (value rune, multibyte bool, tail string, err error) {
   253  	// easy cases
   254  	if len(s) == 0 {
   255  		err = ErrSyntax
   256  		return
   257  	}
   258  	switch c := s[0]; {
   259  	case c == quote && (quote == '\'' || quote == '"'):
   260  		err = ErrSyntax
   261  		return
   262  	case c >= utf8.RuneSelf:
   263  		r, size := utf8.DecodeRuneInString(s)
   264  		return r, true, s[size:], nil
   265  	case c != '\\':
   266  		return rune(s[0]), false, s[1:], nil
   267  	}
   268  
   269  	// hard case: c is backslash
   270  	if len(s) <= 1 {
   271  		err = ErrSyntax
   272  		return
   273  	}
   274  	c := s[1]
   275  	s = s[2:]
   276  
   277  	switch c {
   278  	case 'a':
   279  		value = '\a'
   280  	case 'b':
   281  		value = '\b'
   282  	case 'f':
   283  		value = '\f'
   284  	case 'n':
   285  		value = '\n'
   286  	case 'r':
   287  		value = '\r'
   288  	case 't':
   289  		value = '\t'
   290  	case 'v':
   291  		value = '\v'
   292  	case 'x', 'u', 'U':
   293  		n := 0
   294  		switch c {
   295  		case 'x':
   296  			n = 2
   297  		case 'u':
   298  			n = 4
   299  		case 'U':
   300  			n = 8
   301  		}
   302  		var v rune
   303  		if len(s) < n {
   304  			err = ErrSyntax
   305  			return
   306  		}
   307  		for j := 0; j < n; j++ {
   308  			x, ok := unhex(s[j])
   309  			if !ok {
   310  				err = ErrSyntax
   311  				return
   312  			}
   313  			v = v<<4 | x
   314  		}
   315  		s = s[n:]
   316  		if c == 'x' {
   317  			// single-byte string, possibly not UTF-8
   318  			value = v
   319  			break
   320  		}
   321  		if v > utf8.MaxRune {
   322  			err = ErrSyntax
   323  			return
   324  		}
   325  		value = v
   326  		multibyte = true
   327  	case '0', '1', '2', '3', '4', '5', '6', '7':
   328  		v := rune(c) - '0'
   329  		if len(s) < 2 {
   330  			err = ErrSyntax
   331  			return
   332  		}
   333  		for j := 0; j < 2; j++ { // one digit already; two more
   334  			x := rune(s[j]) - '0'
   335  			if x < 0 || x > 7 {
   336  				err = ErrSyntax
   337  				return
   338  			}
   339  			v = (v << 3) | x
   340  		}
   341  		s = s[2:]
   342  		if v > 255 {
   343  			err = ErrSyntax
   344  			return
   345  		}
   346  		value = v
   347  	case '\\':
   348  		value = '\\'
   349  	case '\'', '"':
   350  		if c != quote {
   351  			err = ErrSyntax
   352  			return
   353  		}
   354  		value = rune(c)
   355  	default:
   356  		err = ErrSyntax
   357  		return
   358  	}
   359  	tail = s
   360  	return
   361  }
   362  
   363  // Unquote interprets s as a single-quoted, double-quoted,
   364  // or backquoted Go string literal, returning the string value
   365  // that s quotes.  (If s is single-quoted, it would be a Go
   366  // character literal; Unquote returns the corresponding
   367  // one-character string.)
   368  func Unquote(s string) (string, error) {
   369  	n := len(s)
   370  	if n < 2 {
   371  		return "", ErrSyntax
   372  	}
   373  	quote := s[0]
   374  	if quote != s[n-1] {
   375  		return "", ErrSyntax
   376  	}
   377  	s = s[1 : n-1]
   378  
   379  	if quote == '`' {
   380  		if contains(s, '`') {
   381  			return "", ErrSyntax
   382  		}
   383  		if contains(s, '\r') {
   384  			// -1 because we know there is at least one \r to remove.
   385  			buf := make([]byte, 0, len(s)-1)
   386  			for i := 0; i < len(s); i++ {
   387  				if s[i] != '\r' {
   388  					buf = append(buf, s[i])
   389  				}
   390  			}
   391  			return string(buf), nil
   392  		}
   393  		return s, nil
   394  	}
   395  	if quote != '"' && quote != '\'' {
   396  		return "", ErrSyntax
   397  	}
   398  	if contains(s, '\n') {
   399  		return "", ErrSyntax
   400  	}
   401  
   402  	// Is it trivial? Avoid allocation.
   403  	if !contains(s, '\\') && !contains(s, quote) {
   404  		switch quote {
   405  		case '"':
   406  			if utf8.ValidString(s) {
   407  				return s, nil
   408  			}
   409  		case '\'':
   410  			r, size := utf8.DecodeRuneInString(s)
   411  			if size == len(s) && (r != utf8.RuneError || size != 1) {
   412  				return s, nil
   413  			}
   414  		}
   415  	}
   416  
   417  	var runeTmp [utf8.UTFMax]byte
   418  	buf := make([]byte, 0, 3*len(s)/2) // Try to avoid more allocations.
   419  	for len(s) > 0 {
   420  		c, multibyte, ss, err := UnquoteChar(s, quote)
   421  		if err != nil {
   422  			return "", err
   423  		}
   424  		s = ss
   425  		if c < utf8.RuneSelf || !multibyte {
   426  			buf = append(buf, byte(c))
   427  		} else {
   428  			n := utf8.EncodeRune(runeTmp[:], c)
   429  			buf = append(buf, runeTmp[:n]...)
   430  		}
   431  		if quote == '\'' && len(s) != 0 {
   432  			// single-quoted must be single character
   433  			return "", ErrSyntax
   434  		}
   435  	}
   436  	return string(buf), nil
   437  }
   438  
// contains reports whether the string contains the byte c.
// It delegates to bytealg.IndexByteString and treats a result of -1 as
// "not present".
func contains(s string, c byte) bool {
	return bytealg.IndexByteString(s, c) != -1
}
   443  
   444  // bsearch16 returns the smallest i such that a[i] >= x.
   445  // If there is no such i, bsearch16 returns len(a).
   446  func bsearch16(a []uint16, x uint16) int {
   447  	i, j := 0, len(a)
   448  	for i < j {
   449  		h := i + (j-i)/2
   450  		if a[h] < x {
   451  			i = h + 1
   452  		} else {
   453  			j = h
   454  		}
   455  	}
   456  	return i
   457  }
   458  
   459  // bsearch32 returns the smallest i such that a[i] >= x.
   460  // If there is no such i, bsearch32 returns len(a).
   461  func bsearch32(a []uint32, x uint32) int {
   462  	i, j := 0, len(a)
   463  	for i < j {
   464  		h := i + (j-i)/2
   465  		if a[h] < x {
   466  			i = h + 1
   467  		} else {
   468  			j = h
   469  		}
   470  	}
   471  	return i
   472  }
   473  
   474  // TODO: IsPrint is a local implementation of unicode.IsPrint, verified by the tests
   475  // to give the same answer. It allows this package not to depend on unicode,
   476  // and therefore not pull in all the Unicode tables. If the linker were better
   477  // at tossing unused tables, we could get rid of this implementation.
   478  // That would be nice.
   479  
   480  // IsPrint reports whether the rune is defined as printable by Go, with
   481  // the same definition as unicode.IsPrint: letters, numbers, punctuation,
   482  // symbols and ASCII space.
   483  func IsPrint(r rune) bool {
   484  	// Fast check for Latin-1
   485  	if r <= 0xFF {
   486  		if 0x20 <= r && r <= 0x7E {
   487  			// All the ASCII is printable from space through DEL-1.
   488  			return true
   489  		}
   490  		if 0xA1 <= r && r <= 0xFF {
   491  			// Similarly for ¡ through ÿ...
   492  			return r != 0xAD // ...except for the bizarre soft hyphen.
   493  		}
   494  		return false
   495  	}
   496  
   497  	// Same algorithm, either on uint16 or uint32 value.
   498  	// First, find first i such that isPrint[i] >= x.
   499  	// This is the index of either the start or end of a pair that might span x.
   500  	// The start is even (isPrint[i&^1]) and the end is odd (isPrint[i|1]).
   501  	// If we find x in a range, make sure x is not in isNotPrint list.
   502  
   503  	if 0 <= r && r < 1<<16 {
   504  		rr, isPrint, isNotPrint := uint16(r), isPrint16, isNotPrint16
   505  		i := bsearch16(isPrint, rr)
   506  		if i >= len(isPrint) || rr < isPrint[i&^1] || isPrint[i|1] < rr {
   507  			return false
   508  		}
   509  		j := bsearch16(isNotPrint, rr)
   510  		return j >= len(isNotPrint) || isNotPrint[j] != rr
   511  	}
   512  
   513  	rr, isPrint, isNotPrint := uint32(r), isPrint32, isNotPrint32
   514  	i := bsearch32(isPrint, rr)
   515  	if i >= len(isPrint) || rr < isPrint[i&^1] || isPrint[i|1] < rr {
   516  		return false
   517  	}
   518  	if r >= 0x20000 {
   519  		return true
   520  	}
   521  	r -= 0x10000
   522  	j := bsearch16(isNotPrint, uint16(r))
   523  	return j >= len(isNotPrint) || isNotPrint[j] != uint16(r)
   524  }
   525  
   526  // IsGraphic reports whether the rune is defined as a Graphic by Unicode. Such
   527  // characters include letters, marks, numbers, punctuation, symbols, and
   528  // spaces, from categories L, M, N, P, S, and Zs.
   529  func IsGraphic(r rune) bool {
   530  	if IsPrint(r) {
   531  		return true
   532  	}
   533  	return isInGraphicList(r)
   534  }
   535  
   536  // isInGraphicList reports whether the rune is in the isGraphic list. This separation
   537  // from IsGraphic allows quoteWith to avoid two calls to IsPrint.
   538  // Should be called only if IsPrint fails.
   539  func isInGraphicList(r rune) bool {
   540  	// We know r must fit in 16 bits - see makeisprint.go.
   541  	if r > 0xFFFF {
   542  		return false
   543  	}
   544  	rr := uint16(r)
   545  	i := bsearch16(isGraphic, rr)
   546  	return i < len(isGraphic) && rr == isGraphic[i]
   547  }
   548  

View as plain text