...
Run Format

Source file src/strconv/quote.go

Documentation: strconv

     1  // Copyright 2009 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  //go:generate go run makeisprint.go -output isprint.go
     6  
     7  package strconv
     8  
     9  import "unicode/utf8"
    10  
    // lowerhex lists the sixteen lowercase hexadecimal digits in value order, so
    // lowerhex[v] is the digit for a 4-bit value v. The escape writers in this
    // file index it when emitting \x, \u and \U sequences.
    11  const lowerhex = "0123456789abcdef"
    12  
    13  func quoteWith(s string, quote byte, ASCIIonly, graphicOnly bool) string {
    14  	return string(appendQuotedWith(make([]byte, 0, 3*len(s)/2), s, quote, ASCIIonly, graphicOnly))
    15  }
    16  
    17  func quoteRuneWith(r rune, quote byte, ASCIIonly, graphicOnly bool) string {
    18  	return string(appendQuotedRuneWith(nil, r, quote, ASCIIonly, graphicOnly))
    19  }
    20  
    21  func appendQuotedWith(buf []byte, s string, quote byte, ASCIIonly, graphicOnly bool) []byte {
    22  	buf = append(buf, quote)
    23  	for width := 0; len(s) > 0; s = s[width:] {
    24  		r := rune(s[0])
    25  		width = 1
    26  		if r >= utf8.RuneSelf {
    27  			r, width = utf8.DecodeRuneInString(s)
    28  		}
    29  		if width == 1 && r == utf8.RuneError {
    30  			buf = append(buf, `\x`...)
    31  			buf = append(buf, lowerhex[s[0]>>4])
    32  			buf = append(buf, lowerhex[s[0]&0xF])
    33  			continue
    34  		}
    35  		buf = appendEscapedRune(buf, r, quote, ASCIIonly, graphicOnly)
    36  	}
    37  	buf = append(buf, quote)
    38  	return buf
    39  }
    40  
    41  func appendQuotedRuneWith(buf []byte, r rune, quote byte, ASCIIonly, graphicOnly bool) []byte {
    42  	buf = append(buf, quote)
    43  	if !utf8.ValidRune(r) {
    44  		r = utf8.RuneError
    45  	}
    46  	buf = appendEscapedRune(buf, r, quote, ASCIIonly, graphicOnly)
    47  	buf = append(buf, quote)
    48  	return buf
    49  }
    50  
    51  func appendEscapedRune(buf []byte, r rune, quote byte, ASCIIonly, graphicOnly bool) []byte {
    52  	var runeTmp [utf8.UTFMax]byte
    53  	if r == rune(quote) || r == '\\' { // always backslashed
    54  		buf = append(buf, '\\')
    55  		buf = append(buf, byte(r))
    56  		return buf
    57  	}
    58  	if ASCIIonly {
    59  		if r < utf8.RuneSelf && IsPrint(r) {
    60  			buf = append(buf, byte(r))
    61  			return buf
    62  		}
    63  	} else if IsPrint(r) || graphicOnly && isInGraphicList(r) {
    64  		n := utf8.EncodeRune(runeTmp[:], r)
    65  		buf = append(buf, runeTmp[:n]...)
    66  		return buf
    67  	}
    68  	switch r {
    69  	case '\a':
    70  		buf = append(buf, `\a`...)
    71  	case '\b':
    72  		buf = append(buf, `\b`...)
    73  	case '\f':
    74  		buf = append(buf, `\f`...)
    75  	case '\n':
    76  		buf = append(buf, `\n`...)
    77  	case '\r':
    78  		buf = append(buf, `\r`...)
    79  	case '\t':
    80  		buf = append(buf, `\t`...)
    81  	case '\v':
    82  		buf = append(buf, `\v`...)
    83  	default:
    84  		switch {
    85  		case r < ' ':
    86  			buf = append(buf, `\x`...)
    87  			buf = append(buf, lowerhex[byte(r)>>4])
    88  			buf = append(buf, lowerhex[byte(r)&0xF])
    89  		case r > utf8.MaxRune:
    90  			r = 0xFFFD
    91  			fallthrough
    92  		case r < 0x10000:
    93  			buf = append(buf, `\u`...)
    94  			for s := 12; s >= 0; s -= 4 {
    95  				buf = append(buf, lowerhex[r>>uint(s)&0xF])
    96  			}
    97  		default:
    98  			buf = append(buf, `\U`...)
    99  			for s := 28; s >= 0; s -= 4 {
   100  				buf = append(buf, lowerhex[r>>uint(s)&0xF])
   101  			}
   102  		}
   103  	}
   104  	return buf
   105  }
   106  
   107  // Quote returns a double-quoted Go string literal representing s. The
   108  // returned string uses Go escape sequences (\t, \n, \xFF, \u0100) for
   109  // control characters and non-printable characters as defined by
   110  // IsPrint.
   111  func Quote(s string) string {
   112  	return quoteWith(s, '"', false, false)
   113  }
   114  
   115  // AppendQuote appends a double-quoted Go string literal representing s,
   116  // as generated by Quote, to dst and returns the extended buffer.
   117  func AppendQuote(dst []byte, s string) []byte {
   118  	return appendQuotedWith(dst, s, '"', false, false)
   119  }
   120  
   121  // QuoteToASCII returns a double-quoted Go string literal representing s.
   122  // The returned string uses Go escape sequences (\t, \n, \xFF, \u0100) for
   123  // non-ASCII characters and non-printable characters as defined by IsPrint.
   124  func QuoteToASCII(s string) string {
   125  	return quoteWith(s, '"', true, false)
   126  }
   127  
   128  // AppendQuoteToASCII appends a double-quoted Go string literal representing s,
   129  // as generated by QuoteToASCII, to dst and returns the extended buffer.
   130  func AppendQuoteToASCII(dst []byte, s string) []byte {
   131  	return appendQuotedWith(dst, s, '"', true, false)
   132  }
   133  
   134  // QuoteToGraphic returns a double-quoted Go string literal representing s.
   135  // The returned string uses Go escape sequences (\t, \n, \xFF, \u0100) for
   136  // non-ASCII characters and non-printable characters as defined by IsGraphic.
   137  func QuoteToGraphic(s string) string {
   138  	return quoteWith(s, '"', false, true)
   139  }
   140  
   141  // AppendQuoteToGraphic appends a double-quoted Go string literal representing s,
   142  // as generated by QuoteToGraphic, to dst and returns the extended buffer.
   143  func AppendQuoteToGraphic(dst []byte, s string) []byte {
   144  	return appendQuotedWith(dst, s, '"', false, true)
   145  }
   146  
   147  // QuoteRune returns a single-quoted Go character literal representing the
   148  // rune. The returned string uses Go escape sequences (\t, \n, \xFF, \u0100)
   149  // for control characters and non-printable characters as defined by IsPrint.
   150  func QuoteRune(r rune) string {
   151  	return quoteRuneWith(r, '\'', false, false)
   152  }
   153  
   154  // AppendQuoteRune appends a single-quoted Go character literal representing the rune,
   155  // as generated by QuoteRune, to dst and returns the extended buffer.
   156  func AppendQuoteRune(dst []byte, r rune) []byte {
   157  	return appendQuotedRuneWith(dst, r, '\'', false, false)
   158  }
   159  
   160  // QuoteRuneToASCII returns a single-quoted Go character literal representing
   161  // the rune. The returned string uses Go escape sequences (\t, \n, \xFF,
   162  // \u0100) for non-ASCII characters and non-printable characters as defined
   163  // by IsPrint.
   164  func QuoteRuneToASCII(r rune) string {
   165  	return quoteRuneWith(r, '\'', true, false)
   166  }
   167  
   168  // AppendQuoteRuneToASCII appends a single-quoted Go character literal representing the rune,
   169  // as generated by QuoteRuneToASCII, to dst and returns the extended buffer.
   170  func AppendQuoteRuneToASCII(dst []byte, r rune) []byte {
   171  	return appendQuotedRuneWith(dst, r, '\'', true, false)
   172  }
   173  
   174  // QuoteRuneToGraphic returns a single-quoted Go character literal representing
   175  // the rune. The returned string uses Go escape sequences (\t, \n, \xFF,
   176  // \u0100) for non-ASCII characters and non-printable characters as defined
   177  // by IsGraphic.
   178  func QuoteRuneToGraphic(r rune) string {
   179  	return quoteRuneWith(r, '\'', false, true)
   180  }
   181  
   182  // AppendQuoteRuneToGraphic appends a single-quoted Go character literal representing the rune,
   183  // as generated by QuoteRuneToGraphic, to dst and returns the extended buffer.
   184  func AppendQuoteRuneToGraphic(dst []byte, r rune) []byte {
   185  	return appendQuotedRuneWith(dst, r, '\'', false, true)
   186  }
   187  
   188  // CanBackquote reports whether the string s can be represented
   189  // unchanged as a single-line backquoted string without control
   190  // characters other than tab.
   191  func CanBackquote(s string) bool {
   192  	for len(s) > 0 {
   193  		r, wid := utf8.DecodeRuneInString(s)
   194  		s = s[wid:]
   195  		if wid > 1 {
   196  			if r == '\ufeff' {
   197  				return false // BOMs are invisible and should not be quoted.
   198  			}
   199  			continue // All other multibyte runes are correctly encoded and assumed printable.
   200  		}
   201  		if r == utf8.RuneError {
   202  			return false
   203  		}
   204  		if (r < ' ' && r != '\t') || r == '`' || r == '\u007F' {
   205  			return false
   206  		}
   207  	}
   208  	return true
   209  }
   210  
   211  func unhex(b byte) (v rune, ok bool) {
   212  	c := rune(b)
   213  	switch {
   214  	case '0' <= c && c <= '9':
   215  		return c - '0', true
   216  	case 'a' <= c && c <= 'f':
   217  		return c - 'a' + 10, true
   218  	case 'A' <= c && c <= 'F':
   219  		return c - 'A' + 10, true
   220  	}
   221  	return
   222  }
   223  
   224  // UnquoteChar decodes the first character or byte in the escaped string
   225  // or character literal represented by the string s.
   226  // It returns four values:
   227  //
   228  //	1) value, the decoded Unicode code point or byte value;
   229  //	2) multibyte, a boolean indicating whether the decoded character requires a multibyte UTF-8 representation;
   230  //	3) tail, the remainder of the string after the character; and
   231  //	4) an error that will be nil if the character is syntactically valid.
   232  //
   233  // The second argument, quote, specifies the type of literal being parsed
   234  // and therefore which escaped quote character is permitted.
   235  // If set to a single quote, it permits the sequence \' and disallows unescaped '.
   236  // If set to a double quote, it permits \" and disallows unescaped ".
   237  // If set to zero, it does not permit either escape and allows both quote characters to appear unescaped.
   238  func UnquoteChar(s string, quote byte) (value rune, multibyte bool, tail string, err error) {
   239  	// easy cases
   240  	if len(s) == 0 {
   241  		err = ErrSyntax
   242  		return
   243  	}
   244  	switch c := s[0]; {
   245  	case c == quote && (quote == '\'' || quote == '"'):
   246  		err = ErrSyntax
   247  		return
   248  	case c >= utf8.RuneSelf:
   249  		r, size := utf8.DecodeRuneInString(s)
   250  		return r, true, s[size:], nil
   251  	case c != '\\':
   252  		return rune(s[0]), false, s[1:], nil
   253  	}
   254  
   255  	// hard case: c is backslash
   256  	if len(s) <= 1 {
   257  		err = ErrSyntax
   258  		return
   259  	}
   260  	c := s[1]
   261  	s = s[2:]
   262  
   263  	switch c {
   264  	case 'a':
   265  		value = '\a'
   266  	case 'b':
   267  		value = '\b'
   268  	case 'f':
   269  		value = '\f'
   270  	case 'n':
   271  		value = '\n'
   272  	case 'r':
   273  		value = '\r'
   274  	case 't':
   275  		value = '\t'
   276  	case 'v':
   277  		value = '\v'
   278  	case 'x', 'u', 'U':
   279  		n := 0
   280  		switch c {
   281  		case 'x':
   282  			n = 2
   283  		case 'u':
   284  			n = 4
   285  		case 'U':
   286  			n = 8
   287  		}
   288  		var v rune
   289  		if len(s) < n {
   290  			err = ErrSyntax
   291  			return
   292  		}
   293  		for j := 0; j < n; j++ {
   294  			x, ok := unhex(s[j])
   295  			if !ok {
   296  				err = ErrSyntax
   297  				return
   298  			}
   299  			v = v<<4 | x
   300  		}
   301  		s = s[n:]
   302  		if c == 'x' {
   303  			// single-byte string, possibly not UTF-8
   304  			value = v
   305  			break
   306  		}
   307  		if v > utf8.MaxRune {
   308  			err = ErrSyntax
   309  			return
   310  		}
   311  		value = v
   312  		multibyte = true
   313  	case '0', '1', '2', '3', '4', '5', '6', '7':
   314  		v := rune(c) - '0'
   315  		if len(s) < 2 {
   316  			err = ErrSyntax
   317  			return
   318  		}
   319  		for j := 0; j < 2; j++ { // one digit already; two more
   320  			x := rune(s[j]) - '0'
   321  			if x < 0 || x > 7 {
   322  				err = ErrSyntax
   323  				return
   324  			}
   325  			v = (v << 3) | x
   326  		}
   327  		s = s[2:]
   328  		if v > 255 {
   329  			err = ErrSyntax
   330  			return
   331  		}
   332  		value = v
   333  	case '\\':
   334  		value = '\\'
   335  	case '\'', '"':
   336  		if c != quote {
   337  			err = ErrSyntax
   338  			return
   339  		}
   340  		value = rune(c)
   341  	default:
   342  		err = ErrSyntax
   343  		return
   344  	}
   345  	tail = s
   346  	return
   347  }
   348  
   349  // Unquote interprets s as a single-quoted, double-quoted,
   350  // or backquoted Go string literal, returning the string value
   351  // that s quotes.  (If s is single-quoted, it would be a Go
   352  // character literal; Unquote returns the corresponding
   353  // one-character string.)
   354  func Unquote(s string) (string, error) {
   355  	n := len(s)
   356  	if n < 2 {
   357  		return "", ErrSyntax
   358  	}
   359  	quote := s[0]
   360  	if quote != s[n-1] {
   361  		return "", ErrSyntax
   362  	}
   363  	s = s[1 : n-1]
   364  
   365  	if quote == '`' {
   366  		if contains(s, '`') {
   367  			return "", ErrSyntax
   368  		}
   369  		if contains(s, '\r') {
   370  			// -1 because we know there is at least one \r to remove.
   371  			buf := make([]byte, 0, len(s)-1)
   372  			for i := 0; i < len(s); i++ {
   373  				if s[i] != '\r' {
   374  					buf = append(buf, s[i])
   375  				}
   376  			}
   377  			return string(buf), nil
   378  		}
   379  		return s, nil
   380  	}
   381  	if quote != '"' && quote != '\'' {
   382  		return "", ErrSyntax
   383  	}
   384  	if contains(s, '\n') {
   385  		return "", ErrSyntax
   386  	}
   387  
   388  	// Is it trivial? Avoid allocation.
   389  	if !contains(s, '\\') && !contains(s, quote) {
   390  		switch quote {
   391  		case '"':
   392  			if utf8.ValidString(s) {
   393  				return s, nil
   394  			}
   395  		case '\'':
   396  			r, size := utf8.DecodeRuneInString(s)
   397  			if size == len(s) && (r != utf8.RuneError || size != 1) {
   398  				return s, nil
   399  			}
   400  		}
   401  	}
   402  
   403  	var runeTmp [utf8.UTFMax]byte
   404  	buf := make([]byte, 0, 3*len(s)/2) // Try to avoid more allocations.
   405  	for len(s) > 0 {
   406  		c, multibyte, ss, err := UnquoteChar(s, quote)
   407  		if err != nil {
   408  			return "", err
   409  		}
   410  		s = ss
   411  		if c < utf8.RuneSelf || !multibyte {
   412  			buf = append(buf, byte(c))
   413  		} else {
   414  			n := utf8.EncodeRune(runeTmp[:], c)
   415  			buf = append(buf, runeTmp[:n]...)
   416  		}
   417  		if quote == '\'' && len(s) != 0 {
   418  			// single-quoted must be single character
   419  			return "", ErrSyntax
   420  		}
   421  	}
   422  	return string(buf), nil
   423  }
   424  
   425  // contains reports whether the string contains the byte c.
   426  func contains(s string, c byte) bool {
   427  	for i := 0; i < len(s); i++ {
   428  		if s[i] == c {
   429  			return true
   430  		}
   431  	}
   432  	return false
   433  }
   434  
   435  // bsearch16 returns the smallest i such that a[i] >= x.
   436  // If there is no such i, bsearch16 returns len(a).
   437  func bsearch16(a []uint16, x uint16) int {
   438  	i, j := 0, len(a)
   439  	for i < j {
   440  		h := i + (j-i)/2
   441  		if a[h] < x {
   442  			i = h + 1
   443  		} else {
   444  			j = h
   445  		}
   446  	}
   447  	return i
   448  }
   449  
   450  // bsearch32 returns the smallest i such that a[i] >= x.
   451  // If there is no such i, bsearch32 returns len(a).
   452  func bsearch32(a []uint32, x uint32) int {
   453  	i, j := 0, len(a)
   454  	for i < j {
   455  		h := i + (j-i)/2
   456  		if a[h] < x {
   457  			i = h + 1
   458  		} else {
   459  			j = h
   460  		}
   461  	}
   462  	return i
   463  }
   464  
   465  // TODO: IsPrint is a local implementation of unicode.IsPrint, verified by the tests
   466  // to give the same answer. It allows this package not to depend on unicode,
   467  // and therefore not pull in all the Unicode tables. If the linker were better
   468  // at tossing unused tables, we could get rid of this implementation.
   469  // That would be nice.
   470  
   471  // IsPrint reports whether the rune is defined as printable by Go, with
   472  // the same definition as unicode.IsPrint: letters, numbers, punctuation,
   473  // symbols and ASCII space.
   474  func IsPrint(r rune) bool {
   475  	// Fast check for Latin-1
   476  	if r <= 0xFF {
   477  		if 0x20 <= r && r <= 0x7E {
   478  			// All the ASCII is printable from space through DEL-1.
   479  			return true
   480  		}
   481  		if 0xA1 <= r && r <= 0xFF {
   482  			// Similarly for ¡ through ÿ...
   483  			return r != 0xAD // ...except for the bizarre soft hyphen.
   484  		}
   485  		return false
   486  	}
   487  
   488  	// Same algorithm, either on uint16 or uint32 value.
   489  	// First, find first i such that isPrint[i] >= x.
   490  	// This is the index of either the start or end of a pair that might span x.
   491  	// The start is even (isPrint[i&^1]) and the end is odd (isPrint[i|1]).
   492  	// If we find x in a range, make sure x is not in isNotPrint list.
   493  
   494  	if 0 <= r && r < 1<<16 {
   495  		rr, isPrint, isNotPrint := uint16(r), isPrint16, isNotPrint16
   496  		i := bsearch16(isPrint, rr)
   497  		if i >= len(isPrint) || rr < isPrint[i&^1] || isPrint[i|1] < rr {
   498  			return false
   499  		}
   500  		j := bsearch16(isNotPrint, rr)
   501  		return j >= len(isNotPrint) || isNotPrint[j] != rr
   502  	}
   503  
   504  	rr, isPrint, isNotPrint := uint32(r), isPrint32, isNotPrint32
   505  	i := bsearch32(isPrint, rr)
   506  	if i >= len(isPrint) || rr < isPrint[i&^1] || isPrint[i|1] < rr {
   507  		return false
   508  	}
   509  	if r >= 0x20000 {
   510  		return true
   511  	}
   512  	r -= 0x10000
   513  	j := bsearch16(isNotPrint, uint16(r))
   514  	return j >= len(isNotPrint) || isNotPrint[j] != uint16(r)
   515  }
   516  
   517  // IsGraphic reports whether the rune is defined as a Graphic by Unicode. Such
   518  // characters include letters, marks, numbers, punctuation, symbols, and
   519  // spaces, from categories L, M, N, P, S, and Zs.
   520  func IsGraphic(r rune) bool {
   521  	if IsPrint(r) {
   522  		return true
   523  	}
   524  	return isInGraphicList(r)
   525  }
   526  
   527  // isInGraphicList reports whether the rune is in the isGraphic list. This separation
   528  // from IsGraphic allows quoteWith to avoid two calls to IsPrint.
   529  // Should be called only if IsPrint fails.
   530  func isInGraphicList(r rune) bool {
   531  	// We know r must fit in 16 bits - see makeisprint.go.
   532  	if r > 0xFFFF {
   533  		return false
   534  	}
   535  	rr := uint16(r)
   536  	i := bsearch16(isGraphic, rr)
   537  	return i < len(isGraphic) && rr == isGraphic[i]
   538  }
   539  

View as plain text