...
Run Format

Source file src/strconv/quote.go

Documentation: strconv

     1  // Copyright 2009 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  //go:generate go run makeisprint.go -output isprint.go
     6  
     7  package strconv
     8  
     9  import "unicode/utf8"
    10  
    // lowerhex maps a nibble value (0-15) to its lowercase hexadecimal digit.
    const lowerhex = "0123456789" + "abcdef"
    12  
    13  func quoteWith(s string, quote byte, ASCIIonly, graphicOnly bool) string {
    14  	return string(appendQuotedWith(make([]byte, 0, 3*len(s)/2), s, quote, ASCIIonly, graphicOnly))
    15  }
    16  
    17  func quoteRuneWith(r rune, quote byte, ASCIIonly, graphicOnly bool) string {
    18  	return string(appendQuotedRuneWith(nil, r, quote, ASCIIonly, graphicOnly))
    19  }
    20  
    21  func appendQuotedWith(buf []byte, s string, quote byte, ASCIIonly, graphicOnly bool) []byte {
    22  	buf = append(buf, quote)
    23  	for width := 0; len(s) > 0; s = s[width:] {
    24  		r := rune(s[0])
    25  		width = 1
    26  		if r >= utf8.RuneSelf {
    27  			r, width = utf8.DecodeRuneInString(s)
    28  		}
    29  		if width == 1 && r == utf8.RuneError {
    30  			buf = append(buf, `\x`...)
    31  			buf = append(buf, lowerhex[s[0]>>4])
    32  			buf = append(buf, lowerhex[s[0]&0xF])
    33  			continue
    34  		}
    35  		buf = appendEscapedRune(buf, r, quote, ASCIIonly, graphicOnly)
    36  	}
    37  	buf = append(buf, quote)
    38  	return buf
    39  }
    40  
    41  func appendQuotedRuneWith(buf []byte, r rune, quote byte, ASCIIonly, graphicOnly bool) []byte {
    42  	buf = append(buf, quote)
    43  	if !utf8.ValidRune(r) {
    44  		r = utf8.RuneError
    45  	}
    46  	buf = appendEscapedRune(buf, r, quote, ASCIIonly, graphicOnly)
    47  	buf = append(buf, quote)
    48  	return buf
    49  }
    50  
    51  func appendEscapedRune(buf []byte, r rune, quote byte, ASCIIonly, graphicOnly bool) []byte {
    52  	var runeTmp [utf8.UTFMax]byte
    53  	if r == rune(quote) || r == '\\' { // always backslashed
    54  		buf = append(buf, '\\')
    55  		buf = append(buf, byte(r))
    56  		return buf
    57  	}
    58  	if ASCIIonly {
    59  		if r < utf8.RuneSelf && IsPrint(r) {
    60  			buf = append(buf, byte(r))
    61  			return buf
    62  		}
    63  	} else if IsPrint(r) || graphicOnly && isInGraphicList(r) {
    64  		n := utf8.EncodeRune(runeTmp[:], r)
    65  		buf = append(buf, runeTmp[:n]...)
    66  		return buf
    67  	}
    68  	switch r {
    69  	case '\a':
    70  		buf = append(buf, `\a`...)
    71  	case '\b':
    72  		buf = append(buf, `\b`...)
    73  	case '\f':
    74  		buf = append(buf, `\f`...)
    75  	case '\n':
    76  		buf = append(buf, `\n`...)
    77  	case '\r':
    78  		buf = append(buf, `\r`...)
    79  	case '\t':
    80  		buf = append(buf, `\t`...)
    81  	case '\v':
    82  		buf = append(buf, `\v`...)
    83  	default:
    84  		switch {
    85  		case r < ' ':
    86  			buf = append(buf, `\x`...)
    87  			buf = append(buf, lowerhex[byte(r)>>4])
    88  			buf = append(buf, lowerhex[byte(r)&0xF])
    89  		case r > utf8.MaxRune:
    90  			r = 0xFFFD
    91  			fallthrough
    92  		case r < 0x10000:
    93  			buf = append(buf, `\u`...)
    94  			for s := 12; s >= 0; s -= 4 {
    95  				buf = append(buf, lowerhex[r>>uint(s)&0xF])
    96  			}
    97  		default:
    98  			buf = append(buf, `\U`...)
    99  			for s := 28; s >= 0; s -= 4 {
   100  				buf = append(buf, lowerhex[r>>uint(s)&0xF])
   101  			}
   102  		}
   103  	}
   104  	return buf
   105  }
   106  
   107  // Quote returns a double-quoted Go string literal representing s. The
   108  // returned string uses Go escape sequences (\t, \n, \xFF, \u0100) for
   109  // control characters and non-printable characters as defined by
   110  // IsPrint.
   111  func Quote(s string) string {
   112  	return quoteWith(s, '"', false, false)
   113  }
   114  
   115  // AppendQuote appends a double-quoted Go string literal representing s,
   116  // as generated by Quote, to dst and returns the extended buffer.
   117  func AppendQuote(dst []byte, s string) []byte {
   118  	return appendQuotedWith(dst, s, '"', false, false)
   119  }
   120  
   121  // QuoteToASCII returns a double-quoted Go string literal representing s.
   122  // The returned string uses Go escape sequences (\t, \n, \xFF, \u0100) for
   123  // non-ASCII characters and non-printable characters as defined by IsPrint.
   124  func QuoteToASCII(s string) string {
   125  	return quoteWith(s, '"', true, false)
   126  }
   127  
   128  // AppendQuoteToASCII appends a double-quoted Go string literal representing s,
   129  // as generated by QuoteToASCII, to dst and returns the extended buffer.
   130  func AppendQuoteToASCII(dst []byte, s string) []byte {
   131  	return appendQuotedWith(dst, s, '"', true, false)
   132  }
   133  
   134  // QuoteToGraphic returns a double-quoted Go string literal representing s.
   135  // The returned string uses Go escape sequences (\t, \n, \xFF, \u0100) for
   136  // non-ASCII characters and non-printable characters as defined by IsGraphic.
   137  func QuoteToGraphic(s string) string {
   138  	return quoteWith(s, '"', false, true)
   139  }
   140  
   141  // AppendQuoteToGraphic appends a double-quoted Go string literal representing s,
   142  // as generated by QuoteToGraphic, to dst and returns the extended buffer.
   143  func AppendQuoteToGraphic(dst []byte, s string) []byte {
   144  	return appendQuotedWith(dst, s, '"', false, true)
   145  }
   146  
   147  // QuoteRune returns a single-quoted Go character literal representing the
   148  // rune. The returned string uses Go escape sequences (\t, \n, \xFF, \u0100)
   149  // for control characters and non-printable characters as defined by IsPrint.
   150  func QuoteRune(r rune) string {
   151  	return quoteRuneWith(r, '\'', false, false)
   152  }
   153  
   154  // AppendQuoteRune appends a single-quoted Go character literal representing the rune,
   155  // as generated by QuoteRune, to dst and returns the extended buffer.
   156  func AppendQuoteRune(dst []byte, r rune) []byte {
   157  	return appendQuotedRuneWith(dst, r, '\'', false, false)
   158  }
   159  
   160  // QuoteRuneToASCII returns a single-quoted Go character literal representing
   161  // the rune. The returned string uses Go escape sequences (\t, \n, \xFF,
   162  // \u0100) for non-ASCII characters and non-printable characters as defined
   163  // by IsPrint.
   164  func QuoteRuneToASCII(r rune) string {
   165  	return quoteRuneWith(r, '\'', true, false)
   166  }
   167  
   168  // AppendQuoteRuneToASCII appends a single-quoted Go character literal representing the rune,
   169  // as generated by QuoteRuneToASCII, to dst and returns the extended buffer.
   170  func AppendQuoteRuneToASCII(dst []byte, r rune) []byte {
   171  	return appendQuotedRuneWith(dst, r, '\'', true, false)
   172  }
   173  
   174  // QuoteRuneToGraphic returns a single-quoted Go character literal representing
   175  // the rune. The returned string uses Go escape sequences (\t, \n, \xFF,
   176  // \u0100) for non-ASCII characters and non-printable characters as defined
   177  // by IsGraphic.
   178  func QuoteRuneToGraphic(r rune) string {
   179  	return quoteRuneWith(r, '\'', false, true)
   180  }
   181  
   182  // AppendQuoteRuneToGraphic appends a single-quoted Go character literal representing the rune,
   183  // as generated by QuoteRuneToGraphic, to dst and returns the extended buffer.
   184  func AppendQuoteRuneToGraphic(dst []byte, r rune) []byte {
   185  	return appendQuotedRuneWith(dst, r, '\'', false, true)
   186  }
   187  
   // CanBackquote reports whether s can be written unchanged as a
   // single-line backquoted string. It returns false if s contains a
   // backquote, a control character other than tab, DEL, a byte order mark
   // (U+FEFF) or invalid UTF-8.
   func CanBackquote(s string) bool {
   	for i := 0; i < len(s); {
   		r, size := utf8.DecodeRuneInString(s[i:])
   		i += size
   		switch {
   		case size > 1:
   			// A correctly encoded multibyte rune is assumed printable,
   			// except the invisible BOM, which should not be quoted.
   			if r == '\ufeff' {
   				return false
   			}
   		case r == utf8.RuneError:
   			// A one-byte RuneError means the byte was not valid UTF-8.
   			return false
   		case r == '`', r == '\u007F':
   			return false
   		case r < ' ' && r != '\t':
   			return false
   		}
   	}
   	return true
   }
   210  
   // unhex converts the hexadecimal digit b (0-9, a-f or A-F) to its numeric
   // value. ok is false, and v is zero, when b is not a hexadecimal digit.
   func unhex(b byte) (v rune, ok bool) {
   	if '0' <= b && b <= '9' {
   		return rune(b) - '0', true
   	}
   	if 'a' <= b && b <= 'f' {
   		return rune(b) - 'a' + 10, true
   	}
   	if 'A' <= b && b <= 'F' {
   		return rune(b) - 'A' + 10, true
   	}
   	return 0, false
   }
   223  
   224  // UnquoteChar decodes the first character or byte in the escaped string
   225  // or character literal represented by the string s.
   226  // It returns four values:
   227  //
   228  //	1) value, the decoded Unicode code point or byte value;
   229  //	2) multibyte, a boolean indicating whether the decoded character requires a multibyte UTF-8 representation;
   230  //	3) tail, the remainder of the string after the character; and
   231  //	4) an error that will be nil if the character is syntactically valid.
   232  //
   233  // The second argument, quote, specifies the type of literal being parsed
   234  // and therefore which escaped quote character is permitted.
   235  // If set to a single quote, it permits the sequence \' and disallows unescaped '.
   236  // If set to a double quote, it permits \" and disallows unescaped ".
   237  // If set to zero, it does not permit either escape and allows both quote characters to appear unescaped.
   238  func UnquoteChar(s string, quote byte) (value rune, multibyte bool, tail string, err error) {
   239  	// easy cases
   240  	switch c := s[0]; {
   241  	case c == quote && (quote == '\'' || quote == '"'):
   242  		err = ErrSyntax
   243  		return
   244  	case c >= utf8.RuneSelf:
   245  		r, size := utf8.DecodeRuneInString(s)
   246  		return r, true, s[size:], nil
   247  	case c != '\\':
   248  		return rune(s[0]), false, s[1:], nil
   249  	}
   250  
   251  	// hard case: c is backslash
   252  	if len(s) <= 1 {
   253  		err = ErrSyntax
   254  		return
   255  	}
   256  	c := s[1]
   257  	s = s[2:]
   258  
   259  	switch c {
   260  	case 'a':
   261  		value = '\a'
   262  	case 'b':
   263  		value = '\b'
   264  	case 'f':
   265  		value = '\f'
   266  	case 'n':
   267  		value = '\n'
   268  	case 'r':
   269  		value = '\r'
   270  	case 't':
   271  		value = '\t'
   272  	case 'v':
   273  		value = '\v'
   274  	case 'x', 'u', 'U':
   275  		n := 0
   276  		switch c {
   277  		case 'x':
   278  			n = 2
   279  		case 'u':
   280  			n = 4
   281  		case 'U':
   282  			n = 8
   283  		}
   284  		var v rune
   285  		if len(s) < n {
   286  			err = ErrSyntax
   287  			return
   288  		}
   289  		for j := 0; j < n; j++ {
   290  			x, ok := unhex(s[j])
   291  			if !ok {
   292  				err = ErrSyntax
   293  				return
   294  			}
   295  			v = v<<4 | x
   296  		}
   297  		s = s[n:]
   298  		if c == 'x' {
   299  			// single-byte string, possibly not UTF-8
   300  			value = v
   301  			break
   302  		}
   303  		if v > utf8.MaxRune {
   304  			err = ErrSyntax
   305  			return
   306  		}
   307  		value = v
   308  		multibyte = true
   309  	case '0', '1', '2', '3', '4', '5', '6', '7':
   310  		v := rune(c) - '0'
   311  		if len(s) < 2 {
   312  			err = ErrSyntax
   313  			return
   314  		}
   315  		for j := 0; j < 2; j++ { // one digit already; two more
   316  			x := rune(s[j]) - '0'
   317  			if x < 0 || x > 7 {
   318  				err = ErrSyntax
   319  				return
   320  			}
   321  			v = (v << 3) | x
   322  		}
   323  		s = s[2:]
   324  		if v > 255 {
   325  			err = ErrSyntax
   326  			return
   327  		}
   328  		value = v
   329  	case '\\':
   330  		value = '\\'
   331  	case '\'', '"':
   332  		if c != quote {
   333  			err = ErrSyntax
   334  			return
   335  		}
   336  		value = rune(c)
   337  	default:
   338  		err = ErrSyntax
   339  		return
   340  	}
   341  	tail = s
   342  	return
   343  }
   344  
   345  // Unquote interprets s as a single-quoted, double-quoted,
   346  // or backquoted Go string literal, returning the string value
   347  // that s quotes.  (If s is single-quoted, it would be a Go
   348  // character literal; Unquote returns the corresponding
   349  // one-character string.)
   350  func Unquote(s string) (string, error) {
   351  	n := len(s)
   352  	if n < 2 {
   353  		return "", ErrSyntax
   354  	}
   355  	quote := s[0]
   356  	if quote != s[n-1] {
   357  		return "", ErrSyntax
   358  	}
   359  	s = s[1 : n-1]
   360  
   361  	if quote == '`' {
   362  		if contains(s, '`') {
   363  			return "", ErrSyntax
   364  		}
   365  		if contains(s, '\r') {
   366  			// -1 because we know there is at least one \r to remove.
   367  			buf := make([]byte, 0, len(s)-1)
   368  			for i := 0; i < len(s); i++ {
   369  				if s[i] != '\r' {
   370  					buf = append(buf, s[i])
   371  				}
   372  			}
   373  			return string(buf), nil
   374  		}
   375  		return s, nil
   376  	}
   377  	if quote != '"' && quote != '\'' {
   378  		return "", ErrSyntax
   379  	}
   380  	if contains(s, '\n') {
   381  		return "", ErrSyntax
   382  	}
   383  
   384  	// Is it trivial? Avoid allocation.
   385  	if !contains(s, '\\') && !contains(s, quote) {
   386  		switch quote {
   387  		case '"':
   388  			return s, nil
   389  		case '\'':
   390  			r, size := utf8.DecodeRuneInString(s)
   391  			if size == len(s) && (r != utf8.RuneError || size != 1) {
   392  				return s, nil
   393  			}
   394  		}
   395  	}
   396  
   397  	var runeTmp [utf8.UTFMax]byte
   398  	buf := make([]byte, 0, 3*len(s)/2) // Try to avoid more allocations.
   399  	for len(s) > 0 {
   400  		c, multibyte, ss, err := UnquoteChar(s, quote)
   401  		if err != nil {
   402  			return "", err
   403  		}
   404  		s = ss
   405  		if c < utf8.RuneSelf || !multibyte {
   406  			buf = append(buf, byte(c))
   407  		} else {
   408  			n := utf8.EncodeRune(runeTmp[:], c)
   409  			buf = append(buf, runeTmp[:n]...)
   410  		}
   411  		if quote == '\'' && len(s) != 0 {
   412  			// single-quoted must be single character
   413  			return "", ErrSyntax
   414  		}
   415  	}
   416  	return string(buf), nil
   417  }
   418  
   // contains reports whether the byte c occurs anywhere in s.
   func contains(s string, c byte) bool {
   	for _, b := range []byte(s) {
   		if b == c {
   			return true
   		}
   	}
   	return false
   }
   428  
   // bsearch16 returns the smallest index i for which a[i] >= x, or len(a)
   // when no element satisfies that. The search is a binary search, so a is
   // expected to be in ascending order.
   func bsearch16(a []uint16, x uint16) int {
   	lo, hi := 0, len(a)
   	for lo < hi {
   		mid := lo + (hi-lo)/2
   		if a[mid] >= x {
   			hi = mid
   			continue
   		}
   		lo = mid + 1
   	}
   	return lo
   }
   443  
   // bsearch32 returns the smallest index i for which a[i] >= x, or len(a)
   // when no element satisfies that. The search is a binary search, so a is
   // expected to be in ascending order.
   func bsearch32(a []uint32, x uint32) int {
   	lo, hi := 0, len(a)
   	for lo < hi {
   		mid := lo + (hi-lo)/2
   		if a[mid] >= x {
   			hi = mid
   			continue
   		}
   		lo = mid + 1
   	}
   	return lo
   }
   458  
   459  // TODO: IsPrint is a local implementation of unicode.IsPrint, verified by the tests
   460  // to give the same answer. It allows this package not to depend on unicode,
   461  // and therefore not pull in all the Unicode tables. If the linker were better
   462  // at tossing unused tables, we could get rid of this implementation.
   463  // That would be nice.
   464  
   465  // IsPrint reports whether the rune is defined as printable by Go, with
   466  // the same definition as unicode.IsPrint: letters, numbers, punctuation,
   467  // symbols and ASCII space.
   468  func IsPrint(r rune) bool {
   469  	// Fast check for Latin-1
   470  	if r <= 0xFF {
   471  		if 0x20 <= r && r <= 0x7E {
   472  			// All the ASCII is printable from space through DEL-1.
   473  			return true
   474  		}
   475  		if 0xA1 <= r && r <= 0xFF {
   476  			// Similarly for ¡ through ÿ...
   477  			return r != 0xAD // ...except for the bizarre soft hyphen.
   478  		}
   479  		return false
   480  	}
   481  
   482  	// Same algorithm, either on uint16 or uint32 value.
   483  	// First, find first i such that isPrint[i] >= x.
   484  	// This is the index of either the start or end of a pair that might span x.
   485  	// The start is even (isPrint[i&^1]) and the end is odd (isPrint[i|1]).
   486  	// If we find x in a range, make sure x is not in isNotPrint list.
   487  
   488  	if 0 <= r && r < 1<<16 {
   489  		rr, isPrint, isNotPrint := uint16(r), isPrint16, isNotPrint16
   490  		i := bsearch16(isPrint, rr)
   491  		if i >= len(isPrint) || rr < isPrint[i&^1] || isPrint[i|1] < rr {
   492  			return false
   493  		}
   494  		j := bsearch16(isNotPrint, rr)
   495  		return j >= len(isNotPrint) || isNotPrint[j] != rr
   496  	}
   497  
   498  	rr, isPrint, isNotPrint := uint32(r), isPrint32, isNotPrint32
   499  	i := bsearch32(isPrint, rr)
   500  	if i >= len(isPrint) || rr < isPrint[i&^1] || isPrint[i|1] < rr {
   501  		return false
   502  	}
   503  	if r >= 0x20000 {
   504  		return true
   505  	}
   506  	r -= 0x10000
   507  	j := bsearch16(isNotPrint, uint16(r))
   508  	return j >= len(isNotPrint) || isNotPrint[j] != uint16(r)
   509  }
   510  
   511  // IsGraphic reports whether the rune is defined as a Graphic by Unicode. Such
   512  // characters include letters, marks, numbers, punctuation, symbols, and
   513  // spaces, from categories L, M, N, P, S, and Zs.
   514  func IsGraphic(r rune) bool {
   515  	if IsPrint(r) {
   516  		return true
   517  	}
   518  	return isInGraphicList(r)
   519  }
   520  
   521  // isInGraphicList reports whether the rune is in the isGraphic list. This separation
   522  // from IsGraphic allows quoteWith to avoid two calls to IsPrint.
   523  // Should be called only if IsPrint fails.
   524  func isInGraphicList(r rune) bool {
   525  	// We know r must fit in 16 bits - see makeisprint.go.
   526  	if r > 0xFFFF {
   527  		return false
   528  	}
   529  	rr := uint16(r)
   530  	i := bsearch16(isGraphic, rr)
   531  	return i < len(isGraphic) && rr == isGraphic[i]
   532  }
   533  

View as plain text