...
Run Format

Source file src/strconv/quote.go

Documentation: strconv

     1  // Copyright 2009 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  //go:generate go run makeisprint.go -output isprint.go
     6  
     7  package strconv
     8  
     9  import (
    10  	"internal/bytealg"
    11  	"unicode/utf8"
    12  )
    13  
// lowerhex holds the lowercase hexadecimal digits. It is indexed by a
// 4-bit value to produce the digits of \x, \u and \U escape sequences.
const lowerhex = "0123456789abcdef"
    15  
    16  func quoteWith(s string, quote byte, ASCIIonly, graphicOnly bool) string {
    17  	return string(appendQuotedWith(make([]byte, 0, 3*len(s)/2), s, quote, ASCIIonly, graphicOnly))
    18  }
    19  
    20  func quoteRuneWith(r rune, quote byte, ASCIIonly, graphicOnly bool) string {
    21  	return string(appendQuotedRuneWith(nil, r, quote, ASCIIonly, graphicOnly))
    22  }
    23  
    24  func appendQuotedWith(buf []byte, s string, quote byte, ASCIIonly, graphicOnly bool) []byte {
    25  	buf = append(buf, quote)
    26  	for width := 0; len(s) > 0; s = s[width:] {
    27  		r := rune(s[0])
    28  		width = 1
    29  		if r >= utf8.RuneSelf {
    30  			r, width = utf8.DecodeRuneInString(s)
    31  		}
    32  		if width == 1 && r == utf8.RuneError {
    33  			buf = append(buf, `\x`...)
    34  			buf = append(buf, lowerhex[s[0]>>4])
    35  			buf = append(buf, lowerhex[s[0]&0xF])
    36  			continue
    37  		}
    38  		buf = appendEscapedRune(buf, r, quote, ASCIIonly, graphicOnly)
    39  	}
    40  	buf = append(buf, quote)
    41  	return buf
    42  }
    43  
    44  func appendQuotedRuneWith(buf []byte, r rune, quote byte, ASCIIonly, graphicOnly bool) []byte {
    45  	buf = append(buf, quote)
    46  	if !utf8.ValidRune(r) {
    47  		r = utf8.RuneError
    48  	}
    49  	buf = appendEscapedRune(buf, r, quote, ASCIIonly, graphicOnly)
    50  	buf = append(buf, quote)
    51  	return buf
    52  }
    53  
    54  func appendEscapedRune(buf []byte, r rune, quote byte, ASCIIonly, graphicOnly bool) []byte {
    55  	var runeTmp [utf8.UTFMax]byte
    56  	if r == rune(quote) || r == '\\' { // always backslashed
    57  		buf = append(buf, '\\')
    58  		buf = append(buf, byte(r))
    59  		return buf
    60  	}
    61  	if ASCIIonly {
    62  		if r < utf8.RuneSelf && IsPrint(r) {
    63  			buf = append(buf, byte(r))
    64  			return buf
    65  		}
    66  	} else if IsPrint(r) || graphicOnly && isInGraphicList(r) {
    67  		n := utf8.EncodeRune(runeTmp[:], r)
    68  		buf = append(buf, runeTmp[:n]...)
    69  		return buf
    70  	}
    71  	switch r {
    72  	case '\a':
    73  		buf = append(buf, `\a`...)
    74  	case '\b':
    75  		buf = append(buf, `\b`...)
    76  	case '\f':
    77  		buf = append(buf, `\f`...)
    78  	case '\n':
    79  		buf = append(buf, `\n`...)
    80  	case '\r':
    81  		buf = append(buf, `\r`...)
    82  	case '\t':
    83  		buf = append(buf, `\t`...)
    84  	case '\v':
    85  		buf = append(buf, `\v`...)
    86  	default:
    87  		switch {
    88  		case r < ' ':
    89  			buf = append(buf, `\x`...)
    90  			buf = append(buf, lowerhex[byte(r)>>4])
    91  			buf = append(buf, lowerhex[byte(r)&0xF])
    92  		case r > utf8.MaxRune:
    93  			r = 0xFFFD
    94  			fallthrough
    95  		case r < 0x10000:
    96  			buf = append(buf, `\u`...)
    97  			for s := 12; s >= 0; s -= 4 {
    98  				buf = append(buf, lowerhex[r>>uint(s)&0xF])
    99  			}
   100  		default:
   101  			buf = append(buf, `\U`...)
   102  			for s := 28; s >= 0; s -= 4 {
   103  				buf = append(buf, lowerhex[r>>uint(s)&0xF])
   104  			}
   105  		}
   106  	}
   107  	return buf
   108  }
   109  
   110  // Quote returns a double-quoted Go string literal representing s. The
   111  // returned string uses Go escape sequences (\t, \n, \xFF, \u0100) for
   112  // control characters and non-printable characters as defined by
   113  // IsPrint.
   114  func Quote(s string) string {
   115  	return quoteWith(s, '"', false, false)
   116  }
   117  
   118  // AppendQuote appends a double-quoted Go string literal representing s,
   119  // as generated by Quote, to dst and returns the extended buffer.
   120  func AppendQuote(dst []byte, s string) []byte {
   121  	return appendQuotedWith(dst, s, '"', false, false)
   122  }
   123  
   124  // QuoteToASCII returns a double-quoted Go string literal representing s.
   125  // The returned string uses Go escape sequences (\t, \n, \xFF, \u0100) for
   126  // non-ASCII characters and non-printable characters as defined by IsPrint.
   127  func QuoteToASCII(s string) string {
   128  	return quoteWith(s, '"', true, false)
   129  }
   130  
   131  // AppendQuoteToASCII appends a double-quoted Go string literal representing s,
   132  // as generated by QuoteToASCII, to dst and returns the extended buffer.
   133  func AppendQuoteToASCII(dst []byte, s string) []byte {
   134  	return appendQuotedWith(dst, s, '"', true, false)
   135  }
   136  
   137  // QuoteToGraphic returns a double-quoted Go string literal representing s.
   138  // The returned string uses Go escape sequences (\t, \n, \xFF, \u0100) for
   139  // non-ASCII characters and non-printable characters as defined by IsGraphic.
   140  func QuoteToGraphic(s string) string {
   141  	return quoteWith(s, '"', false, true)
   142  }
   143  
   144  // AppendQuoteToGraphic appends a double-quoted Go string literal representing s,
   145  // as generated by QuoteToGraphic, to dst and returns the extended buffer.
   146  func AppendQuoteToGraphic(dst []byte, s string) []byte {
   147  	return appendQuotedWith(dst, s, '"', false, true)
   148  }
   149  
   150  // QuoteRune returns a single-quoted Go character literal representing the
   151  // rune. The returned string uses Go escape sequences (\t, \n, \xFF, \u0100)
   152  // for control characters and non-printable characters as defined by IsPrint.
   153  func QuoteRune(r rune) string {
   154  	return quoteRuneWith(r, '\'', false, false)
   155  }
   156  
   157  // AppendQuoteRune appends a single-quoted Go character literal representing the rune,
   158  // as generated by QuoteRune, to dst and returns the extended buffer.
   159  func AppendQuoteRune(dst []byte, r rune) []byte {
   160  	return appendQuotedRuneWith(dst, r, '\'', false, false)
   161  }
   162  
   163  // QuoteRuneToASCII returns a single-quoted Go character literal representing
   164  // the rune. The returned string uses Go escape sequences (\t, \n, \xFF,
   165  // \u0100) for non-ASCII characters and non-printable characters as defined
   166  // by IsPrint.
   167  func QuoteRuneToASCII(r rune) string {
   168  	return quoteRuneWith(r, '\'', true, false)
   169  }
   170  
   171  // AppendQuoteRuneToASCII appends a single-quoted Go character literal representing the rune,
   172  // as generated by QuoteRuneToASCII, to dst and returns the extended buffer.
   173  func AppendQuoteRuneToASCII(dst []byte, r rune) []byte {
   174  	return appendQuotedRuneWith(dst, r, '\'', true, false)
   175  }
   176  
   177  // QuoteRuneToGraphic returns a single-quoted Go character literal representing
   178  // the rune. The returned string uses Go escape sequences (\t, \n, \xFF,
   179  // \u0100) for non-ASCII characters and non-printable characters as defined
   180  // by IsGraphic.
   181  func QuoteRuneToGraphic(r rune) string {
   182  	return quoteRuneWith(r, '\'', false, true)
   183  }
   184  
   185  // AppendQuoteRuneToGraphic appends a single-quoted Go character literal representing the rune,
   186  // as generated by QuoteRuneToGraphic, to dst and returns the extended buffer.
   187  func AppendQuoteRuneToGraphic(dst []byte, r rune) []byte {
   188  	return appendQuotedRuneWith(dst, r, '\'', false, true)
   189  }
   190  
   191  // CanBackquote reports whether the string s can be represented
   192  // unchanged as a single-line backquoted string without control
   193  // characters other than tab.
   194  func CanBackquote(s string) bool {
   195  	for len(s) > 0 {
   196  		r, wid := utf8.DecodeRuneInString(s)
   197  		s = s[wid:]
   198  		if wid > 1 {
   199  			if r == '\ufeff' {
   200  				return false // BOMs are invisible and should not be quoted.
   201  			}
   202  			continue // All other multibyte runes are correctly encoded and assumed printable.
   203  		}
   204  		if r == utf8.RuneError {
   205  			return false
   206  		}
   207  		if (r < ' ' && r != '\t') || r == '`' || r == '\u007F' {
   208  			return false
   209  		}
   210  	}
   211  	return true
   212  }
   213  
   214  func unhex(b byte) (v rune, ok bool) {
   215  	c := rune(b)
   216  	switch {
   217  	case '0' <= c && c <= '9':
   218  		return c - '0', true
   219  	case 'a' <= c && c <= 'f':
   220  		return c - 'a' + 10, true
   221  	case 'A' <= c && c <= 'F':
   222  		return c - 'A' + 10, true
   223  	}
   224  	return
   225  }
   226  
   227  // UnquoteChar decodes the first character or byte in the escaped string
   228  // or character literal represented by the string s.
   229  // It returns four values:
   230  //
   231  //	1) value, the decoded Unicode code point or byte value;
   232  //	2) multibyte, a boolean indicating whether the decoded character requires a multibyte UTF-8 representation;
   233  //	3) tail, the remainder of the string after the character; and
   234  //	4) an error that will be nil if the character is syntactically valid.
   235  //
   236  // The second argument, quote, specifies the type of literal being parsed
   237  // and therefore which escaped quote character is permitted.
   238  // If set to a single quote, it permits the sequence \' and disallows unescaped '.
   239  // If set to a double quote, it permits \" and disallows unescaped ".
   240  // If set to zero, it does not permit either escape and allows both quote characters to appear unescaped.
   241  func UnquoteChar(s string, quote byte) (value rune, multibyte bool, tail string, err error) {
   242  	// easy cases
   243  	if len(s) == 0 {
   244  		err = ErrSyntax
   245  		return
   246  	}
   247  	switch c := s[0]; {
   248  	case c == quote && (quote == '\'' || quote == '"'):
   249  		err = ErrSyntax
   250  		return
   251  	case c >= utf8.RuneSelf:
   252  		r, size := utf8.DecodeRuneInString(s)
   253  		return r, true, s[size:], nil
   254  	case c != '\\':
   255  		return rune(s[0]), false, s[1:], nil
   256  	}
   257  
   258  	// hard case: c is backslash
   259  	if len(s) <= 1 {
   260  		err = ErrSyntax
   261  		return
   262  	}
   263  	c := s[1]
   264  	s = s[2:]
   265  
   266  	switch c {
   267  	case 'a':
   268  		value = '\a'
   269  	case 'b':
   270  		value = '\b'
   271  	case 'f':
   272  		value = '\f'
   273  	case 'n':
   274  		value = '\n'
   275  	case 'r':
   276  		value = '\r'
   277  	case 't':
   278  		value = '\t'
   279  	case 'v':
   280  		value = '\v'
   281  	case 'x', 'u', 'U':
   282  		n := 0
   283  		switch c {
   284  		case 'x':
   285  			n = 2
   286  		case 'u':
   287  			n = 4
   288  		case 'U':
   289  			n = 8
   290  		}
   291  		var v rune
   292  		if len(s) < n {
   293  			err = ErrSyntax
   294  			return
   295  		}
   296  		for j := 0; j < n; j++ {
   297  			x, ok := unhex(s[j])
   298  			if !ok {
   299  				err = ErrSyntax
   300  				return
   301  			}
   302  			v = v<<4 | x
   303  		}
   304  		s = s[n:]
   305  		if c == 'x' {
   306  			// single-byte string, possibly not UTF-8
   307  			value = v
   308  			break
   309  		}
   310  		if v > utf8.MaxRune {
   311  			err = ErrSyntax
   312  			return
   313  		}
   314  		value = v
   315  		multibyte = true
   316  	case '0', '1', '2', '3', '4', '5', '6', '7':
   317  		v := rune(c) - '0'
   318  		if len(s) < 2 {
   319  			err = ErrSyntax
   320  			return
   321  		}
   322  		for j := 0; j < 2; j++ { // one digit already; two more
   323  			x := rune(s[j]) - '0'
   324  			if x < 0 || x > 7 {
   325  				err = ErrSyntax
   326  				return
   327  			}
   328  			v = (v << 3) | x
   329  		}
   330  		s = s[2:]
   331  		if v > 255 {
   332  			err = ErrSyntax
   333  			return
   334  		}
   335  		value = v
   336  	case '\\':
   337  		value = '\\'
   338  	case '\'', '"':
   339  		if c != quote {
   340  			err = ErrSyntax
   341  			return
   342  		}
   343  		value = rune(c)
   344  	default:
   345  		err = ErrSyntax
   346  		return
   347  	}
   348  	tail = s
   349  	return
   350  }
   351  
   352  // Unquote interprets s as a single-quoted, double-quoted,
   353  // or backquoted Go string literal, returning the string value
   354  // that s quotes.  (If s is single-quoted, it would be a Go
   355  // character literal; Unquote returns the corresponding
   356  // one-character string.)
   357  func Unquote(s string) (string, error) {
   358  	n := len(s)
   359  	if n < 2 {
   360  		return "", ErrSyntax
   361  	}
   362  	quote := s[0]
   363  	if quote != s[n-1] {
   364  		return "", ErrSyntax
   365  	}
   366  	s = s[1 : n-1]
   367  
   368  	if quote == '`' {
   369  		if contains(s, '`') {
   370  			return "", ErrSyntax
   371  		}
   372  		if contains(s, '\r') {
   373  			// -1 because we know there is at least one \r to remove.
   374  			buf := make([]byte, 0, len(s)-1)
   375  			for i := 0; i < len(s); i++ {
   376  				if s[i] != '\r' {
   377  					buf = append(buf, s[i])
   378  				}
   379  			}
   380  			return string(buf), nil
   381  		}
   382  		return s, nil
   383  	}
   384  	if quote != '"' && quote != '\'' {
   385  		return "", ErrSyntax
   386  	}
   387  	if contains(s, '\n') {
   388  		return "", ErrSyntax
   389  	}
   390  
   391  	// Is it trivial? Avoid allocation.
   392  	if !contains(s, '\\') && !contains(s, quote) {
   393  		switch quote {
   394  		case '"':
   395  			if utf8.ValidString(s) {
   396  				return s, nil
   397  			}
   398  		case '\'':
   399  			r, size := utf8.DecodeRuneInString(s)
   400  			if size == len(s) && (r != utf8.RuneError || size != 1) {
   401  				return s, nil
   402  			}
   403  		}
   404  	}
   405  
   406  	var runeTmp [utf8.UTFMax]byte
   407  	buf := make([]byte, 0, 3*len(s)/2) // Try to avoid more allocations.
   408  	for len(s) > 0 {
   409  		c, multibyte, ss, err := UnquoteChar(s, quote)
   410  		if err != nil {
   411  			return "", err
   412  		}
   413  		s = ss
   414  		if c < utf8.RuneSelf || !multibyte {
   415  			buf = append(buf, byte(c))
   416  		} else {
   417  			n := utf8.EncodeRune(runeTmp[:], c)
   418  			buf = append(buf, runeTmp[:n]...)
   419  		}
   420  		if quote == '\'' && len(s) != 0 {
   421  			// single-quoted must be single character
   422  			return "", ErrSyntax
   423  		}
   424  	}
   425  	return string(buf), nil
   426  }
   427  
   428  // contains reports whether the string contains the byte c.
   429  func contains(s string, c byte) bool {
   430  	return bytealg.IndexByteString(s, c) != -1
   431  }
   432  
   433  // bsearch16 returns the smallest i such that a[i] >= x.
   434  // If there is no such i, bsearch16 returns len(a).
   435  func bsearch16(a []uint16, x uint16) int {
   436  	i, j := 0, len(a)
   437  	for i < j {
   438  		h := i + (j-i)/2
   439  		if a[h] < x {
   440  			i = h + 1
   441  		} else {
   442  			j = h
   443  		}
   444  	}
   445  	return i
   446  }
   447  
   448  // bsearch32 returns the smallest i such that a[i] >= x.
   449  // If there is no such i, bsearch32 returns len(a).
   450  func bsearch32(a []uint32, x uint32) int {
   451  	i, j := 0, len(a)
   452  	for i < j {
   453  		h := i + (j-i)/2
   454  		if a[h] < x {
   455  			i = h + 1
   456  		} else {
   457  			j = h
   458  		}
   459  	}
   460  	return i
   461  }
   462  
   463  // TODO: IsPrint is a local implementation of unicode.IsPrint, verified by the tests
   464  // to give the same answer. It allows this package not to depend on unicode,
   465  // and therefore not pull in all the Unicode tables. If the linker were better
   466  // at tossing unused tables, we could get rid of this implementation.
   467  // That would be nice.
   468  
   469  // IsPrint reports whether the rune is defined as printable by Go, with
   470  // the same definition as unicode.IsPrint: letters, numbers, punctuation,
   471  // symbols and ASCII space.
   472  func IsPrint(r rune) bool {
   473  	// Fast check for Latin-1
   474  	if r <= 0xFF {
   475  		if 0x20 <= r && r <= 0x7E {
   476  			// All the ASCII is printable from space through DEL-1.
   477  			return true
   478  		}
   479  		if 0xA1 <= r && r <= 0xFF {
   480  			// Similarly for ¡ through ÿ...
   481  			return r != 0xAD // ...except for the bizarre soft hyphen.
   482  		}
   483  		return false
   484  	}
   485  
   486  	// Same algorithm, either on uint16 or uint32 value.
   487  	// First, find first i such that isPrint[i] >= x.
   488  	// This is the index of either the start or end of a pair that might span x.
   489  	// The start is even (isPrint[i&^1]) and the end is odd (isPrint[i|1]).
   490  	// If we find x in a range, make sure x is not in isNotPrint list.
   491  
   492  	if 0 <= r && r < 1<<16 {
   493  		rr, isPrint, isNotPrint := uint16(r), isPrint16, isNotPrint16
   494  		i := bsearch16(isPrint, rr)
   495  		if i >= len(isPrint) || rr < isPrint[i&^1] || isPrint[i|1] < rr {
   496  			return false
   497  		}
   498  		j := bsearch16(isNotPrint, rr)
   499  		return j >= len(isNotPrint) || isNotPrint[j] != rr
   500  	}
   501  
   502  	rr, isPrint, isNotPrint := uint32(r), isPrint32, isNotPrint32
   503  	i := bsearch32(isPrint, rr)
   504  	if i >= len(isPrint) || rr < isPrint[i&^1] || isPrint[i|1] < rr {
   505  		return false
   506  	}
   507  	if r >= 0x20000 {
   508  		return true
   509  	}
   510  	r -= 0x10000
   511  	j := bsearch16(isNotPrint, uint16(r))
   512  	return j >= len(isNotPrint) || isNotPrint[j] != uint16(r)
   513  }
   514  
   515  // IsGraphic reports whether the rune is defined as a Graphic by Unicode. Such
   516  // characters include letters, marks, numbers, punctuation, symbols, and
   517  // spaces, from categories L, M, N, P, S, and Zs.
   518  func IsGraphic(r rune) bool {
   519  	if IsPrint(r) {
   520  		return true
   521  	}
   522  	return isInGraphicList(r)
   523  }
   524  
   525  // isInGraphicList reports whether the rune is in the isGraphic list. This separation
   526  // from IsGraphic allows quoteWith to avoid two calls to IsPrint.
   527  // Should be called only if IsPrint fails.
   528  func isInGraphicList(r rune) bool {
   529  	// We know r must fit in 16 bits - see makeisprint.go.
   530  	if r > 0xFFFF {
   531  		return false
   532  	}
   533  	rr := uint16(r)
   534  	i := bsearch16(isGraphic, rr)
   535  	return i < len(isGraphic) && rr == isGraphic[i]
   536  }
   537  

View as plain text