escape.go

     1  // Copyright 2010 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  // Package html provides functions for escaping and unescaping HTML text.
     6  package html
     7  
     8  import (
     9  	"strings"
    10  	"unicode/utf8"
    11  )
    12  
    13  // These replacements permit compatibility with old numeric entities that
    14  // assumed Windows-1252 encoding.
    15  // https://html.spec.whatwg.org/multipage/parsing.html#numeric-character-reference-end-state
    16  var replacementTable = [...]rune{
    17  	'\u20AC', // First entry is what 0x80 should be replaced with.
    18  	'\u0081',
    19  	'\u201A',
    20  	'\u0192',
    21  	'\u201E',
    22  	'\u2026',
    23  	'\u2020',
    24  	'\u2021',
    25  	'\u02C6',
    26  	'\u2030',
    27  	'\u0160',
    28  	'\u2039',
    29  	'\u0152',
    30  	'\u008D',
    31  	'\u017D',
    32  	'\u008F',
    33  	'\u0090',
    34  	'\u2018',
    35  	'\u2019',
    36  	'\u201C',
    37  	'\u201D',
    38  	'\u2022',
    39  	'\u2013',
    40  	'\u2014',
    41  	'\u02DC',
    42  	'\u2122',
    43  	'\u0161',
    44  	'\u203A',
    45  	'\u0153',
    46  	'\u009D',
    47  	'\u017E',
    48  	'\u0178', // Last entry is 0x9F.
    49  	// 0x00->'\uFFFD' is handled programmatically.
    50  	// 0x0D->'\u000D' is a no-op.
    51  }
    52  
    53  // unescapeEntity reads an entity like "&lt;" from b[src:] and writes the
    54  // corresponding "<" to b[dst:], returning the incremented dst and src cursors.
    55  // Precondition: b[src] == '&' && dst <= src.
    56  func unescapeEntity(b []byte, dst, src int) (dst1, src1 int) {
    57  	const attribute = false
    58  
    59  	// http://www.whatwg.org/specs/web-apps/current-work/multipage/tokenization.html#consume-a-character-reference
    60  
    61  	// i starts at 1 because we already know that s[0] == '&'.
    62  	i, s := 1, b[src:]
    63  
    64  	if len(s) <= 1 {
    65  		b[dst] = b[src]
    66  		return dst + 1, src + 1
    67  	}
    68  
    69  	if s[i] == '#' {
    70  		if len(s) <= 3 { // We need to have at least "&#.".
    71  			b[dst] = b[src]
    72  			return dst + 1, src + 1
    73  		}
    74  		i++
    75  		c := s[i]
    76  		hex := false
    77  		if c == 'x' || c == 'X' {
    78  			hex = true
    79  			i++
    80  		}
    81  
    82  		x := '\x00'
    83  		for i < len(s) {
    84  			c = s[i]
    85  			i++
    86  			if hex {
    87  				if '0' <= c && c <= '9' {
    88  					x = 16*x + rune(c) - '0'
    89  					continue
    90  				} else if 'a' <= c && c <= 'f' {
    91  					x = 16*x + rune(c) - 'a' + 10
    92  					continue
    93  				} else if 'A' <= c && c <= 'F' {
    94  					x = 16*x + rune(c) - 'A' + 10
    95  					continue
    96  				}
    97  			} else if '0' <= c && c <= '9' {
    98  				x = 10*x + rune(c) - '0'
    99  				continue
   100  			}
   101  			if c != ';' {
   102  				i--
   103  			}
   104  			break
   105  		}
   106  
   107  		if i <= 3 { // No characters matched.
   108  			b[dst] = b[src]
   109  			return dst + 1, src + 1
   110  		}
   111  
   112  		if 0x80 <= x && x <= 0x9F {
   113  			// Replace characters from Windows-1252 with UTF-8 equivalents.
   114  			x = replacementTable[x-0x80]
   115  		} else if x == 0 || (0xD800 <= x && x <= 0xDFFF) || x > 0x10FFFF {
   116  			// Replace invalid characters with the replacement character.
   117  			x = '\uFFFD'
   118  		}
   119  
   120  		return dst + utf8.EncodeRune(b[dst:], x), src + i
   121  	}
   122  
   123  	// Consume the maximum number of characters possible, with the
   124  	// consumed characters matching one of the named references.
   125  
   126  	for i < len(s) {
   127  		c := s[i]
   128  		i++
   129  		// Lower-cased characters are more common in entities, so we check for them first.
   130  		if 'a' <= c && c <= 'z' || 'A' <= c && c <= 'Z' || '0' <= c && c <= '9' {
   131  			continue
   132  		}
   133  		if c != ';' {
   134  			i--
   135  		}
   136  		break
   137  	}
   138  
   139  	entityName := s[1:i]
   140  	if len(entityName) == 0 {
   141  		// No-op.
   142  	} else if attribute && entityName[len(entityName)-1] != ';' && len(s) > i && s[i] == '=' {
   143  		// No-op.
   144  	} else if x := entity[string(entityName)]; x != 0 {
   145  		return dst + utf8.EncodeRune(b[dst:], x), src + i
   146  	} else if x := entity2[string(entityName)]; x[0] != 0 {
   147  		dst1 := dst + utf8.EncodeRune(b[dst:], x[0])
   148  		return dst1 + utf8.EncodeRune(b[dst1:], x[1]), src + i
   149  	} else if !attribute {
   150  		maxLen := len(entityName) - 1
   151  		if maxLen > longestEntityWithoutSemicolon {
   152  			maxLen = longestEntityWithoutSemicolon
   153  		}
   154  		for j := maxLen; j > 1; j-- {
   155  			if x := entity[string(entityName[:j])]; x != 0 {
   156  				return dst + utf8.EncodeRune(b[dst:], x), src + j + 1
   157  			}
   158  		}
   159  	}
   160  
   161  	dst1, src1 = dst+i, src+i
   162  	copy(b[dst:dst1], b[src:src1])
   163  	return dst1, src1
   164  }
   165  
   166  var htmlEscaper = strings.NewReplacer(
   167  	`&`, "&amp;",
   168  	`'`, "&#39;", // "&#39;" is shorter than "&apos;" and apos was not in HTML until HTML5.
   169  	`<`, "&lt;",
   170  	`>`, "&gt;",
   171  	`"`, "&#34;", // "&#34;" is shorter than "&quot;".
   172  )
   173  
   174  // EscapeString escapes special characters like "<" to become "&lt;". It
   175  // escapes only five such characters: <, >, &, ' and ".
   176  // UnescapeString(EscapeString(s)) == s always holds, but the converse isn't
   177  // always true.
   178  func EscapeString(s string) string {
   179  	return htmlEscaper.Replace(s)
   180  }
   181  
   182  // UnescapeString unescapes entities like "&lt;" to become "<". It unescapes a
   183  // larger range of entities than EscapeString escapes. For example, "&aacute;"
   184  // unescapes to "á", as does "&#225;" and "&#xE1;".
   185  // UnescapeString(EscapeString(s)) == s always holds, but the converse isn't
   186  // always true.
   187  func UnescapeString(s string) string {
   188  	populateMapsOnce.Do(populateMaps)
   189  	i := strings.IndexByte(s, '&')
   190  
   191  	if i < 0 {
   192  		return s
   193  	}
   194  
   195  	b := []byte(s)
   196  	dst, src := unescapeEntity(b, i, i)
   197  	for len(s[src:]) > 0 {
   198  		if s[src] == '&' {
   199  			i = 0
   200  		} else {
   201  			i = strings.IndexByte(s[src:], '&')
   202  		}
   203  		if i < 0 {
   204  			dst += copy(b[dst:], s[src:])
   205  			break
   206  		}
   207  
   208  		if i > 0 {
   209  			copy(b[dst:], s[src:src+i])
   210  		}
   211  		dst, src = unescapeEntity(b, dst+i, src+i)
   212  	}
   213  	return string(b[:dst])
   214  }
   215
View as plain text