encodedword.go

     1  // Copyright 2015 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package mime
     6  
     7  import (
     8  	"bytes"
     9  	"encoding/base64"
    10  	"errors"
    11  	"fmt"
    12  	"io"
    13  	"strings"
    14  	"unicode"
    15  	"unicode/utf8"
    16  )
    17  
    18  // A WordEncoder is an RFC 2047 encoded-word encoder.
    19  type WordEncoder byte
    20  
    21  const (
    22  	// BEncoding represents Base64 encoding scheme as defined by RFC 2045.
    23  	BEncoding = WordEncoder('b')
    24  	// QEncoding represents the Q-encoding scheme as defined by RFC 2047.
    25  	QEncoding = WordEncoder('q')
    26  )
    27  
    28  var (
    29  	errInvalidWord = errors.New("mime: invalid RFC 2047 encoded-word")
    30  )
    31  
    32  // Encode returns the encoded-word form of s. If s is ASCII without special
    33  // characters, it is returned unchanged. The provided charset is the IANA
    34  // charset name of s. It is case insensitive.
    35  func (e WordEncoder) Encode(charset, s string) string {
    36  	if !needsEncoding(s) {
    37  		return s
    38  	}
    39  	return e.encodeWord(charset, s)
    40  }
    41  
    42  func needsEncoding(s string) bool {
    43  	for _, b := range s {
    44  		if (b < ' ' || b > '~') && b != '\t' {
    45  			return true
    46  		}
    47  	}
    48  	return false
    49  }
    50  
    51  // encodeWord encodes a string into an encoded-word.
    52  func (e WordEncoder) encodeWord(charset, s string) string {
    53  	var buf strings.Builder
    54  	// Could use a hint like len(s)*3, but that's not enough for cases
    55  	// with word splits and too much for simpler inputs.
    56  	// 48 is close to maxEncodedWordLen/2, but adjusted to allocator size class.
    57  	buf.Grow(48)
    58  
    59  	e.openWord(&buf, charset)
    60  	if e == BEncoding {
    61  		e.bEncode(&buf, charset, s)
    62  	} else {
    63  		e.qEncode(&buf, charset, s)
    64  	}
    65  	closeWord(&buf)
    66  
    67  	return buf.String()
    68  }
    69  
    70  const (
    71  	// The maximum length of an encoded-word is 75 characters.
    72  	// See RFC 2047, section 2.
    73  	maxEncodedWordLen = 75
    74  	// maxContentLen is how much content can be encoded, ignoring the header and
    75  	// 2-byte footer.
    76  	maxContentLen = maxEncodedWordLen - len("=?UTF-8?q?") - len("?=")
    77  )
    78  
    79  var maxBase64Len = base64.StdEncoding.DecodedLen(maxContentLen)
    80  
    81  // bEncode encodes s using base64 encoding and writes it to buf.
    82  func (e WordEncoder) bEncode(buf *strings.Builder, charset, s string) {
    83  	w := base64.NewEncoder(base64.StdEncoding, buf)
    84  	// If the charset is not UTF-8 or if the content is short, do not bother
    85  	// splitting the encoded-word.
    86  	if !isUTF8(charset) || base64.StdEncoding.EncodedLen(len(s)) <= maxContentLen {
    87  		io.WriteString(w, s)
    88  		w.Close()
    89  		return
    90  	}
    91  
    92  	var currentLen, last, runeLen int
    93  	for i := 0; i < len(s); i += runeLen {
    94  		// Multi-byte characters must not be split across encoded-words.
    95  		// See RFC 2047, section 5.3.
    96  		_, runeLen = utf8.DecodeRuneInString(s[i:])
    97  
    98  		if currentLen+runeLen <= maxBase64Len {
    99  			currentLen += runeLen
   100  		} else {
   101  			io.WriteString(w, s[last:i])
   102  			w.Close()
   103  			e.splitWord(buf, charset)
   104  			last = i
   105  			currentLen = runeLen
   106  		}
   107  	}
   108  	io.WriteString(w, s[last:])
   109  	w.Close()
   110  }
   111  
   112  // qEncode encodes s using Q encoding and writes it to buf. It splits the
   113  // encoded-words when necessary.
   114  func (e WordEncoder) qEncode(buf *strings.Builder, charset, s string) {
   115  	// We only split encoded-words when the charset is UTF-8.
   116  	if !isUTF8(charset) {
   117  		writeQString(buf, s)
   118  		return
   119  	}
   120  
   121  	var currentLen, runeLen int
   122  	for i := 0; i < len(s); i += runeLen {
   123  		b := s[i]
   124  		// Multi-byte characters must not be split across encoded-words.
   125  		// See RFC 2047, section 5.3.
   126  		var encLen int
   127  		if b >= ' ' && b <= '~' && b != '=' && b != '?' && b != '_' {
   128  			runeLen, encLen = 1, 1
   129  		} else {
   130  			_, runeLen = utf8.DecodeRuneInString(s[i:])
   131  			encLen = 3 * runeLen
   132  		}
   133  
   134  		if currentLen+encLen > maxContentLen {
   135  			e.splitWord(buf, charset)
   136  			currentLen = 0
   137  		}
   138  		writeQString(buf, s[i:i+runeLen])
   139  		currentLen += encLen
   140  	}
   141  }
   142  
   143  // writeQString encodes s using Q encoding and writes it to buf.
   144  func writeQString(buf *strings.Builder, s string) {
   145  	for i := 0; i < len(s); i++ {
   146  		switch b := s[i]; {
   147  		case b == ' ':
   148  			buf.WriteByte('_')
   149  		case b >= '!' && b <= '~' && b != '=' && b != '?' && b != '_':
   150  			buf.WriteByte(b)
   151  		default:
   152  			buf.WriteByte('=')
   153  			buf.WriteByte(upperhex[b>>4])
   154  			buf.WriteByte(upperhex[b&0x0f])
   155  		}
   156  	}
   157  }
   158  
   159  // openWord writes the beginning of an encoded-word into buf.
   160  func (e WordEncoder) openWord(buf *strings.Builder, charset string) {
   161  	buf.WriteString("=?")
   162  	buf.WriteString(charset)
   163  	buf.WriteByte('?')
   164  	buf.WriteByte(byte(e))
   165  	buf.WriteByte('?')
   166  }
   167  
   168  // closeWord writes the end of an encoded-word into buf.
   169  func closeWord(buf *strings.Builder) {
   170  	buf.WriteString("?=")
   171  }
   172  
   173  // splitWord closes the current encoded-word and opens a new one.
   174  func (e WordEncoder) splitWord(buf *strings.Builder, charset string) {
   175  	closeWord(buf)
   176  	buf.WriteByte(' ')
   177  	e.openWord(buf, charset)
   178  }
   179  
   180  func isUTF8(charset string) bool {
   181  	return strings.EqualFold(charset, "UTF-8")
   182  }
   183  
   184  const upperhex = "0123456789ABCDEF"
   185  
// A WordDecoder decodes MIME headers containing RFC 2047 encoded-words.
//
// With a nil CharsetReader only the utf-8, iso-8859-1 and us-ascii charsets
// can be decoded; any other charset makes decoding fail with an
// "unhandled charset" error.
type WordDecoder struct {
	// CharsetReader, if non-nil, defines a function to generate
	// charset-conversion readers, converting from the provided
	// charset into UTF-8.
	// The charset name is always passed in lower case. The utf-8, iso-8859-1
	// and us-ascii charsets are handled by default and never reach
	// CharsetReader. input yields the already base64- or Q-decoded bytes of
	// the encoded-word.
	// One of the CharsetReader's result values must be non-nil.
	CharsetReader func(charset string, input io.Reader) (io.Reader, error)
}
   196  
   197  // Decode decodes an RFC 2047 encoded-word.
   198  func (d *WordDecoder) Decode(word string) (string, error) {
   199  	// See https://tools.ietf.org/html/rfc2047#section-2 for details.
   200  	// Our decoder is permissive, we accept empty encoded-text.
   201  	if len(word) < 8 || !strings.HasPrefix(word, "=?") || !strings.HasSuffix(word, "?=") || strings.Count(word, "?") != 4 {
   202  		return "", errInvalidWord
   203  	}
   204  	word = word[2 : len(word)-2]
   205  
   206  	// split word "UTF-8?q?text" into "UTF-8", 'q', and "text"
   207  	charset, text, _ := strings.Cut(word, "?")
   208  	if charset == "" {
   209  		return "", errInvalidWord
   210  	}
   211  	encoding, text, _ := strings.Cut(text, "?")
   212  	if len(encoding) != 1 {
   213  		return "", errInvalidWord
   214  	}
   215  
   216  	content, err := decode(encoding[0], text)
   217  	if err != nil {
   218  		return "", err
   219  	}
   220  
   221  	var buf strings.Builder
   222  	if err := d.convert(&buf, charset, content); err != nil {
   223  		return "", err
   224  	}
   225  	return buf.String(), nil
   226  }
   227  
   228  // DecodeHeader decodes all encoded-words of the given string. It returns an
   229  // error if and only if CharsetReader of d returns an error.
   230  func (d *WordDecoder) DecodeHeader(header string) (string, error) {
   231  	// If there is no encoded-word, returns before creating a buffer.
   232  	i := strings.Index(header, "=?")
   233  	if i == -1 {
   234  		return header, nil
   235  	}
   236  
   237  	var buf strings.Builder
   238  
   239  	buf.WriteString(header[:i])
   240  	header = header[i:]
   241  
   242  	betweenWords := false
   243  	for {
   244  		start := strings.Index(header, "=?")
   245  		if start == -1 {
   246  			break
   247  		}
   248  		cur := start + len("=?")
   249  
   250  		i := strings.Index(header[cur:], "?")
   251  		if i == -1 {
   252  			break
   253  		}
   254  		charset := header[cur : cur+i]
   255  		cur += i + len("?")
   256  
   257  		if len(header) < cur+len("Q??=") {
   258  			break
   259  		}
   260  		encoding := header[cur]
   261  		cur++
   262  
   263  		if header[cur] != '?' {
   264  			break
   265  		}
   266  		cur++
   267  
   268  		j := strings.Index(header[cur:], "?=")
   269  		if j == -1 {
   270  			break
   271  		}
   272  		text := header[cur : cur+j]
   273  		end := cur + j + len("?=")
   274  
   275  		content, err := decode(encoding, text)
   276  		if err != nil {
   277  			betweenWords = false
   278  			buf.WriteString(header[:start+2])
   279  			header = header[start+2:]
   280  			continue
   281  		}
   282  
   283  		// Write characters before the encoded-word. White-space and newline
   284  		// characters separating two encoded-words must be deleted.
   285  		if start > 0 && (!betweenWords || hasNonWhitespace(header[:start])) {
   286  			buf.WriteString(header[:start])
   287  		}
   288  
   289  		if err := d.convert(&buf, charset, content); err != nil {
   290  			return "", err
   291  		}
   292  
   293  		header = header[end:]
   294  		betweenWords = true
   295  	}
   296  
   297  	if len(header) > 0 {
   298  		buf.WriteString(header)
   299  	}
   300  
   301  	return buf.String(), nil
   302  }
   303  
   304  func decode(encoding byte, text string) ([]byte, error) {
   305  	switch encoding {
   306  	case 'B', 'b':
   307  		return base64.StdEncoding.DecodeString(text)
   308  	case 'Q', 'q':
   309  		return qDecode(text)
   310  	default:
   311  		return nil, errInvalidWord
   312  	}
   313  }
   314  
   315  func (d *WordDecoder) convert(buf *strings.Builder, charset string, content []byte) error {
   316  	switch {
   317  	case strings.EqualFold("utf-8", charset):
   318  		buf.Write(content)
   319  	case strings.EqualFold("iso-8859-1", charset):
   320  		for _, c := range content {
   321  			buf.WriteRune(rune(c))
   322  		}
   323  	case strings.EqualFold("us-ascii", charset):
   324  		for _, c := range content {
   325  			if c >= utf8.RuneSelf {
   326  				buf.WriteRune(unicode.ReplacementChar)
   327  			} else {
   328  				buf.WriteByte(c)
   329  			}
   330  		}
   331  	default:
   332  		if d.CharsetReader == nil {
   333  			return fmt.Errorf("mime: unhandled charset %q", charset)
   334  		}
   335  		r, err := d.CharsetReader(strings.ToLower(charset), bytes.NewReader(content))
   336  		if err != nil {
   337  			return err
   338  		}
   339  		if _, err = io.Copy(buf, r); err != nil {
   340  			return err
   341  		}
   342  	}
   343  	return nil
   344  }
   345  
   346  // hasNonWhitespace reports whether s (assumed to be ASCII) contains at least
   347  // one byte of non-whitespace.
   348  func hasNonWhitespace(s string) bool {
   349  	for _, b := range s {
   350  		switch b {
   351  		// Encoded-words can only be separated by linear white spaces which does
   352  		// not include vertical tabs (\v).
   353  		case ' ', '\t', '\n', '\r':
   354  		default:
   355  			return true
   356  		}
   357  	}
   358  	return false
   359  }
   360  
   361  // qDecode decodes a Q encoded string.
   362  func qDecode(s string) ([]byte, error) {
   363  	dec := make([]byte, len(s))
   364  	n := 0
   365  	for i := 0; i < len(s); i++ {
   366  		switch c := s[i]; {
   367  		case c == '_':
   368  			dec[n] = ' '
   369  		case c == '=':
   370  			if i+2 >= len(s) {
   371  				return nil, errInvalidWord
   372  			}
   373  			b, err := readHexByte(s[i+1], s[i+2])
   374  			if err != nil {
   375  				return nil, err
   376  			}
   377  			dec[n] = b
   378  			i += 2
   379  		case (c <= '~' && c >= ' ') || c == '\n' || c == '\r' || c == '\t':
   380  			dec[n] = c
   381  		default:
   382  			return nil, errInvalidWord
   383  		}
   384  		n++
   385  	}
   386  
   387  	return dec[:n], nil
   388  }
   389  
   390  // readHexByte returns the byte from its quoted-printable representation.
   391  func readHexByte(a, b byte) (byte, error) {
   392  	var hb, lb byte
   393  	var err error
   394  	if hb, err = fromHex(a); err != nil {
   395  		return 0, err
   396  	}
   397  	if lb, err = fromHex(b); err != nil {
   398  		return 0, err
   399  	}
   400  	return hb<<4 | lb, nil
   401  }
   402  
   403  func fromHex(b byte) (byte, error) {
   404  	switch {
   405  	case b >= '0' && b <= '9':
   406  		return b - '0', nil
   407  	case b >= 'A' && b <= 'F':
   408  		return b - 'A' + 10, nil
   409  	// Accept badly encoded bytes.
   410  	case b >= 'a' && b <= 'f':
   411  		return b - 'a' + 10, nil
   412  	}
   413  	return 0, fmt.Errorf("mime: invalid hex byte %#02x", b)
   414  }
   415
View as plain text