1 // Copyright 2011 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 /* 6 Package mail implements parsing of mail messages. 7 8 For the most part, this package follows the syntax as specified by RFC 5322 and 9 extended by RFC 6532. 10 Notable divergences: 11 * Obsolete address formats are not parsed, including addresses with 12 embedded route information. 13 * The full range of spacing (the CFWS syntax element) is not supported, 14 such as breaking addresses across lines. 15 * No unicode normalization is performed. 16 * The special characters ()[]:;@\, are allowed to appear unquoted in names. 17 */ 18 package mail 19 20 import ( 21 "bufio" 22 "errors" 23 "fmt" 24 "io" 25 "log" 26 "mime" 27 "net/textproto" 28 "strings" 29 "sync" 30 "time" 31 "unicode/utf8" 32 ) 33 34 var debug = debugT(false) 35 36 type debugT bool 37 38 func (d debugT) Printf(format string, args ...interface{}) { 39 if d { 40 log.Printf(format, args...) 41 } 42 } 43 44 // A Message represents a parsed mail message. 45 type Message struct { 46 Header Header 47 Body io.Reader 48 } 49 50 // ReadMessage reads a message from r. 51 // The headers are parsed, and the body of the message will be available 52 // for reading from msg.Body. 53 func ReadMessage(r io.Reader) (msg *Message, err error) { 54 tp := textproto.NewReader(bufio.NewReader(r)) 55 56 hdr, err := tp.ReadMIMEHeader() 57 if err != nil { 58 return nil, err 59 } 60 61 return &Message{ 62 Header: Header(hdr), 63 Body: tp.R, 64 }, nil 65 } 66 67 // Layouts suitable for passing to time.Parse. 68 // These are tried in order. 69 var ( 70 dateLayoutsBuildOnce sync.Once 71 dateLayouts []string 72 ) 73 74 func buildDateLayouts() { 75 // Generate layouts based on RFC 5322, section 3.3. 
76 77 dows := [...]string{"", "Mon, "} // day-of-week 78 days := [...]string{"2", "02"} // day = 1*2DIGIT 79 years := [...]string{"2006", "06"} // year = 4*DIGIT / 2*DIGIT 80 seconds := [...]string{":05", ""} // second 81 // "-0700 (MST)" is not in RFC 5322, but is common. 82 zones := [...]string{"-0700", "MST", "-0700 (MST)"} // zone = (("+" / "-") 4DIGIT) / "GMT" / ... 83 84 for _, dow := range dows { 85 for _, day := range days { 86 for _, year := range years { 87 for _, second := range seconds { 88 for _, zone := range zones { 89 s := dow + day + " Jan " + year + " 15:04" + second + " " + zone 90 dateLayouts = append(dateLayouts, s) 91 } 92 } 93 } 94 } 95 } 96 } 97 98 // ParseDate parses an RFC 5322 date string. 99 func ParseDate(date string) (time.Time, error) { 100 dateLayoutsBuildOnce.Do(buildDateLayouts) 101 for _, layout := range dateLayouts { 102 t, err := time.Parse(layout, date) 103 if err == nil { 104 return t, nil 105 } 106 } 107 return time.Time{}, errors.New("mail: header could not be parsed") 108 } 109 110 // A Header represents the key-value pairs in a mail message header. 111 type Header map[string][]string 112 113 // Get gets the first value associated with the given key. 114 // It is case insensitive; CanonicalMIMEHeaderKey is used 115 // to canonicalize the provided key. 116 // If there are no values associated with the key, Get returns "". 117 // To access multiple values of a key, or to use non-canonical keys, 118 // access the map directly. 119 func (h Header) Get(key string) string { 120 return textproto.MIMEHeader(h).Get(key) 121 } 122 123 var ErrHeaderNotPresent = errors.New("mail: header not in message") 124 125 // Date parses the Date header field. 126 func (h Header) Date() (time.Time, error) { 127 hdr := h.Get("Date") 128 if hdr == "" { 129 return time.Time{}, ErrHeaderNotPresent 130 } 131 return ParseDate(hdr) 132 } 133 134 // AddressList parses the named header field as a list of addresses. 
135 func (h Header) AddressList(key string) ([]*Address, error) { 136 hdr := h.Get(key) 137 if hdr == "" { 138 return nil, ErrHeaderNotPresent 139 } 140 return ParseAddressList(hdr) 141 } 142 143 // Address represents a single mail address. 144 // An address such as "Barry Gibbs <bg@example.com>" is represented 145 // as Address{Name: "Barry Gibbs", Address: "bg@example.com"}. 146 type Address struct { 147 Name string // Proper name; may be empty. 148 Address string // user@domain 149 } 150 151 // ParseAddress parses a single RFC 5322 address, e.g. "Barry Gibbs <bg@example.com>" 152 func ParseAddress(address string) (*Address, error) { 153 return (&addrParser{s: address}).parseSingleAddress() 154 } 155 156 // ParseAddressList parses the given string as a list of addresses. 157 func ParseAddressList(list string) ([]*Address, error) { 158 return (&addrParser{s: list}).parseAddressList() 159 } 160 161 // An AddressParser is an RFC 5322 address parser. 162 type AddressParser struct { 163 // WordDecoder optionally specifies a decoder for RFC 2047 encoded-words. 164 WordDecoder *mime.WordDecoder 165 } 166 167 // Parse parses a single RFC 5322 address of the 168 // form "Gogh Fir <gf@example.com>" or "foo@example.com". 169 func (p *AddressParser) Parse(address string) (*Address, error) { 170 return (&addrParser{s: address, dec: p.WordDecoder}).parseSingleAddress() 171 } 172 173 // ParseList parses the given string as a list of comma-separated addresses 174 // of the form "Gogh Fir <gf@example.com>" or "foo@example.com". 175 func (p *AddressParser) ParseList(list string) ([]*Address, error) { 176 return (&addrParser{s: list, dec: p.WordDecoder}).parseAddressList() 177 } 178 179 // String formats the address as a valid RFC 5322 address. 180 // If the address's name contains non-ASCII characters 181 // the name will be rendered according to RFC 2047. 
182 func (a *Address) String() string { 183 // Format address local@domain 184 at := strings.LastIndex(a.Address, "@") 185 var local, domain string 186 if at < 0 { 187 // This is a malformed address ("@" is required in addr-spec); 188 // treat the whole address as local-part. 189 local = a.Address 190 } else { 191 local, domain = a.Address[:at], a.Address[at+1:] 192 } 193 194 // Add quotes if needed 195 quoteLocal := false 196 for i, r := range local { 197 if isAtext(r, false, false) { 198 continue 199 } 200 if r == '.' { 201 // Dots are okay if they are surrounded by atext. 202 // We only need to check that the previous byte is 203 // not a dot, and this isn't the end of the string. 204 if i > 0 && local[i-1] != '.' && i < len(local)-1 { 205 continue 206 } 207 } 208 quoteLocal = true 209 break 210 } 211 if quoteLocal { 212 local = quoteString(local) 213 214 } 215 216 s := "<" + local + "@" + domain + ">" 217 218 if a.Name == "" { 219 return s 220 } 221 222 // If every character is printable ASCII, quoting is simple. 223 allPrintable := true 224 for _, r := range a.Name { 225 // isWSP here should actually be isFWS, 226 // but we don't support folding yet. 227 if !isVchar(r) && !isWSP(r) || isMultibyte(r) { 228 allPrintable = false 229 break 230 } 231 } 232 if allPrintable { 233 return quoteString(a.Name) + " " + s 234 } 235 236 // Text in an encoded-word in a display-name must not contain certain 237 // characters like quotes or parentheses (see RFC 2047 section 5.3). 238 // When this is the case encode the name using base64 encoding. 
239 if strings.ContainsAny(a.Name, "\"#$%&'(),.:;<>@[]^`{|}~") { 240 return mime.BEncoding.Encode("utf-8", a.Name) + " " + s 241 } 242 return mime.QEncoding.Encode("utf-8", a.Name) + " " + s 243 } 244 245 type addrParser struct { 246 s string 247 dec *mime.WordDecoder // may be nil 248 } 249 250 func (p *addrParser) parseAddressList() ([]*Address, error) { 251 var list []*Address 252 for { 253 p.skipSpace() 254 addrs, err := p.parseAddress(true) 255 if err != nil { 256 return nil, err 257 } 258 list = append(list, addrs...) 259 260 if !p.skipCFWS() { 261 return nil, errors.New("mail: misformatted parenthetical comment") 262 } 263 if p.empty() { 264 break 265 } 266 if !p.consume(',') { 267 return nil, errors.New("mail: expected comma") 268 } 269 } 270 return list, nil 271 } 272 273 func (p *addrParser) parseSingleAddress() (*Address, error) { 274 addrs, err := p.parseAddress(true) 275 if err != nil { 276 return nil, err 277 } 278 if !p.skipCFWS() { 279 return nil, errors.New("mail: misformatted parenthetical comment") 280 } 281 if !p.empty() { 282 return nil, fmt.Errorf("mail: expected single address, got %q", p.s) 283 } 284 if len(addrs) == 0 { 285 return nil, errors.New("mail: empty group") 286 } 287 if len(addrs) > 1 { 288 return nil, errors.New("mail: group with multiple addresses") 289 } 290 return addrs[0], nil 291 } 292 293 // parseAddress parses a single RFC 5322 address at the start of p. 294 func (p *addrParser) parseAddress(handleGroup bool) ([]*Address, error) { 295 debug.Printf("parseAddress: %q", p.s) 296 p.skipSpace() 297 if p.empty() { 298 return nil, errors.New("mail: no address") 299 } 300 301 // address = mailbox / group 302 // mailbox = name-addr / addr-spec 303 // group = display-name ":" [group-list] ";" [CFWS] 304 305 // addr-spec has a more restricted grammar than name-addr, 306 // so try parsing it first, and fallback to name-addr. 307 // TODO(dsymonds): Is this really correct? 
308 spec, err := p.consumeAddrSpec() 309 if err == nil { 310 var displayName string 311 p.skipSpace() 312 if !p.empty() && p.peek() == '(' { 313 displayName, err = p.consumeDisplayNameComment() 314 if err != nil { 315 return nil, err 316 } 317 } 318 319 return []*Address{{ 320 Name: displayName, 321 Address: spec, 322 }}, err 323 } 324 debug.Printf("parseAddress: not an addr-spec: %v", err) 325 debug.Printf("parseAddress: state is now %q", p.s) 326 327 // display-name 328 var displayName string 329 if p.peek() != '<' { 330 displayName, err = p.consumePhrase() 331 if err != nil { 332 return nil, err 333 } 334 } 335 debug.Printf("parseAddress: displayName=%q", displayName) 336 337 p.skipSpace() 338 if handleGroup { 339 if p.consume(':') { 340 return p.consumeGroupList() 341 } 342 } 343 // angle-addr = "<" addr-spec ">" 344 if !p.consume('<') { 345 atext := true 346 for _, r := range displayName { 347 if !isAtext(r, true, false) { 348 atext = false 349 break 350 } 351 } 352 if atext { 353 // The input is like "foo.bar"; it's possible the input 354 // meant to be "foo.bar@domain", or "foo.bar <...>". 355 return nil, errors.New("mail: missing '@' or angle-addr") 356 } 357 // The input is like "Full Name", which couldn't possibly be a 358 // valid email address if followed by "@domain"; the input 359 // likely meant to be "Full Name <...>". 360 return nil, errors.New("mail: no angle-addr") 361 } 362 spec, err = p.consumeAddrSpec() 363 if err != nil { 364 return nil, err 365 } 366 if !p.consume('>') { 367 return nil, errors.New("mail: unclosed angle-addr") 368 } 369 debug.Printf("parseAddress: spec=%q", spec) 370 371 return []*Address{{ 372 Name: displayName, 373 Address: spec, 374 }}, nil 375 } 376 377 func (p *addrParser) consumeGroupList() ([]*Address, error) { 378 var group []*Address 379 // handle empty group. 380 p.skipSpace() 381 if p.consume(';') { 382 p.skipCFWS() 383 return group, nil 384 } 385 386 for { 387 p.skipSpace() 388 // embedded groups not allowed. 
389 addrs, err := p.parseAddress(false) 390 if err != nil { 391 return nil, err 392 } 393 group = append(group, addrs...) 394 395 if !p.skipCFWS() { 396 return nil, errors.New("mail: misformatted parenthetical comment") 397 } 398 if p.consume(';') { 399 p.skipCFWS() 400 break 401 } 402 if !p.consume(',') { 403 return nil, errors.New("mail: expected comma") 404 } 405 } 406 return group, nil 407 } 408 409 // consumeAddrSpec parses a single RFC 5322 addr-spec at the start of p. 410 func (p *addrParser) consumeAddrSpec() (spec string, err error) { 411 debug.Printf("consumeAddrSpec: %q", p.s) 412 413 orig := *p 414 defer func() { 415 if err != nil { 416 *p = orig 417 } 418 }() 419 420 // local-part = dot-atom / quoted-string 421 var localPart string 422 p.skipSpace() 423 if p.empty() { 424 return "", errors.New("mail: no addr-spec") 425 } 426 if p.peek() == '"' { 427 // quoted-string 428 debug.Printf("consumeAddrSpec: parsing quoted-string") 429 localPart, err = p.consumeQuotedString() 430 if localPart == "" { 431 err = errors.New("mail: empty quoted string in addr-spec") 432 } 433 } else { 434 // dot-atom 435 debug.Printf("consumeAddrSpec: parsing dot-atom") 436 localPart, err = p.consumeAtom(true, false) 437 } 438 if err != nil { 439 debug.Printf("consumeAddrSpec: failed: %v", err) 440 return "", err 441 } 442 443 if !p.consume('@') { 444 return "", errors.New("mail: missing @ in addr-spec") 445 } 446 447 // domain = dot-atom / domain-literal 448 var domain string 449 p.skipSpace() 450 if p.empty() { 451 return "", errors.New("mail: no domain in addr-spec") 452 } 453 // TODO(dsymonds): Handle domain-literal 454 domain, err = p.consumeAtom(true, false) 455 if err != nil { 456 return "", err 457 } 458 459 return localPart + "@" + domain, nil 460 } 461 462 // consumePhrase parses the RFC 5322 phrase at the start of p. 
463 func (p *addrParser) consumePhrase() (phrase string, err error) { 464 debug.Printf("consumePhrase: [%s]", p.s) 465 // phrase = 1*word 466 var words []string 467 var isPrevEncoded bool 468 for { 469 // word = atom / quoted-string 470 var word string 471 p.skipSpace() 472 if p.empty() { 473 break 474 } 475 isEncoded := false 476 if p.peek() == '"' { 477 // quoted-string 478 word, err = p.consumeQuotedString() 479 } else { 480 // atom 481 // We actually parse dot-atom here to be more permissive 482 // than what RFC 5322 specifies. 483 word, err = p.consumeAtom(true, true) 484 if err == nil { 485 word, isEncoded, err = p.decodeRFC2047Word(word) 486 } 487 } 488 489 if err != nil { 490 break 491 } 492 debug.Printf("consumePhrase: consumed %q", word) 493 if isPrevEncoded && isEncoded { 494 words[len(words)-1] += word 495 } else { 496 words = append(words, word) 497 } 498 isPrevEncoded = isEncoded 499 } 500 // Ignore any error if we got at least one word. 501 if err != nil && len(words) == 0 { 502 debug.Printf("consumePhrase: hit err: %v", err) 503 return "", fmt.Errorf("mail: missing word in phrase: %v", err) 504 } 505 phrase = strings.Join(words, " ") 506 return phrase, nil 507 } 508 509 // consumeQuotedString parses the quoted string at the start of p. 510 func (p *addrParser) consumeQuotedString() (qs string, err error) { 511 // Assume first byte is '"'. 
512 i := 1 513 qsb := make([]rune, 0, 10) 514 515 escaped := false 516 517 Loop: 518 for { 519 r, size := utf8.DecodeRuneInString(p.s[i:]) 520 521 switch { 522 case size == 0: 523 return "", errors.New("mail: unclosed quoted-string") 524 525 case size == 1 && r == utf8.RuneError: 526 return "", fmt.Errorf("mail: invalid utf-8 in quoted-string: %q", p.s) 527 528 case escaped: 529 // quoted-pair = ("\" (VCHAR / WSP)) 530 531 if !isVchar(r) && !isWSP(r) { 532 return "", fmt.Errorf("mail: bad character in quoted-string: %q", r) 533 } 534 535 qsb = append(qsb, r) 536 escaped = false 537 538 case isQtext(r) || isWSP(r): 539 // qtext (printable US-ASCII excluding " and \), or 540 // FWS (almost; we're ignoring CRLF) 541 qsb = append(qsb, r) 542 543 case r == '"': 544 break Loop 545 546 case r == '\\': 547 escaped = true 548 549 default: 550 return "", fmt.Errorf("mail: bad character in quoted-string: %q", r) 551 552 } 553 554 i += size 555 } 556 p.s = p.s[i+1:] 557 return string(qsb), nil 558 } 559 560 // consumeAtom parses an RFC 5322 atom at the start of p. 561 // If dot is true, consumeAtom parses an RFC 5322 dot-atom instead. 
562 // If permissive is true, consumeAtom will not fail on: 563 // - leading/trailing/double dots in the atom (see golang.org/issue/4938) 564 // - special characters (RFC 5322 3.2.3) except '<', '>', ':' and '"' (see golang.org/issue/21018) 565 func (p *addrParser) consumeAtom(dot bool, permissive bool) (atom string, err error) { 566 i := 0 567 568 Loop: 569 for { 570 r, size := utf8.DecodeRuneInString(p.s[i:]) 571 switch { 572 case size == 1 && r == utf8.RuneError: 573 return "", fmt.Errorf("mail: invalid utf-8 in address: %q", p.s) 574 575 case size == 0 || !isAtext(r, dot, permissive): 576 break Loop 577 578 default: 579 i += size 580 581 } 582 } 583 584 if i == 0 { 585 return "", errors.New("mail: invalid string") 586 } 587 atom, p.s = p.s[:i], p.s[i:] 588 if !permissive { 589 if strings.HasPrefix(atom, ".") { 590 return "", errors.New("mail: leading dot in atom") 591 } 592 if strings.Contains(atom, "..") { 593 return "", errors.New("mail: double dot in atom") 594 } 595 if strings.HasSuffix(atom, ".") { 596 return "", errors.New("mail: trailing dot in atom") 597 } 598 } 599 return atom, nil 600 } 601 602 func (p *addrParser) consumeDisplayNameComment() (string, error) { 603 if !p.consume('(') { 604 return "", errors.New("mail: comment does not start with (") 605 } 606 comment, ok := p.consumeComment() 607 if !ok { 608 return "", errors.New("mail: misformatted parenthetical comment") 609 } 610 611 // TODO(stapelberg): parse quoted-string within comment 612 words := strings.FieldsFunc(comment, func(r rune) bool { return r == ' ' || r == '\t' }) 613 for idx, word := range words { 614 decoded, isEncoded, err := p.decodeRFC2047Word(word) 615 if err != nil { 616 return "", err 617 } 618 if isEncoded { 619 words[idx] = decoded 620 } 621 } 622 623 return strings.Join(words, " "), nil 624 } 625 626 func (p *addrParser) consume(c byte) bool { 627 if p.empty() || p.peek() != c { 628 return false 629 } 630 p.s = p.s[1:] 631 return true 632 } 633 634 // skipSpace skips the 
leading space and tab characters. 635 func (p *addrParser) skipSpace() { 636 p.s = strings.TrimLeft(p.s, " \t") 637 } 638 639 func (p *addrParser) peek() byte { 640 return p.s[0] 641 } 642 643 func (p *addrParser) empty() bool { 644 return p.len() == 0 645 } 646 647 func (p *addrParser) len() int { 648 return len(p.s) 649 } 650 651 // skipCFWS skips CFWS as defined in RFC5322. 652 func (p *addrParser) skipCFWS() bool { 653 p.skipSpace() 654 655 for { 656 if !p.consume('(') { 657 break 658 } 659 660 if _, ok := p.consumeComment(); !ok { 661 return false 662 } 663 664 p.skipSpace() 665 } 666 667 return true 668 } 669 670 func (p *addrParser) consumeComment() (string, bool) { 671 // '(' already consumed. 672 depth := 1 673 674 var comment string 675 for { 676 if p.empty() || depth == 0 { 677 break 678 } 679 680 if p.peek() == '\\' && p.len() > 1 { 681 p.s = p.s[1:] 682 } else if p.peek() == '(' { 683 depth++ 684 } else if p.peek() == ')' { 685 depth-- 686 } 687 if depth > 0 { 688 comment += p.s[:1] 689 } 690 p.s = p.s[1:] 691 } 692 693 return comment, depth == 0 694 } 695 696 func (p *addrParser) decodeRFC2047Word(s string) (word string, isEncoded bool, err error) { 697 if p.dec != nil { 698 word, err = p.dec.Decode(s) 699 } else { 700 word, err = rfc2047Decoder.Decode(s) 701 } 702 703 if err == nil { 704 return word, true, nil 705 } 706 707 if _, ok := err.(charsetError); ok { 708 return s, true, err 709 } 710 711 // Ignore invalid RFC 2047 encoded-word errors. 712 return s, false, nil 713 } 714 715 var rfc2047Decoder = mime.WordDecoder{ 716 CharsetReader: func(charset string, input io.Reader) (io.Reader, error) { 717 return nil, charsetError(charset) 718 }, 719 } 720 721 type charsetError string 722 723 func (e charsetError) Error() string { 724 return fmt.Sprintf("charset not supported: %q", string(e)) 725 } 726 727 // isAtext reports whether r is an RFC 5322 atext character. 728 // If dot is true, period is included. 
// If permissive is true, RFC 5322 3.2.3 specials are included,
// except '<', '>', ':' and '"'.
func isAtext(r rune, dot, permissive bool) bool {
	switch r {
	case '.':
		return dot

	// RFC 5322 3.2.3. specials
	case '(', ')', '[', ']', ';', '@', '\\', ',':
		return permissive

	case '<', '>', '"', ':':
		return false
	}
	return isVchar(r)
}

// isQtext reports whether r is an RFC 5322 qtext character.
func isQtext(r rune) bool {
	// Printable US-ASCII, excluding backslash or quote.
	if r == '\\' || r == '"' {
		return false
	}
	return isVchar(r)
}

// quoteString renders a string as an RFC 5322 quoted-string.
// Backslash and double quote are escaped with a backslash. Characters that
// are neither VCHAR nor WSP (e.g. ASCII control characters) are dropped.
func quoteString(s string) string {
	var buf strings.Builder
	buf.WriteByte('"')
	for _, r := range s {
		if isQtext(r) || isWSP(r) {
			buf.WriteRune(r)
		} else if isVchar(r) {
			buf.WriteByte('\\')
			buf.WriteRune(r)
		}
	}
	buf.WriteByte('"')
	return buf.String()
}

// isVchar reports whether r is an RFC 5322 VCHAR character.
func isVchar(r rune) bool {
	// Visible (printing) characters.
	return '!' <= r && r <= '~' || isMultibyte(r)
}

// isMultibyte reports whether r is a multi-byte UTF-8 character
// as supported by RFC 6532.
func isMultibyte(r rune) bool {
	return r >= utf8.RuneSelf
}

// isWSP reports whether r is a WSP (white space).
// WSP is a space or horizontal tab (RFC 5234 Appendix B).
func isWSP(r rune) bool {
	return r == ' ' || r == '\t'
}
// View as plain text