Source file src/net/http/sniff.go

     1  // Copyright 2011 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package http
     6  
     7  import (
     8  	"bytes"
     9  	"encoding/binary"
    10  )
    11  
// sniffLen is the maximum number of leading bytes that the sniffing
// algorithm uses to make its decision. DetectContentType truncates
// longer input to this length before trying any signature.
const sniffLen = 512
    14  
    15  // DetectContentType implements the algorithm described
    16  // at https://mimesniff.spec.whatwg.org/ to determine the
    17  // Content-Type of the given data. It considers at most the
    18  // first 512 bytes of data. DetectContentType always returns
    19  // a valid MIME type: if it cannot determine a more specific one, it
    20  // returns "application/octet-stream".
    21  func DetectContentType(data []byte) string {
    22  	if len(data) > sniffLen {
    23  		data = data[:sniffLen]
    24  	}
    25  
    26  	// Index of the first non-whitespace byte in data.
    27  	firstNonWS := 0
    28  	for ; firstNonWS < len(data) && isWS(data[firstNonWS]); firstNonWS++ {
    29  	}
    30  
    31  	for _, sig := range sniffSignatures {
    32  		if ct := sig.match(data, firstNonWS); ct != "" {
    33  			return ct
    34  		}
    35  	}
    36  
    37  	return "application/octet-stream" // fallback
    38  }
    39  
    40  // isWS reports whether the provided byte is a whitespace byte (0xWS)
    41  // as defined in https://mimesniff.spec.whatwg.org/#terminology.
    42  func isWS(b byte) bool {
    43  	switch b {
    44  	case '\t', '\n', '\x0c', '\r', ' ':
    45  		return true
    46  	}
    47  	return false
    48  }
    49  
    50  // isTT reports whether the provided byte is a tag-terminating byte (0xTT)
    51  // as defined in https://mimesniff.spec.whatwg.org/#terminology.
    52  func isTT(b byte) bool {
    53  	switch b {
    54  	case ' ', '>':
    55  		return true
    56  	}
    57  	return false
    58  }
    59  
// sniffSig is a single content-type signature. Every entry of
// sniffSignatures implements it, and DetectContentType calls match on
// each entry in turn.
type sniffSig interface {
	// match returns the MIME type of the data, or "" if unknown.
	// data is the input already truncated to at most sniffLen bytes, and
	// firstNonWS is the index of the first non-whitespace byte in data
	// (len(data) when data is empty or entirely whitespace).
	match(data []byte, firstNonWS int) string
}
    64  
// sniffSignatures holds the signatures tried by DetectContentType.
// Data matching the table in section 6.
//
// The entries are tried in order and the first one whose match method
// returns a non-empty string decides the result, so the ordering of this
// table is significant.
var sniffSignatures = []sniffSig{
	// HTML tags: htmlSig matches after leading whitespace, compares the
	// upper-case letters case-insensitively, and requires a
	// tag-terminating byte right after the signature.
	htmlSig("<!DOCTYPE HTML"),
	htmlSig("<HTML"),
	htmlSig("<HEAD"),
	htmlSig("<SCRIPT"),
	htmlSig("<IFRAME"),
	htmlSig("<H1"),
	htmlSig("<DIV"),
	htmlSig("<FONT"),
	htmlSig("<TABLE"),
	htmlSig("<A"),
	htmlSig("<STYLE"),
	htmlSig("<TITLE"),
	htmlSig("<B"),
	htmlSig("<BODY"),
	htmlSig("<BR"),
	htmlSig("<P"),
	htmlSig("<!--"),
	// XML declaration; skipWS makes the match start at the first
	// non-whitespace byte.
	&maskedSig{
		mask:   []byte("\xFF\xFF\xFF\xFF\xFF"),
		pat:    []byte("<?xml"),
		skipWS: true,
		ct:     "text/xml; charset=utf-8"},
	&exactSig{[]byte("%PDF-"), "application/pdf"},
	&exactSig{[]byte("%!PS-Adobe-"), "application/postscript"},

	// UTF BOMs.
	// The trailing \x00 mask bytes accept any value, but they still
	// require the data to be at least as long as the pattern.
	&maskedSig{
		mask: []byte("\xFF\xFF\x00\x00"),
		pat:  []byte("\xFE\xFF\x00\x00"),
		ct:   "text/plain; charset=utf-16be",
	},
	&maskedSig{
		mask: []byte("\xFF\xFF\x00\x00"),
		pat:  []byte("\xFF\xFE\x00\x00"),
		ct:   "text/plain; charset=utf-16le",
	},
	&maskedSig{
		mask: []byte("\xFF\xFF\xFF\x00"),
		pat:  []byte("\xEF\xBB\xBF\x00"),
		ct:   "text/plain; charset=utf-8",
	},

	// Image types
	// For posterity, we originally returned "image/vnd.microsoft.icon" from
	// https://tools.ietf.org/html/draft-ietf-websec-mime-sniff-03#section-7
	// https://codereview.appspot.com/4746042
	// but that has since been replaced with "image/x-icon" in Section 6.2
	// of https://mimesniff.spec.whatwg.org/#matching-an-image-type-pattern
	&exactSig{[]byte("\x00\x00\x01\x00"), "image/x-icon"},
	&exactSig{[]byte("\x00\x00\x02\x00"), "image/x-icon"},
	&exactSig{[]byte("BM"), "image/bmp"},
	&exactSig{[]byte("GIF87a"), "image/gif"},
	&exactSig{[]byte("GIF89a"), "image/gif"},
	// RIFF container: the four masked-out bytes after "RIFF" may hold
	// any value.
	&maskedSig{
		mask: []byte("\xFF\xFF\xFF\xFF\x00\x00\x00\x00\xFF\xFF\xFF\xFF\xFF\xFF"),
		pat:  []byte("RIFF\x00\x00\x00\x00WEBPVP"),
		ct:   "image/webp",
	},
	&exactSig{[]byte("\x89PNG\x0D\x0A\x1A\x0A"), "image/png"},
	&exactSig{[]byte("\xFF\xD8\xFF"), "image/jpeg"},

	// Audio and Video types
	// Enforce the pattern match ordering as prescribed in
	// https://mimesniff.spec.whatwg.org/#matching-an-audio-or-video-type-pattern
	&maskedSig{
		mask: []byte("\xFF\xFF\xFF\xFF\x00\x00\x00\x00\xFF\xFF\xFF\xFF"),
		pat:  []byte("FORM\x00\x00\x00\x00AIFF"),
		ct:   "audio/aiff",
	},
	&maskedSig{
		mask: []byte("\xFF\xFF\xFF"),
		pat:  []byte("ID3"),
		ct:   "audio/mpeg",
	},
	&maskedSig{
		mask: []byte("\xFF\xFF\xFF\xFF\xFF"),
		pat:  []byte("OggS\x00"),
		ct:   "application/ogg",
	},
	&maskedSig{
		mask: []byte("\xFF\xFF\xFF\xFF\xFF\xFF\xFF\xFF"),
		pat:  []byte("MThd\x00\x00\x00\x06"),
		ct:   "audio/midi",
	},
	&maskedSig{
		mask: []byte("\xFF\xFF\xFF\xFF\x00\x00\x00\x00\xFF\xFF\xFF\xFF"),
		pat:  []byte("RIFF\x00\x00\x00\x00AVI "),
		ct:   "video/avi",
	},
	&maskedSig{
		mask: []byte("\xFF\xFF\xFF\xFF\x00\x00\x00\x00\xFF\xFF\xFF\xFF"),
		pat:  []byte("RIFF\x00\x00\x00\x00WAVE"),
		ct:   "audio/wave",
	},
	// 6.2.0.2. video/mp4
	mp4Sig{},
	// 6.2.0.3. video/webm
	&exactSig{[]byte("\x1A\x45\xDF\xA3"), "video/webm"},

	// Font types
	&maskedSig{
		// 34 NULL bytes followed by the string "LP"
		pat: []byte("\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00LP"),
		// 34 NULL bytes followed by \xFF\xFF, so only the two bytes at
		// offsets 34 and 35 are actually compared.
		mask: []byte("\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\xFF\xFF"),
		ct:   "application/vnd.ms-fontobject",
	},
	&exactSig{[]byte("\x00\x01\x00\x00"), "font/ttf"},
	&exactSig{[]byte("OTTO"), "font/otf"},
	&exactSig{[]byte("ttcf"), "font/collection"},
	&exactSig{[]byte("wOFF"), "font/woff"},
	&exactSig{[]byte("wOF2"), "font/woff2"},

	// Archive types
	&exactSig{[]byte("\x1F\x8B\x08"), "application/x-gzip"},
	&exactSig{[]byte("PK\x03\x04"), "application/zip"},
	// RAR's signatures are incorrectly defined by the MIME spec as per
	//    https://github.com/whatwg/mimesniff/issues/63
	// However, RAR Labs correctly defines it at:
	//    https://www.rarlab.com/technote.htm#rarsign
	// so we use the definition from RAR Labs.
	// TODO: do whatever the spec ends up doing.
	&exactSig{[]byte("Rar!\x1A\x07\x00"), "application/x-rar-compressed"},     // RAR v1.5-v4.0
	&exactSig{[]byte("Rar!\x1A\x07\x01\x00"), "application/x-rar-compressed"}, // RAR v5+

	&exactSig{[]byte("\x00\x61\x73\x6D"), "application/wasm"},

	// textSig accepts any data that contains no binary control bytes
	// after the leading whitespace, so every more specific signature
	// has to be tried before it.
	textSig{}, // should be last
}
   196  
   197  type exactSig struct {
   198  	sig []byte
   199  	ct  string
   200  }
   201  
   202  func (e *exactSig) match(data []byte, firstNonWS int) string {
   203  	if bytes.HasPrefix(data, e.sig) {
   204  		return e.ct
   205  	}
   206  	return ""
   207  }
   208  
   209  type maskedSig struct {
   210  	mask, pat []byte
   211  	skipWS    bool
   212  	ct        string
   213  }
   214  
   215  func (m *maskedSig) match(data []byte, firstNonWS int) string {
   216  	// pattern matching algorithm section 6
   217  	// https://mimesniff.spec.whatwg.org/#pattern-matching-algorithm
   218  
   219  	if m.skipWS {
   220  		data = data[firstNonWS:]
   221  	}
   222  	if len(m.pat) != len(m.mask) {
   223  		return ""
   224  	}
   225  	if len(data) < len(m.pat) {
   226  		return ""
   227  	}
   228  	for i, pb := range m.pat {
   229  		maskedData := data[i] & m.mask[i]
   230  		if maskedData != pb {
   231  			return ""
   232  		}
   233  	}
   234  	return m.ct
   235  }
   236  
   237  type htmlSig []byte
   238  
   239  func (h htmlSig) match(data []byte, firstNonWS int) string {
   240  	data = data[firstNonWS:]
   241  	if len(data) < len(h)+1 {
   242  		return ""
   243  	}
   244  	for i, b := range h {
   245  		db := data[i]
   246  		if 'A' <= b && b <= 'Z' {
   247  			db &= 0xDF
   248  		}
   249  		if b != db {
   250  			return ""
   251  		}
   252  	}
   253  	// Next byte must be a tag-terminating byte(0xTT).
   254  	if !isTT(data[len(h)]) {
   255  		return ""
   256  	}
   257  	return "text/html; charset=utf-8"
   258  }
   259  
   260  var mp4ftype = []byte("ftyp")
   261  var mp4 = []byte("mp4")
   262  
   263  type mp4Sig struct{}
   264  
   265  func (mp4Sig) match(data []byte, firstNonWS int) string {
   266  	// https://mimesniff.spec.whatwg.org/#signature-for-mp4
   267  	// c.f. section 6.2.1
   268  	if len(data) < 12 {
   269  		return ""
   270  	}
   271  	boxSize := int(binary.BigEndian.Uint32(data[:4]))
   272  	if len(data) < boxSize || boxSize%4 != 0 {
   273  		return ""
   274  	}
   275  	if !bytes.Equal(data[4:8], mp4ftype) {
   276  		return ""
   277  	}
   278  	for st := 8; st < boxSize; st += 4 {
   279  		if st == 12 {
   280  			// Ignores the four bytes that correspond to the version number of the "major brand".
   281  			continue
   282  		}
   283  		if bytes.Equal(data[st:st+3], mp4) {
   284  			return "video/mp4"
   285  		}
   286  	}
   287  	return ""
   288  }
   289  
   290  type textSig struct{}
   291  
   292  func (textSig) match(data []byte, firstNonWS int) string {
   293  	// c.f. section 5, step 4.
   294  	for _, b := range data[firstNonWS:] {
   295  		switch {
   296  		case b <= 0x08,
   297  			b == 0x0B,
   298  			0x0E <= b && b <= 0x1A,
   299  			0x1C <= b && b <= 0x1F:
   300  			return ""
   301  		}
   302  	}
   303  	return "text/plain; charset=utf-8"
   304  }
   305  

View as plain text