The Go Programming Language

Source file src/pkg/compress/flate/huffman_bit_writer.go

     1	// Copyright 2009 The Go Authors. All rights reserved.
     2	// Use of this source code is governed by a BSD-style
     3	// license that can be found in the LICENSE file.
     4	
     5	package flate
     6	
     7	import (
     8		"io"
     9		"math"
    10		"os"
    11		"strconv"
    12	)
    13	
    14	const (
    15		// The largest offset code.
    16		offsetCodeCount = 30
    17	
    18		// The special code used to mark the end of a block.
    19		endBlockMarker = 256
    20	
    21		// The first length code.
    22		lengthCodesStart = 257
    23	
    24		// The number of codegen codes.
    25		codegenCodeCount = 19
    26		badCode          = 255
    27	)
    28	
    29	// The number of extra bits needed by length code X - LENGTH_CODES_START.
    30	var lengthExtraBits = []int8{
    31		/* 257 */ 0, 0, 0,
    32		/* 260 */ 0, 0, 0, 0, 0, 1, 1, 1, 1, 2,
    33		/* 270 */ 2, 2, 2, 3, 3, 3, 3, 4, 4, 4,
    34		/* 280 */ 4, 5, 5, 5, 5, 0,
    35	}
    36	
    37	// The length indicated by length code X - LENGTH_CODES_START.
    38	var lengthBase = []uint32{
    39		0, 1, 2, 3, 4, 5, 6, 7, 8, 10,
    40		12, 14, 16, 20, 24, 28, 32, 40, 48, 56,
    41		64, 80, 96, 112, 128, 160, 192, 224, 255,
    42	}
    43	
    44	// offset code word extra bits.
    45	var offsetExtraBits = []int8{
    46		0, 0, 0, 0, 1, 1, 2, 2, 3, 3,
    47		4, 4, 5, 5, 6, 6, 7, 7, 8, 8,
    48		9, 9, 10, 10, 11, 11, 12, 12, 13, 13,
    49		/* extended window */
    50		14, 14, 15, 15, 16, 16, 17, 17, 18, 18, 19, 19, 20, 20,
    51	}
    52	
    53	var offsetBase = []uint32{
    54		/* normal deflate */
    55		0x000000, 0x000001, 0x000002, 0x000003, 0x000004,
    56		0x000006, 0x000008, 0x00000c, 0x000010, 0x000018,
    57		0x000020, 0x000030, 0x000040, 0x000060, 0x000080,
    58		0x0000c0, 0x000100, 0x000180, 0x000200, 0x000300,
    59		0x000400, 0x000600, 0x000800, 0x000c00, 0x001000,
    60		0x001800, 0x002000, 0x003000, 0x004000, 0x006000,
    61	
    62		/* extended window */
    63		0x008000, 0x00c000, 0x010000, 0x018000, 0x020000,
    64		0x030000, 0x040000, 0x060000, 0x080000, 0x0c0000,
    65		0x100000, 0x180000, 0x200000, 0x300000,
    66	}
    67	
    68	// The odd order in which the codegen code sizes are written.
    69	var codegenOrder = []uint32{16, 17, 18, 0, 8, 7, 9, 6, 10, 5, 11, 4, 12, 3, 13, 2, 14, 1, 15}
    70	
    71	type huffmanBitWriter struct {
    72		w io.Writer
    73		// Data waiting to be written is bytes[0:nbytes]
    74		// and then the low nbits of bits.
    75		bits            uint32
    76		nbits           uint32
    77		bytes           [64]byte
    78		nbytes          int
    79		literalFreq     []int32
    80		offsetFreq      []int32
    81		codegen         []uint8
    82		codegenFreq     []int32
    83		literalEncoding *huffmanEncoder
    84		offsetEncoding  *huffmanEncoder
    85		codegenEncoding *huffmanEncoder
    86		err             os.Error
    87	}
    88	
    89	type WrongValueError struct {
    90		name  string
    91		from  int32
    92		to    int32
    93		value int32
    94	}
    95	
    96	func newHuffmanBitWriter(w io.Writer) *huffmanBitWriter {
    97		return &huffmanBitWriter{
    98			w:               w,
    99			literalFreq:     make([]int32, maxLit),
   100			offsetFreq:      make([]int32, offsetCodeCount),
   101			codegen:         make([]uint8, maxLit+offsetCodeCount+1),
   102			codegenFreq:     make([]int32, codegenCodeCount),
   103			literalEncoding: newHuffmanEncoder(maxLit),
   104			offsetEncoding:  newHuffmanEncoder(offsetCodeCount),
   105			codegenEncoding: newHuffmanEncoder(codegenCodeCount),
   106		}
   107	}
   108	
   109	func (err WrongValueError) String() string {
   110		return "huffmanBitWriter: " + err.name + " should belong to [" + strconv.Itoa64(int64(err.from)) + ";" +
   111			strconv.Itoa64(int64(err.to)) + "] but actual value is " + strconv.Itoa64(int64(err.value))
   112	}
   113	
   114	func (w *huffmanBitWriter) flushBits() {
   115		if w.err != nil {
   116			w.nbits = 0
   117			return
   118		}
   119		bits := w.bits
   120		w.bits >>= 16
   121		w.nbits -= 16
   122		n := w.nbytes
   123		w.bytes[n] = byte(bits)
   124		w.bytes[n+1] = byte(bits >> 8)
   125		if n += 2; n >= len(w.bytes) {
   126			_, w.err = w.w.Write(w.bytes[0:])
   127			n = 0
   128		}
   129		w.nbytes = n
   130	}
   131	
   132	func (w *huffmanBitWriter) flush() {
   133		if w.err != nil {
   134			w.nbits = 0
   135			return
   136		}
   137		n := w.nbytes
   138		if w.nbits > 8 {
   139			w.bytes[n] = byte(w.bits)
   140			w.bits >>= 8
   141			w.nbits -= 8
   142			n++
   143		}
   144		if w.nbits > 0 {
   145			w.bytes[n] = byte(w.bits)
   146			w.nbits = 0
   147			n++
   148		}
   149		w.bits = 0
   150		_, w.err = w.w.Write(w.bytes[0:n])
   151		w.nbytes = 0
   152	}
   153	
   154	func (w *huffmanBitWriter) writeBits(b, nb int32) {
   155		w.bits |= uint32(b) << w.nbits
   156		if w.nbits += uint32(nb); w.nbits >= 16 {
   157			w.flushBits()
   158		}
   159	}
   160	
   161	func (w *huffmanBitWriter) writeBytes(bytes []byte) {
   162		if w.err != nil {
   163			return
   164		}
   165		n := w.nbytes
   166		if w.nbits == 8 {
   167			w.bytes[n] = byte(w.bits)
   168			w.nbits = 0
   169			n++
   170		}
   171		if w.nbits != 0 {
   172			w.err = InternalError("writeBytes with unfinished bits")
   173			return
   174		}
   175		if n != 0 {
   176			_, w.err = w.w.Write(w.bytes[0:n])
   177			if w.err != nil {
   178				return
   179			}
   180		}
   181		w.nbytes = 0
   182		_, w.err = w.w.Write(bytes)
   183	}
   184	
   185	// RFC 1951 3.2.7 specifies a special run-length encoding for specifying
   186	// the literal and offset lengths arrays (which are concatenated into a single
   187	// array).  This method generates that run-length encoding.
   188	//
   189	// The result is written into the codegen array, and the frequencies
   190	// of each code is written into the codegenFreq array.
   191	// Codes 0-15 are single byte codes. Codes 16-18 are followed by additional
   192	// information.  Code badCode is an end marker
   193	//
   194	//  numLiterals      The number of literals in literalEncoding
   195	//  numOffsets       The number of offsets in offsetEncoding
   196	func (w *huffmanBitWriter) generateCodegen(numLiterals int, numOffsets int) {
   197		fillInt32s(w.codegenFreq, 0)
   198		// Note that we are using codegen both as a temporary variable for holding
   199		// a copy of the frequencies, and as the place where we put the result.
   200		// This is fine because the output is always shorter than the input used
   201		// so far.
   202		codegen := w.codegen // cache
   203		// Copy the concatenated code sizes to codegen.  Put a marker at the end.
   204		copyUint8s(codegen[0:numLiterals], w.literalEncoding.codeBits)
   205		copyUint8s(codegen[numLiterals:numLiterals+numOffsets], w.offsetEncoding.codeBits)
   206		codegen[numLiterals+numOffsets] = badCode
   207	
   208		size := codegen[0]
   209		count := 1
   210		outIndex := 0
   211		for inIndex := 1; size != badCode; inIndex++ {
   212			// INVARIANT: We have seen "count" copies of size that have not yet
   213			// had output generated for them.
   214			nextSize := codegen[inIndex]
   215			if nextSize == size {
   216				count++
   217				continue
   218			}
   219			// We need to generate codegen indicating "count" of size.
   220			if size != 0 {
   221				codegen[outIndex] = size
   222				outIndex++
   223				w.codegenFreq[size]++
   224				count--
   225				for count >= 3 {
   226					n := min(count, 6)
   227					codegen[outIndex] = 16
   228					outIndex++
   229					codegen[outIndex] = uint8(n - 3)
   230					outIndex++
   231					w.codegenFreq[16]++
   232					count -= n
   233				}
   234			} else {
   235				for count >= 11 {
   236					n := min(count, 138)
   237					codegen[outIndex] = 18
   238					outIndex++
   239					codegen[outIndex] = uint8(n - 11)
   240					outIndex++
   241					w.codegenFreq[18]++
   242					count -= n
   243				}
   244				if count >= 3 {
   245					// count >= 3 && count <= 10
   246					codegen[outIndex] = 17
   247					outIndex++
   248					codegen[outIndex] = uint8(count - 3)
   249					outIndex++
   250					w.codegenFreq[17]++
   251					count = 0
   252				}
   253			}
   254			count--
   255			for ; count >= 0; count-- {
   256				codegen[outIndex] = size
   257				outIndex++
   258				w.codegenFreq[size]++
   259			}
   260			// Set up invariant for next time through the loop.
   261			size = nextSize
   262			count = 1
   263		}
   264		// Marker indicating the end of the codegen.
   265		codegen[outIndex] = badCode
   266	}
   267	
   268	func (w *huffmanBitWriter) writeCode(code *huffmanEncoder, literal uint32) {
   269		if w.err != nil {
   270			return
   271		}
   272		w.writeBits(int32(code.code[literal]), int32(code.codeBits[literal]))
   273	}
   274	
   275	// Write the header of a dynamic Huffman block to the output stream.
   276	//
   277	//  numLiterals  The number of literals specified in codegen
   278	//  numOffsets   The number of offsets specified in codegen
   279	//  numCodegens  The number of codegens used in codegen
   280	func (w *huffmanBitWriter) writeDynamicHeader(numLiterals int, numOffsets int, numCodegens int, isEof bool) {
   281		if w.err != nil {
   282			return
   283		}
   284		var firstBits int32 = 4
   285		if isEof {
   286			firstBits = 5
   287		}
   288		w.writeBits(firstBits, 3)
   289		w.writeBits(int32(numLiterals-257), 5)
   290		w.writeBits(int32(numOffsets-1), 5)
   291		w.writeBits(int32(numCodegens-4), 4)
   292	
   293		for i := 0; i < numCodegens; i++ {
   294			value := w.codegenEncoding.codeBits[codegenOrder[i]]
   295			w.writeBits(int32(value), 3)
   296		}
   297	
   298		i := 0
   299		for {
   300			var codeWord int = int(w.codegen[i])
   301			i++
   302			if codeWord == badCode {
   303				break
   304			}
   305			// The low byte contains the actual code to generate.
   306			w.writeCode(w.codegenEncoding, uint32(codeWord))
   307	
   308			switch codeWord {
   309			case 16:
   310				w.writeBits(int32(w.codegen[i]), 2)
   311				i++
   312				break
   313			case 17:
   314				w.writeBits(int32(w.codegen[i]), 3)
   315				i++
   316				break
   317			case 18:
   318				w.writeBits(int32(w.codegen[i]), 7)
   319				i++
   320				break
   321			}
   322		}
   323	}
   324	
   325	func (w *huffmanBitWriter) writeStoredHeader(length int, isEof bool) {
   326		if w.err != nil {
   327			return
   328		}
   329		var flag int32
   330		if isEof {
   331			flag = 1
   332		}
   333		w.writeBits(flag, 3)
   334		w.flush()
   335		w.writeBits(int32(length), 16)
   336		w.writeBits(int32(^uint16(length)), 16)
   337	}
   338	
   339	func (w *huffmanBitWriter) writeFixedHeader(isEof bool) {
   340		if w.err != nil {
   341			return
   342		}
   343		// Indicate that we are a fixed Huffman block
   344		var value int32 = 2
   345		if isEof {
   346			value = 3
   347		}
   348		w.writeBits(value, 3)
   349	}
   350	
   351	func (w *huffmanBitWriter) writeBlock(tokens []token, eof bool, input []byte) {
   352		if w.err != nil {
   353			return
   354		}
   355		fillInt32s(w.literalFreq, 0)
   356		fillInt32s(w.offsetFreq, 0)
   357	
   358		n := len(tokens)
   359		tokens = tokens[0 : n+1]
   360		tokens[n] = endBlockMarker
   361	
   362		for _, t := range tokens {
   363			switch t.typ() {
   364			case literalType:
   365				w.literalFreq[t.literal()]++
   366			case matchType:
   367				length := t.length()
   368				offset := t.offset()
   369				w.literalFreq[lengthCodesStart+lengthCode(length)]++
   370				w.offsetFreq[offsetCode(offset)]++
   371			}
   372		}
   373	
   374		// get the number of literals
   375		numLiterals := len(w.literalFreq)
   376		for w.literalFreq[numLiterals-1] == 0 {
   377			numLiterals--
   378		}
   379		// get the number of offsets
   380		numOffsets := len(w.offsetFreq)
   381		for numOffsets > 0 && w.offsetFreq[numOffsets-1] == 0 {
   382			numOffsets--
   383		}
   384		if numOffsets == 0 {
   385			// We haven't found a single match. If we want to go with the dynamic encoding,
   386			// we should count at least one offset to be sure that the offset huffman tree could be encoded.
   387			w.offsetFreq[0] = 1
   388			numOffsets = 1
   389		}
   390	
   391		w.literalEncoding.generate(w.literalFreq, 15)
   392		w.offsetEncoding.generate(w.offsetFreq, 15)
   393	
   394		storedBytes := 0
   395		if input != nil {
   396			storedBytes = len(input)
   397		}
   398		var extraBits int64
   399		var storedSize int64 = math.MaxInt64
   400		if storedBytes <= maxStoreBlockSize && input != nil {
   401			storedSize = int64((storedBytes + 5) * 8)
   402			// We only bother calculating the costs of the extra bits required by
   403			// the length of offset fields (which will be the same for both fixed
   404			// and dynamic encoding), if we need to compare those two encodings
   405			// against stored encoding.
   406			for lengthCode := lengthCodesStart + 8; lengthCode < numLiterals; lengthCode++ {
   407				// First eight length codes have extra size = 0.
   408				extraBits += int64(w.literalFreq[lengthCode]) * int64(lengthExtraBits[lengthCode-lengthCodesStart])
   409			}
   410			for offsetCode := 4; offsetCode < numOffsets; offsetCode++ {
   411				// First four offset codes have extra size = 0.
   412				extraBits += int64(w.offsetFreq[offsetCode]) * int64(offsetExtraBits[offsetCode])
   413			}
   414		}
   415	
   416		// Figure out smallest code.
   417		// Fixed Huffman baseline.
   418		var size = int64(3) +
   419			fixedLiteralEncoding.bitLength(w.literalFreq) +
   420			fixedOffsetEncoding.bitLength(w.offsetFreq) +
   421			extraBits
   422		var literalEncoding = fixedLiteralEncoding
   423		var offsetEncoding = fixedOffsetEncoding
   424	
   425		// Dynamic Huffman?
   426		var numCodegens int
   427	
   428		// Generate codegen and codegenFrequencies, which indicates how to encode
   429		// the literalEncoding and the offsetEncoding.
   430		w.generateCodegen(numLiterals, numOffsets)
   431		w.codegenEncoding.generate(w.codegenFreq, 7)
   432		numCodegens = len(w.codegenFreq)
   433		for numCodegens > 4 && w.codegenFreq[codegenOrder[numCodegens-1]] == 0 {
   434			numCodegens--
   435		}
   436		dynamicHeader := int64(3+5+5+4+(3*numCodegens)) +
   437			w.codegenEncoding.bitLength(w.codegenFreq) +
   438			int64(extraBits) +
   439			int64(w.codegenFreq[16]*2) +
   440			int64(w.codegenFreq[17]*3) +
   441			int64(w.codegenFreq[18]*7)
   442		dynamicSize := dynamicHeader +
   443			w.literalEncoding.bitLength(w.literalFreq) +
   444			w.offsetEncoding.bitLength(w.offsetFreq)
   445	
   446		if dynamicSize < size {
   447			size = dynamicSize
   448			literalEncoding = w.literalEncoding
   449			offsetEncoding = w.offsetEncoding
   450		}
   451	
   452		// Stored bytes?
   453		if storedSize < size {
   454			w.writeStoredHeader(storedBytes, eof)
   455			w.writeBytes(input[0:storedBytes])
   456			return
   457		}
   458	
   459		// Huffman.
   460		if literalEncoding == fixedLiteralEncoding {
   461			w.writeFixedHeader(eof)
   462		} else {
   463			w.writeDynamicHeader(numLiterals, numOffsets, numCodegens, eof)
   464		}
   465		for _, t := range tokens {
   466			switch t.typ() {
   467			case literalType:
   468				w.writeCode(literalEncoding, t.literal())
   469				break
   470			case matchType:
   471				// Write the length
   472				length := t.length()
   473				lengthCode := lengthCode(length)
   474				w.writeCode(literalEncoding, lengthCode+lengthCodesStart)
   475				extraLengthBits := int32(lengthExtraBits[lengthCode])
   476				if extraLengthBits > 0 {
   477					extraLength := int32(length - lengthBase[lengthCode])
   478					w.writeBits(extraLength, extraLengthBits)
   479				}
   480				// Write the offset
   481				offset := t.offset()
   482				offsetCode := offsetCode(offset)
   483				w.writeCode(offsetEncoding, offsetCode)
   484				extraOffsetBits := int32(offsetExtraBits[offsetCode])
   485				if extraOffsetBits > 0 {
   486					extraOffset := int32(offset - offsetBase[offsetCode])
   487					w.writeBits(extraOffset, extraOffsetBits)
   488				}
   489				break
   490			default:
   491				panic("unknown token type: " + string(t))
   492			}
   493		}
   494	}

release.r60.3. Except as noted, this content is licensed under a Creative Commons Attribution 3.0 License.