...
Run Format

Source file src/compress/bzip2/bzip2.go

Documentation: compress/bzip2

  // Copyright 2011 The Go Authors. All rights reserved.
  // Use of this source code is governed by a BSD-style
  // license that can be found in the LICENSE file.
  
  // Package bzip2 implements bzip2 decompression.
  package bzip2
  
  import "io"
  
  // There's no RFC for bzip2. I used the Wikipedia page for reference and a lot
  // of guessing: http://en.wikipedia.org/wiki/Bzip2
  // The source code to pyflate was useful for debugging:
  // http://www.paul.sladen.org/projects/pyflate
  
  // A StructuralError is returned when the bzip2 data is found to be
  // syntactically invalid.
  type StructuralError string
  
  func (s StructuralError) Error() string {
  	return "bzip2 data invalid: " + string(s)
  }
  
  // A reader decompresses bzip2 compressed data.
  type reader struct {
  	br           bitReader
  	fileCRC      uint32
  	blockCRC     uint32
  	wantBlockCRC uint32
  	setupDone    bool // true if we have parsed the bzip2 header.
  	blockSize    int  // blockSize in bytes, i.e. 900 * 1000.
  	eof          bool
  	c            [256]uint // the `C' array for the inverse BWT.
  	tt           []uint32  // mirrors the `tt' array in the bzip2 source and contains the P array in the upper 24 bits.
  	tPos         uint32    // Index of the next output byte in tt.
  
  	preRLE      []uint32 // contains the RLE data still to be processed.
  	preRLEUsed  int      // number of entries of preRLE used.
  	lastByte    int      // the last byte value seen.
  	byteRepeats uint     // the number of repeats of lastByte seen.
  	repeats     uint     // the number of copies of lastByte to output.
  }
  
  // NewReader returns an io.Reader which decompresses bzip2 data from r.
  // If r does not also implement io.ByteReader,
  // the decompressor may read more data than necessary from r.
  func NewReader(r io.Reader) io.Reader {
  	bz2 := new(reader)
  	bz2.br = newBitReader(r)
  	return bz2
  }
  
  const bzip2FileMagic = 0x425a // "BZ"
  const bzip2BlockMagic = 0x314159265359
  const bzip2FinalMagic = 0x177245385090
  
  // setup parses the bzip2 header.
  func (bz2 *reader) setup(needMagic bool) error {
  	br := &bz2.br
  
  	if needMagic {
  		magic := br.ReadBits(16)
  		if magic != bzip2FileMagic {
  			return StructuralError("bad magic value")
  		}
  	}
  
  	t := br.ReadBits(8)
  	if t != 'h' {
  		return StructuralError("non-Huffman entropy encoding")
  	}
  
  	level := br.ReadBits(8)
  	if level < '1' || level > '9' {
  		return StructuralError("invalid compression level")
  	}
  
  	bz2.fileCRC = 0
  	bz2.blockSize = 100 * 1000 * (level - '0')
  	if bz2.blockSize > len(bz2.tt) {
  		bz2.tt = make([]uint32, bz2.blockSize)
  	}
  	return nil
  }
  
  func (bz2 *reader) Read(buf []byte) (n int, err error) {
  	if bz2.eof {
  		return 0, io.EOF
  	}
  
  	if !bz2.setupDone {
  		err = bz2.setup(true)
  		brErr := bz2.br.Err()
  		if brErr != nil {
  			err = brErr
  		}
  		if err != nil {
  			return 0, err
  		}
  		bz2.setupDone = true
  	}
  
  	n, err = bz2.read(buf)
  	brErr := bz2.br.Err()
  	if brErr != nil {
  		err = brErr
  	}
  	return
  }
  
  func (bz2 *reader) readFromBlock(buf []byte) int {
  	// bzip2 is a block based compressor, except that it has a run-length
  	// preprocessing step. The block based nature means that we can
  	// preallocate fixed-size buffers and reuse them. However, the RLE
  	// preprocessing would require allocating huge buffers to store the
  	// maximum expansion. Thus we process blocks all at once, except for
  	// the RLE which we decompress as required.
  	n := 0
  	for (bz2.repeats > 0 || bz2.preRLEUsed < len(bz2.preRLE)) && n < len(buf) {
  		// We have RLE data pending.
  
  		// The run-length encoding works like this:
  		// Any sequence of four equal bytes is followed by a length
  		// byte which contains the number of repeats of that byte to
  		// include. (The number of repeats can be zero.) Because we are
  		// decompressing on-demand our state is kept in the reader
  		// object.
  
  		if bz2.repeats > 0 {
  			buf[n] = byte(bz2.lastByte)
  			n++
  			bz2.repeats--
  			if bz2.repeats == 0 {
  				bz2.lastByte = -1
  			}
  			continue
  		}
  
  		bz2.tPos = bz2.preRLE[bz2.tPos]
  		b := byte(bz2.tPos)
  		bz2.tPos >>= 8
  		bz2.preRLEUsed++
  
  		if bz2.byteRepeats == 3 {
  			bz2.repeats = uint(b)
  			bz2.byteRepeats = 0
  			continue
  		}
  
  		if bz2.lastByte == int(b) {
  			bz2.byteRepeats++
  		} else {
  			bz2.byteRepeats = 0
  		}
  		bz2.lastByte = int(b)
  
  		buf[n] = b
  		n++
  	}
  
  	return n
  }
  
  func (bz2 *reader) read(buf []byte) (int, error) {
  	for {
  		n := bz2.readFromBlock(buf)
  		if n > 0 {
  			bz2.blockCRC = updateCRC(bz2.blockCRC, buf[:n])
  			return n, nil
  		}
  
  		// End of block. Check CRC.
  		if bz2.blockCRC != bz2.wantBlockCRC {
  			bz2.br.err = StructuralError("block checksum mismatch")
  			return 0, bz2.br.err
  		}
  
  		// Find next block.
  		br := &bz2.br
  		switch br.ReadBits64(48) {
  		default:
  			return 0, StructuralError("bad magic value found")
  
  		case bzip2BlockMagic:
  			// Start of block.
  			err := bz2.readBlock()
  			if err != nil {
  				return 0, err
  			}
  
  		case bzip2FinalMagic:
  			// Check end-of-file CRC.
  			wantFileCRC := uint32(br.ReadBits64(32))
  			if br.err != nil {
  				return 0, br.err
  			}
  			if bz2.fileCRC != wantFileCRC {
  				br.err = StructuralError("file checksum mismatch")
  				return 0, br.err
  			}
  
  			// Skip ahead to byte boundary.
  			// Is there a file concatenated to this one?
  			// It would start with BZ.
  			if br.bits%8 != 0 {
  				br.ReadBits(br.bits % 8)
  			}
  			b, err := br.r.ReadByte()
  			if err == io.EOF {
  				br.err = io.EOF
  				bz2.eof = true
  				return 0, io.EOF
  			}
  			if err != nil {
  				br.err = err
  				return 0, err
  			}
  			z, err := br.r.ReadByte()
  			if err != nil {
  				if err == io.EOF {
  					err = io.ErrUnexpectedEOF
  				}
  				br.err = err
  				return 0, err
  			}
  			if b != 'B' || z != 'Z' {
  				return 0, StructuralError("bad magic value in continuation file")
  			}
  			if err := bz2.setup(false); err != nil {
  				return 0, err
  			}
  		}
  	}
  }
  
  // readBlock reads a bzip2 block. The magic number should already have been consumed.
  func (bz2 *reader) readBlock() (err error) {
  	br := &bz2.br
  	bz2.wantBlockCRC = uint32(br.ReadBits64(32)) // skip checksum. TODO: check it if we can figure out what it is.
  	bz2.blockCRC = 0
  	bz2.fileCRC = (bz2.fileCRC<<1 | bz2.fileCRC>>31) ^ bz2.wantBlockCRC
  	randomized := br.ReadBits(1)
  	if randomized != 0 {
  		return StructuralError("deprecated randomized files")
  	}
  	origPtr := uint(br.ReadBits(24))
  
  	// If not every byte value is used in the block (i.e., it's text) then
  	// the symbol set is reduced. The symbols used are stored as a
  	// two-level, 16x16 bitmap.
  	symbolRangeUsedBitmap := br.ReadBits(16)
  	symbolPresent := make([]bool, 256)
  	numSymbols := 0
  	for symRange := uint(0); symRange < 16; symRange++ {
  		if symbolRangeUsedBitmap&(1<<(15-symRange)) != 0 {
  			bits := br.ReadBits(16)
  			for symbol := uint(0); symbol < 16; symbol++ {
  				if bits&(1<<(15-symbol)) != 0 {
  					symbolPresent[16*symRange+symbol] = true
  					numSymbols++
  				}
  			}
  		}
  	}
  
  	if numSymbols == 0 {
  		// There must be an EOF symbol.
  		return StructuralError("no symbols in input")
  	}
  
  	// A block uses between two and six different Huffman trees.
  	numHuffmanTrees := br.ReadBits(3)
  	if numHuffmanTrees < 2 || numHuffmanTrees > 6 {
  		return StructuralError("invalid number of Huffman trees")
  	}
  
  	// The Huffman tree can switch every 50 symbols so there's a list of
  	// tree indexes telling us which tree to use for each 50 symbol block.
  	numSelectors := br.ReadBits(15)
  	treeIndexes := make([]uint8, numSelectors)
  
  	// The tree indexes are move-to-front transformed and stored as unary
  	// numbers.
  	mtfTreeDecoder := newMTFDecoderWithRange(numHuffmanTrees)
  	for i := range treeIndexes {
  		c := 0
  		for {
  			inc := br.ReadBits(1)
  			if inc == 0 {
  				break
  			}
  			c++
  		}
  		if c >= numHuffmanTrees {
  			return StructuralError("tree index too large")
  		}
  		treeIndexes[i] = mtfTreeDecoder.Decode(c)
  	}
  
  	// The list of symbols for the move-to-front transform is taken from
  	// the previously decoded symbol bitmap.
  	symbols := make([]byte, numSymbols)
  	nextSymbol := 0
  	for i := 0; i < 256; i++ {
  		if symbolPresent[i] {
  			symbols[nextSymbol] = byte(i)
  			nextSymbol++
  		}
  	}
  	mtf := newMTFDecoder(symbols)
  
  	numSymbols += 2 // to account for RUNA and RUNB symbols
  	huffmanTrees := make([]huffmanTree, numHuffmanTrees)
  
  	// Now we decode the arrays of code-lengths for each tree.
  	lengths := make([]uint8, numSymbols)
  	for i := range huffmanTrees {
  		// The code lengths are delta encoded from a 5-bit base value.
  		length := br.ReadBits(5)
  		for j := range lengths {
  			for {
  				if length < 1 || length > 20 {
  					return StructuralError("Huffman length out of range")
  				}
  				if !br.ReadBit() {
  					break
  				}
  				if br.ReadBit() {
  					length--
  				} else {
  					length++
  				}
  			}
  			lengths[j] = uint8(length)
  		}
  		huffmanTrees[i], err = newHuffmanTree(lengths)
  		if err != nil {
  			return err
  		}
  	}
  
  	selectorIndex := 1 // the next tree index to use
  	if len(treeIndexes) == 0 {
  		return StructuralError("no tree selectors given")
  	}
  	if int(treeIndexes[0]) >= len(huffmanTrees) {
  		return StructuralError("tree selector out of range")
  	}
  	currentHuffmanTree := huffmanTrees[treeIndexes[0]]
  	bufIndex := 0 // indexes bz2.buf, the output buffer.
  	// The output of the move-to-front transform is run-length encoded and
  	// we merge the decoding into the Huffman parsing loop. These two
  	// variables accumulate the repeat count. See the Wikipedia page for
  	// details.
  	repeat := 0
  	repeatPower := 0
  
  	// The `C' array (used by the inverse BWT) needs to be zero initialized.
  	for i := range bz2.c {
  		bz2.c[i] = 0
  	}
  
  	decoded := 0 // counts the number of symbols decoded by the current tree.
  	for {
  		if decoded == 50 {
  			if selectorIndex >= numSelectors {
  				return StructuralError("insufficient selector indices for number of symbols")
  			}
  			if int(treeIndexes[selectorIndex]) >= len(huffmanTrees) {
  				return StructuralError("tree selector out of range")
  			}
  			currentHuffmanTree = huffmanTrees[treeIndexes[selectorIndex]]
  			selectorIndex++
  			decoded = 0
  		}
  
  		v := currentHuffmanTree.Decode(br)
  		decoded++
  
  		if v < 2 {
  			// This is either the RUNA or RUNB symbol.
  			if repeat == 0 {
  				repeatPower = 1
  			}
  			repeat += repeatPower << v
  			repeatPower <<= 1
  
  			// This limit of 2 million comes from the bzip2 source
  			// code. It prevents repeat from overflowing.
  			if repeat > 2*1024*1024 {
  				return StructuralError("repeat count too large")
  			}
  			continue
  		}
  
  		if repeat > 0 {
  			// We have decoded a complete run-length so we need to
  			// replicate the last output symbol.
  			if repeat > bz2.blockSize-bufIndex {
  				return StructuralError("repeats past end of block")
  			}
  			for i := 0; i < repeat; i++ {
  				b := mtf.First()
  				bz2.tt[bufIndex] = uint32(b)
  				bz2.c[b]++
  				bufIndex++
  			}
  			repeat = 0
  		}
  
  		if int(v) == numSymbols-1 {
  			// This is the EOF symbol. Because it's always at the
  			// end of the move-to-front list, and never gets moved
  			// to the front, it has this unique value.
  			break
  		}
  
  		// Since two metasymbols (RUNA and RUNB) have values 0 and 1,
  		// one would expect |v-2| to be passed to the MTF decoder.
  		// However, the front of the MTF list is never referenced as 0,
  		// it's always referenced with a run-length of 1. Thus 0
  		// doesn't need to be encoded and we have |v-1| in the next
  		// line.
  		b := mtf.Decode(int(v - 1))
  		if bufIndex >= bz2.blockSize {
  			return StructuralError("data exceeds block size")
  		}
  		bz2.tt[bufIndex] = uint32(b)
  		bz2.c[b]++
  		bufIndex++
  	}
  
  	if origPtr >= uint(bufIndex) {
  		return StructuralError("origPtr out of bounds")
  	}
  
  	// We have completed the entropy decoding. Now we can perform the
  	// inverse BWT and setup the RLE buffer.
  	bz2.preRLE = bz2.tt[:bufIndex]
  	bz2.preRLEUsed = 0
  	bz2.tPos = inverseBWT(bz2.preRLE, origPtr, bz2.c[:])
  	bz2.lastByte = -1
  	bz2.byteRepeats = 0
  	bz2.repeats = 0
  
  	return nil
  }
  
  // inverseBWT implements the inverse Burrows-Wheeler transform as described in
  // http://www.hpl.hp.com/techreports/Compaq-DEC/SRC-RR-124.pdf, section 4.2.
  // In that document, origPtr is called `I' and c is the `C' array after the
  // first pass over the data. It's an argument here because we merge the first
  // pass with the Huffman decoding.
  //
  // This also implements the `single array' method from the bzip2 source code
  // which leaves the output, still shuffled, in the bottom 8 bits of tt with the
  // index of the next byte in the top 24-bits. The index of the first byte is
  // returned.
  func inverseBWT(tt []uint32, origPtr uint, c []uint) uint32 {
  	sum := uint(0)
  	for i := 0; i < 256; i++ {
  		sum += c[i]
  		c[i] = sum - c[i]
  	}
  
  	for i := range tt {
  		b := tt[i] & 0xff
  		tt[c[b]] |= uint32(i) << 8
  		c[b]++
  	}
  
  	return tt[origPtr] >> 8
  }
  
  // This is a standard CRC32 like in hash/crc32 except that all the shifts are reversed,
  // causing the bits in the input to be processed in the reverse of the usual order.
  
  var crctab [256]uint32
  
  func init() {
  	const poly = 0x04C11DB7
  	for i := range crctab {
  		crc := uint32(i) << 24
  		for j := 0; j < 8; j++ {
  			if crc&0x80000000 != 0 {
  				crc = (crc << 1) ^ poly
  			} else {
  				crc <<= 1
  			}
  		}
  		crctab[i] = crc
  	}
  }
  
  // updateCRC updates the crc value to incorporate the data in b.
  // The initial value is 0.
  func updateCRC(val uint32, b []byte) uint32 {
  	crc := ^val
  	for _, v := range b {
  		crc = crctab[byte(crc>>24)^v] ^ (crc << 8)
  	}
  	return ^crc
  }
  

View as plain text