|  | // Copyright 2011 The Go Authors. All rights reserved. | 
|  | // Use of this source code is governed by a BSD-style | 
|  | // license that can be found in the LICENSE file. | 
|  |  | 
|  | // Package bzip2 implements bzip2 decompression. | 
|  | package bzip2 | 
|  |  | 
|  | import "io" | 
|  |  | 
|  | // There's no RFC for bzip2. I used the Wikipedia page for reference and a lot | 
|  | // of guessing: https://en.wikipedia.org/wiki/Bzip2 | 
|  | // The source code to pyflate was useful for debugging: | 
|  | // http://www.paul.sladen.org/projects/pyflate | 
|  |  | 
|  | // A StructuralError is returned when the bzip2 data is found to be | 
|  | // syntactically invalid. | 
|  | type StructuralError string | 
|  |  | 
|  | func (s StructuralError) Error() string { | 
|  | return "bzip2 data invalid: " + string(s) | 
|  | } | 
|  |  | 
|  | // A reader decompresses bzip2 compressed data. | 
|  | type reader struct { | 
|  | br           bitReader | 
|  | fileCRC      uint32 | 
|  | blockCRC     uint32 | 
|  | wantBlockCRC uint32 | 
|  | setupDone    bool // true if we have parsed the bzip2 header. | 
|  | blockSize    int  // blockSize in bytes, i.e. 900 * 1000. | 
|  | eof          bool | 
|  | c            [256]uint // the ``C'' array for the inverse BWT. | 
|  | tt           []uint32  // mirrors the ``tt'' array in the bzip2 source and contains the P array in the upper 24 bits. | 
|  | tPos         uint32    // Index of the next output byte in tt. | 
|  |  | 
|  | preRLE      []uint32 // contains the RLE data still to be processed. | 
|  | preRLEUsed  int      // number of entries of preRLE used. | 
|  | lastByte    int      // the last byte value seen. | 
|  | byteRepeats uint     // the number of repeats of lastByte seen. | 
|  | repeats     uint     // the number of copies of lastByte to output. | 
|  | } | 
|  |  | 
|  | // NewReader returns an io.Reader which decompresses bzip2 data from r. | 
|  | // If r does not also implement io.ByteReader, | 
|  | // the decompressor may read more data than necessary from r. | 
|  | func NewReader(r io.Reader) io.Reader { | 
|  | bz2 := new(reader) | 
|  | bz2.br = newBitReader(r) | 
|  | return bz2 | 
|  | } | 
|  |  | 
|  | const bzip2FileMagic = 0x425a // "BZ" | 
|  | const bzip2BlockMagic = 0x314159265359 | 
|  | const bzip2FinalMagic = 0x177245385090 | 
|  |  | 
|  | // setup parses the bzip2 header. | 
|  | func (bz2 *reader) setup(needMagic bool) error { | 
|  | br := &bz2.br | 
|  |  | 
|  | if needMagic { | 
|  | magic := br.ReadBits(16) | 
|  | if magic != bzip2FileMagic { | 
|  | return StructuralError("bad magic value") | 
|  | } | 
|  | } | 
|  |  | 
|  | t := br.ReadBits(8) | 
|  | if t != 'h' { | 
|  | return StructuralError("non-Huffman entropy encoding") | 
|  | } | 
|  |  | 
|  | level := br.ReadBits(8) | 
|  | if level < '1' || level > '9' { | 
|  | return StructuralError("invalid compression level") | 
|  | } | 
|  |  | 
|  | bz2.fileCRC = 0 | 
|  | bz2.blockSize = 100 * 1000 * (level - '0') | 
|  | if bz2.blockSize > len(bz2.tt) { | 
|  | bz2.tt = make([]uint32, bz2.blockSize) | 
|  | } | 
|  | return nil | 
|  | } | 
|  |  | 
|  | func (bz2 *reader) Read(buf []byte) (n int, err error) { | 
|  | if bz2.eof { | 
|  | return 0, io.EOF | 
|  | } | 
|  |  | 
|  | if !bz2.setupDone { | 
|  | err = bz2.setup(true) | 
|  | brErr := bz2.br.Err() | 
|  | if brErr != nil { | 
|  | err = brErr | 
|  | } | 
|  | if err != nil { | 
|  | return 0, err | 
|  | } | 
|  | bz2.setupDone = true | 
|  | } | 
|  |  | 
|  | n, err = bz2.read(buf) | 
|  | brErr := bz2.br.Err() | 
|  | if brErr != nil { | 
|  | err = brErr | 
|  | } | 
|  | return | 
|  | } | 
|  |  | 
|  | func (bz2 *reader) readFromBlock(buf []byte) int { | 
|  | // bzip2 is a block based compressor, except that it has a run-length | 
|  | // preprocessing step. The block based nature means that we can | 
|  | // preallocate fixed-size buffers and reuse them. However, the RLE | 
|  | // preprocessing would require allocating huge buffers to store the | 
|  | // maximum expansion. Thus we process blocks all at once, except for | 
|  | // the RLE which we decompress as required. | 
|  | n := 0 | 
|  | for (bz2.repeats > 0 || bz2.preRLEUsed < len(bz2.preRLE)) && n < len(buf) { | 
|  | // We have RLE data pending. | 
|  |  | 
|  | // The run-length encoding works like this: | 
|  | // Any sequence of four equal bytes is followed by a length | 
|  | // byte which contains the number of repeats of that byte to | 
|  | // include. (The number of repeats can be zero.) Because we are | 
|  | // decompressing on-demand our state is kept in the reader | 
|  | // object. | 
|  |  | 
|  | if bz2.repeats > 0 { | 
|  | buf[n] = byte(bz2.lastByte) | 
|  | n++ | 
|  | bz2.repeats-- | 
|  | if bz2.repeats == 0 { | 
|  | bz2.lastByte = -1 | 
|  | } | 
|  | continue | 
|  | } | 
|  |  | 
|  | bz2.tPos = bz2.preRLE[bz2.tPos] | 
|  | b := byte(bz2.tPos) | 
|  | bz2.tPos >>= 8 | 
|  | bz2.preRLEUsed++ | 
|  |  | 
|  | if bz2.byteRepeats == 3 { | 
|  | bz2.repeats = uint(b) | 
|  | bz2.byteRepeats = 0 | 
|  | continue | 
|  | } | 
|  |  | 
|  | if bz2.lastByte == int(b) { | 
|  | bz2.byteRepeats++ | 
|  | } else { | 
|  | bz2.byteRepeats = 0 | 
|  | } | 
|  | bz2.lastByte = int(b) | 
|  |  | 
|  | buf[n] = b | 
|  | n++ | 
|  | } | 
|  |  | 
|  | return n | 
|  | } | 
|  |  | 
|  | func (bz2 *reader) read(buf []byte) (int, error) { | 
|  | for { | 
|  | n := bz2.readFromBlock(buf) | 
|  | if n > 0 || len(buf) == 0 { | 
|  | bz2.blockCRC = updateCRC(bz2.blockCRC, buf[:n]) | 
|  | return n, nil | 
|  | } | 
|  |  | 
|  | // End of block. Check CRC. | 
|  | if bz2.blockCRC != bz2.wantBlockCRC { | 
|  | bz2.br.err = StructuralError("block checksum mismatch") | 
|  | return 0, bz2.br.err | 
|  | } | 
|  |  | 
|  | // Find next block. | 
|  | br := &bz2.br | 
|  | switch br.ReadBits64(48) { | 
|  | default: | 
|  | return 0, StructuralError("bad magic value found") | 
|  |  | 
|  | case bzip2BlockMagic: | 
|  | // Start of block. | 
|  | err := bz2.readBlock() | 
|  | if err != nil { | 
|  | return 0, err | 
|  | } | 
|  |  | 
|  | case bzip2FinalMagic: | 
|  | // Check end-of-file CRC. | 
|  | wantFileCRC := uint32(br.ReadBits64(32)) | 
|  | if br.err != nil { | 
|  | return 0, br.err | 
|  | } | 
|  | if bz2.fileCRC != wantFileCRC { | 
|  | br.err = StructuralError("file checksum mismatch") | 
|  | return 0, br.err | 
|  | } | 
|  |  | 
|  | // Skip ahead to byte boundary. | 
|  | // Is there a file concatenated to this one? | 
|  | // It would start with BZ. | 
|  | if br.bits%8 != 0 { | 
|  | br.ReadBits(br.bits % 8) | 
|  | } | 
|  | b, err := br.r.ReadByte() | 
|  | if err == io.EOF { | 
|  | br.err = io.EOF | 
|  | bz2.eof = true | 
|  | return 0, io.EOF | 
|  | } | 
|  | if err != nil { | 
|  | br.err = err | 
|  | return 0, err | 
|  | } | 
|  | z, err := br.r.ReadByte() | 
|  | if err != nil { | 
|  | if err == io.EOF { | 
|  | err = io.ErrUnexpectedEOF | 
|  | } | 
|  | br.err = err | 
|  | return 0, err | 
|  | } | 
|  | if b != 'B' || z != 'Z' { | 
|  | return 0, StructuralError("bad magic value in continuation file") | 
|  | } | 
|  | if err := bz2.setup(false); err != nil { | 
|  | return 0, err | 
|  | } | 
|  | } | 
|  | } | 
|  | } | 
|  |  | 
|  | // readBlock reads a bzip2 block. The magic number should already have been consumed. | 
|  | func (bz2 *reader) readBlock() (err error) { | 
|  | br := &bz2.br | 
|  | bz2.wantBlockCRC = uint32(br.ReadBits64(32)) // skip checksum. TODO: check it if we can figure out what it is. | 
|  | bz2.blockCRC = 0 | 
|  | bz2.fileCRC = (bz2.fileCRC<<1 | bz2.fileCRC>>31) ^ bz2.wantBlockCRC | 
|  | randomized := br.ReadBits(1) | 
|  | if randomized != 0 { | 
|  | return StructuralError("deprecated randomized files") | 
|  | } | 
|  | origPtr := uint(br.ReadBits(24)) | 
|  |  | 
|  | // If not every byte value is used in the block (i.e., it's text) then | 
|  | // the symbol set is reduced. The symbols used are stored as a | 
|  | // two-level, 16x16 bitmap. | 
|  | symbolRangeUsedBitmap := br.ReadBits(16) | 
|  | symbolPresent := make([]bool, 256) | 
|  | numSymbols := 0 | 
|  | for symRange := uint(0); symRange < 16; symRange++ { | 
|  | if symbolRangeUsedBitmap&(1<<(15-symRange)) != 0 { | 
|  | bits := br.ReadBits(16) | 
|  | for symbol := uint(0); symbol < 16; symbol++ { | 
|  | if bits&(1<<(15-symbol)) != 0 { | 
|  | symbolPresent[16*symRange+symbol] = true | 
|  | numSymbols++ | 
|  | } | 
|  | } | 
|  | } | 
|  | } | 
|  |  | 
|  | if numSymbols == 0 { | 
|  | // There must be an EOF symbol. | 
|  | return StructuralError("no symbols in input") | 
|  | } | 
|  |  | 
|  | // A block uses between two and six different Huffman trees. | 
|  | numHuffmanTrees := br.ReadBits(3) | 
|  | if numHuffmanTrees < 2 || numHuffmanTrees > 6 { | 
|  | return StructuralError("invalid number of Huffman trees") | 
|  | } | 
|  |  | 
|  | // The Huffman tree can switch every 50 symbols so there's a list of | 
|  | // tree indexes telling us which tree to use for each 50 symbol block. | 
|  | numSelectors := br.ReadBits(15) | 
|  | treeIndexes := make([]uint8, numSelectors) | 
|  |  | 
|  | // The tree indexes are move-to-front transformed and stored as unary | 
|  | // numbers. | 
|  | mtfTreeDecoder := newMTFDecoderWithRange(numHuffmanTrees) | 
|  | for i := range treeIndexes { | 
|  | c := 0 | 
|  | for { | 
|  | inc := br.ReadBits(1) | 
|  | if inc == 0 { | 
|  | break | 
|  | } | 
|  | c++ | 
|  | } | 
|  | if c >= numHuffmanTrees { | 
|  | return StructuralError("tree index too large") | 
|  | } | 
|  | treeIndexes[i] = mtfTreeDecoder.Decode(c) | 
|  | } | 
|  |  | 
|  | // The list of symbols for the move-to-front transform is taken from | 
|  | // the previously decoded symbol bitmap. | 
|  | symbols := make([]byte, numSymbols) | 
|  | nextSymbol := 0 | 
|  | for i := 0; i < 256; i++ { | 
|  | if symbolPresent[i] { | 
|  | symbols[nextSymbol] = byte(i) | 
|  | nextSymbol++ | 
|  | } | 
|  | } | 
|  | mtf := newMTFDecoder(symbols) | 
|  |  | 
|  | numSymbols += 2 // to account for RUNA and RUNB symbols | 
|  | huffmanTrees := make([]huffmanTree, numHuffmanTrees) | 
|  |  | 
|  | // Now we decode the arrays of code-lengths for each tree. | 
|  | lengths := make([]uint8, numSymbols) | 
|  | for i := range huffmanTrees { | 
|  | // The code lengths are delta encoded from a 5-bit base value. | 
|  | length := br.ReadBits(5) | 
|  | for j := range lengths { | 
|  | for { | 
|  | if length < 1 || length > 20 { | 
|  | return StructuralError("Huffman length out of range") | 
|  | } | 
|  | if !br.ReadBit() { | 
|  | break | 
|  | } | 
|  | if br.ReadBit() { | 
|  | length-- | 
|  | } else { | 
|  | length++ | 
|  | } | 
|  | } | 
|  | lengths[j] = uint8(length) | 
|  | } | 
|  | huffmanTrees[i], err = newHuffmanTree(lengths) | 
|  | if err != nil { | 
|  | return err | 
|  | } | 
|  | } | 
|  |  | 
|  | selectorIndex := 1 // the next tree index to use | 
|  | if len(treeIndexes) == 0 { | 
|  | return StructuralError("no tree selectors given") | 
|  | } | 
|  | if int(treeIndexes[0]) >= len(huffmanTrees) { | 
|  | return StructuralError("tree selector out of range") | 
|  | } | 
|  | currentHuffmanTree := huffmanTrees[treeIndexes[0]] | 
|  | bufIndex := 0 // indexes bz2.buf, the output buffer. | 
|  | // The output of the move-to-front transform is run-length encoded and | 
|  | // we merge the decoding into the Huffman parsing loop. These two | 
|  | // variables accumulate the repeat count. See the Wikipedia page for | 
|  | // details. | 
|  | repeat := 0 | 
|  | repeatPower := 0 | 
|  |  | 
|  | // The `C' array (used by the inverse BWT) needs to be zero initialized. | 
|  | for i := range bz2.c { | 
|  | bz2.c[i] = 0 | 
|  | } | 
|  |  | 
|  | decoded := 0 // counts the number of symbols decoded by the current tree. | 
|  | for { | 
|  | if decoded == 50 { | 
|  | if selectorIndex >= numSelectors { | 
|  | return StructuralError("insufficient selector indices for number of symbols") | 
|  | } | 
|  | if int(treeIndexes[selectorIndex]) >= len(huffmanTrees) { | 
|  | return StructuralError("tree selector out of range") | 
|  | } | 
|  | currentHuffmanTree = huffmanTrees[treeIndexes[selectorIndex]] | 
|  | selectorIndex++ | 
|  | decoded = 0 | 
|  | } | 
|  |  | 
|  | v := currentHuffmanTree.Decode(br) | 
|  | decoded++ | 
|  |  | 
|  | if v < 2 { | 
|  | // This is either the RUNA or RUNB symbol. | 
|  | if repeat == 0 { | 
|  | repeatPower = 1 | 
|  | } | 
|  | repeat += repeatPower << v | 
|  | repeatPower <<= 1 | 
|  |  | 
|  | // This limit of 2 million comes from the bzip2 source | 
|  | // code. It prevents repeat from overflowing. | 
|  | if repeat > 2*1024*1024 { | 
|  | return StructuralError("repeat count too large") | 
|  | } | 
|  | continue | 
|  | } | 
|  |  | 
|  | if repeat > 0 { | 
|  | // We have decoded a complete run-length so we need to | 
|  | // replicate the last output symbol. | 
|  | if repeat > bz2.blockSize-bufIndex { | 
|  | return StructuralError("repeats past end of block") | 
|  | } | 
|  | for i := 0; i < repeat; i++ { | 
|  | b := mtf.First() | 
|  | bz2.tt[bufIndex] = uint32(b) | 
|  | bz2.c[b]++ | 
|  | bufIndex++ | 
|  | } | 
|  | repeat = 0 | 
|  | } | 
|  |  | 
|  | if int(v) == numSymbols-1 { | 
|  | // This is the EOF symbol. Because it's always at the | 
|  | // end of the move-to-front list, and never gets moved | 
|  | // to the front, it has this unique value. | 
|  | break | 
|  | } | 
|  |  | 
|  | // Since two metasymbols (RUNA and RUNB) have values 0 and 1, | 
|  | // one would expect |v-2| to be passed to the MTF decoder. | 
|  | // However, the front of the MTF list is never referenced as 0, | 
|  | // it's always referenced with a run-length of 1. Thus 0 | 
|  | // doesn't need to be encoded and we have |v-1| in the next | 
|  | // line. | 
|  | b := mtf.Decode(int(v - 1)) | 
|  | if bufIndex >= bz2.blockSize { | 
|  | return StructuralError("data exceeds block size") | 
|  | } | 
|  | bz2.tt[bufIndex] = uint32(b) | 
|  | bz2.c[b]++ | 
|  | bufIndex++ | 
|  | } | 
|  |  | 
|  | if origPtr >= uint(bufIndex) { | 
|  | return StructuralError("origPtr out of bounds") | 
|  | } | 
|  |  | 
|  | // We have completed the entropy decoding. Now we can perform the | 
|  | // inverse BWT and setup the RLE buffer. | 
|  | bz2.preRLE = bz2.tt[:bufIndex] | 
|  | bz2.preRLEUsed = 0 | 
|  | bz2.tPos = inverseBWT(bz2.preRLE, origPtr, bz2.c[:]) | 
|  | bz2.lastByte = -1 | 
|  | bz2.byteRepeats = 0 | 
|  | bz2.repeats = 0 | 
|  |  | 
|  | return nil | 
|  | } | 
|  |  | 
|  | // inverseBWT implements the inverse Burrows-Wheeler transform as described in | 
|  | // http://www.hpl.hp.com/techreports/Compaq-DEC/SRC-RR-124.pdf, section 4.2. | 
|  | // In that document, origPtr is called ``I'' and c is the ``C'' array after the | 
|  | // first pass over the data. It's an argument here because we merge the first | 
|  | // pass with the Huffman decoding. | 
|  | // | 
|  | // This also implements the ``single array'' method from the bzip2 source code | 
|  | // which leaves the output, still shuffled, in the bottom 8 bits of tt with the | 
|  | // index of the next byte in the top 24-bits. The index of the first byte is | 
|  | // returned. | 
|  | func inverseBWT(tt []uint32, origPtr uint, c []uint) uint32 { | 
|  | sum := uint(0) | 
|  | for i := 0; i < 256; i++ { | 
|  | sum += c[i] | 
|  | c[i] = sum - c[i] | 
|  | } | 
|  |  | 
|  | for i := range tt { | 
|  | b := tt[i] & 0xff | 
|  | tt[c[b]] |= uint32(i) << 8 | 
|  | c[b]++ | 
|  | } | 
|  |  | 
|  | return tt[origPtr] >> 8 | 
|  | } | 
|  |  | 
|  | // This is a standard CRC32 like in hash/crc32 except that all the shifts are reversed, | 
|  | // causing the bits in the input to be processed in the reverse of the usual order. | 
|  |  | 
|  | var crctab [256]uint32 | 
|  |  | 
|  | func init() { | 
|  | const poly = 0x04C11DB7 | 
|  | for i := range crctab { | 
|  | crc := uint32(i) << 24 | 
|  | for j := 0; j < 8; j++ { | 
|  | if crc&0x80000000 != 0 { | 
|  | crc = (crc << 1) ^ poly | 
|  | } else { | 
|  | crc <<= 1 | 
|  | } | 
|  | } | 
|  | crctab[i] = crc | 
|  | } | 
|  | } | 
|  |  | 
|  | // updateCRC updates the crc value to incorporate the data in b. | 
|  | // The initial value is 0. | 
|  | func updateCRC(val uint32, b []byte) uint32 { | 
|  | crc := ^val | 
|  | for _, v := range b { | 
|  | crc = crctab[byte(crc>>24)^v] ^ (crc << 8) | 
|  | } | 
|  | return ^crc | 
|  | } |