4
0
mirror of https://github.com/cwinfo/matterbridge.git synced 2025-07-03 15:27:45 +00:00

Update dependencies (#1851)

This commit is contained in:
Wim
2022-06-25 00:36:16 +02:00
committed by GitHub
parent 5604d140e3
commit 4649876956
87 changed files with 10535 additions and 4392 deletions

View File

@ -1,5 +0,0 @@
package huff0
//go:generate go run generate.go
//go:generate asmfmt -w decompress_amd64.s
//go:generate asmfmt -w decompress_8b_amd64.s

View File

@ -165,11 +165,6 @@ func (b *bitReaderShifted) peekBitsFast(n uint8) uint16 {
return uint16(b.value >> ((64 - n) & 63))
}
// peekTopBits(n) is equvialent to peekBitFast(64 - n)
func (b *bitReaderShifted) peekTopBits(n uint8) uint16 {
return uint16(b.value >> n)
}
func (b *bitReaderShifted) advance(n uint8) {
b.bitsRead += n
b.value <<= n & 63
@ -220,11 +215,6 @@ func (b *bitReaderShifted) fill() {
}
}
// finished returns true if all bits have been read from the bit stream.
func (b *bitReaderShifted) finished() bool {
return b.off == 0 && b.bitsRead >= 64
}
func (b *bitReaderShifted) remaining() uint {
return b.off*8 + uint(64-b.bitsRead)
}

View File

@ -5,8 +5,6 @@
package huff0
import "fmt"
// bitWriter will write bits.
// First bit will be LSB of the first byte of output.
type bitWriter struct {
@ -23,14 +21,6 @@ var bitMask16 = [32]uint16{
0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF, 0xFFFF,
0xFFFF, 0xFFFF} /* up to 16 bits */
// addBits16NC will add up to 16 bits.
// It will not check if there is space for them,
// so the caller must ensure that it has flushed recently.
func (b *bitWriter) addBits16NC(value uint16, bits uint8) {
b.bitContainer |= uint64(value&bitMask16[bits&31]) << (b.nBits & 63)
b.nBits += bits
}
// addBits16Clean will add up to 16 bits. value may not contain more set bits than indicated.
// It will not check if there is space for them, so the caller must ensure that it has flushed recently.
func (b *bitWriter) addBits16Clean(value uint16, bits uint8) {
@ -70,104 +60,6 @@ func (b *bitWriter) encTwoSymbols(ct cTable, av, bv byte) {
b.nBits += encA.nBits + encB.nBits
}
// addBits16ZeroNC will add up to 16 bits.
// It will not check if there is space for them,
// so the caller must ensure that it has flushed recently.
// This is fastest if bits can be zero.
func (b *bitWriter) addBits16ZeroNC(value uint16, bits uint8) {
if bits == 0 {
return
}
value <<= (16 - bits) & 15
value >>= (16 - bits) & 15
b.bitContainer |= uint64(value) << (b.nBits & 63)
b.nBits += bits
}
// flush will flush all pending full bytes.
// There will be at least 56 bits available for writing when this has been called.
// Using flush32 is faster, but leaves less space for writing.
func (b *bitWriter) flush() {
v := b.nBits >> 3
switch v {
case 0:
return
case 1:
b.out = append(b.out,
byte(b.bitContainer),
)
b.bitContainer >>= 1 << 3
case 2:
b.out = append(b.out,
byte(b.bitContainer),
byte(b.bitContainer>>8),
)
b.bitContainer >>= 2 << 3
case 3:
b.out = append(b.out,
byte(b.bitContainer),
byte(b.bitContainer>>8),
byte(b.bitContainer>>16),
)
b.bitContainer >>= 3 << 3
case 4:
b.out = append(b.out,
byte(b.bitContainer),
byte(b.bitContainer>>8),
byte(b.bitContainer>>16),
byte(b.bitContainer>>24),
)
b.bitContainer >>= 4 << 3
case 5:
b.out = append(b.out,
byte(b.bitContainer),
byte(b.bitContainer>>8),
byte(b.bitContainer>>16),
byte(b.bitContainer>>24),
byte(b.bitContainer>>32),
)
b.bitContainer >>= 5 << 3
case 6:
b.out = append(b.out,
byte(b.bitContainer),
byte(b.bitContainer>>8),
byte(b.bitContainer>>16),
byte(b.bitContainer>>24),
byte(b.bitContainer>>32),
byte(b.bitContainer>>40),
)
b.bitContainer >>= 6 << 3
case 7:
b.out = append(b.out,
byte(b.bitContainer),
byte(b.bitContainer>>8),
byte(b.bitContainer>>16),
byte(b.bitContainer>>24),
byte(b.bitContainer>>32),
byte(b.bitContainer>>40),
byte(b.bitContainer>>48),
)
b.bitContainer >>= 7 << 3
case 8:
b.out = append(b.out,
byte(b.bitContainer),
byte(b.bitContainer>>8),
byte(b.bitContainer>>16),
byte(b.bitContainer>>24),
byte(b.bitContainer>>32),
byte(b.bitContainer>>40),
byte(b.bitContainer>>48),
byte(b.bitContainer>>56),
)
b.bitContainer = 0
b.nBits = 0
return
default:
panic(fmt.Errorf("bits (%d) > 64", b.nBits))
}
b.nBits &= 7
}
// flush32 will flush out, so there are at least 32 bits available for writing.
func (b *bitWriter) flush32() {
if b.nBits < 32 {
@ -201,10 +93,3 @@ func (b *bitWriter) close() error {
b.flushAlign()
return nil
}
// reset and continue writing by appending to out.
func (b *bitWriter) reset(out []byte) {
b.bitContainer = 0
b.nBits = 0
b.out = out
}

View File

@ -20,11 +20,6 @@ func (b *byteReader) init(in []byte) {
b.off = 0
}
// advance the stream b n bytes.
func (b *byteReader) advance(n uint) {
b.off += int(n)
}
// Int32 returns a little endian int32 starting at current offset.
func (b byteReader) Int32() int32 {
v3 := int32(b.b[b.off+3])
@ -43,11 +38,6 @@ func (b byteReader) Uint32() uint32 {
return (v3 << 24) | (v2 << 16) | (v1 << 8) | v0
}
// unread returns the unread portion of the input.
func (b byteReader) unread() []byte {
return b.b[b.off:]
}
// remain will return the number of bytes remaining.
func (b byteReader) remain() int {
return len(b.b) - b.off

View File

@ -404,6 +404,7 @@ func (s *Scratch) canUseTable(c cTable) bool {
return true
}
//lint:ignore U1000 used for debugging
func (s *Scratch) validateTable(c cTable) bool {
if len(c) < int(s.symbolLen) {
return false

View File

@ -11,7 +11,6 @@ import (
type dTable struct {
single []dEntrySingle
double []dEntryDouble
}
// single-symbols decoding
@ -19,13 +18,6 @@ type dEntrySingle struct {
entry uint16
}
// double-symbols decoding
type dEntryDouble struct {
seq [4]byte
nBits uint8
len uint8
}
// Uses special code for all tables that are < 8 bits.
const use8BitTables = true
@ -35,7 +27,7 @@ const use8BitTables = true
// If no Scratch is provided a new one is allocated.
// The returned Scratch can be used for encoding or decoding input using this table.
func ReadTable(in []byte, s *Scratch) (s2 *Scratch, remain []byte, err error) {
s, err = s.prepare(in)
s, err = s.prepare(nil)
if err != nil {
return s, nil, err
}
@ -236,108 +228,6 @@ func (d *Decoder) buffer() *[4][256]byte {
return &[4][256]byte{}
}
// Decompress1X will decompress a 1X encoded stream.
// The cap of the output buffer will be the maximum decompressed size.
// The length of the supplied input must match the end of a block exactly.
func (d *Decoder) Decompress1X(dst, src []byte) ([]byte, error) {
if len(d.dt.single) == 0 {
return nil, errors.New("no table loaded")
}
if use8BitTables && d.actualTableLog <= 8 {
return d.decompress1X8Bit(dst, src)
}
var br bitReaderShifted
err := br.init(src)
if err != nil {
return dst, err
}
maxDecodedSize := cap(dst)
dst = dst[:0]
// Avoid bounds check by always having full sized table.
const tlSize = 1 << tableLogMax
const tlMask = tlSize - 1
dt := d.dt.single[:tlSize]
// Use temp table to avoid bound checks/append penalty.
bufs := d.buffer()
buf := &bufs[0]
var off uint8
for br.off >= 8 {
br.fillFast()
v := dt[br.peekBitsFast(d.actualTableLog)&tlMask]
br.advance(uint8(v.entry))
buf[off+0] = uint8(v.entry >> 8)
v = dt[br.peekBitsFast(d.actualTableLog)&tlMask]
br.advance(uint8(v.entry))
buf[off+1] = uint8(v.entry >> 8)
// Refill
br.fillFast()
v = dt[br.peekBitsFast(d.actualTableLog)&tlMask]
br.advance(uint8(v.entry))
buf[off+2] = uint8(v.entry >> 8)
v = dt[br.peekBitsFast(d.actualTableLog)&tlMask]
br.advance(uint8(v.entry))
buf[off+3] = uint8(v.entry >> 8)
off += 4
if off == 0 {
if len(dst)+256 > maxDecodedSize {
br.close()
d.bufs.Put(bufs)
return nil, ErrMaxDecodedSizeExceeded
}
dst = append(dst, buf[:]...)
}
}
if len(dst)+int(off) > maxDecodedSize {
d.bufs.Put(bufs)
br.close()
return nil, ErrMaxDecodedSizeExceeded
}
dst = append(dst, buf[:off]...)
// br < 8, so uint8 is fine
bitsLeft := uint8(br.off)*8 + 64 - br.bitsRead
for bitsLeft > 0 {
br.fill()
if false && br.bitsRead >= 32 {
if br.off >= 4 {
v := br.in[br.off-4:]
v = v[:4]
low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
br.value = (br.value << 32) | uint64(low)
br.bitsRead -= 32
br.off -= 4
} else {
for br.off > 0 {
br.value = (br.value << 8) | uint64(br.in[br.off-1])
br.bitsRead -= 8
br.off--
}
}
}
if len(dst) >= maxDecodedSize {
d.bufs.Put(bufs)
br.close()
return nil, ErrMaxDecodedSizeExceeded
}
v := d.dt.single[br.peekBitsFast(d.actualTableLog)&tlMask]
nBits := uint8(v.entry)
br.advance(nBits)
bitsLeft -= nBits
dst = append(dst, uint8(v.entry>>8))
}
d.bufs.Put(bufs)
return dst, br.close()
}
// decompress1X8Bit will decompress a 1X encoded stream with tablelog <= 8.
// The cap of the output buffer will be the maximum decompressed size.
// The length of the supplied input must match the end of a block exactly.
@ -995,7 +885,6 @@ func (d *Decoder) decompress4X8bitExactly(dst, src []byte) ([]byte, error) {
const shift = 56
const tlSize = 1 << 8
const tlMask = tlSize - 1
single := d.dt.single[:tlSize]
// Use temp table to avoid bound checks/append penalty.

View File

@ -1,488 +0,0 @@
// +build !appengine
// +build gc
// +build !noasm
#include "textflag.h"
#include "funcdata.h"
#include "go_asm.h"
#define bufoff 256 // see decompress.go, we're using [4][256]byte table
// func decompress4x_main_loop_x86(pbr0, pbr1, pbr2, pbr3 *bitReaderShifted,
// peekBits uint8, buf *byte, tbl *dEntrySingle) (int, bool)
TEXT ·decompress4x_8b_loop_x86(SB), NOSPLIT, $8
#define off R8
#define buffer DI
#define table SI
#define br_bits_read R9
#define br_value R10
#define br_offset R11
#define peek_bits R12
#define exhausted DX
#define br0 R13
#define br1 R14
#define br2 R15
#define br3 BP
MOVQ BP, 0(SP)
XORQ exhausted, exhausted // exhausted = false
XORQ off, off // off = 0
MOVBQZX peekBits+32(FP), peek_bits
MOVQ buf+40(FP), buffer
MOVQ tbl+48(FP), table
MOVQ pbr0+0(FP), br0
MOVQ pbr1+8(FP), br1
MOVQ pbr2+16(FP), br2
MOVQ pbr3+24(FP), br3
main_loop:
// const stream = 0
// br0.fillFast()
MOVBQZX bitReaderShifted_bitsRead(br0), br_bits_read
MOVQ bitReaderShifted_value(br0), br_value
MOVQ bitReaderShifted_off(br0), br_offset
// if b.bitsRead >= 32 {
CMPQ br_bits_read, $32
JB skip_fill0
SUBQ $32, br_bits_read // b.bitsRead -= 32
SUBQ $4, br_offset // b.off -= 4
// v := b.in[b.off-4 : b.off]
// v = v[:4]
// low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
MOVQ bitReaderShifted_in(br0), AX
MOVL 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4])
// b.value |= uint64(low) << (b.bitsRead & 63)
MOVQ br_bits_read, CX
SHLQ CL, AX
ORQ AX, br_value
// exhausted = exhausted || (br0.off < 4)
CMPQ br_offset, $4
SETLT DL
ORB DL, DH
// }
skip_fill0:
// val0 := br0.peekTopBits(peekBits)
MOVQ br_value, AX
MOVQ peek_bits, CX
SHRQ CL, AX // AX = (value >> peek_bits) & mask
// v0 := table[val0&mask]
MOVW 0(table)(AX*2), AX // AX - v0
// br0.advance(uint8(v0.entry))
MOVB AH, BL // BL = uint8(v0.entry >> 8)
MOVBQZX AL, CX
SHLQ CL, br_value // value <<= n
ADDQ CX, br_bits_read // bits_read += n
// val1 := br0.peekTopBits(peekBits)
MOVQ peek_bits, CX
MOVQ br_value, AX
SHRQ CL, AX // AX = (value >> peek_bits) & mask
// v1 := table[val1&mask]
MOVW 0(table)(AX*2), AX // AX - v1
// br0.advance(uint8(v1.entry))
MOVB AH, BH // BH = uint8(v1.entry >> 8)
MOVBQZX AL, CX
SHLQ CX, br_value // value <<= n
ADDQ CX, br_bits_read // bits_read += n
// these two writes get coalesced
// buf[stream][off] = uint8(v0.entry >> 8)
// buf[stream][off+1] = uint8(v1.entry >> 8)
MOVW BX, 0(buffer)(off*1)
// SECOND PART:
// val2 := br0.peekTopBits(peekBits)
MOVQ br_value, AX
MOVQ peek_bits, CX
SHRQ CL, AX // AX = (value >> peek_bits) & mask
// v2 := table[val0&mask]
MOVW 0(table)(AX*2), AX // AX - v0
// br0.advance(uint8(v0.entry))
MOVB AH, BL // BL = uint8(v0.entry >> 8)
MOVBQZX AL, CX
SHLQ CL, br_value // value <<= n
ADDQ CX, br_bits_read // bits_read += n
// val3 := br0.peekTopBits(peekBits)
MOVQ peek_bits, CX
MOVQ br_value, AX
SHRQ CL, AX // AX = (value >> peek_bits) & mask
// v3 := table[val1&mask]
MOVW 0(table)(AX*2), AX // AX - v1
// br0.advance(uint8(v1.entry))
MOVB AH, BH // BH = uint8(v1.entry >> 8)
MOVBQZX AL, CX
SHLQ CX, br_value // value <<= n
ADDQ CX, br_bits_read // bits_read += n
// these two writes get coalesced
// buf[stream][off+2] = uint8(v2.entry >> 8)
// buf[stream][off+3] = uint8(v3.entry >> 8)
MOVW BX, 0+2(buffer)(off*1)
// update the bitrader reader structure
MOVB br_bits_read, bitReaderShifted_bitsRead(br0)
MOVQ br_value, bitReaderShifted_value(br0)
MOVQ br_offset, bitReaderShifted_off(br0)
// const stream = 1
// br1.fillFast()
MOVBQZX bitReaderShifted_bitsRead(br1), br_bits_read
MOVQ bitReaderShifted_value(br1), br_value
MOVQ bitReaderShifted_off(br1), br_offset
// if b.bitsRead >= 32 {
CMPQ br_bits_read, $32
JB skip_fill1
SUBQ $32, br_bits_read // b.bitsRead -= 32
SUBQ $4, br_offset // b.off -= 4
// v := b.in[b.off-4 : b.off]
// v = v[:4]
// low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
MOVQ bitReaderShifted_in(br1), AX
MOVL 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4])
// b.value |= uint64(low) << (b.bitsRead & 63)
MOVQ br_bits_read, CX
SHLQ CL, AX
ORQ AX, br_value
// exhausted = exhausted || (br1.off < 4)
CMPQ br_offset, $4
SETLT DL
ORB DL, DH
// }
skip_fill1:
// val0 := br1.peekTopBits(peekBits)
MOVQ br_value, AX
MOVQ peek_bits, CX
SHRQ CL, AX // AX = (value >> peek_bits) & mask
// v0 := table[val0&mask]
MOVW 0(table)(AX*2), AX // AX - v0
// br1.advance(uint8(v0.entry))
MOVB AH, BL // BL = uint8(v0.entry >> 8)
MOVBQZX AL, CX
SHLQ CL, br_value // value <<= n
ADDQ CX, br_bits_read // bits_read += n
// val1 := br1.peekTopBits(peekBits)
MOVQ peek_bits, CX
MOVQ br_value, AX
SHRQ CL, AX // AX = (value >> peek_bits) & mask
// v1 := table[val1&mask]
MOVW 0(table)(AX*2), AX // AX - v1
// br1.advance(uint8(v1.entry))
MOVB AH, BH // BH = uint8(v1.entry >> 8)
MOVBQZX AL, CX
SHLQ CX, br_value // value <<= n
ADDQ CX, br_bits_read // bits_read += n
// these two writes get coalesced
// buf[stream][off] = uint8(v0.entry >> 8)
// buf[stream][off+1] = uint8(v1.entry >> 8)
MOVW BX, 256(buffer)(off*1)
// SECOND PART:
// val2 := br1.peekTopBits(peekBits)
MOVQ br_value, AX
MOVQ peek_bits, CX
SHRQ CL, AX // AX = (value >> peek_bits) & mask
// v2 := table[val0&mask]
MOVW 0(table)(AX*2), AX // AX - v0
// br1.advance(uint8(v0.entry))
MOVB AH, BL // BL = uint8(v0.entry >> 8)
MOVBQZX AL, CX
SHLQ CL, br_value // value <<= n
ADDQ CX, br_bits_read // bits_read += n
// val3 := br1.peekTopBits(peekBits)
MOVQ peek_bits, CX
MOVQ br_value, AX
SHRQ CL, AX // AX = (value >> peek_bits) & mask
// v3 := table[val1&mask]
MOVW 0(table)(AX*2), AX // AX - v1
// br1.advance(uint8(v1.entry))
MOVB AH, BH // BH = uint8(v1.entry >> 8)
MOVBQZX AL, CX
SHLQ CX, br_value // value <<= n
ADDQ CX, br_bits_read // bits_read += n
// these two writes get coalesced
// buf[stream][off+2] = uint8(v2.entry >> 8)
// buf[stream][off+3] = uint8(v3.entry >> 8)
MOVW BX, 256+2(buffer)(off*1)
// update the bitrader reader structure
MOVB br_bits_read, bitReaderShifted_bitsRead(br1)
MOVQ br_value, bitReaderShifted_value(br1)
MOVQ br_offset, bitReaderShifted_off(br1)
// const stream = 2
// br2.fillFast()
MOVBQZX bitReaderShifted_bitsRead(br2), br_bits_read
MOVQ bitReaderShifted_value(br2), br_value
MOVQ bitReaderShifted_off(br2), br_offset
// if b.bitsRead >= 32 {
CMPQ br_bits_read, $32
JB skip_fill2
SUBQ $32, br_bits_read // b.bitsRead -= 32
SUBQ $4, br_offset // b.off -= 4
// v := b.in[b.off-4 : b.off]
// v = v[:4]
// low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
MOVQ bitReaderShifted_in(br2), AX
MOVL 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4])
// b.value |= uint64(low) << (b.bitsRead & 63)
MOVQ br_bits_read, CX
SHLQ CL, AX
ORQ AX, br_value
// exhausted = exhausted || (br2.off < 4)
CMPQ br_offset, $4
SETLT DL
ORB DL, DH
// }
skip_fill2:
// val0 := br2.peekTopBits(peekBits)
MOVQ br_value, AX
MOVQ peek_bits, CX
SHRQ CL, AX // AX = (value >> peek_bits) & mask
// v0 := table[val0&mask]
MOVW 0(table)(AX*2), AX // AX - v0
// br2.advance(uint8(v0.entry))
MOVB AH, BL // BL = uint8(v0.entry >> 8)
MOVBQZX AL, CX
SHLQ CL, br_value // value <<= n
ADDQ CX, br_bits_read // bits_read += n
// val1 := br2.peekTopBits(peekBits)
MOVQ peek_bits, CX
MOVQ br_value, AX
SHRQ CL, AX // AX = (value >> peek_bits) & mask
// v1 := table[val1&mask]
MOVW 0(table)(AX*2), AX // AX - v1
// br2.advance(uint8(v1.entry))
MOVB AH, BH // BH = uint8(v1.entry >> 8)
MOVBQZX AL, CX
SHLQ CX, br_value // value <<= n
ADDQ CX, br_bits_read // bits_read += n
// these two writes get coalesced
// buf[stream][off] = uint8(v0.entry >> 8)
// buf[stream][off+1] = uint8(v1.entry >> 8)
MOVW BX, 512(buffer)(off*1)
// SECOND PART:
// val2 := br2.peekTopBits(peekBits)
MOVQ br_value, AX
MOVQ peek_bits, CX
SHRQ CL, AX // AX = (value >> peek_bits) & mask
// v2 := table[val0&mask]
MOVW 0(table)(AX*2), AX // AX - v0
// br2.advance(uint8(v0.entry))
MOVB AH, BL // BL = uint8(v0.entry >> 8)
MOVBQZX AL, CX
SHLQ CL, br_value // value <<= n
ADDQ CX, br_bits_read // bits_read += n
// val3 := br2.peekTopBits(peekBits)
MOVQ peek_bits, CX
MOVQ br_value, AX
SHRQ CL, AX // AX = (value >> peek_bits) & mask
// v3 := table[val1&mask]
MOVW 0(table)(AX*2), AX // AX - v1
// br2.advance(uint8(v1.entry))
MOVB AH, BH // BH = uint8(v1.entry >> 8)
MOVBQZX AL, CX
SHLQ CX, br_value // value <<= n
ADDQ CX, br_bits_read // bits_read += n
// these two writes get coalesced
// buf[stream][off+2] = uint8(v2.entry >> 8)
// buf[stream][off+3] = uint8(v3.entry >> 8)
MOVW BX, 512+2(buffer)(off*1)
// update the bitrader reader structure
MOVB br_bits_read, bitReaderShifted_bitsRead(br2)
MOVQ br_value, bitReaderShifted_value(br2)
MOVQ br_offset, bitReaderShifted_off(br2)
// const stream = 3
// br3.fillFast()
MOVBQZX bitReaderShifted_bitsRead(br3), br_bits_read
MOVQ bitReaderShifted_value(br3), br_value
MOVQ bitReaderShifted_off(br3), br_offset
// if b.bitsRead >= 32 {
CMPQ br_bits_read, $32
JB skip_fill3
SUBQ $32, br_bits_read // b.bitsRead -= 32
SUBQ $4, br_offset // b.off -= 4
// v := b.in[b.off-4 : b.off]
// v = v[:4]
// low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
MOVQ bitReaderShifted_in(br3), AX
MOVL 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4])
// b.value |= uint64(low) << (b.bitsRead & 63)
MOVQ br_bits_read, CX
SHLQ CL, AX
ORQ AX, br_value
// exhausted = exhausted || (br3.off < 4)
CMPQ br_offset, $4
SETLT DL
ORB DL, DH
// }
skip_fill3:
// val0 := br3.peekTopBits(peekBits)
MOVQ br_value, AX
MOVQ peek_bits, CX
SHRQ CL, AX // AX = (value >> peek_bits) & mask
// v0 := table[val0&mask]
MOVW 0(table)(AX*2), AX // AX - v0
// br3.advance(uint8(v0.entry))
MOVB AH, BL // BL = uint8(v0.entry >> 8)
MOVBQZX AL, CX
SHLQ CL, br_value // value <<= n
ADDQ CX, br_bits_read // bits_read += n
// val1 := br3.peekTopBits(peekBits)
MOVQ peek_bits, CX
MOVQ br_value, AX
SHRQ CL, AX // AX = (value >> peek_bits) & mask
// v1 := table[val1&mask]
MOVW 0(table)(AX*2), AX // AX - v1
// br3.advance(uint8(v1.entry))
MOVB AH, BH // BH = uint8(v1.entry >> 8)
MOVBQZX AL, CX
SHLQ CX, br_value // value <<= n
ADDQ CX, br_bits_read // bits_read += n
// these two writes get coalesced
// buf[stream][off] = uint8(v0.entry >> 8)
// buf[stream][off+1] = uint8(v1.entry >> 8)
MOVW BX, 768(buffer)(off*1)
// SECOND PART:
// val2 := br3.peekTopBits(peekBits)
MOVQ br_value, AX
MOVQ peek_bits, CX
SHRQ CL, AX // AX = (value >> peek_bits) & mask
// v2 := table[val0&mask]
MOVW 0(table)(AX*2), AX // AX - v0
// br3.advance(uint8(v0.entry))
MOVB AH, BL // BL = uint8(v0.entry >> 8)
MOVBQZX AL, CX
SHLQ CL, br_value // value <<= n
ADDQ CX, br_bits_read // bits_read += n
// val3 := br3.peekTopBits(peekBits)
MOVQ peek_bits, CX
MOVQ br_value, AX
SHRQ CL, AX // AX = (value >> peek_bits) & mask
// v3 := table[val1&mask]
MOVW 0(table)(AX*2), AX // AX - v1
// br3.advance(uint8(v1.entry))
MOVB AH, BH // BH = uint8(v1.entry >> 8)
MOVBQZX AL, CX
SHLQ CX, br_value // value <<= n
ADDQ CX, br_bits_read // bits_read += n
// these two writes get coalesced
// buf[stream][off+2] = uint8(v2.entry >> 8)
// buf[stream][off+3] = uint8(v3.entry >> 8)
MOVW BX, 768+2(buffer)(off*1)
// update the bitrader reader structure
MOVB br_bits_read, bitReaderShifted_bitsRead(br3)
MOVQ br_value, bitReaderShifted_value(br3)
MOVQ br_offset, bitReaderShifted_off(br3)
ADDQ $4, off // off += 2
TESTB DH, DH // any br[i].ofs < 4?
JNZ end
CMPQ off, $bufoff
JL main_loop
end:
MOVQ 0(SP), BP
MOVB off, ret+56(FP)
RET
#undef off
#undef buffer
#undef table
#undef br_bits_read
#undef br_value
#undef br_offset
#undef peek_bits
#undef exhausted
#undef br0
#undef br1
#undef br2
#undef br3

View File

@ -1,197 +0,0 @@
// +build !appengine
// +build gc
// +build !noasm
#include "textflag.h"
#include "funcdata.h"
#include "go_asm.h"
#define bufoff 256 // see decompress.go, we're using [4][256]byte table
//func decompress4x_main_loop_x86(pbr0, pbr1, pbr2, pbr3 *bitReaderShifted,
// peekBits uint8, buf *byte, tbl *dEntrySingle) (int, bool)
TEXT ·decompress4x_8b_loop_x86(SB), NOSPLIT, $8
#define off R8
#define buffer DI
#define table SI
#define br_bits_read R9
#define br_value R10
#define br_offset R11
#define peek_bits R12
#define exhausted DX
#define br0 R13
#define br1 R14
#define br2 R15
#define br3 BP
MOVQ BP, 0(SP)
XORQ exhausted, exhausted // exhausted = false
XORQ off, off // off = 0
MOVBQZX peekBits+32(FP), peek_bits
MOVQ buf+40(FP), buffer
MOVQ tbl+48(FP), table
MOVQ pbr0+0(FP), br0
MOVQ pbr1+8(FP), br1
MOVQ pbr2+16(FP), br2
MOVQ pbr3+24(FP), br3
main_loop:
{{ define "decode_2_values_x86" }}
// const stream = {{ var "id" }}
// br{{ var "id"}}.fillFast()
MOVBQZX bitReaderShifted_bitsRead(br{{ var "id" }}), br_bits_read
MOVQ bitReaderShifted_value(br{{ var "id" }}), br_value
MOVQ bitReaderShifted_off(br{{ var "id" }}), br_offset
// if b.bitsRead >= 32 {
CMPQ br_bits_read, $32
JB skip_fill{{ var "id" }}
SUBQ $32, br_bits_read // b.bitsRead -= 32
SUBQ $4, br_offset // b.off -= 4
// v := b.in[b.off-4 : b.off]
// v = v[:4]
// low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
MOVQ bitReaderShifted_in(br{{ var "id" }}), AX
MOVL 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4])
// b.value |= uint64(low) << (b.bitsRead & 63)
MOVQ br_bits_read, CX
SHLQ CL, AX
ORQ AX, br_value
// exhausted = exhausted || (br{{ var "id"}}.off < 4)
CMPQ br_offset, $4
SETLT DL
ORB DL, DH
// }
skip_fill{{ var "id" }}:
// val0 := br{{ var "id"}}.peekTopBits(peekBits)
MOVQ br_value, AX
MOVQ peek_bits, CX
SHRQ CL, AX // AX = (value >> peek_bits) & mask
// v0 := table[val0&mask]
MOVW 0(table)(AX*2), AX // AX - v0
// br{{ var "id"}}.advance(uint8(v0.entry))
MOVB AH, BL // BL = uint8(v0.entry >> 8)
MOVBQZX AL, CX
SHLQ CL, br_value // value <<= n
ADDQ CX, br_bits_read // bits_read += n
// val1 := br{{ var "id"}}.peekTopBits(peekBits)
MOVQ peek_bits, CX
MOVQ br_value, AX
SHRQ CL, AX // AX = (value >> peek_bits) & mask
// v1 := table[val1&mask]
MOVW 0(table)(AX*2), AX // AX - v1
// br{{ var "id"}}.advance(uint8(v1.entry))
MOVB AH, BH // BH = uint8(v1.entry >> 8)
MOVBQZX AL, CX
SHLQ CX, br_value // value <<= n
ADDQ CX, br_bits_read // bits_read += n
// these two writes get coalesced
// buf[stream][off] = uint8(v0.entry >> 8)
// buf[stream][off+1] = uint8(v1.entry >> 8)
MOVW BX, {{ var "bufofs" }}(buffer)(off*1)
// SECOND PART:
// val2 := br{{ var "id"}}.peekTopBits(peekBits)
MOVQ br_value, AX
MOVQ peek_bits, CX
SHRQ CL, AX // AX = (value >> peek_bits) & mask
// v2 := table[val0&mask]
MOVW 0(table)(AX*2), AX // AX - v0
// br{{ var "id"}}.advance(uint8(v0.entry))
MOVB AH, BL // BL = uint8(v0.entry >> 8)
MOVBQZX AL, CX
SHLQ CL, br_value // value <<= n
ADDQ CX, br_bits_read // bits_read += n
// val3 := br{{ var "id"}}.peekTopBits(peekBits)
MOVQ peek_bits, CX
MOVQ br_value, AX
SHRQ CL, AX // AX = (value >> peek_bits) & mask
// v3 := table[val1&mask]
MOVW 0(table)(AX*2), AX // AX - v1
// br{{ var "id"}}.advance(uint8(v1.entry))
MOVB AH, BH // BH = uint8(v1.entry >> 8)
MOVBQZX AL, CX
SHLQ CX, br_value // value <<= n
ADDQ CX, br_bits_read // bits_read += n
// these two writes get coalesced
// buf[stream][off+2] = uint8(v2.entry >> 8)
// buf[stream][off+3] = uint8(v3.entry >> 8)
MOVW BX, {{ var "bufofs" }}+2(buffer)(off*1)
// update the bitrader reader structure
MOVB br_bits_read, bitReaderShifted_bitsRead(br{{ var "id" }})
MOVQ br_value, bitReaderShifted_value(br{{ var "id" }})
MOVQ br_offset, bitReaderShifted_off(br{{ var "id" }})
{{ end }}
{{ set "id" "0" }}
{{ set "ofs" "0" }}
{{ set "bufofs" "0" }} {{/* id * bufoff */}}
{{ template "decode_2_values_x86" . }}
{{ set "id" "1" }}
{{ set "ofs" "8" }}
{{ set "bufofs" "256" }}
{{ template "decode_2_values_x86" . }}
{{ set "id" "2" }}
{{ set "ofs" "16" }}
{{ set "bufofs" "512" }}
{{ template "decode_2_values_x86" . }}
{{ set "id" "3" }}
{{ set "ofs" "24" }}
{{ set "bufofs" "768" }}
{{ template "decode_2_values_x86" . }}
ADDQ $4, off // off += 2
TESTB DH, DH // any br[i].ofs < 4?
JNZ end
CMPQ off, $bufoff
JL main_loop
end:
MOVQ 0(SP), BP
MOVB off, ret+56(FP)
RET
#undef off
#undef buffer
#undef table
#undef br_bits_read
#undef br_value
#undef br_offset
#undef peek_bits
#undef exhausted
#undef br0
#undef br1
#undef br2
#undef br3

View File

@ -2,30 +2,43 @@
// +build amd64,!appengine,!noasm,gc
// This file contains the specialisation of Decoder.Decompress4X
// that uses an asm implementation of its main loop.
// and Decoder.Decompress1X that use an asm implementation of thir main loops.
package huff0
import (
"errors"
"fmt"
"github.com/klauspost/compress/internal/cpuinfo"
)
// decompress4x_main_loop_x86 is an x86 assembler implementation
// of Decompress4X when tablelog > 8.
// go:noescape
func decompress4x_main_loop_x86(pbr0, pbr1, pbr2, pbr3 *bitReaderShifted,
peekBits uint8, buf *byte, tbl *dEntrySingle) uint8
//go:noescape
func decompress4x_main_loop_amd64(ctx *decompress4xContext)
// decompress4x_8b_loop_x86 is an x86 assembler implementation
// of Decompress4X when tablelog <= 8 which decodes 4 entries
// per loop.
// go:noescape
func decompress4x_8b_loop_x86(pbr0, pbr1, pbr2, pbr3 *bitReaderShifted,
peekBits uint8, buf *byte, tbl *dEntrySingle) uint8
//go:noescape
func decompress4x_8b_main_loop_amd64(ctx *decompress4xContext)
// fallback8BitSize is the size where using Go version is faster.
const fallback8BitSize = 800
type decompress4xContext struct {
pbr0 *bitReaderShifted
pbr1 *bitReaderShifted
pbr2 *bitReaderShifted
pbr3 *bitReaderShifted
peekBits uint8
out *byte
dstEvery int
tbl *dEntrySingle
decoded int
limit *byte
}
// Decompress4X will decompress a 4X encoded stream.
// The length of the supplied input must match the end of a block exactly.
// The *capacity* of the dst slice must match the destination size of
@ -42,6 +55,7 @@ func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {
if cap(dst) < fallback8BitSize && use8BitTables {
return d.decompress4X8bit(dst, src)
}
var br [4]bitReaderShifted
// Decode "jump table"
start := 6
@ -71,70 +85,28 @@ func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {
const tlMask = tlSize - 1
single := d.dt.single[:tlSize]
// Use temp table to avoid bound checks/append penalty.
buf := d.buffer()
var off uint8
var decoded int
const debug = false
// see: bitReaderShifted.peekBitsFast()
peekBits := uint8((64 - d.actualTableLog) & 63)
// Decode 2 values from each decoder/loop.
const bufoff = 256
for {
if br[0].off < 4 || br[1].off < 4 || br[2].off < 4 || br[3].off < 4 {
break
if len(out) > 4*4 && !(br[0].off < 4 || br[1].off < 4 || br[2].off < 4 || br[3].off < 4) {
ctx := decompress4xContext{
pbr0: &br[0],
pbr1: &br[1],
pbr2: &br[2],
pbr3: &br[3],
peekBits: uint8((64 - d.actualTableLog) & 63), // see: bitReaderShifted.peekBitsFast()
out: &out[0],
dstEvery: dstEvery,
tbl: &single[0],
limit: &out[dstEvery-4], // Always stop decoding when first buffer gets here to avoid writing OOB on last.
}
if use8BitTables {
off = decompress4x_8b_loop_x86(&br[0], &br[1], &br[2], &br[3], peekBits, &buf[0][0], &single[0])
decompress4x_8b_main_loop_amd64(&ctx)
} else {
off = decompress4x_main_loop_x86(&br[0], &br[1], &br[2], &br[3], peekBits, &buf[0][0], &single[0])
}
if debug {
fmt.Print("DEBUG: ")
fmt.Printf("off=%d,", off)
for i := 0; i < 4; i++ {
fmt.Printf(" br[%d]={bitsRead=%d, value=%x, off=%d}",
i, br[i].bitsRead, br[i].value, br[i].off)
}
fmt.Println("")
decompress4x_main_loop_amd64(&ctx)
}
if off != 0 {
break
}
if bufoff > dstEvery {
d.bufs.Put(buf)
return nil, errors.New("corruption detected: stream overrun 1")
}
copy(out, buf[0][:])
copy(out[dstEvery:], buf[1][:])
copy(out[dstEvery*2:], buf[2][:])
copy(out[dstEvery*3:], buf[3][:])
out = out[bufoff:]
decoded += bufoff * 4
// There must at least be 3 buffers left.
if len(out) < dstEvery*3 {
d.bufs.Put(buf)
return nil, errors.New("corruption detected: stream overrun 2")
}
}
if off > 0 {
ioff := int(off)
if len(out) < dstEvery*3+ioff {
d.bufs.Put(buf)
return nil, errors.New("corruption detected: stream overrun 3")
}
copy(out, buf[0][:off])
copy(out[dstEvery:], buf[1][:off])
copy(out[dstEvery*2:], buf[2][:off])
copy(out[dstEvery*3:], buf[3][:off])
decoded += int(off) * 4
out = out[off:]
decoded = ctx.decoded
out = out[decoded/4:]
}
// Decode remaining.
@ -150,7 +122,6 @@ func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {
for bitsLeft > 0 {
br.fill()
if offset >= endsAt {
d.bufs.Put(buf)
return nil, errors.New("corruption detected: stream overrun 4")
}
@ -164,7 +135,6 @@ func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {
offset++
}
if offset != endsAt {
d.bufs.Put(buf)
return nil, fmt.Errorf("corruption detected: short output block %d, end %d != %d", i, offset, endsAt)
}
decoded += offset - dstEvery*i
@ -173,9 +143,86 @@ func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {
return nil, err
}
}
d.bufs.Put(buf)
if dstSize != decoded {
return nil, errors.New("corruption detected: short output block")
}
return dst, nil
}
// decompress4x_main_loop_x86 is an x86 assembler implementation
// of Decompress1X when tablelog > 8.
//go:noescape
func decompress1x_main_loop_amd64(ctx *decompress1xContext)
// decompress4x_main_loop_x86 is an x86 with BMI2 assembler implementation
// of Decompress1X when tablelog > 8.
//go:noescape
func decompress1x_main_loop_bmi2(ctx *decompress1xContext)
type decompress1xContext struct {
pbr *bitReaderShifted
peekBits uint8
out *byte
outCap int
tbl *dEntrySingle
decoded int
}
// Error reported by asm implementations
const error_max_decoded_size_exeeded = -1
// Decompress1X will decompress a 1X encoded stream.
// The cap of the output buffer will be the maximum decompressed size.
// The length of the supplied input must match the end of a block exactly.
func (d *Decoder) Decompress1X(dst, src []byte) ([]byte, error) {
if len(d.dt.single) == 0 {
return nil, errors.New("no table loaded")
}
var br bitReaderShifted
err := br.init(src)
if err != nil {
return dst, err
}
maxDecodedSize := cap(dst)
dst = dst[:maxDecodedSize]
const tlSize = 1 << tableLogMax
const tlMask = tlSize - 1
if maxDecodedSize >= 4 {
ctx := decompress1xContext{
pbr: &br,
out: &dst[0],
outCap: maxDecodedSize,
peekBits: uint8((64 - d.actualTableLog) & 63), // see: bitReaderShifted.peekBitsFast()
tbl: &d.dt.single[0],
}
if cpuinfo.HasBMI2() {
decompress1x_main_loop_bmi2(&ctx)
} else {
decompress1x_main_loop_amd64(&ctx)
}
if ctx.decoded == error_max_decoded_size_exeeded {
return nil, ErrMaxDecodedSizeExceeded
}
dst = dst[:ctx.decoded]
}
// br < 8, so uint8 is fine
bitsLeft := uint8(br.off)*8 + 64 - br.bitsRead
for bitsLeft > 0 {
br.fill()
if len(dst) >= maxDecodedSize {
br.close()
return nil, ErrMaxDecodedSizeExceeded
}
v := d.dt.single[br.peekBitsFast(d.actualTableLog)&tlMask]
nBits := uint8(v.entry)
br.advance(nBits)
bitsLeft -= nBits
dst = append(dst, uint8(v.entry>>8))
}
return dst, br.close()
}

File diff suppressed because it is too large Load Diff

View File

@ -1,195 +0,0 @@
// +build !appengine
// +build gc
// +build !noasm
#include "textflag.h"
#include "funcdata.h"
#include "go_asm.h"
#ifdef GOAMD64_v4
#ifndef GOAMD64_v3
#define GOAMD64_v3
#endif
#endif
#define bufoff 256 // see decompress.go, we're using [4][256]byte table
//func decompress4x_main_loop_x86(pbr0, pbr1, pbr2, pbr3 *bitReaderShifted,
// peekBits uint8, buf *byte, tbl *dEntrySingle) (int, bool)
TEXT ·decompress4x_main_loop_x86(SB), NOSPLIT, $8
#define off R8
#define buffer DI
#define table SI
#define br_bits_read R9
#define br_value R10
#define br_offset R11
#define peek_bits R12
#define exhausted DX
#define br0 R13
#define br1 R14
#define br2 R15
#define br3 BP
MOVQ BP, 0(SP)
XORQ exhausted, exhausted // exhausted = false
XORQ off, off // off = 0
MOVBQZX peekBits+32(FP), peek_bits
MOVQ buf+40(FP), buffer
MOVQ tbl+48(FP), table
MOVQ pbr0+0(FP), br0
MOVQ pbr1+8(FP), br1
MOVQ pbr2+16(FP), br2
MOVQ pbr3+24(FP), br3
main_loop:
{{ define "decode_2_values_x86" }}
// const stream = {{ var "id" }}
// br{{ var "id"}}.fillFast()
MOVBQZX bitReaderShifted_bitsRead(br{{ var "id" }}), br_bits_read
MOVQ bitReaderShifted_value(br{{ var "id" }}), br_value
MOVQ bitReaderShifted_off(br{{ var "id" }}), br_offset
// We must have at least 2 * max tablelog left
CMPQ br_bits_read, $64-22
JBE skip_fill{{ var "id" }}
SUBQ $32, br_bits_read // b.bitsRead -= 32
SUBQ $4, br_offset // b.off -= 4
// v := b.in[b.off-4 : b.off]
// v = v[:4]
// low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
MOVQ bitReaderShifted_in(br{{ var "id" }}), AX
// b.value |= uint64(low) << (b.bitsRead & 63)
#ifdef GOAMD64_v3
SHLXQ br_bits_read, 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4]) << (b.bitsRead & 63)
#else
MOVL 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4])
MOVQ br_bits_read, CX
SHLQ CL, AX
#endif
ORQ AX, br_value
// exhausted = exhausted || (br{{ var "id"}}.off < 4)
CMPQ br_offset, $4
SETLT DL
ORB DL, DH
// }
skip_fill{{ var "id" }}:
// val0 := br{{ var "id"}}.peekTopBits(peekBits)
#ifdef GOAMD64_v3
SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
#else
MOVQ br_value, AX
MOVQ peek_bits, CX
SHRQ CL, AX // AX = (value >> peek_bits) & mask
#endif
// v0 := table[val0&mask]
MOVW 0(table)(AX*2), AX // AX - v0
// br{{ var "id"}}.advance(uint8(v0.entry))
MOVB AH, BL // BL = uint8(v0.entry >> 8)
#ifdef GOAMD64_v3
MOVBQZX AL, CX
SHLXQ AX, br_value, br_value // value <<= n
#else
MOVBQZX AL, CX
SHLQ CL, br_value // value <<= n
#endif
ADDQ CX, br_bits_read // bits_read += n
#ifdef GOAMD64_v3
SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
#else
// val1 := br{{ var "id"}}.peekTopBits(peekBits)
MOVQ peek_bits, CX
MOVQ br_value, AX
SHRQ CL, AX // AX = (value >> peek_bits) & mask
#endif
// v1 := table[val1&mask]
MOVW 0(table)(AX*2), AX // AX - v1
// br{{ var "id"}}.advance(uint8(v1.entry))
MOVB AH, BH // BH = uint8(v1.entry >> 8)
#ifdef GOAMD64_v3
MOVBQZX AL, CX
SHLXQ AX, br_value, br_value // value <<= n
#else
MOVBQZX AL, CX
SHLQ CL, br_value // value <<= n
#endif
ADDQ CX, br_bits_read // bits_read += n
// these two writes get coalesced
// buf[stream][off] = uint8(v0.entry >> 8)
// buf[stream][off+1] = uint8(v1.entry >> 8)
MOVW BX, {{ var "bufofs" }}(buffer)(off*1)
// update the bitrader reader structure
MOVB br_bits_read, bitReaderShifted_bitsRead(br{{ var "id" }})
MOVQ br_value, bitReaderShifted_value(br{{ var "id" }})
MOVQ br_offset, bitReaderShifted_off(br{{ var "id" }})
{{ end }}
{{ set "id" "0" }}
{{ set "ofs" "0" }}
{{ set "bufofs" "0" }} {{/* id * bufoff */}}
{{ template "decode_2_values_x86" . }}
{{ set "id" "1" }}
{{ set "ofs" "8" }}
{{ set "bufofs" "256" }}
{{ template "decode_2_values_x86" . }}
{{ set "id" "2" }}
{{ set "ofs" "16" }}
{{ set "bufofs" "512" }}
{{ template "decode_2_values_x86" . }}
{{ set "id" "3" }}
{{ set "ofs" "24" }}
{{ set "bufofs" "768" }}
{{ template "decode_2_values_x86" . }}
ADDQ $2, off // off += 2
TESTB DH, DH // any br[i].ofs < 4?
JNZ end
CMPQ off, $bufoff
JL main_loop
end:
MOVQ 0(SP), BP
MOVB off, ret+56(FP)
RET
#undef off
#undef buffer
#undef table
#undef br_bits_read
#undef br_value
#undef br_offset
#undef peek_bits
#undef exhausted
#undef br0
#undef br1
#undef br2
#undef br3

View File

@ -191,3 +191,105 @@ func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) {
}
return dst, nil
}
// Decompress1X will decompress a 1X encoded stream.
// The cap of the output buffer will be the maximum decompressed size.
// The length of the supplied input must match the end of a block exactly.
func (d *Decoder) Decompress1X(dst, src []byte) ([]byte, error) {
if len(d.dt.single) == 0 {
return nil, errors.New("no table loaded")
}
if use8BitTables && d.actualTableLog <= 8 {
return d.decompress1X8Bit(dst, src)
}
var br bitReaderShifted
err := br.init(src)
if err != nil {
return dst, err
}
maxDecodedSize := cap(dst)
dst = dst[:0]
// Avoid bounds check by always having full sized table.
const tlSize = 1 << tableLogMax
const tlMask = tlSize - 1
dt := d.dt.single[:tlSize]
// Use temp table to avoid bound checks/append penalty.
bufs := d.buffer()
buf := &bufs[0]
var off uint8
for br.off >= 8 {
br.fillFast()
v := dt[br.peekBitsFast(d.actualTableLog)&tlMask]
br.advance(uint8(v.entry))
buf[off+0] = uint8(v.entry >> 8)
v = dt[br.peekBitsFast(d.actualTableLog)&tlMask]
br.advance(uint8(v.entry))
buf[off+1] = uint8(v.entry >> 8)
// Refill
br.fillFast()
v = dt[br.peekBitsFast(d.actualTableLog)&tlMask]
br.advance(uint8(v.entry))
buf[off+2] = uint8(v.entry >> 8)
v = dt[br.peekBitsFast(d.actualTableLog)&tlMask]
br.advance(uint8(v.entry))
buf[off+3] = uint8(v.entry >> 8)
off += 4
if off == 0 {
if len(dst)+256 > maxDecodedSize {
br.close()
d.bufs.Put(bufs)
return nil, ErrMaxDecodedSizeExceeded
}
dst = append(dst, buf[:]...)
}
}
if len(dst)+int(off) > maxDecodedSize {
d.bufs.Put(bufs)
br.close()
return nil, ErrMaxDecodedSizeExceeded
}
dst = append(dst, buf[:off]...)
// br < 8, so uint8 is fine
bitsLeft := uint8(br.off)*8 + 64 - br.bitsRead
for bitsLeft > 0 {
br.fill()
if false && br.bitsRead >= 32 {
if br.off >= 4 {
v := br.in[br.off-4:]
v = v[:4]
low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
br.value = (br.value << 32) | uint64(low)
br.bitsRead -= 32
br.off -= 4
} else {
for br.off > 0 {
br.value = (br.value << 8) | uint64(br.in[br.off-1])
br.bitsRead -= 8
br.off--
}
}
}
if len(dst) >= maxDecodedSize {
d.bufs.Put(bufs)
br.close()
return nil, ErrMaxDecodedSizeExceeded
}
v := d.dt.single[br.peekBitsFast(d.actualTableLog)&tlMask]
nBits := uint8(v.entry)
br.advance(nBits)
bitsLeft -= nBits
dst = append(dst, uint8(v.entry>>8))
}
d.bufs.Put(bufs)
return dst, br.close()
}