// Code generated by command: go run gen.go -out ../decompress_amd64.s -pkg=huff0. DO NOT EDIT.

//go:build amd64 && !appengine && !noasm && gc
// +build amd64,!appengine,!noasm,gc

// func decompress4x_main_loop_amd64(ctx *decompress4xContext)
//
// Decodes four bit streams in lock-step. Every pass of the main loop emits two
// bytes for each of the four streams (8 bytes per pass in total).
//
// Fields read from ctx (byte offsets as used by the code below):
//    0, 8, 16, 24  pointers to the four bit readers (br0..br3)
//   32             byte-sized shift count applied to the bit buffer before each table lookup
//   40             output pointer of stream 0
//   48             distance added to the output pointer to step to the next stream's output
//   56             base address of the lookup table (2-byte entries)
//   72             limit that the stream-0 output pointer is compared against
// Field written to ctx:
//   64             4 * (number of bytes the stream-0 output pointer advanced)
//
// Bit reader fields used: 0 = input base pointer, 24 = offset, 32 = bit buffer,
// 40 = bits-read counter (one byte).
//
// Lookup table entry (16 bit): low byte = number of bits to consume,
// high byte = decoded output byte.
//
// Register use inside the loop:
//   SI = shift count, BX = stream-0 output pointer, DI = current stream's output pointer,
//   R8 = per-stream output distance, R9 = table base, R10..R13 = br0..br3,
//   R14 = bit buffer of the current reader, R15 = its bits-read counter,
//   DL = "stop after this pass" flag, AX = packed output bytes / scratch, CX and BP = scratch,
//   (SP) = output limit.
//
// NOTE(review): BP is used as a scratch register here; confirm this is acceptable for
// frame-pointer based tooling on the targeted Go versions.
TEXT ·decompress4x_main_loop_amd64(SB), $8-8
	XORQ DX, DX

	// Preload values
	MOVQ ctx+0(FP), AX
	MOVBQZX 32(AX), SI
	MOVQ 40(AX), DI
	MOVQ DI, BX
	MOVQ 72(AX), CX
	MOVQ CX, (SP)
	MOVQ 48(AX), R8
	MOVQ 56(AX), R9
	MOVQ (AX), R10
	MOVQ 8(AX), R11
	MOVQ 16(AX), R12
	MOVQ 24(AX), R13

	// Main loop
main_loop:
	// DL is rewritten here on every pass: it becomes 1 when the stream-0 output
	// pointer has reached the limit (signed compare). The pass below is still
	// executed in full; DL is only tested at the bottom of the loop.
	MOVQ BX, DI
	CMPQ DI, (SP)
	SETGE DL

	// br0.fillFast32()
	// The refill is skipped while the bits-read counter is <= 32.
	MOVQ 32(R10), R14
	MOVBQZX 40(R10), R15
	CMPQ R15, $0x20
	JBE skip_fill0
	MOVQ 24(R10), AX
	SUBQ $0x20, R15
	SUBQ $0x04, AX
	MOVQ (R10), BP

	// b.value |= uint64(low) << (b.bitsRead & 63)
	// low is the 32-bit word at input base + (offset - 4).
	MOVL (AX)(BP*1), BP
	MOVQ R15, CX
	SHLQ CL, BP
	MOVQ AX, 24(R10)
	ORQ BP, R14

	// exhausted = exhausted || (br0.off < 4)
	CMPQ AX, $0x04
	SETLT AL
	ORB AL, DL

skip_fill0:
	// val0 := br0.peekTopBits(peekBits)
	MOVQ R14, BP
	MOVQ SI, CX
	SHRQ CL, BP

	// v0 := table[val0&mask]
	MOVW (R9)(BP*2), CX

	// br0.advance(uint8(v0.entry))
	// CH (decoded byte) goes to AL, CL (bit count) shifts the bit buffer and
	// is added to the bits-read counter.
	MOVB CH, AL
	SHLQ CL, R14
	ADDB CL, R15

	// val1 := br0.peekTopBits(peekBits)
	MOVQ SI, CX
	MOVQ R14, BP
	SHRQ CL, BP

	// v1 := table[val1&mask]
	MOVW (R9)(BP*2), CX

	// br0.advance(uint8(v1.entry))
	MOVB CH, AH
	SHLQ CL, R14
	ADDB CL, R15

	// these two writes get coalesced
	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
	MOVW AX, (DI)

	// update the bit reader structure
	MOVQ R14, 32(R10)
	MOVB R15, 40(R10)

	// step the output pointer to the next stream
	ADDQ R8, DI

	// br1.fillFast32()
	MOVQ 32(R11), R14
	MOVBQZX 40(R11), R15
	CMPQ R15, $0x20
	JBE skip_fill1
	MOVQ 24(R11), AX
	SUBQ $0x20, R15
	SUBQ $0x04, AX
	MOVQ (R11), BP

	// b.value |= uint64(low) << (b.bitsRead & 63)
	MOVL (AX)(BP*1), BP
	MOVQ R15, CX
	SHLQ CL, BP
	MOVQ AX, 24(R11)
	ORQ BP, R14

	// exhausted = exhausted || (br1.off < 4)
	CMPQ AX, $0x04
	SETLT AL
	ORB AL, DL

skip_fill1:
	// val0 := br1.peekTopBits(peekBits)
	MOVQ R14, BP
	MOVQ SI, CX
	SHRQ CL, BP

	// v0 := table[val0&mask]
	MOVW (R9)(BP*2), CX

	// br1.advance(uint8(v0.entry))
	MOVB CH, AL
	SHLQ CL, R14
	ADDB CL, R15

	// val1 := br1.peekTopBits(peekBits)
	MOVQ SI, CX
	MOVQ R14, BP
	SHRQ CL, BP

	// v1 := table[val1&mask]
	MOVW (R9)(BP*2), CX

	// br1.advance(uint8(v1.entry))
	MOVB CH, AH
	SHLQ CL, R14
	ADDB CL, R15

	// these two writes get coalesced
	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
	MOVW AX, (DI)

	// update the bit reader structure
	MOVQ R14, 32(R11)
	MOVB R15, 40(R11)

	// step the output pointer to the next stream
	ADDQ R8, DI

	// br2.fillFast32()
	MOVQ 32(R12), R14
	MOVBQZX 40(R12), R15
	CMPQ R15, $0x20
	JBE skip_fill2
	MOVQ 24(R12), AX
	SUBQ $0x20, R15
	SUBQ $0x04, AX
	MOVQ (R12), BP

	// b.value |= uint64(low) << (b.bitsRead & 63)
	MOVL (AX)(BP*1), BP
	MOVQ R15, CX
	SHLQ CL, BP
	MOVQ AX, 24(R12)
	ORQ BP, R14

	// exhausted = exhausted || (br2.off < 4)
	CMPQ AX, $0x04
	SETLT AL
	ORB AL, DL

skip_fill2:
	// val0 := br2.peekTopBits(peekBits)
	MOVQ R14, BP
	MOVQ SI, CX
	SHRQ CL, BP

	// v0 := table[val0&mask]
	MOVW (R9)(BP*2), CX

	// br2.advance(uint8(v0.entry))
	MOVB CH, AL
	SHLQ CL, R14
	ADDB CL, R15

	// val1 := br2.peekTopBits(peekBits)
	MOVQ SI, CX
	MOVQ R14, BP
	SHRQ CL, BP

	// v1 := table[val1&mask]
	MOVW (R9)(BP*2), CX

	// br2.advance(uint8(v1.entry))
	MOVB CH, AH
	SHLQ CL, R14
	ADDB CL, R15

	// these two writes get coalesced
	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
	MOVW AX, (DI)

	// update the bit reader structure
	MOVQ R14, 32(R12)
	MOVB R15, 40(R12)

	// step the output pointer to the next stream
	ADDQ R8, DI

	// br3.fillFast32()
	MOVQ 32(R13), R14
	MOVBQZX 40(R13), R15
	CMPQ R15, $0x20
	JBE skip_fill3
	MOVQ 24(R13), AX
	SUBQ $0x20, R15
	SUBQ $0x04, AX
	MOVQ (R13), BP

	// b.value |= uint64(low) << (b.bitsRead & 63)
	MOVL (AX)(BP*1), BP
	MOVQ R15, CX
	SHLQ CL, BP
	MOVQ AX, 24(R13)
	ORQ BP, R14

	// exhausted = exhausted || (br3.off < 4)
	CMPQ AX, $0x04
	SETLT AL
	ORB AL, DL

skip_fill3:
	// val0 := br3.peekTopBits(peekBits)
	MOVQ R14, BP
	MOVQ SI, CX
	SHRQ CL, BP

	// v0 := table[val0&mask]
	MOVW (R9)(BP*2), CX

	// br3.advance(uint8(v0.entry))
	MOVB CH, AL
	SHLQ CL, R14
	ADDB CL, R15

	// val1 := br3.peekTopBits(peekBits)
	MOVQ SI, CX
	MOVQ R14, BP
	SHRQ CL, BP

	// v1 := table[val1&mask]
	MOVW (R9)(BP*2), CX

	// br3.advance(uint8(v1.entry))
	MOVB CH, AH
	SHLQ CL, R14
	ADDB CL, R15

	// these two writes get coalesced
	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
	MOVW AX, (DI)

	// update the bit reader structure
	MOVQ R14, 32(R13)
	MOVB R15, 40(R13)

	// Advance the stream-0 output pointer by the two bytes written per stream
	// and repeat while the stop flag is clear.
	ADDQ $0x02, BX
	TESTB DL, DL
	JZ main_loop

	// Store 4 * (bytes advanced on stream 0) into ctx at offset 64.
	MOVQ ctx+0(FP), AX
	MOVQ 40(AX), CX
	MOVQ BX, DX
	SUBQ CX, DX
	SHLQ $0x02, DX
	MOVQ DX, 64(AX)
	RET

// func decompress4x_8b_main_loop_amd64(ctx *decompress4xContext)
//
// Variant of the four-stream loop that emits four bytes per stream per pass
// (16 bytes per pass in total). It reads and writes the same ctx and bit
// reader offsets as decompress4x_main_loop_amd64: ctx 0/8/16/24 = bit readers,
// 32 = shift count, 40 = stream-0 output pointer, 48 = per-stream output
// distance, 56 = table base, 72 = output limit, 64 = result written on exit.
//
// Register use inside the loop:
//   BX = shift count, SI = current stream's output pointer, DI = per-stream output distance,
//   R8 = table base, R9..R12 = br0..br3, R13 = bit buffer of the current reader,
//   R14 = its bits-read counter, R15 and BP = scratch, DL = "stop after this pass" flag,
//   AX = packed output bytes / scratch, CX = scratch,
//   (SP) = stream-0 output pointer, 8(SP) = output limit.
//
// The generated labels use the ids 1000..1003 for the readers that the
// pseudo-code comments call br0..br3.
//
// NOTE(review): BP is used as a scratch register here; confirm this is acceptable for
// frame-pointer based tooling on the targeted Go versions.
TEXT ·decompress4x_8b_main_loop_amd64(SB), $16-8
	// DX is cleared here but reloaded from ctx below; DL is then rewritten by
	// SETGE at the top of every pass.
	XORQ DX, DX

	// Preload values
	MOVQ ctx+0(FP), CX
	MOVBQZX 32(CX), BX
	MOVQ 40(CX), SI
	MOVQ SI, (SP)
	MOVQ 72(CX), DX
	MOVQ DX, 8(SP)
	MOVQ 48(CX), DI
	MOVQ 56(CX), R8
	MOVQ (CX), R9
	MOVQ 8(CX), R10
	MOVQ 16(CX), R11
	MOVQ 24(CX), R12

	// Main loop
main_loop:
	// DL becomes 1 when the stream-0 output pointer has reached the limit
	// (signed compare); the pass below still runs in full.
	MOVQ (SP), SI
	CMPQ SI, 8(SP)
	SETGE DL

	// br0.fillFast32()
	// The refill is skipped while the bits-read counter is <= 32.
	MOVQ 32(R9), R13
	MOVBQZX 40(R9), R14
	CMPQ R14, $0x20
	JBE skip_fill1000
	MOVQ 24(R9), R15
	SUBQ $0x20, R14
	SUBQ $0x04, R15
	MOVQ (R9), BP

	// b.value |= uint64(low) << (b.bitsRead & 63)
	MOVL (R15)(BP*1), BP
	MOVQ R14, CX
	SHLQ CL, BP
	MOVQ R15, 24(R9)
	ORQ BP, R13

	// exhausted = exhausted || (br0.off < 4)
	CMPQ R15, $0x04
	SETLT AL
	ORB AL, DL

skip_fill1000:
	// val0 := br0.peekTopBits(peekBits)
	MOVQ R13, R15
	MOVQ BX, CX
	SHRQ CL, R15

	// v0 := table[val0&mask]
	MOVW (R8)(R15*2), CX

	// br0.advance(uint8(v0.entry))
	MOVB CH, AL
	SHLQ CL, R13
	ADDB CL, R14

	// val1 := br0.peekTopBits(peekBits)
	MOVQ R13, R15
	MOVQ BX, CX
	SHRQ CL, R15

	// v1 := table[val1&mask]
	MOVW (R8)(R15*2), CX

	// br0.advance(uint8(v1.entry))
	MOVB CH, AH
	SHLQ CL, R13
	ADDB CL, R14

	// Move v0/v1 into the two upper bytes of EAX so AL/AH are free for v3/v2.
	BSWAPL AX

	// val2 := br0.peekTopBits(peekBits)
	MOVQ R13, R15
	MOVQ BX, CX
	SHRQ CL, R15

	// v2 := table[val2&mask]
	MOVW (R8)(R15*2), CX

	// br0.advance(uint8(v2.entry))
	MOVB CH, AH
	SHLQ CL, R13
	ADDB CL, R14

	// val3 := br0.peekTopBits(peekBits)
	MOVQ R13, R15
	MOVQ BX, CX
	SHRQ CL, R15

	// v3 := table[val3&mask]
	MOVW (R8)(R15*2), CX

	// br0.advance(uint8(v3.entry))
	MOVB CH, AL
	SHLQ CL, R13
	ADDB CL, R14

	// After this swap the bytes of EAX are v0, v1, v2, v3 from lowest to highest.
	BSWAPL AX

	// these four writes get coalesced
	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
	// out[id * dstEvery + 2] = uint8(v2.entry >> 8)
	// out[id * dstEvery + 3] = uint8(v3.entry >> 8)
	MOVL AX, (SI)

	// update the bit reader structure
	MOVQ R13, 32(R9)
	MOVB R14, 40(R9)

	// step the output pointer to the next stream
	ADDQ DI, SI

	// br1.fillFast32()
	MOVQ 32(R10), R13
	MOVBQZX 40(R10), R14
	CMPQ R14, $0x20
	JBE skip_fill1001
	MOVQ 24(R10), R15
	SUBQ $0x20, R14
	SUBQ $0x04, R15
	MOVQ (R10), BP

	// b.value |= uint64(low) << (b.bitsRead & 63)
	MOVL (R15)(BP*1), BP
	MOVQ R14, CX
	SHLQ CL, BP
	MOVQ R15, 24(R10)
	ORQ BP, R13

	// exhausted = exhausted || (br1.off < 4)
	CMPQ R15, $0x04
	SETLT AL
	ORB AL, DL

skip_fill1001:
	// val0 := br1.peekTopBits(peekBits)
	MOVQ R13, R15
	MOVQ BX, CX
	SHRQ CL, R15

	// v0 := table[val0&mask]
	MOVW (R8)(R15*2), CX

	// br1.advance(uint8(v0.entry))
	MOVB CH, AL
	SHLQ CL, R13
	ADDB CL, R14

	// val1 := br1.peekTopBits(peekBits)
	MOVQ R13, R15
	MOVQ BX, CX
	SHRQ CL, R15

	// v1 := table[val1&mask]
	MOVW (R8)(R15*2), CX

	// br1.advance(uint8(v1.entry))
	MOVB CH, AH
	SHLQ CL, R13
	ADDB CL, R14
	BSWAPL AX

	// val2 := br1.peekTopBits(peekBits)
	MOVQ R13, R15
	MOVQ BX, CX
	SHRQ CL, R15

	// v2 := table[val2&mask]
	MOVW (R8)(R15*2), CX

	// br1.advance(uint8(v2.entry))
	MOVB CH, AH
	SHLQ CL, R13
	ADDB CL, R14

	// val3 := br1.peekTopBits(peekBits)
	MOVQ R13, R15
	MOVQ BX, CX
	SHRQ CL, R15

	// v3 := table[val3&mask]
	MOVW (R8)(R15*2), CX

	// br1.advance(uint8(v3.entry))
	MOVB CH, AL
	SHLQ CL, R13
	ADDB CL, R14
	BSWAPL AX

	// these four writes get coalesced
	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
	// out[id * dstEvery + 2] = uint8(v2.entry >> 8)
	// out[id * dstEvery + 3] = uint8(v3.entry >> 8)
	MOVL AX, (SI)

	// update the bit reader structure
	MOVQ R13, 32(R10)
	MOVB R14, 40(R10)

	// step the output pointer to the next stream
	ADDQ DI, SI

	// br2.fillFast32()
	MOVQ 32(R11), R13
	MOVBQZX 40(R11), R14
	CMPQ R14, $0x20
	JBE skip_fill1002
	MOVQ 24(R11), R15
	SUBQ $0x20, R14
	SUBQ $0x04, R15
	MOVQ (R11), BP

	// b.value |= uint64(low) << (b.bitsRead & 63)
	MOVL (R15)(BP*1), BP
	MOVQ R14, CX
	SHLQ CL, BP
	MOVQ R15, 24(R11)
	ORQ BP, R13

	// exhausted = exhausted || (br2.off < 4)
	CMPQ R15, $0x04
	SETLT AL
	ORB AL, DL

skip_fill1002:
	// val0 := br2.peekTopBits(peekBits)
	MOVQ R13, R15
	MOVQ BX, CX
	SHRQ CL, R15

	// v0 := table[val0&mask]
	MOVW (R8)(R15*2), CX

	// br2.advance(uint8(v0.entry))
	MOVB CH, AL
	SHLQ CL, R13
	ADDB CL, R14

	// val1 := br2.peekTopBits(peekBits)
	MOVQ R13, R15
	MOVQ BX, CX
	SHRQ CL, R15

	// v1 := table[val1&mask]
	MOVW (R8)(R15*2), CX

	// br2.advance(uint8(v1.entry))
	MOVB CH, AH
	SHLQ CL, R13
	ADDB CL, R14
	BSWAPL AX

	// val2 := br2.peekTopBits(peekBits)
	MOVQ R13, R15
	MOVQ BX, CX
	SHRQ CL, R15

	// v2 := table[val2&mask]
	MOVW (R8)(R15*2), CX

	// br2.advance(uint8(v2.entry))
	MOVB CH, AH
	SHLQ CL, R13
	ADDB CL, R14

	// val3 := br2.peekTopBits(peekBits)
	MOVQ R13, R15
	MOVQ BX, CX
	SHRQ CL, R15

	// v3 := table[val3&mask]
	MOVW (R8)(R15*2), CX

	// br2.advance(uint8(v3.entry))
	MOVB CH, AL
	SHLQ CL, R13
	ADDB CL, R14
	BSWAPL AX

	// these four writes get coalesced
	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
	// out[id * dstEvery + 2] = uint8(v2.entry >> 8)
	// out[id * dstEvery + 3] = uint8(v3.entry >> 8)
	MOVL AX, (SI)

	// update the bit reader structure
	MOVQ R13, 32(R11)
	MOVB R14, 40(R11)

	// step the output pointer to the next stream
	ADDQ DI, SI

	// br3.fillFast32()
	MOVQ 32(R12), R13
	MOVBQZX 40(R12), R14
	CMPQ R14, $0x20
	JBE skip_fill1003
	MOVQ 24(R12), R15
	SUBQ $0x20, R14
	SUBQ $0x04, R15
	MOVQ (R12), BP

	// b.value |= uint64(low) << (b.bitsRead & 63)
	MOVL (R15)(BP*1), BP
	MOVQ R14, CX
	SHLQ CL, BP
	MOVQ R15, 24(R12)
	ORQ BP, R13

	// exhausted = exhausted || (br3.off < 4)
	CMPQ R15, $0x04
	SETLT AL
	ORB AL, DL

skip_fill1003:
	// val0 := br3.peekTopBits(peekBits)
	MOVQ R13, R15
	MOVQ BX, CX
	SHRQ CL, R15

	// v0 := table[val0&mask]
	MOVW (R8)(R15*2), CX

	// br3.advance(uint8(v0.entry))
	MOVB CH, AL
	SHLQ CL, R13
	ADDB CL, R14

	// val1 := br3.peekTopBits(peekBits)
	MOVQ R13, R15
	MOVQ BX, CX
	SHRQ CL, R15

	// v1 := table[val1&mask]
	MOVW (R8)(R15*2), CX

	// br3.advance(uint8(v1.entry))
	MOVB CH, AH
	SHLQ CL, R13
	ADDB CL, R14
	BSWAPL AX

	// val2 := br3.peekTopBits(peekBits)
	MOVQ R13, R15
	MOVQ BX, CX
	SHRQ CL, R15

	// v2 := table[val2&mask]
	MOVW (R8)(R15*2), CX

	// br3.advance(uint8(v2.entry))
	MOVB CH, AH
	SHLQ CL, R13
	ADDB CL, R14

	// val3 := br3.peekTopBits(peekBits)
	MOVQ R13, R15
	MOVQ BX, CX
	SHRQ CL, R15

	// v3 := table[val3&mask]
	MOVW (R8)(R15*2), CX

	// br3.advance(uint8(v3.entry))
	MOVB CH, AL
	SHLQ CL, R13
	ADDB CL, R14
	BSWAPL AX

	// these four writes get coalesced
	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
	// out[id * dstEvery + 2] = uint8(v2.entry >> 8)
	// out[id * dstEvery + 3] = uint8(v3.entry >> 8)
	MOVL AX, (SI)

	// update the bit reader structure
	MOVQ R13, 32(R12)
	MOVB R14, 40(R12)

	// Advance the stream-0 output pointer (kept on the stack) by the four
	// bytes written per stream and repeat while the stop flag is clear.
	ADDQ $0x04, (SP)
	TESTB DL, DL
	JZ main_loop

	// Store 4 * (bytes advanced on stream 0) into ctx at offset 64.
	MOVQ ctx+0(FP), AX
	MOVQ 40(AX), CX
	MOVQ (SP), DX
	SUBQ CX, DX
	SHLQ $0x02, DX
	MOVQ DX, 64(AX)
	RET

// func decompress1x_main_loop_amd64(ctx *decompress1xContext)
//
// Single-stream decode loop that emits four bytes per pass.
//
// Fields read from ctx (byte offsets as used by the code below):
//    0  pointer to the bit reader
//    8  byte-sized shift count applied to the bit buffer before each table lookup
//   16  output pointer
//   24  output size in bytes (output pointer + size is used as the end pointer)
//   32  base address of the lookup table (2-byte entries)
// Field written to ctx:
//   40  number of bytes written on success, or -1 when the output-room check fails
//
// Bit reader fields used: 0 = input base pointer, 24 = offset, 32 = bit buffer,
// 40 = bits-read counter (one byte). Offset, bit buffer and counter are written
// back on the success path only.
//
// Register use: DX = output pointer, BX = output end, SI = table base, DI = shift count,
// R8 = input base, R9 = input offset, R10 = bit buffer, R11 = bits-read counter,
// AX = packed output bytes, CX and R12 = scratch.
TEXT ·decompress1x_main_loop_amd64(SB), $0-8
	MOVQ ctx+0(FP), CX
	MOVQ 16(CX), DX
	MOVQ 24(CX), BX

	// An output size below 4 (unsigned compare) is reported as an error up front.
	CMPQ BX, $0x04
	JB error_max_decoded_size_exeeded
	LEAQ (DX)(BX*1), BX
	MOVQ (CX), SI
	MOVQ (SI), R8
	MOVQ 24(SI), R9
	MOVQ 32(SI), R10
	MOVBQZX 40(SI), R11
	MOVQ 32(CX), SI
	MOVBQZX 8(CX), DI
	JMP loop_condition

main_loop:
	// Check if we have room for 4 bytes in the output buffer
	// NOTE(review): this is a signed compare and it also rejects the case where
	// exactly 4 bytes remain (out+4 == end); confirm that is what the caller expects.
	LEAQ 4(DX), CX
	CMPQ CX, BX
	JGE error_max_decoded_size_exeeded

	// Decode 4 values
	// Refill (skipped while the bits-read counter is < 32): step the offset back
	// by 4, load 32 bits from input base + offset and OR them into the bit
	// buffer, shifted left by the reduced counter.
	CMPQ R11, $0x20
	JL bitReader_fillFast_1_end
	SUBQ $0x20, R11
	SUBQ $0x04, R9
	MOVL (R8)(R9*1), R12
	MOVQ R11, CX
	SHLQ CL, R12
	ORQ R12, R10

bitReader_fillFast_1_end:
	// Value 0: table index = bit buffer >> shift count; entry high byte -> AL,
	// entry low byte = bits consumed (added to the counter, shifted out of the buffer).
	MOVQ DI, CX
	MOVQ R10, R12
	SHRQ CL, R12
	MOVW (SI)(R12*2), CX
	MOVB CH, AL
	MOVBQZX CL, CX
	ADDQ CX, R11
	SHLQ CL, R10

	// Value 1 -> AH
	MOVQ DI, CX
	MOVQ R10, R12
	SHRQ CL, R12
	MOVW (SI)(R12*2), CX
	MOVB CH, AH
	MOVBQZX CL, CX
	ADDQ CX, R11
	SHLQ CL, R10

	// Move values 0/1 into the two upper bytes of EAX so AL/AH are free again.
	BSWAPL AX

	// Second refill, same steps as the first.
	CMPQ R11, $0x20
	JL bitReader_fillFast_2_end
	SUBQ $0x20, R11
	SUBQ $0x04, R9
	MOVL (R8)(R9*1), R12
	MOVQ R11, CX
	SHLQ CL, R12
	ORQ R12, R10

bitReader_fillFast_2_end:
	// Value 2 -> AH
	MOVQ DI, CX
	MOVQ R10, R12
	SHRQ CL, R12
	MOVW (SI)(R12*2), CX
	MOVB CH, AH
	MOVBQZX CL, CX
	ADDQ CX, R11
	SHLQ CL, R10

	// Value 3 -> AL
	MOVQ DI, CX
	MOVQ R10, R12
	SHRQ CL, R12
	MOVW (SI)(R12*2), CX
	MOVB CH, AL
	MOVBQZX CL, CX
	ADDQ CX, R11
	SHLQ CL, R10

	// After this swap the bytes of EAX are values 0, 1, 2, 3 from lowest to highest.
	BSWAPL AX

	// Store the decoded values
	MOVL AX, (DX)
	ADDQ $0x04, DX

loop_condition:
	// Keep looping while the input offset is at least 8 (signed compare).
	CMPQ R9, $0x08
	JGE main_loop

	// Update ctx structure
	// ctx+40 = current output pointer - original output pointer; then the
	// bit reader's offset, bit buffer and bits-read counter are written back.
	MOVQ ctx+0(FP), AX
	MOVQ DX, CX
	MOVQ 16(AX), DX
	SUBQ DX, CX
	MOVQ CX, 40(AX)
	MOVQ (AX), AX
	MOVQ R9, 24(AX)
	MOVQ R10, 32(AX)
	MOVB R11, 40(AX)
	RET

	// Report error
	// Writes -1 to ctx+40; the bit reader is not updated on this path.
error_max_decoded_size_exeeded:
	MOVQ ctx+0(FP), AX
	MOVQ $-1, CX
	MOVQ CX, 40(AX)
	RET

// func decompress1x_main_loop_bmi2(ctx *decompress1xContext)
// Requires: BMI2
//
// Same loop as decompress1x_main_loop_amd64, with the CL-based SHLQ/SHRQ shifts
// replaced by SHLXQ/SHRXQ, which take the shift count from an arbitrary register.
// It reads and writes the same ctx and bit reader offsets: ctx 0 = bit reader,
// 8 = shift count, 16 = output pointer, 24 = output size, 32 = table base,
// 40 = bytes written or -1 on error.
//
// Register use: DX = output pointer, BX = output end, SI = table base, DI = shift count,
// R8 = input base, R9 = input offset, R10 = bit buffer, R11 = bits-read counter,
// AX = packed output bytes, CX = scratch.
TEXT ·decompress1x_main_loop_bmi2(SB), $0-8
	MOVQ ctx+0(FP), CX
	MOVQ 16(CX), DX
	MOVQ 24(CX), BX

	// An output size below 4 (unsigned compare) is reported as an error up front.
	CMPQ BX, $0x04
	JB error_max_decoded_size_exeeded
	LEAQ (DX)(BX*1), BX
	MOVQ (CX), SI
	MOVQ (SI), R8
	MOVQ 24(SI), R9
	MOVQ 32(SI), R10
	MOVBQZX 40(SI), R11
	MOVQ 32(CX), SI
	MOVBQZX 8(CX), DI
	JMP loop_condition

main_loop:
	// Check if we have room for 4 bytes in the output buffer
	// NOTE(review): this is a signed compare and it also rejects the case where
	// exactly 4 bytes remain (out+4 == end); confirm that is what the caller expects.
	LEAQ 4(DX), CX
	CMPQ CX, BX
	JGE error_max_decoded_size_exeeded

	// Decode 4 values
	// Refill (skipped while the bits-read counter is < 32): step the offset back
	// by 4, load 32 bits from input base + offset and OR them into the bit
	// buffer, shifted left by the reduced counter.
	CMPQ R11, $0x20
	JL bitReader_fillFast_1_end
	SUBQ $0x20, R11
	SUBQ $0x04, R9
	MOVL (R8)(R9*1), CX
	SHLXQ R11, CX, CX
	ORQ CX, R10

bitReader_fillFast_1_end:
	// Value 0: table index = bit buffer >> shift count; entry high byte -> AL,
	// entry low byte = bits consumed (added to the counter, shifted out of the buffer).
	SHRXQ DI, R10, CX
	MOVW (SI)(CX*2), CX
	MOVB CH, AL
	MOVBQZX CL, CX
	ADDQ CX, R11
	SHLXQ CX, R10, R10

	// Value 1 -> AH
	SHRXQ DI, R10, CX
	MOVW (SI)(CX*2), CX
	MOVB CH, AH
	MOVBQZX CL, CX
	ADDQ CX, R11
	SHLXQ CX, R10, R10

	// Move values 0/1 into the two upper bytes of EAX so AL/AH are free again.
	BSWAPL AX

	// Second refill, same steps as the first.
	CMPQ R11, $0x20
	JL bitReader_fillFast_2_end
	SUBQ $0x20, R11
	SUBQ $0x04, R9
	MOVL (R8)(R9*1), CX
	SHLXQ R11, CX, CX
	ORQ CX, R10

bitReader_fillFast_2_end:
	// Value 2 -> AH
	SHRXQ DI, R10, CX
	MOVW (SI)(CX*2), CX
	MOVB CH, AH
	MOVBQZX CL, CX
	ADDQ CX, R11
	SHLXQ CX, R10, R10

	// Value 3 -> AL
	SHRXQ DI, R10, CX
	MOVW (SI)(CX*2), CX
	MOVB CH, AL
	MOVBQZX CL, CX
	ADDQ CX, R11
	SHLXQ CX, R10, R10

	// After this swap the bytes of EAX are values 0, 1, 2, 3 from lowest to highest.
	BSWAPL AX

	// Store the decoded values
	MOVL AX, (DX)
	ADDQ $0x04, DX

loop_condition:
	// Keep looping while the input offset is at least 8 (signed compare).
	CMPQ R9, $0x08
	JGE main_loop

	// Update ctx structure
	// ctx+40 = current output pointer - original output pointer; then the
	// bit reader's offset, bit buffer and bits-read counter are written back.
	MOVQ ctx+0(FP), AX
	MOVQ DX, CX
	MOVQ 16(AX), DX
	SUBQ DX, CX
	MOVQ CX, 40(AX)
	MOVQ (AX), AX
	MOVQ R9, 24(AX)
	MOVQ R10, 32(AX)
	MOVB R11, 40(AX)
	RET

	// Report error
	// Writes -1 to ctx+40; the bit reader is not updated on this path.
error_max_decoded_size_exeeded:
	MOVQ ctx+0(FP), AX
	MOVQ $-1, CX
	MOVQ CX, 40(AX)
	RET