// mirror of https://github.com/cwinfo/matterbridge.git (synced 2024-11-29)
// matterbridge/vendor/github.com/klauspost/compress/zstd/seqdec_amd64.s
// Code generated by command: go run gen.go -out ../seqdec_amd64.s -pkg=zstd. DO NOT EDIT.
//go:build !appengine && !noasm && gc
// +build !appengine,!noasm,gc
// func sequenceDecs_decode_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
// Requires: CMOV
TEXT ·sequenceDecs_decode_amd64(SB), $8-32
MOVQ br+8(FP), AX
MOVQ 32(AX), DX
MOVBQZX 40(AX), BX
MOVQ 24(AX), SI
MOVQ (AX), AX
ADDQ SI, AX
MOVQ AX, (SP)
MOVQ ctx+16(FP), AX
MOVQ 72(AX), DI
MOVQ 80(AX), R8
MOVQ 88(AX), R9
MOVQ 104(AX), R10
MOVQ s+0(FP), AX
MOVQ 144(AX), R11
MOVQ 152(AX), R12
MOVQ 160(AX), R13
sequenceDecs_decode_amd64_main_loop:
MOVQ (SP), R14
// Fill bitreader to have enough for the offset and match length.
CMPQ SI, $0x08
JL sequenceDecs_decode_amd64_fill_byte_by_byte
MOVQ BX, AX
SHRQ $0x03, AX
SUBQ AX, R14
MOVQ (R14), DX
SUBQ AX, SI
ANDQ $0x07, BX
JMP sequenceDecs_decode_amd64_fill_end
sequenceDecs_decode_amd64_fill_byte_by_byte:
CMPQ SI, $0x00
JLE sequenceDecs_decode_amd64_fill_end
CMPQ BX, $0x07
JLE sequenceDecs_decode_amd64_fill_end
SHLQ $0x08, DX
SUBQ $0x01, R14
SUBQ $0x01, SI
SUBQ $0x08, BX
MOVBQZX (R14), AX
ORQ AX, DX
JMP sequenceDecs_decode_amd64_fill_byte_by_byte
sequenceDecs_decode_amd64_fill_end:
// Update offset
MOVQ R9, AX
MOVQ BX, CX
MOVQ DX, R15
SHLQ CL, R15
MOVB AH, CL
ADDQ CX, BX
NEGL CX
SHRQ CL, R15
SHRQ $0x20, AX
TESTQ CX, CX
CMOVQEQ CX, R15
ADDQ R15, AX
MOVQ AX, 16(R10)
// Update match length
MOVQ R8, AX
MOVQ BX, CX
MOVQ DX, R15
SHLQ CL, R15
MOVB AH, CL
ADDQ CX, BX
NEGL CX
SHRQ CL, R15
SHRQ $0x20, AX
TESTQ CX, CX
CMOVQEQ CX, R15
ADDQ R15, AX
MOVQ AX, 8(R10)
// Fill bitreader to have enough for the remaining
CMPQ SI, $0x08
JL sequenceDecs_decode_amd64_fill_2_byte_by_byte
MOVQ BX, AX
SHRQ $0x03, AX
SUBQ AX, R14
MOVQ (R14), DX
SUBQ AX, SI
ANDQ $0x07, BX
JMP sequenceDecs_decode_amd64_fill_2_end
sequenceDecs_decode_amd64_fill_2_byte_by_byte:
CMPQ SI, $0x00
JLE sequenceDecs_decode_amd64_fill_2_end
CMPQ BX, $0x07
JLE sequenceDecs_decode_amd64_fill_2_end
SHLQ $0x08, DX
SUBQ $0x01, R14
SUBQ $0x01, SI
SUBQ $0x08, BX
MOVBQZX (R14), AX
ORQ AX, DX
JMP sequenceDecs_decode_amd64_fill_2_byte_by_byte
sequenceDecs_decode_amd64_fill_2_end:
// Update literal length
MOVQ DI, AX
MOVQ BX, CX
MOVQ DX, R15
SHLQ CL, R15
MOVB AH, CL
ADDQ CX, BX
NEGL CX
SHRQ CL, R15
SHRQ $0x20, AX
TESTQ CX, CX
CMOVQEQ CX, R15
ADDQ R15, AX
MOVQ AX, (R10)
// Fill bitreader for state updates
MOVQ R14, (SP)
MOVQ R9, AX
SHRQ $0x08, AX
MOVBQZX AL, AX
MOVQ ctx+16(FP), CX
CMPQ 96(CX), $0x00
JZ sequenceDecs_decode_amd64_skip_update
// Update Literal Length State
MOVBQZX DI, R14
SHRQ $0x10, DI
MOVWQZX DI, DI
CMPQ R14, $0x00
JZ sequenceDecs_decode_amd64_llState_updateState_skip_zero
MOVQ BX, CX
ADDQ R14, BX
MOVQ DX, R15
SHLQ CL, R15
MOVQ R14, CX
NEGQ CX
SHRQ CL, R15
ADDQ R15, DI
sequenceDecs_decode_amd64_llState_updateState_skip_zero:
// Load ctx.llTable
MOVQ ctx+16(FP), CX
MOVQ (CX), CX
MOVQ (CX)(DI*8), DI
// Update Match Length State
MOVBQZX R8, R14
SHRQ $0x10, R8
MOVWQZX R8, R8
CMPQ R14, $0x00
JZ sequenceDecs_decode_amd64_mlState_updateState_skip_zero
MOVQ BX, CX
ADDQ R14, BX
MOVQ DX, R15
SHLQ CL, R15
MOVQ R14, CX
NEGQ CX
SHRQ CL, R15
ADDQ R15, R8
sequenceDecs_decode_amd64_mlState_updateState_skip_zero:
// Load ctx.mlTable
MOVQ ctx+16(FP), CX
MOVQ 24(CX), CX
MOVQ (CX)(R8*8), R8
// Update Offset State
MOVBQZX R9, R14
SHRQ $0x10, R9
MOVWQZX R9, R9
CMPQ R14, $0x00
JZ sequenceDecs_decode_amd64_ofState_updateState_skip_zero
MOVQ BX, CX
ADDQ R14, BX
MOVQ DX, R15
SHLQ CL, R15
MOVQ R14, CX
NEGQ CX
SHRQ CL, R15
ADDQ R15, R9
sequenceDecs_decode_amd64_ofState_updateState_skip_zero:
// Load ctx.ofTable
MOVQ ctx+16(FP), CX
MOVQ 48(CX), CX
MOVQ (CX)(R9*8), R9
sequenceDecs_decode_amd64_skip_update:
// Adjust offset
MOVQ 16(R10), CX
CMPQ AX, $0x01
JBE sequenceDecs_decode_amd64_adjust_offsetB_1_or_0
MOVQ R12, R13
MOVQ R11, R12
MOVQ CX, R11
JMP sequenceDecs_decode_amd64_adjust_end
sequenceDecs_decode_amd64_adjust_offsetB_1_or_0:
CMPQ (R10), $0x00000000
JNE sequenceDecs_decode_amd64_adjust_offset_maybezero
INCQ CX
JMP sequenceDecs_decode_amd64_adjust_offset_nonzero
sequenceDecs_decode_amd64_adjust_offset_maybezero:
TESTQ CX, CX
JNZ sequenceDecs_decode_amd64_adjust_offset_nonzero
MOVQ R11, CX
JMP sequenceDecs_decode_amd64_adjust_end
sequenceDecs_decode_amd64_adjust_offset_nonzero:
CMPQ CX, $0x01
JB sequenceDecs_decode_amd64_adjust_zero
JEQ sequenceDecs_decode_amd64_adjust_one
CMPQ CX, $0x02
JA sequenceDecs_decode_amd64_adjust_three
JMP sequenceDecs_decode_amd64_adjust_two
sequenceDecs_decode_amd64_adjust_zero:
MOVQ R11, AX
JMP sequenceDecs_decode_amd64_adjust_test_temp_valid
sequenceDecs_decode_amd64_adjust_one:
MOVQ R12, AX
JMP sequenceDecs_decode_amd64_adjust_test_temp_valid
sequenceDecs_decode_amd64_adjust_two:
MOVQ R13, AX
JMP sequenceDecs_decode_amd64_adjust_test_temp_valid
sequenceDecs_decode_amd64_adjust_three:
LEAQ -1(R11), AX
sequenceDecs_decode_amd64_adjust_test_temp_valid:
TESTQ AX, AX
JNZ sequenceDecs_decode_amd64_adjust_temp_valid
MOVQ $0x00000001, AX
sequenceDecs_decode_amd64_adjust_temp_valid:
CMPQ CX, $0x01
CMOVQNE R12, R13
MOVQ R11, R12
MOVQ AX, R11
MOVQ AX, CX
sequenceDecs_decode_amd64_adjust_end:
MOVQ CX, 16(R10)
// Check values
MOVQ 8(R10), AX
MOVQ (R10), R14
LEAQ (AX)(R14*1), R15
MOVQ s+0(FP), BP
ADDQ R15, 256(BP)
MOVQ ctx+16(FP), R15
SUBQ R14, 128(R15)
JS error_not_enough_literals
CMPQ AX, $0x00020002
JA sequenceDecs_decode_amd64_error_match_len_too_big
TESTQ CX, CX
JNZ sequenceDecs_decode_amd64_match_len_ofs_ok
TESTQ AX, AX
JNZ sequenceDecs_decode_amd64_error_match_len_ofs_mismatch
sequenceDecs_decode_amd64_match_len_ofs_ok:
ADDQ $0x18, R10
MOVQ ctx+16(FP), AX
DECQ 96(AX)
JNS sequenceDecs_decode_amd64_main_loop
MOVQ s+0(FP), AX
MOVQ R11, 144(AX)
MOVQ R12, 152(AX)
MOVQ R13, 160(AX)
MOVQ br+8(FP), AX
MOVQ DX, 32(AX)
MOVB BL, 40(AX)
MOVQ SI, 24(AX)
// Return success
MOVQ $0x00000000, ret+24(FP)
RET
// Return with match length error
sequenceDecs_decode_amd64_error_match_len_ofs_mismatch:
MOVQ $0x00000001, ret+24(FP)
RET
// Return with match too long error
sequenceDecs_decode_amd64_error_match_len_too_big:
MOVQ $0x00000002, ret+24(FP)
RET
// Return with match offset too long error
MOVQ $0x00000003, ret+24(FP)
RET
// Return with not enough literals error
error_not_enough_literals:
MOVQ $0x00000004, ret+24(FP)
RET
// Return with not enough output space error
MOVQ $0x00000005, ret+24(FP)
RET
// func sequenceDecs_decode_56_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
// Requires: CMOV
TEXT ·sequenceDecs_decode_56_amd64(SB), $8-32
MOVQ br+8(FP), AX
MOVQ 32(AX), DX
MOVBQZX 40(AX), BX
MOVQ 24(AX), SI
MOVQ (AX), AX
ADDQ SI, AX
MOVQ AX, (SP)
MOVQ ctx+16(FP), AX
MOVQ 72(AX), DI
MOVQ 80(AX), R8
MOVQ 88(AX), R9
MOVQ 104(AX), R10
MOVQ s+0(FP), AX
MOVQ 144(AX), R11
MOVQ 152(AX), R12
MOVQ 160(AX), R13
sequenceDecs_decode_56_amd64_main_loop:
MOVQ (SP), R14
// Fill bitreader to have enough for the offset and match length.
CMPQ SI, $0x08
JL sequenceDecs_decode_56_amd64_fill_byte_by_byte
MOVQ BX, AX
SHRQ $0x03, AX
SUBQ AX, R14
MOVQ (R14), DX
SUBQ AX, SI
ANDQ $0x07, BX
JMP sequenceDecs_decode_56_amd64_fill_end
sequenceDecs_decode_56_amd64_fill_byte_by_byte:
CMPQ SI, $0x00
JLE sequenceDecs_decode_56_amd64_fill_end
CMPQ BX, $0x07
JLE sequenceDecs_decode_56_amd64_fill_end
SHLQ $0x08, DX
SUBQ $0x01, R14
SUBQ $0x01, SI
SUBQ $0x08, BX
MOVBQZX (R14), AX
ORQ AX, DX
JMP sequenceDecs_decode_56_amd64_fill_byte_by_byte
sequenceDecs_decode_56_amd64_fill_end:
// Update offset
MOVQ R9, AX
MOVQ BX, CX
MOVQ DX, R15
SHLQ CL, R15
MOVB AH, CL
ADDQ CX, BX
NEGL CX
SHRQ CL, R15
SHRQ $0x20, AX
TESTQ CX, CX
CMOVQEQ CX, R15
ADDQ R15, AX
MOVQ AX, 16(R10)
// Update match length
MOVQ R8, AX
MOVQ BX, CX
MOVQ DX, R15
SHLQ CL, R15
MOVB AH, CL
ADDQ CX, BX
NEGL CX
SHRQ CL, R15
SHRQ $0x20, AX
TESTQ CX, CX
CMOVQEQ CX, R15
ADDQ R15, AX
MOVQ AX, 8(R10)
// Update literal length
MOVQ DI, AX
MOVQ BX, CX
MOVQ DX, R15
SHLQ CL, R15
MOVB AH, CL
ADDQ CX, BX
NEGL CX
SHRQ CL, R15
SHRQ $0x20, AX
TESTQ CX, CX
CMOVQEQ CX, R15
ADDQ R15, AX
MOVQ AX, (R10)
// Fill bitreader for state updates
MOVQ R14, (SP)
MOVQ R9, AX
SHRQ $0x08, AX
MOVBQZX AL, AX
MOVQ ctx+16(FP), CX
CMPQ 96(CX), $0x00
JZ sequenceDecs_decode_56_amd64_skip_update
// Update Literal Length State
MOVBQZX DI, R14
SHRQ $0x10, DI
MOVWQZX DI, DI
CMPQ R14, $0x00
JZ sequenceDecs_decode_56_amd64_llState_updateState_skip_zero
MOVQ BX, CX
ADDQ R14, BX
MOVQ DX, R15
SHLQ CL, R15
MOVQ R14, CX
NEGQ CX
SHRQ CL, R15
ADDQ R15, DI
sequenceDecs_decode_56_amd64_llState_updateState_skip_zero:
// Load ctx.llTable
MOVQ ctx+16(FP), CX
MOVQ (CX), CX
MOVQ (CX)(DI*8), DI
// Update Match Length State
MOVBQZX R8, R14
SHRQ $0x10, R8
MOVWQZX R8, R8
CMPQ R14, $0x00
JZ sequenceDecs_decode_56_amd64_mlState_updateState_skip_zero
MOVQ BX, CX
ADDQ R14, BX
MOVQ DX, R15
SHLQ CL, R15
MOVQ R14, CX
NEGQ CX
SHRQ CL, R15
ADDQ R15, R8
sequenceDecs_decode_56_amd64_mlState_updateState_skip_zero:
// Load ctx.mlTable
MOVQ ctx+16(FP), CX
MOVQ 24(CX), CX
MOVQ (CX)(R8*8), R8
// Update Offset State
MOVBQZX R9, R14
SHRQ $0x10, R9
MOVWQZX R9, R9
CMPQ R14, $0x00
JZ sequenceDecs_decode_56_amd64_ofState_updateState_skip_zero
MOVQ BX, CX
ADDQ R14, BX
MOVQ DX, R15
SHLQ CL, R15
MOVQ R14, CX
NEGQ CX
SHRQ CL, R15
ADDQ R15, R9
sequenceDecs_decode_56_amd64_ofState_updateState_skip_zero:
// Load ctx.ofTable
MOVQ ctx+16(FP), CX
MOVQ 48(CX), CX
MOVQ (CX)(R9*8), R9
sequenceDecs_decode_56_amd64_skip_update:
// Adjust offset
MOVQ 16(R10), CX
CMPQ AX, $0x01
JBE sequenceDecs_decode_56_amd64_adjust_offsetB_1_or_0
MOVQ R12, R13
MOVQ R11, R12
MOVQ CX, R11
JMP sequenceDecs_decode_56_amd64_adjust_end
sequenceDecs_decode_56_amd64_adjust_offsetB_1_or_0:
CMPQ (R10), $0x00000000
JNE sequenceDecs_decode_56_amd64_adjust_offset_maybezero
INCQ CX
JMP sequenceDecs_decode_56_amd64_adjust_offset_nonzero
sequenceDecs_decode_56_amd64_adjust_offset_maybezero:
TESTQ CX, CX
JNZ sequenceDecs_decode_56_amd64_adjust_offset_nonzero
MOVQ R11, CX
JMP sequenceDecs_decode_56_amd64_adjust_end
sequenceDecs_decode_56_amd64_adjust_offset_nonzero:
CMPQ CX, $0x01
JB sequenceDecs_decode_56_amd64_adjust_zero
JEQ sequenceDecs_decode_56_amd64_adjust_one
CMPQ CX, $0x02
JA sequenceDecs_decode_56_amd64_adjust_three
JMP sequenceDecs_decode_56_amd64_adjust_two
sequenceDecs_decode_56_amd64_adjust_zero:
MOVQ R11, AX
JMP sequenceDecs_decode_56_amd64_adjust_test_temp_valid
sequenceDecs_decode_56_amd64_adjust_one:
MOVQ R12, AX
JMP sequenceDecs_decode_56_amd64_adjust_test_temp_valid
sequenceDecs_decode_56_amd64_adjust_two:
MOVQ R13, AX
JMP sequenceDecs_decode_56_amd64_adjust_test_temp_valid
sequenceDecs_decode_56_amd64_adjust_three:
LEAQ -1(R11), AX
sequenceDecs_decode_56_amd64_adjust_test_temp_valid:
TESTQ AX, AX
JNZ sequenceDecs_decode_56_amd64_adjust_temp_valid
MOVQ $0x00000001, AX
sequenceDecs_decode_56_amd64_adjust_temp_valid:
CMPQ CX, $0x01
CMOVQNE R12, R13
MOVQ R11, R12
MOVQ AX, R11
MOVQ AX, CX
sequenceDecs_decode_56_amd64_adjust_end:
MOVQ CX, 16(R10)
// Check values
MOVQ 8(R10), AX
MOVQ (R10), R14
LEAQ (AX)(R14*1), R15
MOVQ s+0(FP), BP
ADDQ R15, 256(BP)
MOVQ ctx+16(FP), R15
SUBQ R14, 128(R15)
JS error_not_enough_literals
CMPQ AX, $0x00020002
JA sequenceDecs_decode_56_amd64_error_match_len_too_big
TESTQ CX, CX
JNZ sequenceDecs_decode_56_amd64_match_len_ofs_ok
TESTQ AX, AX
JNZ sequenceDecs_decode_56_amd64_error_match_len_ofs_mismatch
sequenceDecs_decode_56_amd64_match_len_ofs_ok:
ADDQ $0x18, R10
MOVQ ctx+16(FP), AX
DECQ 96(AX)
JNS sequenceDecs_decode_56_amd64_main_loop
MOVQ s+0(FP), AX
MOVQ R11, 144(AX)
MOVQ R12, 152(AX)
MOVQ R13, 160(AX)
MOVQ br+8(FP), AX
MOVQ DX, 32(AX)
MOVB BL, 40(AX)
MOVQ SI, 24(AX)
// Return success
MOVQ $0x00000000, ret+24(FP)
RET
// Return with match length error
sequenceDecs_decode_56_amd64_error_match_len_ofs_mismatch:
MOVQ $0x00000001, ret+24(FP)
RET
// Return with match too long error
sequenceDecs_decode_56_amd64_error_match_len_too_big:
MOVQ $0x00000002, ret+24(FP)
RET
// Return with match offset too long error
MOVQ $0x00000003, ret+24(FP)
RET
// Return with not enough literals error
error_not_enough_literals:
MOVQ $0x00000004, ret+24(FP)
RET
// Return with not enough output space error
MOVQ $0x00000005, ret+24(FP)
RET
// func sequenceDecs_decode_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
// Requires: BMI, BMI2, CMOV
TEXT ·sequenceDecs_decode_bmi2(SB), $8-32
MOVQ br+8(FP), CX
MOVQ 32(CX), AX
MOVBQZX 40(CX), DX
MOVQ 24(CX), BX
MOVQ (CX), CX
ADDQ BX, CX
MOVQ CX, (SP)
MOVQ ctx+16(FP), CX
MOVQ 72(CX), SI
MOVQ 80(CX), DI
MOVQ 88(CX), R8
MOVQ 104(CX), R9
MOVQ s+0(FP), CX
MOVQ 144(CX), R10
MOVQ 152(CX), R11
MOVQ 160(CX), R12
sequenceDecs_decode_bmi2_main_loop:
MOVQ (SP), R13
// Fill bitreader to have enough for the offset and match length.
CMPQ BX, $0x08
JL sequenceDecs_decode_bmi2_fill_byte_by_byte
MOVQ DX, CX
SHRQ $0x03, CX
SUBQ CX, R13
MOVQ (R13), AX
SUBQ CX, BX
ANDQ $0x07, DX
JMP sequenceDecs_decode_bmi2_fill_end
sequenceDecs_decode_bmi2_fill_byte_by_byte:
CMPQ BX, $0x00
JLE sequenceDecs_decode_bmi2_fill_end
CMPQ DX, $0x07
JLE sequenceDecs_decode_bmi2_fill_end
SHLQ $0x08, AX
SUBQ $0x01, R13
SUBQ $0x01, BX
SUBQ $0x08, DX
MOVBQZX (R13), CX
ORQ CX, AX
JMP sequenceDecs_decode_bmi2_fill_byte_by_byte
sequenceDecs_decode_bmi2_fill_end:
// Update offset
MOVQ $0x00000808, CX
BEXTRQ CX, R8, R14
MOVQ AX, R15
LEAQ (DX)(R14*1), CX
ROLQ CL, R15
BZHIQ R14, R15, R15
MOVQ CX, DX
MOVQ R8, CX
SHRQ $0x20, CX
ADDQ R15, CX
MOVQ CX, 16(R9)
// Update match length
MOVQ $0x00000808, CX
BEXTRQ CX, DI, R14
MOVQ AX, R15
LEAQ (DX)(R14*1), CX
ROLQ CL, R15
BZHIQ R14, R15, R15
MOVQ CX, DX
MOVQ DI, CX
SHRQ $0x20, CX
ADDQ R15, CX
MOVQ CX, 8(R9)
// Fill bitreader to have enough for the remaining
CMPQ BX, $0x08
JL sequenceDecs_decode_bmi2_fill_2_byte_by_byte
MOVQ DX, CX
SHRQ $0x03, CX
SUBQ CX, R13
MOVQ (R13), AX
SUBQ CX, BX
ANDQ $0x07, DX
JMP sequenceDecs_decode_bmi2_fill_2_end
sequenceDecs_decode_bmi2_fill_2_byte_by_byte:
CMPQ BX, $0x00
JLE sequenceDecs_decode_bmi2_fill_2_end
CMPQ DX, $0x07
JLE sequenceDecs_decode_bmi2_fill_2_end
SHLQ $0x08, AX
SUBQ $0x01, R13
SUBQ $0x01, BX
SUBQ $0x08, DX
MOVBQZX (R13), CX
ORQ CX, AX
JMP sequenceDecs_decode_bmi2_fill_2_byte_by_byte
sequenceDecs_decode_bmi2_fill_2_end:
// Update literal length
MOVQ $0x00000808, CX
BEXTRQ CX, SI, R14
MOVQ AX, R15
LEAQ (DX)(R14*1), CX
ROLQ CL, R15
BZHIQ R14, R15, R15
MOVQ CX, DX
MOVQ SI, CX
SHRQ $0x20, CX
ADDQ R15, CX
MOVQ CX, (R9)
// Fill bitreader for state updates
MOVQ R13, (SP)
MOVQ $0x00000808, CX
BEXTRQ CX, R8, R13
MOVQ ctx+16(FP), CX
CMPQ 96(CX), $0x00
JZ sequenceDecs_decode_bmi2_skip_update
LEAQ (SI)(DI*1), R14
ADDQ R8, R14
MOVBQZX R14, R14
LEAQ (DX)(R14*1), CX
MOVQ AX, R15
MOVQ CX, DX
ROLQ CL, R15
BZHIQ R14, R15, R15
// Update Offset State
BZHIQ R8, R15, CX
SHRXQ R8, R15, R15
MOVQ $0x00001010, R14
BEXTRQ R14, R8, R8
ADDQ CX, R8
// Load ctx.ofTable
MOVQ ctx+16(FP), CX
MOVQ 48(CX), CX
MOVQ (CX)(R8*8), R8
// Update Match Length State
BZHIQ DI, R15, CX
SHRXQ DI, R15, R15
MOVQ $0x00001010, R14
BEXTRQ R14, DI, DI
ADDQ CX, DI
// Load ctx.mlTable
MOVQ ctx+16(FP), CX
MOVQ 24(CX), CX
MOVQ (CX)(DI*8), DI
// Update Literal Length State
BZHIQ SI, R15, CX
MOVQ $0x00001010, R14
BEXTRQ R14, SI, SI
ADDQ CX, SI
// Load ctx.llTable
MOVQ ctx+16(FP), CX
MOVQ (CX), CX
MOVQ (CX)(SI*8), SI
sequenceDecs_decode_bmi2_skip_update:
// Adjust offset
MOVQ 16(R9), CX
CMPQ R13, $0x01
JBE sequenceDecs_decode_bmi2_adjust_offsetB_1_or_0
MOVQ R11, R12
MOVQ R10, R11
MOVQ CX, R10
JMP sequenceDecs_decode_bmi2_adjust_end
sequenceDecs_decode_bmi2_adjust_offsetB_1_or_0:
CMPQ (R9), $0x00000000
JNE sequenceDecs_decode_bmi2_adjust_offset_maybezero
INCQ CX
JMP sequenceDecs_decode_bmi2_adjust_offset_nonzero
sequenceDecs_decode_bmi2_adjust_offset_maybezero:
TESTQ CX, CX
JNZ sequenceDecs_decode_bmi2_adjust_offset_nonzero
MOVQ R10, CX
JMP sequenceDecs_decode_bmi2_adjust_end
sequenceDecs_decode_bmi2_adjust_offset_nonzero:
CMPQ CX, $0x01
JB sequenceDecs_decode_bmi2_adjust_zero
JEQ sequenceDecs_decode_bmi2_adjust_one
CMPQ CX, $0x02
JA sequenceDecs_decode_bmi2_adjust_three
JMP sequenceDecs_decode_bmi2_adjust_two
sequenceDecs_decode_bmi2_adjust_zero:
MOVQ R10, R13
JMP sequenceDecs_decode_bmi2_adjust_test_temp_valid
sequenceDecs_decode_bmi2_adjust_one:
MOVQ R11, R13
JMP sequenceDecs_decode_bmi2_adjust_test_temp_valid
sequenceDecs_decode_bmi2_adjust_two:
MOVQ R12, R13
JMP sequenceDecs_decode_bmi2_adjust_test_temp_valid
sequenceDecs_decode_bmi2_adjust_three:
LEAQ -1(R10), R13
sequenceDecs_decode_bmi2_adjust_test_temp_valid:
TESTQ R13, R13
JNZ sequenceDecs_decode_bmi2_adjust_temp_valid
MOVQ $0x00000001, R13
sequenceDecs_decode_bmi2_adjust_temp_valid:
CMPQ CX, $0x01
CMOVQNE R11, R12
MOVQ R10, R11
MOVQ R13, R10
MOVQ R13, CX
sequenceDecs_decode_bmi2_adjust_end:
MOVQ CX, 16(R9)
// Check values
MOVQ 8(R9), R13
MOVQ (R9), R14
LEAQ (R13)(R14*1), R15
MOVQ s+0(FP), BP
ADDQ R15, 256(BP)
MOVQ ctx+16(FP), R15
SUBQ R14, 128(R15)
JS error_not_enough_literals
CMPQ R13, $0x00020002
JA sequenceDecs_decode_bmi2_error_match_len_too_big
TESTQ CX, CX
JNZ sequenceDecs_decode_bmi2_match_len_ofs_ok
TESTQ R13, R13
JNZ sequenceDecs_decode_bmi2_error_match_len_ofs_mismatch
sequenceDecs_decode_bmi2_match_len_ofs_ok:
ADDQ $0x18, R9
MOVQ ctx+16(FP), CX
DECQ 96(CX)
JNS sequenceDecs_decode_bmi2_main_loop
MOVQ s+0(FP), CX
MOVQ R10, 144(CX)
MOVQ R11, 152(CX)
MOVQ R12, 160(CX)
MOVQ br+8(FP), CX
MOVQ AX, 32(CX)
MOVB DL, 40(CX)
MOVQ BX, 24(CX)
// Return success
MOVQ $0x00000000, ret+24(FP)
RET
// Return with match length error
sequenceDecs_decode_bmi2_error_match_len_ofs_mismatch:
MOVQ $0x00000001, ret+24(FP)
RET
// Return with match too long error
sequenceDecs_decode_bmi2_error_match_len_too_big:
MOVQ $0x00000002, ret+24(FP)
RET
// Return with match offset too long error
MOVQ $0x00000003, ret+24(FP)
RET
// Return with not enough literals error
error_not_enough_literals:
MOVQ $0x00000004, ret+24(FP)
RET
// Return with not enough output space error
MOVQ $0x00000005, ret+24(FP)
RET
// func sequenceDecs_decode_56_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
// Requires: BMI, BMI2, CMOV
TEXT ·sequenceDecs_decode_56_bmi2(SB), $8-32
MOVQ br+8(FP), CX
MOVQ 32(CX), AX
MOVBQZX 40(CX), DX
MOVQ 24(CX), BX
MOVQ (CX), CX
ADDQ BX, CX
MOVQ CX, (SP)
MOVQ ctx+16(FP), CX
MOVQ 72(CX), SI
MOVQ 80(CX), DI
MOVQ 88(CX), R8
MOVQ 104(CX), R9
MOVQ s+0(FP), CX
MOVQ 144(CX), R10
MOVQ 152(CX), R11
MOVQ 160(CX), R12
sequenceDecs_decode_56_bmi2_main_loop:
MOVQ (SP), R13
// Fill bitreader to have enough for the offset and match length.
CMPQ BX, $0x08
JL sequenceDecs_decode_56_bmi2_fill_byte_by_byte
MOVQ DX, CX
SHRQ $0x03, CX
SUBQ CX, R13
MOVQ (R13), AX
SUBQ CX, BX
ANDQ $0x07, DX
JMP sequenceDecs_decode_56_bmi2_fill_end
sequenceDecs_decode_56_bmi2_fill_byte_by_byte:
CMPQ BX, $0x00
JLE sequenceDecs_decode_56_bmi2_fill_end
CMPQ DX, $0x07
JLE sequenceDecs_decode_56_bmi2_fill_end
SHLQ $0x08, AX
SUBQ $0x01, R13
SUBQ $0x01, BX
SUBQ $0x08, DX
MOVBQZX (R13), CX
ORQ CX, AX
JMP sequenceDecs_decode_56_bmi2_fill_byte_by_byte
sequenceDecs_decode_56_bmi2_fill_end:
// Update offset
MOVQ $0x00000808, CX
BEXTRQ CX, R8, R14
MOVQ AX, R15
LEAQ (DX)(R14*1), CX
ROLQ CL, R15
BZHIQ R14, R15, R15
MOVQ CX, DX
MOVQ R8, CX
SHRQ $0x20, CX
ADDQ R15, CX
MOVQ CX, 16(R9)
// Update match length
MOVQ $0x00000808, CX
BEXTRQ CX, DI, R14
MOVQ AX, R15
LEAQ (DX)(R14*1), CX
ROLQ CL, R15
BZHIQ R14, R15, R15
MOVQ CX, DX
MOVQ DI, CX
SHRQ $0x20, CX
ADDQ R15, CX
MOVQ CX, 8(R9)
// Update literal length
MOVQ $0x00000808, CX
BEXTRQ CX, SI, R14
MOVQ AX, R15
LEAQ (DX)(R14*1), CX
ROLQ CL, R15
BZHIQ R14, R15, R15
MOVQ CX, DX
MOVQ SI, CX
SHRQ $0x20, CX
ADDQ R15, CX
MOVQ CX, (R9)
// Fill bitreader for state updates
MOVQ R13, (SP)
MOVQ $0x00000808, CX
BEXTRQ CX, R8, R13
MOVQ ctx+16(FP), CX
CMPQ 96(CX), $0x00
JZ sequenceDecs_decode_56_bmi2_skip_update
LEAQ (SI)(DI*1), R14
ADDQ R8, R14
MOVBQZX R14, R14
LEAQ (DX)(R14*1), CX
MOVQ AX, R15
MOVQ CX, DX
ROLQ CL, R15
BZHIQ R14, R15, R15
// Update Offset State
BZHIQ R8, R15, CX
SHRXQ R8, R15, R15
MOVQ $0x00001010, R14
BEXTRQ R14, R8, R8
ADDQ CX, R8
// Load ctx.ofTable
MOVQ ctx+16(FP), CX
MOVQ 48(CX), CX
MOVQ (CX)(R8*8), R8
// Update Match Length State
BZHIQ DI, R15, CX
SHRXQ DI, R15, R15
MOVQ $0x00001010, R14
BEXTRQ R14, DI, DI
ADDQ CX, DI
// Load ctx.mlTable
MOVQ ctx+16(FP), CX
MOVQ 24(CX), CX
MOVQ (CX)(DI*8), DI
// Update Literal Length State
BZHIQ SI, R15, CX
MOVQ $0x00001010, R14
BEXTRQ R14, SI, SI
ADDQ CX, SI
// Load ctx.llTable
MOVQ ctx+16(FP), CX
MOVQ (CX), CX
MOVQ (CX)(SI*8), SI
sequenceDecs_decode_56_bmi2_skip_update:
// Adjust offset
MOVQ 16(R9), CX
CMPQ R13, $0x01
JBE sequenceDecs_decode_56_bmi2_adjust_offsetB_1_or_0
MOVQ R11, R12
MOVQ R10, R11
MOVQ CX, R10
JMP sequenceDecs_decode_56_bmi2_adjust_end
sequenceDecs_decode_56_bmi2_adjust_offsetB_1_or_0:
CMPQ (R9), $0x00000000
JNE sequenceDecs_decode_56_bmi2_adjust_offset_maybezero
INCQ CX
JMP sequenceDecs_decode_56_bmi2_adjust_offset_nonzero
sequenceDecs_decode_56_bmi2_adjust_offset_maybezero:
TESTQ CX, CX
JNZ sequenceDecs_decode_56_bmi2_adjust_offset_nonzero
MOVQ R10, CX
JMP sequenceDecs_decode_56_bmi2_adjust_end
sequenceDecs_decode_56_bmi2_adjust_offset_nonzero:
CMPQ CX, $0x01
JB sequenceDecs_decode_56_bmi2_adjust_zero
JEQ sequenceDecs_decode_56_bmi2_adjust_one
CMPQ CX, $0x02
JA sequenceDecs_decode_56_bmi2_adjust_three
JMP sequenceDecs_decode_56_bmi2_adjust_two
sequenceDecs_decode_56_bmi2_adjust_zero:
MOVQ R10, R13
JMP sequenceDecs_decode_56_bmi2_adjust_test_temp_valid
sequenceDecs_decode_56_bmi2_adjust_one:
MOVQ R11, R13
JMP sequenceDecs_decode_56_bmi2_adjust_test_temp_valid
sequenceDecs_decode_56_bmi2_adjust_two:
MOVQ R12, R13
JMP sequenceDecs_decode_56_bmi2_adjust_test_temp_valid
sequenceDecs_decode_56_bmi2_adjust_three:
LEAQ -1(R10), R13
sequenceDecs_decode_56_bmi2_adjust_test_temp_valid:
TESTQ R13, R13
JNZ sequenceDecs_decode_56_bmi2_adjust_temp_valid
MOVQ $0x00000001, R13
sequenceDecs_decode_56_bmi2_adjust_temp_valid:
CMPQ CX, $0x01
CMOVQNE R11, R12
MOVQ R10, R11
MOVQ R13, R10
MOVQ R13, CX
sequenceDecs_decode_56_bmi2_adjust_end:
MOVQ CX, 16(R9)
// Check values
MOVQ 8(R9), R13
MOVQ (R9), R14
LEAQ (R13)(R14*1), R15
MOVQ s+0(FP), BP
ADDQ R15, 256(BP)
MOVQ ctx+16(FP), R15
SUBQ R14, 128(R15)
JS error_not_enough_literals
CMPQ R13, $0x00020002
JA sequenceDecs_decode_56_bmi2_error_match_len_too_big
TESTQ CX, CX
JNZ sequenceDecs_decode_56_bmi2_match_len_ofs_ok
TESTQ R13, R13
JNZ sequenceDecs_decode_56_bmi2_error_match_len_ofs_mismatch
sequenceDecs_decode_56_bmi2_match_len_ofs_ok:
ADDQ $0x18, R9
MOVQ ctx+16(FP), CX
DECQ 96(CX)
JNS sequenceDecs_decode_56_bmi2_main_loop
MOVQ s+0(FP), CX
MOVQ R10, 144(CX)
MOVQ R11, 152(CX)
MOVQ R12, 160(CX)
MOVQ br+8(FP), CX
MOVQ AX, 32(CX)
MOVB DL, 40(CX)
MOVQ BX, 24(CX)
// Return success
MOVQ $0x00000000, ret+24(FP)
RET
// Return with match length error
sequenceDecs_decode_56_bmi2_error_match_len_ofs_mismatch:
MOVQ $0x00000001, ret+24(FP)
RET
// Return with match too long error
sequenceDecs_decode_56_bmi2_error_match_len_too_big:
MOVQ $0x00000002, ret+24(FP)
RET
// Return with match offset too long error
MOVQ $0x00000003, ret+24(FP)
RET
// Return with not enough literals error
error_not_enough_literals:
MOVQ $0x00000004, ret+24(FP)
RET
// Return with not enough output space error
MOVQ $0x00000005, ret+24(FP)
RET
// func sequenceDecs_executeSimple_amd64(ctx *executeAsmContext) bool
// Requires: SSE
TEXT ·sequenceDecs_executeSimple_amd64(SB), $8-9
MOVQ ctx+0(FP), R10
MOVQ 8(R10), CX
TESTQ CX, CX
JZ empty_seqs
MOVQ (R10), AX
MOVQ 24(R10), DX
MOVQ 32(R10), BX
MOVQ 80(R10), SI
MOVQ 104(R10), DI
MOVQ 120(R10), R8
MOVQ 56(R10), R9
MOVQ 64(R10), R10
ADDQ R10, R9
// seqsBase += 24 * seqIndex
LEAQ (DX)(DX*2), R11
SHLQ $0x03, R11
ADDQ R11, AX
// outBase += outPosition
ADDQ DI, BX
main_loop:
MOVQ (AX), R11
MOVQ 16(AX), R12
MOVQ 8(AX), R13
// Copy literals
TESTQ R11, R11
JZ check_offset
XORQ R14, R14
copy_1:
MOVUPS (SI)(R14*1), X0
MOVUPS X0, (BX)(R14*1)
ADDQ $0x10, R14
CMPQ R14, R11
JB copy_1
ADDQ R11, SI
ADDQ R11, BX
ADDQ R11, DI
// Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize)
check_offset:
LEAQ (DI)(R10*1), R11
CMPQ R12, R11
JG error_match_off_too_big
CMPQ R12, R8
JG error_match_off_too_big
// Copy match from history
MOVQ R12, R11
SUBQ DI, R11
JLS copy_match
MOVQ R9, R14
SUBQ R11, R14
CMPQ R13, R11
JGE copy_all_from_history
XORQ R11, R11
TESTQ $0x00000001, R13
JZ copy_4_word
MOVB (R14)(R11*1), R12
MOVB R12, (BX)(R11*1)
ADDQ $0x01, R11
copy_4_word:
TESTQ $0x00000002, R13
JZ copy_4_dword
MOVW (R14)(R11*1), R12
MOVW R12, (BX)(R11*1)
ADDQ $0x02, R11
copy_4_dword:
TESTQ $0x00000004, R13
JZ copy_4_qword
MOVL (R14)(R11*1), R12
MOVL R12, (BX)(R11*1)
ADDQ $0x04, R11
copy_4_qword:
TESTQ $0x00000008, R13
JZ copy_4_test
MOVQ (R14)(R11*1), R12
MOVQ R12, (BX)(R11*1)
ADDQ $0x08, R11
JMP copy_4_test
copy_4:
MOVUPS (R14)(R11*1), X0
MOVUPS X0, (BX)(R11*1)
ADDQ $0x10, R11
copy_4_test:
CMPQ R11, R13
JB copy_4
ADDQ R13, DI
ADDQ R13, BX
ADDQ $0x18, AX
INCQ DX
CMPQ DX, CX
JB main_loop
JMP loop_finished
copy_all_from_history:
XORQ R15, R15
TESTQ $0x00000001, R11
JZ copy_5_word
MOVB (R14)(R15*1), BP
MOVB BP, (BX)(R15*1)
ADDQ $0x01, R15
copy_5_word:
TESTQ $0x00000002, R11
JZ copy_5_dword
MOVW (R14)(R15*1), BP
MOVW BP, (BX)(R15*1)
ADDQ $0x02, R15
copy_5_dword:
TESTQ $0x00000004, R11
JZ copy_5_qword
MOVL (R14)(R15*1), BP
MOVL BP, (BX)(R15*1)
ADDQ $0x04, R15
copy_5_qword:
TESTQ $0x00000008, R11
JZ copy_5_test
MOVQ (R14)(R15*1), BP
MOVQ BP, (BX)(R15*1)
ADDQ $0x08, R15
JMP copy_5_test
copy_5:
MOVUPS (R14)(R15*1), X0
MOVUPS X0, (BX)(R15*1)
ADDQ $0x10, R15
copy_5_test:
CMPQ R15, R11
JB copy_5
ADDQ R11, BX
ADDQ R11, DI
SUBQ R11, R13
// Copy match from the current buffer
copy_match:
TESTQ R13, R13
JZ handle_loop
MOVQ BX, R11
SUBQ R12, R11
// ml <= mo
CMPQ R13, R12
JA copy_overlapping_match
// Copy non-overlapping match
ADDQ R13, DI
MOVQ BX, R12
ADDQ R13, BX
copy_2:
MOVUPS (R11), X0
MOVUPS X0, (R12)
ADDQ $0x10, R11
ADDQ $0x10, R12
SUBQ $0x10, R13
JHI copy_2
JMP handle_loop
// Copy overlapping match
copy_overlapping_match:
ADDQ R13, DI
copy_slow_3:
MOVB (R11), R12
MOVB R12, (BX)
INCQ R11
INCQ BX
DECQ R13
JNZ copy_slow_3
handle_loop:
ADDQ $0x18, AX
INCQ DX
CMPQ DX, CX
JB main_loop
loop_finished:
// Return value
MOVB $0x01, ret+8(FP)
// Update the context
MOVQ ctx+0(FP), AX
MOVQ DX, 24(AX)
MOVQ DI, 104(AX)
MOVQ 80(AX), CX
SUBQ CX, SI
MOVQ SI, 112(AX)
RET
error_match_off_too_big:
// Return value
MOVB $0x00, ret+8(FP)
// Update the context
MOVQ ctx+0(FP), AX
MOVQ DX, 24(AX)
MOVQ DI, 104(AX)
MOVQ 80(AX), CX
SUBQ CX, SI
MOVQ SI, 112(AX)
RET
empty_seqs:
// Return value
MOVB $0x01, ret+8(FP)
RET
// func sequenceDecs_executeSimple_safe_amd64(ctx *executeAsmContext) bool
// Requires: SSE
TEXT ·sequenceDecs_executeSimple_safe_amd64(SB), $8-9
MOVQ ctx+0(FP), R10
MOVQ 8(R10), CX
TESTQ CX, CX
JZ empty_seqs
MOVQ (R10), AX
MOVQ 24(R10), DX
MOVQ 32(R10), BX
MOVQ 80(R10), SI
MOVQ 104(R10), DI
MOVQ 120(R10), R8
MOVQ 56(R10), R9
MOVQ 64(R10), R10
ADDQ R10, R9
// seqsBase += 24 * seqIndex
LEAQ (DX)(DX*2), R11
SHLQ $0x03, R11
ADDQ R11, AX
// outBase += outPosition
ADDQ DI, BX
main_loop:
MOVQ (AX), R11
MOVQ 16(AX), R12
MOVQ 8(AX), R13
// Copy literals
TESTQ R11, R11
JZ check_offset
XORQ R14, R14
TESTQ $0x00000001, R11
JZ copy_1_word
MOVB (SI)(R14*1), R15
MOVB R15, (BX)(R14*1)
ADDQ $0x01, R14
copy_1_word:
TESTQ $0x00000002, R11
JZ copy_1_dword
MOVW (SI)(R14*1), R15
MOVW R15, (BX)(R14*1)
ADDQ $0x02, R14
copy_1_dword:
TESTQ $0x00000004, R11
JZ copy_1_qword
MOVL (SI)(R14*1), R15
MOVL R15, (BX)(R14*1)
ADDQ $0x04, R14
copy_1_qword:
TESTQ $0x00000008, R11
JZ copy_1_test
MOVQ (SI)(R14*1), R15
MOVQ R15, (BX)(R14*1)
ADDQ $0x08, R14
JMP copy_1_test
copy_1:
MOVUPS (SI)(R14*1), X0
MOVUPS X0, (BX)(R14*1)
ADDQ $0x10, R14
copy_1_test:
CMPQ R14, R11
JB copy_1
ADDQ R11, SI
ADDQ R11, BX
ADDQ R11, DI
// Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize)
check_offset:
LEAQ (DI)(R10*1), R11
CMPQ R12, R11
JG error_match_off_too_big
CMPQ R12, R8
JG error_match_off_too_big
// Copy match from history
MOVQ R12, R11
SUBQ DI, R11
JLS copy_match
MOVQ R9, R14
SUBQ R11, R14
CMPQ R13, R11
JGE copy_all_from_history
XORQ R11, R11
TESTQ $0x00000001, R13
JZ copy_4_word
MOVB (R14)(R11*1), R12
MOVB R12, (BX)(R11*1)
ADDQ $0x01, R11
copy_4_word:
TESTQ $0x00000002, R13
JZ copy_4_dword
MOVW (R14)(R11*1), R12
MOVW R12, (BX)(R11*1)
ADDQ $0x02, R11
copy_4_dword:
TESTQ $0x00000004, R13
JZ copy_4_qword
MOVL (R14)(R11*1), R12
MOVL R12, (BX)(R11*1)
ADDQ $0x04, R11
copy_4_qword:
TESTQ $0x00000008, R13
JZ copy_4_test
MOVQ (R14)(R11*1), R12
MOVQ R12, (BX)(R11*1)
ADDQ $0x08, R11
JMP copy_4_test
copy_4:
MOVUPS (R14)(R11*1), X0
MOVUPS X0, (BX)(R11*1)
ADDQ $0x10, R11
copy_4_test:
CMPQ R11, R13
JB copy_4
ADDQ R13, DI
ADDQ R13, BX
ADDQ $0x18, AX
INCQ DX
CMPQ DX, CX
JB main_loop
JMP loop_finished
copy_all_from_history:
XORQ R15, R15
TESTQ $0x00000001, R11
JZ copy_5_word
MOVB (R14)(R15*1), BP
MOVB BP, (BX)(R15*1)
ADDQ $0x01, R15
copy_5_word:
TESTQ $0x00000002, R11
JZ copy_5_dword
MOVW (R14)(R15*1), BP
MOVW BP, (BX)(R15*1)
ADDQ $0x02, R15
copy_5_dword:
TESTQ $0x00000004, R11
JZ copy_5_qword
MOVL (R14)(R15*1), BP
MOVL BP, (BX)(R15*1)
ADDQ $0x04, R15
copy_5_qword:
TESTQ $0x00000008, R11
JZ copy_5_test
MOVQ (R14)(R15*1), BP
MOVQ BP, (BX)(R15*1)
ADDQ $0x08, R15
JMP copy_5_test
copy_5:
MOVUPS (R14)(R15*1), X0
MOVUPS X0, (BX)(R15*1)
ADDQ $0x10, R15
copy_5_test:
CMPQ R15, R11
JB copy_5
ADDQ R11, BX
ADDQ R11, DI
SUBQ R11, R13
// Copy match from the current buffer
copy_match:
TESTQ R13, R13
JZ handle_loop
MOVQ BX, R11
SUBQ R12, R11
// ml <= mo
CMPQ R13, R12
JA copy_overlapping_match
// Copy non-overlapping match
ADDQ R13, DI
XORQ R12, R12
TESTQ $0x00000001, R13
JZ copy_2_word
MOVB (R11)(R12*1), R14
MOVB R14, (BX)(R12*1)
ADDQ $0x01, R12
copy_2_word:
TESTQ $0x00000002, R13
JZ copy_2_dword
MOVW (R11)(R12*1), R14
MOVW R14, (BX)(R12*1)
ADDQ $0x02, R12
copy_2_dword:
TESTQ $0x00000004, R13
JZ copy_2_qword
MOVL (R11)(R12*1), R14
MOVL R14, (BX)(R12*1)
ADDQ $0x04, R12
copy_2_qword:
TESTQ $0x00000008, R13
JZ copy_2_test
MOVQ (R11)(R12*1), R14
MOVQ R14, (BX)(R12*1)
ADDQ $0x08, R12
JMP copy_2_test
copy_2:
MOVUPS (R11)(R12*1), X0
MOVUPS X0, (BX)(R12*1)
ADDQ $0x10, R12
copy_2_test:
CMPQ R12, R13
JB copy_2
ADDQ R13, BX
JMP handle_loop
// Copy overlapping match
copy_overlapping_match:
ADDQ R13, DI
copy_slow_3:
MOVB (R11), R12
MOVB R12, (BX)
INCQ R11
INCQ BX
DECQ R13
JNZ copy_slow_3
handle_loop:
ADDQ $0x18, AX
INCQ DX
CMPQ DX, CX
JB main_loop
loop_finished:
// Return value
MOVB $0x01, ret+8(FP)
// Update the context
MOVQ ctx+0(FP), AX
MOVQ DX, 24(AX)
MOVQ DI, 104(AX)
MOVQ 80(AX), CX
SUBQ CX, SI
MOVQ SI, 112(AX)
RET
error_match_off_too_big:
// Return value
MOVB $0x00, ret+8(FP)
// Update the context
MOVQ ctx+0(FP), AX
MOVQ DX, 24(AX)
MOVQ DI, 104(AX)
MOVQ 80(AX), CX
SUBQ CX, SI
MOVQ SI, 112(AX)
RET
empty_seqs:
// Return value
MOVB $0x01, ret+8(FP)
RET
// func sequenceDecs_decodeSync_amd64(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
// Requires: CMOV, SSE
TEXT ·sequenceDecs_decodeSync_amd64(SB), $64-32
// Decodes sequences AND executes them (copies literals/matches into s.out)
// in a single loop, returning a status code in ret (0 = success).
//
// Register roles (inferred from the usage below):
//   DX  = bit buffer            BX = bits-consumed counter
//   SI  = input bytes remaining (SP) = current input read pointer (base+off)
//   DI  = literal-length state  R8 = match-length state   R9 = offset state
//   R10 = output write pointer  R11 = literal source pointer
//   R12 = output position
// Stack slots: 8(SP) = sequence offset, 16(SP) = match length,
//   24(SP) = literal length, 32(SP) = past-end pointer of s.out,
//   40(SP)/48(SP)/56(SP) = history length / past-end history pointer /
//   window size (presumed from ctx field offsets — confirm against gen.go).
MOVQ br+8(FP), AX
MOVQ 32(AX), DX
MOVBQZX 40(AX), BX
MOVQ 24(AX), SI
MOVQ (AX), AX
ADDQ SI, AX
MOVQ AX, (SP)
MOVQ ctx+16(FP), AX
MOVQ 72(AX), DI
MOVQ 80(AX), R8
MOVQ 88(AX), R9
MOVQ 112(AX), R10
MOVQ 128(AX), CX
MOVQ CX, 32(SP)
MOVQ 144(AX), R11
MOVQ 136(AX), R12
MOVQ 200(AX), CX
MOVQ CX, 56(SP)
MOVQ 176(AX), CX
MOVQ CX, 48(SP)
MOVQ 184(AX), AX
MOVQ AX, 40(SP)
MOVQ 40(SP), AX
ADDQ AX, 48(SP)
// Calculate pointer to s.out[cap(s.out)] (a past-end pointer)
ADDQ R10, 32(SP)
// outBase += outPosition
ADDQ R12, R10
sequenceDecs_decodeSync_amd64_main_loop:
MOVQ (SP), R13
// Fill bitreader to have enough for the offset and match length.
CMPQ SI, $0x08
JL sequenceDecs_decodeSync_amd64_fill_byte_by_byte
MOVQ BX, AX
SHRQ $0x03, AX
SUBQ AX, R13
MOVQ (R13), DX
SUBQ AX, SI
ANDQ $0x07, BX
JMP sequenceDecs_decodeSync_amd64_fill_end
sequenceDecs_decodeSync_amd64_fill_byte_by_byte:
CMPQ SI, $0x00
JLE sequenceDecs_decodeSync_amd64_fill_end
CMPQ BX, $0x07
JLE sequenceDecs_decodeSync_amd64_fill_end
SHLQ $0x08, DX
SUBQ $0x01, R13
SUBQ $0x01, SI
SUBQ $0x08, BX
MOVBQZX (R13), AX
ORQ AX, DX
JMP sequenceDecs_decodeSync_amd64_fill_byte_by_byte
sequenceDecs_decodeSync_amd64_fill_end:
// Update offset
MOVQ R9, AX
MOVQ BX, CX
MOVQ DX, R14
SHLQ CL, R14
MOVB AH, CL
ADDQ CX, BX
NEGL CX
SHRQ CL, R14
SHRQ $0x20, AX
TESTQ CX, CX
CMOVQEQ CX, R14
ADDQ R14, AX
MOVQ AX, 8(SP)
// Update match length
MOVQ R8, AX
MOVQ BX, CX
MOVQ DX, R14
SHLQ CL, R14
MOVB AH, CL
ADDQ CX, BX
NEGL CX
SHRQ CL, R14
SHRQ $0x20, AX
TESTQ CX, CX
CMOVQEQ CX, R14
ADDQ R14, AX
MOVQ AX, 16(SP)
// Fill bitreader to have enough for the remaining
CMPQ SI, $0x08
JL sequenceDecs_decodeSync_amd64_fill_2_byte_by_byte
MOVQ BX, AX
SHRQ $0x03, AX
SUBQ AX, R13
MOVQ (R13), DX
SUBQ AX, SI
ANDQ $0x07, BX
JMP sequenceDecs_decodeSync_amd64_fill_2_end
sequenceDecs_decodeSync_amd64_fill_2_byte_by_byte:
CMPQ SI, $0x00
JLE sequenceDecs_decodeSync_amd64_fill_2_end
CMPQ BX, $0x07
JLE sequenceDecs_decodeSync_amd64_fill_2_end
SHLQ $0x08, DX
SUBQ $0x01, R13
SUBQ $0x01, SI
SUBQ $0x08, BX
MOVBQZX (R13), AX
ORQ AX, DX
JMP sequenceDecs_decodeSync_amd64_fill_2_byte_by_byte
sequenceDecs_decodeSync_amd64_fill_2_end:
// Update literal length
MOVQ DI, AX
MOVQ BX, CX
MOVQ DX, R14
SHLQ CL, R14
MOVB AH, CL
ADDQ CX, BX
NEGL CX
SHRQ CL, R14
SHRQ $0x20, AX
TESTQ CX, CX
CMOVQEQ CX, R14
ADDQ R14, AX
MOVQ AX, 24(SP)
// Fill bitreader for state updates
MOVQ R13, (SP)
MOVQ R9, AX
SHRQ $0x08, AX
MOVBQZX AL, AX
MOVQ ctx+16(FP), CX
CMPQ 96(CX), $0x00
JZ sequenceDecs_decodeSync_amd64_skip_update
// Update Literal Length State
MOVBQZX DI, R13
SHRQ $0x10, DI
MOVWQZX DI, DI
CMPQ R13, $0x00
JZ sequenceDecs_decodeSync_amd64_llState_updateState_skip_zero
MOVQ BX, CX
ADDQ R13, BX
MOVQ DX, R14
SHLQ CL, R14
MOVQ R13, CX
NEGQ CX
SHRQ CL, R14
ADDQ R14, DI
sequenceDecs_decodeSync_amd64_llState_updateState_skip_zero:
// Load ctx.llTable
MOVQ ctx+16(FP), CX
MOVQ (CX), CX
MOVQ (CX)(DI*8), DI
// Update Match Length State
MOVBQZX R8, R13
SHRQ $0x10, R8
MOVWQZX R8, R8
CMPQ R13, $0x00
JZ sequenceDecs_decodeSync_amd64_mlState_updateState_skip_zero
MOVQ BX, CX
ADDQ R13, BX
MOVQ DX, R14
SHLQ CL, R14
MOVQ R13, CX
NEGQ CX
SHRQ CL, R14
ADDQ R14, R8
sequenceDecs_decodeSync_amd64_mlState_updateState_skip_zero:
// Load ctx.mlTable
MOVQ ctx+16(FP), CX
MOVQ 24(CX), CX
MOVQ (CX)(R8*8), R8
// Update Offset State
MOVBQZX R9, R13
SHRQ $0x10, R9
MOVWQZX R9, R9
CMPQ R13, $0x00
JZ sequenceDecs_decodeSync_amd64_ofState_updateState_skip_zero
MOVQ BX, CX
ADDQ R13, BX
MOVQ DX, R14
SHLQ CL, R14
MOVQ R13, CX
NEGQ CX
SHRQ CL, R14
ADDQ R14, R9
sequenceDecs_decodeSync_amd64_ofState_updateState_skip_zero:
// Load ctx.ofTable
MOVQ ctx+16(FP), CX
MOVQ 48(CX), CX
MOVQ (CX)(R9*8), R9
sequenceDecs_decodeSync_amd64_skip_update:
// Adjust offset
MOVQ s+0(FP), CX
MOVQ 8(SP), R13
CMPQ AX, $0x01
JBE sequenceDecs_decodeSync_amd64_adjust_offsetB_1_or_0
MOVUPS 144(CX), X0
MOVQ R13, 144(CX)
MOVUPS X0, 152(CX)
JMP sequenceDecs_decodeSync_amd64_adjust_end
sequenceDecs_decodeSync_amd64_adjust_offsetB_1_or_0:
CMPQ 24(SP), $0x00000000
JNE sequenceDecs_decodeSync_amd64_adjust_offset_maybezero
INCQ R13
JMP sequenceDecs_decodeSync_amd64_adjust_offset_nonzero
sequenceDecs_decodeSync_amd64_adjust_offset_maybezero:
TESTQ R13, R13
JNZ sequenceDecs_decodeSync_amd64_adjust_offset_nonzero
MOVQ 144(CX), R13
JMP sequenceDecs_decodeSync_amd64_adjust_end
sequenceDecs_decodeSync_amd64_adjust_offset_nonzero:
MOVQ R13, AX
XORQ R14, R14
MOVQ $-1, R15
CMPQ R13, $0x03
CMOVQEQ R14, AX
CMOVQEQ R15, R14
LEAQ 144(CX), R15
ADDQ (R15)(AX*8), R14
JNZ sequenceDecs_decodeSync_amd64_adjust_temp_valid
MOVQ $0x00000001, R14
sequenceDecs_decodeSync_amd64_adjust_temp_valid:
CMPQ R13, $0x01
JZ sequenceDecs_decodeSync_amd64_adjust_skip
MOVQ 152(CX), AX
MOVQ AX, 160(CX)
sequenceDecs_decodeSync_amd64_adjust_skip:
MOVQ 144(CX), AX
MOVQ AX, 152(CX)
MOVQ R14, 144(CX)
MOVQ R14, R13
sequenceDecs_decodeSync_amd64_adjust_end:
MOVQ R13, 8(SP)
// Check values
MOVQ 16(SP), AX
MOVQ 24(SP), CX
LEAQ (AX)(CX*1), R14
MOVQ s+0(FP), R15
ADDQ R14, 256(R15)
MOVQ ctx+16(FP), R14
SUBQ CX, 104(R14)
JS error_not_enough_literals
CMPQ AX, $0x00020002
JA sequenceDecs_decodeSync_amd64_error_match_len_too_big
TESTQ R13, R13
JNZ sequenceDecs_decodeSync_amd64_match_len_ofs_ok
TESTQ AX, AX
JNZ sequenceDecs_decodeSync_amd64_error_match_len_ofs_mismatch
sequenceDecs_decodeSync_amd64_match_len_ofs_ok:
MOVQ 24(SP), AX
MOVQ 8(SP), CX
MOVQ 16(SP), R13
// Check if we have enough space in s.out
LEAQ (AX)(R13*1), R14
ADDQ R10, R14
CMPQ R14, 32(SP)
JA error_not_enough_space
// Copy literals
TESTQ AX, AX
JZ check_offset
XORQ R14, R14
copy_1:
// NOTE(review): copies in 16-byte blocks and may write up to 15 bytes
// past the literal length; the _safe_ variant below copies an exact
// tail instead — confirm the capacity slack assumed here.
MOVUPS (R11)(R14*1), X0
MOVUPS X0, (R10)(R14*1)
ADDQ $0x10, R14
CMPQ R14, AX
JB copy_1
ADDQ AX, R11
ADDQ AX, R10
ADDQ AX, R12
// Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize
check_offset:
MOVQ R12, AX
ADDQ 40(SP), AX
CMPQ CX, AX
JG error_match_off_too_big
CMPQ CX, 56(SP)
JG error_match_off_too_big
// Copy match from history
MOVQ CX, AX
SUBQ R12, AX
JLS copy_match
MOVQ 48(SP), R14
SUBQ AX, R14
CMPQ R13, AX
JGE copy_all_from_history
XORQ AX, AX
TESTQ $0x00000001, R13
JZ copy_4_word
MOVB (R14)(AX*1), CL
MOVB CL, (R10)(AX*1)
ADDQ $0x01, AX
copy_4_word:
TESTQ $0x00000002, R13
JZ copy_4_dword
MOVW (R14)(AX*1), CX
MOVW CX, (R10)(AX*1)
ADDQ $0x02, AX
copy_4_dword:
TESTQ $0x00000004, R13
JZ copy_4_qword
MOVL (R14)(AX*1), CX
MOVL CX, (R10)(AX*1)
ADDQ $0x04, AX
copy_4_qword:
TESTQ $0x00000008, R13
JZ copy_4_test
MOVQ (R14)(AX*1), CX
MOVQ CX, (R10)(AX*1)
ADDQ $0x08, AX
JMP copy_4_test
copy_4:
MOVUPS (R14)(AX*1), X0
MOVUPS X0, (R10)(AX*1)
ADDQ $0x10, AX
copy_4_test:
CMPQ AX, R13
JB copy_4
ADDQ R13, R12
ADDQ R13, R10
JMP handle_loop
JMP loop_finished
copy_all_from_history:
XORQ R15, R15
TESTQ $0x00000001, AX
JZ copy_5_word
MOVB (R14)(R15*1), BP
MOVB BP, (R10)(R15*1)
ADDQ $0x01, R15
copy_5_word:
TESTQ $0x00000002, AX
JZ copy_5_dword
MOVW (R14)(R15*1), BP
MOVW BP, (R10)(R15*1)
ADDQ $0x02, R15
copy_5_dword:
TESTQ $0x00000004, AX
JZ copy_5_qword
MOVL (R14)(R15*1), BP
MOVL BP, (R10)(R15*1)
ADDQ $0x04, R15
copy_5_qword:
TESTQ $0x00000008, AX
JZ copy_5_test
MOVQ (R14)(R15*1), BP
MOVQ BP, (R10)(R15*1)
ADDQ $0x08, R15
JMP copy_5_test
copy_5:
MOVUPS (R14)(R15*1), X0
MOVUPS X0, (R10)(R15*1)
ADDQ $0x10, R15
copy_5_test:
CMPQ R15, AX
JB copy_5
ADDQ AX, R10
ADDQ AX, R12
SUBQ AX, R13
// Copy match from the current buffer
copy_match:
TESTQ R13, R13
JZ handle_loop
MOVQ R10, AX
SUBQ CX, AX
// ml <= mo
CMPQ R13, CX
JA copy_overlapping_match
// Copy non-overlapping match
ADDQ R13, R12
MOVQ R10, CX
ADDQ R13, R10
copy_2:
MOVUPS (AX), X0
MOVUPS X0, (CX)
ADDQ $0x10, AX
ADDQ $0x10, CX
SUBQ $0x10, R13
JHI copy_2
JMP handle_loop
// Copy overlapping match
copy_overlapping_match:
ADDQ R13, R12
copy_slow_3:
MOVB (AX), CL
MOVB CL, (R10)
INCQ AX
INCQ R10
DECQ R13
JNZ copy_slow_3
handle_loop:
MOVQ ctx+16(FP), AX
DECQ 96(AX)
JNS sequenceDecs_decodeSync_amd64_main_loop
loop_finished:
MOVQ br+8(FP), AX
MOVQ DX, 32(AX)
MOVB BL, 40(AX)
MOVQ SI, 24(AX)
// Update the context
MOVQ ctx+16(FP), AX
MOVQ R12, 136(AX)
MOVQ 144(AX), CX
SUBQ CX, R11
MOVQ R11, 168(AX)
// Return success
MOVQ $0x00000000, ret+24(FP)
RET
// Return with match length error
sequenceDecs_decodeSync_amd64_error_match_len_ofs_mismatch:
MOVQ 16(SP), AX
MOVQ ctx+16(FP), CX
MOVQ AX, 216(CX)
MOVQ $0x00000001, ret+24(FP)
RET
// Return with match too long error
sequenceDecs_decodeSync_amd64_error_match_len_too_big:
MOVQ ctx+16(FP), AX
MOVQ 16(SP), CX
MOVQ CX, 216(AX)
MOVQ $0x00000002, ret+24(FP)
RET
// Return with match offset too long error
error_match_off_too_big:
MOVQ ctx+16(FP), AX
MOVQ 8(SP), CX
MOVQ CX, 224(AX)
MOVQ R12, 136(AX)
MOVQ $0x00000003, ret+24(FP)
RET
// Return with not enough literals error
error_not_enough_literals:
MOVQ ctx+16(FP), AX
MOVQ 24(SP), CX
MOVQ CX, 208(AX)
MOVQ $0x00000004, ret+24(FP)
RET
// Return with not enough output space error
error_not_enough_space:
MOVQ ctx+16(FP), AX
MOVQ 24(SP), CX
MOVQ CX, 208(AX)
MOVQ 16(SP), CX
MOVQ CX, 216(AX)
MOVQ R12, 136(AX)
MOVQ $0x00000005, ret+24(FP)
RET
// func sequenceDecs_decodeSync_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
// Requires: BMI, BMI2, CMOV, SSE
TEXT ·sequenceDecs_decodeSync_bmi2(SB), $64-32
// BMI2 variant of sequenceDecs_decodeSync_amd64: bit extraction uses
// BEXTRQ/BZHIQ/SHRXQ/ROLQ instead of the shift-and-mask sequences.
//
// Register roles (inferred from the usage below; note AX/DX/BX are
// swapped relative to the non-BMI2 variant):
//   AX  = bit buffer            DX = bits-consumed counter
//   BX  = input bytes remaining (SP) = current input read pointer
//   SI  = literal-length state  DI = match-length state   R8 = offset state
//   R9  = output write pointer  R10 = literal source pointer
//   R11 = output position
// Stack slots: 8(SP) = sequence offset, 16(SP) = match length,
//   24(SP) = literal length, 32(SP) = past-end pointer of s.out,
//   40(SP)/48(SP)/56(SP) = history length / past-end history pointer /
//   window size (presumed from ctx field offsets — confirm against gen.go).
MOVQ br+8(FP), CX
MOVQ 32(CX), AX
MOVBQZX 40(CX), DX
MOVQ 24(CX), BX
MOVQ (CX), CX
ADDQ BX, CX
MOVQ CX, (SP)
MOVQ ctx+16(FP), CX
MOVQ 72(CX), SI
MOVQ 80(CX), DI
MOVQ 88(CX), R8
MOVQ 112(CX), R9
MOVQ 128(CX), R10
MOVQ R10, 32(SP)
MOVQ 144(CX), R10
MOVQ 136(CX), R11
MOVQ 200(CX), R12
MOVQ R12, 56(SP)
MOVQ 176(CX), R12
MOVQ R12, 48(SP)
MOVQ 184(CX), CX
MOVQ CX, 40(SP)
MOVQ 40(SP), CX
ADDQ CX, 48(SP)
// Calculate pointer to s.out[cap(s.out)] (a past-end pointer)
ADDQ R9, 32(SP)
// outBase += outPosition
ADDQ R11, R9
sequenceDecs_decodeSync_bmi2_main_loop:
MOVQ (SP), R12
// Fill bitreader to have enough for the offset and match length.
CMPQ BX, $0x08
JL sequenceDecs_decodeSync_bmi2_fill_byte_by_byte
MOVQ DX, CX
SHRQ $0x03, CX
SUBQ CX, R12
MOVQ (R12), AX
SUBQ CX, BX
ANDQ $0x07, DX
JMP sequenceDecs_decodeSync_bmi2_fill_end
sequenceDecs_decodeSync_bmi2_fill_byte_by_byte:
CMPQ BX, $0x00
JLE sequenceDecs_decodeSync_bmi2_fill_end
CMPQ DX, $0x07
JLE sequenceDecs_decodeSync_bmi2_fill_end
SHLQ $0x08, AX
SUBQ $0x01, R12
SUBQ $0x01, BX
SUBQ $0x08, DX
MOVBQZX (R12), CX
ORQ CX, AX
JMP sequenceDecs_decodeSync_bmi2_fill_byte_by_byte
sequenceDecs_decodeSync_bmi2_fill_end:
// Update offset
MOVQ $0x00000808, CX
BEXTRQ CX, R8, R13
MOVQ AX, R14
LEAQ (DX)(R13*1), CX
ROLQ CL, R14
BZHIQ R13, R14, R14
MOVQ CX, DX
MOVQ R8, CX
SHRQ $0x20, CX
ADDQ R14, CX
MOVQ CX, 8(SP)
// Update match length
MOVQ $0x00000808, CX
BEXTRQ CX, DI, R13
MOVQ AX, R14
LEAQ (DX)(R13*1), CX
ROLQ CL, R14
BZHIQ R13, R14, R14
MOVQ CX, DX
MOVQ DI, CX
SHRQ $0x20, CX
ADDQ R14, CX
MOVQ CX, 16(SP)
// Fill bitreader to have enough for the remaining
CMPQ BX, $0x08
JL sequenceDecs_decodeSync_bmi2_fill_2_byte_by_byte
MOVQ DX, CX
SHRQ $0x03, CX
SUBQ CX, R12
MOVQ (R12), AX
SUBQ CX, BX
ANDQ $0x07, DX
JMP sequenceDecs_decodeSync_bmi2_fill_2_end
sequenceDecs_decodeSync_bmi2_fill_2_byte_by_byte:
CMPQ BX, $0x00
JLE sequenceDecs_decodeSync_bmi2_fill_2_end
CMPQ DX, $0x07
JLE sequenceDecs_decodeSync_bmi2_fill_2_end
SHLQ $0x08, AX
SUBQ $0x01, R12
SUBQ $0x01, BX
SUBQ $0x08, DX
MOVBQZX (R12), CX
ORQ CX, AX
JMP sequenceDecs_decodeSync_bmi2_fill_2_byte_by_byte
sequenceDecs_decodeSync_bmi2_fill_2_end:
// Update literal length
MOVQ $0x00000808, CX
BEXTRQ CX, SI, R13
MOVQ AX, R14
LEAQ (DX)(R13*1), CX
ROLQ CL, R14
BZHIQ R13, R14, R14
MOVQ CX, DX
MOVQ SI, CX
SHRQ $0x20, CX
ADDQ R14, CX
MOVQ CX, 24(SP)
// Fill bitreader for state updates
MOVQ R12, (SP)
MOVQ $0x00000808, CX
BEXTRQ CX, R8, R12
MOVQ ctx+16(FP), CX
CMPQ 96(CX), $0x00
JZ sequenceDecs_decodeSync_bmi2_skip_update
LEAQ (SI)(DI*1), R13
ADDQ R8, R13
MOVBQZX R13, R13
LEAQ (DX)(R13*1), CX
MOVQ AX, R14
MOVQ CX, DX
ROLQ CL, R14
BZHIQ R13, R14, R14
// Update Offset State
BZHIQ R8, R14, CX
SHRXQ R8, R14, R14
MOVQ $0x00001010, R13
BEXTRQ R13, R8, R8
ADDQ CX, R8
// Load ctx.ofTable
MOVQ ctx+16(FP), CX
MOVQ 48(CX), CX
MOVQ (CX)(R8*8), R8
// Update Match Length State
BZHIQ DI, R14, CX
SHRXQ DI, R14, R14
MOVQ $0x00001010, R13
BEXTRQ R13, DI, DI
ADDQ CX, DI
// Load ctx.mlTable
MOVQ ctx+16(FP), CX
MOVQ 24(CX), CX
MOVQ (CX)(DI*8), DI
// Update Literal Length State
BZHIQ SI, R14, CX
MOVQ $0x00001010, R13
BEXTRQ R13, SI, SI
ADDQ CX, SI
// Load ctx.llTable
MOVQ ctx+16(FP), CX
MOVQ (CX), CX
MOVQ (CX)(SI*8), SI
sequenceDecs_decodeSync_bmi2_skip_update:
// Adjust offset
MOVQ s+0(FP), CX
MOVQ 8(SP), R13
CMPQ R12, $0x01
JBE sequenceDecs_decodeSync_bmi2_adjust_offsetB_1_or_0
MOVUPS 144(CX), X0
MOVQ R13, 144(CX)
MOVUPS X0, 152(CX)
JMP sequenceDecs_decodeSync_bmi2_adjust_end
sequenceDecs_decodeSync_bmi2_adjust_offsetB_1_or_0:
CMPQ 24(SP), $0x00000000
JNE sequenceDecs_decodeSync_bmi2_adjust_offset_maybezero
INCQ R13
JMP sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero
sequenceDecs_decodeSync_bmi2_adjust_offset_maybezero:
TESTQ R13, R13
JNZ sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero
MOVQ 144(CX), R13
JMP sequenceDecs_decodeSync_bmi2_adjust_end
sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero:
MOVQ R13, R12
XORQ R14, R14
MOVQ $-1, R15
CMPQ R13, $0x03
CMOVQEQ R14, R12
CMOVQEQ R15, R14
LEAQ 144(CX), R15
ADDQ (R15)(R12*8), R14
JNZ sequenceDecs_decodeSync_bmi2_adjust_temp_valid
MOVQ $0x00000001, R14
sequenceDecs_decodeSync_bmi2_adjust_temp_valid:
CMPQ R13, $0x01
JZ sequenceDecs_decodeSync_bmi2_adjust_skip
MOVQ 152(CX), R12
MOVQ R12, 160(CX)
sequenceDecs_decodeSync_bmi2_adjust_skip:
MOVQ 144(CX), R12
MOVQ R12, 152(CX)
MOVQ R14, 144(CX)
MOVQ R14, R13
sequenceDecs_decodeSync_bmi2_adjust_end:
MOVQ R13, 8(SP)
// Check values
MOVQ 16(SP), CX
MOVQ 24(SP), R12
LEAQ (CX)(R12*1), R14
MOVQ s+0(FP), R15
ADDQ R14, 256(R15)
MOVQ ctx+16(FP), R14
SUBQ R12, 104(R14)
JS error_not_enough_literals
CMPQ CX, $0x00020002
JA sequenceDecs_decodeSync_bmi2_error_match_len_too_big
TESTQ R13, R13
JNZ sequenceDecs_decodeSync_bmi2_match_len_ofs_ok
TESTQ CX, CX
JNZ sequenceDecs_decodeSync_bmi2_error_match_len_ofs_mismatch
sequenceDecs_decodeSync_bmi2_match_len_ofs_ok:
MOVQ 24(SP), CX
MOVQ 8(SP), R12
MOVQ 16(SP), R13
// Check if we have enough space in s.out
LEAQ (CX)(R13*1), R14
ADDQ R9, R14
CMPQ R14, 32(SP)
JA error_not_enough_space
// Copy literals
TESTQ CX, CX
JZ check_offset
XORQ R14, R14
copy_1:
// NOTE(review): copies in 16-byte blocks and may write up to 15 bytes
// past the literal length; the _safe_ variants copy an exact tail
// instead — confirm the capacity slack assumed here.
MOVUPS (R10)(R14*1), X0
MOVUPS X0, (R9)(R14*1)
ADDQ $0x10, R14
CMPQ R14, CX
JB copy_1
ADDQ CX, R10
ADDQ CX, R9
ADDQ CX, R11
// Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize
check_offset:
MOVQ R11, CX
ADDQ 40(SP), CX
CMPQ R12, CX
JG error_match_off_too_big
CMPQ R12, 56(SP)
JG error_match_off_too_big
// Copy match from history
MOVQ R12, CX
SUBQ R11, CX
JLS copy_match
MOVQ 48(SP), R14
SUBQ CX, R14
CMPQ R13, CX
JGE copy_all_from_history
XORQ CX, CX
TESTQ $0x00000001, R13
JZ copy_4_word
MOVB (R14)(CX*1), R12
MOVB R12, (R9)(CX*1)
ADDQ $0x01, CX
copy_4_word:
TESTQ $0x00000002, R13
JZ copy_4_dword
MOVW (R14)(CX*1), R12
MOVW R12, (R9)(CX*1)
ADDQ $0x02, CX
copy_4_dword:
TESTQ $0x00000004, R13
JZ copy_4_qword
MOVL (R14)(CX*1), R12
MOVL R12, (R9)(CX*1)
ADDQ $0x04, CX
copy_4_qword:
TESTQ $0x00000008, R13
JZ copy_4_test
MOVQ (R14)(CX*1), R12
MOVQ R12, (R9)(CX*1)
ADDQ $0x08, CX
JMP copy_4_test
copy_4:
MOVUPS (R14)(CX*1), X0
MOVUPS X0, (R9)(CX*1)
ADDQ $0x10, CX
copy_4_test:
CMPQ CX, R13
JB copy_4
ADDQ R13, R11
ADDQ R13, R9
JMP handle_loop
JMP loop_finished
copy_all_from_history:
XORQ R15, R15
TESTQ $0x00000001, CX
JZ copy_5_word
MOVB (R14)(R15*1), BP
MOVB BP, (R9)(R15*1)
ADDQ $0x01, R15
copy_5_word:
TESTQ $0x00000002, CX
JZ copy_5_dword
MOVW (R14)(R15*1), BP
MOVW BP, (R9)(R15*1)
ADDQ $0x02, R15
copy_5_dword:
TESTQ $0x00000004, CX
JZ copy_5_qword
MOVL (R14)(R15*1), BP
MOVL BP, (R9)(R15*1)
ADDQ $0x04, R15
copy_5_qword:
TESTQ $0x00000008, CX
JZ copy_5_test
MOVQ (R14)(R15*1), BP
MOVQ BP, (R9)(R15*1)
ADDQ $0x08, R15
JMP copy_5_test
copy_5:
MOVUPS (R14)(R15*1), X0
MOVUPS X0, (R9)(R15*1)
ADDQ $0x10, R15
copy_5_test:
CMPQ R15, CX
JB copy_5
ADDQ CX, R9
ADDQ CX, R11
SUBQ CX, R13
// Copy match from the current buffer
copy_match:
TESTQ R13, R13
JZ handle_loop
MOVQ R9, CX
SUBQ R12, CX
// ml <= mo
CMPQ R13, R12
JA copy_overlapping_match
// Copy non-overlapping match
ADDQ R13, R11
MOVQ R9, R12
ADDQ R13, R9
copy_2:
MOVUPS (CX), X0
MOVUPS X0, (R12)
ADDQ $0x10, CX
ADDQ $0x10, R12
SUBQ $0x10, R13
JHI copy_2
JMP handle_loop
// Copy overlapping match
copy_overlapping_match:
ADDQ R13, R11
copy_slow_3:
MOVB (CX), R12
MOVB R12, (R9)
INCQ CX
INCQ R9
DECQ R13
JNZ copy_slow_3
handle_loop:
MOVQ ctx+16(FP), CX
DECQ 96(CX)
JNS sequenceDecs_decodeSync_bmi2_main_loop
loop_finished:
MOVQ br+8(FP), CX
MOVQ AX, 32(CX)
MOVB DL, 40(CX)
MOVQ BX, 24(CX)
// Update the context
MOVQ ctx+16(FP), AX
MOVQ R11, 136(AX)
MOVQ 144(AX), CX
SUBQ CX, R10
MOVQ R10, 168(AX)
// Return success
MOVQ $0x00000000, ret+24(FP)
RET
// Return with match length error
sequenceDecs_decodeSync_bmi2_error_match_len_ofs_mismatch:
MOVQ 16(SP), AX
MOVQ ctx+16(FP), CX
MOVQ AX, 216(CX)
MOVQ $0x00000001, ret+24(FP)
RET
// Return with match too long error
sequenceDecs_decodeSync_bmi2_error_match_len_too_big:
MOVQ ctx+16(FP), AX
MOVQ 16(SP), CX
MOVQ CX, 216(AX)
MOVQ $0x00000002, ret+24(FP)
RET
// Return with match offset too long error
error_match_off_too_big:
MOVQ ctx+16(FP), AX
MOVQ 8(SP), CX
MOVQ CX, 224(AX)
MOVQ R11, 136(AX)
MOVQ $0x00000003, ret+24(FP)
RET
// Return with not enough literals error
error_not_enough_literals:
MOVQ ctx+16(FP), AX
MOVQ 24(SP), CX
MOVQ CX, 208(AX)
MOVQ $0x00000004, ret+24(FP)
RET
// Return with not enough output space error
error_not_enough_space:
MOVQ ctx+16(FP), AX
MOVQ 24(SP), CX
MOVQ CX, 208(AX)
MOVQ 16(SP), CX
MOVQ CX, 216(AX)
MOVQ R11, 136(AX)
MOVQ $0x00000005, ret+24(FP)
RET
// func sequenceDecs_decodeSync_safe_amd64(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
// Requires: CMOV, SSE
TEXT ·sequenceDecs_decodeSync_safe_amd64(SB), $64-32
// "Safe" variant of sequenceDecs_decodeSync_amd64: the literal and
// non-overlapping-match copies break the tail into 1/2/4/8-byte moves
// (copy_*_word/dword/qword) before the 16-byte loop, so no write ever
// lands past the requested length — usable when no slack is guaranteed
// at the end of the destination buffer.
//
// Register roles (inferred from the usage below):
//   DX  = bit buffer            BX = bits-consumed counter
//   SI  = input bytes remaining (SP) = current input read pointer
//   DI  = literal-length state  R8 = match-length state   R9 = offset state
//   R10 = output write pointer  R11 = literal source pointer
//   R12 = output position
// Stack slots: 8(SP) = sequence offset, 16(SP) = match length,
//   24(SP) = literal length, 32(SP) = past-end pointer of s.out,
//   40(SP)/48(SP)/56(SP) = history length / past-end history pointer /
//   window size (presumed from ctx field offsets — confirm against gen.go).
MOVQ br+8(FP), AX
MOVQ 32(AX), DX
MOVBQZX 40(AX), BX
MOVQ 24(AX), SI
MOVQ (AX), AX
ADDQ SI, AX
MOVQ AX, (SP)
MOVQ ctx+16(FP), AX
MOVQ 72(AX), DI
MOVQ 80(AX), R8
MOVQ 88(AX), R9
MOVQ 112(AX), R10
MOVQ 128(AX), CX
MOVQ CX, 32(SP)
MOVQ 144(AX), R11
MOVQ 136(AX), R12
MOVQ 200(AX), CX
MOVQ CX, 56(SP)
MOVQ 176(AX), CX
MOVQ CX, 48(SP)
MOVQ 184(AX), AX
MOVQ AX, 40(SP)
MOVQ 40(SP), AX
ADDQ AX, 48(SP)
// Calculate pointer to s.out[cap(s.out)] (a past-end pointer)
ADDQ R10, 32(SP)
// outBase += outPosition
ADDQ R12, R10
sequenceDecs_decodeSync_safe_amd64_main_loop:
MOVQ (SP), R13
// Fill bitreader to have enough for the offset and match length.
CMPQ SI, $0x08
JL sequenceDecs_decodeSync_safe_amd64_fill_byte_by_byte
MOVQ BX, AX
SHRQ $0x03, AX
SUBQ AX, R13
MOVQ (R13), DX
SUBQ AX, SI
ANDQ $0x07, BX
JMP sequenceDecs_decodeSync_safe_amd64_fill_end
sequenceDecs_decodeSync_safe_amd64_fill_byte_by_byte:
CMPQ SI, $0x00
JLE sequenceDecs_decodeSync_safe_amd64_fill_end
CMPQ BX, $0x07
JLE sequenceDecs_decodeSync_safe_amd64_fill_end
SHLQ $0x08, DX
SUBQ $0x01, R13
SUBQ $0x01, SI
SUBQ $0x08, BX
MOVBQZX (R13), AX
ORQ AX, DX
JMP sequenceDecs_decodeSync_safe_amd64_fill_byte_by_byte
sequenceDecs_decodeSync_safe_amd64_fill_end:
// Update offset
MOVQ R9, AX
MOVQ BX, CX
MOVQ DX, R14
SHLQ CL, R14
MOVB AH, CL
ADDQ CX, BX
NEGL CX
SHRQ CL, R14
SHRQ $0x20, AX
TESTQ CX, CX
CMOVQEQ CX, R14
ADDQ R14, AX
MOVQ AX, 8(SP)
// Update match length
MOVQ R8, AX
MOVQ BX, CX
MOVQ DX, R14
SHLQ CL, R14
MOVB AH, CL
ADDQ CX, BX
NEGL CX
SHRQ CL, R14
SHRQ $0x20, AX
TESTQ CX, CX
CMOVQEQ CX, R14
ADDQ R14, AX
MOVQ AX, 16(SP)
// Fill bitreader to have enough for the remaining
CMPQ SI, $0x08
JL sequenceDecs_decodeSync_safe_amd64_fill_2_byte_by_byte
MOVQ BX, AX
SHRQ $0x03, AX
SUBQ AX, R13
MOVQ (R13), DX
SUBQ AX, SI
ANDQ $0x07, BX
JMP sequenceDecs_decodeSync_safe_amd64_fill_2_end
sequenceDecs_decodeSync_safe_amd64_fill_2_byte_by_byte:
CMPQ SI, $0x00
JLE sequenceDecs_decodeSync_safe_amd64_fill_2_end
CMPQ BX, $0x07
JLE sequenceDecs_decodeSync_safe_amd64_fill_2_end
SHLQ $0x08, DX
SUBQ $0x01, R13
SUBQ $0x01, SI
SUBQ $0x08, BX
MOVBQZX (R13), AX
ORQ AX, DX
JMP sequenceDecs_decodeSync_safe_amd64_fill_2_byte_by_byte
sequenceDecs_decodeSync_safe_amd64_fill_2_end:
// Update literal length
MOVQ DI, AX
MOVQ BX, CX
MOVQ DX, R14
SHLQ CL, R14
MOVB AH, CL
ADDQ CX, BX
NEGL CX
SHRQ CL, R14
SHRQ $0x20, AX
TESTQ CX, CX
CMOVQEQ CX, R14
ADDQ R14, AX
MOVQ AX, 24(SP)
// Fill bitreader for state updates
MOVQ R13, (SP)
MOVQ R9, AX
SHRQ $0x08, AX
MOVBQZX AL, AX
MOVQ ctx+16(FP), CX
CMPQ 96(CX), $0x00
JZ sequenceDecs_decodeSync_safe_amd64_skip_update
// Update Literal Length State
MOVBQZX DI, R13
SHRQ $0x10, DI
MOVWQZX DI, DI
CMPQ R13, $0x00
JZ sequenceDecs_decodeSync_safe_amd64_llState_updateState_skip_zero
MOVQ BX, CX
ADDQ R13, BX
MOVQ DX, R14
SHLQ CL, R14
MOVQ R13, CX
NEGQ CX
SHRQ CL, R14
ADDQ R14, DI
sequenceDecs_decodeSync_safe_amd64_llState_updateState_skip_zero:
// Load ctx.llTable
MOVQ ctx+16(FP), CX
MOVQ (CX), CX
MOVQ (CX)(DI*8), DI
// Update Match Length State
MOVBQZX R8, R13
SHRQ $0x10, R8
MOVWQZX R8, R8
CMPQ R13, $0x00
JZ sequenceDecs_decodeSync_safe_amd64_mlState_updateState_skip_zero
MOVQ BX, CX
ADDQ R13, BX
MOVQ DX, R14
SHLQ CL, R14
MOVQ R13, CX
NEGQ CX
SHRQ CL, R14
ADDQ R14, R8
sequenceDecs_decodeSync_safe_amd64_mlState_updateState_skip_zero:
// Load ctx.mlTable
MOVQ ctx+16(FP), CX
MOVQ 24(CX), CX
MOVQ (CX)(R8*8), R8
// Update Offset State
MOVBQZX R9, R13
SHRQ $0x10, R9
MOVWQZX R9, R9
CMPQ R13, $0x00
JZ sequenceDecs_decodeSync_safe_amd64_ofState_updateState_skip_zero
MOVQ BX, CX
ADDQ R13, BX
MOVQ DX, R14
SHLQ CL, R14
MOVQ R13, CX
NEGQ CX
SHRQ CL, R14
ADDQ R14, R9
sequenceDecs_decodeSync_safe_amd64_ofState_updateState_skip_zero:
// Load ctx.ofTable
MOVQ ctx+16(FP), CX
MOVQ 48(CX), CX
MOVQ (CX)(R9*8), R9
sequenceDecs_decodeSync_safe_amd64_skip_update:
// Adjust offset
MOVQ s+0(FP), CX
MOVQ 8(SP), R13
CMPQ AX, $0x01
JBE sequenceDecs_decodeSync_safe_amd64_adjust_offsetB_1_or_0
MOVUPS 144(CX), X0
MOVQ R13, 144(CX)
MOVUPS X0, 152(CX)
JMP sequenceDecs_decodeSync_safe_amd64_adjust_end
sequenceDecs_decodeSync_safe_amd64_adjust_offsetB_1_or_0:
CMPQ 24(SP), $0x00000000
JNE sequenceDecs_decodeSync_safe_amd64_adjust_offset_maybezero
INCQ R13
JMP sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero
sequenceDecs_decodeSync_safe_amd64_adjust_offset_maybezero:
TESTQ R13, R13
JNZ sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero
MOVQ 144(CX), R13
JMP sequenceDecs_decodeSync_safe_amd64_adjust_end
sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero:
MOVQ R13, AX
XORQ R14, R14
MOVQ $-1, R15
CMPQ R13, $0x03
CMOVQEQ R14, AX
CMOVQEQ R15, R14
LEAQ 144(CX), R15
ADDQ (R15)(AX*8), R14
JNZ sequenceDecs_decodeSync_safe_amd64_adjust_temp_valid
MOVQ $0x00000001, R14
sequenceDecs_decodeSync_safe_amd64_adjust_temp_valid:
CMPQ R13, $0x01
JZ sequenceDecs_decodeSync_safe_amd64_adjust_skip
MOVQ 152(CX), AX
MOVQ AX, 160(CX)
sequenceDecs_decodeSync_safe_amd64_adjust_skip:
MOVQ 144(CX), AX
MOVQ AX, 152(CX)
MOVQ R14, 144(CX)
MOVQ R14, R13
sequenceDecs_decodeSync_safe_amd64_adjust_end:
MOVQ R13, 8(SP)
// Check values
MOVQ 16(SP), AX
MOVQ 24(SP), CX
LEAQ (AX)(CX*1), R14
MOVQ s+0(FP), R15
ADDQ R14, 256(R15)
MOVQ ctx+16(FP), R14
SUBQ CX, 104(R14)
JS error_not_enough_literals
CMPQ AX, $0x00020002
JA sequenceDecs_decodeSync_safe_amd64_error_match_len_too_big
TESTQ R13, R13
JNZ sequenceDecs_decodeSync_safe_amd64_match_len_ofs_ok
TESTQ AX, AX
JNZ sequenceDecs_decodeSync_safe_amd64_error_match_len_ofs_mismatch
sequenceDecs_decodeSync_safe_amd64_match_len_ofs_ok:
MOVQ 24(SP), AX
MOVQ 8(SP), CX
MOVQ 16(SP), R13
// Check if we have enough space in s.out
LEAQ (AX)(R13*1), R14
ADDQ R10, R14
CMPQ R14, 32(SP)
JA error_not_enough_space
// Copy literals
// Exact-length copy: handle the 1/2/4/8-byte tail first, then 16-byte blocks.
TESTQ AX, AX
JZ check_offset
XORQ R14, R14
TESTQ $0x00000001, AX
JZ copy_1_word
MOVB (R11)(R14*1), R15
MOVB R15, (R10)(R14*1)
ADDQ $0x01, R14
copy_1_word:
TESTQ $0x00000002, AX
JZ copy_1_dword
MOVW (R11)(R14*1), R15
MOVW R15, (R10)(R14*1)
ADDQ $0x02, R14
copy_1_dword:
TESTQ $0x00000004, AX
JZ copy_1_qword
MOVL (R11)(R14*1), R15
MOVL R15, (R10)(R14*1)
ADDQ $0x04, R14
copy_1_qword:
TESTQ $0x00000008, AX
JZ copy_1_test
MOVQ (R11)(R14*1), R15
MOVQ R15, (R10)(R14*1)
ADDQ $0x08, R14
JMP copy_1_test
copy_1:
MOVUPS (R11)(R14*1), X0
MOVUPS X0, (R10)(R14*1)
ADDQ $0x10, R14
copy_1_test:
CMPQ R14, AX
JB copy_1
ADDQ AX, R11
ADDQ AX, R10
ADDQ AX, R12
// Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize
check_offset:
MOVQ R12, AX
ADDQ 40(SP), AX
CMPQ CX, AX
JG error_match_off_too_big
CMPQ CX, 56(SP)
JG error_match_off_too_big
// Copy match from history
MOVQ CX, AX
SUBQ R12, AX
JLS copy_match
MOVQ 48(SP), R14
SUBQ AX, R14
CMPQ R13, AX
JGE copy_all_from_history
XORQ AX, AX
TESTQ $0x00000001, R13
JZ copy_4_word
MOVB (R14)(AX*1), CL
MOVB CL, (R10)(AX*1)
ADDQ $0x01, AX
copy_4_word:
TESTQ $0x00000002, R13
JZ copy_4_dword
MOVW (R14)(AX*1), CX
MOVW CX, (R10)(AX*1)
ADDQ $0x02, AX
copy_4_dword:
TESTQ $0x00000004, R13
JZ copy_4_qword
MOVL (R14)(AX*1), CX
MOVL CX, (R10)(AX*1)
ADDQ $0x04, AX
copy_4_qword:
TESTQ $0x00000008, R13
JZ copy_4_test
MOVQ (R14)(AX*1), CX
MOVQ CX, (R10)(AX*1)
ADDQ $0x08, AX
JMP copy_4_test
copy_4:
MOVUPS (R14)(AX*1), X0
MOVUPS X0, (R10)(AX*1)
ADDQ $0x10, AX
copy_4_test:
CMPQ AX, R13
JB copy_4
ADDQ R13, R12
ADDQ R13, R10
JMP handle_loop
JMP loop_finished
copy_all_from_history:
XORQ R15, R15
TESTQ $0x00000001, AX
JZ copy_5_word
MOVB (R14)(R15*1), BP
MOVB BP, (R10)(R15*1)
ADDQ $0x01, R15
copy_5_word:
TESTQ $0x00000002, AX
JZ copy_5_dword
MOVW (R14)(R15*1), BP
MOVW BP, (R10)(R15*1)
ADDQ $0x02, R15
copy_5_dword:
TESTQ $0x00000004, AX
JZ copy_5_qword
MOVL (R14)(R15*1), BP
MOVL BP, (R10)(R15*1)
ADDQ $0x04, R15
copy_5_qword:
TESTQ $0x00000008, AX
JZ copy_5_test
MOVQ (R14)(R15*1), BP
MOVQ BP, (R10)(R15*1)
ADDQ $0x08, R15
JMP copy_5_test
copy_5:
MOVUPS (R14)(R15*1), X0
MOVUPS X0, (R10)(R15*1)
ADDQ $0x10, R15
copy_5_test:
CMPQ R15, AX
JB copy_5
ADDQ AX, R10
ADDQ AX, R12
SUBQ AX, R13
// Copy match from the current buffer
copy_match:
TESTQ R13, R13
JZ handle_loop
MOVQ R10, AX
SUBQ CX, AX
// ml <= mo
CMPQ R13, CX
JA copy_overlapping_match
// Copy non-overlapping match
ADDQ R13, R12
XORQ CX, CX
TESTQ $0x00000001, R13
JZ copy_2_word
MOVB (AX)(CX*1), R14
MOVB R14, (R10)(CX*1)
ADDQ $0x01, CX
copy_2_word:
TESTQ $0x00000002, R13
JZ copy_2_dword
MOVW (AX)(CX*1), R14
MOVW R14, (R10)(CX*1)
ADDQ $0x02, CX
copy_2_dword:
TESTQ $0x00000004, R13
JZ copy_2_qword
MOVL (AX)(CX*1), R14
MOVL R14, (R10)(CX*1)
ADDQ $0x04, CX
copy_2_qword:
TESTQ $0x00000008, R13
JZ copy_2_test
MOVQ (AX)(CX*1), R14
MOVQ R14, (R10)(CX*1)
ADDQ $0x08, CX
JMP copy_2_test
copy_2:
MOVUPS (AX)(CX*1), X0
MOVUPS X0, (R10)(CX*1)
ADDQ $0x10, CX
copy_2_test:
CMPQ CX, R13
JB copy_2
ADDQ R13, R10
JMP handle_loop
// Copy overlapping match
copy_overlapping_match:
ADDQ R13, R12
copy_slow_3:
MOVB (AX), CL
MOVB CL, (R10)
INCQ AX
INCQ R10
DECQ R13
JNZ copy_slow_3
handle_loop:
MOVQ ctx+16(FP), AX
DECQ 96(AX)
JNS sequenceDecs_decodeSync_safe_amd64_main_loop
loop_finished:
MOVQ br+8(FP), AX
MOVQ DX, 32(AX)
MOVB BL, 40(AX)
MOVQ SI, 24(AX)
// Update the context
MOVQ ctx+16(FP), AX
MOVQ R12, 136(AX)
MOVQ 144(AX), CX
SUBQ CX, R11
MOVQ R11, 168(AX)
// Return success
MOVQ $0x00000000, ret+24(FP)
RET
// Return with match length error
sequenceDecs_decodeSync_safe_amd64_error_match_len_ofs_mismatch:
MOVQ 16(SP), AX
MOVQ ctx+16(FP), CX
MOVQ AX, 216(CX)
MOVQ $0x00000001, ret+24(FP)
RET
// Return with match too long error
sequenceDecs_decodeSync_safe_amd64_error_match_len_too_big:
MOVQ ctx+16(FP), AX
MOVQ 16(SP), CX
MOVQ CX, 216(AX)
MOVQ $0x00000002, ret+24(FP)
RET
// Return with match offset too long error
error_match_off_too_big:
MOVQ ctx+16(FP), AX
MOVQ 8(SP), CX
MOVQ CX, 224(AX)
MOVQ R12, 136(AX)
MOVQ $0x00000003, ret+24(FP)
RET
// Return with not enough literals error
error_not_enough_literals:
MOVQ ctx+16(FP), AX
MOVQ 24(SP), CX
MOVQ CX, 208(AX)
MOVQ $0x00000004, ret+24(FP)
RET
// Return with not enough output space error
error_not_enough_space:
MOVQ ctx+16(FP), AX
MOVQ 24(SP), CX
MOVQ CX, 208(AX)
MOVQ 16(SP), CX
MOVQ CX, 216(AX)
MOVQ R12, 136(AX)
MOVQ $0x00000005, ret+24(FP)
RET
// func sequenceDecs_decodeSync_safe_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
// Requires: BMI, BMI2, CMOV, SSE
TEXT ·sequenceDecs_decodeSync_safe_bmi2(SB), $64-32
MOVQ br+8(FP), CX
MOVQ 32(CX), AX
MOVBQZX 40(CX), DX
MOVQ 24(CX), BX
MOVQ (CX), CX
ADDQ BX, CX
MOVQ CX, (SP)
MOVQ ctx+16(FP), CX
MOVQ 72(CX), SI
MOVQ 80(CX), DI
MOVQ 88(CX), R8
MOVQ 112(CX), R9
MOVQ 128(CX), R10
MOVQ R10, 32(SP)
MOVQ 144(CX), R10
MOVQ 136(CX), R11
MOVQ 200(CX), R12
MOVQ R12, 56(SP)
MOVQ 176(CX), R12
MOVQ R12, 48(SP)
MOVQ 184(CX), CX
MOVQ CX, 40(SP)
MOVQ 40(SP), CX
ADDQ CX, 48(SP)
// Calculate poiter to s.out[cap(s.out)] (a past-end pointer)
ADDQ R9, 32(SP)
// outBase += outPosition
ADDQ R11, R9
sequenceDecs_decodeSync_safe_bmi2_main_loop:
MOVQ (SP), R12
// Fill bitreader to have enough for the offset and match length.
CMPQ BX, $0x08
JL sequenceDecs_decodeSync_safe_bmi2_fill_byte_by_byte
MOVQ DX, CX
SHRQ $0x03, CX
SUBQ CX, R12
MOVQ (R12), AX
SUBQ CX, BX
ANDQ $0x07, DX
JMP sequenceDecs_decodeSync_safe_bmi2_fill_end
sequenceDecs_decodeSync_safe_bmi2_fill_byte_by_byte:
CMPQ BX, $0x00
JLE sequenceDecs_decodeSync_safe_bmi2_fill_end
CMPQ DX, $0x07
JLE sequenceDecs_decodeSync_safe_bmi2_fill_end
SHLQ $0x08, AX
SUBQ $0x01, R12
SUBQ $0x01, BX
SUBQ $0x08, DX
MOVBQZX (R12), CX
ORQ CX, AX
JMP sequenceDecs_decodeSync_safe_bmi2_fill_byte_by_byte
sequenceDecs_decodeSync_safe_bmi2_fill_end:
// Update offset
MOVQ $0x00000808, CX
BEXTRQ CX, R8, R13
MOVQ AX, R14
LEAQ (DX)(R13*1), CX
ROLQ CL, R14
BZHIQ R13, R14, R14
MOVQ CX, DX
MOVQ R8, CX
SHRQ $0x20, CX
ADDQ R14, CX
MOVQ CX, 8(SP)
// Update match length
MOVQ $0x00000808, CX
BEXTRQ CX, DI, R13
MOVQ AX, R14
LEAQ (DX)(R13*1), CX
ROLQ CL, R14
BZHIQ R13, R14, R14
MOVQ CX, DX
MOVQ DI, CX
SHRQ $0x20, CX
ADDQ R14, CX
MOVQ CX, 16(SP)
// Fill bitreader to have enough for the remaining
CMPQ BX, $0x08
JL sequenceDecs_decodeSync_safe_bmi2_fill_2_byte_by_byte
MOVQ DX, CX
SHRQ $0x03, CX
SUBQ CX, R12
MOVQ (R12), AX
SUBQ CX, BX
ANDQ $0x07, DX
JMP sequenceDecs_decodeSync_safe_bmi2_fill_2_end
sequenceDecs_decodeSync_safe_bmi2_fill_2_byte_by_byte:
CMPQ BX, $0x00
JLE sequenceDecs_decodeSync_safe_bmi2_fill_2_end
CMPQ DX, $0x07
JLE sequenceDecs_decodeSync_safe_bmi2_fill_2_end
SHLQ $0x08, AX
SUBQ $0x01, R12
SUBQ $0x01, BX
SUBQ $0x08, DX
MOVBQZX (R12), CX
ORQ CX, AX
JMP sequenceDecs_decodeSync_safe_bmi2_fill_2_byte_by_byte
sequenceDecs_decodeSync_safe_bmi2_fill_2_end:
// Update literal length
MOVQ $0x00000808, CX
BEXTRQ CX, SI, R13
MOVQ AX, R14
LEAQ (DX)(R13*1), CX
ROLQ CL, R14
BZHIQ R13, R14, R14
MOVQ CX, DX
MOVQ SI, CX
SHRQ $0x20, CX
ADDQ R14, CX
MOVQ CX, 24(SP)
// Fill bitreader for state updates
MOVQ R12, (SP)
MOVQ $0x00000808, CX
BEXTRQ CX, R8, R12
MOVQ ctx+16(FP), CX
CMPQ 96(CX), $0x00
JZ sequenceDecs_decodeSync_safe_bmi2_skip_update
LEAQ (SI)(DI*1), R13
ADDQ R8, R13
MOVBQZX R13, R13
LEAQ (DX)(R13*1), CX
MOVQ AX, R14
MOVQ CX, DX
ROLQ CL, R14
BZHIQ R13, R14, R14
// Update Offset State
BZHIQ R8, R14, CX
SHRXQ R8, R14, R14
MOVQ $0x00001010, R13
BEXTRQ R13, R8, R8
ADDQ CX, R8
// Load ctx.ofTable
MOVQ ctx+16(FP), CX
MOVQ 48(CX), CX
MOVQ (CX)(R8*8), R8
// Update Match Length State
BZHIQ DI, R14, CX
SHRXQ DI, R14, R14
MOVQ $0x00001010, R13
BEXTRQ R13, DI, DI
ADDQ CX, DI
// Load ctx.mlTable
MOVQ ctx+16(FP), CX
MOVQ 24(CX), CX
MOVQ (CX)(DI*8), DI
// Update Literal Length State
BZHIQ SI, R14, CX
MOVQ $0x00001010, R13
BEXTRQ R13, SI, SI
ADDQ CX, SI
// Load ctx.llTable
MOVQ ctx+16(FP), CX
MOVQ (CX), CX
MOVQ (CX)(SI*8), SI
sequenceDecs_decodeSync_safe_bmi2_skip_update:
// Adjust offset
MOVQ s+0(FP), CX
MOVQ 8(SP), R13
CMPQ R12, $0x01
JBE sequenceDecs_decodeSync_safe_bmi2_adjust_offsetB_1_or_0
MOVUPS 144(CX), X0
MOVQ R13, 144(CX)
MOVUPS X0, 152(CX)
JMP sequenceDecs_decodeSync_safe_bmi2_adjust_end
sequenceDecs_decodeSync_safe_bmi2_adjust_offsetB_1_or_0:
CMPQ 24(SP), $0x00000000
JNE sequenceDecs_decodeSync_safe_bmi2_adjust_offset_maybezero
INCQ R13
JMP sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero
sequenceDecs_decodeSync_safe_bmi2_adjust_offset_maybezero:
TESTQ R13, R13
JNZ sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero
MOVQ 144(CX), R13
JMP sequenceDecs_decodeSync_safe_bmi2_adjust_end
sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero:
MOVQ R13, R12
XORQ R14, R14
MOVQ $-1, R15
CMPQ R13, $0x03
CMOVQEQ R14, R12
CMOVQEQ R15, R14
LEAQ 144(CX), R15
ADDQ (R15)(R12*8), R14
JNZ sequenceDecs_decodeSync_safe_bmi2_adjust_temp_valid
MOVQ $0x00000001, R14
sequenceDecs_decodeSync_safe_bmi2_adjust_temp_valid:
CMPQ R13, $0x01
JZ sequenceDecs_decodeSync_safe_bmi2_adjust_skip
MOVQ 152(CX), R12
MOVQ R12, 160(CX)
sequenceDecs_decodeSync_safe_bmi2_adjust_skip:
MOVQ 144(CX), R12
MOVQ R12, 152(CX)
MOVQ R14, 144(CX)
MOVQ R14, R13
sequenceDecs_decodeSync_safe_bmi2_adjust_end:
MOVQ R13, 8(SP)
// Check values
MOVQ 16(SP), CX
MOVQ 24(SP), R12
LEAQ (CX)(R12*1), R14
MOVQ s+0(FP), R15
ADDQ R14, 256(R15)
MOVQ ctx+16(FP), R14
SUBQ R12, 104(R14)
JS error_not_enough_literals
CMPQ CX, $0x00020002
JA sequenceDecs_decodeSync_safe_bmi2_error_match_len_too_big
TESTQ R13, R13
JNZ sequenceDecs_decodeSync_safe_bmi2_match_len_ofs_ok
TESTQ CX, CX
JNZ sequenceDecs_decodeSync_safe_bmi2_error_match_len_ofs_mismatch
sequenceDecs_decodeSync_safe_bmi2_match_len_ofs_ok:
MOVQ 24(SP), CX
MOVQ 8(SP), R12
MOVQ 16(SP), R13
// Check if we have enough space in s.out
LEAQ (CX)(R13*1), R14
ADDQ R9, R14
CMPQ R14, 32(SP)
JA error_not_enough_space
// Copy literals
TESTQ CX, CX
JZ check_offset
XORQ R14, R14
TESTQ $0x00000001, CX
JZ copy_1_word
MOVB (R10)(R14*1), R15
MOVB R15, (R9)(R14*1)
ADDQ $0x01, R14
copy_1_word:
TESTQ $0x00000002, CX
JZ copy_1_dword
MOVW (R10)(R14*1), R15
MOVW R15, (R9)(R14*1)
ADDQ $0x02, R14
copy_1_dword:
TESTQ $0x00000004, CX
JZ copy_1_qword
MOVL (R10)(R14*1), R15
MOVL R15, (R9)(R14*1)
ADDQ $0x04, R14
copy_1_qword:
TESTQ $0x00000008, CX
JZ copy_1_test
MOVQ (R10)(R14*1), R15
MOVQ R15, (R9)(R14*1)
ADDQ $0x08, R14
JMP copy_1_test
copy_1:
MOVUPS (R10)(R14*1), X0
MOVUPS X0, (R9)(R14*1)
ADDQ $0x10, R14
copy_1_test:
CMPQ R14, CX
JB copy_1
ADDQ CX, R10
ADDQ CX, R9
ADDQ CX, R11
	// Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize
check_offset:
MOVQ R11, CX
ADDQ 40(SP), CX
CMPQ R12, CX
JG error_match_off_too_big
CMPQ R12, 56(SP)
JG error_match_off_too_big
// Copy match from history
MOVQ R12, CX
SUBQ R11, CX
JLS copy_match
MOVQ 48(SP), R14
SUBQ CX, R14
CMPQ R13, CX
JGE copy_all_from_history
XORQ CX, CX
TESTQ $0x00000001, R13
JZ copy_4_word
MOVB (R14)(CX*1), R12
MOVB R12, (R9)(CX*1)
ADDQ $0x01, CX
copy_4_word:
TESTQ $0x00000002, R13
JZ copy_4_dword
MOVW (R14)(CX*1), R12
MOVW R12, (R9)(CX*1)
ADDQ $0x02, CX
copy_4_dword:
TESTQ $0x00000004, R13
JZ copy_4_qword
MOVL (R14)(CX*1), R12
MOVL R12, (R9)(CX*1)
ADDQ $0x04, CX
copy_4_qword:
TESTQ $0x00000008, R13
JZ copy_4_test
MOVQ (R14)(CX*1), R12
MOVQ R12, (R9)(CX*1)
ADDQ $0x08, CX
JMP copy_4_test
copy_4:
MOVUPS (R14)(CX*1), X0
MOVUPS X0, (R9)(CX*1)
ADDQ $0x10, CX
copy_4_test:
CMPQ CX, R13
JB copy_4
ADDQ R13, R11
ADDQ R13, R9
JMP handle_loop
JMP loop_finished
copy_all_from_history:
XORQ R15, R15
TESTQ $0x00000001, CX
JZ copy_5_word
MOVB (R14)(R15*1), BP
MOVB BP, (R9)(R15*1)
ADDQ $0x01, R15
copy_5_word:
TESTQ $0x00000002, CX
JZ copy_5_dword
MOVW (R14)(R15*1), BP
MOVW BP, (R9)(R15*1)
ADDQ $0x02, R15
copy_5_dword:
TESTQ $0x00000004, CX
JZ copy_5_qword
MOVL (R14)(R15*1), BP
MOVL BP, (R9)(R15*1)
ADDQ $0x04, R15
copy_5_qword:
TESTQ $0x00000008, CX
JZ copy_5_test
MOVQ (R14)(R15*1), BP
MOVQ BP, (R9)(R15*1)
ADDQ $0x08, R15
JMP copy_5_test
copy_5:
MOVUPS (R14)(R15*1), X0
MOVUPS X0, (R9)(R15*1)
ADDQ $0x10, R15
copy_5_test:
CMPQ R15, CX
JB copy_5
ADDQ CX, R9
ADDQ CX, R11
SUBQ CX, R13
// Copy match from the current buffer
copy_match:
TESTQ R13, R13
JZ handle_loop
MOVQ R9, CX
SUBQ R12, CX
// ml <= mo
CMPQ R13, R12
JA copy_overlapping_match
// Copy non-overlapping match
ADDQ R13, R11
XORQ R12, R12
TESTQ $0x00000001, R13
JZ copy_2_word
MOVB (CX)(R12*1), R14
MOVB R14, (R9)(R12*1)
ADDQ $0x01, R12
copy_2_word:
TESTQ $0x00000002, R13
JZ copy_2_dword
MOVW (CX)(R12*1), R14
MOVW R14, (R9)(R12*1)
ADDQ $0x02, R12
copy_2_dword:
TESTQ $0x00000004, R13
JZ copy_2_qword
MOVL (CX)(R12*1), R14
MOVL R14, (R9)(R12*1)
ADDQ $0x04, R12
copy_2_qword:
TESTQ $0x00000008, R13
JZ copy_2_test
MOVQ (CX)(R12*1), R14
MOVQ R14, (R9)(R12*1)
ADDQ $0x08, R12
JMP copy_2_test
copy_2:
MOVUPS (CX)(R12*1), X0
MOVUPS X0, (R9)(R12*1)
ADDQ $0x10, R12
copy_2_test:
CMPQ R12, R13
JB copy_2
ADDQ R13, R9
JMP handle_loop
// Copy overlapping match
copy_overlapping_match:
ADDQ R13, R11
copy_slow_3:
MOVB (CX), R12
MOVB R12, (R9)
INCQ CX
INCQ R9
DECQ R13
JNZ copy_slow_3
handle_loop:
MOVQ ctx+16(FP), CX
DECQ 96(CX)
JNS sequenceDecs_decodeSync_safe_bmi2_main_loop
loop_finished:
MOVQ br+8(FP), CX
MOVQ AX, 32(CX)
MOVB DL, 40(CX)
MOVQ BX, 24(CX)
// Update the context
MOVQ ctx+16(FP), AX
MOVQ R11, 136(AX)
MOVQ 144(AX), CX
SUBQ CX, R10
MOVQ R10, 168(AX)
// Return success
MOVQ $0x00000000, ret+24(FP)
RET
// Return with match length error
sequenceDecs_decodeSync_safe_bmi2_error_match_len_ofs_mismatch:
MOVQ 16(SP), AX
MOVQ ctx+16(FP), CX
MOVQ AX, 216(CX)
MOVQ $0x00000001, ret+24(FP)
RET
// Return with match too long error
sequenceDecs_decodeSync_safe_bmi2_error_match_len_too_big:
MOVQ ctx+16(FP), AX
MOVQ 16(SP), CX
MOVQ CX, 216(AX)
MOVQ $0x00000002, ret+24(FP)
RET
// Return with match offset too long error
error_match_off_too_big:
MOVQ ctx+16(FP), AX
MOVQ 8(SP), CX
MOVQ CX, 224(AX)
MOVQ R11, 136(AX)
MOVQ $0x00000003, ret+24(FP)
RET
// Return with not enough literals error
error_not_enough_literals:
MOVQ ctx+16(FP), AX
MOVQ 24(SP), CX
MOVQ CX, 208(AX)
MOVQ $0x00000004, ret+24(FP)
RET
// Return with not enough output space error
error_not_enough_space:
MOVQ ctx+16(FP), AX
MOVQ 24(SP), CX
MOVQ CX, 208(AX)
MOVQ 16(SP), CX
MOVQ CX, 216(AX)
MOVQ R11, 136(AX)
MOVQ $0x00000005, ret+24(FP)
RET