// Code generated by command: go run gen.go -out ../seqdec_amd64.s -pkg=zstd. DO NOT EDIT. //go:build !appengine && !noasm && gc && !noasm // +build !appengine,!noasm,gc,!noasm // func sequenceDecs_decode_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int // Requires: CMOV TEXT ·sequenceDecs_decode_amd64(SB), $8-32 MOVQ br+8(FP), AX MOVQ 32(AX), DX MOVBQZX 40(AX), BX MOVQ 24(AX), SI MOVQ (AX), AX ADDQ SI, AX MOVQ AX, (SP) MOVQ ctx+16(FP), AX MOVQ 72(AX), DI MOVQ 80(AX), R8 MOVQ 88(AX), R9 MOVQ 104(AX), R10 MOVQ s+0(FP), AX MOVQ 144(AX), R11 MOVQ 152(AX), R12 MOVQ 160(AX), R13 sequenceDecs_decode_amd64_main_loop: MOVQ (SP), R14 // Fill bitreader to have enough for the offset and match length. CMPQ SI, $0x08 JL sequenceDecs_decode_amd64_fill_byte_by_byte MOVQ BX, AX SHRQ $0x03, AX SUBQ AX, R14 MOVQ (R14), DX SUBQ AX, SI ANDQ $0x07, BX JMP sequenceDecs_decode_amd64_fill_end sequenceDecs_decode_amd64_fill_byte_by_byte: CMPQ SI, $0x00 JLE sequenceDecs_decode_amd64_fill_end CMPQ BX, $0x07 JLE sequenceDecs_decode_amd64_fill_end SHLQ $0x08, DX SUBQ $0x01, R14 SUBQ $0x01, SI SUBQ $0x08, BX MOVBQZX (R14), AX ORQ AX, DX JMP sequenceDecs_decode_amd64_fill_byte_by_byte sequenceDecs_decode_amd64_fill_end: // Update offset MOVQ R9, AX MOVQ BX, CX MOVQ DX, R15 SHLQ CL, R15 MOVB AH, CL ADDQ CX, BX NEGL CX SHRQ CL, R15 SHRQ $0x20, AX TESTQ CX, CX CMOVQEQ CX, R15 ADDQ R15, AX MOVQ AX, 16(R10) // Update match length MOVQ R8, AX MOVQ BX, CX MOVQ DX, R15 SHLQ CL, R15 MOVB AH, CL ADDQ CX, BX NEGL CX SHRQ CL, R15 SHRQ $0x20, AX TESTQ CX, CX CMOVQEQ CX, R15 ADDQ R15, AX MOVQ AX, 8(R10) // Fill bitreader to have enough for the remaining CMPQ SI, $0x08 JL sequenceDecs_decode_amd64_fill_2_byte_by_byte MOVQ BX, AX SHRQ $0x03, AX SUBQ AX, R14 MOVQ (R14), DX SUBQ AX, SI ANDQ $0x07, BX JMP sequenceDecs_decode_amd64_fill_2_end sequenceDecs_decode_amd64_fill_2_byte_by_byte: CMPQ SI, $0x00 JLE sequenceDecs_decode_amd64_fill_2_end CMPQ BX, $0x07 JLE sequenceDecs_decode_amd64_fill_2_end 
SHLQ $0x08, DX SUBQ $0x01, R14 SUBQ $0x01, SI SUBQ $0x08, BX MOVBQZX (R14), AX ORQ AX, DX JMP sequenceDecs_decode_amd64_fill_2_byte_by_byte sequenceDecs_decode_amd64_fill_2_end: // Update literal length MOVQ DI, AX MOVQ BX, CX MOVQ DX, R15 SHLQ CL, R15 MOVB AH, CL ADDQ CX, BX NEGL CX SHRQ CL, R15 SHRQ $0x20, AX TESTQ CX, CX CMOVQEQ CX, R15 ADDQ R15, AX MOVQ AX, (R10) // Fill bitreader for state updates MOVQ R14, (SP) MOVQ R9, AX SHRQ $0x08, AX MOVBQZX AL, AX MOVQ ctx+16(FP), CX CMPQ 96(CX), $0x00 JZ sequenceDecs_decode_amd64_skip_update // Update Literal Length State MOVBQZX DI, R14 SHRQ $0x10, DI MOVWQZX DI, DI CMPQ R14, $0x00 JZ sequenceDecs_decode_amd64_llState_updateState_skip_zero MOVQ BX, CX ADDQ R14, BX MOVQ DX, R15 SHLQ CL, R15 MOVQ R14, CX NEGQ CX SHRQ CL, R15 ADDQ R15, DI sequenceDecs_decode_amd64_llState_updateState_skip_zero: // Load ctx.llTable MOVQ ctx+16(FP), CX MOVQ (CX), CX MOVQ (CX)(DI*8), DI // Update Match Length State MOVBQZX R8, R14 SHRQ $0x10, R8 MOVWQZX R8, R8 CMPQ R14, $0x00 JZ sequenceDecs_decode_amd64_mlState_updateState_skip_zero MOVQ BX, CX ADDQ R14, BX MOVQ DX, R15 SHLQ CL, R15 MOVQ R14, CX NEGQ CX SHRQ CL, R15 ADDQ R15, R8 sequenceDecs_decode_amd64_mlState_updateState_skip_zero: // Load ctx.mlTable MOVQ ctx+16(FP), CX MOVQ 24(CX), CX MOVQ (CX)(R8*8), R8 // Update Offset State MOVBQZX R9, R14 SHRQ $0x10, R9 MOVWQZX R9, R9 CMPQ R14, $0x00 JZ sequenceDecs_decode_amd64_ofState_updateState_skip_zero MOVQ BX, CX ADDQ R14, BX MOVQ DX, R15 SHLQ CL, R15 MOVQ R14, CX NEGQ CX SHRQ CL, R15 ADDQ R15, R9 sequenceDecs_decode_amd64_ofState_updateState_skip_zero: // Load ctx.ofTable MOVQ ctx+16(FP), CX MOVQ 48(CX), CX MOVQ (CX)(R9*8), R9 sequenceDecs_decode_amd64_skip_update: // Adjust offset MOVQ 16(R10), CX CMPQ AX, $0x01 JBE sequenceDecs_decode_amd64_adjust_offsetB_1_or_0 MOVQ R12, R13 MOVQ R11, R12 MOVQ CX, R11 JMP sequenceDecs_decode_amd64_adjust_end sequenceDecs_decode_amd64_adjust_offsetB_1_or_0: CMPQ (R10), $0x00000000 JNE 
sequenceDecs_decode_amd64_adjust_offset_maybezero INCQ CX JMP sequenceDecs_decode_amd64_adjust_offset_nonzero sequenceDecs_decode_amd64_adjust_offset_maybezero: TESTQ CX, CX JNZ sequenceDecs_decode_amd64_adjust_offset_nonzero MOVQ R11, CX JMP sequenceDecs_decode_amd64_adjust_end sequenceDecs_decode_amd64_adjust_offset_nonzero: CMPQ CX, $0x01 JB sequenceDecs_decode_amd64_adjust_zero JEQ sequenceDecs_decode_amd64_adjust_one CMPQ CX, $0x02 JA sequenceDecs_decode_amd64_adjust_three JMP sequenceDecs_decode_amd64_adjust_two sequenceDecs_decode_amd64_adjust_zero: MOVQ R11, AX JMP sequenceDecs_decode_amd64_adjust_test_temp_valid sequenceDecs_decode_amd64_adjust_one: MOVQ R12, AX JMP sequenceDecs_decode_amd64_adjust_test_temp_valid sequenceDecs_decode_amd64_adjust_two: MOVQ R13, AX JMP sequenceDecs_decode_amd64_adjust_test_temp_valid sequenceDecs_decode_amd64_adjust_three: LEAQ -1(R11), AX sequenceDecs_decode_amd64_adjust_test_temp_valid: TESTQ AX, AX JNZ sequenceDecs_decode_amd64_adjust_temp_valid MOVQ $0x00000001, AX sequenceDecs_decode_amd64_adjust_temp_valid: CMPQ CX, $0x01 CMOVQNE R12, R13 MOVQ R11, R12 MOVQ AX, R11 MOVQ AX, CX sequenceDecs_decode_amd64_adjust_end: MOVQ CX, 16(R10) // Check values MOVQ 8(R10), AX MOVQ (R10), R14 LEAQ (AX)(R14*1), R15 MOVQ s+0(FP), BP ADDQ R15, 256(BP) MOVQ ctx+16(FP), R15 SUBQ R14, 128(R15) JS error_not_enough_literals CMPQ AX, $0x00020002 JA sequenceDecs_decode_amd64_error_match_len_too_big TESTQ CX, CX JNZ sequenceDecs_decode_amd64_match_len_ofs_ok TESTQ AX, AX JNZ sequenceDecs_decode_amd64_error_match_len_ofs_mismatch sequenceDecs_decode_amd64_match_len_ofs_ok: ADDQ $0x18, R10 MOVQ ctx+16(FP), AX DECQ 96(AX) JNS sequenceDecs_decode_amd64_main_loop MOVQ s+0(FP), AX MOVQ R11, 144(AX) MOVQ R12, 152(AX) MOVQ R13, 160(AX) MOVQ br+8(FP), AX MOVQ DX, 32(AX) MOVB BL, 40(AX) MOVQ SI, 24(AX) // Return success MOVQ $0x00000000, ret+24(FP) RET // Return with match length error sequenceDecs_decode_amd64_error_match_len_ofs_mismatch: MOVQ 
$0x00000001, ret+24(FP) RET // Return with match too long error sequenceDecs_decode_amd64_error_match_len_too_big: MOVQ $0x00000002, ret+24(FP) RET // Return with match offset too long error MOVQ $0x00000003, ret+24(FP) RET // Return with not enough literals error error_not_enough_literals: MOVQ $0x00000004, ret+24(FP) RET // Return with not enough output space error MOVQ $0x00000005, ret+24(FP) RET // func sequenceDecs_decode_56_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int // Requires: CMOV TEXT ·sequenceDecs_decode_56_amd64(SB), $8-32 MOVQ br+8(FP), AX MOVQ 32(AX), DX MOVBQZX 40(AX), BX MOVQ 24(AX), SI MOVQ (AX), AX ADDQ SI, AX MOVQ AX, (SP) MOVQ ctx+16(FP), AX MOVQ 72(AX), DI MOVQ 80(AX), R8 MOVQ 88(AX), R9 MOVQ 104(AX), R10 MOVQ s+0(FP), AX MOVQ 144(AX), R11 MOVQ 152(AX), R12 MOVQ 160(AX), R13 sequenceDecs_decode_56_amd64_main_loop: MOVQ (SP), R14 // Fill bitreader to have enough for the offset and match length. CMPQ SI, $0x08 JL sequenceDecs_decode_56_amd64_fill_byte_by_byte MOVQ BX, AX SHRQ $0x03, AX SUBQ AX, R14 MOVQ (R14), DX SUBQ AX, SI ANDQ $0x07, BX JMP sequenceDecs_decode_56_amd64_fill_end sequenceDecs_decode_56_amd64_fill_byte_by_byte: CMPQ SI, $0x00 JLE sequenceDecs_decode_56_amd64_fill_end CMPQ BX, $0x07 JLE sequenceDecs_decode_56_amd64_fill_end SHLQ $0x08, DX SUBQ $0x01, R14 SUBQ $0x01, SI SUBQ $0x08, BX MOVBQZX (R14), AX ORQ AX, DX JMP sequenceDecs_decode_56_amd64_fill_byte_by_byte sequenceDecs_decode_56_amd64_fill_end: // Update offset MOVQ R9, AX MOVQ BX, CX MOVQ DX, R15 SHLQ CL, R15 MOVB AH, CL ADDQ CX, BX NEGL CX SHRQ CL, R15 SHRQ $0x20, AX TESTQ CX, CX CMOVQEQ CX, R15 ADDQ R15, AX MOVQ AX, 16(R10) // Update match length MOVQ R8, AX MOVQ BX, CX MOVQ DX, R15 SHLQ CL, R15 MOVB AH, CL ADDQ CX, BX NEGL CX SHRQ CL, R15 SHRQ $0x20, AX TESTQ CX, CX CMOVQEQ CX, R15 ADDQ R15, AX MOVQ AX, 8(R10) // Update literal length MOVQ DI, AX MOVQ BX, CX MOVQ DX, R15 SHLQ CL, R15 MOVB AH, CL ADDQ CX, BX NEGL CX SHRQ CL, R15 SHRQ $0x20, AX TESTQ 
CX, CX CMOVQEQ CX, R15 ADDQ R15, AX MOVQ AX, (R10) // Fill bitreader for state updates MOVQ R14, (SP) MOVQ R9, AX SHRQ $0x08, AX MOVBQZX AL, AX MOVQ ctx+16(FP), CX CMPQ 96(CX), $0x00 JZ sequenceDecs_decode_56_amd64_skip_update // Update Literal Length State MOVBQZX DI, R14 SHRQ $0x10, DI MOVWQZX DI, DI CMPQ R14, $0x00 JZ sequenceDecs_decode_56_amd64_llState_updateState_skip_zero MOVQ BX, CX ADDQ R14, BX MOVQ DX, R15 SHLQ CL, R15 MOVQ R14, CX NEGQ CX SHRQ CL, R15 ADDQ R15, DI sequenceDecs_decode_56_amd64_llState_updateState_skip_zero: // Load ctx.llTable MOVQ ctx+16(FP), CX MOVQ (CX), CX MOVQ (CX)(DI*8), DI // Update Match Length State MOVBQZX R8, R14 SHRQ $0x10, R8 MOVWQZX R8, R8 CMPQ R14, $0x00 JZ sequenceDecs_decode_56_amd64_mlState_updateState_skip_zero MOVQ BX, CX ADDQ R14, BX MOVQ DX, R15 SHLQ CL, R15 MOVQ R14, CX NEGQ CX SHRQ CL, R15 ADDQ R15, R8 sequenceDecs_decode_56_amd64_mlState_updateState_skip_zero: // Load ctx.mlTable MOVQ ctx+16(FP), CX MOVQ 24(CX), CX MOVQ (CX)(R8*8), R8 // Update Offset State MOVBQZX R9, R14 SHRQ $0x10, R9 MOVWQZX R9, R9 CMPQ R14, $0x00 JZ sequenceDecs_decode_56_amd64_ofState_updateState_skip_zero MOVQ BX, CX ADDQ R14, BX MOVQ DX, R15 SHLQ CL, R15 MOVQ R14, CX NEGQ CX SHRQ CL, R15 ADDQ R15, R9 sequenceDecs_decode_56_amd64_ofState_updateState_skip_zero: // Load ctx.ofTable MOVQ ctx+16(FP), CX MOVQ 48(CX), CX MOVQ (CX)(R9*8), R9 sequenceDecs_decode_56_amd64_skip_update: // Adjust offset MOVQ 16(R10), CX CMPQ AX, $0x01 JBE sequenceDecs_decode_56_amd64_adjust_offsetB_1_or_0 MOVQ R12, R13 MOVQ R11, R12 MOVQ CX, R11 JMP sequenceDecs_decode_56_amd64_adjust_end sequenceDecs_decode_56_amd64_adjust_offsetB_1_or_0: CMPQ (R10), $0x00000000 JNE sequenceDecs_decode_56_amd64_adjust_offset_maybezero INCQ CX JMP sequenceDecs_decode_56_amd64_adjust_offset_nonzero sequenceDecs_decode_56_amd64_adjust_offset_maybezero: TESTQ CX, CX JNZ sequenceDecs_decode_56_amd64_adjust_offset_nonzero MOVQ R11, CX JMP sequenceDecs_decode_56_amd64_adjust_end 
sequenceDecs_decode_56_amd64_adjust_offset_nonzero: CMPQ CX, $0x01 JB sequenceDecs_decode_56_amd64_adjust_zero JEQ sequenceDecs_decode_56_amd64_adjust_one CMPQ CX, $0x02 JA sequenceDecs_decode_56_amd64_adjust_three JMP sequenceDecs_decode_56_amd64_adjust_two sequenceDecs_decode_56_amd64_adjust_zero: MOVQ R11, AX JMP sequenceDecs_decode_56_amd64_adjust_test_temp_valid sequenceDecs_decode_56_amd64_adjust_one: MOVQ R12, AX JMP sequenceDecs_decode_56_amd64_adjust_test_temp_valid sequenceDecs_decode_56_amd64_adjust_two: MOVQ R13, AX JMP sequenceDecs_decode_56_amd64_adjust_test_temp_valid sequenceDecs_decode_56_amd64_adjust_three: LEAQ -1(R11), AX sequenceDecs_decode_56_amd64_adjust_test_temp_valid: TESTQ AX, AX JNZ sequenceDecs_decode_56_amd64_adjust_temp_valid MOVQ $0x00000001, AX sequenceDecs_decode_56_amd64_adjust_temp_valid: CMPQ CX, $0x01 CMOVQNE R12, R13 MOVQ R11, R12 MOVQ AX, R11 MOVQ AX, CX sequenceDecs_decode_56_amd64_adjust_end: MOVQ CX, 16(R10) // Check values MOVQ 8(R10), AX MOVQ (R10), R14 LEAQ (AX)(R14*1), R15 MOVQ s+0(FP), BP ADDQ R15, 256(BP) MOVQ ctx+16(FP), R15 SUBQ R14, 128(R15) JS error_not_enough_literals CMPQ AX, $0x00020002 JA sequenceDecs_decode_56_amd64_error_match_len_too_big TESTQ CX, CX JNZ sequenceDecs_decode_56_amd64_match_len_ofs_ok TESTQ AX, AX JNZ sequenceDecs_decode_56_amd64_error_match_len_ofs_mismatch sequenceDecs_decode_56_amd64_match_len_ofs_ok: ADDQ $0x18, R10 MOVQ ctx+16(FP), AX DECQ 96(AX) JNS sequenceDecs_decode_56_amd64_main_loop MOVQ s+0(FP), AX MOVQ R11, 144(AX) MOVQ R12, 152(AX) MOVQ R13, 160(AX) MOVQ br+8(FP), AX MOVQ DX, 32(AX) MOVB BL, 40(AX) MOVQ SI, 24(AX) // Return success MOVQ $0x00000000, ret+24(FP) RET // Return with match length error sequenceDecs_decode_56_amd64_error_match_len_ofs_mismatch: MOVQ $0x00000001, ret+24(FP) RET // Return with match too long error sequenceDecs_decode_56_amd64_error_match_len_too_big: MOVQ $0x00000002, ret+24(FP) RET // Return with match offset too long error MOVQ $0x00000003, 
ret+24(FP) RET // Return with not enough literals error error_not_enough_literals: MOVQ $0x00000004, ret+24(FP) RET // Return with not enough output space error MOVQ $0x00000005, ret+24(FP) RET // func sequenceDecs_decode_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int // Requires: BMI, BMI2, CMOV TEXT ·sequenceDecs_decode_bmi2(SB), $8-32 MOVQ br+8(FP), CX MOVQ 32(CX), AX MOVBQZX 40(CX), DX MOVQ 24(CX), BX MOVQ (CX), CX ADDQ BX, CX MOVQ CX, (SP) MOVQ ctx+16(FP), CX MOVQ 72(CX), SI MOVQ 80(CX), DI MOVQ 88(CX), R8 MOVQ 104(CX), R9 MOVQ s+0(FP), CX MOVQ 144(CX), R10 MOVQ 152(CX), R11 MOVQ 160(CX), R12 sequenceDecs_decode_bmi2_main_loop: MOVQ (SP), R13 // Fill bitreader to have enough for the offset and match length. CMPQ BX, $0x08 JL sequenceDecs_decode_bmi2_fill_byte_by_byte MOVQ DX, CX SHRQ $0x03, CX SUBQ CX, R13 MOVQ (R13), AX SUBQ CX, BX ANDQ $0x07, DX JMP sequenceDecs_decode_bmi2_fill_end sequenceDecs_decode_bmi2_fill_byte_by_byte: CMPQ BX, $0x00 JLE sequenceDecs_decode_bmi2_fill_end CMPQ DX, $0x07 JLE sequenceDecs_decode_bmi2_fill_end SHLQ $0x08, AX SUBQ $0x01, R13 SUBQ $0x01, BX SUBQ $0x08, DX MOVBQZX (R13), CX ORQ CX, AX JMP sequenceDecs_decode_bmi2_fill_byte_by_byte sequenceDecs_decode_bmi2_fill_end: // Update offset MOVQ $0x00000808, CX BEXTRQ CX, R8, R14 MOVQ AX, R15 LEAQ (DX)(R14*1), CX ROLQ CL, R15 BZHIQ R14, R15, R15 MOVQ CX, DX MOVQ R8, CX SHRQ $0x20, CX ADDQ R15, CX MOVQ CX, 16(R9) // Update match length MOVQ $0x00000808, CX BEXTRQ CX, DI, R14 MOVQ AX, R15 LEAQ (DX)(R14*1), CX ROLQ CL, R15 BZHIQ R14, R15, R15 MOVQ CX, DX MOVQ DI, CX SHRQ $0x20, CX ADDQ R15, CX MOVQ CX, 8(R9) // Fill bitreader to have enough for the remaining CMPQ BX, $0x08 JL sequenceDecs_decode_bmi2_fill_2_byte_by_byte MOVQ DX, CX SHRQ $0x03, CX SUBQ CX, R13 MOVQ (R13), AX SUBQ CX, BX ANDQ $0x07, DX JMP sequenceDecs_decode_bmi2_fill_2_end sequenceDecs_decode_bmi2_fill_2_byte_by_byte: CMPQ BX, $0x00 JLE sequenceDecs_decode_bmi2_fill_2_end CMPQ DX, $0x07 JLE 
sequenceDecs_decode_bmi2_fill_2_end SHLQ $0x08, AX SUBQ $0x01, R13 SUBQ $0x01, BX SUBQ $0x08, DX MOVBQZX (R13), CX ORQ CX, AX JMP sequenceDecs_decode_bmi2_fill_2_byte_by_byte sequenceDecs_decode_bmi2_fill_2_end: // Update literal length MOVQ $0x00000808, CX BEXTRQ CX, SI, R14 MOVQ AX, R15 LEAQ (DX)(R14*1), CX ROLQ CL, R15 BZHIQ R14, R15, R15 MOVQ CX, DX MOVQ SI, CX SHRQ $0x20, CX ADDQ R15, CX MOVQ CX, (R9) // Fill bitreader for state updates MOVQ R13, (SP) MOVQ $0x00000808, CX BEXTRQ CX, R8, R13 MOVQ ctx+16(FP), CX CMPQ 96(CX), $0x00 JZ sequenceDecs_decode_bmi2_skip_update LEAQ (SI)(DI*1), R14 ADDQ R8, R14 MOVBQZX R14, R14 LEAQ (DX)(R14*1), CX MOVQ AX, R15 MOVQ CX, DX ROLQ CL, R15 BZHIQ R14, R15, R15 // Update Offset State BZHIQ R8, R15, CX SHRXQ R8, R15, R15 MOVQ $0x00001010, R14 BEXTRQ R14, R8, R8 ADDQ CX, R8 // Load ctx.ofTable MOVQ ctx+16(FP), CX MOVQ 48(CX), CX MOVQ (CX)(R8*8), R8 // Update Match Length State BZHIQ DI, R15, CX SHRXQ DI, R15, R15 MOVQ $0x00001010, R14 BEXTRQ R14, DI, DI ADDQ CX, DI // Load ctx.mlTable MOVQ ctx+16(FP), CX MOVQ 24(CX), CX MOVQ (CX)(DI*8), DI // Update Literal Length State BZHIQ SI, R15, CX MOVQ $0x00001010, R14 BEXTRQ R14, SI, SI ADDQ CX, SI // Load ctx.llTable MOVQ ctx+16(FP), CX MOVQ (CX), CX MOVQ (CX)(SI*8), SI sequenceDecs_decode_bmi2_skip_update: // Adjust offset MOVQ 16(R9), CX CMPQ R13, $0x01 JBE sequenceDecs_decode_bmi2_adjust_offsetB_1_or_0 MOVQ R11, R12 MOVQ R10, R11 MOVQ CX, R10 JMP sequenceDecs_decode_bmi2_adjust_end sequenceDecs_decode_bmi2_adjust_offsetB_1_or_0: CMPQ (R9), $0x00000000 JNE sequenceDecs_decode_bmi2_adjust_offset_maybezero INCQ CX JMP sequenceDecs_decode_bmi2_adjust_offset_nonzero sequenceDecs_decode_bmi2_adjust_offset_maybezero: TESTQ CX, CX JNZ sequenceDecs_decode_bmi2_adjust_offset_nonzero MOVQ R10, CX JMP sequenceDecs_decode_bmi2_adjust_end sequenceDecs_decode_bmi2_adjust_offset_nonzero: CMPQ CX, $0x01 JB sequenceDecs_decode_bmi2_adjust_zero JEQ sequenceDecs_decode_bmi2_adjust_one CMPQ CX, $0x02 JA 
sequenceDecs_decode_bmi2_adjust_three JMP sequenceDecs_decode_bmi2_adjust_two sequenceDecs_decode_bmi2_adjust_zero: MOVQ R10, R13 JMP sequenceDecs_decode_bmi2_adjust_test_temp_valid sequenceDecs_decode_bmi2_adjust_one: MOVQ R11, R13 JMP sequenceDecs_decode_bmi2_adjust_test_temp_valid sequenceDecs_decode_bmi2_adjust_two: MOVQ R12, R13 JMP sequenceDecs_decode_bmi2_adjust_test_temp_valid sequenceDecs_decode_bmi2_adjust_three: LEAQ -1(R10), R13 sequenceDecs_decode_bmi2_adjust_test_temp_valid: TESTQ R13, R13 JNZ sequenceDecs_decode_bmi2_adjust_temp_valid MOVQ $0x00000001, R13 sequenceDecs_decode_bmi2_adjust_temp_valid: CMPQ CX, $0x01 CMOVQNE R11, R12 MOVQ R10, R11 MOVQ R13, R10 MOVQ R13, CX sequenceDecs_decode_bmi2_adjust_end: MOVQ CX, 16(R9) // Check values MOVQ 8(R9), R13 MOVQ (R9), R14 LEAQ (R13)(R14*1), R15 MOVQ s+0(FP), BP ADDQ R15, 256(BP) MOVQ ctx+16(FP), R15 SUBQ R14, 128(R15) JS error_not_enough_literals CMPQ R13, $0x00020002 JA sequenceDecs_decode_bmi2_error_match_len_too_big TESTQ CX, CX JNZ sequenceDecs_decode_bmi2_match_len_ofs_ok TESTQ R13, R13 JNZ sequenceDecs_decode_bmi2_error_match_len_ofs_mismatch sequenceDecs_decode_bmi2_match_len_ofs_ok: ADDQ $0x18, R9 MOVQ ctx+16(FP), CX DECQ 96(CX) JNS sequenceDecs_decode_bmi2_main_loop MOVQ s+0(FP), CX MOVQ R10, 144(CX) MOVQ R11, 152(CX) MOVQ R12, 160(CX) MOVQ br+8(FP), CX MOVQ AX, 32(CX) MOVB DL, 40(CX) MOVQ BX, 24(CX) // Return success MOVQ $0x00000000, ret+24(FP) RET // Return with match length error sequenceDecs_decode_bmi2_error_match_len_ofs_mismatch: MOVQ $0x00000001, ret+24(FP) RET // Return with match too long error sequenceDecs_decode_bmi2_error_match_len_too_big: MOVQ $0x00000002, ret+24(FP) RET // Return with match offset too long error MOVQ $0x00000003, ret+24(FP) RET // Return with not enough literals error error_not_enough_literals: MOVQ $0x00000004, ret+24(FP) RET // Return with not enough output space error MOVQ $0x00000005, ret+24(FP) RET // func sequenceDecs_decode_56_bmi2(s *sequenceDecs, br 
*bitReader, ctx *decodeAsmContext) int // Requires: BMI, BMI2, CMOV TEXT ·sequenceDecs_decode_56_bmi2(SB), $8-32 MOVQ br+8(FP), CX MOVQ 32(CX), AX MOVBQZX 40(CX), DX MOVQ 24(CX), BX MOVQ (CX), CX ADDQ BX, CX MOVQ CX, (SP) MOVQ ctx+16(FP), CX MOVQ 72(CX), SI MOVQ 80(CX), DI MOVQ 88(CX), R8 MOVQ 104(CX), R9 MOVQ s+0(FP), CX MOVQ 144(CX), R10 MOVQ 152(CX), R11 MOVQ 160(CX), R12 sequenceDecs_decode_56_bmi2_main_loop: MOVQ (SP), R13 // Fill bitreader to have enough for the offset and match length. CMPQ BX, $0x08 JL sequenceDecs_decode_56_bmi2_fill_byte_by_byte MOVQ DX, CX SHRQ $0x03, CX SUBQ CX, R13 MOVQ (R13), AX SUBQ CX, BX ANDQ $0x07, DX JMP sequenceDecs_decode_56_bmi2_fill_end sequenceDecs_decode_56_bmi2_fill_byte_by_byte: CMPQ BX, $0x00 JLE sequenceDecs_decode_56_bmi2_fill_end CMPQ DX, $0x07 JLE sequenceDecs_decode_56_bmi2_fill_end SHLQ $0x08, AX SUBQ $0x01, R13 SUBQ $0x01, BX SUBQ $0x08, DX MOVBQZX (R13), CX ORQ CX, AX JMP sequenceDecs_decode_56_bmi2_fill_byte_by_byte sequenceDecs_decode_56_bmi2_fill_end: // Update offset MOVQ $0x00000808, CX BEXTRQ CX, R8, R14 MOVQ AX, R15 LEAQ (DX)(R14*1), CX ROLQ CL, R15 BZHIQ R14, R15, R15 MOVQ CX, DX MOVQ R8, CX SHRQ $0x20, CX ADDQ R15, CX MOVQ CX, 16(R9) // Update match length MOVQ $0x00000808, CX BEXTRQ CX, DI, R14 MOVQ AX, R15 LEAQ (DX)(R14*1), CX ROLQ CL, R15 BZHIQ R14, R15, R15 MOVQ CX, DX MOVQ DI, CX SHRQ $0x20, CX ADDQ R15, CX MOVQ CX, 8(R9) // Update literal length MOVQ $0x00000808, CX BEXTRQ CX, SI, R14 MOVQ AX, R15 LEAQ (DX)(R14*1), CX ROLQ CL, R15 BZHIQ R14, R15, R15 MOVQ CX, DX MOVQ SI, CX SHRQ $0x20, CX ADDQ R15, CX MOVQ CX, (R9) // Fill bitreader for state updates MOVQ R13, (SP) MOVQ $0x00000808, CX BEXTRQ CX, R8, R13 MOVQ ctx+16(FP), CX CMPQ 96(CX), $0x00 JZ sequenceDecs_decode_56_bmi2_skip_update LEAQ (SI)(DI*1), R14 ADDQ R8, R14 MOVBQZX R14, R14 LEAQ (DX)(R14*1), CX MOVQ AX, R15 MOVQ CX, DX ROLQ CL, R15 BZHIQ R14, R15, R15 // Update Offset State BZHIQ R8, R15, CX SHRXQ R8, R15, R15 MOVQ $0x00001010, R14 
BEXTRQ R14, R8, R8 ADDQ CX, R8 // Load ctx.ofTable MOVQ ctx+16(FP), CX MOVQ 48(CX), CX MOVQ (CX)(R8*8), R8 // Update Match Length State BZHIQ DI, R15, CX SHRXQ DI, R15, R15 MOVQ $0x00001010, R14 BEXTRQ R14, DI, DI ADDQ CX, DI // Load ctx.mlTable MOVQ ctx+16(FP), CX MOVQ 24(CX), CX MOVQ (CX)(DI*8), DI // Update Literal Length State BZHIQ SI, R15, CX MOVQ $0x00001010, R14 BEXTRQ R14, SI, SI ADDQ CX, SI // Load ctx.llTable MOVQ ctx+16(FP), CX MOVQ (CX), CX MOVQ (CX)(SI*8), SI sequenceDecs_decode_56_bmi2_skip_update: // Adjust offset MOVQ 16(R9), CX CMPQ R13, $0x01 JBE sequenceDecs_decode_56_bmi2_adjust_offsetB_1_or_0 MOVQ R11, R12 MOVQ R10, R11 MOVQ CX, R10 JMP sequenceDecs_decode_56_bmi2_adjust_end sequenceDecs_decode_56_bmi2_adjust_offsetB_1_or_0: CMPQ (R9), $0x00000000 JNE sequenceDecs_decode_56_bmi2_adjust_offset_maybezero INCQ CX JMP sequenceDecs_decode_56_bmi2_adjust_offset_nonzero sequenceDecs_decode_56_bmi2_adjust_offset_maybezero: TESTQ CX, CX JNZ sequenceDecs_decode_56_bmi2_adjust_offset_nonzero MOVQ R10, CX JMP sequenceDecs_decode_56_bmi2_adjust_end sequenceDecs_decode_56_bmi2_adjust_offset_nonzero: CMPQ CX, $0x01 JB sequenceDecs_decode_56_bmi2_adjust_zero JEQ sequenceDecs_decode_56_bmi2_adjust_one CMPQ CX, $0x02 JA sequenceDecs_decode_56_bmi2_adjust_three JMP sequenceDecs_decode_56_bmi2_adjust_two sequenceDecs_decode_56_bmi2_adjust_zero: MOVQ R10, R13 JMP sequenceDecs_decode_56_bmi2_adjust_test_temp_valid sequenceDecs_decode_56_bmi2_adjust_one: MOVQ R11, R13 JMP sequenceDecs_decode_56_bmi2_adjust_test_temp_valid sequenceDecs_decode_56_bmi2_adjust_two: MOVQ R12, R13 JMP sequenceDecs_decode_56_bmi2_adjust_test_temp_valid sequenceDecs_decode_56_bmi2_adjust_three: LEAQ -1(R10), R13 sequenceDecs_decode_56_bmi2_adjust_test_temp_valid: TESTQ R13, R13 JNZ sequenceDecs_decode_56_bmi2_adjust_temp_valid MOVQ $0x00000001, R13 sequenceDecs_decode_56_bmi2_adjust_temp_valid: CMPQ CX, $0x01 CMOVQNE R11, R12 MOVQ R10, R11 MOVQ R13, R10 MOVQ R13, CX 
sequenceDecs_decode_56_bmi2_adjust_end: MOVQ CX, 16(R9) // Check values MOVQ 8(R9), R13 MOVQ (R9), R14 LEAQ (R13)(R14*1), R15 MOVQ s+0(FP), BP ADDQ R15, 256(BP) MOVQ ctx+16(FP), R15 SUBQ R14, 128(R15) JS error_not_enough_literals CMPQ R13, $0x00020002 JA sequenceDecs_decode_56_bmi2_error_match_len_too_big TESTQ CX, CX JNZ sequenceDecs_decode_56_bmi2_match_len_ofs_ok TESTQ R13, R13 JNZ sequenceDecs_decode_56_bmi2_error_match_len_ofs_mismatch sequenceDecs_decode_56_bmi2_match_len_ofs_ok: ADDQ $0x18, R9 MOVQ ctx+16(FP), CX DECQ 96(CX) JNS sequenceDecs_decode_56_bmi2_main_loop MOVQ s+0(FP), CX MOVQ R10, 144(CX) MOVQ R11, 152(CX) MOVQ R12, 160(CX) MOVQ br+8(FP), CX MOVQ AX, 32(CX) MOVB DL, 40(CX) MOVQ BX, 24(CX) // Return success MOVQ $0x00000000, ret+24(FP) RET // Return with match length error sequenceDecs_decode_56_bmi2_error_match_len_ofs_mismatch: MOVQ $0x00000001, ret+24(FP) RET // Return with match too long error sequenceDecs_decode_56_bmi2_error_match_len_too_big: MOVQ $0x00000002, ret+24(FP) RET // Return with match offset too long error MOVQ $0x00000003, ret+24(FP) RET // Return with not enough literals error error_not_enough_literals: MOVQ $0x00000004, ret+24(FP) RET // Return with not enough output space error MOVQ $0x00000005, ret+24(FP) RET // func sequenceDecs_executeSimple_amd64(ctx *executeAsmContext) bool // Requires: SSE TEXT ·sequenceDecs_executeSimple_amd64(SB), $8-9 MOVQ ctx+0(FP), R10 MOVQ 8(R10), CX TESTQ CX, CX JZ empty_seqs MOVQ (R10), AX MOVQ 24(R10), DX MOVQ 32(R10), BX MOVQ 80(R10), SI MOVQ 104(R10), DI MOVQ 120(R10), R8 MOVQ 56(R10), R9 MOVQ 64(R10), R10 ADDQ R10, R9 // seqsBase += 24 * seqIndex LEAQ (DX)(DX*2), R11 SHLQ $0x03, R11 ADDQ R11, AX // outBase += outPosition ADDQ DI, BX main_loop: MOVQ (AX), R11 MOVQ 16(AX), R12 MOVQ 8(AX), R13 // Copy literals TESTQ R11, R11 JZ check_offset XORQ R14, R14 copy_1: MOVUPS (SI)(R14*1), X0 MOVUPS X0, (BX)(R14*1) ADDQ $0x10, R14 CMPQ R14, R11 JB copy_1 ADDQ R11, SI ADDQ R11, BX ADDQ R11, DI // 
Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize) check_offset: LEAQ (DI)(R10*1), R11 CMPQ R12, R11 JG error_match_off_too_big CMPQ R12, R8 JG error_match_off_too_big // Copy match from history MOVQ R12, R11 SUBQ DI, R11 JLS copy_match MOVQ R9, R14 SUBQ R11, R14 CMPQ R13, R11 JGE copy_all_from_history XORQ R11, R11 TESTQ $0x00000001, R13 JZ copy_4_word MOVB (R14)(R11*1), R12 MOVB R12, (BX)(R11*1) ADDQ $0x01, R11 copy_4_word: TESTQ $0x00000002, R13 JZ copy_4_dword MOVW (R14)(R11*1), R12 MOVW R12, (BX)(R11*1) ADDQ $0x02, R11 copy_4_dword: TESTQ $0x00000004, R13 JZ copy_4_qword MOVL (R14)(R11*1), R12 MOVL R12, (BX)(R11*1) ADDQ $0x04, R11 copy_4_qword: TESTQ $0x00000008, R13 JZ copy_4_test MOVQ (R14)(R11*1), R12 MOVQ R12, (BX)(R11*1) ADDQ $0x08, R11 JMP copy_4_test copy_4: MOVUPS (R14)(R11*1), X0 MOVUPS X0, (BX)(R11*1) ADDQ $0x10, R11 copy_4_test: CMPQ R11, R13 JB copy_4 ADDQ R13, DI ADDQ R13, BX ADDQ $0x18, AX INCQ DX CMPQ DX, CX JB main_loop JMP loop_finished copy_all_from_history: XORQ R15, R15 TESTQ $0x00000001, R11 JZ copy_5_word MOVB (R14)(R15*1), BP MOVB BP, (BX)(R15*1) ADDQ $0x01, R15 copy_5_word: TESTQ $0x00000002, R11 JZ copy_5_dword MOVW (R14)(R15*1), BP MOVW BP, (BX)(R15*1) ADDQ $0x02, R15 copy_5_dword: TESTQ $0x00000004, R11 JZ copy_5_qword MOVL (R14)(R15*1), BP MOVL BP, (BX)(R15*1) ADDQ $0x04, R15 copy_5_qword: TESTQ $0x00000008, R11 JZ copy_5_test MOVQ (R14)(R15*1), BP MOVQ BP, (BX)(R15*1) ADDQ $0x08, R15 JMP copy_5_test copy_5: MOVUPS (R14)(R15*1), X0 MOVUPS X0, (BX)(R15*1) ADDQ $0x10, R15 copy_5_test: CMPQ R15, R11 JB copy_5 ADDQ R11, BX ADDQ R11, DI SUBQ R11, R13 // Copy match from the current buffer copy_match: TESTQ R13, R13 JZ handle_loop MOVQ BX, R11 SUBQ R12, R11 // ml <= mo CMPQ R13, R12 JA copy_overlapping_match // Copy non-overlapping match ADDQ R13, DI MOVQ BX, R12 ADDQ R13, BX copy_2: MOVUPS (R11), X0 MOVUPS X0, (R12) ADDQ $0x10, R11 ADDQ $0x10, R12 SUBQ $0x10, R13 JHI copy_2 JMP handle_loop // Copy overlapping match 
copy_overlapping_match: ADDQ R13, DI copy_slow_3: MOVB (R11), R12 MOVB R12, (BX) INCQ R11 INCQ BX DECQ R13 JNZ copy_slow_3 handle_loop: ADDQ $0x18, AX INCQ DX CMPQ DX, CX JB main_loop loop_finished: // Return value MOVB $0x01, ret+8(FP) // Update the context MOVQ ctx+0(FP), AX MOVQ DX, 24(AX) MOVQ DI, 104(AX) MOVQ 80(AX), CX SUBQ CX, SI MOVQ SI, 112(AX) RET error_match_off_too_big: // Return value MOVB $0x00, ret+8(FP) // Update the context MOVQ ctx+0(FP), AX MOVQ DX, 24(AX) MOVQ DI, 104(AX) MOVQ 80(AX), CX SUBQ CX, SI MOVQ SI, 112(AX) RET empty_seqs: // Return value MOVB $0x01, ret+8(FP) RET // func sequenceDecs_executeSimple_safe_amd64(ctx *executeAsmContext) bool // Requires: SSE TEXT ·sequenceDecs_executeSimple_safe_amd64(SB), $8-9 MOVQ ctx+0(FP), R10 MOVQ 8(R10), CX TESTQ CX, CX JZ empty_seqs MOVQ (R10), AX MOVQ 24(R10), DX MOVQ 32(R10), BX MOVQ 80(R10), SI MOVQ 104(R10), DI MOVQ 120(R10), R8 MOVQ 56(R10), R9 MOVQ 64(R10), R10 ADDQ R10, R9 // seqsBase += 24 * seqIndex LEAQ (DX)(DX*2), R11 SHLQ $0x03, R11 ADDQ R11, AX // outBase += outPosition ADDQ DI, BX main_loop: MOVQ (AX), R11 MOVQ 16(AX), R12 MOVQ 8(AX), R13 // Copy literals TESTQ R11, R11 JZ check_offset XORQ R14, R14 TESTQ $0x00000001, R11 JZ copy_1_word MOVB (SI)(R14*1), R15 MOVB R15, (BX)(R14*1) ADDQ $0x01, R14 copy_1_word: TESTQ $0x00000002, R11 JZ copy_1_dword MOVW (SI)(R14*1), R15 MOVW R15, (BX)(R14*1) ADDQ $0x02, R14 copy_1_dword: TESTQ $0x00000004, R11 JZ copy_1_qword MOVL (SI)(R14*1), R15 MOVL R15, (BX)(R14*1) ADDQ $0x04, R14 copy_1_qword: TESTQ $0x00000008, R11 JZ copy_1_test MOVQ (SI)(R14*1), R15 MOVQ R15, (BX)(R14*1) ADDQ $0x08, R14 JMP copy_1_test copy_1: MOVUPS (SI)(R14*1), X0 MOVUPS X0, (BX)(R14*1) ADDQ $0x10, R14 copy_1_test: CMPQ R14, R11 JB copy_1 ADDQ R11, SI ADDQ R11, BX ADDQ R11, DI // Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize) check_offset: LEAQ (DI)(R10*1), R11 CMPQ R12, R11 JG error_match_off_too_big CMPQ R12, R8 JG error_match_off_too_big // Copy match from 
history MOVQ R12, R11 SUBQ DI, R11 JLS copy_match MOVQ R9, R14 SUBQ R11, R14 CMPQ R13, R11 JGE copy_all_from_history XORQ R11, R11 TESTQ $0x00000001, R13 JZ copy_4_word MOVB (R14)(R11*1), R12 MOVB R12, (BX)(R11*1) ADDQ $0x01, R11 copy_4_word: TESTQ $0x00000002, R13 JZ copy_4_dword MOVW (R14)(R11*1), R12 MOVW R12, (BX)(R11*1) ADDQ $0x02, R11 copy_4_dword: TESTQ $0x00000004, R13 JZ copy_4_qword MOVL (R14)(R11*1), R12 MOVL R12, (BX)(R11*1) ADDQ $0x04, R11 copy_4_qword: TESTQ $0x00000008, R13 JZ copy_4_test MOVQ (R14)(R11*1), R12 MOVQ R12, (BX)(R11*1) ADDQ $0x08, R11 JMP copy_4_test copy_4: MOVUPS (R14)(R11*1), X0 MOVUPS X0, (BX)(R11*1) ADDQ $0x10, R11 copy_4_test: CMPQ R11, R13 JB copy_4 ADDQ R13, DI ADDQ R13, BX ADDQ $0x18, AX INCQ DX CMPQ DX, CX JB main_loop JMP loop_finished copy_all_from_history: XORQ R15, R15 TESTQ $0x00000001, R11 JZ copy_5_word MOVB (R14)(R15*1), BP MOVB BP, (BX)(R15*1) ADDQ $0x01, R15 copy_5_word: TESTQ $0x00000002, R11 JZ copy_5_dword MOVW (R14)(R15*1), BP MOVW BP, (BX)(R15*1) ADDQ $0x02, R15 copy_5_dword: TESTQ $0x00000004, R11 JZ copy_5_qword MOVL (R14)(R15*1), BP MOVL BP, (BX)(R15*1) ADDQ $0x04, R15 copy_5_qword: TESTQ $0x00000008, R11 JZ copy_5_test MOVQ (R14)(R15*1), BP MOVQ BP, (BX)(R15*1) ADDQ $0x08, R15 JMP copy_5_test copy_5: MOVUPS (R14)(R15*1), X0 MOVUPS X0, (BX)(R15*1) ADDQ $0x10, R15 copy_5_test: CMPQ R15, R11 JB copy_5 ADDQ R11, BX ADDQ R11, DI SUBQ R11, R13 // Copy match from the current buffer copy_match: TESTQ R13, R13 JZ handle_loop MOVQ BX, R11 SUBQ R12, R11 // ml <= mo CMPQ R13, R12 JA copy_overlapping_match // Copy non-overlapping match ADDQ R13, DI XORQ R12, R12 TESTQ $0x00000001, R13 JZ copy_2_word MOVB (R11)(R12*1), R14 MOVB R14, (BX)(R12*1) ADDQ $0x01, R12 copy_2_word: TESTQ $0x00000002, R13 JZ copy_2_dword MOVW (R11)(R12*1), R14 MOVW R14, (BX)(R12*1) ADDQ $0x02, R12 copy_2_dword: TESTQ $0x00000004, R13 JZ copy_2_qword MOVL (R11)(R12*1), R14 MOVL R14, (BX)(R12*1) ADDQ $0x04, R12 copy_2_qword: TESTQ $0x00000008, R13 JZ 
copy_2_test MOVQ (R11)(R12*1), R14 MOVQ R14, (BX)(R12*1) ADDQ $0x08, R12 JMP copy_2_test copy_2: MOVUPS (R11)(R12*1), X0 MOVUPS X0, (BX)(R12*1) ADDQ $0x10, R12 copy_2_test: CMPQ R12, R13 JB copy_2 ADDQ R13, BX JMP handle_loop // Copy overlapping match copy_overlapping_match: ADDQ R13, DI copy_slow_3: MOVB (R11), R12 MOVB R12, (BX) INCQ R11 INCQ BX DECQ R13 JNZ copy_slow_3 handle_loop: ADDQ $0x18, AX INCQ DX CMPQ DX, CX JB main_loop loop_finished: // Return value MOVB $0x01, ret+8(FP) // Update the context MOVQ ctx+0(FP), AX MOVQ DX, 24(AX) MOVQ DI, 104(AX) MOVQ 80(AX), CX SUBQ CX, SI MOVQ SI, 112(AX) RET error_match_off_too_big: // Return value MOVB $0x00, ret+8(FP) // Update the context MOVQ ctx+0(FP), AX MOVQ DX, 24(AX) MOVQ DI, 104(AX) MOVQ 80(AX), CX SUBQ CX, SI MOVQ SI, 112(AX) RET empty_seqs: // Return value MOVB $0x01, ret+8(FP) RET // func sequenceDecs_decodeSync_amd64(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int // Requires: CMOV, SSE TEXT ·sequenceDecs_decodeSync_amd64(SB), $64-32 MOVQ br+8(FP), AX MOVQ 32(AX), DX MOVBQZX 40(AX), BX MOVQ 24(AX), SI MOVQ (AX), AX ADDQ SI, AX MOVQ AX, (SP) MOVQ ctx+16(FP), AX MOVQ 72(AX), DI MOVQ 80(AX), R8 MOVQ 88(AX), R9 MOVQ 112(AX), R10 MOVQ 128(AX), CX MOVQ CX, 32(SP) MOVQ 144(AX), R11 MOVQ 136(AX), R12 MOVQ 200(AX), CX MOVQ CX, 56(SP) MOVQ 176(AX), CX MOVQ CX, 48(SP) MOVQ 184(AX), AX MOVQ AX, 40(SP) MOVQ 40(SP), AX ADDQ AX, 48(SP) // Calculate pointer to s.out[cap(s.out)] (a past-end pointer) ADDQ R10, 32(SP) // outBase += outPosition ADDQ R12, R10 sequenceDecs_decodeSync_amd64_main_loop: MOVQ (SP), R13 // Fill bitreader to have enough for the offset and match length. 
CMPQ SI, $0x08 JL sequenceDecs_decodeSync_amd64_fill_byte_by_byte MOVQ BX, AX SHRQ $0x03, AX SUBQ AX, R13 MOVQ (R13), DX SUBQ AX, SI ANDQ $0x07, BX JMP sequenceDecs_decodeSync_amd64_fill_end sequenceDecs_decodeSync_amd64_fill_byte_by_byte: CMPQ SI, $0x00 JLE sequenceDecs_decodeSync_amd64_fill_end CMPQ BX, $0x07 JLE sequenceDecs_decodeSync_amd64_fill_end SHLQ $0x08, DX SUBQ $0x01, R13 SUBQ $0x01, SI SUBQ $0x08, BX MOVBQZX (R13), AX ORQ AX, DX JMP sequenceDecs_decodeSync_amd64_fill_byte_by_byte sequenceDecs_decodeSync_amd64_fill_end: // Update offset MOVQ R9, AX MOVQ BX, CX MOVQ DX, R14 SHLQ CL, R14 MOVB AH, CL ADDQ CX, BX NEGL CX SHRQ CL, R14 SHRQ $0x20, AX TESTQ CX, CX CMOVQEQ CX, R14 ADDQ R14, AX MOVQ AX, 8(SP) // Update match length MOVQ R8, AX MOVQ BX, CX MOVQ DX, R14 SHLQ CL, R14 MOVB AH, CL ADDQ CX, BX NEGL CX SHRQ CL, R14 SHRQ $0x20, AX TESTQ CX, CX CMOVQEQ CX, R14 ADDQ R14, AX MOVQ AX, 16(SP) // Fill bitreader to have enough for the remaining CMPQ SI, $0x08 JL sequenceDecs_decodeSync_amd64_fill_2_byte_by_byte MOVQ BX, AX SHRQ $0x03, AX SUBQ AX, R13 MOVQ (R13), DX SUBQ AX, SI ANDQ $0x07, BX JMP sequenceDecs_decodeSync_amd64_fill_2_end sequenceDecs_decodeSync_amd64_fill_2_byte_by_byte: CMPQ SI, $0x00 JLE sequenceDecs_decodeSync_amd64_fill_2_end CMPQ BX, $0x07 JLE sequenceDecs_decodeSync_amd64_fill_2_end SHLQ $0x08, DX SUBQ $0x01, R13 SUBQ $0x01, SI SUBQ $0x08, BX MOVBQZX (R13), AX ORQ AX, DX JMP sequenceDecs_decodeSync_amd64_fill_2_byte_by_byte sequenceDecs_decodeSync_amd64_fill_2_end: // Update literal length MOVQ DI, AX MOVQ BX, CX MOVQ DX, R14 SHLQ CL, R14 MOVB AH, CL ADDQ CX, BX NEGL CX SHRQ CL, R14 SHRQ $0x20, AX TESTQ CX, CX CMOVQEQ CX, R14 ADDQ R14, AX MOVQ AX, 24(SP) // Fill bitreader for state updates MOVQ R13, (SP) MOVQ R9, AX SHRQ $0x08, AX MOVBQZX AL, AX MOVQ ctx+16(FP), CX CMPQ 96(CX), $0x00 JZ sequenceDecs_decodeSync_amd64_skip_update // Update Literal Length State MOVBQZX DI, R13 SHRQ $0x10, DI MOVWQZX DI, DI CMPQ R13, $0x00 JZ 
sequenceDecs_decodeSync_amd64_llState_updateState_skip_zero MOVQ BX, CX ADDQ R13, BX MOVQ DX, R14 SHLQ CL, R14 MOVQ R13, CX NEGQ CX SHRQ CL, R14 ADDQ R14, DI sequenceDecs_decodeSync_amd64_llState_updateState_skip_zero: // Load ctx.llTable MOVQ ctx+16(FP), CX MOVQ (CX), CX MOVQ (CX)(DI*8), DI // Update Match Length State MOVBQZX R8, R13 SHRQ $0x10, R8 MOVWQZX R8, R8 CMPQ R13, $0x00 JZ sequenceDecs_decodeSync_amd64_mlState_updateState_skip_zero MOVQ BX, CX ADDQ R13, BX MOVQ DX, R14 SHLQ CL, R14 MOVQ R13, CX NEGQ CX SHRQ CL, R14 ADDQ R14, R8 sequenceDecs_decodeSync_amd64_mlState_updateState_skip_zero: // Load ctx.mlTable MOVQ ctx+16(FP), CX MOVQ 24(CX), CX MOVQ (CX)(R8*8), R8 // Update Offset State MOVBQZX R9, R13 SHRQ $0x10, R9 MOVWQZX R9, R9 CMPQ R13, $0x00 JZ sequenceDecs_decodeSync_amd64_ofState_updateState_skip_zero MOVQ BX, CX ADDQ R13, BX MOVQ DX, R14 SHLQ CL, R14 MOVQ R13, CX NEGQ CX SHRQ CL, R14 ADDQ R14, R9 sequenceDecs_decodeSync_amd64_ofState_updateState_skip_zero: // Load ctx.ofTable MOVQ ctx+16(FP), CX MOVQ 48(CX), CX MOVQ (CX)(R9*8), R9 sequenceDecs_decodeSync_amd64_skip_update: // Adjust offset MOVQ s+0(FP), CX MOVQ 8(SP), R13 CMPQ AX, $0x01 JBE sequenceDecs_decodeSync_amd64_adjust_offsetB_1_or_0 MOVUPS 144(CX), X0 MOVQ R13, 144(CX) MOVUPS X0, 152(CX) JMP sequenceDecs_decodeSync_amd64_adjust_end sequenceDecs_decodeSync_amd64_adjust_offsetB_1_or_0: CMPQ 24(SP), $0x00000000 JNE sequenceDecs_decodeSync_amd64_adjust_offset_maybezero INCQ R13 JMP sequenceDecs_decodeSync_amd64_adjust_offset_nonzero sequenceDecs_decodeSync_amd64_adjust_offset_maybezero: TESTQ R13, R13 JNZ sequenceDecs_decodeSync_amd64_adjust_offset_nonzero MOVQ 144(CX), R13 JMP sequenceDecs_decodeSync_amd64_adjust_end sequenceDecs_decodeSync_amd64_adjust_offset_nonzero: MOVQ R13, AX XORQ R14, R14 MOVQ $-1, R15 CMPQ R13, $0x03 CMOVQEQ R14, AX CMOVQEQ R15, R14 LEAQ 144(CX), R15 ADDQ (R15)(AX*8), R14 JNZ sequenceDecs_decodeSync_amd64_adjust_temp_valid MOVQ $0x00000001, R14 
sequenceDecs_decodeSync_amd64_adjust_temp_valid: CMPQ R13, $0x01 JZ sequenceDecs_decodeSync_amd64_adjust_skip MOVQ 152(CX), AX MOVQ AX, 160(CX) sequenceDecs_decodeSync_amd64_adjust_skip: MOVQ 144(CX), AX MOVQ AX, 152(CX) MOVQ R14, 144(CX) MOVQ R14, R13 sequenceDecs_decodeSync_amd64_adjust_end: MOVQ R13, 8(SP) // Check values MOVQ 16(SP), AX MOVQ 24(SP), CX LEAQ (AX)(CX*1), R14 MOVQ s+0(FP), R15 ADDQ R14, 256(R15) MOVQ ctx+16(FP), R14 SUBQ CX, 104(R14) JS error_not_enough_literals CMPQ AX, $0x00020002 JA sequenceDecs_decodeSync_amd64_error_match_len_too_big TESTQ R13, R13 JNZ sequenceDecs_decodeSync_amd64_match_len_ofs_ok TESTQ AX, AX JNZ sequenceDecs_decodeSync_amd64_error_match_len_ofs_mismatch sequenceDecs_decodeSync_amd64_match_len_ofs_ok: MOVQ 24(SP), AX MOVQ 8(SP), CX MOVQ 16(SP), R13 // Check if we have enough space in s.out LEAQ (AX)(R13*1), R14 ADDQ R10, R14 CMPQ R14, 32(SP) JA error_not_enough_space // Copy literals TESTQ AX, AX JZ check_offset XORQ R14, R14 copy_1: MOVUPS (R11)(R14*1), X0 MOVUPS X0, (R10)(R14*1) ADDQ $0x10, R14 CMPQ R14, AX JB copy_1 ADDQ AX, R11 ADDQ AX, R10 ADDQ AX, R12 // Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize) check_offset: MOVQ R12, AX ADDQ 40(SP), AX CMPQ CX, AX JG error_match_off_too_big CMPQ CX, 56(SP) JG error_match_off_too_big // Copy match from history MOVQ CX, AX SUBQ R12, AX JLS copy_match MOVQ 48(SP), R14 SUBQ AX, R14 CMPQ R13, AX JGE copy_all_from_history XORQ AX, AX TESTQ $0x00000001, R13 JZ copy_4_word MOVB (R14)(AX*1), CL MOVB CL, (R10)(AX*1) ADDQ $0x01, AX copy_4_word: TESTQ $0x00000002, R13 JZ copy_4_dword MOVW (R14)(AX*1), CX MOVW CX, (R10)(AX*1) ADDQ $0x02, AX copy_4_dword: TESTQ $0x00000004, R13 JZ copy_4_qword MOVL (R14)(AX*1), CX MOVL CX, (R10)(AX*1) ADDQ $0x04, AX copy_4_qword: TESTQ $0x00000008, R13 JZ copy_4_test MOVQ (R14)(AX*1), CX MOVQ CX, (R10)(AX*1) ADDQ $0x08, AX JMP copy_4_test copy_4: MOVUPS (R14)(AX*1), X0 MOVUPS X0, (R10)(AX*1) ADDQ $0x10, AX copy_4_test: CMPQ AX, R13 JB 
copy_4 ADDQ R13, R12 ADDQ R13, R10 JMP handle_loop JMP loop_finished copy_all_from_history: XORQ R15, R15 TESTQ $0x00000001, AX JZ copy_5_word MOVB (R14)(R15*1), BP MOVB BP, (R10)(R15*1) ADDQ $0x01, R15 copy_5_word: TESTQ $0x00000002, AX JZ copy_5_dword MOVW (R14)(R15*1), BP MOVW BP, (R10)(R15*1) ADDQ $0x02, R15 copy_5_dword: TESTQ $0x00000004, AX JZ copy_5_qword MOVL (R14)(R15*1), BP MOVL BP, (R10)(R15*1) ADDQ $0x04, R15 copy_5_qword: TESTQ $0x00000008, AX JZ copy_5_test MOVQ (R14)(R15*1), BP MOVQ BP, (R10)(R15*1) ADDQ $0x08, R15 JMP copy_5_test copy_5: MOVUPS (R14)(R15*1), X0 MOVUPS X0, (R10)(R15*1) ADDQ $0x10, R15 copy_5_test: CMPQ R15, AX JB copy_5 ADDQ AX, R10 ADDQ AX, R12 SUBQ AX, R13 // Copy match from the current buffer copy_match: TESTQ R13, R13 JZ handle_loop MOVQ R10, AX SUBQ CX, AX // ml <= mo CMPQ R13, CX JA copy_overlapping_match // Copy non-overlapping match ADDQ R13, R12 MOVQ R10, CX ADDQ R13, R10 copy_2: MOVUPS (AX), X0 MOVUPS X0, (CX) ADDQ $0x10, AX ADDQ $0x10, CX SUBQ $0x10, R13 JHI copy_2 JMP handle_loop // Copy overlapping match copy_overlapping_match: ADDQ R13, R12 copy_slow_3: MOVB (AX), CL MOVB CL, (R10) INCQ AX INCQ R10 DECQ R13 JNZ copy_slow_3 handle_loop: MOVQ ctx+16(FP), AX DECQ 96(AX) JNS sequenceDecs_decodeSync_amd64_main_loop loop_finished: MOVQ br+8(FP), AX MOVQ DX, 32(AX) MOVB BL, 40(AX) MOVQ SI, 24(AX) // Update the context MOVQ ctx+16(FP), AX MOVQ R12, 136(AX) MOVQ 144(AX), CX SUBQ CX, R11 MOVQ R11, 168(AX) // Return success MOVQ $0x00000000, ret+24(FP) RET // Return with match length error sequenceDecs_decodeSync_amd64_error_match_len_ofs_mismatch: MOVQ 16(SP), AX MOVQ ctx+16(FP), CX MOVQ AX, 216(CX) MOVQ $0x00000001, ret+24(FP) RET // Return with match too long error sequenceDecs_decodeSync_amd64_error_match_len_too_big: MOVQ ctx+16(FP), AX MOVQ 16(SP), CX MOVQ CX, 216(AX) MOVQ $0x00000002, ret+24(FP) RET // Return with match offset too long error error_match_off_too_big: MOVQ ctx+16(FP), AX MOVQ 8(SP), CX MOVQ CX, 224(AX) MOVQ 
R12, 136(AX) MOVQ $0x00000003, ret+24(FP) RET // Return with not enough literals error error_not_enough_literals: MOVQ ctx+16(FP), AX MOVQ 24(SP), CX MOVQ CX, 208(AX) MOVQ $0x00000004, ret+24(FP) RET // Return with not enough output space error error_not_enough_space: MOVQ ctx+16(FP), AX MOVQ 24(SP), CX MOVQ CX, 208(AX) MOVQ 16(SP), CX MOVQ CX, 216(AX) MOVQ R12, 136(AX) MOVQ $0x00000005, ret+24(FP) RET // func sequenceDecs_decodeSync_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int // Requires: BMI, BMI2, CMOV, SSE TEXT ·sequenceDecs_decodeSync_bmi2(SB), $64-32 MOVQ br+8(FP), CX MOVQ 32(CX), AX MOVBQZX 40(CX), DX MOVQ 24(CX), BX MOVQ (CX), CX ADDQ BX, CX MOVQ CX, (SP) MOVQ ctx+16(FP), CX MOVQ 72(CX), SI MOVQ 80(CX), DI MOVQ 88(CX), R8 MOVQ 112(CX), R9 MOVQ 128(CX), R10 MOVQ R10, 32(SP) MOVQ 144(CX), R10 MOVQ 136(CX), R11 MOVQ 200(CX), R12 MOVQ R12, 56(SP) MOVQ 176(CX), R12 MOVQ R12, 48(SP) MOVQ 184(CX), CX MOVQ CX, 40(SP) MOVQ 40(SP), CX ADDQ CX, 48(SP) // Calculate poiter to s.out[cap(s.out)] (a past-end pointer) ADDQ R9, 32(SP) // outBase += outPosition ADDQ R11, R9 sequenceDecs_decodeSync_bmi2_main_loop: MOVQ (SP), R12 // Fill bitreader to have enough for the offset and match length. 
CMPQ BX, $0x08 JL sequenceDecs_decodeSync_bmi2_fill_byte_by_byte MOVQ DX, CX SHRQ $0x03, CX SUBQ CX, R12 MOVQ (R12), AX SUBQ CX, BX ANDQ $0x07, DX JMP sequenceDecs_decodeSync_bmi2_fill_end sequenceDecs_decodeSync_bmi2_fill_byte_by_byte: CMPQ BX, $0x00 JLE sequenceDecs_decodeSync_bmi2_fill_end CMPQ DX, $0x07 JLE sequenceDecs_decodeSync_bmi2_fill_end SHLQ $0x08, AX SUBQ $0x01, R12 SUBQ $0x01, BX SUBQ $0x08, DX MOVBQZX (R12), CX ORQ CX, AX JMP sequenceDecs_decodeSync_bmi2_fill_byte_by_byte sequenceDecs_decodeSync_bmi2_fill_end: // Update offset MOVQ $0x00000808, CX BEXTRQ CX, R8, R13 MOVQ AX, R14 LEAQ (DX)(R13*1), CX ROLQ CL, R14 BZHIQ R13, R14, R14 MOVQ CX, DX MOVQ R8, CX SHRQ $0x20, CX ADDQ R14, CX MOVQ CX, 8(SP) // Update match length MOVQ $0x00000808, CX BEXTRQ CX, DI, R13 MOVQ AX, R14 LEAQ (DX)(R13*1), CX ROLQ CL, R14 BZHIQ R13, R14, R14 MOVQ CX, DX MOVQ DI, CX SHRQ $0x20, CX ADDQ R14, CX MOVQ CX, 16(SP) // Fill bitreader to have enough for the remaining CMPQ BX, $0x08 JL sequenceDecs_decodeSync_bmi2_fill_2_byte_by_byte MOVQ DX, CX SHRQ $0x03, CX SUBQ CX, R12 MOVQ (R12), AX SUBQ CX, BX ANDQ $0x07, DX JMP sequenceDecs_decodeSync_bmi2_fill_2_end sequenceDecs_decodeSync_bmi2_fill_2_byte_by_byte: CMPQ BX, $0x00 JLE sequenceDecs_decodeSync_bmi2_fill_2_end CMPQ DX, $0x07 JLE sequenceDecs_decodeSync_bmi2_fill_2_end SHLQ $0x08, AX SUBQ $0x01, R12 SUBQ $0x01, BX SUBQ $0x08, DX MOVBQZX (R12), CX ORQ CX, AX JMP sequenceDecs_decodeSync_bmi2_fill_2_byte_by_byte sequenceDecs_decodeSync_bmi2_fill_2_end: // Update literal length MOVQ $0x00000808, CX BEXTRQ CX, SI, R13 MOVQ AX, R14 LEAQ (DX)(R13*1), CX ROLQ CL, R14 BZHIQ R13, R14, R14 MOVQ CX, DX MOVQ SI, CX SHRQ $0x20, CX ADDQ R14, CX MOVQ CX, 24(SP) // Fill bitreader for state updates MOVQ R12, (SP) MOVQ $0x00000808, CX BEXTRQ CX, R8, R12 MOVQ ctx+16(FP), CX CMPQ 96(CX), $0x00 JZ sequenceDecs_decodeSync_bmi2_skip_update LEAQ (SI)(DI*1), R13 ADDQ R8, R13 MOVBQZX R13, R13 LEAQ (DX)(R13*1), CX MOVQ AX, R14 MOVQ CX, DX ROLQ CL, R14 
BZHIQ R13, R14, R14 // Update Offset State BZHIQ R8, R14, CX SHRXQ R8, R14, R14 MOVQ $0x00001010, R13 BEXTRQ R13, R8, R8 ADDQ CX, R8 // Load ctx.ofTable MOVQ ctx+16(FP), CX MOVQ 48(CX), CX MOVQ (CX)(R8*8), R8 // Update Match Length State BZHIQ DI, R14, CX SHRXQ DI, R14, R14 MOVQ $0x00001010, R13 BEXTRQ R13, DI, DI ADDQ CX, DI // Load ctx.mlTable MOVQ ctx+16(FP), CX MOVQ 24(CX), CX MOVQ (CX)(DI*8), DI // Update Literal Length State BZHIQ SI, R14, CX MOVQ $0x00001010, R13 BEXTRQ R13, SI, SI ADDQ CX, SI // Load ctx.llTable MOVQ ctx+16(FP), CX MOVQ (CX), CX MOVQ (CX)(SI*8), SI sequenceDecs_decodeSync_bmi2_skip_update: // Adjust offset MOVQ s+0(FP), CX MOVQ 8(SP), R13 CMPQ R12, $0x01 JBE sequenceDecs_decodeSync_bmi2_adjust_offsetB_1_or_0 MOVUPS 144(CX), X0 MOVQ R13, 144(CX) MOVUPS X0, 152(CX) JMP sequenceDecs_decodeSync_bmi2_adjust_end sequenceDecs_decodeSync_bmi2_adjust_offsetB_1_or_0: CMPQ 24(SP), $0x00000000 JNE sequenceDecs_decodeSync_bmi2_adjust_offset_maybezero INCQ R13 JMP sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero sequenceDecs_decodeSync_bmi2_adjust_offset_maybezero: TESTQ R13, R13 JNZ sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero MOVQ 144(CX), R13 JMP sequenceDecs_decodeSync_bmi2_adjust_end sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero: MOVQ R13, R12 XORQ R14, R14 MOVQ $-1, R15 CMPQ R13, $0x03 CMOVQEQ R14, R12 CMOVQEQ R15, R14 LEAQ 144(CX), R15 ADDQ (R15)(R12*8), R14 JNZ sequenceDecs_decodeSync_bmi2_adjust_temp_valid MOVQ $0x00000001, R14 sequenceDecs_decodeSync_bmi2_adjust_temp_valid: CMPQ R13, $0x01 JZ sequenceDecs_decodeSync_bmi2_adjust_skip MOVQ 152(CX), R12 MOVQ R12, 160(CX) sequenceDecs_decodeSync_bmi2_adjust_skip: MOVQ 144(CX), R12 MOVQ R12, 152(CX) MOVQ R14, 144(CX) MOVQ R14, R13 sequenceDecs_decodeSync_bmi2_adjust_end: MOVQ R13, 8(SP) // Check values MOVQ 16(SP), CX MOVQ 24(SP), R12 LEAQ (CX)(R12*1), R14 MOVQ s+0(FP), R15 ADDQ R14, 256(R15) MOVQ ctx+16(FP), R14 SUBQ R12, 104(R14) JS error_not_enough_literals CMPQ CX, $0x00020002 JA 
sequenceDecs_decodeSync_bmi2_error_match_len_too_big TESTQ R13, R13 JNZ sequenceDecs_decodeSync_bmi2_match_len_ofs_ok TESTQ CX, CX JNZ sequenceDecs_decodeSync_bmi2_error_match_len_ofs_mismatch sequenceDecs_decodeSync_bmi2_match_len_ofs_ok: MOVQ 24(SP), CX MOVQ 8(SP), R12 MOVQ 16(SP), R13 // Check if we have enough space in s.out LEAQ (CX)(R13*1), R14 ADDQ R9, R14 CMPQ R14, 32(SP) JA error_not_enough_space // Copy literals TESTQ CX, CX JZ check_offset XORQ R14, R14 copy_1: MOVUPS (R10)(R14*1), X0 MOVUPS X0, (R9)(R14*1) ADDQ $0x10, R14 CMPQ R14, CX JB copy_1 ADDQ CX, R10 ADDQ CX, R9 ADDQ CX, R11 // Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize) check_offset: MOVQ R11, CX ADDQ 40(SP), CX CMPQ R12, CX JG error_match_off_too_big CMPQ R12, 56(SP) JG error_match_off_too_big // Copy match from history MOVQ R12, CX SUBQ R11, CX JLS copy_match MOVQ 48(SP), R14 SUBQ CX, R14 CMPQ R13, CX JGE copy_all_from_history XORQ CX, CX TESTQ $0x00000001, R13 JZ copy_4_word MOVB (R14)(CX*1), R12 MOVB R12, (R9)(CX*1) ADDQ $0x01, CX copy_4_word: TESTQ $0x00000002, R13 JZ copy_4_dword MOVW (R14)(CX*1), R12 MOVW R12, (R9)(CX*1) ADDQ $0x02, CX copy_4_dword: TESTQ $0x00000004, R13 JZ copy_4_qword MOVL (R14)(CX*1), R12 MOVL R12, (R9)(CX*1) ADDQ $0x04, CX copy_4_qword: TESTQ $0x00000008, R13 JZ copy_4_test MOVQ (R14)(CX*1), R12 MOVQ R12, (R9)(CX*1) ADDQ $0x08, CX JMP copy_4_test copy_4: MOVUPS (R14)(CX*1), X0 MOVUPS X0, (R9)(CX*1) ADDQ $0x10, CX copy_4_test: CMPQ CX, R13 JB copy_4 ADDQ R13, R11 ADDQ R13, R9 JMP handle_loop JMP loop_finished copy_all_from_history: XORQ R15, R15 TESTQ $0x00000001, CX JZ copy_5_word MOVB (R14)(R15*1), BP MOVB BP, (R9)(R15*1) ADDQ $0x01, R15 copy_5_word: TESTQ $0x00000002, CX JZ copy_5_dword MOVW (R14)(R15*1), BP MOVW BP, (R9)(R15*1) ADDQ $0x02, R15 copy_5_dword: TESTQ $0x00000004, CX JZ copy_5_qword MOVL (R14)(R15*1), BP MOVL BP, (R9)(R15*1) ADDQ $0x04, R15 copy_5_qword: TESTQ $0x00000008, CX JZ copy_5_test MOVQ (R14)(R15*1), BP MOVQ BP, 
(R9)(R15*1) ADDQ $0x08, R15 JMP copy_5_test copy_5: MOVUPS (R14)(R15*1), X0 MOVUPS X0, (R9)(R15*1) ADDQ $0x10, R15 copy_5_test: CMPQ R15, CX JB copy_5 ADDQ CX, R9 ADDQ CX, R11 SUBQ CX, R13 // Copy match from the current buffer copy_match: TESTQ R13, R13 JZ handle_loop MOVQ R9, CX SUBQ R12, CX // ml <= mo CMPQ R13, R12 JA copy_overlapping_match // Copy non-overlapping match ADDQ R13, R11 MOVQ R9, R12 ADDQ R13, R9 copy_2: MOVUPS (CX), X0 MOVUPS X0, (R12) ADDQ $0x10, CX ADDQ $0x10, R12 SUBQ $0x10, R13 JHI copy_2 JMP handle_loop // Copy overlapping match copy_overlapping_match: ADDQ R13, R11 copy_slow_3: MOVB (CX), R12 MOVB R12, (R9) INCQ CX INCQ R9 DECQ R13 JNZ copy_slow_3 handle_loop: MOVQ ctx+16(FP), CX DECQ 96(CX) JNS sequenceDecs_decodeSync_bmi2_main_loop loop_finished: MOVQ br+8(FP), CX MOVQ AX, 32(CX) MOVB DL, 40(CX) MOVQ BX, 24(CX) // Update the context MOVQ ctx+16(FP), AX MOVQ R11, 136(AX) MOVQ 144(AX), CX SUBQ CX, R10 MOVQ R10, 168(AX) // Return success MOVQ $0x00000000, ret+24(FP) RET // Return with match length error sequenceDecs_decodeSync_bmi2_error_match_len_ofs_mismatch: MOVQ 16(SP), AX MOVQ ctx+16(FP), CX MOVQ AX, 216(CX) MOVQ $0x00000001, ret+24(FP) RET // Return with match too long error sequenceDecs_decodeSync_bmi2_error_match_len_too_big: MOVQ ctx+16(FP), AX MOVQ 16(SP), CX MOVQ CX, 216(AX) MOVQ $0x00000002, ret+24(FP) RET // Return with match offset too long error error_match_off_too_big: MOVQ ctx+16(FP), AX MOVQ 8(SP), CX MOVQ CX, 224(AX) MOVQ R11, 136(AX) MOVQ $0x00000003, ret+24(FP) RET // Return with not enough literals error error_not_enough_literals: MOVQ ctx+16(FP), AX MOVQ 24(SP), CX MOVQ CX, 208(AX) MOVQ $0x00000004, ret+24(FP) RET // Return with not enough output space error error_not_enough_space: MOVQ ctx+16(FP), AX MOVQ 24(SP), CX MOVQ CX, 208(AX) MOVQ 16(SP), CX MOVQ CX, 216(AX) MOVQ R11, 136(AX) MOVQ $0x00000005, ret+24(FP) RET // func sequenceDecs_decodeSync_safe_amd64(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int 
// Requires: CMOV, SSE
//
// Decodes sequences from the bit reader and executes each one immediately:
// per iteration it reads offset, match length and literal length, updates the
// three decoding states, adjusts the offset against the three values kept at
// 144/152/160(s), validates the result, then copies literals and the match
// into the output buffer.
//
// In this variant the literal copy and the non-overlapping match copy move
// exactly the requested number of bytes: the low four bits of the length are
// handled with 1/2/4/8-byte moves and the remainder in 16-byte chunks.
//
// Result codes written to ret+24(FP) by the exit paths below:
//   0  success
//   1  offset is zero but match length is not
//   2  match length above 0x20002
//   3  match offset too big
//   4  not enough literals
//   5  not enough output space
//
// Register/stack usage, as set up by the prologue:
//   DX      bit buffer (32(br))         BX  consumed-bit counter (40(br), byte)
//   SI      input bytes left (24(br))   (SP) current input read pointer
//   DI/R8/R9  literal-length / match-length / offset state (72/80/88(ctx))
//   R10     output write pointer        R11 literals read pointer (144(ctx))
//   R12     output position (136(ctx))
//   8(SP)/16(SP)/24(SP)  offset / match length / literal length of the
//                        sequence being processed
//   32(SP)  past-end pointer of the output buffer
//   40(SP)  184(ctx), compared as len(hist) in check_offset
//   48(SP)  176(ctx)+184(ctx); presumably the end of the history buffer
//   56(SP)  200(ctx), compared as s.windowSize in check_offset
TEXT ·sequenceDecs_decodeSync_safe_amd64(SB), $64-32
	MOVQ br+8(FP), AX
	MOVQ 32(AX), DX
	MOVBQZX 40(AX), BX
	MOVQ 24(AX), SI
	MOVQ (AX), AX
	ADDQ SI, AX
	MOVQ AX, (SP)
	MOVQ ctx+16(FP), AX
	MOVQ 72(AX), DI
	MOVQ 80(AX), R8
	MOVQ 88(AX), R9
	MOVQ 112(AX), R10
	MOVQ 128(AX), CX
	MOVQ CX, 32(SP)
	MOVQ 144(AX), R11
	MOVQ 136(AX), R12
	MOVQ 200(AX), CX
	MOVQ CX, 56(SP)
	MOVQ 176(AX), CX
	MOVQ CX, 48(SP)
	MOVQ 184(AX), AX
	MOVQ AX, 40(SP)
	MOVQ 40(SP), AX
	ADDQ AX, 48(SP)

	// Calculate pointer to s.out[cap(s.out)] (a past-end pointer)
	ADDQ R10, 32(SP)

	// outBase += outPosition
	ADDQ R12, R10

sequenceDecs_decodeSync_safe_amd64_main_loop:
	MOVQ (SP), R13

	// Fill bitreader to have enough for the offset and match length.
	// Fast path (at least 8 input bytes left): step the read pointer back by
	// the number of fully consumed bytes, reload 8 bytes and keep only the
	// sub-byte bit position.
	CMPQ SI, $0x08
	JL sequenceDecs_decodeSync_safe_amd64_fill_byte_by_byte
	MOVQ BX, AX
	SHRQ $0x03, AX
	SUBQ AX, R13
	MOVQ (R13), DX
	SUBQ AX, SI
	ANDQ $0x07, BX
	JMP sequenceDecs_decodeSync_safe_amd64_fill_end

	// Slow path: shift in one byte at a time while input remains and more
	// than 7 bits have been consumed.
sequenceDecs_decodeSync_safe_amd64_fill_byte_by_byte:
	CMPQ SI, $0x00
	JLE sequenceDecs_decodeSync_safe_amd64_fill_end
	CMPQ BX, $0x07
	JLE sequenceDecs_decodeSync_safe_amd64_fill_end
	SHLQ $0x08, DX
	SUBQ $0x01, R13
	SUBQ $0x01, SI
	SUBQ $0x08, BX
	MOVBQZX (R13), AX
	ORQ AX, DX
	JMP sequenceDecs_decodeSync_safe_amd64_fill_byte_by_byte

sequenceDecs_decodeSync_safe_amd64_fill_end:
	// Update offset
	// Byte 1 of the state word (AH) is the number of extra bits to read; the
	// upper 32 bits are the base value they are added to. The CMOVQEQ forces
	// the extracted bits to zero when the bit count is zero, because the
	// right shift by a negated zero count would leave R14 unshifted.
	MOVQ R9, AX
	MOVQ BX, CX
	MOVQ DX, R14
	SHLQ CL, R14
	MOVB AH, CL
	ADDQ CX, BX
	NEGL CX
	SHRQ CL, R14
	SHRQ $0x20, AX
	TESTQ CX, CX
	CMOVQEQ CX, R14
	ADDQ R14, AX
	MOVQ AX, 8(SP)

	// Update match length
	MOVQ R8, AX
	MOVQ BX, CX
	MOVQ DX, R14
	SHLQ CL, R14
	MOVB AH, CL
	ADDQ CX, BX
	NEGL CX
	SHRQ CL, R14
	SHRQ $0x20, AX
	TESTQ CX, CX
	CMOVQEQ CX, R14
	ADDQ R14, AX
	MOVQ AX, 16(SP)

	// Fill bitreader to have enough for the remaining
	CMPQ SI, $0x08
	JL sequenceDecs_decodeSync_safe_amd64_fill_2_byte_by_byte
	MOVQ BX, AX
	SHRQ $0x03, AX
	SUBQ AX, R13
	MOVQ (R13), DX
	SUBQ AX, SI
	ANDQ $0x07, BX
	JMP sequenceDecs_decodeSync_safe_amd64_fill_2_end

sequenceDecs_decodeSync_safe_amd64_fill_2_byte_by_byte:
	CMPQ SI, $0x00
	JLE sequenceDecs_decodeSync_safe_amd64_fill_2_end
	CMPQ BX, $0x07
	JLE sequenceDecs_decodeSync_safe_amd64_fill_2_end
	SHLQ $0x08, DX
	SUBQ $0x01, R13
	SUBQ $0x01, SI
	SUBQ $0x08, BX
	MOVBQZX (R13), AX
	ORQ AX, DX
	JMP sequenceDecs_decodeSync_safe_amd64_fill_2_byte_by_byte

sequenceDecs_decodeSync_safe_amd64_fill_2_end:
	// Update literal length
	MOVQ DI, AX
	MOVQ BX, CX
	MOVQ DX, R14
	SHLQ CL, R14
	MOVB AH, CL
	ADDQ CX, BX
	NEGL CX
	SHRQ CL, R14
	SHRQ $0x20, AX
	TESTQ CX, CX
	CMOVQEQ CX, R14
	ADDQ R14, AX
	MOVQ AX, 24(SP)

	// Fill bitreader for state updates
	// AX keeps byte 1 of the offset state (its extra-bit count) for the
	// "Adjust offset" step. The state update is skipped when the counter at
	// 96(ctx) is zero.
	MOVQ R13, (SP)
	MOVQ R9, AX
	SHRQ $0x08, AX
	MOVBQZX AL, AX
	MOVQ ctx+16(FP), CX
	CMPQ 96(CX), $0x00
	JZ sequenceDecs_decodeSync_safe_amd64_skip_update

	// Update Literal Length State
	// Low byte of the state = number of bits to read; bits 16..31 = base
	// index. The sum indexes the 8-byte entries of the table.
	MOVBQZX DI, R13
	SHRQ $0x10, DI
	MOVWQZX DI, DI
	CMPQ R13, $0x00
	JZ sequenceDecs_decodeSync_safe_amd64_llState_updateState_skip_zero
	MOVQ BX, CX
	ADDQ R13, BX
	MOVQ DX, R14
	SHLQ CL, R14
	MOVQ R13, CX
	NEGQ CX
	SHRQ CL, R14
	ADDQ R14, DI

sequenceDecs_decodeSync_safe_amd64_llState_updateState_skip_zero:
	// Load ctx.llTable
	MOVQ ctx+16(FP), CX
	MOVQ (CX), CX
	MOVQ (CX)(DI*8), DI

	// Update Match Length State
	MOVBQZX R8, R13
	SHRQ $0x10, R8
	MOVWQZX R8, R8
	CMPQ R13, $0x00
	JZ sequenceDecs_decodeSync_safe_amd64_mlState_updateState_skip_zero
	MOVQ BX, CX
	ADDQ R13, BX
	MOVQ DX, R14
	SHLQ CL, R14
	MOVQ R13, CX
	NEGQ CX
	SHRQ CL, R14
	ADDQ R14, R8

sequenceDecs_decodeSync_safe_amd64_mlState_updateState_skip_zero:
	// Load ctx.mlTable
	MOVQ ctx+16(FP), CX
	MOVQ 24(CX), CX
	MOVQ (CX)(R8*8), R8

	// Update Offset State
	MOVBQZX R9, R13
	SHRQ $0x10, R9
	MOVWQZX R9, R9
	CMPQ R13, $0x00
	JZ sequenceDecs_decodeSync_safe_amd64_ofState_updateState_skip_zero
	MOVQ BX, CX
	ADDQ R13, BX
	MOVQ DX, R14
	SHLQ CL, R14
	MOVQ R13, CX
	NEGQ CX
	SHRQ CL, R14
	ADDQ R14, R9

sequenceDecs_decodeSync_safe_amd64_ofState_updateState_skip_zero:
	// Load ctx.ofTable
	MOVQ ctx+16(FP), CX
	MOVQ 48(CX), CX
	MOVQ (CX)(R9*8), R9

sequenceDecs_decodeSync_safe_amd64_skip_update:
	// Adjust offset
	// More than one extra offset bit: the decoded value is used as-is and is
	// pushed onto the front of the three saved offsets at 144/152/160(s).
	MOVQ s+0(FP), CX
	MOVQ 8(SP), R13
	CMPQ AX, $0x01
	JBE sequenceDecs_decodeSync_safe_amd64_adjust_offsetB_1_or_0
	MOVUPS 144(CX), X0
	MOVQ R13, 144(CX)
	MOVUPS X0, 152(CX)
	JMP sequenceDecs_decodeSync_safe_amd64_adjust_end

	// Zero or one extra bit: the value selects one of the saved offsets.
	// With a zero literal length the selector is incremented first.
sequenceDecs_decodeSync_safe_amd64_adjust_offsetB_1_or_0:
	CMPQ 24(SP), $0x00000000
	JNE sequenceDecs_decodeSync_safe_amd64_adjust_offset_maybezero
	INCQ R13
	JMP sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero

sequenceDecs_decodeSync_safe_amd64_adjust_offset_maybezero:
	TESTQ R13, R13
	JNZ sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero
	MOVQ 144(CX), R13
	JMP sequenceDecs_decodeSync_safe_amd64_adjust_end

	// Selector 3 means "first saved offset minus one"; otherwise the saved
	// offset at that index is used. A resulting zero is replaced by one.
sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero:
	MOVQ R13, AX
	XORQ R14, R14
	MOVQ $-1, R15
	CMPQ R13, $0x03
	CMOVQEQ R14, AX
	CMOVQEQ R15, R14
	LEAQ 144(CX), R15
	ADDQ (R15)(AX*8), R14
	JNZ sequenceDecs_decodeSync_safe_amd64_adjust_temp_valid
	MOVQ $0x00000001, R14

sequenceDecs_decodeSync_safe_amd64_adjust_temp_valid:
	CMPQ R13, $0x01
	JZ sequenceDecs_decodeSync_safe_amd64_adjust_skip
	MOVQ 152(CX), AX
	MOVQ AX, 160(CX)

sequenceDecs_decodeSync_safe_amd64_adjust_skip:
	MOVQ 144(CX), AX
	MOVQ AX, 152(CX)
	MOVQ R14, 144(CX)
	MOVQ R14, R13

sequenceDecs_decodeSync_safe_amd64_adjust_end:
	MOVQ R13, 8(SP)

	// Check values
	// ml+ll is added to the counter at 256(s) and ll is subtracted from the
	// counter at 104(ctx); a negative result means not enough literals.
	MOVQ 16(SP), AX
	MOVQ 24(SP), CX
	LEAQ (AX)(CX*1), R14
	MOVQ s+0(FP), R15
	ADDQ R14, 256(R15)
	MOVQ ctx+16(FP), R14
	SUBQ CX, 104(R14)
	JS error_not_enough_literals
	CMPQ AX, $0x00020002
	JA sequenceDecs_decodeSync_safe_amd64_error_match_len_too_big
	TESTQ R13, R13
	JNZ sequenceDecs_decodeSync_safe_amd64_match_len_ofs_ok
	TESTQ AX, AX
	JNZ sequenceDecs_decodeSync_safe_amd64_error_match_len_ofs_mismatch

sequenceDecs_decodeSync_safe_amd64_match_len_ofs_ok:
	MOVQ 24(SP), AX
	MOVQ 8(SP), CX
	MOVQ 16(SP), R13

	// Check if we have enough space in s.out
	LEAQ (AX)(R13*1), R14
	ADDQ R10, R14
	CMPQ R14, 32(SP)
	JA error_not_enough_space

	// Copy literals
	// Exact-length copy: 1/2/4/8-byte moves for the low bits of the length,
	// then 16-byte chunks.
	TESTQ AX, AX
	JZ check_offset
	XORQ R14, R14
	TESTQ $0x00000001, AX
	JZ copy_1_word
	MOVB (R11)(R14*1), R15
	MOVB R15, (R10)(R14*1)
	ADDQ $0x01, R14

copy_1_word:
	TESTQ $0x00000002, AX
	JZ copy_1_dword
	MOVW (R11)(R14*1), R15
	MOVW R15, (R10)(R14*1)
	ADDQ $0x02, R14

copy_1_dword:
	TESTQ $0x00000004, AX
	JZ copy_1_qword
	MOVL (R11)(R14*1), R15
	MOVL R15, (R10)(R14*1)
	ADDQ $0x04, R14

copy_1_qword:
	TESTQ $0x00000008, AX
	JZ copy_1_test
	MOVQ (R11)(R14*1), R15
	MOVQ R15, (R10)(R14*1)
	ADDQ $0x08, R14
	JMP copy_1_test

copy_1:
	MOVUPS (R11)(R14*1), X0
	MOVUPS X0, (R10)(R14*1)
	ADDQ $0x10, R14

copy_1_test:
	CMPQ R14, AX
	JB copy_1
	ADDQ AX, R11
	ADDQ AX, R10
	ADDQ AX, R12

	// Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize
check_offset:
	MOVQ R12, AX
	ADDQ 40(SP), AX
	CMPQ CX, AX
	JG error_match_off_too_big
	CMPQ CX, 56(SP)
	JG error_match_off_too_big

	// Copy match from history
	// AX = offset - output position: the number of bytes the match reaches
	// back before the start of the output. If positive, the copy starts AX
	// bytes before the pointer held in 48(SP).
	MOVQ CX, AX
	SUBQ R12, AX
	JLS copy_match
	MOVQ 48(SP), R14
	SUBQ AX, R14
	CMPQ R13, AX
	JGE copy_all_from_history
	XORQ AX, AX
	TESTQ $0x00000001, R13
	JZ copy_4_word
	MOVB (R14)(AX*1), CL
	MOVB CL, (R10)(AX*1)
	ADDQ $0x01, AX

copy_4_word:
	TESTQ $0x00000002, R13
	JZ copy_4_dword
	MOVW (R14)(AX*1), CX
	MOVW CX, (R10)(AX*1)
	ADDQ $0x02, AX

copy_4_dword:
	TESTQ $0x00000004, R13
	JZ copy_4_qword
	MOVL (R14)(AX*1), CX
	MOVL CX, (R10)(AX*1)
	ADDQ $0x04, AX

copy_4_qword:
	TESTQ $0x00000008, R13
	JZ copy_4_test
	MOVQ (R14)(AX*1), CX
	MOVQ CX, (R10)(AX*1)
	ADDQ $0x08, AX
	JMP copy_4_test

copy_4:
	MOVUPS (R14)(AX*1), X0
	MOVUPS X0, (R10)(AX*1)
	ADDQ $0x10, AX

copy_4_test:
	CMPQ AX, R13
	JB copy_4
	ADDQ R13, R12
	ADDQ R13, R10
	JMP handle_loop
	// NOTE(review): the jump below directly follows an unconditional jump and
	// has no label, so it cannot be reached.
	JMP loop_finished

	// The match starts in the history and continues in the output buffer:
	// copy the AX history bytes here, the rest is handled by copy_match.
copy_all_from_history:
	XORQ R15, R15
	TESTQ $0x00000001, AX
	JZ copy_5_word
	MOVB (R14)(R15*1), BP
	MOVB BP, (R10)(R15*1)
	ADDQ $0x01, R15

copy_5_word:
	TESTQ $0x00000002, AX
	JZ copy_5_dword
	MOVW (R14)(R15*1), BP
	MOVW BP, (R10)(R15*1)
	ADDQ $0x02, R15

copy_5_dword:
	TESTQ $0x00000004, AX
	JZ copy_5_qword
	MOVL (R14)(R15*1), BP
	MOVL BP, (R10)(R15*1)
	ADDQ $0x04, R15

copy_5_qword:
	TESTQ $0x00000008, AX
	JZ copy_5_test
	MOVQ (R14)(R15*1), BP
	MOVQ BP, (R10)(R15*1)
	ADDQ $0x08, R15
	JMP copy_5_test

copy_5:
	MOVUPS (R14)(R15*1), X0
	MOVUPS X0, (R10)(R15*1)
	ADDQ $0x10, R15

copy_5_test:
	CMPQ R15, AX
	JB copy_5
	ADDQ AX, R10
	ADDQ AX, R12
	SUBQ AX, R13

	// Copy match from the current buffer
copy_match:
	TESTQ R13, R13
	JZ handle_loop
	MOVQ R10, AX
	SUBQ CX, AX

	// ml <= mo
	CMPQ R13, CX
	JA copy_overlapping_match

	// Copy non-overlapping match
	// Exact-length copy, same scheme as the literal copy above.
	ADDQ R13, R12
	XORQ CX, CX
	TESTQ $0x00000001, R13
	JZ copy_2_word
	MOVB (AX)(CX*1), R14
	MOVB R14, (R10)(CX*1)
	ADDQ $0x01, CX

copy_2_word:
	TESTQ $0x00000002, R13
	JZ copy_2_dword
	MOVW (AX)(CX*1), R14
	MOVW R14, (R10)(CX*1)
	ADDQ $0x02, CX

copy_2_dword:
	TESTQ $0x00000004, R13
	JZ copy_2_qword
	MOVL (AX)(CX*1), R14
	MOVL R14, (R10)(CX*1)
	ADDQ $0x04, CX

copy_2_qword:
	TESTQ $0x00000008, R13
	JZ copy_2_test
	MOVQ (AX)(CX*1), R14
	MOVQ R14, (R10)(CX*1)
	ADDQ $0x08, CX
	JMP copy_2_test

copy_2:
	MOVUPS (AX)(CX*1), X0
	MOVUPS X0, (R10)(CX*1)
	ADDQ $0x10, CX

copy_2_test:
	CMPQ CX, R13
	JB copy_2
	ADDQ R13, R10
	JMP handle_loop

	// Copy overlapping match
	// Byte by byte, so bytes written earlier in this copy can be re-read.
copy_overlapping_match:
	ADDQ R13, R12

copy_slow_3:
	MOVB (AX), CL
	MOVB CL, (R10)
	INCQ AX
	INCQ R10
	DECQ R13
	JNZ copy_slow_3

	// Decrement the counter at 96(ctx) and loop while it stays non-negative.
handle_loop:
	MOVQ ctx+16(FP), AX
	DECQ 96(AX)
	JNS sequenceDecs_decodeSync_safe_amd64_main_loop

	// Write the bit reader state back to *br.
loop_finished:
	MOVQ br+8(FP), AX
	MOVQ DX, 32(AX)
	MOVB BL, 40(AX)
	MOVQ SI, 24(AX)

	// Update the context
	MOVQ ctx+16(FP), AX
	MOVQ R12, 136(AX)
	MOVQ 144(AX), CX
	SUBQ CX, R11
	MOVQ R11, 168(AX)

	// Return success
	MOVQ $0x00000000, ret+24(FP)
	RET

	// Return with match length error
sequenceDecs_decodeSync_safe_amd64_error_match_len_ofs_mismatch:
	MOVQ 16(SP), AX
	MOVQ ctx+16(FP), CX
	MOVQ AX, 216(CX)
	MOVQ $0x00000001, ret+24(FP)
	RET

	// Return with match too long error
sequenceDecs_decodeSync_safe_amd64_error_match_len_too_big:
	MOVQ ctx+16(FP), AX
	MOVQ 16(SP), CX
	MOVQ CX, 216(AX)
	MOVQ $0x00000002, ret+24(FP)
	RET

	// Return with match offset too long error
error_match_off_too_big:
	MOVQ ctx+16(FP), AX
	MOVQ 8(SP), CX
	MOVQ CX, 224(AX)
	MOVQ R12, 136(AX)
	MOVQ $0x00000003, ret+24(FP)
	RET

	// Return with not enough literals error
error_not_enough_literals:
	MOVQ ctx+16(FP), AX
	MOVQ 24(SP), CX
	MOVQ CX, 208(AX)
	MOVQ $0x00000004, ret+24(FP)
	RET

	// Return with not enough output space error
error_not_enough_space:
	MOVQ ctx+16(FP), AX
	MOVQ 24(SP), CX
	MOVQ CX, 208(AX)
	MOVQ 16(SP), CX
	MOVQ CX, 216(AX)
	MOVQ R12, 136(AX)
	MOVQ $0x00000005, ret+24(FP)
	RET

// func sequenceDecs_decodeSync_safe_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
// Requires: BMI, BMI2, CMOV, SSE
//
// NOTE(review): only the prologue of this routine falls inside this span; the
// body continues on the following lines and is left as it was.
TEXT ·sequenceDecs_decodeSync_safe_bmi2(SB), $64-32
	MOVQ br+8(FP), CX
	MOVQ 32(CX), AX
	MOVBQZX 40(CX), DX
	MOVQ 24(CX), BX
	MOVQ (CX), CX
	ADDQ BX, CX
	MOVQ CX, (SP)
	MOVQ ctx+16(FP), CX
	MOVQ 72(CX), SI
	MOVQ 80(CX), DI
	MOVQ 88(CX), R8
	MOVQ 112(CX), R9
	MOVQ 128(CX), R10
	MOVQ R10, 32(SP)
	MOVQ 144(CX), R10
	MOVQ 136(CX), R11
	MOVQ 200(CX), R12
	MOVQ R12, 56(SP)
	MOVQ 176(CX), R12
	MOVQ R12, 48(SP)
	MOVQ 184(CX), CX
	MOVQ CX, 40(SP)
	MOVQ 40(SP), CX
	ADDQ CX, 48(SP)

	// Calculate pointer to s.out[cap(s.out)] (a past-end pointer)
	ADDQ R9, 32(SP)

	// outBase += outPosition
	ADDQ R11, R9

sequenceDecs_decodeSync_safe_bmi2_main_loop:
	MOVQ (SP), R12

	// Fill bitreader to have enough for the offset and match length.
CMPQ BX, $0x08 JL sequenceDecs_decodeSync_safe_bmi2_fill_byte_by_byte MOVQ DX, CX SHRQ $0x03, CX SUBQ CX, R12 MOVQ (R12), AX SUBQ CX, BX ANDQ $0x07, DX JMP sequenceDecs_decodeSync_safe_bmi2_fill_end sequenceDecs_decodeSync_safe_bmi2_fill_byte_by_byte: CMPQ BX, $0x00 JLE sequenceDecs_decodeSync_safe_bmi2_fill_end CMPQ DX, $0x07 JLE sequenceDecs_decodeSync_safe_bmi2_fill_end SHLQ $0x08, AX SUBQ $0x01, R12 SUBQ $0x01, BX SUBQ $0x08, DX MOVBQZX (R12), CX ORQ CX, AX JMP sequenceDecs_decodeSync_safe_bmi2_fill_byte_by_byte sequenceDecs_decodeSync_safe_bmi2_fill_end: // Update offset MOVQ $0x00000808, CX BEXTRQ CX, R8, R13 MOVQ AX, R14 LEAQ (DX)(R13*1), CX ROLQ CL, R14 BZHIQ R13, R14, R14 MOVQ CX, DX MOVQ R8, CX SHRQ $0x20, CX ADDQ R14, CX MOVQ CX, 8(SP) // Update match length MOVQ $0x00000808, CX BEXTRQ CX, DI, R13 MOVQ AX, R14 LEAQ (DX)(R13*1), CX ROLQ CL, R14 BZHIQ R13, R14, R14 MOVQ CX, DX MOVQ DI, CX SHRQ $0x20, CX ADDQ R14, CX MOVQ CX, 16(SP) // Fill bitreader to have enough for the remaining CMPQ BX, $0x08 JL sequenceDecs_decodeSync_safe_bmi2_fill_2_byte_by_byte MOVQ DX, CX SHRQ $0x03, CX SUBQ CX, R12 MOVQ (R12), AX SUBQ CX, BX ANDQ $0x07, DX JMP sequenceDecs_decodeSync_safe_bmi2_fill_2_end sequenceDecs_decodeSync_safe_bmi2_fill_2_byte_by_byte: CMPQ BX, $0x00 JLE sequenceDecs_decodeSync_safe_bmi2_fill_2_end CMPQ DX, $0x07 JLE sequenceDecs_decodeSync_safe_bmi2_fill_2_end SHLQ $0x08, AX SUBQ $0x01, R12 SUBQ $0x01, BX SUBQ $0x08, DX MOVBQZX (R12), CX ORQ CX, AX JMP sequenceDecs_decodeSync_safe_bmi2_fill_2_byte_by_byte sequenceDecs_decodeSync_safe_bmi2_fill_2_end: // Update literal length MOVQ $0x00000808, CX BEXTRQ CX, SI, R13 MOVQ AX, R14 LEAQ (DX)(R13*1), CX ROLQ CL, R14 BZHIQ R13, R14, R14 MOVQ CX, DX MOVQ SI, CX SHRQ $0x20, CX ADDQ R14, CX MOVQ CX, 24(SP) // Fill bitreader for state updates MOVQ R12, (SP) MOVQ $0x00000808, CX BEXTRQ CX, R8, R12 MOVQ ctx+16(FP), CX CMPQ 96(CX), $0x00 JZ sequenceDecs_decodeSync_safe_bmi2_skip_update LEAQ (SI)(DI*1), R13 ADDQ R8, R13 
MOVBQZX R13, R13 LEAQ (DX)(R13*1), CX MOVQ AX, R14 MOVQ CX, DX ROLQ CL, R14 BZHIQ R13, R14, R14 // Update Offset State BZHIQ R8, R14, CX SHRXQ R8, R14, R14 MOVQ $0x00001010, R13 BEXTRQ R13, R8, R8 ADDQ CX, R8 // Load ctx.ofTable MOVQ ctx+16(FP), CX MOVQ 48(CX), CX MOVQ (CX)(R8*8), R8 // Update Match Length State BZHIQ DI, R14, CX SHRXQ DI, R14, R14 MOVQ $0x00001010, R13 BEXTRQ R13, DI, DI ADDQ CX, DI // Load ctx.mlTable MOVQ ctx+16(FP), CX MOVQ 24(CX), CX MOVQ (CX)(DI*8), DI // Update Literal Length State BZHIQ SI, R14, CX MOVQ $0x00001010, R13 BEXTRQ R13, SI, SI ADDQ CX, SI // Load ctx.llTable MOVQ ctx+16(FP), CX MOVQ (CX), CX MOVQ (CX)(SI*8), SI sequenceDecs_decodeSync_safe_bmi2_skip_update: // Adjust offset MOVQ s+0(FP), CX MOVQ 8(SP), R13 CMPQ R12, $0x01 JBE sequenceDecs_decodeSync_safe_bmi2_adjust_offsetB_1_or_0 MOVUPS 144(CX), X0 MOVQ R13, 144(CX) MOVUPS X0, 152(CX) JMP sequenceDecs_decodeSync_safe_bmi2_adjust_end sequenceDecs_decodeSync_safe_bmi2_adjust_offsetB_1_or_0: CMPQ 24(SP), $0x00000000 JNE sequenceDecs_decodeSync_safe_bmi2_adjust_offset_maybezero INCQ R13 JMP sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero sequenceDecs_decodeSync_safe_bmi2_adjust_offset_maybezero: TESTQ R13, R13 JNZ sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero MOVQ 144(CX), R13 JMP sequenceDecs_decodeSync_safe_bmi2_adjust_end sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero: MOVQ R13, R12 XORQ R14, R14 MOVQ $-1, R15 CMPQ R13, $0x03 CMOVQEQ R14, R12 CMOVQEQ R15, R14 LEAQ 144(CX), R15 ADDQ (R15)(R12*8), R14 JNZ sequenceDecs_decodeSync_safe_bmi2_adjust_temp_valid MOVQ $0x00000001, R14 sequenceDecs_decodeSync_safe_bmi2_adjust_temp_valid: CMPQ R13, $0x01 JZ sequenceDecs_decodeSync_safe_bmi2_adjust_skip MOVQ 152(CX), R12 MOVQ R12, 160(CX) sequenceDecs_decodeSync_safe_bmi2_adjust_skip: MOVQ 144(CX), R12 MOVQ R12, 152(CX) MOVQ R14, 144(CX) MOVQ R14, R13 sequenceDecs_decodeSync_safe_bmi2_adjust_end: MOVQ R13, 8(SP) // Check values MOVQ 16(SP), CX MOVQ 24(SP), R12 
LEAQ (CX)(R12*1), R14 MOVQ s+0(FP), R15 ADDQ R14, 256(R15) MOVQ ctx+16(FP), R14 SUBQ R12, 104(R14) JS error_not_enough_literals CMPQ CX, $0x00020002 JA sequenceDecs_decodeSync_safe_bmi2_error_match_len_too_big TESTQ R13, R13 JNZ sequenceDecs_decodeSync_safe_bmi2_match_len_ofs_ok TESTQ CX, CX JNZ sequenceDecs_decodeSync_safe_bmi2_error_match_len_ofs_mismatch sequenceDecs_decodeSync_safe_bmi2_match_len_ofs_ok: MOVQ 24(SP), CX MOVQ 8(SP), R12 MOVQ 16(SP), R13 // Check if we have enough space in s.out LEAQ (CX)(R13*1), R14 ADDQ R9, R14 CMPQ R14, 32(SP) JA error_not_enough_space // Copy literals TESTQ CX, CX JZ check_offset XORQ R14, R14 TESTQ $0x00000001, CX JZ copy_1_word MOVB (R10)(R14*1), R15 MOVB R15, (R9)(R14*1) ADDQ $0x01, R14 copy_1_word: TESTQ $0x00000002, CX JZ copy_1_dword MOVW (R10)(R14*1), R15 MOVW R15, (R9)(R14*1) ADDQ $0x02, R14 copy_1_dword: TESTQ $0x00000004, CX JZ copy_1_qword MOVL (R10)(R14*1), R15 MOVL R15, (R9)(R14*1) ADDQ $0x04, R14 copy_1_qword: TESTQ $0x00000008, CX JZ copy_1_test MOVQ (R10)(R14*1), R15 MOVQ R15, (R9)(R14*1) ADDQ $0x08, R14 JMP copy_1_test copy_1: MOVUPS (R10)(R14*1), X0 MOVUPS X0, (R9)(R14*1) ADDQ $0x10, R14 copy_1_test: CMPQ R14, CX JB copy_1 ADDQ CX, R10 ADDQ CX, R9 ADDQ CX, R11 // Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize) check_offset: MOVQ R11, CX ADDQ 40(SP), CX CMPQ R12, CX JG error_match_off_too_big CMPQ R12, 56(SP) JG error_match_off_too_big // Copy match from history MOVQ R12, CX SUBQ R11, CX JLS copy_match MOVQ 48(SP), R14 SUBQ CX, R14 CMPQ R13, CX JGE copy_all_from_history XORQ CX, CX TESTQ $0x00000001, R13 JZ copy_4_word MOVB (R14)(CX*1), R12 MOVB R12, (R9)(CX*1) ADDQ $0x01, CX copy_4_word: TESTQ $0x00000002, R13 JZ copy_4_dword MOVW (R14)(CX*1), R12 MOVW R12, (R9)(CX*1) ADDQ $0x02, CX copy_4_dword: TESTQ $0x00000004, R13 JZ copy_4_qword MOVL (R14)(CX*1), R12 MOVL R12, (R9)(CX*1) ADDQ $0x04, CX copy_4_qword: TESTQ $0x00000008, R13 JZ copy_4_test MOVQ (R14)(CX*1), R12 MOVQ R12, (R9)(CX*1) ADDQ 
$0x08, CX JMP copy_4_test copy_4: MOVUPS (R14)(CX*1), X0 MOVUPS X0, (R9)(CX*1) ADDQ $0x10, CX copy_4_test: CMPQ CX, R13 JB copy_4 ADDQ R13, R11 ADDQ R13, R9 JMP handle_loop JMP loop_finished copy_all_from_history: XORQ R15, R15 TESTQ $0x00000001, CX JZ copy_5_word MOVB (R14)(R15*1), BP MOVB BP, (R9)(R15*1) ADDQ $0x01, R15 copy_5_word: TESTQ $0x00000002, CX JZ copy_5_dword MOVW (R14)(R15*1), BP MOVW BP, (R9)(R15*1) ADDQ $0x02, R15 copy_5_dword: TESTQ $0x00000004, CX JZ copy_5_qword MOVL (R14)(R15*1), BP MOVL BP, (R9)(R15*1) ADDQ $0x04, R15 copy_5_qword: TESTQ $0x00000008, CX JZ copy_5_test MOVQ (R14)(R15*1), BP MOVQ BP, (R9)(R15*1) ADDQ $0x08, R15 JMP copy_5_test copy_5: MOVUPS (R14)(R15*1), X0 MOVUPS X0, (R9)(R15*1) ADDQ $0x10, R15 copy_5_test: CMPQ R15, CX JB copy_5 ADDQ CX, R9 ADDQ CX, R11 SUBQ CX, R13 // Copy match from the current buffer copy_match: TESTQ R13, R13 JZ handle_loop MOVQ R9, CX SUBQ R12, CX // ml <= mo CMPQ R13, R12 JA copy_overlapping_match // Copy non-overlapping match ADDQ R13, R11 XORQ R12, R12 TESTQ $0x00000001, R13 JZ copy_2_word MOVB (CX)(R12*1), R14 MOVB R14, (R9)(R12*1) ADDQ $0x01, R12 copy_2_word: TESTQ $0x00000002, R13 JZ copy_2_dword MOVW (CX)(R12*1), R14 MOVW R14, (R9)(R12*1) ADDQ $0x02, R12 copy_2_dword: TESTQ $0x00000004, R13 JZ copy_2_qword MOVL (CX)(R12*1), R14 MOVL R14, (R9)(R12*1) ADDQ $0x04, R12 copy_2_qword: TESTQ $0x00000008, R13 JZ copy_2_test MOVQ (CX)(R12*1), R14 MOVQ R14, (R9)(R12*1) ADDQ $0x08, R12 JMP copy_2_test copy_2: MOVUPS (CX)(R12*1), X0 MOVUPS X0, (R9)(R12*1) ADDQ $0x10, R12 copy_2_test: CMPQ R12, R13 JB copy_2 ADDQ R13, R9 JMP handle_loop // Copy overlapping match copy_overlapping_match: ADDQ R13, R11 copy_slow_3: MOVB (CX), R12 MOVB R12, (R9) INCQ CX INCQ R9 DECQ R13 JNZ copy_slow_3 handle_loop: MOVQ ctx+16(FP), CX DECQ 96(CX) JNS sequenceDecs_decodeSync_safe_bmi2_main_loop loop_finished: MOVQ br+8(FP), CX MOVQ AX, 32(CX) MOVB DL, 40(CX) MOVQ BX, 24(CX) // Update the context MOVQ ctx+16(FP), AX MOVQ R11, 136(AX) 
MOVQ 144(AX), CX SUBQ CX, R10 MOVQ R10, 168(AX) // Return success MOVQ $0x00000000, ret+24(FP) RET // Return with match length error sequenceDecs_decodeSync_safe_bmi2_error_match_len_ofs_mismatch: MOVQ 16(SP), AX MOVQ ctx+16(FP), CX MOVQ AX, 216(CX) MOVQ $0x00000001, ret+24(FP) RET // Return with match too long error sequenceDecs_decodeSync_safe_bmi2_error_match_len_too_big: MOVQ ctx+16(FP), AX MOVQ 16(SP), CX MOVQ CX, 216(AX) MOVQ $0x00000002, ret+24(FP) RET // Return with match offset too long error error_match_off_too_big: MOVQ ctx+16(FP), AX MOVQ 8(SP), CX MOVQ CX, 224(AX) MOVQ R11, 136(AX) MOVQ $0x00000003, ret+24(FP) RET // Return with not enough literals error error_not_enough_literals: MOVQ ctx+16(FP), AX MOVQ 24(SP), CX MOVQ CX, 208(AX) MOVQ $0x00000004, ret+24(FP) RET // Return with not enough output space error error_not_enough_space: MOVQ ctx+16(FP), AX MOVQ 24(SP), CX MOVQ CX, 208(AX) MOVQ 16(SP), CX MOVQ CX, 216(AX) MOVQ R11, 136(AX) MOVQ $0x00000005, ret+24(FP) RET