// Code generated by command: go run gen.go -out ../encodeblock_amd64.s -stubs ../encodeblock_amd64.go -pkg=s2. DO NOT EDIT.

//go:build !appengine && !noasm && gc && !noasm

#include "textflag.h"

// func _dummy_()
//
// _dummy_ does no work at run time (its only instruction is RET). Its body
// hosts the preprocessor directives that define GOAMD64_v3 whenever
// GOAMD64_v4 is defined, so the "#ifdef GOAMD64_v3" sections further down
// are selected for v4 builds as well.
TEXT ·_dummy_(SB), $0
#ifdef GOAMD64_v4
#ifndef GOAMD64_v3
#define GOAMD64_v3
#endif
#endif
	RET

// func encodeBlockAsm(dst []byte, src []byte) int
// Requires: BMI, SSE2
//
// Encodes src into dst and returns the number of bytes written (the final
// write pointer minus dst_base). Returns 0 from three places when the output
// would reach the destination limit stored at 0(SP).
//
// Registers in the search/match loops:
//   AX - current write pointer into dst
//   CX - current position in src (byte offset)
//   DX - src base pointer
// (emit_remainder reuses CX and DX for other values.)
//
// Stack frame (65560 bytes):
//   0(SP)   8 bytes     dst limit: dst_base + (src_len - 9 - src_len>>5)
//   8(SP)   4 bytes     search limit: src_len - 8
//   12(SP)  4 bytes     src offset of the first byte not yet emitted
//   16(SP)  4 bytes     current repeat offset (initialised to 1)
//   20(SP)  4 bytes     next search position, used when no candidate matches
//   24(SP)  65536 bytes table of 16384 32-bit src offsets, indexed by hash
//
// Hash used for the table: (x << 16) * 0xcf1bbcdcbf9b >> 50, i.e. the low six
// bytes of x reduced to a 14-bit index.
//
// NOTE(review): the tag bytes written below presumably follow the S2/Snappy
// block format; confirm against the format specification.
TEXT ·encodeBlockAsm(SB), $65560-56
	MOVQ dst_base+0(FP), AX
	MOVQ $0x00000200, CX
	LEAQ 24(SP), DX
	PXOR X0, X0

	// Clear the table: 0x200 iterations of 128 bytes = 65536 bytes.
zero_loop_encodeBlockAsm:
	MOVOU X0, (DX)
	MOVOU X0, 16(DX)
	MOVOU X0, 32(DX)
	MOVOU X0, 48(DX)
	MOVOU X0, 64(DX)
	MOVOU X0, 80(DX)
	MOVOU X0, 96(DX)
	MOVOU X0, 112(DX)
	ADDQ $0x80, DX
	DECQ CX
	JNZ zero_loop_encodeBlockAsm

	// Initialise the scalar slots described in the header comment.
	MOVL $0x00000000, 12(SP)
	MOVQ src_len+32(FP), CX
	LEAQ -9(CX), DX
	LEAQ -8(CX), BX
	MOVL BX, 8(SP)
	SHRQ $0x05, CX
	SUBL CX, DX
	LEAQ (AX)(DX*1), DX
	MOVQ DX, (SP)
	MOVL $0x00000001, CX
	MOVL CX, 16(SP)
	MOVQ src_base+24(FP), DX

search_loop_encodeBlockAsm:
	// Next position = CX + 4 + (CX - 12(SP))>>6: the step grows with the
	// distance since the last emit. Stop searching at the limit in 8(SP).
	MOVL CX, BX
	SUBL 12(SP), BX
	SHRL $0x06, BX
	LEAL 4(CX)(BX*1), BX
	CMPL BX, 8(SP)
	JGE emit_remainder_encodeBlockAsm

	// SI = 8 bytes at src[CX]. R9/R10 = hashes of the bytes at CX and CX+1.
	MOVQ (DX)(CX*1), SI
	MOVL BX, 20(SP)
	MOVQ $0x0000cf1bbcdcbf9b, R8
	MOVQ SI, R9
	MOVQ SI, R10
	SHRQ $0x08, R10
	SHLQ $0x10, R9
	IMULQ R8, R9
	SHRQ $0x32, R9
	SHLQ $0x10, R10
	IMULQ R8, R10
	SHRQ $0x32, R10

	// Load both candidates (BX, DI), then store CX and CX+1 in their slots.
	MOVL 24(SP)(R9*4), BX
	MOVL 24(SP)(R10*4), DI
	MOVL CX, 24(SP)(R9*4)
	LEAL 1(CX), R9
	MOVL R9, 24(SP)(R10*4)

	// R9 = hash of the bytes at CX+2 (used in no_repeat_found).
	MOVQ SI, R9
	SHRQ $0x10, R9
	SHLQ $0x10, R9
	IMULQ R8, R9
	SHRQ $0x32, R9

	// Repeat check: compare 4 bytes at CX+1 with 4 bytes at CX+1-repeat.
	MOVL CX, R8
	SUBL 16(SP), R8
	MOVL 1(DX)(R8*1), R10
	MOVQ SI, R8
	SHRQ $0x08, R8
	CMPL R8, R10
	JNE no_repeat_found_encodeBlockAsm

	// Repeat found at SI = CX+1. DI keeps the old 12(SP) value; BX is the
	// matching position (SI - repeat). Extend the match backwards while the
	// preceding bytes agree, SI > DI and BX has not reached 0.
	LEAL 1(CX), SI
	MOVL 12(SP), DI
	MOVL SI, BX
	SUBL 16(SP), BX
	JZ repeat_extend_back_end_encodeBlockAsm

repeat_extend_back_loop_encodeBlockAsm:
	CMPL SI, DI
	JLE repeat_extend_back_end_encodeBlockAsm
	MOVB -1(DX)(BX*1), R8
	MOVB -1(DX)(SI*1), R9
	CMPB R8, R9
	JNE repeat_extend_back_end_encodeBlockAsm
	LEAL -1(SI), SI
	DECL BX
	JNZ repeat_extend_back_loop_encodeBlockAsm

repeat_extend_back_end_encodeBlockAsm:
	// Emit src[12(SP):SI] as literals (skipped when the range is empty).
	// R8 = length, R9 = source pointer, BX = length-1 selects the tag form:
	//   < 60      one tag byte, (length-1)<<2
	//   < 1<<8    0xf0 + 1 length byte
	//   < 1<<16   0xf4 + 2 length bytes
	//   < 1<<24   0xf8 + 3 length bytes
	//   otherwise 0xfc + 4 length bytes
	MOVL 12(SP), BX
	CMPL BX, SI
	JEQ emit_literal_done_repeat_emit_encodeBlockAsm
	MOVL SI, R8
	MOVL SI, 12(SP)
	LEAQ (DX)(BX*1), R9
	SUBL BX, R8
	LEAL -1(R8), BX
	CMPL BX, $0x3c
	JLT one_byte_repeat_emit_encodeBlockAsm
	CMPL BX, $0x00000100
	JLT two_bytes_repeat_emit_encodeBlockAsm
	CMPL BX, $0x00010000
	JLT three_bytes_repeat_emit_encodeBlockAsm
	CMPL BX, $0x01000000
	JLT four_bytes_repeat_emit_encodeBlockAsm
	MOVB $0xfc, (AX)
	MOVL BX, 1(AX)
	ADDQ $0x05, AX
	JMP memmove_long_repeat_emit_encodeBlockAsm

four_bytes_repeat_emit_encodeBlockAsm:
	MOVL BX, R10
	SHRL $0x10, R10
	MOVB $0xf8, (AX)
	MOVW BX, 1(AX)
	MOVB R10, 3(AX)
	ADDQ $0x04, AX
	JMP memmove_long_repeat_emit_encodeBlockAsm

three_bytes_repeat_emit_encodeBlockAsm:
	MOVB $0xf4, (AX)
	MOVW BX, 1(AX)
	ADDQ $0x03, AX
	JMP memmove_long_repeat_emit_encodeBlockAsm

two_bytes_repeat_emit_encodeBlockAsm:
	MOVB $0xf0, (AX)
	MOVB BL, 1(AX)
	ADDQ $0x02, AX

	// length-1 < 64 still uses the short copy; anything larger the long one.
	CMPL BX, $0x40
	JL memmove_repeat_emit_encodeBlockAsm
	JMP memmove_long_repeat_emit_encodeBlockAsm

one_byte_repeat_emit_encodeBlockAsm:
	SHLB $0x02, BL
	MOVB BL, (AX)
	ADDQ $0x01, AX

memmove_repeat_emit_encodeBlockAsm:
	// BX = dst pointer after the literals.
	LEAQ (AX)(R8*1), BX

	// genMemMoveShort
	// Lengths up to 8 copy a full 8 bytes (more than R8 when R8 < 8); longer
	// lengths use overlapping head/tail moves.
	CMPQ R8, $0x08
	JLE emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_8
	CMPQ R8, $0x10
	JBE emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_8through16
	CMPQ R8, $0x20
	JBE emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_17through32
	JMP emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_33through64

emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_8:
	MOVQ (R9), R10
	MOVQ R10, (AX)
	JMP memmove_end_copy_repeat_emit_encodeBlockAsm

emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_8through16:
	MOVQ (R9), R10
	MOVQ -8(R9)(R8*1), R9
	MOVQ R10, (AX)
	MOVQ R9, -8(AX)(R8*1)
	JMP memmove_end_copy_repeat_emit_encodeBlockAsm

emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_17through32:
	MOVOU (R9), X0
	MOVOU -16(R9)(R8*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(R8*1)
	JMP memmove_end_copy_repeat_emit_encodeBlockAsm

emit_lit_memmove_repeat_emit_encodeBlockAsm_memmove_move_33through64:
	MOVOU (R9), X0
	MOVOU 16(R9), X1
	MOVOU -32(R9)(R8*1), X2
	MOVOU -16(R9)(R8*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)

memmove_end_copy_repeat_emit_encodeBlockAsm:
	MOVQ BX, AX
	JMP emit_literal_done_repeat_emit_encodeBlockAsm

memmove_long_repeat_emit_encodeBlockAsm:
	// BX = dst pointer after the literals.
	LEAQ (AX)(R8*1), BX

	// genMemMoveLong
	// The first and last 32 bytes are loaded into X0-X3 up front and stored
	// last; the middle is copied in 32-byte steps using aligned stores
	// (MOVOA), starting at offset R12 = 64 - (AX & 31).
	MOVOU (R9), X0
	MOVOU 16(R9), X1
	MOVOU -32(R9)(R8*1), X2
	MOVOU -16(R9)(R8*1), X3
	MOVQ R8, R11
	SHRQ $0x05, R11
	MOVQ AX, R10
	ANDL $0x0000001f, R10
	MOVQ $0x00000040, R12
	SUBQ R10, R12
	DECQ R11
	JA emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_forward_sse_loop_32
	LEAQ -32(R9)(R12*1), R10
	LEAQ -32(AX)(R12*1), R13

emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_big_loop_back:
	MOVOU (R10), X4
	MOVOU 16(R10), X5
	MOVOA X4, (R13)
	MOVOA X5, 16(R13)
	ADDQ $0x20, R13
	ADDQ $0x20, R10
	ADDQ $0x20, R12
	DECQ R11
	JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_big_loop_back

emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_forward_sse_loop_32:
	MOVOU -32(R9)(R12*1), X4
	MOVOU -16(R9)(R12*1), X5
	MOVOA X4, -32(AX)(R12*1)
	MOVOA X5, -16(AX)(R12*1)
	ADDQ $0x20, R12
	CMPQ R8, R12
	JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsmlarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)
	MOVQ BX, AX

emit_literal_done_repeat_emit_encodeBlockAsm:
	// Skip the 1+4 bytes already known to match, then extend forwards.
	// R8 = bytes left in src, R9 = pointer at CX, BX = pointer at CX-repeat.
	ADDL $0x05, CX
	MOVL CX, BX
	SUBL 16(SP), BX
	MOVQ src_len+32(FP), R8
	SUBL CX, R8
	LEAQ (DX)(CX*1), R9
	LEAQ (DX)(BX*1), BX

	// matchLen
	// R11 counts matching bytes: 8 at a time via XOR + bit scan, then 4/2/1.
	XORL R11, R11
	CMPL R8, $0x08
	JL matchlen_match4_repeat_extend_encodeBlockAsm

matchlen_loopback_repeat_extend_encodeBlockAsm:
	MOVQ (R9)(R11*1), R10
	XORQ (BX)(R11*1), R10
	TESTQ R10, R10
	JZ matchlen_loop_repeat_extend_encodeBlockAsm

	// Index of the lowest differing bit, divided by 8 = matching bytes.
#ifdef GOAMD64_v3
	TZCNTQ R10, R10

#else
	BSFQ R10, R10

#endif
	SARQ $0x03, R10
	LEAL (R11)(R10*1), R11
	JMP repeat_extend_forward_end_encodeBlockAsm

matchlen_loop_repeat_extend_encodeBlockAsm:
	LEAL -8(R8), R8
	LEAL 8(R11), R11
	CMPL R8, $0x08
	JGE matchlen_loopback_repeat_extend_encodeBlockAsm
	JZ repeat_extend_forward_end_encodeBlockAsm

matchlen_match4_repeat_extend_encodeBlockAsm:
	CMPL R8, $0x04
	JL matchlen_match2_repeat_extend_encodeBlockAsm
	MOVL (R9)(R11*1), R10
	CMPL (BX)(R11*1), R10
	JNE matchlen_match2_repeat_extend_encodeBlockAsm
	SUBL $0x04, R8
	LEAL 4(R11), R11

matchlen_match2_repeat_extend_encodeBlockAsm:
	CMPL R8, $0x02
	JL matchlen_match1_repeat_extend_encodeBlockAsm
	MOVW (R9)(R11*1), R10
	CMPW (BX)(R11*1), R10
	JNE matchlen_match1_repeat_extend_encodeBlockAsm
	SUBL $0x02, R8
	LEAL 2(R11), R11

matchlen_match1_repeat_extend_encodeBlockAsm:
	CMPL R8, $0x01
	JL repeat_extend_forward_end_encodeBlockAsm
	MOVB (R9)(R11*1), R10
	CMPB (BX)(R11*1), R10
	JNE repeat_extend_forward_end_encodeBlockAsm
	LEAL 1(R11), R11

repeat_extend_forward_end_encodeBlockAsm:
	// BX = total match length (CX - SI), SI = repeat offset. DI still holds
	// the 12(SP) value read before the literals were emitted; when it is 0
	// the match is written as a copy instead of a repeat.
	ADDL R11, CX
	MOVL CX, BX
	SUBL SI, BX
	MOVL 16(SP), SI
	TESTL DI, DI
	JZ repeat_as_copy_encodeBlockAsm

	// emitRepeat
	// BX = length, SI = offset. Forms, chosen from the length:
	//   length <= 8                     2 bytes: word ((length-4)<<2)|1
	//   length < 12 and offset < 2048   2 bytes: tag carries offset>>8 in
	//                                   bits 5+, second byte = offset low byte
	//   length-4 < 260                  3 bytes: word 0x0015, byte length-8
	//   length-4 < 65792                4 bytes: word 0x0019, word length-260
	//   length-4 < 0x0100ffff           5 bytes: word 0x001d, 24-bit length-65540
	//   otherwise                       5 bytes 1d 00 fb ff ff, subtract
	//                                   16842747 from the remainder and loop
emit_repeat_again_match_repeat_encodeBlockAsm:
	MOVL BX, DI
	LEAL -4(BX), BX
	CMPL DI, $0x08
	JLE repeat_two_match_repeat_encodeBlockAsm
	CMPL DI, $0x0c
	JGE cant_repeat_two_offset_match_repeat_encodeBlockAsm
	CMPL SI, $0x00000800
	JLT repeat_two_offset_match_repeat_encodeBlockAsm

cant_repeat_two_offset_match_repeat_encodeBlockAsm:
	CMPL BX, $0x00000104
	JLT repeat_three_match_repeat_encodeBlockAsm
	CMPL BX, $0x00010100
	JLT repeat_four_match_repeat_encodeBlockAsm
	CMPL BX, $0x0100ffff
	JLT repeat_five_match_repeat_encodeBlockAsm
	LEAL -16842747(BX), BX
	MOVL $0xfffb001d, (AX)
	MOVB $0xff, 4(AX)
	ADDQ $0x05, AX
	JMP emit_repeat_again_match_repeat_encodeBlockAsm

repeat_five_match_repeat_encodeBlockAsm:
	LEAL -65536(BX), BX
	MOVL BX, SI
	MOVW $0x001d, (AX)
	MOVW BX, 2(AX)
	SARL $0x10, SI
	MOVB SI, 4(AX)
	ADDQ $0x05, AX
	JMP repeat_end_emit_encodeBlockAsm

repeat_four_match_repeat_encodeBlockAsm:
	LEAL -256(BX), BX
	MOVW $0x0019, (AX)
	MOVW BX, 2(AX)
	ADDQ $0x04, AX
	JMP repeat_end_emit_encodeBlockAsm

repeat_three_match_repeat_encodeBlockAsm:
	LEAL -4(BX), BX
	MOVW $0x0015, (AX)
	MOVB BL, 2(AX)
	ADDQ $0x03, AX
	JMP repeat_end_emit_encodeBlockAsm

repeat_two_match_repeat_encodeBlockAsm:
	SHLL $0x02, BX
	ORL $0x01, BX
	MOVW BX, (AX)
	ADDQ $0x02, AX
	JMP repeat_end_emit_encodeBlockAsm

repeat_two_offset_match_repeat_encodeBlockAsm:
	XORQ DI, DI
	LEAL 1(DI)(BX*4), BX
	MOVB SI, 1(AX)
	SARL $0x08, SI
	SHLL $0x05, SI
	ORL SI, BX
	MOVB BL, (AX)
	ADDQ $0x02, AX
	JMP repeat_end_emit_encodeBlockAsm

repeat_as_copy_encodeBlockAsm:
	// emitCopy
	// SI = offset, BX = length.
	//   offset >= 65536: while length > 64 write 0xff + 32-bit offset and
	//     subtract 64; continue with emitRepeat, or with the 5-byte
	//     "four_bytes_remain" form when 64 or fewer (or fewer than 4) remain.
	//   offset < 65536: see two_byte_offset below.
	CMPL SI, $0x00010000
	JL two_byte_offset_repeat_as_copy_encodeBlockAsm
	CMPL BX, $0x40
	JLE four_bytes_remain_repeat_as_copy_encodeBlockAsm
	MOVB $0xff, (AX)
	MOVL SI, 1(AX)
	LEAL -64(BX), BX
	ADDQ $0x05, AX
	CMPL BX, $0x04
	JL four_bytes_remain_repeat_as_copy_encodeBlockAsm

	// emitRepeat
	// Same repeat forms as listed at emit_repeat_again_match_repeat above.
emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy:
	MOVL BX, DI
	LEAL -4(BX), BX
	CMPL DI, $0x08
	JLE repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy
	CMPL DI, $0x0c
	JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy
	CMPL SI, $0x00000800
	JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy

cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy:
	CMPL BX, $0x00000104
	JLT repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy
	CMPL BX, $0x00010100
	JLT repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy
	CMPL BX, $0x0100ffff
	JLT repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy
	LEAL -16842747(BX), BX
	MOVL $0xfffb001d, (AX)
	MOVB $0xff, 4(AX)
	ADDQ $0x05, AX
	JMP emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy

repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy:
	LEAL -65536(BX), BX
	MOVL BX, SI
	MOVW $0x001d, (AX)
	MOVW BX, 2(AX)
	SARL $0x10, SI
	MOVB SI, 4(AX)
	ADDQ $0x05, AX
	JMP repeat_end_emit_encodeBlockAsm

repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy:
	LEAL -256(BX), BX
	MOVW $0x0019, (AX)
	MOVW BX, 2(AX)
	ADDQ $0x04, AX
	JMP repeat_end_emit_encodeBlockAsm

repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy:
	LEAL -4(BX), BX
	MOVW $0x0015, (AX)
	MOVB BL, 2(AX)
	ADDQ $0x03, AX
	JMP repeat_end_emit_encodeBlockAsm

repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy:
	SHLL $0x02, BX
	ORL $0x01, BX
	MOVW BX, (AX)
	ADDQ $0x02, AX
	JMP repeat_end_emit_encodeBlockAsm

repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy:
	XORQ DI, DI
	LEAL 1(DI)(BX*4), BX
	MOVB SI, 1(AX)
	SARL $0x08, SI
	SHLL $0x05, SI
	ORL SI, BX
	MOVB BL, (AX)
	ADDQ $0x02, AX
	JMP repeat_end_emit_encodeBlockAsm

four_bytes_remain_repeat_as_copy_encodeBlockAsm:
	// Nothing left: done. Otherwise tag = 4*length - 1, then 32-bit offset.
	TESTL BX, BX
	JZ repeat_end_emit_encodeBlockAsm
	XORL DI, DI
	LEAL -1(DI)(BX*4), BX
	MOVB BL, (AX)
	MOVL SI, 1(AX)
	ADDQ $0x05, AX
	JMP repeat_end_emit_encodeBlockAsm

two_byte_offset_repeat_as_copy_encodeBlockAsm:
	// offset < 65536. Lengths up to 64 go to two_byte_offset_short. Longer
	// matches with offset >= 2048 go to long_offset_short. Otherwise a
	// 2-byte copy with tag 17 | (offset>>8)<<5 is written, 8 is subtracted
	// from the length, and the rest is emitted as a repeat.
	CMPL BX, $0x40
	JLE two_byte_offset_short_repeat_as_copy_encodeBlockAsm
	CMPL SI, $0x00000800
	JAE long_offset_short_repeat_as_copy_encodeBlockAsm
	MOVL $0x00000001, DI
	LEAL 16(DI), DI
	MOVB SI, 1(AX)
	MOVL SI, R8
	SHRL $0x08, R8
	SHLL $0x05, R8
	ORL R8, DI
	MOVB DI, (AX)
	ADDQ $0x02, AX
	SUBL $0x08, BX

	// emitRepeat
	// Enters past the two short forms (jumps straight to cant_repeat_two_offset).
	LEAL -4(BX), BX
	JMP cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b

emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b:
	MOVL BX, DI
	LEAL -4(BX), BX
	CMPL DI, $0x08
	JLE repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b
	CMPL DI, $0x0c
	JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b
	CMPL SI, $0x00000800
	JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b

cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b:
	CMPL BX, $0x00000104
	JLT repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b
	CMPL BX, $0x00010100
	JLT repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b
	CMPL BX, $0x0100ffff
	JLT repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b
	LEAL -16842747(BX), BX
	MOVL $0xfffb001d, (AX)
	MOVB $0xff, 4(AX)
	ADDQ $0x05, AX
	JMP emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b

repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b:
	LEAL -65536(BX), BX
	MOVL BX, SI
	MOVW $0x001d, (AX)
	MOVW BX, 2(AX)
	SARL $0x10, SI
	MOVB SI, 4(AX)
	ADDQ $0x05, AX
	JMP repeat_end_emit_encodeBlockAsm

repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b:
	LEAL -256(BX), BX
	MOVW $0x0019, (AX)
	MOVW BX, 2(AX)
	ADDQ $0x04, AX
	JMP repeat_end_emit_encodeBlockAsm

repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b:
	LEAL -4(BX), BX
	MOVW $0x0015, (AX)
	MOVB BL, 2(AX)
	ADDQ $0x03, AX
	JMP repeat_end_emit_encodeBlockAsm

repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b:
	SHLL $0x02, BX
	ORL $0x01, BX
	MOVW BX, (AX)
	ADDQ $0x02, AX
	JMP repeat_end_emit_encodeBlockAsm

repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short_2b:
	XORQ DI, DI
	LEAL 1(DI)(BX*4), BX
	MOVB SI, 1(AX)
	SARL $0x08, SI
	SHLL $0x05, SI
	ORL SI, BX
	MOVB BL, (AX)
	ADDQ $0x02, AX
	JMP repeat_end_emit_encodeBlockAsm

long_offset_short_repeat_as_copy_encodeBlockAsm:
	// Write 0xee + 16-bit offset, subtract 60 from the length, then emit the
	// rest as a repeat.
	MOVB $0xee, (AX)
	MOVW SI, 1(AX)
	LEAL -60(BX), BX
	ADDQ $0x03, AX

	// emitRepeat
emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy_short:
	MOVL BX, DI
	LEAL -4(BX), BX
	CMPL DI, $0x08
	JLE repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy_short
	CMPL DI, $0x0c
	JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short
	CMPL SI, $0x00000800
	JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short

cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short:
	CMPL BX, $0x00000104
	JLT repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy_short
	CMPL BX, $0x00010100
	JLT repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy_short
	CMPL BX, $0x0100ffff
	JLT repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy_short
	LEAL -16842747(BX), BX
	MOVL $0xfffb001d, (AX)
	MOVB $0xff, 4(AX)
	ADDQ $0x05, AX
	JMP emit_repeat_again_repeat_as_copy_encodeBlockAsm_emit_copy_short

repeat_five_repeat_as_copy_encodeBlockAsm_emit_copy_short:
	LEAL -65536(BX), BX
	MOVL BX, SI
	MOVW $0x001d, (AX)
	MOVW BX, 2(AX)
	SARL $0x10, SI
	MOVB SI, 4(AX)
	ADDQ $0x05, AX
	JMP repeat_end_emit_encodeBlockAsm

repeat_four_repeat_as_copy_encodeBlockAsm_emit_copy_short:
	LEAL -256(BX), BX
	MOVW $0x0019, (AX)
	MOVW BX, 2(AX)
	ADDQ $0x04, AX
	JMP repeat_end_emit_encodeBlockAsm

repeat_three_repeat_as_copy_encodeBlockAsm_emit_copy_short:
	LEAL -4(BX), BX
	MOVW $0x0015, (AX)
	MOVB BL, 2(AX)
	ADDQ $0x03, AX
	JMP repeat_end_emit_encodeBlockAsm

repeat_two_repeat_as_copy_encodeBlockAsm_emit_copy_short:
	SHLL $0x02, BX
	ORL $0x01, BX
	MOVW BX, (AX)
	ADDQ $0x02, AX
	JMP repeat_end_emit_encodeBlockAsm

repeat_two_offset_repeat_as_copy_encodeBlockAsm_emit_copy_short:
	XORQ DI, DI
	LEAL 1(DI)(BX*4), BX
	MOVB SI, 1(AX)
	SARL $0x08, SI
	SHLL $0x05, SI
	ORL SI, BX
	MOVB BL, (AX)
	ADDQ $0x02, AX
	JMP repeat_end_emit_encodeBlockAsm

two_byte_offset_short_repeat_as_copy_encodeBlockAsm:
	// length <= 64, offset < 65536. With length < 12 and offset < 2048 a
	// 2-byte copy is written (tag 4*length-15 | (offset>>8)<<5, then the low
	// offset byte); otherwise a 3-byte copy (tag 4*length-2, 16-bit offset).
	MOVL BX, DI
	SHLL $0x02, DI
	CMPL BX, $0x0c
	JGE emit_copy_three_repeat_as_copy_encodeBlockAsm
	CMPL SI, $0x00000800
	JGE emit_copy_three_repeat_as_copy_encodeBlockAsm
	LEAL -15(DI), DI
	MOVB SI, 1(AX)
	SHRL $0x08, SI
	SHLL $0x05, SI
	ORL SI, DI
	MOVB DI, (AX)
	ADDQ $0x02, AX
	JMP repeat_end_emit_encodeBlockAsm

emit_copy_three_repeat_as_copy_encodeBlockAsm:
	LEAL -2(DI), DI
	MOVB DI, (AX)
	MOVW SI, 1(AX)
	ADDQ $0x03, AX

repeat_end_emit_encodeBlockAsm:
	// Everything up to CX has been emitted; resume searching.
	MOVL CX, 12(SP)
	JMP search_loop_encodeBlockAsm

no_repeat_found_encodeBlockAsm:
	// Try the candidate for CX (BX), then the one for CX+1 (DI), then the
	// one for CX+2 (loaded from the third hash slot), comparing 4 bytes each
	// time. The CX+2 slot is updated on every path that goes past the first
	// test. With no hit, continue at the position saved in 20(SP).
	CMPL (DX)(BX*1), SI
	JEQ candidate_match_encodeBlockAsm
	SHRQ $0x08, SI
	MOVL 24(SP)(R9*4), BX
	LEAL 2(CX), R8
	CMPL (DX)(DI*1), SI
	JEQ candidate2_match_encodeBlockAsm
	MOVL R8, 24(SP)(R9*4)
	SHRQ $0x08, SI
	CMPL (DX)(BX*1), SI
	JEQ candidate3_match_encodeBlockAsm
	MOVL 20(SP), CX
	JMP search_loop_encodeBlockAsm

candidate3_match_encodeBlockAsm:
	ADDL $0x02, CX
	JMP candidate_match_encodeBlockAsm

candidate2_match_encodeBlockAsm:
	MOVL R8, 24(SP)(R9*4)
	INCL CX
	MOVL DI, BX

candidate_match_encodeBlockAsm:
	// CX = match position, BX = candidate position. Extend backwards while
	// the preceding bytes agree, CX > 12(SP) and BX has not reached 0.
	MOVL 12(SP), SI
	TESTL BX, BX
	JZ match_extend_back_end_encodeBlockAsm

match_extend_back_loop_encodeBlockAsm:
	CMPL CX, SI
	JLE match_extend_back_end_encodeBlockAsm
	MOVB -1(DX)(BX*1), DI
	MOVB -1(DX)(CX*1), R8
	CMPB DI, R8
	JNE match_extend_back_end_encodeBlockAsm
	LEAL -1(CX), CX
	DECL BX
	JZ match_extend_back_end_encodeBlockAsm
	JMP match_extend_back_loop_encodeBlockAsm

match_extend_back_end_encodeBlockAsm:
	// Return 0 unless AX + pending literal length + 5 is below the dst limit.
	MOVL CX, SI
	SUBL 12(SP), SI
	LEAQ 5(AX)(SI*1), SI
	CMPQ SI, (SP)
	JL match_dst_size_check_encodeBlockAsm
	MOVQ $0x00000000, ret+48(FP)
	RET

match_dst_size_check_encodeBlockAsm:
	// Emit src[12(SP):CX] as literals. R8 = length, SI = source pointer,
	// DI = length-1. Tag forms are the same as in the repeat literal path.
	MOVL CX, SI
	MOVL 12(SP), DI
	CMPL DI, SI
	JEQ emit_literal_done_match_emit_encodeBlockAsm
	MOVL SI, R8
	MOVL SI, 12(SP)
	LEAQ (DX)(DI*1), SI
	SUBL DI, R8
	LEAL -1(R8), DI
	CMPL DI, $0x3c
	JLT one_byte_match_emit_encodeBlockAsm
	CMPL DI, $0x00000100
	JLT two_bytes_match_emit_encodeBlockAsm
	CMPL DI, $0x00010000
	JLT three_bytes_match_emit_encodeBlockAsm
	CMPL DI, $0x01000000
	JLT four_bytes_match_emit_encodeBlockAsm
	MOVB $0xfc, (AX)
	MOVL DI, 1(AX)
	ADDQ $0x05, AX
	JMP memmove_long_match_emit_encodeBlockAsm

four_bytes_match_emit_encodeBlockAsm:
	MOVL DI, R9
	SHRL $0x10, R9
	MOVB $0xf8, (AX)
	MOVW DI, 1(AX)
	MOVB R9, 3(AX)
	ADDQ $0x04, AX
	JMP memmove_long_match_emit_encodeBlockAsm

three_bytes_match_emit_encodeBlockAsm:
	MOVB $0xf4, (AX)
	MOVW DI, 1(AX)
	ADDQ $0x03, AX
	JMP memmove_long_match_emit_encodeBlockAsm

two_bytes_match_emit_encodeBlockAsm:
	MOVB $0xf0, (AX)
	MOVB DI, 1(AX)
	ADDQ $0x02, AX
	CMPL DI, $0x40
	JL memmove_match_emit_encodeBlockAsm
	JMP memmove_long_match_emit_encodeBlockAsm

one_byte_match_emit_encodeBlockAsm:
	SHLB $0x02, DI
	MOVB DI, (AX)
	ADDQ $0x01, AX

memmove_match_emit_encodeBlockAsm:
	// DI = dst pointer after the literals.
	LEAQ (AX)(R8*1), DI

	// genMemMoveShort
	// Lengths up to 8 copy a full 8 bytes; longer ones use overlapping moves.
	CMPQ R8, $0x08
	JLE emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8
	CMPQ R8, $0x10
	JBE emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8through16
	CMPQ R8, $0x20
	JBE emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_17through32
	JMP emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_33through64

emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8:
	MOVQ (SI), R9
	MOVQ R9, (AX)
	JMP memmove_end_copy_match_emit_encodeBlockAsm

emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_8through16:
	MOVQ (SI), R9
	MOVQ -8(SI)(R8*1), SI
	MOVQ R9, (AX)
	MOVQ SI, -8(AX)(R8*1)
	JMP memmove_end_copy_match_emit_encodeBlockAsm

emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_17through32:
	MOVOU (SI), X0
	MOVOU -16(SI)(R8*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(R8*1)
	JMP memmove_end_copy_match_emit_encodeBlockAsm

emit_lit_memmove_match_emit_encodeBlockAsm_memmove_move_33through64:
	MOVOU (SI), X0
	MOVOU 16(SI), X1
	MOVOU -32(SI)(R8*1), X2
	MOVOU -16(SI)(R8*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)

memmove_end_copy_match_emit_encodeBlockAsm:
	MOVQ DI, AX
	JMP emit_literal_done_match_emit_encodeBlockAsm

memmove_long_match_emit_encodeBlockAsm:
	// DI = dst pointer after the literals.
	LEAQ (AX)(R8*1), DI

	// genMemMoveLong
	// First/last 32 bytes are held in X0-X3 and stored last; the middle is
	// copied in 32-byte steps with aligned stores.
	MOVOU (SI), X0
	MOVOU 16(SI), X1
	MOVOU -32(SI)(R8*1), X2
	MOVOU -16(SI)(R8*1), X3
	MOVQ R8, R10
	SHRQ $0x05, R10
	MOVQ AX, R9
	ANDL $0x0000001f, R9
	MOVQ $0x00000040, R11
	SUBQ R9, R11
	DECQ R10
	JA emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_forward_sse_loop_32
	LEAQ -32(SI)(R11*1), R9
	LEAQ -32(AX)(R11*1), R12

emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_big_loop_back:
	MOVOU (R9), X4
	MOVOU 16(R9), X5
	MOVOA X4, (R12)
	MOVOA X5, 16(R12)
	ADDQ $0x20, R12
	ADDQ $0x20, R9
	ADDQ $0x20, R11
	DECQ R10
	JNA emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_big_loop_back

emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_forward_sse_loop_32:
	MOVOU -32(SI)(R11*1), X4
	MOVOU -16(SI)(R11*1), X5
	MOVOA X4, -32(AX)(R11*1)
	MOVOA X5, -16(AX)(R11*1)
	ADDQ $0x20, R11
	CMPQ R8, R11
	JAE emit_lit_memmove_long_match_emit_encodeBlockAsmlarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)
	MOVQ DI, AX

emit_literal_done_match_emit_encodeBlockAsm:
match_nolit_loop_encodeBlockAsm:
	// New repeat offset = CX - BX. Skip the 4 bytes already compared, then
	// measure the rest. SI = bytes left in src, DI/BX = the two pointers.
	MOVL CX, SI
	SUBL BX, SI
	MOVL SI, 16(SP)
	ADDL $0x04, CX
	ADDL $0x04, BX
	MOVQ src_len+32(FP), SI
	SUBL CX, SI
	LEAQ (DX)(CX*1), DI
	LEAQ (DX)(BX*1), BX

	// matchLen
	// R9 counts matching bytes: 8 at a time via XOR + bit scan, then 4/2/1.
	XORL R9, R9
	CMPL SI, $0x08
	JL matchlen_match4_match_nolit_encodeBlockAsm

matchlen_loopback_match_nolit_encodeBlockAsm:
	MOVQ (DI)(R9*1), R8
	XORQ (BX)(R9*1), R8
	TESTQ R8, R8
	JZ matchlen_loop_match_nolit_encodeBlockAsm

#ifdef GOAMD64_v3
	TZCNTQ R8, R8

#else
	BSFQ R8, R8

#endif
	SARQ $0x03, R8
	LEAL (R9)(R8*1), R9
	JMP match_nolit_end_encodeBlockAsm

matchlen_loop_match_nolit_encodeBlockAsm:
	LEAL -8(SI), SI
	LEAL 8(R9), R9
	CMPL SI, $0x08
	JGE matchlen_loopback_match_nolit_encodeBlockAsm
	JZ match_nolit_end_encodeBlockAsm

matchlen_match4_match_nolit_encodeBlockAsm:
	CMPL SI, $0x04
	JL matchlen_match2_match_nolit_encodeBlockAsm
	MOVL (DI)(R9*1), R8
	CMPL (BX)(R9*1), R8
	JNE matchlen_match2_match_nolit_encodeBlockAsm
	SUBL $0x04, SI
	LEAL 4(R9), R9

matchlen_match2_match_nolit_encodeBlockAsm:
	CMPL SI, $0x02
	JL matchlen_match1_match_nolit_encodeBlockAsm
	MOVW (DI)(R9*1), R8
	CMPW (BX)(R9*1), R8
	JNE matchlen_match1_match_nolit_encodeBlockAsm
	SUBL $0x02, SI
	LEAL 2(R9), R9

matchlen_match1_match_nolit_encodeBlockAsm:
	CMPL SI, $0x01
	JL match_nolit_end_encodeBlockAsm
	MOVB (DI)(R9*1), R8
	CMPB (BX)(R9*1), R8
	JNE match_nolit_end_encodeBlockAsm
	LEAL 1(R9), R9

match_nolit_end_encodeBlockAsm:
	// CX moves past the match; R9 = full match length (extension + 4),
	// BX = offset. 12(SP) is updated because the match covers up to CX.
	ADDL R9, CX
	MOVL 16(SP), BX
	ADDL $0x04, R9
	MOVL CX, 12(SP)

	// emitCopy
	// Same structure as repeat_as_copy above, with BX = offset, R9 = length.
	CMPL BX, $0x00010000
	JL two_byte_offset_match_nolit_encodeBlockAsm
	CMPL R9, $0x40
	JLE four_bytes_remain_match_nolit_encodeBlockAsm
	MOVB $0xff, (AX)
	MOVL BX, 1(AX)
	LEAL -64(R9), R9
	ADDQ $0x05, AX
	CMPL R9, $0x04
	JL four_bytes_remain_match_nolit_encodeBlockAsm

	// emitRepeat
	// Same repeat forms as listed at emit_repeat_again_match_repeat above,
	// with R9 = length and BX = offset.
emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy:
	MOVL R9, SI
	LEAL -4(R9), R9
	CMPL SI, $0x08
	JLE repeat_two_match_nolit_encodeBlockAsm_emit_copy
	CMPL SI, $0x0c
	JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy
	CMPL BX, $0x00000800
	JLT repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy

cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy:
	CMPL R9, $0x00000104
	JLT repeat_three_match_nolit_encodeBlockAsm_emit_copy
	CMPL R9, $0x00010100
	JLT repeat_four_match_nolit_encodeBlockAsm_emit_copy
	CMPL R9, $0x0100ffff
	JLT repeat_five_match_nolit_encodeBlockAsm_emit_copy
	LEAL -16842747(R9), R9
	MOVL $0xfffb001d, (AX)
	MOVB $0xff, 4(AX)
	ADDQ $0x05, AX
	JMP emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy

repeat_five_match_nolit_encodeBlockAsm_emit_copy:
	LEAL -65536(R9), R9
	MOVL R9, BX
	MOVW $0x001d, (AX)
	MOVW R9, 2(AX)
	SARL $0x10, BX
	MOVB BL, 4(AX)
	ADDQ $0x05, AX
	JMP match_nolit_emitcopy_end_encodeBlockAsm

repeat_four_match_nolit_encodeBlockAsm_emit_copy:
	LEAL -256(R9), R9
	MOVW $0x0019, (AX)
	MOVW R9, 2(AX)
	ADDQ $0x04, AX
	JMP match_nolit_emitcopy_end_encodeBlockAsm

repeat_three_match_nolit_encodeBlockAsm_emit_copy:
	LEAL -4(R9), R9
	MOVW $0x0015, (AX)
	MOVB R9, 2(AX)
	ADDQ $0x03, AX
	JMP match_nolit_emitcopy_end_encodeBlockAsm

repeat_two_match_nolit_encodeBlockAsm_emit_copy:
	SHLL $0x02, R9
	ORL $0x01, R9
	MOVW R9, (AX)
	ADDQ $0x02, AX
	JMP match_nolit_emitcopy_end_encodeBlockAsm

repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy:
	XORQ SI, SI
	LEAL 1(SI)(R9*4), R9
	MOVB BL, 1(AX)
	SARL $0x08, BX
	SHLL $0x05, BX
	ORL BX, R9
	MOVB R9, (AX)
	ADDQ $0x02, AX
	JMP match_nolit_emitcopy_end_encodeBlockAsm

four_bytes_remain_match_nolit_encodeBlockAsm:
	// Nothing left: done. Otherwise tag = 4*length - 1, then 32-bit offset.
	TESTL R9, R9
	JZ match_nolit_emitcopy_end_encodeBlockAsm
	XORL SI, SI
	LEAL -1(SI)(R9*4), R9
	MOVB R9, (AX)
	MOVL BX, 1(AX)
	ADDQ $0x05, AX
	JMP match_nolit_emitcopy_end_encodeBlockAsm

two_byte_offset_match_nolit_encodeBlockAsm:
	// offset < 65536; see two_byte_offset_repeat_as_copy for the cases.
	CMPL R9, $0x40
	JLE two_byte_offset_short_match_nolit_encodeBlockAsm
	CMPL BX, $0x00000800
	JAE long_offset_short_match_nolit_encodeBlockAsm
	MOVL $0x00000001, SI
	LEAL 16(SI), SI
	MOVB BL, 1(AX)
	MOVL BX, DI
	SHRL $0x08, DI
	SHLL $0x05, DI
	ORL DI, SI
	MOVB SI, (AX)
	ADDQ $0x02, AX
	SUBL $0x08, R9

	// emitRepeat
	// Enters past the two short forms (jumps straight to cant_repeat_two_offset).
	LEAL -4(R9), R9
	JMP cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short_2b

emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy_short_2b:
	MOVL R9, SI
	LEAL -4(R9), R9
	CMPL SI, $0x08
	JLE repeat_two_match_nolit_encodeBlockAsm_emit_copy_short_2b
	CMPL SI, $0x0c
	JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short_2b
	CMPL BX, $0x00000800
	JLT repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short_2b

cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short_2b:
	CMPL R9, $0x00000104
	JLT repeat_three_match_nolit_encodeBlockAsm_emit_copy_short_2b
	CMPL R9, $0x00010100
	JLT repeat_four_match_nolit_encodeBlockAsm_emit_copy_short_2b
	CMPL R9, $0x0100ffff
	JLT repeat_five_match_nolit_encodeBlockAsm_emit_copy_short_2b
	LEAL -16842747(R9), R9
	MOVL $0xfffb001d, (AX)
	MOVB $0xff, 4(AX)
	ADDQ $0x05, AX
	JMP emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy_short_2b

repeat_five_match_nolit_encodeBlockAsm_emit_copy_short_2b:
	LEAL -65536(R9), R9
	MOVL R9, BX
	MOVW $0x001d, (AX)
	MOVW R9, 2(AX)
	SARL $0x10, BX
	MOVB BL, 4(AX)
	ADDQ $0x05, AX
	JMP match_nolit_emitcopy_end_encodeBlockAsm

repeat_four_match_nolit_encodeBlockAsm_emit_copy_short_2b:
	LEAL -256(R9), R9
	MOVW $0x0019, (AX)
	MOVW R9, 2(AX)
	ADDQ $0x04, AX
	JMP match_nolit_emitcopy_end_encodeBlockAsm

repeat_three_match_nolit_encodeBlockAsm_emit_copy_short_2b:
	LEAL -4(R9), R9
	MOVW $0x0015, (AX)
	MOVB R9, 2(AX)
	ADDQ $0x03, AX
	JMP match_nolit_emitcopy_end_encodeBlockAsm

repeat_two_match_nolit_encodeBlockAsm_emit_copy_short_2b:
	SHLL $0x02, R9
	ORL $0x01, R9
	MOVW R9, (AX)
	ADDQ $0x02, AX
	JMP match_nolit_emitcopy_end_encodeBlockAsm

repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short_2b:
	XORQ SI, SI
	LEAL 1(SI)(R9*4), R9
	MOVB BL, 1(AX)
	SARL $0x08, BX
	SHLL $0x05, BX
	ORL BX, R9
	MOVB R9, (AX)
	ADDQ $0x02, AX
	JMP match_nolit_emitcopy_end_encodeBlockAsm

long_offset_short_match_nolit_encodeBlockAsm:
	// Write 0xee + 16-bit offset, subtract 60 from the length, then emit the
	// rest as a repeat.
	MOVB $0xee, (AX)
	MOVW BX, 1(AX)
	LEAL -60(R9), R9
	ADDQ $0x03, AX

	// emitRepeat
emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy_short:
	MOVL R9, SI
	LEAL -4(R9), R9
	CMPL SI, $0x08
	JLE repeat_two_match_nolit_encodeBlockAsm_emit_copy_short
	CMPL SI, $0x0c
	JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short
	CMPL BX, $0x00000800
	JLT repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short

cant_repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short:
	CMPL R9, $0x00000104
	JLT repeat_three_match_nolit_encodeBlockAsm_emit_copy_short
	CMPL R9, $0x00010100
	JLT repeat_four_match_nolit_encodeBlockAsm_emit_copy_short
	CMPL R9, $0x0100ffff
	JLT repeat_five_match_nolit_encodeBlockAsm_emit_copy_short
	LEAL -16842747(R9), R9
	MOVL $0xfffb001d, (AX)
	MOVB $0xff, 4(AX)
	ADDQ $0x05, AX
	JMP emit_repeat_again_match_nolit_encodeBlockAsm_emit_copy_short

repeat_five_match_nolit_encodeBlockAsm_emit_copy_short:
	LEAL -65536(R9), R9
	MOVL R9, BX
	MOVW $0x001d, (AX)
	MOVW R9, 2(AX)
	SARL $0x10, BX
	MOVB BL, 4(AX)
	ADDQ $0x05, AX
	JMP match_nolit_emitcopy_end_encodeBlockAsm

repeat_four_match_nolit_encodeBlockAsm_emit_copy_short:
	LEAL -256(R9), R9
	MOVW $0x0019, (AX)
	MOVW R9, 2(AX)
	ADDQ $0x04, AX
	JMP match_nolit_emitcopy_end_encodeBlockAsm

repeat_three_match_nolit_encodeBlockAsm_emit_copy_short:
	LEAL -4(R9), R9
	MOVW $0x0015, (AX)
	MOVB R9, 2(AX)
	ADDQ $0x03, AX
	JMP match_nolit_emitcopy_end_encodeBlockAsm

repeat_two_match_nolit_encodeBlockAsm_emit_copy_short:
	SHLL $0x02, R9
	ORL $0x01, R9
	MOVW R9, (AX)
	ADDQ $0x02, AX
	JMP match_nolit_emitcopy_end_encodeBlockAsm

repeat_two_offset_match_nolit_encodeBlockAsm_emit_copy_short:
	XORQ SI, SI
	LEAL 1(SI)(R9*4), R9
	MOVB BL, 1(AX)
	SARL $0x08, BX
	SHLL $0x05, BX
	ORL BX, R9
	MOVB R9, (AX)
	ADDQ $0x02, AX
	JMP match_nolit_emitcopy_end_encodeBlockAsm

two_byte_offset_short_match_nolit_encodeBlockAsm:
	// length <= 64, offset < 65536: 2-byte copy when length < 12 and
	// offset < 2048, otherwise 3-byte copy with a 16-bit offset.
	MOVL R9, SI
	SHLL $0x02, SI
	CMPL R9, $0x0c
	JGE emit_copy_three_match_nolit_encodeBlockAsm
	CMPL BX, $0x00000800
	JGE emit_copy_three_match_nolit_encodeBlockAsm
	LEAL -15(SI), SI
	MOVB BL, 1(AX)
	SHRL $0x08, BX
	SHLL $0x05, BX
	ORL BX, SI
	MOVB SI, (AX)
	ADDQ $0x02, AX
	JMP match_nolit_emitcopy_end_encodeBlockAsm

emit_copy_three_match_nolit_encodeBlockAsm:
	LEAL -2(SI), SI
	MOVB SI, (AX)
	MOVW BX, 1(AX)
	ADDQ $0x03, AX

match_nolit_emitcopy_end_encodeBlockAsm:
	// Past the search limit: flush the tail. Otherwise load 8 bytes at CX-2
	// and return 0 if AX has reached the dst limit.
	CMPL CX, 8(SP)
	JGE emit_remainder_encodeBlockAsm
	MOVQ -2(DX)(CX*1), SI
	CMPQ AX, (SP)
	JL match_nolit_dst_ok_encodeBlockAsm
	MOVQ $0x00000000, ret+48(FP)
	RET

match_nolit_dst_ok_encodeBlockAsm:
	// DI = hash of the bytes at CX-2, BX = hash of the bytes at CX. Store
	// CX-2 in its slot, swap CX into the other slot and take the previous
	// entry as candidate. If its 4 bytes equal those at CX, start another
	// match immediately (no literals); otherwise advance one byte and search.
	MOVQ $0x0000cf1bbcdcbf9b, R8
	MOVQ SI, DI
	SHRQ $0x10, SI
	MOVQ SI, BX
	SHLQ $0x10, DI
	IMULQ R8, DI
	SHRQ $0x32, DI
	SHLQ $0x10, BX
	IMULQ R8, BX
	SHRQ $0x32, BX
	LEAL -2(CX), R8
	LEAQ 24(SP)(BX*4), R9
	MOVL (R9), BX
	MOVL R8, 24(SP)(DI*4)
	MOVL CX, (R9)
	CMPL (DX)(BX*1), SI
	JEQ match_nolit_loop_encodeBlockAsm
	INCL CX
	JMP search_loop_encodeBlockAsm

emit_remainder_encodeBlockAsm:
	// Return 0 unless AX + (src_len - 12(SP)) + 5 is below the dst limit.
	MOVQ src_len+32(FP), CX
	SUBL 12(SP), CX
	LEAQ 5(AX)(CX*1), CX
	CMPQ CX, (SP)
	JL emit_remainder_ok_encodeBlockAsm
	MOVQ $0x00000000, ret+48(FP)
	RET

emit_remainder_ok_encodeBlockAsm:
	// Emit src[12(SP):src_len] as literals (skipped when empty). From here
	// CX = source pointer, SI = length, DX = length-1 (DX no longer holds the
	// src base). Tag forms are the same as in the other literal paths.
	MOVQ src_len+32(FP), CX
	MOVL 12(SP), BX
	CMPL BX, CX
	JEQ emit_literal_done_emit_remainder_encodeBlockAsm
	MOVL CX, SI
	MOVL CX, 12(SP)
	LEAQ (DX)(BX*1), CX
	SUBL BX, SI
	LEAL -1(SI), DX
	CMPL DX, $0x3c
	JLT one_byte_emit_remainder_encodeBlockAsm
	CMPL DX, $0x00000100
	JLT two_bytes_emit_remainder_encodeBlockAsm
	CMPL DX, $0x00010000
	JLT three_bytes_emit_remainder_encodeBlockAsm
	CMPL DX, $0x01000000
	JLT four_bytes_emit_remainder_encodeBlockAsm
	MOVB $0xfc, (AX)
	MOVL DX, 1(AX)
	ADDQ $0x05, AX
	JMP memmove_long_emit_remainder_encodeBlockAsm

four_bytes_emit_remainder_encodeBlockAsm:
	MOVL DX, BX
	SHRL $0x10, BX
	MOVB $0xf8, (AX)
	MOVW DX, 1(AX)
	MOVB BL, 3(AX)
	ADDQ $0x04, AX
	JMP memmove_long_emit_remainder_encodeBlockAsm

three_bytes_emit_remainder_encodeBlockAsm:
	MOVB $0xf4, (AX)
	MOVW DX, 1(AX)
	ADDQ $0x03, AX
	JMP memmove_long_emit_remainder_encodeBlockAsm

two_bytes_emit_remainder_encodeBlockAsm:
	MOVB $0xf0, (AX)
	MOVB DL, 1(AX)
	ADDQ $0x02, AX
	CMPL DX, $0x40
	JL memmove_emit_remainder_encodeBlockAsm
	JMP memmove_long_emit_remainder_encodeBlockAsm

one_byte_emit_remainder_encodeBlockAsm:
	SHLB $0x02, DL
	MOVB DL, (AX)
	ADDQ $0x01, AX

memmove_emit_remainder_encodeBlockAsm:
	// DX = dst pointer after the literals, BX = length.
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveShort
	// Unlike the in-loop copies, this variant has separate 1-2, 3 and 4-7
	// byte cases, so it moves exactly BX bytes.
	CMPQ BX, $0x03
	JB emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_1or2
	JE emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_3
	CMPQ BX, $0x08
	JB emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_4through7
	CMPQ BX, $0x10
	JBE emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_8through16
	CMPQ BX, $0x20
	JBE emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_17through32
	JMP emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_33through64

emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_1or2:
	MOVB (CX), SI
	MOVB -1(CX)(BX*1), CL
	MOVB SI, (AX)
	MOVB CL, -1(AX)(BX*1)
	JMP memmove_end_copy_emit_remainder_encodeBlockAsm

emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_3:
	MOVW (CX), SI
	MOVB 2(CX), CL
	MOVW SI, (AX)
	MOVB CL, 2(AX)
	JMP memmove_end_copy_emit_remainder_encodeBlockAsm

emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_4through7:
	MOVL (CX), SI
	MOVL -4(CX)(BX*1), CX
	MOVL SI, (AX)
	MOVL CX, -4(AX)(BX*1)
	JMP memmove_end_copy_emit_remainder_encodeBlockAsm

emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_8through16:
	MOVQ (CX), SI
	MOVQ -8(CX)(BX*1), CX
	MOVQ SI, (AX)
	MOVQ CX, -8(AX)(BX*1)
	JMP memmove_end_copy_emit_remainder_encodeBlockAsm

emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_17through32:
	MOVOU (CX), X0
	MOVOU -16(CX)(BX*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(BX*1)
	JMP memmove_end_copy_emit_remainder_encodeBlockAsm

emit_lit_memmove_emit_remainder_encodeBlockAsm_memmove_move_33through64:
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)

memmove_end_copy_emit_remainder_encodeBlockAsm:
	MOVQ DX, AX
	JMP emit_literal_done_emit_remainder_encodeBlockAsm

memmove_long_emit_remainder_encodeBlockAsm:
	// DX = dst pointer after the literals, BX = length.
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveLong
	// First/last 32 bytes are held in X0-X3 and stored last; the middle is
	// copied in 32-byte steps with aligned stores.
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	MOVQ BX, DI
	SHRQ $0x05, DI
	MOVQ AX, SI
	ANDL $0x0000001f, SI
	MOVQ $0x00000040, R8
	SUBQ SI, R8
	DECQ DI
	JA emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_forward_sse_loop_32
	LEAQ -32(CX)(R8*1), SI
	LEAQ -32(AX)(R8*1), R9

emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_big_loop_back:
	MOVOU (SI), X4
	MOVOU 16(SI), X5
	MOVOA X4, (R9)
	MOVOA X5, 16(R9)
	ADDQ $0x20, R9
	ADDQ $0x20, SI
	ADDQ $0x20, R8
	DECQ DI
	JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_big_loop_back

emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_forward_sse_loop_32:
	MOVOU -32(CX)(R8*1), X4
	MOVOU -16(CX)(R8*1), X5
	MOVOA X4, -32(AX)(R8*1)
	MOVOA X5, -16(AX)(R8*1)
	ADDQ $0x20, R8
	CMPQ BX, R8
	JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsmlarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)
	MOVQ DX, AX

emit_literal_done_emit_remainder_encodeBlockAsm:
	// Result = bytes written = AX - dst_base.
	MOVQ dst_base+0(FP), CX
	SUBQ CX, AX
	MOVQ AX, ret+48(FP)
	RET

// func encodeBlockAsm4MB(dst []byte, src []byte) int // Requires: BMI, SSE2 TEXT ·encodeBlockAsm4MB(SB), $65560-56 MOVQ dst_base+0(FP), AX MOVQ $0x00000200, CX LEAQ 24(SP), DX PXOR X0, X0 zero_loop_encodeBlockAsm4MB: MOVOU X0, (DX) MOVOU X0, 16(DX) MOVOU X0, 32(DX) MOVOU X0, 48(DX) MOVOU X0, 64(DX) MOVOU X0, 80(DX) MOVOU X0, 96(DX) MOVOU X0, 112(DX) ADDQ $0x80, DX DECQ CX JNZ zero_loop_encodeBlockAsm4MB MOVL $0x00000000, 12(SP) MOVQ src_len+32(FP), CX LEAQ -9(CX), DX LEAQ -8(CX), BX MOVL BX, 8(SP) SHRQ $0x05, CX SUBL CX, DX LEAQ (AX)(DX*1), DX MOVQ DX, (SP) MOVL $0x00000001, CX MOVL CX, 16(SP) MOVQ src_base+24(FP), DX search_loop_encodeBlockAsm4MB: MOVL CX, BX SUBL 12(SP), BX SHRL $0x06, BX LEAL 4(CX)(BX*1), BX CMPL BX, 8(SP) JGE emit_remainder_encodeBlockAsm4MB MOVQ (DX)(CX*1), SI MOVL BX, 20(SP) MOVQ $0x0000cf1bbcdcbf9b, R8 MOVQ SI, R9 MOVQ SI, R10 SHRQ $0x08, R10 SHLQ $0x10, R9 IMULQ R8, R9 SHRQ $0x32, R9 SHLQ $0x10, R10 IMULQ R8, R10 SHRQ $0x32, R10 MOVL 24(SP)(R9*4), BX MOVL 24(SP)(R10*4), DI MOVL CX, 24(SP)(R9*4) LEAL 1(CX), R9 MOVL R9, 24(SP)(R10*4) MOVQ SI, R9 SHRQ $0x10, R9 SHLQ $0x10, R9 IMULQ R8, R9 SHRQ $0x32, R9 MOVL CX, R8 SUBL 16(SP), R8 MOVL 1(DX)(R8*1), R10 MOVQ SI, R8 SHRQ $0x08, R8 CMPL R8, R10 JNE 
no_repeat_found_encodeBlockAsm4MB LEAL 1(CX), SI MOVL 12(SP), DI MOVL SI, BX SUBL 16(SP), BX JZ repeat_extend_back_end_encodeBlockAsm4MB repeat_extend_back_loop_encodeBlockAsm4MB: CMPL SI, DI JLE repeat_extend_back_end_encodeBlockAsm4MB MOVB -1(DX)(BX*1), R8 MOVB -1(DX)(SI*1), R9 CMPB R8, R9 JNE repeat_extend_back_end_encodeBlockAsm4MB LEAL -1(SI), SI DECL BX JNZ repeat_extend_back_loop_encodeBlockAsm4MB repeat_extend_back_end_encodeBlockAsm4MB: MOVL 12(SP), BX CMPL BX, SI JEQ emit_literal_done_repeat_emit_encodeBlockAsm4MB MOVL SI, R8 MOVL SI, 12(SP) LEAQ (DX)(BX*1), R9 SUBL BX, R8 LEAL -1(R8), BX CMPL BX, $0x3c JLT one_byte_repeat_emit_encodeBlockAsm4MB CMPL BX, $0x00000100 JLT two_bytes_repeat_emit_encodeBlockAsm4MB CMPL BX, $0x00010000 JLT three_bytes_repeat_emit_encodeBlockAsm4MB MOVL BX, R10 SHRL $0x10, R10 MOVB $0xf8, (AX) MOVW BX, 1(AX) MOVB R10, 3(AX) ADDQ $0x04, AX JMP memmove_long_repeat_emit_encodeBlockAsm4MB three_bytes_repeat_emit_encodeBlockAsm4MB: MOVB $0xf4, (AX) MOVW BX, 1(AX) ADDQ $0x03, AX JMP memmove_long_repeat_emit_encodeBlockAsm4MB two_bytes_repeat_emit_encodeBlockAsm4MB: MOVB $0xf0, (AX) MOVB BL, 1(AX) ADDQ $0x02, AX CMPL BX, $0x40 JL memmove_repeat_emit_encodeBlockAsm4MB JMP memmove_long_repeat_emit_encodeBlockAsm4MB one_byte_repeat_emit_encodeBlockAsm4MB: SHLB $0x02, BL MOVB BL, (AX) ADDQ $0x01, AX memmove_repeat_emit_encodeBlockAsm4MB: LEAQ (AX)(R8*1), BX // genMemMoveShort CMPQ R8, $0x08 JLE emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_8 CMPQ R8, $0x10 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_8through16 CMPQ R8, $0x20 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_17through32 JMP emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_33through64 emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_8: MOVQ (R9), R10 MOVQ R10, (AX) JMP memmove_end_copy_repeat_emit_encodeBlockAsm4MB emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_8through16: MOVQ (R9), R10 MOVQ 
-8(R9)(R8*1), R9 MOVQ R10, (AX) MOVQ R9, -8(AX)(R8*1) JMP memmove_end_copy_repeat_emit_encodeBlockAsm4MB emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_17through32: MOVOU (R9), X0 MOVOU -16(R9)(R8*1), X1 MOVOU X0, (AX) MOVOU X1, -16(AX)(R8*1) JMP memmove_end_copy_repeat_emit_encodeBlockAsm4MB emit_lit_memmove_repeat_emit_encodeBlockAsm4MB_memmove_move_33through64: MOVOU (R9), X0 MOVOU 16(R9), X1 MOVOU -32(R9)(R8*1), X2 MOVOU -16(R9)(R8*1), X3 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(R8*1) MOVOU X3, -16(AX)(R8*1) memmove_end_copy_repeat_emit_encodeBlockAsm4MB: MOVQ BX, AX JMP emit_literal_done_repeat_emit_encodeBlockAsm4MB memmove_long_repeat_emit_encodeBlockAsm4MB: LEAQ (AX)(R8*1), BX // genMemMoveLong MOVOU (R9), X0 MOVOU 16(R9), X1 MOVOU -32(R9)(R8*1), X2 MOVOU -16(R9)(R8*1), X3 MOVQ R8, R11 SHRQ $0x05, R11 MOVQ AX, R10 ANDL $0x0000001f, R10 MOVQ $0x00000040, R12 SUBQ R10, R12 DECQ R11 JA emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32 LEAQ -32(R9)(R12*1), R10 LEAQ -32(AX)(R12*1), R13 emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_big_loop_back: MOVOU (R10), X4 MOVOU 16(R10), X5 MOVOA X4, (R13) MOVOA X5, 16(R13) ADDQ $0x20, R13 ADDQ $0x20, R10 ADDQ $0x20, R12 DECQ R11 JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_big_loop_back emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32: MOVOU -32(R9)(R12*1), X4 MOVOU -16(R9)(R12*1), X5 MOVOA X4, -32(AX)(R12*1) MOVOA X5, -16(AX)(R12*1) ADDQ $0x20, R12 CMPQ R8, R12 JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(R8*1) MOVOU X3, -16(AX)(R8*1) MOVQ BX, AX emit_literal_done_repeat_emit_encodeBlockAsm4MB: ADDL $0x05, CX MOVL CX, BX SUBL 16(SP), BX MOVQ src_len+32(FP), R8 SUBL CX, R8 LEAQ (DX)(CX*1), R9 LEAQ (DX)(BX*1), BX // matchLen XORL R11, R11 CMPL R8, $0x08 JL matchlen_match4_repeat_extend_encodeBlockAsm4MB 
matchlen_loopback_repeat_extend_encodeBlockAsm4MB: MOVQ (R9)(R11*1), R10 XORQ (BX)(R11*1), R10 TESTQ R10, R10 JZ matchlen_loop_repeat_extend_encodeBlockAsm4MB #ifdef GOAMD64_v3 TZCNTQ R10, R10 #else BSFQ R10, R10 #endif SARQ $0x03, R10 LEAL (R11)(R10*1), R11 JMP repeat_extend_forward_end_encodeBlockAsm4MB matchlen_loop_repeat_extend_encodeBlockAsm4MB: LEAL -8(R8), R8 LEAL 8(R11), R11 CMPL R8, $0x08 JGE matchlen_loopback_repeat_extend_encodeBlockAsm4MB JZ repeat_extend_forward_end_encodeBlockAsm4MB matchlen_match4_repeat_extend_encodeBlockAsm4MB: CMPL R8, $0x04 JL matchlen_match2_repeat_extend_encodeBlockAsm4MB MOVL (R9)(R11*1), R10 CMPL (BX)(R11*1), R10 JNE matchlen_match2_repeat_extend_encodeBlockAsm4MB SUBL $0x04, R8 LEAL 4(R11), R11 matchlen_match2_repeat_extend_encodeBlockAsm4MB: CMPL R8, $0x02 JL matchlen_match1_repeat_extend_encodeBlockAsm4MB MOVW (R9)(R11*1), R10 CMPW (BX)(R11*1), R10 JNE matchlen_match1_repeat_extend_encodeBlockAsm4MB SUBL $0x02, R8 LEAL 2(R11), R11 matchlen_match1_repeat_extend_encodeBlockAsm4MB: CMPL R8, $0x01 JL repeat_extend_forward_end_encodeBlockAsm4MB MOVB (R9)(R11*1), R10 CMPB (BX)(R11*1), R10 JNE repeat_extend_forward_end_encodeBlockAsm4MB LEAL 1(R11), R11 repeat_extend_forward_end_encodeBlockAsm4MB: ADDL R11, CX MOVL CX, BX SUBL SI, BX MOVL 16(SP), SI TESTL DI, DI JZ repeat_as_copy_encodeBlockAsm4MB // emitRepeat MOVL BX, DI LEAL -4(BX), BX CMPL DI, $0x08 JLE repeat_two_match_repeat_encodeBlockAsm4MB CMPL DI, $0x0c JGE cant_repeat_two_offset_match_repeat_encodeBlockAsm4MB CMPL SI, $0x00000800 JLT repeat_two_offset_match_repeat_encodeBlockAsm4MB cant_repeat_two_offset_match_repeat_encodeBlockAsm4MB: CMPL BX, $0x00000104 JLT repeat_three_match_repeat_encodeBlockAsm4MB CMPL BX, $0x00010100 JLT repeat_four_match_repeat_encodeBlockAsm4MB LEAL -65536(BX), BX MOVL BX, SI MOVW $0x001d, (AX) MOVW BX, 2(AX) SARL $0x10, SI MOVB SI, 4(AX) ADDQ $0x05, AX JMP repeat_end_emit_encodeBlockAsm4MB repeat_four_match_repeat_encodeBlockAsm4MB: LEAL 
-256(BX), BX MOVW $0x0019, (AX) MOVW BX, 2(AX) ADDQ $0x04, AX JMP repeat_end_emit_encodeBlockAsm4MB repeat_three_match_repeat_encodeBlockAsm4MB: LEAL -4(BX), BX MOVW $0x0015, (AX) MOVB BL, 2(AX) ADDQ $0x03, AX JMP repeat_end_emit_encodeBlockAsm4MB repeat_two_match_repeat_encodeBlockAsm4MB: SHLL $0x02, BX ORL $0x01, BX MOVW BX, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeBlockAsm4MB repeat_two_offset_match_repeat_encodeBlockAsm4MB: XORQ DI, DI LEAL 1(DI)(BX*4), BX MOVB SI, 1(AX) SARL $0x08, SI SHLL $0x05, SI ORL SI, BX MOVB BL, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeBlockAsm4MB repeat_as_copy_encodeBlockAsm4MB: // emitCopy CMPL SI, $0x00010000 JL two_byte_offset_repeat_as_copy_encodeBlockAsm4MB CMPL BX, $0x40 JLE four_bytes_remain_repeat_as_copy_encodeBlockAsm4MB MOVB $0xff, (AX) MOVL SI, 1(AX) LEAL -64(BX), BX ADDQ $0x05, AX CMPL BX, $0x04 JL four_bytes_remain_repeat_as_copy_encodeBlockAsm4MB // emitRepeat MOVL BX, DI LEAL -4(BX), BX CMPL DI, $0x08 JLE repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy CMPL DI, $0x0c JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy CMPL SI, $0x00000800 JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy: CMPL BX, $0x00000104 JLT repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy CMPL BX, $0x00010100 JLT repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy LEAL -65536(BX), BX MOVL BX, SI MOVW $0x001d, (AX) MOVW BX, 2(AX) SARL $0x10, SI MOVB SI, 4(AX) ADDQ $0x05, AX JMP repeat_end_emit_encodeBlockAsm4MB repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy: LEAL -256(BX), BX MOVW $0x0019, (AX) MOVW BX, 2(AX) ADDQ $0x04, AX JMP repeat_end_emit_encodeBlockAsm4MB repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy: LEAL -4(BX), BX MOVW $0x0015, (AX) MOVB BL, 2(AX) ADDQ $0x03, AX JMP repeat_end_emit_encodeBlockAsm4MB repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy: SHLL $0x02, BX ORL $0x01, BX MOVW BX, (AX) 
ADDQ $0x02, AX JMP repeat_end_emit_encodeBlockAsm4MB repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy: XORQ DI, DI LEAL 1(DI)(BX*4), BX MOVB SI, 1(AX) SARL $0x08, SI SHLL $0x05, SI ORL SI, BX MOVB BL, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeBlockAsm4MB four_bytes_remain_repeat_as_copy_encodeBlockAsm4MB: TESTL BX, BX JZ repeat_end_emit_encodeBlockAsm4MB XORL DI, DI LEAL -1(DI)(BX*4), BX MOVB BL, (AX) MOVL SI, 1(AX) ADDQ $0x05, AX JMP repeat_end_emit_encodeBlockAsm4MB two_byte_offset_repeat_as_copy_encodeBlockAsm4MB: CMPL BX, $0x40 JLE two_byte_offset_short_repeat_as_copy_encodeBlockAsm4MB CMPL SI, $0x00000800 JAE long_offset_short_repeat_as_copy_encodeBlockAsm4MB MOVL $0x00000001, DI LEAL 16(DI), DI MOVB SI, 1(AX) SHRL $0x08, SI SHLL $0x05, SI ORL SI, DI MOVB DI, (AX) ADDQ $0x02, AX SUBL $0x08, BX // emitRepeat LEAL -4(BX), BX JMP cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b MOVL BX, DI LEAL -4(BX), BX CMPL DI, $0x08 JLE repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b CMPL DI, $0x0c JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b CMPL SI, $0x00000800 JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b: CMPL BX, $0x00000104 JLT repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b CMPL BX, $0x00010100 JLT repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b LEAL -65536(BX), BX MOVL BX, SI MOVW $0x001d, (AX) MOVW BX, 2(AX) SARL $0x10, SI MOVB SI, 4(AX) ADDQ $0x05, AX JMP repeat_end_emit_encodeBlockAsm4MB repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b: LEAL -256(BX), BX MOVW $0x0019, (AX) MOVW BX, 2(AX) ADDQ $0x04, AX JMP repeat_end_emit_encodeBlockAsm4MB repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b: LEAL -4(BX), BX MOVW $0x0015, (AX) MOVB BL, 2(AX) ADDQ $0x03, AX JMP repeat_end_emit_encodeBlockAsm4MB 
repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b: SHLL $0x02, BX ORL $0x01, BX MOVW BX, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeBlockAsm4MB repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short_2b: XORQ DI, DI LEAL 1(DI)(BX*4), BX MOVB SI, 1(AX) SARL $0x08, SI SHLL $0x05, SI ORL SI, BX MOVB BL, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeBlockAsm4MB long_offset_short_repeat_as_copy_encodeBlockAsm4MB: MOVB $0xee, (AX) MOVW SI, 1(AX) LEAL -60(BX), BX ADDQ $0x03, AX // emitRepeat MOVL BX, DI LEAL -4(BX), BX CMPL DI, $0x08 JLE repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short CMPL DI, $0x0c JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short CMPL SI, $0x00000800 JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short: CMPL BX, $0x00000104 JLT repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short CMPL BX, $0x00010100 JLT repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short LEAL -65536(BX), BX MOVL BX, SI MOVW $0x001d, (AX) MOVW BX, 2(AX) SARL $0x10, SI MOVB SI, 4(AX) ADDQ $0x05, AX JMP repeat_end_emit_encodeBlockAsm4MB repeat_four_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short: LEAL -256(BX), BX MOVW $0x0019, (AX) MOVW BX, 2(AX) ADDQ $0x04, AX JMP repeat_end_emit_encodeBlockAsm4MB repeat_three_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short: LEAL -4(BX), BX MOVW $0x0015, (AX) MOVB BL, 2(AX) ADDQ $0x03, AX JMP repeat_end_emit_encodeBlockAsm4MB repeat_two_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short: SHLL $0x02, BX ORL $0x01, BX MOVW BX, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeBlockAsm4MB repeat_two_offset_repeat_as_copy_encodeBlockAsm4MB_emit_copy_short: XORQ DI, DI LEAL 1(DI)(BX*4), BX MOVB SI, 1(AX) SARL $0x08, SI SHLL $0x05, SI ORL SI, BX MOVB BL, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeBlockAsm4MB two_byte_offset_short_repeat_as_copy_encodeBlockAsm4MB: MOVL BX, DI SHLL $0x02, DI 
CMPL BX, $0x0c JGE emit_copy_three_repeat_as_copy_encodeBlockAsm4MB CMPL SI, $0x00000800 JGE emit_copy_three_repeat_as_copy_encodeBlockAsm4MB LEAL -15(DI), DI MOVB SI, 1(AX) SHRL $0x08, SI SHLL $0x05, SI ORL SI, DI MOVB DI, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeBlockAsm4MB emit_copy_three_repeat_as_copy_encodeBlockAsm4MB: LEAL -2(DI), DI MOVB DI, (AX) MOVW SI, 1(AX) ADDQ $0x03, AX repeat_end_emit_encodeBlockAsm4MB: MOVL CX, 12(SP) JMP search_loop_encodeBlockAsm4MB no_repeat_found_encodeBlockAsm4MB: CMPL (DX)(BX*1), SI JEQ candidate_match_encodeBlockAsm4MB SHRQ $0x08, SI MOVL 24(SP)(R9*4), BX LEAL 2(CX), R8 CMPL (DX)(DI*1), SI JEQ candidate2_match_encodeBlockAsm4MB MOVL R8, 24(SP)(R9*4) SHRQ $0x08, SI CMPL (DX)(BX*1), SI JEQ candidate3_match_encodeBlockAsm4MB MOVL 20(SP), CX JMP search_loop_encodeBlockAsm4MB candidate3_match_encodeBlockAsm4MB: ADDL $0x02, CX JMP candidate_match_encodeBlockAsm4MB candidate2_match_encodeBlockAsm4MB: MOVL R8, 24(SP)(R9*4) INCL CX MOVL DI, BX candidate_match_encodeBlockAsm4MB: MOVL 12(SP), SI TESTL BX, BX JZ match_extend_back_end_encodeBlockAsm4MB match_extend_back_loop_encodeBlockAsm4MB: CMPL CX, SI JLE match_extend_back_end_encodeBlockAsm4MB MOVB -1(DX)(BX*1), DI MOVB -1(DX)(CX*1), R8 CMPB DI, R8 JNE match_extend_back_end_encodeBlockAsm4MB LEAL -1(CX), CX DECL BX JZ match_extend_back_end_encodeBlockAsm4MB JMP match_extend_back_loop_encodeBlockAsm4MB match_extend_back_end_encodeBlockAsm4MB: MOVL CX, SI SUBL 12(SP), SI LEAQ 4(AX)(SI*1), SI CMPQ SI, (SP) JL match_dst_size_check_encodeBlockAsm4MB MOVQ $0x00000000, ret+48(FP) RET match_dst_size_check_encodeBlockAsm4MB: MOVL CX, SI MOVL 12(SP), DI CMPL DI, SI JEQ emit_literal_done_match_emit_encodeBlockAsm4MB MOVL SI, R8 MOVL SI, 12(SP) LEAQ (DX)(DI*1), SI SUBL DI, R8 LEAL -1(R8), DI CMPL DI, $0x3c JLT one_byte_match_emit_encodeBlockAsm4MB CMPL DI, $0x00000100 JLT two_bytes_match_emit_encodeBlockAsm4MB CMPL DI, $0x00010000 JLT three_bytes_match_emit_encodeBlockAsm4MB MOVL DI, R9 
SHRL $0x10, R9 MOVB $0xf8, (AX) MOVW DI, 1(AX) MOVB R9, 3(AX) ADDQ $0x04, AX JMP memmove_long_match_emit_encodeBlockAsm4MB three_bytes_match_emit_encodeBlockAsm4MB: MOVB $0xf4, (AX) MOVW DI, 1(AX) ADDQ $0x03, AX JMP memmove_long_match_emit_encodeBlockAsm4MB two_bytes_match_emit_encodeBlockAsm4MB: MOVB $0xf0, (AX) MOVB DI, 1(AX) ADDQ $0x02, AX CMPL DI, $0x40 JL memmove_match_emit_encodeBlockAsm4MB JMP memmove_long_match_emit_encodeBlockAsm4MB one_byte_match_emit_encodeBlockAsm4MB: SHLB $0x02, DI MOVB DI, (AX) ADDQ $0x01, AX memmove_match_emit_encodeBlockAsm4MB: LEAQ (AX)(R8*1), DI // genMemMoveShort CMPQ R8, $0x08 JLE emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_8 CMPQ R8, $0x10 JBE emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_8through16 CMPQ R8, $0x20 JBE emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_17through32 JMP emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_33through64 emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_8: MOVQ (SI), R9 MOVQ R9, (AX) JMP memmove_end_copy_match_emit_encodeBlockAsm4MB emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_8through16: MOVQ (SI), R9 MOVQ -8(SI)(R8*1), SI MOVQ R9, (AX) MOVQ SI, -8(AX)(R8*1) JMP memmove_end_copy_match_emit_encodeBlockAsm4MB emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_17through32: MOVOU (SI), X0 MOVOU -16(SI)(R8*1), X1 MOVOU X0, (AX) MOVOU X1, -16(AX)(R8*1) JMP memmove_end_copy_match_emit_encodeBlockAsm4MB emit_lit_memmove_match_emit_encodeBlockAsm4MB_memmove_move_33through64: MOVOU (SI), X0 MOVOU 16(SI), X1 MOVOU -32(SI)(R8*1), X2 MOVOU -16(SI)(R8*1), X3 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(R8*1) MOVOU X3, -16(AX)(R8*1) memmove_end_copy_match_emit_encodeBlockAsm4MB: MOVQ DI, AX JMP emit_literal_done_match_emit_encodeBlockAsm4MB memmove_long_match_emit_encodeBlockAsm4MB: LEAQ (AX)(R8*1), DI // genMemMoveLong MOVOU (SI), X0 MOVOU 16(SI), X1 MOVOU -32(SI)(R8*1), X2 MOVOU -16(SI)(R8*1), X3 MOVQ R8, R10 SHRQ 
$0x05, R10 MOVQ AX, R9 ANDL $0x0000001f, R9 MOVQ $0x00000040, R11 SUBQ R9, R11 DECQ R10 JA emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32 LEAQ -32(SI)(R11*1), R9 LEAQ -32(AX)(R11*1), R12 emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_big_loop_back: MOVOU (R9), X4 MOVOU 16(R9), X5 MOVOA X4, (R12) MOVOA X5, 16(R12) ADDQ $0x20, R12 ADDQ $0x20, R9 ADDQ $0x20, R11 DECQ R10 JNA emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_big_loop_back emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32: MOVOU -32(SI)(R11*1), X4 MOVOU -16(SI)(R11*1), X5 MOVOA X4, -32(AX)(R11*1) MOVOA X5, -16(AX)(R11*1) ADDQ $0x20, R11 CMPQ R8, R11 JAE emit_lit_memmove_long_match_emit_encodeBlockAsm4MBlarge_forward_sse_loop_32 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(R8*1) MOVOU X3, -16(AX)(R8*1) MOVQ DI, AX emit_literal_done_match_emit_encodeBlockAsm4MB: match_nolit_loop_encodeBlockAsm4MB: MOVL CX, SI SUBL BX, SI MOVL SI, 16(SP) ADDL $0x04, CX ADDL $0x04, BX MOVQ src_len+32(FP), SI SUBL CX, SI LEAQ (DX)(CX*1), DI LEAQ (DX)(BX*1), BX // matchLen XORL R9, R9 CMPL SI, $0x08 JL matchlen_match4_match_nolit_encodeBlockAsm4MB matchlen_loopback_match_nolit_encodeBlockAsm4MB: MOVQ (DI)(R9*1), R8 XORQ (BX)(R9*1), R8 TESTQ R8, R8 JZ matchlen_loop_match_nolit_encodeBlockAsm4MB #ifdef GOAMD64_v3 TZCNTQ R8, R8 #else BSFQ R8, R8 #endif SARQ $0x03, R8 LEAL (R9)(R8*1), R9 JMP match_nolit_end_encodeBlockAsm4MB matchlen_loop_match_nolit_encodeBlockAsm4MB: LEAL -8(SI), SI LEAL 8(R9), R9 CMPL SI, $0x08 JGE matchlen_loopback_match_nolit_encodeBlockAsm4MB JZ match_nolit_end_encodeBlockAsm4MB matchlen_match4_match_nolit_encodeBlockAsm4MB: CMPL SI, $0x04 JL matchlen_match2_match_nolit_encodeBlockAsm4MB MOVL (DI)(R9*1), R8 CMPL (BX)(R9*1), R8 JNE matchlen_match2_match_nolit_encodeBlockAsm4MB SUBL $0x04, SI LEAL 4(R9), R9 matchlen_match2_match_nolit_encodeBlockAsm4MB: CMPL SI, $0x02 JL matchlen_match1_match_nolit_encodeBlockAsm4MB MOVW (DI)(R9*1), R8 
CMPW (BX)(R9*1), R8 JNE matchlen_match1_match_nolit_encodeBlockAsm4MB SUBL $0x02, SI LEAL 2(R9), R9 matchlen_match1_match_nolit_encodeBlockAsm4MB: CMPL SI, $0x01 JL match_nolit_end_encodeBlockAsm4MB MOVB (DI)(R9*1), R8 CMPB (BX)(R9*1), R8 JNE match_nolit_end_encodeBlockAsm4MB LEAL 1(R9), R9 match_nolit_end_encodeBlockAsm4MB: ADDL R9, CX MOVL 16(SP), BX ADDL $0x04, R9 MOVL CX, 12(SP) // emitCopy CMPL BX, $0x00010000 JL two_byte_offset_match_nolit_encodeBlockAsm4MB CMPL R9, $0x40 JLE four_bytes_remain_match_nolit_encodeBlockAsm4MB MOVB $0xff, (AX) MOVL BX, 1(AX) LEAL -64(R9), R9 ADDQ $0x05, AX CMPL R9, $0x04 JL four_bytes_remain_match_nolit_encodeBlockAsm4MB // emitRepeat MOVL R9, SI LEAL -4(R9), R9 CMPL SI, $0x08 JLE repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy CMPL SI, $0x0c JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy CMPL BX, $0x00000800 JLT repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy: CMPL R9, $0x00000104 JLT repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy CMPL R9, $0x00010100 JLT repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy LEAL -65536(R9), R9 MOVL R9, BX MOVW $0x001d, (AX) MOVW R9, 2(AX) SARL $0x10, BX MOVB BL, 4(AX) ADDQ $0x05, AX JMP match_nolit_emitcopy_end_encodeBlockAsm4MB repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy: LEAL -256(R9), R9 MOVW $0x0019, (AX) MOVW R9, 2(AX) ADDQ $0x04, AX JMP match_nolit_emitcopy_end_encodeBlockAsm4MB repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy: LEAL -4(R9), R9 MOVW $0x0015, (AX) MOVB R9, 2(AX) ADDQ $0x03, AX JMP match_nolit_emitcopy_end_encodeBlockAsm4MB repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy: SHLL $0x02, R9 ORL $0x01, R9 MOVW R9, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBlockAsm4MB repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy: XORQ SI, SI LEAL 1(SI)(R9*4), R9 MOVB BL, 1(AX) SARL $0x08, BX SHLL $0x05, BX ORL BX, R9 MOVB R9, (AX) ADDQ $0x02, AX JMP 
match_nolit_emitcopy_end_encodeBlockAsm4MB four_bytes_remain_match_nolit_encodeBlockAsm4MB: TESTL R9, R9 JZ match_nolit_emitcopy_end_encodeBlockAsm4MB XORL SI, SI LEAL -1(SI)(R9*4), R9 MOVB R9, (AX) MOVL BX, 1(AX) ADDQ $0x05, AX JMP match_nolit_emitcopy_end_encodeBlockAsm4MB two_byte_offset_match_nolit_encodeBlockAsm4MB: CMPL R9, $0x40 JLE two_byte_offset_short_match_nolit_encodeBlockAsm4MB CMPL BX, $0x00000800 JAE long_offset_short_match_nolit_encodeBlockAsm4MB MOVL $0x00000001, SI LEAL 16(SI), SI MOVB BL, 1(AX) SHRL $0x08, BX SHLL $0x05, BX ORL BX, SI MOVB SI, (AX) ADDQ $0x02, AX SUBL $0x08, R9 // emitRepeat LEAL -4(R9), R9 JMP cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b MOVL R9, SI LEAL -4(R9), R9 CMPL SI, $0x08 JLE repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b CMPL SI, $0x0c JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b CMPL BX, $0x00000800 JLT repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b: CMPL R9, $0x00000104 JLT repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b CMPL R9, $0x00010100 JLT repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b LEAL -65536(R9), R9 MOVL R9, BX MOVW $0x001d, (AX) MOVW R9, 2(AX) SARL $0x10, BX MOVB BL, 4(AX) ADDQ $0x05, AX JMP match_nolit_emitcopy_end_encodeBlockAsm4MB repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b: LEAL -256(R9), R9 MOVW $0x0019, (AX) MOVW R9, 2(AX) ADDQ $0x04, AX JMP match_nolit_emitcopy_end_encodeBlockAsm4MB repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b: LEAL -4(R9), R9 MOVW $0x0015, (AX) MOVB R9, 2(AX) ADDQ $0x03, AX JMP match_nolit_emitcopy_end_encodeBlockAsm4MB repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b: SHLL $0x02, R9 ORL $0x01, R9 MOVW R9, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBlockAsm4MB repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short_2b: XORQ SI, 
SI LEAL 1(SI)(R9*4), R9 MOVB BL, 1(AX) SARL $0x08, BX SHLL $0x05, BX ORL BX, R9 MOVB R9, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBlockAsm4MB long_offset_short_match_nolit_encodeBlockAsm4MB: MOVB $0xee, (AX) MOVW BX, 1(AX) LEAL -60(R9), R9 ADDQ $0x03, AX // emitRepeat MOVL R9, SI LEAL -4(R9), R9 CMPL SI, $0x08 JLE repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy_short CMPL SI, $0x0c JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short CMPL BX, $0x00000800 JLT repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short cant_repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short: CMPL R9, $0x00000104 JLT repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy_short CMPL R9, $0x00010100 JLT repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy_short LEAL -65536(R9), R9 MOVL R9, BX MOVW $0x001d, (AX) MOVW R9, 2(AX) SARL $0x10, BX MOVB BL, 4(AX) ADDQ $0x05, AX JMP match_nolit_emitcopy_end_encodeBlockAsm4MB repeat_four_match_nolit_encodeBlockAsm4MB_emit_copy_short: LEAL -256(R9), R9 MOVW $0x0019, (AX) MOVW R9, 2(AX) ADDQ $0x04, AX JMP match_nolit_emitcopy_end_encodeBlockAsm4MB repeat_three_match_nolit_encodeBlockAsm4MB_emit_copy_short: LEAL -4(R9), R9 MOVW $0x0015, (AX) MOVB R9, 2(AX) ADDQ $0x03, AX JMP match_nolit_emitcopy_end_encodeBlockAsm4MB repeat_two_match_nolit_encodeBlockAsm4MB_emit_copy_short: SHLL $0x02, R9 ORL $0x01, R9 MOVW R9, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBlockAsm4MB repeat_two_offset_match_nolit_encodeBlockAsm4MB_emit_copy_short: XORQ SI, SI LEAL 1(SI)(R9*4), R9 MOVB BL, 1(AX) SARL $0x08, BX SHLL $0x05, BX ORL BX, R9 MOVB R9, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBlockAsm4MB two_byte_offset_short_match_nolit_encodeBlockAsm4MB: MOVL R9, SI SHLL $0x02, SI CMPL R9, $0x0c JGE emit_copy_three_match_nolit_encodeBlockAsm4MB CMPL BX, $0x00000800 JGE emit_copy_three_match_nolit_encodeBlockAsm4MB LEAL -15(SI), SI MOVB BL, 1(AX) SHRL $0x08, BX SHLL $0x05, BX ORL BX, SI MOVB SI, (AX) 
ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBlockAsm4MB emit_copy_three_match_nolit_encodeBlockAsm4MB: LEAL -2(SI), SI MOVB SI, (AX) MOVW BX, 1(AX) ADDQ $0x03, AX match_nolit_emitcopy_end_encodeBlockAsm4MB: CMPL CX, 8(SP) JGE emit_remainder_encodeBlockAsm4MB MOVQ -2(DX)(CX*1), SI CMPQ AX, (SP) JL match_nolit_dst_ok_encodeBlockAsm4MB MOVQ $0x00000000, ret+48(FP) RET match_nolit_dst_ok_encodeBlockAsm4MB: MOVQ $0x0000cf1bbcdcbf9b, R8 MOVQ SI, DI SHRQ $0x10, SI MOVQ SI, BX SHLQ $0x10, DI IMULQ R8, DI SHRQ $0x32, DI SHLQ $0x10, BX IMULQ R8, BX SHRQ $0x32, BX LEAL -2(CX), R8 LEAQ 24(SP)(BX*4), R9 MOVL (R9), BX MOVL R8, 24(SP)(DI*4) MOVL CX, (R9) CMPL (DX)(BX*1), SI JEQ match_nolit_loop_encodeBlockAsm4MB INCL CX JMP search_loop_encodeBlockAsm4MB emit_remainder_encodeBlockAsm4MB: MOVQ src_len+32(FP), CX SUBL 12(SP), CX LEAQ 4(AX)(CX*1), CX CMPQ CX, (SP) JL emit_remainder_ok_encodeBlockAsm4MB MOVQ $0x00000000, ret+48(FP) RET emit_remainder_ok_encodeBlockAsm4MB: MOVQ src_len+32(FP), CX MOVL 12(SP), BX CMPL BX, CX JEQ emit_literal_done_emit_remainder_encodeBlockAsm4MB MOVL CX, SI MOVL CX, 12(SP) LEAQ (DX)(BX*1), CX SUBL BX, SI LEAL -1(SI), DX CMPL DX, $0x3c JLT one_byte_emit_remainder_encodeBlockAsm4MB CMPL DX, $0x00000100 JLT two_bytes_emit_remainder_encodeBlockAsm4MB CMPL DX, $0x00010000 JLT three_bytes_emit_remainder_encodeBlockAsm4MB MOVL DX, BX SHRL $0x10, BX MOVB $0xf8, (AX) MOVW DX, 1(AX) MOVB BL, 3(AX) ADDQ $0x04, AX JMP memmove_long_emit_remainder_encodeBlockAsm4MB three_bytes_emit_remainder_encodeBlockAsm4MB: MOVB $0xf4, (AX) MOVW DX, 1(AX) ADDQ $0x03, AX JMP memmove_long_emit_remainder_encodeBlockAsm4MB two_bytes_emit_remainder_encodeBlockAsm4MB: MOVB $0xf0, (AX) MOVB DL, 1(AX) ADDQ $0x02, AX CMPL DX, $0x40 JL memmove_emit_remainder_encodeBlockAsm4MB JMP memmove_long_emit_remainder_encodeBlockAsm4MB one_byte_emit_remainder_encodeBlockAsm4MB: SHLB $0x02, DL MOVB DL, (AX) ADDQ $0x01, AX memmove_emit_remainder_encodeBlockAsm4MB: LEAQ (AX)(SI*1), DX MOVL SI, BX 
// genMemMoveShort CMPQ BX, $0x03 JB emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_1or2 JE emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_3 CMPQ BX, $0x08 JB emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_4through7 CMPQ BX, $0x10 JBE emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_8through16 CMPQ BX, $0x20 JBE emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_17through32 JMP emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_33through64 emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_1or2: MOVB (CX), SI MOVB -1(CX)(BX*1), CL MOVB SI, (AX) MOVB CL, -1(AX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeBlockAsm4MB emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_3: MOVW (CX), SI MOVB 2(CX), CL MOVW SI, (AX) MOVB CL, 2(AX) JMP memmove_end_copy_emit_remainder_encodeBlockAsm4MB emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_4through7: MOVL (CX), SI MOVL -4(CX)(BX*1), CX MOVL SI, (AX) MOVL CX, -4(AX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeBlockAsm4MB emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_8through16: MOVQ (CX), SI MOVQ -8(CX)(BX*1), CX MOVQ SI, (AX) MOVQ CX, -8(AX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeBlockAsm4MB emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_17through32: MOVOU (CX), X0 MOVOU -16(CX)(BX*1), X1 MOVOU X0, (AX) MOVOU X1, -16(AX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeBlockAsm4MB emit_lit_memmove_emit_remainder_encodeBlockAsm4MB_memmove_move_33through64: MOVOU (CX), X0 MOVOU 16(CX), X1 MOVOU -32(CX)(BX*1), X2 MOVOU -16(CX)(BX*1), X3 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(BX*1) MOVOU X3, -16(AX)(BX*1) memmove_end_copy_emit_remainder_encodeBlockAsm4MB: MOVQ DX, AX JMP emit_literal_done_emit_remainder_encodeBlockAsm4MB memmove_long_emit_remainder_encodeBlockAsm4MB: LEAQ (AX)(SI*1), DX MOVL SI, BX // genMemMoveLong MOVOU (CX), X0 MOVOU 16(CX), X1 
MOVOU -32(CX)(BX*1), X2 MOVOU -16(CX)(BX*1), X3 MOVQ BX, DI SHRQ $0x05, DI MOVQ AX, SI ANDL $0x0000001f, SI MOVQ $0x00000040, R8 SUBQ SI, R8 DECQ DI JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_forward_sse_loop_32 LEAQ -32(CX)(R8*1), SI LEAQ -32(AX)(R8*1), R9 emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_big_loop_back: MOVOU (SI), X4 MOVOU 16(SI), X5 MOVOA X4, (R9) MOVOA X5, 16(R9) ADDQ $0x20, R9 ADDQ $0x20, SI ADDQ $0x20, R8 DECQ DI JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_big_loop_back emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_forward_sse_loop_32: MOVOU -32(CX)(R8*1), X4 MOVOU -16(CX)(R8*1), X5 MOVOA X4, -32(AX)(R8*1) MOVOA X5, -16(AX)(R8*1) ADDQ $0x20, R8 CMPQ BX, R8 JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsm4MBlarge_forward_sse_loop_32 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(BX*1) MOVOU X3, -16(AX)(BX*1) MOVQ DX, AX emit_literal_done_emit_remainder_encodeBlockAsm4MB: MOVQ dst_base+0(FP), CX SUBQ CX, AX MOVQ AX, ret+48(FP) RET // func encodeBlockAsm12B(dst []byte, src []byte) int // Requires: BMI, SSE2 TEXT ·encodeBlockAsm12B(SB), $16408-56 MOVQ dst_base+0(FP), AX MOVQ $0x00000080, CX LEAQ 24(SP), DX PXOR X0, X0 zero_loop_encodeBlockAsm12B: MOVOU X0, (DX) MOVOU X0, 16(DX) MOVOU X0, 32(DX) MOVOU X0, 48(DX) MOVOU X0, 64(DX) MOVOU X0, 80(DX) MOVOU X0, 96(DX) MOVOU X0, 112(DX) ADDQ $0x80, DX DECQ CX JNZ zero_loop_encodeBlockAsm12B MOVL $0x00000000, 12(SP) MOVQ src_len+32(FP), CX LEAQ -9(CX), DX LEAQ -8(CX), BX MOVL BX, 8(SP) SHRQ $0x05, CX SUBL CX, DX LEAQ (AX)(DX*1), DX MOVQ DX, (SP) MOVL $0x00000001, CX MOVL CX, 16(SP) MOVQ src_base+24(FP), DX search_loop_encodeBlockAsm12B: MOVL CX, BX SUBL 12(SP), BX SHRL $0x05, BX LEAL 4(CX)(BX*1), BX CMPL BX, 8(SP) JGE emit_remainder_encodeBlockAsm12B MOVQ (DX)(CX*1), SI MOVL BX, 20(SP) MOVQ $0x000000cf1bbcdcbb, R8 MOVQ SI, R9 MOVQ SI, R10 SHRQ $0x08, R10 SHLQ $0x18, R9 IMULQ R8, R9 SHRQ $0x34, R9 SHLQ $0x18, R10 IMULQ R8, R10 SHRQ 
$0x34, R10 MOVL 24(SP)(R9*4), BX MOVL 24(SP)(R10*4), DI MOVL CX, 24(SP)(R9*4) LEAL 1(CX), R9 MOVL R9, 24(SP)(R10*4) MOVQ SI, R9 SHRQ $0x10, R9 SHLQ $0x18, R9 IMULQ R8, R9 SHRQ $0x34, R9 MOVL CX, R8 SUBL 16(SP), R8 MOVL 1(DX)(R8*1), R10 MOVQ SI, R8 SHRQ $0x08, R8 CMPL R8, R10 JNE no_repeat_found_encodeBlockAsm12B LEAL 1(CX), SI MOVL 12(SP), DI MOVL SI, BX SUBL 16(SP), BX JZ repeat_extend_back_end_encodeBlockAsm12B repeat_extend_back_loop_encodeBlockAsm12B: CMPL SI, DI JLE repeat_extend_back_end_encodeBlockAsm12B MOVB -1(DX)(BX*1), R8 MOVB -1(DX)(SI*1), R9 CMPB R8, R9 JNE repeat_extend_back_end_encodeBlockAsm12B LEAL -1(SI), SI DECL BX JNZ repeat_extend_back_loop_encodeBlockAsm12B repeat_extend_back_end_encodeBlockAsm12B: MOVL 12(SP), BX CMPL BX, SI JEQ emit_literal_done_repeat_emit_encodeBlockAsm12B MOVL SI, R8 MOVL SI, 12(SP) LEAQ (DX)(BX*1), R9 SUBL BX, R8 LEAL -1(R8), BX CMPL BX, $0x3c JLT one_byte_repeat_emit_encodeBlockAsm12B CMPL BX, $0x00000100 JLT two_bytes_repeat_emit_encodeBlockAsm12B MOVB $0xf4, (AX) MOVW BX, 1(AX) ADDQ $0x03, AX JMP memmove_long_repeat_emit_encodeBlockAsm12B two_bytes_repeat_emit_encodeBlockAsm12B: MOVB $0xf0, (AX) MOVB BL, 1(AX) ADDQ $0x02, AX CMPL BX, $0x40 JL memmove_repeat_emit_encodeBlockAsm12B JMP memmove_long_repeat_emit_encodeBlockAsm12B one_byte_repeat_emit_encodeBlockAsm12B: SHLB $0x02, BL MOVB BL, (AX) ADDQ $0x01, AX memmove_repeat_emit_encodeBlockAsm12B: LEAQ (AX)(R8*1), BX // genMemMoveShort CMPQ R8, $0x08 JLE emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8 CMPQ R8, $0x10 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8through16 CMPQ R8, $0x20 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_17through32 JMP emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_33through64 emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8: MOVQ (R9), R10 MOVQ R10, (AX) JMP memmove_end_copy_repeat_emit_encodeBlockAsm12B 
emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_8through16: MOVQ (R9), R10 MOVQ -8(R9)(R8*1), R9 MOVQ R10, (AX) MOVQ R9, -8(AX)(R8*1) JMP memmove_end_copy_repeat_emit_encodeBlockAsm12B emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_17through32: MOVOU (R9), X0 MOVOU -16(R9)(R8*1), X1 MOVOU X0, (AX) MOVOU X1, -16(AX)(R8*1) JMP memmove_end_copy_repeat_emit_encodeBlockAsm12B emit_lit_memmove_repeat_emit_encodeBlockAsm12B_memmove_move_33through64: MOVOU (R9), X0 MOVOU 16(R9), X1 MOVOU -32(R9)(R8*1), X2 MOVOU -16(R9)(R8*1), X3 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(R8*1) MOVOU X3, -16(AX)(R8*1) memmove_end_copy_repeat_emit_encodeBlockAsm12B: MOVQ BX, AX JMP emit_literal_done_repeat_emit_encodeBlockAsm12B memmove_long_repeat_emit_encodeBlockAsm12B: LEAQ (AX)(R8*1), BX // genMemMoveLong MOVOU (R9), X0 MOVOU 16(R9), X1 MOVOU -32(R9)(R8*1), X2 MOVOU -16(R9)(R8*1), X3 MOVQ R8, R11 SHRQ $0x05, R11 MOVQ AX, R10 ANDL $0x0000001f, R10 MOVQ $0x00000040, R12 SUBQ R10, R12 DECQ R11 JA emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_forward_sse_loop_32 LEAQ -32(R9)(R12*1), R10 LEAQ -32(AX)(R12*1), R13 emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_big_loop_back: MOVOU (R10), X4 MOVOU 16(R10), X5 MOVOA X4, (R13) MOVOA X5, 16(R13) ADDQ $0x20, R13 ADDQ $0x20, R10 ADDQ $0x20, R12 DECQ R11 JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_big_loop_back emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_forward_sse_loop_32: MOVOU -32(R9)(R12*1), X4 MOVOU -16(R9)(R12*1), X5 MOVOA X4, -32(AX)(R12*1) MOVOA X5, -16(AX)(R12*1) ADDQ $0x20, R12 CMPQ R8, R12 JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsm12Blarge_forward_sse_loop_32 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(R8*1) MOVOU X3, -16(AX)(R8*1) MOVQ BX, AX emit_literal_done_repeat_emit_encodeBlockAsm12B: ADDL $0x05, CX MOVL CX, BX SUBL 16(SP), BX MOVQ src_len+32(FP), R8 SUBL CX, R8 LEAQ (DX)(CX*1), R9 LEAQ (DX)(BX*1), BX // matchLen XORL R11, R11 CMPL R8, 
$0x08 JL matchlen_match4_repeat_extend_encodeBlockAsm12B matchlen_loopback_repeat_extend_encodeBlockAsm12B: MOVQ (R9)(R11*1), R10 XORQ (BX)(R11*1), R10 TESTQ R10, R10 JZ matchlen_loop_repeat_extend_encodeBlockAsm12B #ifdef GOAMD64_v3 TZCNTQ R10, R10 #else BSFQ R10, R10 #endif SARQ $0x03, R10 LEAL (R11)(R10*1), R11 JMP repeat_extend_forward_end_encodeBlockAsm12B matchlen_loop_repeat_extend_encodeBlockAsm12B: LEAL -8(R8), R8 LEAL 8(R11), R11 CMPL R8, $0x08 JGE matchlen_loopback_repeat_extend_encodeBlockAsm12B JZ repeat_extend_forward_end_encodeBlockAsm12B matchlen_match4_repeat_extend_encodeBlockAsm12B: CMPL R8, $0x04 JL matchlen_match2_repeat_extend_encodeBlockAsm12B MOVL (R9)(R11*1), R10 CMPL (BX)(R11*1), R10 JNE matchlen_match2_repeat_extend_encodeBlockAsm12B SUBL $0x04, R8 LEAL 4(R11), R11 matchlen_match2_repeat_extend_encodeBlockAsm12B: CMPL R8, $0x02 JL matchlen_match1_repeat_extend_encodeBlockAsm12B MOVW (R9)(R11*1), R10 CMPW (BX)(R11*1), R10 JNE matchlen_match1_repeat_extend_encodeBlockAsm12B SUBL $0x02, R8 LEAL 2(R11), R11 matchlen_match1_repeat_extend_encodeBlockAsm12B: CMPL R8, $0x01 JL repeat_extend_forward_end_encodeBlockAsm12B MOVB (R9)(R11*1), R10 CMPB (BX)(R11*1), R10 JNE repeat_extend_forward_end_encodeBlockAsm12B LEAL 1(R11), R11 repeat_extend_forward_end_encodeBlockAsm12B: ADDL R11, CX MOVL CX, BX SUBL SI, BX MOVL 16(SP), SI TESTL DI, DI JZ repeat_as_copy_encodeBlockAsm12B // emitRepeat MOVL BX, DI LEAL -4(BX), BX CMPL DI, $0x08 JLE repeat_two_match_repeat_encodeBlockAsm12B CMPL DI, $0x0c JGE cant_repeat_two_offset_match_repeat_encodeBlockAsm12B CMPL SI, $0x00000800 JLT repeat_two_offset_match_repeat_encodeBlockAsm12B cant_repeat_two_offset_match_repeat_encodeBlockAsm12B: CMPL BX, $0x00000104 JLT repeat_three_match_repeat_encodeBlockAsm12B LEAL -256(BX), BX MOVW $0x0019, (AX) MOVW BX, 2(AX) ADDQ $0x04, AX JMP repeat_end_emit_encodeBlockAsm12B repeat_three_match_repeat_encodeBlockAsm12B: LEAL -4(BX), BX MOVW $0x0015, (AX) MOVB BL, 2(AX) ADDQ $0x03, 
AX JMP repeat_end_emit_encodeBlockAsm12B repeat_two_match_repeat_encodeBlockAsm12B: SHLL $0x02, BX ORL $0x01, BX MOVW BX, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeBlockAsm12B repeat_two_offset_match_repeat_encodeBlockAsm12B: XORQ DI, DI LEAL 1(DI)(BX*4), BX MOVB SI, 1(AX) SARL $0x08, SI SHLL $0x05, SI ORL SI, BX MOVB BL, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeBlockAsm12B repeat_as_copy_encodeBlockAsm12B: // emitCopy CMPL BX, $0x40 JLE two_byte_offset_short_repeat_as_copy_encodeBlockAsm12B CMPL SI, $0x00000800 JAE long_offset_short_repeat_as_copy_encodeBlockAsm12B MOVL $0x00000001, DI LEAL 16(DI), DI MOVB SI, 1(AX) SHRL $0x08, SI SHLL $0x05, SI ORL SI, DI MOVB DI, (AX) ADDQ $0x02, AX SUBL $0x08, BX // emitRepeat LEAL -4(BX), BX JMP cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b MOVL BX, DI LEAL -4(BX), BX CMPL DI, $0x08 JLE repeat_two_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b CMPL DI, $0x0c JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b CMPL SI, $0x00000800 JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b: CMPL BX, $0x00000104 JLT repeat_three_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b LEAL -256(BX), BX MOVW $0x0019, (AX) MOVW BX, 2(AX) ADDQ $0x04, AX JMP repeat_end_emit_encodeBlockAsm12B repeat_three_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b: LEAL -4(BX), BX MOVW $0x0015, (AX) MOVB BL, 2(AX) ADDQ $0x03, AX JMP repeat_end_emit_encodeBlockAsm12B repeat_two_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b: SHLL $0x02, BX ORL $0x01, BX MOVW BX, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeBlockAsm12B repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short_2b: XORQ DI, DI LEAL 1(DI)(BX*4), BX MOVB SI, 1(AX) SARL $0x08, SI SHLL $0x05, SI ORL SI, BX MOVB BL, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeBlockAsm12B 
long_offset_short_repeat_as_copy_encodeBlockAsm12B: MOVB $0xee, (AX) MOVW SI, 1(AX) LEAL -60(BX), BX ADDQ $0x03, AX // emitRepeat MOVL BX, DI LEAL -4(BX), BX CMPL DI, $0x08 JLE repeat_two_repeat_as_copy_encodeBlockAsm12B_emit_copy_short CMPL DI, $0x0c JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short CMPL SI, $0x00000800 JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short: CMPL BX, $0x00000104 JLT repeat_three_repeat_as_copy_encodeBlockAsm12B_emit_copy_short LEAL -256(BX), BX MOVW $0x0019, (AX) MOVW BX, 2(AX) ADDQ $0x04, AX JMP repeat_end_emit_encodeBlockAsm12B repeat_three_repeat_as_copy_encodeBlockAsm12B_emit_copy_short: LEAL -4(BX), BX MOVW $0x0015, (AX) MOVB BL, 2(AX) ADDQ $0x03, AX JMP repeat_end_emit_encodeBlockAsm12B repeat_two_repeat_as_copy_encodeBlockAsm12B_emit_copy_short: SHLL $0x02, BX ORL $0x01, BX MOVW BX, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeBlockAsm12B repeat_two_offset_repeat_as_copy_encodeBlockAsm12B_emit_copy_short: XORQ DI, DI LEAL 1(DI)(BX*4), BX MOVB SI, 1(AX) SARL $0x08, SI SHLL $0x05, SI ORL SI, BX MOVB BL, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeBlockAsm12B two_byte_offset_short_repeat_as_copy_encodeBlockAsm12B: MOVL BX, DI SHLL $0x02, DI CMPL BX, $0x0c JGE emit_copy_three_repeat_as_copy_encodeBlockAsm12B CMPL SI, $0x00000800 JGE emit_copy_three_repeat_as_copy_encodeBlockAsm12B LEAL -15(DI), DI MOVB SI, 1(AX) SHRL $0x08, SI SHLL $0x05, SI ORL SI, DI MOVB DI, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeBlockAsm12B emit_copy_three_repeat_as_copy_encodeBlockAsm12B: LEAL -2(DI), DI MOVB DI, (AX) MOVW SI, 1(AX) ADDQ $0x03, AX repeat_end_emit_encodeBlockAsm12B: MOVL CX, 12(SP) JMP search_loop_encodeBlockAsm12B no_repeat_found_encodeBlockAsm12B: CMPL (DX)(BX*1), SI JEQ candidate_match_encodeBlockAsm12B SHRQ $0x08, SI MOVL 24(SP)(R9*4), BX LEAL 2(CX), R8 CMPL (DX)(DI*1), SI JEQ candidate2_match_encodeBlockAsm12B 
MOVL R8, 24(SP)(R9*4) SHRQ $0x08, SI CMPL (DX)(BX*1), SI JEQ candidate3_match_encodeBlockAsm12B MOVL 20(SP), CX JMP search_loop_encodeBlockAsm12B candidate3_match_encodeBlockAsm12B: ADDL $0x02, CX JMP candidate_match_encodeBlockAsm12B candidate2_match_encodeBlockAsm12B: MOVL R8, 24(SP)(R9*4) INCL CX MOVL DI, BX candidate_match_encodeBlockAsm12B: MOVL 12(SP), SI TESTL BX, BX JZ match_extend_back_end_encodeBlockAsm12B match_extend_back_loop_encodeBlockAsm12B: CMPL CX, SI JLE match_extend_back_end_encodeBlockAsm12B MOVB -1(DX)(BX*1), DI MOVB -1(DX)(CX*1), R8 CMPB DI, R8 JNE match_extend_back_end_encodeBlockAsm12B LEAL -1(CX), CX DECL BX JZ match_extend_back_end_encodeBlockAsm12B JMP match_extend_back_loop_encodeBlockAsm12B match_extend_back_end_encodeBlockAsm12B: MOVL CX, SI SUBL 12(SP), SI LEAQ 3(AX)(SI*1), SI CMPQ SI, (SP) JL match_dst_size_check_encodeBlockAsm12B MOVQ $0x00000000, ret+48(FP) RET match_dst_size_check_encodeBlockAsm12B: MOVL CX, SI MOVL 12(SP), DI CMPL DI, SI JEQ emit_literal_done_match_emit_encodeBlockAsm12B MOVL SI, R8 MOVL SI, 12(SP) LEAQ (DX)(DI*1), SI SUBL DI, R8 LEAL -1(R8), DI CMPL DI, $0x3c JLT one_byte_match_emit_encodeBlockAsm12B CMPL DI, $0x00000100 JLT two_bytes_match_emit_encodeBlockAsm12B MOVB $0xf4, (AX) MOVW DI, 1(AX) ADDQ $0x03, AX JMP memmove_long_match_emit_encodeBlockAsm12B two_bytes_match_emit_encodeBlockAsm12B: MOVB $0xf0, (AX) MOVB DI, 1(AX) ADDQ $0x02, AX CMPL DI, $0x40 JL memmove_match_emit_encodeBlockAsm12B JMP memmove_long_match_emit_encodeBlockAsm12B one_byte_match_emit_encodeBlockAsm12B: SHLB $0x02, DI MOVB DI, (AX) ADDQ $0x01, AX memmove_match_emit_encodeBlockAsm12B: LEAQ (AX)(R8*1), DI // genMemMoveShort CMPQ R8, $0x08 JLE emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8 CMPQ R8, $0x10 JBE emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8through16 CMPQ R8, $0x20 JBE emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_17through32 JMP 
emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_33through64 emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8: MOVQ (SI), R9 MOVQ R9, (AX) JMP memmove_end_copy_match_emit_encodeBlockAsm12B emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_8through16: MOVQ (SI), R9 MOVQ -8(SI)(R8*1), SI MOVQ R9, (AX) MOVQ SI, -8(AX)(R8*1) JMP memmove_end_copy_match_emit_encodeBlockAsm12B emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_17through32: MOVOU (SI), X0 MOVOU -16(SI)(R8*1), X1 MOVOU X0, (AX) MOVOU X1, -16(AX)(R8*1) JMP memmove_end_copy_match_emit_encodeBlockAsm12B emit_lit_memmove_match_emit_encodeBlockAsm12B_memmove_move_33through64: MOVOU (SI), X0 MOVOU 16(SI), X1 MOVOU -32(SI)(R8*1), X2 MOVOU -16(SI)(R8*1), X3 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(R8*1) MOVOU X3, -16(AX)(R8*1) memmove_end_copy_match_emit_encodeBlockAsm12B: MOVQ DI, AX JMP emit_literal_done_match_emit_encodeBlockAsm12B memmove_long_match_emit_encodeBlockAsm12B: LEAQ (AX)(R8*1), DI // genMemMoveLong MOVOU (SI), X0 MOVOU 16(SI), X1 MOVOU -32(SI)(R8*1), X2 MOVOU -16(SI)(R8*1), X3 MOVQ R8, R10 SHRQ $0x05, R10 MOVQ AX, R9 ANDL $0x0000001f, R9 MOVQ $0x00000040, R11 SUBQ R9, R11 DECQ R10 JA emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_forward_sse_loop_32 LEAQ -32(SI)(R11*1), R9 LEAQ -32(AX)(R11*1), R12 emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_big_loop_back: MOVOU (R9), X4 MOVOU 16(R9), X5 MOVOA X4, (R12) MOVOA X5, 16(R12) ADDQ $0x20, R12 ADDQ $0x20, R9 ADDQ $0x20, R11 DECQ R10 JNA emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_big_loop_back emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_forward_sse_loop_32: MOVOU -32(SI)(R11*1), X4 MOVOU -16(SI)(R11*1), X5 MOVOA X4, -32(AX)(R11*1) MOVOA X5, -16(AX)(R11*1) ADDQ $0x20, R11 CMPQ R8, R11 JAE emit_lit_memmove_long_match_emit_encodeBlockAsm12Blarge_forward_sse_loop_32 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(R8*1) MOVOU X3, -16(AX)(R8*1) MOVQ DI, AX 
emit_literal_done_match_emit_encodeBlockAsm12B: match_nolit_loop_encodeBlockAsm12B: MOVL CX, SI SUBL BX, SI MOVL SI, 16(SP) ADDL $0x04, CX ADDL $0x04, BX MOVQ src_len+32(FP), SI SUBL CX, SI LEAQ (DX)(CX*1), DI LEAQ (DX)(BX*1), BX // matchLen XORL R9, R9 CMPL SI, $0x08 JL matchlen_match4_match_nolit_encodeBlockAsm12B matchlen_loopback_match_nolit_encodeBlockAsm12B: MOVQ (DI)(R9*1), R8 XORQ (BX)(R9*1), R8 TESTQ R8, R8 JZ matchlen_loop_match_nolit_encodeBlockAsm12B #ifdef GOAMD64_v3 TZCNTQ R8, R8 #else BSFQ R8, R8 #endif SARQ $0x03, R8 LEAL (R9)(R8*1), R9 JMP match_nolit_end_encodeBlockAsm12B matchlen_loop_match_nolit_encodeBlockAsm12B: LEAL -8(SI), SI LEAL 8(R9), R9 CMPL SI, $0x08 JGE matchlen_loopback_match_nolit_encodeBlockAsm12B JZ match_nolit_end_encodeBlockAsm12B matchlen_match4_match_nolit_encodeBlockAsm12B: CMPL SI, $0x04 JL matchlen_match2_match_nolit_encodeBlockAsm12B MOVL (DI)(R9*1), R8 CMPL (BX)(R9*1), R8 JNE matchlen_match2_match_nolit_encodeBlockAsm12B SUBL $0x04, SI LEAL 4(R9), R9 matchlen_match2_match_nolit_encodeBlockAsm12B: CMPL SI, $0x02 JL matchlen_match1_match_nolit_encodeBlockAsm12B MOVW (DI)(R9*1), R8 CMPW (BX)(R9*1), R8 JNE matchlen_match1_match_nolit_encodeBlockAsm12B SUBL $0x02, SI LEAL 2(R9), R9 matchlen_match1_match_nolit_encodeBlockAsm12B: CMPL SI, $0x01 JL match_nolit_end_encodeBlockAsm12B MOVB (DI)(R9*1), R8 CMPB (BX)(R9*1), R8 JNE match_nolit_end_encodeBlockAsm12B LEAL 1(R9), R9 match_nolit_end_encodeBlockAsm12B: ADDL R9, CX MOVL 16(SP), BX ADDL $0x04, R9 MOVL CX, 12(SP) // emitCopy CMPL R9, $0x40 JLE two_byte_offset_short_match_nolit_encodeBlockAsm12B CMPL BX, $0x00000800 JAE long_offset_short_match_nolit_encodeBlockAsm12B MOVL $0x00000001, SI LEAL 16(SI), SI MOVB BL, 1(AX) SHRL $0x08, BX SHLL $0x05, BX ORL BX, SI MOVB SI, (AX) ADDQ $0x02, AX SUBL $0x08, R9 // emitRepeat LEAL -4(R9), R9 JMP cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short_2b MOVL R9, SI LEAL -4(R9), R9 CMPL SI, $0x08 JLE 
repeat_two_match_nolit_encodeBlockAsm12B_emit_copy_short_2b CMPL SI, $0x0c JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short_2b CMPL BX, $0x00000800 JLT repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short_2b cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short_2b: CMPL R9, $0x00000104 JLT repeat_three_match_nolit_encodeBlockAsm12B_emit_copy_short_2b LEAL -256(R9), R9 MOVW $0x0019, (AX) MOVW R9, 2(AX) ADDQ $0x04, AX JMP match_nolit_emitcopy_end_encodeBlockAsm12B repeat_three_match_nolit_encodeBlockAsm12B_emit_copy_short_2b: LEAL -4(R9), R9 MOVW $0x0015, (AX) MOVB R9, 2(AX) ADDQ $0x03, AX JMP match_nolit_emitcopy_end_encodeBlockAsm12B repeat_two_match_nolit_encodeBlockAsm12B_emit_copy_short_2b: SHLL $0x02, R9 ORL $0x01, R9 MOVW R9, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBlockAsm12B repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short_2b: XORQ SI, SI LEAL 1(SI)(R9*4), R9 MOVB BL, 1(AX) SARL $0x08, BX SHLL $0x05, BX ORL BX, R9 MOVB R9, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBlockAsm12B long_offset_short_match_nolit_encodeBlockAsm12B: MOVB $0xee, (AX) MOVW BX, 1(AX) LEAL -60(R9), R9 ADDQ $0x03, AX // emitRepeat MOVL R9, SI LEAL -4(R9), R9 CMPL SI, $0x08 JLE repeat_two_match_nolit_encodeBlockAsm12B_emit_copy_short CMPL SI, $0x0c JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short CMPL BX, $0x00000800 JLT repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short cant_repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short: CMPL R9, $0x00000104 JLT repeat_three_match_nolit_encodeBlockAsm12B_emit_copy_short LEAL -256(R9), R9 MOVW $0x0019, (AX) MOVW R9, 2(AX) ADDQ $0x04, AX JMP match_nolit_emitcopy_end_encodeBlockAsm12B repeat_three_match_nolit_encodeBlockAsm12B_emit_copy_short: LEAL -4(R9), R9 MOVW $0x0015, (AX) MOVB R9, 2(AX) ADDQ $0x03, AX JMP match_nolit_emitcopy_end_encodeBlockAsm12B repeat_two_match_nolit_encodeBlockAsm12B_emit_copy_short: 
SHLL $0x02, R9 ORL $0x01, R9 MOVW R9, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBlockAsm12B repeat_two_offset_match_nolit_encodeBlockAsm12B_emit_copy_short: XORQ SI, SI LEAL 1(SI)(R9*4), R9 MOVB BL, 1(AX) SARL $0x08, BX SHLL $0x05, BX ORL BX, R9 MOVB R9, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBlockAsm12B two_byte_offset_short_match_nolit_encodeBlockAsm12B: MOVL R9, SI SHLL $0x02, SI CMPL R9, $0x0c JGE emit_copy_three_match_nolit_encodeBlockAsm12B CMPL BX, $0x00000800 JGE emit_copy_three_match_nolit_encodeBlockAsm12B LEAL -15(SI), SI MOVB BL, 1(AX) SHRL $0x08, BX SHLL $0x05, BX ORL BX, SI MOVB SI, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBlockAsm12B emit_copy_three_match_nolit_encodeBlockAsm12B: LEAL -2(SI), SI MOVB SI, (AX) MOVW BX, 1(AX) ADDQ $0x03, AX match_nolit_emitcopy_end_encodeBlockAsm12B: CMPL CX, 8(SP) JGE emit_remainder_encodeBlockAsm12B MOVQ -2(DX)(CX*1), SI CMPQ AX, (SP) JL match_nolit_dst_ok_encodeBlockAsm12B MOVQ $0x00000000, ret+48(FP) RET match_nolit_dst_ok_encodeBlockAsm12B: MOVQ $0x000000cf1bbcdcbb, R8 MOVQ SI, DI SHRQ $0x10, SI MOVQ SI, BX SHLQ $0x18, DI IMULQ R8, DI SHRQ $0x34, DI SHLQ $0x18, BX IMULQ R8, BX SHRQ $0x34, BX LEAL -2(CX), R8 LEAQ 24(SP)(BX*4), R9 MOVL (R9), BX MOVL R8, 24(SP)(DI*4) MOVL CX, (R9) CMPL (DX)(BX*1), SI JEQ match_nolit_loop_encodeBlockAsm12B INCL CX JMP search_loop_encodeBlockAsm12B emit_remainder_encodeBlockAsm12B: MOVQ src_len+32(FP), CX SUBL 12(SP), CX LEAQ 3(AX)(CX*1), CX CMPQ CX, (SP) JL emit_remainder_ok_encodeBlockAsm12B MOVQ $0x00000000, ret+48(FP) RET emit_remainder_ok_encodeBlockAsm12B: MOVQ src_len+32(FP), CX MOVL 12(SP), BX CMPL BX, CX JEQ emit_literal_done_emit_remainder_encodeBlockAsm12B MOVL CX, SI MOVL CX, 12(SP) LEAQ (DX)(BX*1), CX SUBL BX, SI LEAL -1(SI), DX CMPL DX, $0x3c JLT one_byte_emit_remainder_encodeBlockAsm12B CMPL DX, $0x00000100 JLT two_bytes_emit_remainder_encodeBlockAsm12B MOVB $0xf4, (AX) MOVW DX, 1(AX) ADDQ $0x03, AX JMP 
memmove_long_emit_remainder_encodeBlockAsm12B two_bytes_emit_remainder_encodeBlockAsm12B: MOVB $0xf0, (AX) MOVB DL, 1(AX) ADDQ $0x02, AX CMPL DX, $0x40 JL memmove_emit_remainder_encodeBlockAsm12B JMP memmove_long_emit_remainder_encodeBlockAsm12B one_byte_emit_remainder_encodeBlockAsm12B: SHLB $0x02, DL MOVB DL, (AX) ADDQ $0x01, AX memmove_emit_remainder_encodeBlockAsm12B: LEAQ (AX)(SI*1), DX MOVL SI, BX // genMemMoveShort CMPQ BX, $0x03 JB emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_1or2 JE emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_3 CMPQ BX, $0x08 JB emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_4through7 CMPQ BX, $0x10 JBE emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_8through16 CMPQ BX, $0x20 JBE emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_17through32 JMP emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_33through64 emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_1or2: MOVB (CX), SI MOVB -1(CX)(BX*1), CL MOVB SI, (AX) MOVB CL, -1(AX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeBlockAsm12B emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_3: MOVW (CX), SI MOVB 2(CX), CL MOVW SI, (AX) MOVB CL, 2(AX) JMP memmove_end_copy_emit_remainder_encodeBlockAsm12B emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_4through7: MOVL (CX), SI MOVL -4(CX)(BX*1), CX MOVL SI, (AX) MOVL CX, -4(AX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeBlockAsm12B emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_8through16: MOVQ (CX), SI MOVQ -8(CX)(BX*1), CX MOVQ SI, (AX) MOVQ CX, -8(AX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeBlockAsm12B emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_17through32: MOVOU (CX), X0 MOVOU -16(CX)(BX*1), X1 MOVOU X0, (AX) MOVOU X1, -16(AX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeBlockAsm12B emit_lit_memmove_emit_remainder_encodeBlockAsm12B_memmove_move_33through64: 
MOVOU (CX), X0 MOVOU 16(CX), X1 MOVOU -32(CX)(BX*1), X2 MOVOU -16(CX)(BX*1), X3 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(BX*1) MOVOU X3, -16(AX)(BX*1) memmove_end_copy_emit_remainder_encodeBlockAsm12B: MOVQ DX, AX JMP emit_literal_done_emit_remainder_encodeBlockAsm12B memmove_long_emit_remainder_encodeBlockAsm12B: LEAQ (AX)(SI*1), DX MOVL SI, BX // genMemMoveLong MOVOU (CX), X0 MOVOU 16(CX), X1 MOVOU -32(CX)(BX*1), X2 MOVOU -16(CX)(BX*1), X3 MOVQ BX, DI SHRQ $0x05, DI MOVQ AX, SI ANDL $0x0000001f, SI MOVQ $0x00000040, R8 SUBQ SI, R8 DECQ DI JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_forward_sse_loop_32 LEAQ -32(CX)(R8*1), SI LEAQ -32(AX)(R8*1), R9 emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_big_loop_back: MOVOU (SI), X4 MOVOU 16(SI), X5 MOVOA X4, (R9) MOVOA X5, 16(R9) ADDQ $0x20, R9 ADDQ $0x20, SI ADDQ $0x20, R8 DECQ DI JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_big_loop_back emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_forward_sse_loop_32: MOVOU -32(CX)(R8*1), X4 MOVOU -16(CX)(R8*1), X5 MOVOA X4, -32(AX)(R8*1) MOVOA X5, -16(AX)(R8*1) ADDQ $0x20, R8 CMPQ BX, R8 JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsm12Blarge_forward_sse_loop_32 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(BX*1) MOVOU X3, -16(AX)(BX*1) MOVQ DX, AX emit_literal_done_emit_remainder_encodeBlockAsm12B: MOVQ dst_base+0(FP), CX SUBQ CX, AX MOVQ AX, ret+48(FP) RET // func encodeBlockAsm10B(dst []byte, src []byte) int // Requires: BMI, SSE2 TEXT ·encodeBlockAsm10B(SB), $4120-56 MOVQ dst_base+0(FP), AX MOVQ $0x00000020, CX LEAQ 24(SP), DX PXOR X0, X0 zero_loop_encodeBlockAsm10B: MOVOU X0, (DX) MOVOU X0, 16(DX) MOVOU X0, 32(DX) MOVOU X0, 48(DX) MOVOU X0, 64(DX) MOVOU X0, 80(DX) MOVOU X0, 96(DX) MOVOU X0, 112(DX) ADDQ $0x80, DX DECQ CX JNZ zero_loop_encodeBlockAsm10B MOVL $0x00000000, 12(SP) MOVQ src_len+32(FP), CX LEAQ -9(CX), DX LEAQ -8(CX), BX MOVL BX, 8(SP) SHRQ $0x05, CX SUBL CX, DX LEAQ (AX)(DX*1), DX 
MOVQ DX, (SP) MOVL $0x00000001, CX MOVL CX, 16(SP) MOVQ src_base+24(FP), DX search_loop_encodeBlockAsm10B: MOVL CX, BX SUBL 12(SP), BX SHRL $0x05, BX LEAL 4(CX)(BX*1), BX CMPL BX, 8(SP) JGE emit_remainder_encodeBlockAsm10B MOVQ (DX)(CX*1), SI MOVL BX, 20(SP) MOVQ $0x9e3779b1, R8 MOVQ SI, R9 MOVQ SI, R10 SHRQ $0x08, R10 SHLQ $0x20, R9 IMULQ R8, R9 SHRQ $0x36, R9 SHLQ $0x20, R10 IMULQ R8, R10 SHRQ $0x36, R10 MOVL 24(SP)(R9*4), BX MOVL 24(SP)(R10*4), DI MOVL CX, 24(SP)(R9*4) LEAL 1(CX), R9 MOVL R9, 24(SP)(R10*4) MOVQ SI, R9 SHRQ $0x10, R9 SHLQ $0x20, R9 IMULQ R8, R9 SHRQ $0x36, R9 MOVL CX, R8 SUBL 16(SP), R8 MOVL 1(DX)(R8*1), R10 MOVQ SI, R8 SHRQ $0x08, R8 CMPL R8, R10 JNE no_repeat_found_encodeBlockAsm10B LEAL 1(CX), SI MOVL 12(SP), DI MOVL SI, BX SUBL 16(SP), BX JZ repeat_extend_back_end_encodeBlockAsm10B repeat_extend_back_loop_encodeBlockAsm10B: CMPL SI, DI JLE repeat_extend_back_end_encodeBlockAsm10B MOVB -1(DX)(BX*1), R8 MOVB -1(DX)(SI*1), R9 CMPB R8, R9 JNE repeat_extend_back_end_encodeBlockAsm10B LEAL -1(SI), SI DECL BX JNZ repeat_extend_back_loop_encodeBlockAsm10B repeat_extend_back_end_encodeBlockAsm10B: MOVL 12(SP), BX CMPL BX, SI JEQ emit_literal_done_repeat_emit_encodeBlockAsm10B MOVL SI, R8 MOVL SI, 12(SP) LEAQ (DX)(BX*1), R9 SUBL BX, R8 LEAL -1(R8), BX CMPL BX, $0x3c JLT one_byte_repeat_emit_encodeBlockAsm10B CMPL BX, $0x00000100 JLT two_bytes_repeat_emit_encodeBlockAsm10B MOVB $0xf4, (AX) MOVW BX, 1(AX) ADDQ $0x03, AX JMP memmove_long_repeat_emit_encodeBlockAsm10B two_bytes_repeat_emit_encodeBlockAsm10B: MOVB $0xf0, (AX) MOVB BL, 1(AX) ADDQ $0x02, AX CMPL BX, $0x40 JL memmove_repeat_emit_encodeBlockAsm10B JMP memmove_long_repeat_emit_encodeBlockAsm10B one_byte_repeat_emit_encodeBlockAsm10B: SHLB $0x02, BL MOVB BL, (AX) ADDQ $0x01, AX memmove_repeat_emit_encodeBlockAsm10B: LEAQ (AX)(R8*1), BX // genMemMoveShort CMPQ R8, $0x08 JLE emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_8 CMPQ R8, $0x10 JBE 
emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_8through16 CMPQ R8, $0x20 JBE emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_17through32 JMP emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_33through64 emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_8: MOVQ (R9), R10 MOVQ R10, (AX) JMP memmove_end_copy_repeat_emit_encodeBlockAsm10B emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_8through16: MOVQ (R9), R10 MOVQ -8(R9)(R8*1), R9 MOVQ R10, (AX) MOVQ R9, -8(AX)(R8*1) JMP memmove_end_copy_repeat_emit_encodeBlockAsm10B emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_17through32: MOVOU (R9), X0 MOVOU -16(R9)(R8*1), X1 MOVOU X0, (AX) MOVOU X1, -16(AX)(R8*1) JMP memmove_end_copy_repeat_emit_encodeBlockAsm10B emit_lit_memmove_repeat_emit_encodeBlockAsm10B_memmove_move_33through64: MOVOU (R9), X0 MOVOU 16(R9), X1 MOVOU -32(R9)(R8*1), X2 MOVOU -16(R9)(R8*1), X3 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(R8*1) MOVOU X3, -16(AX)(R8*1) memmove_end_copy_repeat_emit_encodeBlockAsm10B: MOVQ BX, AX JMP emit_literal_done_repeat_emit_encodeBlockAsm10B memmove_long_repeat_emit_encodeBlockAsm10B: LEAQ (AX)(R8*1), BX // genMemMoveLong MOVOU (R9), X0 MOVOU 16(R9), X1 MOVOU -32(R9)(R8*1), X2 MOVOU -16(R9)(R8*1), X3 MOVQ R8, R11 SHRQ $0x05, R11 MOVQ AX, R10 ANDL $0x0000001f, R10 MOVQ $0x00000040, R12 SUBQ R10, R12 DECQ R11 JA emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_forward_sse_loop_32 LEAQ -32(R9)(R12*1), R10 LEAQ -32(AX)(R12*1), R13 emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_big_loop_back: MOVOU (R10), X4 MOVOU 16(R10), X5 MOVOA X4, (R13) MOVOA X5, 16(R13) ADDQ $0x20, R13 ADDQ $0x20, R10 ADDQ $0x20, R12 DECQ R11 JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_big_loop_back emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_forward_sse_loop_32: MOVOU -32(R9)(R12*1), X4 MOVOU -16(R9)(R12*1), X5 MOVOA X4, -32(AX)(R12*1) MOVOA X5, -16(AX)(R12*1) ADDQ $0x20, R12 CMPQ 
R8, R12 JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsm10Blarge_forward_sse_loop_32 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(R8*1) MOVOU X3, -16(AX)(R8*1) MOVQ BX, AX emit_literal_done_repeat_emit_encodeBlockAsm10B: ADDL $0x05, CX MOVL CX, BX SUBL 16(SP), BX MOVQ src_len+32(FP), R8 SUBL CX, R8 LEAQ (DX)(CX*1), R9 LEAQ (DX)(BX*1), BX // matchLen XORL R11, R11 CMPL R8, $0x08 JL matchlen_match4_repeat_extend_encodeBlockAsm10B matchlen_loopback_repeat_extend_encodeBlockAsm10B: MOVQ (R9)(R11*1), R10 XORQ (BX)(R11*1), R10 TESTQ R10, R10 JZ matchlen_loop_repeat_extend_encodeBlockAsm10B #ifdef GOAMD64_v3 TZCNTQ R10, R10 #else BSFQ R10, R10 #endif SARQ $0x03, R10 LEAL (R11)(R10*1), R11 JMP repeat_extend_forward_end_encodeBlockAsm10B matchlen_loop_repeat_extend_encodeBlockAsm10B: LEAL -8(R8), R8 LEAL 8(R11), R11 CMPL R8, $0x08 JGE matchlen_loopback_repeat_extend_encodeBlockAsm10B JZ repeat_extend_forward_end_encodeBlockAsm10B matchlen_match4_repeat_extend_encodeBlockAsm10B: CMPL R8, $0x04 JL matchlen_match2_repeat_extend_encodeBlockAsm10B MOVL (R9)(R11*1), R10 CMPL (BX)(R11*1), R10 JNE matchlen_match2_repeat_extend_encodeBlockAsm10B SUBL $0x04, R8 LEAL 4(R11), R11 matchlen_match2_repeat_extend_encodeBlockAsm10B: CMPL R8, $0x02 JL matchlen_match1_repeat_extend_encodeBlockAsm10B MOVW (R9)(R11*1), R10 CMPW (BX)(R11*1), R10 JNE matchlen_match1_repeat_extend_encodeBlockAsm10B SUBL $0x02, R8 LEAL 2(R11), R11 matchlen_match1_repeat_extend_encodeBlockAsm10B: CMPL R8, $0x01 JL repeat_extend_forward_end_encodeBlockAsm10B MOVB (R9)(R11*1), R10 CMPB (BX)(R11*1), R10 JNE repeat_extend_forward_end_encodeBlockAsm10B LEAL 1(R11), R11 repeat_extend_forward_end_encodeBlockAsm10B: ADDL R11, CX MOVL CX, BX SUBL SI, BX MOVL 16(SP), SI TESTL DI, DI JZ repeat_as_copy_encodeBlockAsm10B // emitRepeat MOVL BX, DI LEAL -4(BX), BX CMPL DI, $0x08 JLE repeat_two_match_repeat_encodeBlockAsm10B CMPL DI, $0x0c JGE cant_repeat_two_offset_match_repeat_encodeBlockAsm10B CMPL SI, $0x00000800 JLT 
repeat_two_offset_match_repeat_encodeBlockAsm10B cant_repeat_two_offset_match_repeat_encodeBlockAsm10B: CMPL BX, $0x00000104 JLT repeat_three_match_repeat_encodeBlockAsm10B LEAL -256(BX), BX MOVW $0x0019, (AX) MOVW BX, 2(AX) ADDQ $0x04, AX JMP repeat_end_emit_encodeBlockAsm10B repeat_three_match_repeat_encodeBlockAsm10B: LEAL -4(BX), BX MOVW $0x0015, (AX) MOVB BL, 2(AX) ADDQ $0x03, AX JMP repeat_end_emit_encodeBlockAsm10B repeat_two_match_repeat_encodeBlockAsm10B: SHLL $0x02, BX ORL $0x01, BX MOVW BX, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeBlockAsm10B repeat_two_offset_match_repeat_encodeBlockAsm10B: XORQ DI, DI LEAL 1(DI)(BX*4), BX MOVB SI, 1(AX) SARL $0x08, SI SHLL $0x05, SI ORL SI, BX MOVB BL, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeBlockAsm10B repeat_as_copy_encodeBlockAsm10B: // emitCopy CMPL BX, $0x40 JLE two_byte_offset_short_repeat_as_copy_encodeBlockAsm10B CMPL SI, $0x00000800 JAE long_offset_short_repeat_as_copy_encodeBlockAsm10B MOVL $0x00000001, DI LEAL 16(DI), DI MOVB SI, 1(AX) SHRL $0x08, SI SHLL $0x05, SI ORL SI, DI MOVB DI, (AX) ADDQ $0x02, AX SUBL $0x08, BX // emitRepeat LEAL -4(BX), BX JMP cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b MOVL BX, DI LEAL -4(BX), BX CMPL DI, $0x08 JLE repeat_two_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b CMPL DI, $0x0c JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b CMPL SI, $0x00000800 JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b: CMPL BX, $0x00000104 JLT repeat_three_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b LEAL -256(BX), BX MOVW $0x0019, (AX) MOVW BX, 2(AX) ADDQ $0x04, AX JMP repeat_end_emit_encodeBlockAsm10B repeat_three_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b: LEAL -4(BX), BX MOVW $0x0015, (AX) MOVB BL, 2(AX) ADDQ $0x03, AX JMP repeat_end_emit_encodeBlockAsm10B 
repeat_two_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b: SHLL $0x02, BX ORL $0x01, BX MOVW BX, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeBlockAsm10B repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short_2b: XORQ DI, DI LEAL 1(DI)(BX*4), BX MOVB SI, 1(AX) SARL $0x08, SI SHLL $0x05, SI ORL SI, BX MOVB BL, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeBlockAsm10B long_offset_short_repeat_as_copy_encodeBlockAsm10B: MOVB $0xee, (AX) MOVW SI, 1(AX) LEAL -60(BX), BX ADDQ $0x03, AX // emitRepeat MOVL BX, DI LEAL -4(BX), BX CMPL DI, $0x08 JLE repeat_two_repeat_as_copy_encodeBlockAsm10B_emit_copy_short CMPL DI, $0x0c JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short CMPL SI, $0x00000800 JLT repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short: CMPL BX, $0x00000104 JLT repeat_three_repeat_as_copy_encodeBlockAsm10B_emit_copy_short LEAL -256(BX), BX MOVW $0x0019, (AX) MOVW BX, 2(AX) ADDQ $0x04, AX JMP repeat_end_emit_encodeBlockAsm10B repeat_three_repeat_as_copy_encodeBlockAsm10B_emit_copy_short: LEAL -4(BX), BX MOVW $0x0015, (AX) MOVB BL, 2(AX) ADDQ $0x03, AX JMP repeat_end_emit_encodeBlockAsm10B repeat_two_repeat_as_copy_encodeBlockAsm10B_emit_copy_short: SHLL $0x02, BX ORL $0x01, BX MOVW BX, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeBlockAsm10B repeat_two_offset_repeat_as_copy_encodeBlockAsm10B_emit_copy_short: XORQ DI, DI LEAL 1(DI)(BX*4), BX MOVB SI, 1(AX) SARL $0x08, SI SHLL $0x05, SI ORL SI, BX MOVB BL, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeBlockAsm10B two_byte_offset_short_repeat_as_copy_encodeBlockAsm10B: MOVL BX, DI SHLL $0x02, DI CMPL BX, $0x0c JGE emit_copy_three_repeat_as_copy_encodeBlockAsm10B CMPL SI, $0x00000800 JGE emit_copy_three_repeat_as_copy_encodeBlockAsm10B LEAL -15(DI), DI MOVB SI, 1(AX) SHRL $0x08, SI SHLL $0x05, SI ORL SI, DI MOVB DI, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeBlockAsm10B 
emit_copy_three_repeat_as_copy_encodeBlockAsm10B: LEAL -2(DI), DI MOVB DI, (AX) MOVW SI, 1(AX) ADDQ $0x03, AX repeat_end_emit_encodeBlockAsm10B: MOVL CX, 12(SP) JMP search_loop_encodeBlockAsm10B no_repeat_found_encodeBlockAsm10B: CMPL (DX)(BX*1), SI JEQ candidate_match_encodeBlockAsm10B SHRQ $0x08, SI MOVL 24(SP)(R9*4), BX LEAL 2(CX), R8 CMPL (DX)(DI*1), SI JEQ candidate2_match_encodeBlockAsm10B MOVL R8, 24(SP)(R9*4) SHRQ $0x08, SI CMPL (DX)(BX*1), SI JEQ candidate3_match_encodeBlockAsm10B MOVL 20(SP), CX JMP search_loop_encodeBlockAsm10B candidate3_match_encodeBlockAsm10B: ADDL $0x02, CX JMP candidate_match_encodeBlockAsm10B candidate2_match_encodeBlockAsm10B: MOVL R8, 24(SP)(R9*4) INCL CX MOVL DI, BX candidate_match_encodeBlockAsm10B: MOVL 12(SP), SI TESTL BX, BX JZ match_extend_back_end_encodeBlockAsm10B match_extend_back_loop_encodeBlockAsm10B: CMPL CX, SI JLE match_extend_back_end_encodeBlockAsm10B MOVB -1(DX)(BX*1), DI MOVB -1(DX)(CX*1), R8 CMPB DI, R8 JNE match_extend_back_end_encodeBlockAsm10B LEAL -1(CX), CX DECL BX JZ match_extend_back_end_encodeBlockAsm10B JMP match_extend_back_loop_encodeBlockAsm10B match_extend_back_end_encodeBlockAsm10B: MOVL CX, SI SUBL 12(SP), SI LEAQ 3(AX)(SI*1), SI CMPQ SI, (SP) JL match_dst_size_check_encodeBlockAsm10B MOVQ $0x00000000, ret+48(FP) RET match_dst_size_check_encodeBlockAsm10B: MOVL CX, SI MOVL 12(SP), DI CMPL DI, SI JEQ emit_literal_done_match_emit_encodeBlockAsm10B MOVL SI, R8 MOVL SI, 12(SP) LEAQ (DX)(DI*1), SI SUBL DI, R8 LEAL -1(R8), DI CMPL DI, $0x3c JLT one_byte_match_emit_encodeBlockAsm10B CMPL DI, $0x00000100 JLT two_bytes_match_emit_encodeBlockAsm10B MOVB $0xf4, (AX) MOVW DI, 1(AX) ADDQ $0x03, AX JMP memmove_long_match_emit_encodeBlockAsm10B two_bytes_match_emit_encodeBlockAsm10B: MOVB $0xf0, (AX) MOVB DI, 1(AX) ADDQ $0x02, AX CMPL DI, $0x40 JL memmove_match_emit_encodeBlockAsm10B JMP memmove_long_match_emit_encodeBlockAsm10B one_byte_match_emit_encodeBlockAsm10B: SHLB $0x02, DI MOVB DI, (AX) ADDQ $0x01, AX 
memmove_match_emit_encodeBlockAsm10B: LEAQ (AX)(R8*1), DI // genMemMoveShort CMPQ R8, $0x08 JLE emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_8 CMPQ R8, $0x10 JBE emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_8through16 CMPQ R8, $0x20 JBE emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_17through32 JMP emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_33through64 emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_8: MOVQ (SI), R9 MOVQ R9, (AX) JMP memmove_end_copy_match_emit_encodeBlockAsm10B emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_8through16: MOVQ (SI), R9 MOVQ -8(SI)(R8*1), SI MOVQ R9, (AX) MOVQ SI, -8(AX)(R8*1) JMP memmove_end_copy_match_emit_encodeBlockAsm10B emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_17through32: MOVOU (SI), X0 MOVOU -16(SI)(R8*1), X1 MOVOU X0, (AX) MOVOU X1, -16(AX)(R8*1) JMP memmove_end_copy_match_emit_encodeBlockAsm10B emit_lit_memmove_match_emit_encodeBlockAsm10B_memmove_move_33through64: MOVOU (SI), X0 MOVOU 16(SI), X1 MOVOU -32(SI)(R8*1), X2 MOVOU -16(SI)(R8*1), X3 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(R8*1) MOVOU X3, -16(AX)(R8*1) memmove_end_copy_match_emit_encodeBlockAsm10B: MOVQ DI, AX JMP emit_literal_done_match_emit_encodeBlockAsm10B memmove_long_match_emit_encodeBlockAsm10B: LEAQ (AX)(R8*1), DI // genMemMoveLong MOVOU (SI), X0 MOVOU 16(SI), X1 MOVOU -32(SI)(R8*1), X2 MOVOU -16(SI)(R8*1), X3 MOVQ R8, R10 SHRQ $0x05, R10 MOVQ AX, R9 ANDL $0x0000001f, R9 MOVQ $0x00000040, R11 SUBQ R9, R11 DECQ R10 JA emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_forward_sse_loop_32 LEAQ -32(SI)(R11*1), R9 LEAQ -32(AX)(R11*1), R12 emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_big_loop_back: MOVOU (R9), X4 MOVOU 16(R9), X5 MOVOA X4, (R12) MOVOA X5, 16(R12) ADDQ $0x20, R12 ADDQ $0x20, R9 ADDQ $0x20, R11 DECQ R10 JNA emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_big_loop_back 
emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_forward_sse_loop_32: MOVOU -32(SI)(R11*1), X4 MOVOU -16(SI)(R11*1), X5 MOVOA X4, -32(AX)(R11*1) MOVOA X5, -16(AX)(R11*1) ADDQ $0x20, R11 CMPQ R8, R11 JAE emit_lit_memmove_long_match_emit_encodeBlockAsm10Blarge_forward_sse_loop_32 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(R8*1) MOVOU X3, -16(AX)(R8*1) MOVQ DI, AX emit_literal_done_match_emit_encodeBlockAsm10B: match_nolit_loop_encodeBlockAsm10B: MOVL CX, SI SUBL BX, SI MOVL SI, 16(SP) ADDL $0x04, CX ADDL $0x04, BX MOVQ src_len+32(FP), SI SUBL CX, SI LEAQ (DX)(CX*1), DI LEAQ (DX)(BX*1), BX // matchLen XORL R9, R9 CMPL SI, $0x08 JL matchlen_match4_match_nolit_encodeBlockAsm10B matchlen_loopback_match_nolit_encodeBlockAsm10B: MOVQ (DI)(R9*1), R8 XORQ (BX)(R9*1), R8 TESTQ R8, R8 JZ matchlen_loop_match_nolit_encodeBlockAsm10B #ifdef GOAMD64_v3 TZCNTQ R8, R8 #else BSFQ R8, R8 #endif SARQ $0x03, R8 LEAL (R9)(R8*1), R9 JMP match_nolit_end_encodeBlockAsm10B matchlen_loop_match_nolit_encodeBlockAsm10B: LEAL -8(SI), SI LEAL 8(R9), R9 CMPL SI, $0x08 JGE matchlen_loopback_match_nolit_encodeBlockAsm10B JZ match_nolit_end_encodeBlockAsm10B matchlen_match4_match_nolit_encodeBlockAsm10B: CMPL SI, $0x04 JL matchlen_match2_match_nolit_encodeBlockAsm10B MOVL (DI)(R9*1), R8 CMPL (BX)(R9*1), R8 JNE matchlen_match2_match_nolit_encodeBlockAsm10B SUBL $0x04, SI LEAL 4(R9), R9 matchlen_match2_match_nolit_encodeBlockAsm10B: CMPL SI, $0x02 JL matchlen_match1_match_nolit_encodeBlockAsm10B MOVW (DI)(R9*1), R8 CMPW (BX)(R9*1), R8 JNE matchlen_match1_match_nolit_encodeBlockAsm10B SUBL $0x02, SI LEAL 2(R9), R9 matchlen_match1_match_nolit_encodeBlockAsm10B: CMPL SI, $0x01 JL match_nolit_end_encodeBlockAsm10B MOVB (DI)(R9*1), R8 CMPB (BX)(R9*1), R8 JNE match_nolit_end_encodeBlockAsm10B LEAL 1(R9), R9 match_nolit_end_encodeBlockAsm10B: ADDL R9, CX MOVL 16(SP), BX ADDL $0x04, R9 MOVL CX, 12(SP) // emitCopy CMPL R9, $0x40 JLE two_byte_offset_short_match_nolit_encodeBlockAsm10B CMPL BX, 
$0x00000800 JAE long_offset_short_match_nolit_encodeBlockAsm10B MOVL $0x00000001, SI LEAL 16(SI), SI MOVB BL, 1(AX) SHRL $0x08, BX SHLL $0x05, BX ORL BX, SI MOVB SI, (AX) ADDQ $0x02, AX SUBL $0x08, R9 // emitRepeat LEAL -4(R9), R9 JMP cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short_2b MOVL R9, SI LEAL -4(R9), R9 CMPL SI, $0x08 JLE repeat_two_match_nolit_encodeBlockAsm10B_emit_copy_short_2b CMPL SI, $0x0c JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short_2b CMPL BX, $0x00000800 JLT repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short_2b cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short_2b: CMPL R9, $0x00000104 JLT repeat_three_match_nolit_encodeBlockAsm10B_emit_copy_short_2b LEAL -256(R9), R9 MOVW $0x0019, (AX) MOVW R9, 2(AX) ADDQ $0x04, AX JMP match_nolit_emitcopy_end_encodeBlockAsm10B repeat_three_match_nolit_encodeBlockAsm10B_emit_copy_short_2b: LEAL -4(R9), R9 MOVW $0x0015, (AX) MOVB R9, 2(AX) ADDQ $0x03, AX JMP match_nolit_emitcopy_end_encodeBlockAsm10B repeat_two_match_nolit_encodeBlockAsm10B_emit_copy_short_2b: SHLL $0x02, R9 ORL $0x01, R9 MOVW R9, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBlockAsm10B repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short_2b: XORQ SI, SI LEAL 1(SI)(R9*4), R9 MOVB BL, 1(AX) SARL $0x08, BX SHLL $0x05, BX ORL BX, R9 MOVB R9, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBlockAsm10B long_offset_short_match_nolit_encodeBlockAsm10B: MOVB $0xee, (AX) MOVW BX, 1(AX) LEAL -60(R9), R9 ADDQ $0x03, AX // emitRepeat MOVL R9, SI LEAL -4(R9), R9 CMPL SI, $0x08 JLE repeat_two_match_nolit_encodeBlockAsm10B_emit_copy_short CMPL SI, $0x0c JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short CMPL BX, $0x00000800 JLT repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short cant_repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short: CMPL R9, $0x00000104 JLT 
repeat_three_match_nolit_encodeBlockAsm10B_emit_copy_short LEAL -256(R9), R9 MOVW $0x0019, (AX) MOVW R9, 2(AX) ADDQ $0x04, AX JMP match_nolit_emitcopy_end_encodeBlockAsm10B repeat_three_match_nolit_encodeBlockAsm10B_emit_copy_short: LEAL -4(R9), R9 MOVW $0x0015, (AX) MOVB R9, 2(AX) ADDQ $0x03, AX JMP match_nolit_emitcopy_end_encodeBlockAsm10B repeat_two_match_nolit_encodeBlockAsm10B_emit_copy_short: SHLL $0x02, R9 ORL $0x01, R9 MOVW R9, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBlockAsm10B repeat_two_offset_match_nolit_encodeBlockAsm10B_emit_copy_short: XORQ SI, SI LEAL 1(SI)(R9*4), R9 MOVB BL, 1(AX) SARL $0x08, BX SHLL $0x05, BX ORL BX, R9 MOVB R9, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBlockAsm10B two_byte_offset_short_match_nolit_encodeBlockAsm10B: MOVL R9, SI SHLL $0x02, SI CMPL R9, $0x0c JGE emit_copy_three_match_nolit_encodeBlockAsm10B CMPL BX, $0x00000800 JGE emit_copy_three_match_nolit_encodeBlockAsm10B LEAL -15(SI), SI MOVB BL, 1(AX) SHRL $0x08, BX SHLL $0x05, BX ORL BX, SI MOVB SI, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBlockAsm10B emit_copy_three_match_nolit_encodeBlockAsm10B: LEAL -2(SI), SI MOVB SI, (AX) MOVW BX, 1(AX) ADDQ $0x03, AX match_nolit_emitcopy_end_encodeBlockAsm10B: CMPL CX, 8(SP) JGE emit_remainder_encodeBlockAsm10B MOVQ -2(DX)(CX*1), SI CMPQ AX, (SP) JL match_nolit_dst_ok_encodeBlockAsm10B MOVQ $0x00000000, ret+48(FP) RET match_nolit_dst_ok_encodeBlockAsm10B: MOVQ $0x9e3779b1, R8 MOVQ SI, DI SHRQ $0x10, SI MOVQ SI, BX SHLQ $0x20, DI IMULQ R8, DI SHRQ $0x36, DI SHLQ $0x20, BX IMULQ R8, BX SHRQ $0x36, BX LEAL -2(CX), R8 LEAQ 24(SP)(BX*4), R9 MOVL (R9), BX MOVL R8, 24(SP)(DI*4) MOVL CX, (R9) CMPL (DX)(BX*1), SI JEQ match_nolit_loop_encodeBlockAsm10B INCL CX JMP search_loop_encodeBlockAsm10B emit_remainder_encodeBlockAsm10B: MOVQ src_len+32(FP), CX SUBL 12(SP), CX LEAQ 3(AX)(CX*1), CX CMPQ CX, (SP) JL emit_remainder_ok_encodeBlockAsm10B MOVQ $0x00000000, ret+48(FP) RET 
emit_remainder_ok_encodeBlockAsm10B: MOVQ src_len+32(FP), CX MOVL 12(SP), BX CMPL BX, CX JEQ emit_literal_done_emit_remainder_encodeBlockAsm10B MOVL CX, SI MOVL CX, 12(SP) LEAQ (DX)(BX*1), CX SUBL BX, SI LEAL -1(SI), DX CMPL DX, $0x3c JLT one_byte_emit_remainder_encodeBlockAsm10B CMPL DX, $0x00000100 JLT two_bytes_emit_remainder_encodeBlockAsm10B MOVB $0xf4, (AX) MOVW DX, 1(AX) ADDQ $0x03, AX JMP memmove_long_emit_remainder_encodeBlockAsm10B two_bytes_emit_remainder_encodeBlockAsm10B: MOVB $0xf0, (AX) MOVB DL, 1(AX) ADDQ $0x02, AX CMPL DX, $0x40 JL memmove_emit_remainder_encodeBlockAsm10B JMP memmove_long_emit_remainder_encodeBlockAsm10B one_byte_emit_remainder_encodeBlockAsm10B: SHLB $0x02, DL MOVB DL, (AX) ADDQ $0x01, AX memmove_emit_remainder_encodeBlockAsm10B: LEAQ (AX)(SI*1), DX MOVL SI, BX // genMemMoveShort CMPQ BX, $0x03 JB emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_1or2 JE emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_3 CMPQ BX, $0x08 JB emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_4through7 CMPQ BX, $0x10 JBE emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_8through16 CMPQ BX, $0x20 JBE emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_17through32 JMP emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_33through64 emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_1or2: MOVB (CX), SI MOVB -1(CX)(BX*1), CL MOVB SI, (AX) MOVB CL, -1(AX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeBlockAsm10B emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_3: MOVW (CX), SI MOVB 2(CX), CL MOVW SI, (AX) MOVB CL, 2(AX) JMP memmove_end_copy_emit_remainder_encodeBlockAsm10B emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_4through7: MOVL (CX), SI MOVL -4(CX)(BX*1), CX MOVL SI, (AX) MOVL CX, -4(AX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeBlockAsm10B emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_8through16: MOVQ (CX), SI 
MOVQ -8(CX)(BX*1), CX
	MOVQ SI, (AX)
	MOVQ CX, -8(AX)(BX*1)
	JMP memmove_end_copy_emit_remainder_encodeBlockAsm10B

emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_17through32:
	MOVOU (CX), X0
	MOVOU -16(CX)(BX*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(BX*1)
	JMP memmove_end_copy_emit_remainder_encodeBlockAsm10B

emit_lit_memmove_emit_remainder_encodeBlockAsm10B_memmove_move_33through64:
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)

memmove_end_copy_emit_remainder_encodeBlockAsm10B:
	MOVQ DX, AX
	JMP emit_literal_done_emit_remainder_encodeBlockAsm10B

memmove_long_emit_remainder_encodeBlockAsm10B:
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveLong
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	MOVQ BX, DI
	SHRQ $0x05, DI
	MOVQ AX, SI
	ANDL $0x0000001f, SI
	MOVQ $0x00000040, R8
	SUBQ SI, R8
	DECQ DI
	JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_forward_sse_loop_32
	LEAQ -32(CX)(R8*1), SI
	LEAQ -32(AX)(R8*1), R9

emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_big_loop_back:
	MOVOU (SI), X4
	MOVOU 16(SI), X5
	MOVOA X4, (R9)
	MOVOA X5, 16(R9)
	ADDQ $0x20, R9
	ADDQ $0x20, SI
	ADDQ $0x20, R8
	DECQ DI
	JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_big_loop_back

emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_forward_sse_loop_32:
	MOVOU -32(CX)(R8*1), X4
	MOVOU -16(CX)(R8*1), X5
	MOVOA X4, -32(AX)(R8*1)
	MOVOA X5, -16(AX)(R8*1)
	ADDQ $0x20, R8
	CMPQ BX, R8
	JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsm10Blarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)
	MOVQ DX, AX

emit_literal_done_emit_remainder_encodeBlockAsm10B:
	MOVQ dst_base+0(FP), CX
	SUBQ CX, AX
	MOVQ AX, ret+48(FP)
	RET

// func encodeBlockAsm8B(dst []byte, src []byte) int
// Requires: BMI, SSE2
//
// Encodes src into dst and returns the number of bytes written
// (final write pointer minus dst_base). Returns 0 whenever the output
// would reach the limit stored at (SP).
//
// Stack frame:
//   0(SP)   8 bytes  dst limit pointer: dst_base + src_len - 9 - (src_len >> 5)
//   8(SP)   4 bytes  src_len - 8; searching stops once the next position reaches it
//   12(SP)  4 bytes  index in src of the first byte not yet emitted
//   16(SP)  4 bytes  offset of the most recent match (starts at 1)
//   20(SP)  4 bytes  next search position saved by the search loop
//   24(SP)  1024 B   table of 256 uint32 src indices, addressed by an 8-bit hash
//
// Registers held throughout: AX = dst write pointer, CX = current src index,
// DX = src base pointer (reused as scratch only in the final remainder emit).
TEXT ·encodeBlockAsm8B(SB), $1048-56
	MOVQ dst_base+0(FP), AX

	// Zero the 1024-byte table at 24(SP): 8 iterations of 128 bytes.
	MOVQ $0x00000008, CX
	LEAQ 24(SP), DX
	PXOR X0, X0

zero_loop_encodeBlockAsm8B:
	MOVOU X0, (DX)
	MOVOU X0, 16(DX)
	MOVOU X0, 32(DX)
	MOVOU X0, 48(DX)
	MOVOU X0, 64(DX)
	MOVOU X0, 80(DX)
	MOVOU X0, 96(DX)
	MOVOU X0, 112(DX)
	ADDQ $0x80, DX
	DECQ CX
	JNZ zero_loop_encodeBlockAsm8B

	// Initialise the frame slots described in the header.
	MOVL $0x00000000, 12(SP)
	MOVQ src_len+32(FP), CX
	LEAQ -9(CX), DX
	LEAQ -8(CX), BX
	MOVL BX, 8(SP)
	SHRQ $0x05, CX
	SUBL CX, DX
	LEAQ (AX)(DX*1), DX
	MOVQ DX, (SP)
	MOVL $0x00000001, CX
	MOVL CX, 16(SP)
	MOVQ src_base+24(FP), DX

search_loop_encodeBlockAsm8B:
	// BX = CX + 4 + ((CX - 12(SP)) >> 4): the position to resume from if
	// nothing matches here. The stride grows with the amount of pending input.
	MOVL CX, BX
	SUBL 12(SP), BX
	SHRL $0x04, BX
	LEAL 4(CX)(BX*1), BX
	CMPL BX, 8(SP)
	JGE emit_remainder_encodeBlockAsm8B

	// SI = 8 bytes at src[CX]; remember the resume position in 20(SP).
	MOVQ (DX)(CX*1), SI
	MOVL BX, 20(SP)

	// Hash: move the low 4 bytes into the upper half, multiply by
	// 0x9e3779b1 and keep the top 8 bits. R9 hashes the bytes at CX,
	// R10 the bytes at CX+1.
	MOVQ $0x9e3779b1, R8
	MOVQ SI, R9
	MOVQ SI, R10
	SHRQ $0x08, R10
	SHLQ $0x20, R9
	IMULQ R8, R9
	SHRQ $0x38, R9
	SHLQ $0x20, R10
	IMULQ R8, R10
	SHRQ $0x38, R10

	// Fetch the two candidates (BX, DI), then record CX and CX+1 in their slots.
	MOVL 24(SP)(R9*4), BX
	MOVL 24(SP)(R10*4), DI
	MOVL CX, 24(SP)(R9*4)
	LEAL 1(CX), R9
	MOVL R9, 24(SP)(R10*4)

	// R9 = hash of the bytes at CX+2 (looked up later in no_repeat_found).
	MOVQ SI, R9
	SHRQ $0x10, R9
	SHLQ $0x20, R9
	IMULQ R8, R9
	SHRQ $0x38, R9

	// Repeat check: compare the 4 bytes at CX+1 with the 4 bytes at
	// CX+1-offset, where offset is 16(SP).
	MOVL CX, R8
	SUBL 16(SP), R8
	MOVL 1(DX)(R8*1), R10
	MOVQ SI, R8
	SHRQ $0x08, R8
	CMPL R8, R10
	JNE no_repeat_found_encodeBlockAsm8B

	// Repeat found at CX+1. SI = match start, BX = its source (SI - offset),
	// DI = 12(SP). Extend the match backwards while bytes agree, without
	// passing DI and without BX reaching 0.
	LEAL 1(CX), SI
	MOVL 12(SP), DI
	MOVL SI, BX
	SUBL 16(SP), BX
	JZ repeat_extend_back_end_encodeBlockAsm8B

repeat_extend_back_loop_encodeBlockAsm8B:
	CMPL SI, DI
	JLE repeat_extend_back_end_encodeBlockAsm8B
	MOVB -1(DX)(BX*1), R8
	MOVB -1(DX)(SI*1), R9
	CMPB R8, R9
	JNE repeat_extend_back_end_encodeBlockAsm8B
	LEAL -1(SI), SI
	DECL BX
	JNZ repeat_extend_back_loop_encodeBlockAsm8B

repeat_extend_back_end_encodeBlockAsm8B:
	// Emit src[12(SP):SI] as literals, if any.
	// R9 = source pointer, R8 = length, BX = length-1.
	MOVL 12(SP), BX
	CMPL BX, SI
	JEQ emit_literal_done_repeat_emit_encodeBlockAsm8B
	MOVL SI, R8
	MOVL SI, 12(SP)
	LEAQ (DX)(BX*1), R9
	SUBL BX, R8
	LEAL -1(R8), BX

	// Literal header: length-1 < 60 -> one byte ((length-1) << 2);
	// < 256 -> 0xf0 plus one length byte; otherwise 0xf4 plus two length bytes.
	CMPL BX, $0x3c
	JLT one_byte_repeat_emit_encodeBlockAsm8B
	CMPL BX, $0x00000100
	JLT two_bytes_repeat_emit_encodeBlockAsm8B
	MOVB $0xf4, (AX)
	MOVW BX, 1(AX)
	ADDQ $0x03, AX
	JMP memmove_long_repeat_emit_encodeBlockAsm8B

two_bytes_repeat_emit_encodeBlockAsm8B:
	MOVB $0xf0, (AX)
	MOVB BL, 1(AX)
	ADDQ $0x02, AX

	// The short copy below handles at most 64 bytes.
	CMPL BX, $0x40
	JL memmove_repeat_emit_encodeBlockAsm8B
	JMP memmove_long_repeat_emit_encodeBlockAsm8B

one_byte_repeat_emit_encodeBlockAsm8B:
	SHLB $0x02, BL
	MOVB BL, (AX)
	ADDQ $0x01, AX

memmove_repeat_emit_encodeBlockAsm8B:
	// BX = dst pointer after the literals.
	LEAQ (AX)(R8*1), BX

	// genMemMoveShort
	// Dispatch on length; each case copies with overlapping head/tail moves.
	CMPQ R8, $0x08
	JLE emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_8
	CMPQ R8, $0x10
	JBE emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_8through16
	CMPQ R8, $0x20
	JBE emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_17through32
	JMP emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_33through64

emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_8:
	// Always moves a full 8 bytes; AX is then set to BX, so only the first
	// R8 bytes count as output.
	MOVQ (R9), R10
	MOVQ R10, (AX)
	JMP memmove_end_copy_repeat_emit_encodeBlockAsm8B

emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_8through16:
	MOVQ (R9), R10
	MOVQ -8(R9)(R8*1), R9
	MOVQ R10, (AX)
	MOVQ R9, -8(AX)(R8*1)
	JMP memmove_end_copy_repeat_emit_encodeBlockAsm8B

emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_17through32:
	MOVOU (R9), X0
	MOVOU -16(R9)(R8*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(R8*1)
	JMP memmove_end_copy_repeat_emit_encodeBlockAsm8B

emit_lit_memmove_repeat_emit_encodeBlockAsm8B_memmove_move_33through64:
	MOVOU (R9), X0
	MOVOU 16(R9), X1
	MOVOU -32(R9)(R8*1), X2
	MOVOU -16(R9)(R8*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)

memmove_end_copy_repeat_emit_encodeBlockAsm8B:
	MOVQ BX, AX
	JMP emit_literal_done_repeat_emit_encodeBlockAsm8B

memmove_long_repeat_emit_encodeBlockAsm8B:
	LEAQ (AX)(R8*1), BX

	// genMemMoveLong
	// The first 32 bytes (X0, X1) and last 32 bytes (X2, X3) are loaded up
	// front and stored last with unaligned moves. The loops in between use
	// aligned MOVOA stores: R12 = 64 - (AX & 31), so AX+R12-32 is 32-byte aligned.
	MOVOU (R9), X0
	MOVOU 16(R9), X1
	MOVOU -32(R9)(R8*1), X2
	MOVOU -16(R9)(R8*1), X3
	MOVQ R8, R11
	SHRQ $0x05, R11
	MOVQ AX, R10
	ANDL $0x0000001f, R10
	MOVQ $0x00000040, R12
	SUBQ R10, R12
	DECQ R11
	JA emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_forward_sse_loop_32
	LEAQ -32(R9)(R12*1), R10
	LEAQ -32(AX)(R12*1), R13

emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_big_loop_back:
	MOVOU (R10), X4
	MOVOU 16(R10), X5
	MOVOA X4, (R13)
	MOVOA X5, 16(R13)
	ADDQ $0x20, R13
	ADDQ $0x20, R10
	ADDQ $0x20, R12
	DECQ R11
	JNA emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_big_loop_back

emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_forward_sse_loop_32:
	MOVOU -32(R9)(R12*1), X4
	MOVOU -16(R9)(R12*1), X5
	MOVOA X4, -32(AX)(R12*1)
	MOVOA X5, -16(AX)(R12*1)
	ADDQ $0x20, R12
	CMPQ R8, R12
	JAE emit_lit_memmove_long_repeat_emit_encodeBlockAsm8Blarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)
	MOVQ BX, AX

emit_literal_done_repeat_emit_encodeBlockAsm8B:
	// Skip the 4 bytes already verified at CX+1, then extend the repeat forward.
	// R9 = &src[CX], BX = &src[CX - offset], R8 = bytes left in src.
	ADDL $0x05, CX
	MOVL CX, BX
	SUBL 16(SP), BX
	MOVQ src_len+32(FP), R8
	SUBL CX, R8
	LEAQ (DX)(CX*1), R9
	LEAQ (DX)(BX*1), BX

	// matchLen
	// R11 counts matching bytes. Compare 8 bytes at a time; on a difference
	// the index of the lowest set bit of the XOR, divided by 8, is the number
	// of equal leading bytes.
	XORL R11, R11
	CMPL R8, $0x08
	JL matchlen_match4_repeat_extend_encodeBlockAsm8B

matchlen_loopback_repeat_extend_encodeBlockAsm8B:
	MOVQ (R9)(R11*1), R10
	XORQ (BX)(R11*1), R10
	TESTQ R10, R10
	JZ matchlen_loop_repeat_extend_encodeBlockAsm8B

#ifdef GOAMD64_v3
	TZCNTQ R10, R10

#else
	BSFQ R10, R10

#endif
	SARQ $0x03, R10
	LEAL (R11)(R10*1), R11
	JMP repeat_extend_forward_end_encodeBlockAsm8B

matchlen_loop_repeat_extend_encodeBlockAsm8B:
	LEAL -8(R8), R8
	LEAL 8(R11), R11
	CMPL R8, $0x08
	JGE matchlen_loopback_repeat_extend_encodeBlockAsm8B
	JZ repeat_extend_forward_end_encodeBlockAsm8B

matchlen_match4_repeat_extend_encodeBlockAsm8B:
	// Fewer than 8 bytes left: try 4, then 2, then 1.
	CMPL R8, $0x04
	JL matchlen_match2_repeat_extend_encodeBlockAsm8B
	MOVL (R9)(R11*1), R10
	CMPL (BX)(R11*1), R10
	JNE matchlen_match2_repeat_extend_encodeBlockAsm8B
	SUBL $0x04, R8
	LEAL 4(R11), R11

matchlen_match2_repeat_extend_encodeBlockAsm8B:
	CMPL R8, $0x02
	JL matchlen_match1_repeat_extend_encodeBlockAsm8B
	MOVW (R9)(R11*1), R10
	CMPW (BX)(R11*1), R10
	JNE matchlen_match1_repeat_extend_encodeBlockAsm8B
	SUBL $0x02, R8
	LEAL 2(R11), R11

matchlen_match1_repeat_extend_encodeBlockAsm8B:
	CMPL R8, $0x01
	JL repeat_extend_forward_end_encodeBlockAsm8B
	MOVB (R9)(R11*1), R10
	CMPB (BX)(R11*1), R10
	JNE repeat_extend_forward_end_encodeBlockAsm8B
	LEAL 1(R11), R11

repeat_extend_forward_end_encodeBlockAsm8B:
	// CX = end of match, BX = match length (CX - match start), SI = offset.
	// DI still holds the 12(SP) value loaded before the literals were emitted;
	// when it is zero the match is encoded as a copy with an explicit offset
	// instead of a repeat.
	ADDL R11, CX
	MOVL CX, BX
	SUBL SI, BX
	MOVL 16(SP), SI
	TESTL DI, DI
	JZ repeat_as_copy_encodeBlockAsm8B

	// emitRepeat
	// SI = length, BX = length-4. Length <= 8 uses the 2-byte form.
	MOVL BX, SI
	LEAL -4(BX), BX
	CMPL SI, $0x08
	JLE repeat_two_match_repeat_encodeBlockAsm8B
	CMPL SI, $0x0c

	// Both outcomes of this branch reach the label that follows.
	JGE cant_repeat_two_offset_match_repeat_encodeBlockAsm8B

cant_repeat_two_offset_match_repeat_encodeBlockAsm8B:
	CMPL BX, $0x00000104
	JLT repeat_three_match_repeat_encodeBlockAsm8B

	// 4-byte form: word 0x0019 followed by the 16-bit value length-4-256.
	LEAL -256(BX), BX
	MOVW $0x0019, (AX)
	MOVW BX, 2(AX)
	ADDQ $0x04, AX
	JMP repeat_end_emit_encodeBlockAsm8B

repeat_three_match_repeat_encodeBlockAsm8B:
	// 3-byte form: word 0x0015 followed by the byte length-8.
	LEAL -4(BX), BX
	MOVW $0x0015, (AX)
	MOVB BL, 2(AX)
	ADDQ $0x03, AX
	JMP repeat_end_emit_encodeBlockAsm8B

repeat_two_match_repeat_encodeBlockAsm8B:
	// 2-byte form: the word ((length-4) << 2) | 1.
	SHLL $0x02, BX
	ORL $0x01, BX
	MOVW BX, (AX)
	ADDQ $0x02, AX
	JMP repeat_end_emit_encodeBlockAsm8B

	// NOTE: the instructions up to the next label follow an unconditional
	// JMP and carry no label, so they are never executed.
	XORQ DI, DI
	LEAL 1(DI)(BX*4), BX
	MOVB SI, 1(AX)
	SARL $0x08, SI
	SHLL $0x05, SI
	ORL SI, BX
	MOVB BL, (AX)
	ADDQ $0x02, AX
	JMP repeat_end_emit_encodeBlockAsm8B

repeat_as_copy_encodeBlockAsm8B:
	// emitCopy
	// BX = length, SI = offset. Lengths up to 64 take the short path.
	CMPL BX, $0x40
	JLE two_byte_offset_short_repeat_as_copy_encodeBlockAsm8B
	CMPL SI, $0x00000800
	JAE long_offset_short_repeat_as_copy_encodeBlockAsm8B

	// Offset < 2048: write tag 17 | ((offset >> 8) << 5) and the low offset
	// byte, take 8 off the length (presumably this element encodes an 8-byte
	// copy), then emit the rest as a repeat.
	MOVL $0x00000001, DI
	LEAL 16(DI), DI
	MOVB SI, 1(AX)
	SHRL $0x08, SI
	SHLL $0x05, SI
	ORL SI, DI
	MOVB DI, (AX)
	ADDQ $0x02, AX
	SUBL $0x08, BX

	// emitRepeat
	LEAL -4(BX), BX
	JMP cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b

	// NOTE: unreachable - unlabeled instructions after an unconditional JMP.
	MOVL BX, SI
	LEAL -4(BX), BX
	CMPL SI, $0x08
	JLE repeat_two_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b
	CMPL SI, $0x0c
	JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b

cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b:
	CMPL BX, $0x00000104
	JLT repeat_three_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b
	LEAL -256(BX), BX
	MOVW $0x0019, (AX)
	MOVW BX, 2(AX)
	ADDQ $0x04, AX
	JMP repeat_end_emit_encodeBlockAsm8B

repeat_three_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b:
	LEAL -4(BX), BX
	MOVW $0x0015, (AX)
	MOVB BL, 2(AX)
	ADDQ $0x03, AX
	JMP repeat_end_emit_encodeBlockAsm8B

repeat_two_repeat_as_copy_encodeBlockAsm8B_emit_copy_short_2b:
	SHLL $0x02, BX
	ORL $0x01, BX
	MOVW BX, (AX)
	ADDQ $0x02, AX
	JMP repeat_end_emit_encodeBlockAsm8B

	// NOTE: unreachable - unlabeled instructions after an unconditional JMP.
	XORQ DI, DI
	LEAL 1(DI)(BX*4), BX
	MOVB SI, 1(AX)
	SARL $0x08, SI
	SHLL $0x05, SI
	ORL SI, BX
	MOVB BL, (AX)
	ADDQ $0x02, AX
	JMP repeat_end_emit_encodeBlockAsm8B

long_offset_short_repeat_as_copy_encodeBlockAsm8B:
	// Offset >= 2048: write tag 0xee plus the 16-bit offset, take 60 off the
	// length, then emit the rest as a repeat.
	MOVB $0xee, (AX)
	MOVW SI, 1(AX)
	LEAL -60(BX), BX
	ADDQ $0x03, AX

	// emitRepeat
	MOVL BX, SI
	LEAL -4(BX), BX
	CMPL SI, $0x08
	JLE repeat_two_repeat_as_copy_encodeBlockAsm8B_emit_copy_short
	CMPL SI, $0x0c
	JGE cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short

cant_repeat_two_offset_repeat_as_copy_encodeBlockAsm8B_emit_copy_short:
	CMPL BX, $0x00000104
	JLT repeat_three_repeat_as_copy_encodeBlockAsm8B_emit_copy_short
	LEAL -256(BX), BX
	MOVW $0x0019, (AX)
	MOVW BX, 2(AX)
	ADDQ $0x04, AX
	JMP repeat_end_emit_encodeBlockAsm8B

repeat_three_repeat_as_copy_encodeBlockAsm8B_emit_copy_short:
	LEAL -4(BX), BX
	MOVW $0x0015, (AX)
	MOVB BL, 2(AX)
	ADDQ $0x03, AX
	JMP repeat_end_emit_encodeBlockAsm8B

repeat_two_repeat_as_copy_encodeBlockAsm8B_emit_copy_short:
	SHLL $0x02, BX
	ORL $0x01, BX
	MOVW BX, (AX)
	ADDQ $0x02, AX
	JMP repeat_end_emit_encodeBlockAsm8B

	// NOTE: unreachable - unlabeled instructions after an unconditional JMP.
	XORQ DI, DI
	LEAL 1(DI)(BX*4), BX
	MOVB SI, 1(AX)
	SARL $0x08, SI
	SHLL $0x05, SI
	ORL SI, BX
	MOVB BL, (AX)
	ADDQ $0x02, AX
	JMP repeat_end_emit_encodeBlockAsm8B

two_byte_offset_short_repeat_as_copy_encodeBlockAsm8B:
	// Length <= 64. DI = length << 2. Lengths below 12 use the 2-byte form:
	// tag (length << 2) - 15 with (offset >> 8) << 5 OR-ed in, then the low
	// offset byte.
	MOVL BX, DI
	SHLL $0x02, DI
	CMPL BX, $0x0c
	JGE emit_copy_three_repeat_as_copy_encodeBlockAsm8B
	LEAL -15(DI), DI
	MOVB SI, 1(AX)
	SHRL $0x08, SI
	SHLL $0x05, SI
	ORL SI, DI
	MOVB DI, (AX)
	ADDQ $0x02, AX
	JMP repeat_end_emit_encodeBlockAsm8B

emit_copy_three_repeat_as_copy_encodeBlockAsm8B:
	// 3-byte form: tag (length << 2) - 2 followed by the 16-bit offset.
	LEAL -2(DI), DI
	MOVB DI, (AX)
	MOVW SI, 1(AX)
	ADDQ $0x03, AX

repeat_end_emit_encodeBlockAsm8B:
	// Everything up to CX has been emitted; continue searching from CX.
	MOVL CX, 12(SP)
	JMP search_loop_encodeBlockAsm8B

no_repeat_found_encodeBlockAsm8B:
	// Check the table candidates against the input: BX against the 4 bytes at
	// CX, DI against the 4 bytes at CX+1, and the slot for hash R9 against the
	// 4 bytes at CX+2. That slot is updated to CX+2 (R8) on the way.
	CMPL (DX)(BX*1), SI
	JEQ candidate_match_encodeBlockAsm8B
	SHRQ $0x08, SI
	MOVL 24(SP)(R9*4), BX
	LEAL 2(CX), R8
	CMPL (DX)(DI*1), SI
	JEQ candidate2_match_encodeBlockAsm8B
	MOVL R8, 24(SP)(R9*4)
	SHRQ $0x08, SI
	CMPL (DX)(BX*1), SI
	JEQ candidate3_match_encodeBlockAsm8B

	// No candidate matched: resume at the position saved in 20(SP).
	MOVL 20(SP), CX
	JMP search_loop_encodeBlockAsm8B

candidate3_match_encodeBlockAsm8B:
	ADDL $0x02, CX
	JMP candidate_match_encodeBlockAsm8B

candidate2_match_encodeBlockAsm8B:
	MOVL R8, 24(SP)(R9*4)
	INCL CX
	MOVL DI, BX

candidate_match_encodeBlockAsm8B:
	// CX = match position, BX = candidate position. Extend backwards while the
	// preceding bytes agree, without passing 12(SP) and without BX reaching 0.
	MOVL 12(SP), SI
	TESTL BX, BX
	JZ match_extend_back_end_encodeBlockAsm8B

match_extend_back_loop_encodeBlockAsm8B:
	CMPL CX, SI
	JLE match_extend_back_end_encodeBlockAsm8B
	MOVB -1(DX)(BX*1), DI
	MOVB -1(DX)(CX*1), R8
	CMPB DI, R8
	JNE match_extend_back_end_encodeBlockAsm8B
	LEAL -1(CX), CX
	DECL BX
	JZ match_extend_back_end_encodeBlockAsm8B
	JMP match_extend_back_loop_encodeBlockAsm8B

match_extend_back_end_encodeBlockAsm8B:
	// Output bound check: AX + pending literal length + 3 must stay below the
	// limit at (SP); otherwise return 0.
	MOVL CX, SI
	SUBL 12(SP), SI
	LEAQ 3(AX)(SI*1), SI
	CMPQ SI, (SP)
	JL match_dst_size_check_encodeBlockAsm8B
	MOVQ $0x00000000, ret+48(FP)
	RET

match_dst_size_check_encodeBlockAsm8B:
	// Emit src[12(SP):CX] as literals, if any.
	// SI = source pointer, R8 = length, DI = length-1.
	// The header encoding is the same as in the repeat path.
	MOVL CX, SI
	MOVL 12(SP), DI
	CMPL DI, SI
	JEQ emit_literal_done_match_emit_encodeBlockAsm8B
	MOVL SI, R8
	MOVL SI, 12(SP)
	LEAQ (DX)(DI*1), SI
	SUBL DI, R8
	LEAL -1(R8), DI
	CMPL DI, $0x3c
	JLT one_byte_match_emit_encodeBlockAsm8B
	CMPL DI, $0x00000100
	JLT two_bytes_match_emit_encodeBlockAsm8B
	MOVB $0xf4, (AX)
	MOVW DI, 1(AX)
	ADDQ $0x03, AX
	JMP memmove_long_match_emit_encodeBlockAsm8B

two_bytes_match_emit_encodeBlockAsm8B:
	MOVB $0xf0, (AX)
	MOVB DI, 1(AX)
	ADDQ $0x02, AX
	CMPL DI, $0x40
	JL memmove_match_emit_encodeBlockAsm8B
	JMP memmove_long_match_emit_encodeBlockAsm8B

one_byte_match_emit_encodeBlockAsm8B:
	SHLB $0x02, DI
	MOVB DI, (AX)
	ADDQ $0x01, AX

memmove_match_emit_encodeBlockAsm8B:
	// DI = dst pointer after the literals.
	LEAQ (AX)(R8*1), DI

	// genMemMoveShort
	CMPQ R8, $0x08
	JLE emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_8
	CMPQ R8, $0x10
	JBE emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_8through16
	CMPQ R8, $0x20
	JBE emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_17through32
	JMP emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_33through64

emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_8:
	// Always moves a full 8 bytes; AX is then set to DI.
	MOVQ (SI), R9
	MOVQ R9, (AX)
	JMP memmove_end_copy_match_emit_encodeBlockAsm8B

emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_8through16:
	MOVQ (SI), R9
	MOVQ -8(SI)(R8*1), SI
	MOVQ R9, (AX)
	MOVQ SI, -8(AX)(R8*1)
	JMP memmove_end_copy_match_emit_encodeBlockAsm8B

emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_17through32:
	MOVOU (SI), X0
	MOVOU -16(SI)(R8*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(R8*1)
	JMP memmove_end_copy_match_emit_encodeBlockAsm8B

emit_lit_memmove_match_emit_encodeBlockAsm8B_memmove_move_33through64:
	MOVOU (SI), X0
	MOVOU 16(SI), X1
	MOVOU -32(SI)(R8*1), X2
	MOVOU -16(SI)(R8*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)

memmove_end_copy_match_emit_encodeBlockAsm8B:
	MOVQ DI, AX
	JMP emit_literal_done_match_emit_encodeBlockAsm8B

memmove_long_match_emit_encodeBlockAsm8B:
	LEAQ (AX)(R8*1), DI

	// genMemMoveLong
	// Same scheme as the long copy in the repeat path: unaligned head and
	// tail held in X0-X3, aligned 32-byte stores in between.
	MOVOU (SI), X0
	MOVOU 16(SI), X1
	MOVOU -32(SI)(R8*1), X2
	MOVOU -16(SI)(R8*1), X3
	MOVQ R8, R10
	SHRQ $0x05, R10
	MOVQ AX, R9
	ANDL $0x0000001f, R9
	MOVQ $0x00000040, R11
	SUBQ R9, R11
	DECQ R10
	JA emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_forward_sse_loop_32
	LEAQ -32(SI)(R11*1), R9
	LEAQ -32(AX)(R11*1), R12

emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_big_loop_back:
	MOVOU (R9), X4
	MOVOU 16(R9), X5
	MOVOA X4, (R12)
	MOVOA X5, 16(R12)
	ADDQ $0x20, R12
	ADDQ $0x20, R9
	ADDQ $0x20, R11
	DECQ R10
	JNA emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_big_loop_back

emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_forward_sse_loop_32:
	MOVOU -32(SI)(R11*1), X4
	MOVOU -16(SI)(R11*1), X5
	MOVOA X4, -32(AX)(R11*1)
	MOVOA X5, -16(AX)(R11*1)
	ADDQ $0x20, R11
	CMPQ R8, R11
	JAE emit_lit_memmove_long_match_emit_encodeBlockAsm8Blarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)
	MOVQ DI, AX

emit_literal_done_match_emit_encodeBlockAsm8B:
match_nolit_loop_encodeBlockAsm8B:
	// Record the new offset (CX - BX) in 16(SP), skip the 4 bytes already
	// known to match, and measure how far the match extends.
	// DI = &src[CX], BX = &src[candidate], SI = bytes left in src.
	MOVL CX, SI
	SUBL BX, SI
	MOVL SI, 16(SP)
	ADDL $0x04, CX
	ADDL $0x04, BX
	MOVQ src_len+32(FP), SI
	SUBL CX, SI
	LEAQ (DX)(CX*1), DI
	LEAQ (DX)(BX*1), BX

	// matchLen
	// R9 counts matching bytes; same method as in the repeat path.
	XORL R9, R9
	CMPL SI, $0x08
	JL matchlen_match4_match_nolit_encodeBlockAsm8B

matchlen_loopback_match_nolit_encodeBlockAsm8B:
	MOVQ (DI)(R9*1), R8
	XORQ (BX)(R9*1), R8
	TESTQ R8, R8
	JZ matchlen_loop_match_nolit_encodeBlockAsm8B

#ifdef GOAMD64_v3
	TZCNTQ R8, R8

#else
	BSFQ R8, R8

#endif
	SARQ $0x03, R8
	LEAL (R9)(R8*1), R9
	JMP match_nolit_end_encodeBlockAsm8B

matchlen_loop_match_nolit_encodeBlockAsm8B:
	LEAL -8(SI), SI
	LEAL 8(R9), R9
	CMPL SI, $0x08
	JGE matchlen_loopback_match_nolit_encodeBlockAsm8B
	JZ match_nolit_end_encodeBlockAsm8B

matchlen_match4_match_nolit_encodeBlockAsm8B:
	CMPL SI, $0x04
	JL matchlen_match2_match_nolit_encodeBlockAsm8B
	MOVL (DI)(R9*1), R8
	CMPL (BX)(R9*1), R8
	JNE matchlen_match2_match_nolit_encodeBlockAsm8B
	SUBL $0x04, SI
	LEAL 4(R9), R9

matchlen_match2_match_nolit_encodeBlockAsm8B:
	CMPL SI, $0x02
	JL matchlen_match1_match_nolit_encodeBlockAsm8B
	MOVW (DI)(R9*1), R8
	CMPW (BX)(R9*1), R8
	JNE matchlen_match1_match_nolit_encodeBlockAsm8B
	SUBL $0x02, SI
	LEAL 2(R9), R9

matchlen_match1_match_nolit_encodeBlockAsm8B:
	CMPL SI, $0x01
	JL match_nolit_end_encodeBlockAsm8B
	MOVB (DI)(R9*1), R8
	CMPB (BX)(R9*1), R8
	JNE match_nolit_end_encodeBlockAsm8B
	LEAL 1(R9), R9

match_nolit_end_encodeBlockAsm8B:
	// CX = end of match, BX = offset, R9 = total match length (extension + 4).
	// The matched bytes count as emitted from here on (12(SP) = CX).
	ADDL R9, CX
	MOVL 16(SP), BX
	ADDL $0x04, R9
	MOVL CX, 12(SP)

	// emitCopy
	// Same encoding as repeat_as_copy, with length in R9 and offset in BX.
	CMPL R9, $0x40
	JLE two_byte_offset_short_match_nolit_encodeBlockAsm8B
	CMPL BX, $0x00000800
	JAE long_offset_short_match_nolit_encodeBlockAsm8B
	MOVL $0x00000001, SI
	LEAL 16(SI), SI
	MOVB BL, 1(AX)
	SHRL $0x08, BX
	SHLL $0x05, BX
	ORL BX, SI
	MOVB SI, (AX)
	ADDQ $0x02, AX
	SUBL $0x08, R9

	// emitRepeat
	LEAL -4(R9), R9
	JMP cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short_2b

	// NOTE: unreachable - unlabeled instructions after an unconditional JMP.
	MOVL R9, BX
	LEAL -4(R9), R9
	CMPL BX, $0x08
	JLE repeat_two_match_nolit_encodeBlockAsm8B_emit_copy_short_2b
	CMPL BX, $0x0c
	JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short_2b

cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short_2b:
	CMPL R9, $0x00000104
	JLT repeat_three_match_nolit_encodeBlockAsm8B_emit_copy_short_2b
	LEAL -256(R9), R9
	MOVW $0x0019, (AX)
	MOVW R9, 2(AX)
	ADDQ $0x04, AX
	JMP match_nolit_emitcopy_end_encodeBlockAsm8B

repeat_three_match_nolit_encodeBlockAsm8B_emit_copy_short_2b:
	LEAL -4(R9), R9
	MOVW $0x0015, (AX)
	MOVB R9, 2(AX)
	ADDQ $0x03, AX
	JMP match_nolit_emitcopy_end_encodeBlockAsm8B

repeat_two_match_nolit_encodeBlockAsm8B_emit_copy_short_2b:
	SHLL $0x02, R9
	ORL $0x01, R9
	MOVW R9, (AX)
	ADDQ $0x02, AX
	JMP match_nolit_emitcopy_end_encodeBlockAsm8B

	// NOTE: unreachable - unlabeled instructions after an unconditional JMP.
	XORQ SI, SI
	LEAL 1(SI)(R9*4), R9
	MOVB BL, 1(AX)
	SARL $0x08, BX
	SHLL $0x05, BX
	ORL BX, R9
	MOVB R9, (AX)
	ADDQ $0x02, AX
	JMP match_nolit_emitcopy_end_encodeBlockAsm8B

long_offset_short_match_nolit_encodeBlockAsm8B:
	MOVB $0xee, (AX)
	MOVW BX, 1(AX)
	LEAL -60(R9), R9
	ADDQ $0x03, AX

	// emitRepeat
	MOVL R9, BX
	LEAL -4(R9), R9
	CMPL BX, $0x08
	JLE repeat_two_match_nolit_encodeBlockAsm8B_emit_copy_short
	CMPL BX, $0x0c
	JGE cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short

cant_repeat_two_offset_match_nolit_encodeBlockAsm8B_emit_copy_short:
	CMPL R9, $0x00000104
	JLT repeat_three_match_nolit_encodeBlockAsm8B_emit_copy_short
	LEAL -256(R9), R9
	MOVW $0x0019, (AX)
	MOVW R9, 2(AX)
	ADDQ $0x04, AX
	JMP match_nolit_emitcopy_end_encodeBlockAsm8B

repeat_three_match_nolit_encodeBlockAsm8B_emit_copy_short:
	LEAL -4(R9), R9
	MOVW $0x0015, (AX)
	MOVB R9, 2(AX)
	ADDQ $0x03, AX
	JMP match_nolit_emitcopy_end_encodeBlockAsm8B

repeat_two_match_nolit_encodeBlockAsm8B_emit_copy_short:
	SHLL $0x02, R9
	ORL $0x01, R9
	MOVW R9, (AX)
	ADDQ $0x02, AX
	JMP match_nolit_emitcopy_end_encodeBlockAsm8B

	// NOTE: unreachable - unlabeled instructions after an unconditional JMP.
	XORQ SI, SI
	LEAL 1(SI)(R9*4), R9
	MOVB BL, 1(AX)
	SARL $0x08, BX
	SHLL $0x05, BX
	ORL BX, R9
	MOVB R9, (AX)
	ADDQ $0x02, AX
	JMP match_nolit_emitcopy_end_encodeBlockAsm8B

two_byte_offset_short_match_nolit_encodeBlockAsm8B:
	MOVL R9, SI
	SHLL $0x02, SI
	CMPL R9, $0x0c
	JGE emit_copy_three_match_nolit_encodeBlockAsm8B
	LEAL -15(SI), SI
	MOVB BL, 1(AX)
	SHRL $0x08, BX
	SHLL $0x05, BX
	ORL BX, SI
	MOVB SI, (AX)
	ADDQ $0x02, AX
	JMP match_nolit_emitcopy_end_encodeBlockAsm8B

emit_copy_three_match_nolit_encodeBlockAsm8B:
	LEAL -2(SI), SI
	MOVB SI, (AX)
	MOVW BX, 1(AX)
	ADDQ $0x03, AX

match_nolit_emitcopy_end_encodeBlockAsm8B:
	// Stop searching near the end of src. Otherwise load 8 bytes at CX-2 and
	// re-check the output limit; return 0 if AX has reached it.
	CMPL CX, 8(SP)
	JGE emit_remainder_encodeBlockAsm8B
	MOVQ -2(DX)(CX*1), SI
	CMPQ AX, (SP)
	JL match_nolit_dst_ok_encodeBlockAsm8B
	MOVQ $0x00000000, ret+48(FP)
	RET

match_nolit_dst_ok_encodeBlockAsm8B:
	// Hash the bytes at CX-2 (DI) and at CX (BX). Store CX-2 and CX in their
	// table slots, taking the previous occupant of the CX slot as the next
	// candidate. If its 4 bytes equal those at CX, emit another match right
	// away with no literals in between.
	MOVQ $0x9e3779b1, R8
	MOVQ SI, DI
	SHRQ $0x10, SI
	MOVQ SI, BX
	SHLQ $0x20, DI
	IMULQ R8, DI
	SHRQ $0x38, DI
	SHLQ $0x20, BX
	IMULQ R8, BX
	SHRQ $0x38, BX
	LEAL -2(CX), R8
	LEAQ 24(SP)(BX*4), R9
	MOVL (R9), BX
	MOVL R8, 24(SP)(DI*4)
	MOVL CX, (R9)
	CMPL (DX)(BX*1), SI
	JEQ match_nolit_loop_encodeBlockAsm8B
	INCL CX
	JMP search_loop_encodeBlockAsm8B

emit_remainder_encodeBlockAsm8B:
	// Bound check for the trailing literals:
	// AX + (src_len - 12(SP)) + 3 must stay below the limit at (SP).
	MOVQ src_len+32(FP), CX
	SUBL 12(SP), CX
	LEAQ 3(AX)(CX*1), CX
	CMPQ CX, (SP)
	JL emit_remainder_ok_encodeBlockAsm8B
	MOVQ $0x00000000, ret+48(FP)
	RET

emit_remainder_ok_encodeBlockAsm8B:
	// Emit src[12(SP):src_len] as literals, if any.
	// CX = source pointer, SI = length, DX = length-1 (src base is no longer
	// needed, so DX is reused).
	MOVQ src_len+32(FP), CX
	MOVL 12(SP), BX
	CMPL BX, CX
	JEQ emit_literal_done_emit_remainder_encodeBlockAsm8B
	MOVL CX, SI
	MOVL CX, 12(SP)
	LEAQ (DX)(BX*1), CX
	SUBL BX, SI
	LEAL -1(SI), DX
	CMPL DX, $0x3c
	JLT one_byte_emit_remainder_encodeBlockAsm8B
	CMPL DX, $0x00000100
	JLT two_bytes_emit_remainder_encodeBlockAsm8B
	MOVB $0xf4, (AX)
	MOVW DX, 1(AX)
	ADDQ $0x03, AX
	JMP memmove_long_emit_remainder_encodeBlockAsm8B

two_bytes_emit_remainder_encodeBlockAsm8B:
	MOVB $0xf0, (AX)
	MOVB DL, 1(AX)
	ADDQ $0x02, AX
	CMPL DX, $0x40
	JL memmove_emit_remainder_encodeBlockAsm8B
	JMP memmove_long_emit_remainder_encodeBlockAsm8B

one_byte_emit_remainder_encodeBlockAsm8B:
	SHLB $0x02, DL
	MOVB DL, (AX)
	ADDQ $0x01, AX

memmove_emit_remainder_encodeBlockAsm8B:
	// DX = dst pointer after the literals, BX = length.
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveShort
	// Unlike the copies above, this one has exact cases for lengths below 8
	// (1-2, 3, 4-7) and never moves more than BX bytes.
	CMPQ BX, $0x03
	JB emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_1or2
	JE emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_3
	CMPQ BX, $0x08
	JB emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_4through7
	CMPQ BX, $0x10
	JBE emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_8through16
	CMPQ BX, $0x20
	JBE emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_17through32
	JMP emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_33through64

emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_1or2:
	MOVB (CX), SI
	MOVB -1(CX)(BX*1), CL
	MOVB SI, (AX)
	MOVB CL, -1(AX)(BX*1)
	JMP memmove_end_copy_emit_remainder_encodeBlockAsm8B

emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_3:
	MOVW (CX), SI
	MOVB 2(CX), CL
	MOVW SI, (AX)
	MOVB CL, 2(AX)
	JMP memmove_end_copy_emit_remainder_encodeBlockAsm8B

emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_4through7:
	MOVL (CX), SI
	MOVL -4(CX)(BX*1), CX
	MOVL SI, (AX)
	MOVL CX, -4(AX)(BX*1)
	JMP memmove_end_copy_emit_remainder_encodeBlockAsm8B

emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_8through16:
	MOVQ (CX), SI
	MOVQ -8(CX)(BX*1), CX
	MOVQ SI, (AX)
	MOVQ CX, -8(AX)(BX*1)
	JMP memmove_end_copy_emit_remainder_encodeBlockAsm8B

emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_17through32:
	MOVOU (CX), X0
	MOVOU -16(CX)(BX*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(BX*1)
	JMP memmove_end_copy_emit_remainder_encodeBlockAsm8B

emit_lit_memmove_emit_remainder_encodeBlockAsm8B_memmove_move_33through64:
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)

memmove_end_copy_emit_remainder_encodeBlockAsm8B:
	MOVQ DX, AX
	JMP emit_literal_done_emit_remainder_encodeBlockAsm8B

memmove_long_emit_remainder_encodeBlockAsm8B:
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveLong
	// Unaligned head and tail held in X0-X3, aligned 32-byte stores in between.
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	MOVQ BX, DI
	SHRQ $0x05, DI
	MOVQ AX, SI
	ANDL $0x0000001f, SI
	MOVQ $0x00000040, R8
	SUBQ SI, R8
	DECQ DI
	JA emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_forward_sse_loop_32
	LEAQ -32(CX)(R8*1), SI
	LEAQ -32(AX)(R8*1), R9

emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_big_loop_back:
	MOVOU (SI), X4
	MOVOU 16(SI), X5
	MOVOA X4, (R9)
	MOVOA X5, 16(R9)
	ADDQ $0x20, R9
	ADDQ $0x20, SI
	ADDQ $0x20, R8
	DECQ DI
	JNA emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_big_loop_back

emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_forward_sse_loop_32:
	MOVOU -32(CX)(R8*1), X4
	MOVOU -16(CX)(R8*1), X5
	MOVOA X4, -32(AX)(R8*1)
	MOVOA X5, -16(AX)(R8*1)
	ADDQ $0x20, R8
	CMPQ BX, R8
	JAE emit_lit_memmove_long_emit_remainder_encodeBlockAsm8Blarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)
	MOVQ DX, AX

emit_literal_done_emit_remainder_encodeBlockAsm8B:
	// Return the number of bytes written: AX - dst_base.
	MOVQ dst_base+0(FP), CX
	SUBQ CX, AX
	MOVQ AX, ret+48(FP)
	RET

// func encodeBetterBlockAsm(dst []byte, src []byte) int
// Requires: BMI, SSE2
TEXT ·encodeBetterBlockAsm(SB), $589848-56
	MOVQ dst_base+0(FP), AX
	MOVQ $0x00001200, CX
	LEAQ 24(SP), DX
	PXOR X0, X0

zero_loop_encodeBetterBlockAsm:
	MOVOU X0, (DX)
	MOVOU X0, 16(DX)
	MOVOU X0, 32(DX)
	MOVOU X0, 48(DX)
	MOVOU X0, 64(DX)
	MOVOU X0, 80(DX)
	MOVOU X0, 96(DX)
	MOVOU X0, 112(DX)
	ADDQ $0x80, DX
	DECQ CX
	JNZ zero_loop_encodeBetterBlockAsm
	MOVL $0x00000000, 12(SP)
	MOVQ src_len+32(FP), CX
	LEAQ -6(CX), DX
	LEAQ -8(CX), BX
	MOVL BX, 8(SP)
	SHRQ $0x05, CX
	SUBL CX, DX
	LEAQ (AX)(DX*1), DX
	MOVQ DX, (SP)
	MOVL $0x00000001, CX
	MOVL $0x00000000, 16(SP)
	MOVQ src_base+24(FP), DX

search_loop_encodeBetterBlockAsm:
	MOVL CX, BX
	SUBL 12(SP), BX
	SHRL $0x07, BX
	CMPL BX, $0x63
	JLE check_maxskip_ok_encodeBetterBlockAsm
	LEAL 100(CX), BX
	JMP check_maxskip_cont_encodeBetterBlockAsm

check_maxskip_ok_encodeBetterBlockAsm:
	LEAL 1(CX)(BX*1), BX

check_maxskip_cont_encodeBetterBlockAsm:
	CMPL BX, 8(SP)
	JGE emit_remainder_encodeBetterBlockAsm
	MOVQ (DX)(CX*1), SI
	MOVL
BX, 20(SP) MOVQ $0x00cf1bbcdcbfa563, R8 MOVQ $0x9e3779b1, BX MOVQ SI, R9 MOVQ SI, R10 SHLQ $0x08, R9 IMULQ R8, R9 SHRQ $0x2f, R9 SHLQ $0x20, R10 IMULQ BX, R10 SHRQ $0x32, R10 MOVL 24(SP)(R9*4), BX MOVL 524312(SP)(R10*4), DI MOVL CX, 24(SP)(R9*4) MOVL CX, 524312(SP)(R10*4) MOVQ (DX)(BX*1), R9 MOVQ (DX)(DI*1), R10 CMPQ R9, SI JEQ candidate_match_encodeBetterBlockAsm CMPQ R10, SI JNE no_short_found_encodeBetterBlockAsm MOVL DI, BX JMP candidate_match_encodeBetterBlockAsm no_short_found_encodeBetterBlockAsm: CMPL R9, SI JEQ candidate_match_encodeBetterBlockAsm CMPL R10, SI JEQ candidateS_match_encodeBetterBlockAsm MOVL 20(SP), CX JMP search_loop_encodeBetterBlockAsm candidateS_match_encodeBetterBlockAsm: SHRQ $0x08, SI MOVQ SI, R9 SHLQ $0x08, R9 IMULQ R8, R9 SHRQ $0x2f, R9 MOVL 24(SP)(R9*4), BX INCL CX MOVL CX, 24(SP)(R9*4) CMPL (DX)(BX*1), SI JEQ candidate_match_encodeBetterBlockAsm DECL CX MOVL DI, BX candidate_match_encodeBetterBlockAsm: MOVL 12(SP), SI TESTL BX, BX JZ match_extend_back_end_encodeBetterBlockAsm match_extend_back_loop_encodeBetterBlockAsm: CMPL CX, SI JLE match_extend_back_end_encodeBetterBlockAsm MOVB -1(DX)(BX*1), DI MOVB -1(DX)(CX*1), R8 CMPB DI, R8 JNE match_extend_back_end_encodeBetterBlockAsm LEAL -1(CX), CX DECL BX JZ match_extend_back_end_encodeBetterBlockAsm JMP match_extend_back_loop_encodeBetterBlockAsm match_extend_back_end_encodeBetterBlockAsm: MOVL CX, SI SUBL 12(SP), SI LEAQ 5(AX)(SI*1), SI CMPQ SI, (SP) JL match_dst_size_check_encodeBetterBlockAsm MOVQ $0x00000000, ret+48(FP) RET match_dst_size_check_encodeBetterBlockAsm: MOVL CX, SI ADDL $0x04, CX ADDL $0x04, BX MOVQ src_len+32(FP), DI SUBL CX, DI LEAQ (DX)(CX*1), R8 LEAQ (DX)(BX*1), R9 // matchLen XORL R11, R11 CMPL DI, $0x08 JL matchlen_match4_match_nolit_encodeBetterBlockAsm matchlen_loopback_match_nolit_encodeBetterBlockAsm: MOVQ (R8)(R11*1), R10 XORQ (R9)(R11*1), R10 TESTQ R10, R10 JZ matchlen_loop_match_nolit_encodeBetterBlockAsm #ifdef GOAMD64_v3 TZCNTQ R10, R10 #else BSFQ 
R10, R10 #endif SARQ $0x03, R10 LEAL (R11)(R10*1), R11 JMP match_nolit_end_encodeBetterBlockAsm matchlen_loop_match_nolit_encodeBetterBlockAsm: LEAL -8(DI), DI LEAL 8(R11), R11 CMPL DI, $0x08 JGE matchlen_loopback_match_nolit_encodeBetterBlockAsm JZ match_nolit_end_encodeBetterBlockAsm matchlen_match4_match_nolit_encodeBetterBlockAsm: CMPL DI, $0x04 JL matchlen_match2_match_nolit_encodeBetterBlockAsm MOVL (R8)(R11*1), R10 CMPL (R9)(R11*1), R10 JNE matchlen_match2_match_nolit_encodeBetterBlockAsm SUBL $0x04, DI LEAL 4(R11), R11 matchlen_match2_match_nolit_encodeBetterBlockAsm: CMPL DI, $0x02 JL matchlen_match1_match_nolit_encodeBetterBlockAsm MOVW (R8)(R11*1), R10 CMPW (R9)(R11*1), R10 JNE matchlen_match1_match_nolit_encodeBetterBlockAsm SUBL $0x02, DI LEAL 2(R11), R11 matchlen_match1_match_nolit_encodeBetterBlockAsm: CMPL DI, $0x01 JL match_nolit_end_encodeBetterBlockAsm MOVB (R8)(R11*1), R10 CMPB (R9)(R11*1), R10 JNE match_nolit_end_encodeBetterBlockAsm LEAL 1(R11), R11 match_nolit_end_encodeBetterBlockAsm: MOVL CX, DI SUBL BX, DI // Check if repeat CMPL 16(SP), DI JEQ match_is_repeat_encodeBetterBlockAsm CMPL R11, $0x01 JG match_length_ok_encodeBetterBlockAsm CMPL DI, $0x0000ffff JLE match_length_ok_encodeBetterBlockAsm MOVL 20(SP), CX INCL CX JMP search_loop_encodeBetterBlockAsm match_length_ok_encodeBetterBlockAsm: MOVL DI, 16(SP) MOVL 12(SP), BX CMPL BX, SI JEQ emit_literal_done_match_emit_encodeBetterBlockAsm MOVL SI, R8 MOVL SI, 12(SP) LEAQ (DX)(BX*1), R9 SUBL BX, R8 LEAL -1(R8), BX CMPL BX, $0x3c JLT one_byte_match_emit_encodeBetterBlockAsm CMPL BX, $0x00000100 JLT two_bytes_match_emit_encodeBetterBlockAsm CMPL BX, $0x00010000 JLT three_bytes_match_emit_encodeBetterBlockAsm CMPL BX, $0x01000000 JLT four_bytes_match_emit_encodeBetterBlockAsm MOVB $0xfc, (AX) MOVL BX, 1(AX) ADDQ $0x05, AX JMP memmove_long_match_emit_encodeBetterBlockAsm four_bytes_match_emit_encodeBetterBlockAsm: MOVL BX, R10 SHRL $0x10, R10 MOVB $0xf8, (AX) MOVW BX, 1(AX) MOVB R10, 3(AX) 
ADDQ $0x04, AX JMP memmove_long_match_emit_encodeBetterBlockAsm three_bytes_match_emit_encodeBetterBlockAsm: MOVB $0xf4, (AX) MOVW BX, 1(AX) ADDQ $0x03, AX JMP memmove_long_match_emit_encodeBetterBlockAsm two_bytes_match_emit_encodeBetterBlockAsm: MOVB $0xf0, (AX) MOVB BL, 1(AX) ADDQ $0x02, AX CMPL BX, $0x40 JL memmove_match_emit_encodeBetterBlockAsm JMP memmove_long_match_emit_encodeBetterBlockAsm one_byte_match_emit_encodeBetterBlockAsm: SHLB $0x02, BL MOVB BL, (AX) ADDQ $0x01, AX memmove_match_emit_encodeBetterBlockAsm: LEAQ (AX)(R8*1), BX // genMemMoveShort CMPQ R8, $0x04 JLE emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4 CMPQ R8, $0x08 JB emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4through7 CMPQ R8, $0x10 JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_8through16 CMPQ R8, $0x20 JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_17through32 JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_33through64 emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4: MOVL (R9), R10 MOVL R10, (AX) JMP memmove_end_copy_match_emit_encodeBetterBlockAsm emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_4through7: MOVL (R9), R10 MOVL -4(R9)(R8*1), R9 MOVL R10, (AX) MOVL R9, -4(AX)(R8*1) JMP memmove_end_copy_match_emit_encodeBetterBlockAsm emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_8through16: MOVQ (R9), R10 MOVQ -8(R9)(R8*1), R9 MOVQ R10, (AX) MOVQ R9, -8(AX)(R8*1) JMP memmove_end_copy_match_emit_encodeBetterBlockAsm emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_17through32: MOVOU (R9), X0 MOVOU -16(R9)(R8*1), X1 MOVOU X0, (AX) MOVOU X1, -16(AX)(R8*1) JMP memmove_end_copy_match_emit_encodeBetterBlockAsm emit_lit_memmove_match_emit_encodeBetterBlockAsm_memmove_move_33through64: MOVOU (R9), X0 MOVOU 16(R9), X1 MOVOU -32(R9)(R8*1), X2 MOVOU -16(R9)(R8*1), X3 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(R8*1) MOVOU X3, -16(AX)(R8*1) 
memmove_end_copy_match_emit_encodeBetterBlockAsm: MOVQ BX, AX JMP emit_literal_done_match_emit_encodeBetterBlockAsm memmove_long_match_emit_encodeBetterBlockAsm: LEAQ (AX)(R8*1), BX // genMemMoveLong MOVOU (R9), X0 MOVOU 16(R9), X1 MOVOU -32(R9)(R8*1), X2 MOVOU -16(R9)(R8*1), X3 MOVQ R8, R12 SHRQ $0x05, R12 MOVQ AX, R10 ANDL $0x0000001f, R10 MOVQ $0x00000040, R13 SUBQ R10, R13 DECQ R12 JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32 LEAQ -32(R9)(R13*1), R10 LEAQ -32(AX)(R13*1), R14 emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_big_loop_back: MOVOU (R10), X4 MOVOU 16(R10), X5 MOVOA X4, (R14) MOVOA X5, 16(R14) ADDQ $0x20, R14 ADDQ $0x20, R10 ADDQ $0x20, R13 DECQ R12 JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_big_loop_back emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32: MOVOU -32(R9)(R13*1), X4 MOVOU -16(R9)(R13*1), X5 MOVOA X4, -32(AX)(R13*1) MOVOA X5, -16(AX)(R13*1) ADDQ $0x20, R13 CMPQ R8, R13 JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsmlarge_forward_sse_loop_32 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(R8*1) MOVOU X3, -16(AX)(R8*1) MOVQ BX, AX emit_literal_done_match_emit_encodeBetterBlockAsm: ADDL R11, CX ADDL $0x04, R11 MOVL CX, 12(SP) // emitCopy CMPL DI, $0x00010000 JL two_byte_offset_match_nolit_encodeBetterBlockAsm CMPL R11, $0x40 JLE four_bytes_remain_match_nolit_encodeBetterBlockAsm MOVB $0xff, (AX) MOVL DI, 1(AX) LEAL -64(R11), R11 ADDQ $0x05, AX CMPL R11, $0x04 JL four_bytes_remain_match_nolit_encodeBetterBlockAsm // emitRepeat emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy: MOVL R11, BX LEAL -4(R11), R11 CMPL BX, $0x08 JLE repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy CMPL BX, $0x0c JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy CMPL DI, $0x00000800 JLT repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy: CMPL R11, 
$0x00000104 JLT repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy CMPL R11, $0x00010100 JLT repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy CMPL R11, $0x0100ffff JLT repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy LEAL -16842747(R11), R11 MOVL $0xfffb001d, (AX) MOVB $0xff, 4(AX) ADDQ $0x05, AX JMP emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy: LEAL -65536(R11), R11 MOVL R11, DI MOVW $0x001d, (AX) MOVW R11, 2(AX) SARL $0x10, DI MOVB DI, 4(AX) ADDQ $0x05, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy: LEAL -256(R11), R11 MOVW $0x0019, (AX) MOVW R11, 2(AX) ADDQ $0x04, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy: LEAL -4(R11), R11 MOVW $0x0015, (AX) MOVB R11, 2(AX) ADDQ $0x03, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy: SHLL $0x02, R11 ORL $0x01, R11 MOVW R11, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy: XORQ BX, BX LEAL 1(BX)(R11*4), R11 MOVB DI, 1(AX) SARL $0x08, DI SHLL $0x05, DI ORL DI, R11 MOVB R11, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm four_bytes_remain_match_nolit_encodeBetterBlockAsm: TESTL R11, R11 JZ match_nolit_emitcopy_end_encodeBetterBlockAsm XORL BX, BX LEAL -1(BX)(R11*4), R11 MOVB R11, (AX) MOVL DI, 1(AX) ADDQ $0x05, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm two_byte_offset_match_nolit_encodeBetterBlockAsm: CMPL R11, $0x40 JLE two_byte_offset_short_match_nolit_encodeBetterBlockAsm CMPL DI, $0x00000800 JAE long_offset_short_match_nolit_encodeBetterBlockAsm MOVL $0x00000001, BX LEAL 16(BX), BX MOVB DI, 1(AX) MOVL DI, R8 SHRL $0x08, R8 SHLL $0x05, R8 ORL R8, BX MOVB BL, (AX) ADDQ $0x02, AX SUBL $0x08, R11 // emitRepeat LEAL -4(R11), R11 JMP 
cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b: MOVL R11, BX LEAL -4(R11), R11 CMPL BX, $0x08 JLE repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b CMPL BX, $0x0c JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b CMPL DI, $0x00000800 JLT repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b: CMPL R11, $0x00000104 JLT repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b CMPL R11, $0x00010100 JLT repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b CMPL R11, $0x0100ffff JLT repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b LEAL -16842747(R11), R11 MOVL $0xfffb001d, (AX) MOVB $0xff, 4(AX) ADDQ $0x05, AX JMP emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b: LEAL -65536(R11), R11 MOVL R11, DI MOVW $0x001d, (AX) MOVW R11, 2(AX) SARL $0x10, DI MOVB DI, 4(AX) ADDQ $0x05, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b: LEAL -256(R11), R11 MOVW $0x0019, (AX) MOVW R11, 2(AX) ADDQ $0x04, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b: LEAL -4(R11), R11 MOVW $0x0015, (AX) MOVB R11, 2(AX) ADDQ $0x03, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b: SHLL $0x02, R11 ORL $0x01, R11 MOVW R11, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short_2b: XORQ BX, BX LEAL 1(BX)(R11*4), R11 MOVB DI, 1(AX) SARL $0x08, DI SHLL $0x05, DI ORL DI, R11 MOVB R11, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm 
long_offset_short_match_nolit_encodeBetterBlockAsm: MOVB $0xee, (AX) MOVW DI, 1(AX) LEAL -60(R11), R11 ADDQ $0x03, AX // emitRepeat emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy_short: MOVL R11, BX LEAL -4(R11), R11 CMPL BX, $0x08 JLE repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy_short CMPL BX, $0x0c JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short CMPL DI, $0x00000800 JLT repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short: CMPL R11, $0x00000104 JLT repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy_short CMPL R11, $0x00010100 JLT repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy_short CMPL R11, $0x0100ffff JLT repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy_short LEAL -16842747(R11), R11 MOVL $0xfffb001d, (AX) MOVB $0xff, 4(AX) ADDQ $0x05, AX JMP emit_repeat_again_match_nolit_encodeBetterBlockAsm_emit_copy_short repeat_five_match_nolit_encodeBetterBlockAsm_emit_copy_short: LEAL -65536(R11), R11 MOVL R11, DI MOVW $0x001d, (AX) MOVW R11, 2(AX) SARL $0x10, DI MOVB DI, 4(AX) ADDQ $0x05, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm repeat_four_match_nolit_encodeBetterBlockAsm_emit_copy_short: LEAL -256(R11), R11 MOVW $0x0019, (AX) MOVW R11, 2(AX) ADDQ $0x04, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm repeat_three_match_nolit_encodeBetterBlockAsm_emit_copy_short: LEAL -4(R11), R11 MOVW $0x0015, (AX) MOVB R11, 2(AX) ADDQ $0x03, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm repeat_two_match_nolit_encodeBetterBlockAsm_emit_copy_short: SHLL $0x02, R11 ORL $0x01, R11 MOVW R11, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm repeat_two_offset_match_nolit_encodeBetterBlockAsm_emit_copy_short: XORQ BX, BX LEAL 1(BX)(R11*4), R11 MOVB DI, 1(AX) SARL $0x08, DI SHLL $0x05, DI ORL DI, R11 MOVB R11, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm 
two_byte_offset_short_match_nolit_encodeBetterBlockAsm: MOVL R11, BX SHLL $0x02, BX CMPL R11, $0x0c JGE emit_copy_three_match_nolit_encodeBetterBlockAsm CMPL DI, $0x00000800 JGE emit_copy_three_match_nolit_encodeBetterBlockAsm LEAL -15(BX), BX MOVB DI, 1(AX) SHRL $0x08, DI SHLL $0x05, DI ORL DI, BX MOVB BL, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm emit_copy_three_match_nolit_encodeBetterBlockAsm: LEAL -2(BX), BX MOVB BL, (AX) MOVW DI, 1(AX) ADDQ $0x03, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm match_is_repeat_encodeBetterBlockAsm: MOVL 12(SP), BX CMPL BX, SI JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm MOVL SI, R8 MOVL SI, 12(SP) LEAQ (DX)(BX*1), R9 SUBL BX, R8 LEAL -1(R8), BX CMPL BX, $0x3c JLT one_byte_match_emit_repeat_encodeBetterBlockAsm CMPL BX, $0x00000100 JLT two_bytes_match_emit_repeat_encodeBetterBlockAsm CMPL BX, $0x00010000 JLT three_bytes_match_emit_repeat_encodeBetterBlockAsm CMPL BX, $0x01000000 JLT four_bytes_match_emit_repeat_encodeBetterBlockAsm MOVB $0xfc, (AX) MOVL BX, 1(AX) ADDQ $0x05, AX JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm four_bytes_match_emit_repeat_encodeBetterBlockAsm: MOVL BX, R10 SHRL $0x10, R10 MOVB $0xf8, (AX) MOVW BX, 1(AX) MOVB R10, 3(AX) ADDQ $0x04, AX JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm three_bytes_match_emit_repeat_encodeBetterBlockAsm: MOVB $0xf4, (AX) MOVW BX, 1(AX) ADDQ $0x03, AX JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm two_bytes_match_emit_repeat_encodeBetterBlockAsm: MOVB $0xf0, (AX) MOVB BL, 1(AX) ADDQ $0x02, AX CMPL BX, $0x40 JL memmove_match_emit_repeat_encodeBetterBlockAsm JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm one_byte_match_emit_repeat_encodeBetterBlockAsm: SHLB $0x02, BL MOVB BL, (AX) ADDQ $0x01, AX memmove_match_emit_repeat_encodeBetterBlockAsm: LEAQ (AX)(R8*1), BX // genMemMoveShort CMPQ R8, $0x04 JLE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_4 CMPQ R8, $0x08 JB 
emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_4through7 CMPQ R8, $0x10 JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_8through16 CMPQ R8, $0x20 JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_17through32 JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_33through64 emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_4: MOVL (R9), R10 MOVL R10, (AX) JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_4through7: MOVL (R9), R10 MOVL -4(R9)(R8*1), R9 MOVL R10, (AX) MOVL R9, -4(AX)(R8*1) JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_8through16: MOVQ (R9), R10 MOVQ -8(R9)(R8*1), R9 MOVQ R10, (AX) MOVQ R9, -8(AX)(R8*1) JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_17through32: MOVOU (R9), X0 MOVOU -16(R9)(R8*1), X1 MOVOU X0, (AX) MOVOU X1, -16(AX)(R8*1) JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm_memmove_move_33through64: MOVOU (R9), X0 MOVOU 16(R9), X1 MOVOU -32(R9)(R8*1), X2 MOVOU -16(R9)(R8*1), X3 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(R8*1) MOVOU X3, -16(AX)(R8*1) memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm: MOVQ BX, AX JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm memmove_long_match_emit_repeat_encodeBetterBlockAsm: LEAQ (AX)(R8*1), BX // genMemMoveLong MOVOU (R9), X0 MOVOU 16(R9), X1 MOVOU -32(R9)(R8*1), X2 MOVOU -16(R9)(R8*1), X3 MOVQ R8, R12 SHRQ $0x05, R12 MOVQ AX, R10 ANDL $0x0000001f, R10 MOVQ $0x00000040, R13 SUBQ R10, R13 DECQ R12 JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_forward_sse_loop_32 LEAQ -32(R9)(R13*1), R10 LEAQ -32(AX)(R13*1), R14 
emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_big_loop_back: MOVOU (R10), X4 MOVOU 16(R10), X5 MOVOA X4, (R14) MOVOA X5, 16(R14) ADDQ $0x20, R14 ADDQ $0x20, R10 ADDQ $0x20, R13 DECQ R12 JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_big_loop_back emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_forward_sse_loop_32: MOVOU -32(R9)(R13*1), X4 MOVOU -16(R9)(R13*1), X5 MOVOA X4, -32(AX)(R13*1) MOVOA X5, -16(AX)(R13*1) ADDQ $0x20, R13 CMPQ R8, R13 JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsmlarge_forward_sse_loop_32 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(R8*1) MOVOU X3, -16(AX)(R8*1) MOVQ BX, AX emit_literal_done_match_emit_repeat_encodeBetterBlockAsm: ADDL R11, CX ADDL $0x04, R11 MOVL CX, 12(SP) // emitRepeat emit_repeat_again_match_nolit_repeat_encodeBetterBlockAsm: MOVL R11, BX LEAL -4(R11), R11 CMPL BX, $0x08 JLE repeat_two_match_nolit_repeat_encodeBetterBlockAsm CMPL BX, $0x0c JGE cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm CMPL DI, $0x00000800 JLT repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm: CMPL R11, $0x00000104 JLT repeat_three_match_nolit_repeat_encodeBetterBlockAsm CMPL R11, $0x00010100 JLT repeat_four_match_nolit_repeat_encodeBetterBlockAsm CMPL R11, $0x0100ffff JLT repeat_five_match_nolit_repeat_encodeBetterBlockAsm LEAL -16842747(R11), R11 MOVL $0xfffb001d, (AX) MOVB $0xff, 4(AX) ADDQ $0x05, AX JMP emit_repeat_again_match_nolit_repeat_encodeBetterBlockAsm repeat_five_match_nolit_repeat_encodeBetterBlockAsm: LEAL -65536(R11), R11 MOVL R11, DI MOVW $0x001d, (AX) MOVW R11, 2(AX) SARL $0x10, DI MOVB DI, 4(AX) ADDQ $0x05, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm repeat_four_match_nolit_repeat_encodeBetterBlockAsm: LEAL -256(R11), R11 MOVW $0x0019, (AX) MOVW R11, 2(AX) ADDQ $0x04, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm 
repeat_three_match_nolit_repeat_encodeBetterBlockAsm: LEAL -4(R11), R11 MOVW $0x0015, (AX) MOVB R11, 2(AX) ADDQ $0x03, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm repeat_two_match_nolit_repeat_encodeBetterBlockAsm: SHLL $0x02, R11 ORL $0x01, R11 MOVW R11, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm: XORQ BX, BX LEAL 1(BX)(R11*4), R11 MOVB DI, 1(AX) SARL $0x08, DI SHLL $0x05, DI ORL DI, R11 MOVB R11, (AX) ADDQ $0x02, AX match_nolit_emitcopy_end_encodeBetterBlockAsm: CMPL CX, 8(SP) JGE emit_remainder_encodeBetterBlockAsm CMPQ AX, (SP) JL match_nolit_dst_ok_encodeBetterBlockAsm MOVQ $0x00000000, ret+48(FP) RET match_nolit_dst_ok_encodeBetterBlockAsm: MOVQ $0x00cf1bbcdcbfa563, BX MOVQ $0x9e3779b1, DI LEAQ 1(SI), SI LEAQ -2(CX), R8 MOVQ (DX)(SI*1), R9 MOVQ 1(DX)(SI*1), R10 MOVQ (DX)(R8*1), R11 MOVQ 1(DX)(R8*1), R12 SHLQ $0x08, R9 IMULQ BX, R9 SHRQ $0x2f, R9 SHLQ $0x20, R10 IMULQ DI, R10 SHRQ $0x32, R10 SHLQ $0x08, R11 IMULQ BX, R11 SHRQ $0x2f, R11 SHLQ $0x20, R12 IMULQ DI, R12 SHRQ $0x32, R12 LEAQ 1(SI), DI LEAQ 1(R8), R13 MOVL SI, 24(SP)(R9*4) MOVL R8, 24(SP)(R11*4) MOVL DI, 524312(SP)(R10*4) MOVL R13, 524312(SP)(R12*4) ADDQ $0x01, SI SUBQ $0x01, R8 index_loop_encodeBetterBlockAsm: CMPQ SI, R8 JAE search_loop_encodeBetterBlockAsm MOVQ (DX)(SI*1), DI MOVQ (DX)(R8*1), R9 SHLQ $0x08, DI IMULQ BX, DI SHRQ $0x2f, DI SHLQ $0x08, R9 IMULQ BX, R9 SHRQ $0x2f, R9 MOVL SI, 24(SP)(DI*4) MOVL R8, 24(SP)(R9*4) ADDQ $0x02, SI SUBQ $0x02, R8 JMP index_loop_encodeBetterBlockAsm emit_remainder_encodeBetterBlockAsm: MOVQ src_len+32(FP), CX SUBL 12(SP), CX LEAQ 5(AX)(CX*1), CX CMPQ CX, (SP) JL emit_remainder_ok_encodeBetterBlockAsm MOVQ $0x00000000, ret+48(FP) RET emit_remainder_ok_encodeBetterBlockAsm: MOVQ src_len+32(FP), CX MOVL 12(SP), BX CMPL BX, CX JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm MOVL CX, SI MOVL CX, 12(SP) LEAQ (DX)(BX*1), CX SUBL BX, SI LEAL -1(SI), DX CMPL DX, 
$0x3c JLT one_byte_emit_remainder_encodeBetterBlockAsm CMPL DX, $0x00000100 JLT two_bytes_emit_remainder_encodeBetterBlockAsm CMPL DX, $0x00010000 JLT three_bytes_emit_remainder_encodeBetterBlockAsm CMPL DX, $0x01000000 JLT four_bytes_emit_remainder_encodeBetterBlockAsm MOVB $0xfc, (AX) MOVL DX, 1(AX) ADDQ $0x05, AX JMP memmove_long_emit_remainder_encodeBetterBlockAsm four_bytes_emit_remainder_encodeBetterBlockAsm: MOVL DX, BX SHRL $0x10, BX MOVB $0xf8, (AX) MOVW DX, 1(AX) MOVB BL, 3(AX) ADDQ $0x04, AX JMP memmove_long_emit_remainder_encodeBetterBlockAsm three_bytes_emit_remainder_encodeBetterBlockAsm: MOVB $0xf4, (AX) MOVW DX, 1(AX) ADDQ $0x03, AX JMP memmove_long_emit_remainder_encodeBetterBlockAsm two_bytes_emit_remainder_encodeBetterBlockAsm: MOVB $0xf0, (AX) MOVB DL, 1(AX) ADDQ $0x02, AX CMPL DX, $0x40 JL memmove_emit_remainder_encodeBetterBlockAsm JMP memmove_long_emit_remainder_encodeBetterBlockAsm one_byte_emit_remainder_encodeBetterBlockAsm: SHLB $0x02, DL MOVB DL, (AX) ADDQ $0x01, AX memmove_emit_remainder_encodeBetterBlockAsm: LEAQ (AX)(SI*1), DX MOVL SI, BX // genMemMoveShort CMPQ BX, $0x03 JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_1or2 JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_3 CMPQ BX, $0x08 JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_4through7 CMPQ BX, $0x10 JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_8through16 CMPQ BX, $0x20 JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_17through32 JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_33through64 emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_1or2: MOVB (CX), SI MOVB -1(CX)(BX*1), CL MOVB SI, (AX) MOVB CL, -1(AX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_3: MOVW (CX), SI MOVB 2(CX), CL MOVW SI, (AX) MOVB CL, 2(AX) JMP 
memmove_end_copy_emit_remainder_encodeBetterBlockAsm emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_4through7: MOVL (CX), SI MOVL -4(CX)(BX*1), CX MOVL SI, (AX) MOVL CX, -4(AX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_8through16: MOVQ (CX), SI MOVQ -8(CX)(BX*1), CX MOVQ SI, (AX) MOVQ CX, -8(AX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_17through32: MOVOU (CX), X0 MOVOU -16(CX)(BX*1), X1 MOVOU X0, (AX) MOVOU X1, -16(AX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm emit_lit_memmove_emit_remainder_encodeBetterBlockAsm_memmove_move_33through64: MOVOU (CX), X0 MOVOU 16(CX), X1 MOVOU -32(CX)(BX*1), X2 MOVOU -16(CX)(BX*1), X3 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(BX*1) MOVOU X3, -16(AX)(BX*1) memmove_end_copy_emit_remainder_encodeBetterBlockAsm: MOVQ DX, AX JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm memmove_long_emit_remainder_encodeBetterBlockAsm: LEAQ (AX)(SI*1), DX MOVL SI, BX // genMemMoveLong MOVOU (CX), X0 MOVOU 16(CX), X1 MOVOU -32(CX)(BX*1), X2 MOVOU -16(CX)(BX*1), X3 MOVQ BX, DI SHRQ $0x05, DI MOVQ AX, SI ANDL $0x0000001f, SI MOVQ $0x00000040, R8 SUBQ SI, R8 DECQ DI JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_forward_sse_loop_32 LEAQ -32(CX)(R8*1), SI LEAQ -32(AX)(R8*1), R9 emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_big_loop_back: MOVOU (SI), X4 MOVOU 16(SI), X5 MOVOA X4, (R9) MOVOA X5, 16(R9) ADDQ $0x20, R9 ADDQ $0x20, SI ADDQ $0x20, R8 DECQ DI JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_big_loop_back emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_forward_sse_loop_32: MOVOU -32(CX)(R8*1), X4 MOVOU -16(CX)(R8*1), X5 MOVOA X4, -32(AX)(R8*1) MOVOA X5, -16(AX)(R8*1) ADDQ $0x20, R8 CMPQ BX, R8 JAE 
emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsmlarge_forward_sse_loop_32 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(BX*1) MOVOU X3, -16(AX)(BX*1) MOVQ DX, AX emit_literal_done_emit_remainder_encodeBetterBlockAsm: MOVQ dst_base+0(FP), CX SUBQ CX, AX MOVQ AX, ret+48(FP) RET // func encodeBetterBlockAsm4MB(dst []byte, src []byte) int // Requires: BMI, SSE2 TEXT ·encodeBetterBlockAsm4MB(SB), $589848-56 MOVQ dst_base+0(FP), AX MOVQ $0x00001200, CX LEAQ 24(SP), DX PXOR X0, X0 zero_loop_encodeBetterBlockAsm4MB: MOVOU X0, (DX) MOVOU X0, 16(DX) MOVOU X0, 32(DX) MOVOU X0, 48(DX) MOVOU X0, 64(DX) MOVOU X0, 80(DX) MOVOU X0, 96(DX) MOVOU X0, 112(DX) ADDQ $0x80, DX DECQ CX JNZ zero_loop_encodeBetterBlockAsm4MB MOVL $0x00000000, 12(SP) MOVQ src_len+32(FP), CX LEAQ -6(CX), DX LEAQ -8(CX), BX MOVL BX, 8(SP) SHRQ $0x05, CX SUBL CX, DX LEAQ (AX)(DX*1), DX MOVQ DX, (SP) MOVL $0x00000001, CX MOVL $0x00000000, 16(SP) MOVQ src_base+24(FP), DX search_loop_encodeBetterBlockAsm4MB: MOVL CX, BX SUBL 12(SP), BX SHRL $0x07, BX CMPL BX, $0x63 JLE check_maxskip_ok_encodeBetterBlockAsm4MB LEAL 100(CX), BX JMP check_maxskip_cont_encodeBetterBlockAsm4MB check_maxskip_ok_encodeBetterBlockAsm4MB: LEAL 1(CX)(BX*1), BX check_maxskip_cont_encodeBetterBlockAsm4MB: CMPL BX, 8(SP) JGE emit_remainder_encodeBetterBlockAsm4MB MOVQ (DX)(CX*1), SI MOVL BX, 20(SP) MOVQ $0x00cf1bbcdcbfa563, R8 MOVQ $0x9e3779b1, BX MOVQ SI, R9 MOVQ SI, R10 SHLQ $0x08, R9 IMULQ R8, R9 SHRQ $0x2f, R9 SHLQ $0x20, R10 IMULQ BX, R10 SHRQ $0x32, R10 MOVL 24(SP)(R9*4), BX MOVL 524312(SP)(R10*4), DI MOVL CX, 24(SP)(R9*4) MOVL CX, 524312(SP)(R10*4) MOVQ (DX)(BX*1), R9 MOVQ (DX)(DI*1), R10 CMPQ R9, SI JEQ candidate_match_encodeBetterBlockAsm4MB CMPQ R10, SI JNE no_short_found_encodeBetterBlockAsm4MB MOVL DI, BX JMP candidate_match_encodeBetterBlockAsm4MB no_short_found_encodeBetterBlockAsm4MB: CMPL R9, SI JEQ candidate_match_encodeBetterBlockAsm4MB CMPL R10, SI JEQ candidateS_match_encodeBetterBlockAsm4MB MOVL 20(SP), CX 
JMP search_loop_encodeBetterBlockAsm4MB candidateS_match_encodeBetterBlockAsm4MB: SHRQ $0x08, SI MOVQ SI, R9 SHLQ $0x08, R9 IMULQ R8, R9 SHRQ $0x2f, R9 MOVL 24(SP)(R9*4), BX INCL CX MOVL CX, 24(SP)(R9*4) CMPL (DX)(BX*1), SI JEQ candidate_match_encodeBetterBlockAsm4MB DECL CX MOVL DI, BX candidate_match_encodeBetterBlockAsm4MB: MOVL 12(SP), SI TESTL BX, BX JZ match_extend_back_end_encodeBetterBlockAsm4MB match_extend_back_loop_encodeBetterBlockAsm4MB: CMPL CX, SI JLE match_extend_back_end_encodeBetterBlockAsm4MB MOVB -1(DX)(BX*1), DI MOVB -1(DX)(CX*1), R8 CMPB DI, R8 JNE match_extend_back_end_encodeBetterBlockAsm4MB LEAL -1(CX), CX DECL BX JZ match_extend_back_end_encodeBetterBlockAsm4MB JMP match_extend_back_loop_encodeBetterBlockAsm4MB match_extend_back_end_encodeBetterBlockAsm4MB: MOVL CX, SI SUBL 12(SP), SI LEAQ 4(AX)(SI*1), SI CMPQ SI, (SP) JL match_dst_size_check_encodeBetterBlockAsm4MB MOVQ $0x00000000, ret+48(FP) RET match_dst_size_check_encodeBetterBlockAsm4MB: MOVL CX, SI ADDL $0x04, CX ADDL $0x04, BX MOVQ src_len+32(FP), DI SUBL CX, DI LEAQ (DX)(CX*1), R8 LEAQ (DX)(BX*1), R9 // matchLen XORL R11, R11 CMPL DI, $0x08 JL matchlen_match4_match_nolit_encodeBetterBlockAsm4MB matchlen_loopback_match_nolit_encodeBetterBlockAsm4MB: MOVQ (R8)(R11*1), R10 XORQ (R9)(R11*1), R10 TESTQ R10, R10 JZ matchlen_loop_match_nolit_encodeBetterBlockAsm4MB #ifdef GOAMD64_v3 TZCNTQ R10, R10 #else BSFQ R10, R10 #endif SARQ $0x03, R10 LEAL (R11)(R10*1), R11 JMP match_nolit_end_encodeBetterBlockAsm4MB matchlen_loop_match_nolit_encodeBetterBlockAsm4MB: LEAL -8(DI), DI LEAL 8(R11), R11 CMPL DI, $0x08 JGE matchlen_loopback_match_nolit_encodeBetterBlockAsm4MB JZ match_nolit_end_encodeBetterBlockAsm4MB matchlen_match4_match_nolit_encodeBetterBlockAsm4MB: CMPL DI, $0x04 JL matchlen_match2_match_nolit_encodeBetterBlockAsm4MB MOVL (R8)(R11*1), R10 CMPL (R9)(R11*1), R10 JNE matchlen_match2_match_nolit_encodeBetterBlockAsm4MB SUBL $0x04, DI LEAL 4(R11), R11 
matchlen_match2_match_nolit_encodeBetterBlockAsm4MB: CMPL DI, $0x02 JL matchlen_match1_match_nolit_encodeBetterBlockAsm4MB MOVW (R8)(R11*1), R10 CMPW (R9)(R11*1), R10 JNE matchlen_match1_match_nolit_encodeBetterBlockAsm4MB SUBL $0x02, DI LEAL 2(R11), R11 matchlen_match1_match_nolit_encodeBetterBlockAsm4MB: CMPL DI, $0x01 JL match_nolit_end_encodeBetterBlockAsm4MB MOVB (R8)(R11*1), R10 CMPB (R9)(R11*1), R10 JNE match_nolit_end_encodeBetterBlockAsm4MB LEAL 1(R11), R11 match_nolit_end_encodeBetterBlockAsm4MB: MOVL CX, DI SUBL BX, DI // Check if repeat CMPL 16(SP), DI JEQ match_is_repeat_encodeBetterBlockAsm4MB CMPL R11, $0x01 JG match_length_ok_encodeBetterBlockAsm4MB CMPL DI, $0x0000ffff JLE match_length_ok_encodeBetterBlockAsm4MB MOVL 20(SP), CX INCL CX JMP search_loop_encodeBetterBlockAsm4MB match_length_ok_encodeBetterBlockAsm4MB: MOVL DI, 16(SP) MOVL 12(SP), BX CMPL BX, SI JEQ emit_literal_done_match_emit_encodeBetterBlockAsm4MB MOVL SI, R8 MOVL SI, 12(SP) LEAQ (DX)(BX*1), R9 SUBL BX, R8 LEAL -1(R8), BX CMPL BX, $0x3c JLT one_byte_match_emit_encodeBetterBlockAsm4MB CMPL BX, $0x00000100 JLT two_bytes_match_emit_encodeBetterBlockAsm4MB CMPL BX, $0x00010000 JLT three_bytes_match_emit_encodeBetterBlockAsm4MB MOVL BX, R10 SHRL $0x10, R10 MOVB $0xf8, (AX) MOVW BX, 1(AX) MOVB R10, 3(AX) ADDQ $0x04, AX JMP memmove_long_match_emit_encodeBetterBlockAsm4MB three_bytes_match_emit_encodeBetterBlockAsm4MB: MOVB $0xf4, (AX) MOVW BX, 1(AX) ADDQ $0x03, AX JMP memmove_long_match_emit_encodeBetterBlockAsm4MB two_bytes_match_emit_encodeBetterBlockAsm4MB: MOVB $0xf0, (AX) MOVB BL, 1(AX) ADDQ $0x02, AX CMPL BX, $0x40 JL memmove_match_emit_encodeBetterBlockAsm4MB JMP memmove_long_match_emit_encodeBetterBlockAsm4MB one_byte_match_emit_encodeBetterBlockAsm4MB: SHLB $0x02, BL MOVB BL, (AX) ADDQ $0x01, AX memmove_match_emit_encodeBetterBlockAsm4MB: LEAQ (AX)(R8*1), BX // genMemMoveShort CMPQ R8, $0x04 JLE emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_4 CMPQ R8, $0x08 JB 
emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_4through7 CMPQ R8, $0x10 JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_8through16 CMPQ R8, $0x20 JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_17through32 JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_33through64 emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_4: MOVL (R9), R10 MOVL R10, (AX) JMP memmove_end_copy_match_emit_encodeBetterBlockAsm4MB emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_4through7: MOVL (R9), R10 MOVL -4(R9)(R8*1), R9 MOVL R10, (AX) MOVL R9, -4(AX)(R8*1) JMP memmove_end_copy_match_emit_encodeBetterBlockAsm4MB emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_8through16: MOVQ (R9), R10 MOVQ -8(R9)(R8*1), R9 MOVQ R10, (AX) MOVQ R9, -8(AX)(R8*1) JMP memmove_end_copy_match_emit_encodeBetterBlockAsm4MB emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_17through32: MOVOU (R9), X0 MOVOU -16(R9)(R8*1), X1 MOVOU X0, (AX) MOVOU X1, -16(AX)(R8*1) JMP memmove_end_copy_match_emit_encodeBetterBlockAsm4MB emit_lit_memmove_match_emit_encodeBetterBlockAsm4MB_memmove_move_33through64: MOVOU (R9), X0 MOVOU 16(R9), X1 MOVOU -32(R9)(R8*1), X2 MOVOU -16(R9)(R8*1), X3 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(R8*1) MOVOU X3, -16(AX)(R8*1) memmove_end_copy_match_emit_encodeBetterBlockAsm4MB: MOVQ BX, AX JMP emit_literal_done_match_emit_encodeBetterBlockAsm4MB memmove_long_match_emit_encodeBetterBlockAsm4MB: LEAQ (AX)(R8*1), BX // genMemMoveLong MOVOU (R9), X0 MOVOU 16(R9), X1 MOVOU -32(R9)(R8*1), X2 MOVOU -16(R9)(R8*1), X3 MOVQ R8, R12 SHRQ $0x05, R12 MOVQ AX, R10 ANDL $0x0000001f, R10 MOVQ $0x00000040, R13 SUBQ R10, R13 DECQ R12 JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32 LEAQ -32(R9)(R13*1), R10 LEAQ -32(AX)(R13*1), R14 emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_big_loop_back: MOVOU (R10), X4 MOVOU 
16(R10), X5 MOVOA X4, (R14) MOVOA X5, 16(R14) ADDQ $0x20, R14 ADDQ $0x20, R10 ADDQ $0x20, R13 DECQ R12 JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_big_loop_back emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32: MOVOU -32(R9)(R13*1), X4 MOVOU -16(R9)(R13*1), X5 MOVOA X4, -32(AX)(R13*1) MOVOA X5, -16(AX)(R13*1) ADDQ $0x20, R13 CMPQ R8, R13 JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(R8*1) MOVOU X3, -16(AX)(R8*1) MOVQ BX, AX emit_literal_done_match_emit_encodeBetterBlockAsm4MB: ADDL R11, CX ADDL $0x04, R11 MOVL CX, 12(SP) // emitCopy CMPL DI, $0x00010000 JL two_byte_offset_match_nolit_encodeBetterBlockAsm4MB CMPL R11, $0x40 JLE four_bytes_remain_match_nolit_encodeBetterBlockAsm4MB MOVB $0xff, (AX) MOVL DI, 1(AX) LEAL -64(R11), R11 ADDQ $0x05, AX CMPL R11, $0x04 JL four_bytes_remain_match_nolit_encodeBetterBlockAsm4MB // emitRepeat MOVL R11, BX LEAL -4(R11), R11 CMPL BX, $0x08 JLE repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy CMPL BX, $0x0c JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy CMPL DI, $0x00000800 JLT repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy: CMPL R11, $0x00000104 JLT repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy CMPL R11, $0x00010100 JLT repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy LEAL -65536(R11), R11 MOVL R11, DI MOVW $0x001d, (AX) MOVW R11, 2(AX) SARL $0x10, DI MOVB DI, 4(AX) ADDQ $0x05, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy: LEAL -256(R11), R11 MOVW $0x0019, (AX) MOVW R11, 2(AX) ADDQ $0x04, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy: LEAL -4(R11), R11 MOVW $0x0015, (AX) MOVB R11, 2(AX) ADDQ $0x03, AX JMP 
match_nolit_emitcopy_end_encodeBetterBlockAsm4MB repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy: SHLL $0x02, R11 ORL $0x01, R11 MOVW R11, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy: XORQ BX, BX LEAL 1(BX)(R11*4), R11 MOVB DI, 1(AX) SARL $0x08, DI SHLL $0x05, DI ORL DI, R11 MOVB R11, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB four_bytes_remain_match_nolit_encodeBetterBlockAsm4MB: TESTL R11, R11 JZ match_nolit_emitcopy_end_encodeBetterBlockAsm4MB XORL BX, BX LEAL -1(BX)(R11*4), R11 MOVB R11, (AX) MOVL DI, 1(AX) ADDQ $0x05, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB two_byte_offset_match_nolit_encodeBetterBlockAsm4MB: CMPL R11, $0x40 JLE two_byte_offset_short_match_nolit_encodeBetterBlockAsm4MB CMPL DI, $0x00000800 JAE long_offset_short_match_nolit_encodeBetterBlockAsm4MB MOVL $0x00000001, BX LEAL 16(BX), BX MOVB DI, 1(AX) SHRL $0x08, DI SHLL $0x05, DI ORL DI, BX MOVB BL, (AX) ADDQ $0x02, AX SUBL $0x08, R11 // emitRepeat LEAL -4(R11), R11 JMP cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b MOVL R11, BX LEAL -4(R11), R11 CMPL BX, $0x08 JLE repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b CMPL BX, $0x0c JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b CMPL DI, $0x00000800 JLT repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b: CMPL R11, $0x00000104 JLT repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b CMPL R11, $0x00010100 JLT repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b LEAL -65536(R11), R11 MOVL R11, DI MOVW $0x001d, (AX) MOVW R11, 2(AX) SARL $0x10, DI MOVB DI, 4(AX) ADDQ $0x05, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b: LEAL 
-256(R11), R11 MOVW $0x0019, (AX) MOVW R11, 2(AX) ADDQ $0x04, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b: LEAL -4(R11), R11 MOVW $0x0015, (AX) MOVB R11, 2(AX) ADDQ $0x03, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b: SHLL $0x02, R11 ORL $0x01, R11 MOVW R11, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short_2b: XORQ BX, BX LEAL 1(BX)(R11*4), R11 MOVB DI, 1(AX) SARL $0x08, DI SHLL $0x05, DI ORL DI, R11 MOVB R11, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB long_offset_short_match_nolit_encodeBetterBlockAsm4MB: MOVB $0xee, (AX) MOVW DI, 1(AX) LEAL -60(R11), R11 ADDQ $0x03, AX // emitRepeat MOVL R11, BX LEAL -4(R11), R11 CMPL BX, $0x08 JLE repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short CMPL BX, $0x0c JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short CMPL DI, $0x00000800 JLT repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short: CMPL R11, $0x00000104 JLT repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short CMPL R11, $0x00010100 JLT repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short LEAL -65536(R11), R11 MOVL R11, DI MOVW $0x001d, (AX) MOVW R11, 2(AX) SARL $0x10, DI MOVB DI, 4(AX) ADDQ $0x05, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB repeat_four_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short: LEAL -256(R11), R11 MOVW $0x0019, (AX) MOVW R11, 2(AX) ADDQ $0x04, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB repeat_three_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short: LEAL -4(R11), R11 MOVW $0x0015, (AX) MOVB R11, 2(AX) ADDQ $0x03, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB 
repeat_two_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short: SHLL $0x02, R11 ORL $0x01, R11 MOVW R11, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB repeat_two_offset_match_nolit_encodeBetterBlockAsm4MB_emit_copy_short: XORQ BX, BX LEAL 1(BX)(R11*4), R11 MOVB DI, 1(AX) SARL $0x08, DI SHLL $0x05, DI ORL DI, R11 MOVB R11, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB two_byte_offset_short_match_nolit_encodeBetterBlockAsm4MB: MOVL R11, BX SHLL $0x02, BX CMPL R11, $0x0c JGE emit_copy_three_match_nolit_encodeBetterBlockAsm4MB CMPL DI, $0x00000800 JGE emit_copy_three_match_nolit_encodeBetterBlockAsm4MB LEAL -15(BX), BX MOVB DI, 1(AX) SHRL $0x08, DI SHLL $0x05, DI ORL DI, BX MOVB BL, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB emit_copy_three_match_nolit_encodeBetterBlockAsm4MB: LEAL -2(BX), BX MOVB BL, (AX) MOVW DI, 1(AX) ADDQ $0x03, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB match_is_repeat_encodeBetterBlockAsm4MB: MOVL 12(SP), BX CMPL BX, SI JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm4MB MOVL SI, R8 MOVL SI, 12(SP) LEAQ (DX)(BX*1), R9 SUBL BX, R8 LEAL -1(R8), BX CMPL BX, $0x3c JLT one_byte_match_emit_repeat_encodeBetterBlockAsm4MB CMPL BX, $0x00000100 JLT two_bytes_match_emit_repeat_encodeBetterBlockAsm4MB CMPL BX, $0x00010000 JLT three_bytes_match_emit_repeat_encodeBetterBlockAsm4MB MOVL BX, R10 SHRL $0x10, R10 MOVB $0xf8, (AX) MOVW BX, 1(AX) MOVB R10, 3(AX) ADDQ $0x04, AX JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm4MB three_bytes_match_emit_repeat_encodeBetterBlockAsm4MB: MOVB $0xf4, (AX) MOVW BX, 1(AX) ADDQ $0x03, AX JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm4MB two_bytes_match_emit_repeat_encodeBetterBlockAsm4MB: MOVB $0xf0, (AX) MOVB BL, 1(AX) ADDQ $0x02, AX CMPL BX, $0x40 JL memmove_match_emit_repeat_encodeBetterBlockAsm4MB JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm4MB 
one_byte_match_emit_repeat_encodeBetterBlockAsm4MB: SHLB $0x02, BL MOVB BL, (AX) ADDQ $0x01, AX memmove_match_emit_repeat_encodeBetterBlockAsm4MB: LEAQ (AX)(R8*1), BX // genMemMoveShort CMPQ R8, $0x04 JLE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_4 CMPQ R8, $0x08 JB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_4through7 CMPQ R8, $0x10 JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_8through16 CMPQ R8, $0x20 JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_17through32 JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_33through64 emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_4: MOVL (R9), R10 MOVL R10, (AX) JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_4through7: MOVL (R9), R10 MOVL -4(R9)(R8*1), R9 MOVL R10, (AX) MOVL R9, -4(AX)(R8*1) JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_8through16: MOVQ (R9), R10 MOVQ -8(R9)(R8*1), R9 MOVQ R10, (AX) MOVQ R9, -8(AX)(R8*1) JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_17through32: MOVOU (R9), X0 MOVOU -16(R9)(R8*1), X1 MOVOU X0, (AX) MOVOU X1, -16(AX)(R8*1) JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm4MB_memmove_move_33through64: MOVOU (R9), X0 MOVOU 16(R9), X1 MOVOU -32(R9)(R8*1), X2 MOVOU -16(R9)(R8*1), X3 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(R8*1) MOVOU X3, -16(AX)(R8*1) memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm4MB: MOVQ BX, AX JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm4MB memmove_long_match_emit_repeat_encodeBetterBlockAsm4MB: LEAQ (AX)(R8*1), BX // genMemMoveLong MOVOU (R9), X0 
MOVOU 16(R9), X1 MOVOU -32(R9)(R8*1), X2 MOVOU -16(R9)(R8*1), X3 MOVQ R8, R12 SHRQ $0x05, R12 MOVQ AX, R10 ANDL $0x0000001f, R10 MOVQ $0x00000040, R13 SUBQ R10, R13 DECQ R12 JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32 LEAQ -32(R9)(R13*1), R10 LEAQ -32(AX)(R13*1), R14 emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_big_loop_back: MOVOU (R10), X4 MOVOU 16(R10), X5 MOVOA X4, (R14) MOVOA X5, 16(R14) ADDQ $0x20, R14 ADDQ $0x20, R10 ADDQ $0x20, R13 DECQ R12 JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_big_loop_back emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32: MOVOU -32(R9)(R13*1), X4 MOVOU -16(R9)(R13*1), X5 MOVOA X4, -32(AX)(R13*1) MOVOA X5, -16(AX)(R13*1) ADDQ $0x20, R13 CMPQ R8, R13 JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(R8*1) MOVOU X3, -16(AX)(R8*1) MOVQ BX, AX emit_literal_done_match_emit_repeat_encodeBetterBlockAsm4MB: ADDL R11, CX ADDL $0x04, R11 MOVL CX, 12(SP) // emitRepeat MOVL R11, BX LEAL -4(R11), R11 CMPL BX, $0x08 JLE repeat_two_match_nolit_repeat_encodeBetterBlockAsm4MB CMPL BX, $0x0c JGE cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm4MB CMPL DI, $0x00000800 JLT repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm4MB cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm4MB: CMPL R11, $0x00000104 JLT repeat_three_match_nolit_repeat_encodeBetterBlockAsm4MB CMPL R11, $0x00010100 JLT repeat_four_match_nolit_repeat_encodeBetterBlockAsm4MB LEAL -65536(R11), R11 MOVL R11, DI MOVW $0x001d, (AX) MOVW R11, 2(AX) SARL $0x10, DI MOVB DI, 4(AX) ADDQ $0x05, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB repeat_four_match_nolit_repeat_encodeBetterBlockAsm4MB: LEAL -256(R11), R11 MOVW $0x0019, (AX) MOVW R11, 2(AX) ADDQ $0x04, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB 
repeat_three_match_nolit_repeat_encodeBetterBlockAsm4MB: LEAL -4(R11), R11 MOVW $0x0015, (AX) MOVB R11, 2(AX) ADDQ $0x03, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB repeat_two_match_nolit_repeat_encodeBetterBlockAsm4MB: SHLL $0x02, R11 ORL $0x01, R11 MOVW R11, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm4MB repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm4MB: XORQ BX, BX LEAL 1(BX)(R11*4), R11 MOVB DI, 1(AX) SARL $0x08, DI SHLL $0x05, DI ORL DI, R11 MOVB R11, (AX) ADDQ $0x02, AX match_nolit_emitcopy_end_encodeBetterBlockAsm4MB: CMPL CX, 8(SP) JGE emit_remainder_encodeBetterBlockAsm4MB CMPQ AX, (SP) JL match_nolit_dst_ok_encodeBetterBlockAsm4MB MOVQ $0x00000000, ret+48(FP) RET match_nolit_dst_ok_encodeBetterBlockAsm4MB: MOVQ $0x00cf1bbcdcbfa563, BX MOVQ $0x9e3779b1, DI LEAQ 1(SI), SI LEAQ -2(CX), R8 MOVQ (DX)(SI*1), R9 MOVQ 1(DX)(SI*1), R10 MOVQ (DX)(R8*1), R11 MOVQ 1(DX)(R8*1), R12 SHLQ $0x08, R9 IMULQ BX, R9 SHRQ $0x2f, R9 SHLQ $0x20, R10 IMULQ DI, R10 SHRQ $0x32, R10 SHLQ $0x08, R11 IMULQ BX, R11 SHRQ $0x2f, R11 SHLQ $0x20, R12 IMULQ DI, R12 SHRQ $0x32, R12 LEAQ 1(SI), DI LEAQ 1(R8), R13 MOVL SI, 24(SP)(R9*4) MOVL R8, 24(SP)(R11*4) MOVL DI, 524312(SP)(R10*4) MOVL R13, 524312(SP)(R12*4) ADDQ $0x01, SI SUBQ $0x01, R8 index_loop_encodeBetterBlockAsm4MB: CMPQ SI, R8 JAE search_loop_encodeBetterBlockAsm4MB MOVQ (DX)(SI*1), DI MOVQ (DX)(R8*1), R9 SHLQ $0x08, DI IMULQ BX, DI SHRQ $0x2f, DI SHLQ $0x08, R9 IMULQ BX, R9 SHRQ $0x2f, R9 MOVL SI, 24(SP)(DI*4) MOVL R8, 24(SP)(R9*4) ADDQ $0x02, SI SUBQ $0x02, R8 JMP index_loop_encodeBetterBlockAsm4MB emit_remainder_encodeBetterBlockAsm4MB: MOVQ src_len+32(FP), CX SUBL 12(SP), CX LEAQ 4(AX)(CX*1), CX CMPQ CX, (SP) JL emit_remainder_ok_encodeBetterBlockAsm4MB MOVQ $0x00000000, ret+48(FP) RET emit_remainder_ok_encodeBetterBlockAsm4MB: MOVQ src_len+32(FP), CX MOVL 12(SP), BX CMPL BX, CX JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm4MB MOVL CX, SI MOVL CX, 12(SP) LEAQ 
(DX)(BX*1), CX SUBL BX, SI LEAL -1(SI), DX CMPL DX, $0x3c JLT one_byte_emit_remainder_encodeBetterBlockAsm4MB CMPL DX, $0x00000100 JLT two_bytes_emit_remainder_encodeBetterBlockAsm4MB CMPL DX, $0x00010000 JLT three_bytes_emit_remainder_encodeBetterBlockAsm4MB MOVL DX, BX SHRL $0x10, BX MOVB $0xf8, (AX) MOVW DX, 1(AX) MOVB BL, 3(AX) ADDQ $0x04, AX JMP memmove_long_emit_remainder_encodeBetterBlockAsm4MB three_bytes_emit_remainder_encodeBetterBlockAsm4MB: MOVB $0xf4, (AX) MOVW DX, 1(AX) ADDQ $0x03, AX JMP memmove_long_emit_remainder_encodeBetterBlockAsm4MB two_bytes_emit_remainder_encodeBetterBlockAsm4MB: MOVB $0xf0, (AX) MOVB DL, 1(AX) ADDQ $0x02, AX CMPL DX, $0x40 JL memmove_emit_remainder_encodeBetterBlockAsm4MB JMP memmove_long_emit_remainder_encodeBetterBlockAsm4MB one_byte_emit_remainder_encodeBetterBlockAsm4MB: SHLB $0x02, DL MOVB DL, (AX) ADDQ $0x01, AX memmove_emit_remainder_encodeBetterBlockAsm4MB: LEAQ (AX)(SI*1), DX MOVL SI, BX // genMemMoveShort CMPQ BX, $0x03 JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_1or2 JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_3 CMPQ BX, $0x08 JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_4through7 CMPQ BX, $0x10 JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_8through16 CMPQ BX, $0x20 JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_17through32 JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_33through64 emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_1or2: MOVB (CX), SI MOVB -1(CX)(BX*1), CL MOVB SI, (AX) MOVB CL, -1(AX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_3: MOVW (CX), SI MOVB 2(CX), CL MOVW SI, (AX) MOVB CL, 2(AX) JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_4through7: MOVL 
(CX), SI MOVL -4(CX)(BX*1), CX MOVL SI, (AX) MOVL CX, -4(AX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_8through16: MOVQ (CX), SI MOVQ -8(CX)(BX*1), CX MOVQ SI, (AX) MOVQ CX, -8(AX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_17through32: MOVOU (CX), X0 MOVOU -16(CX)(BX*1), X1 MOVOU X0, (AX) MOVOU X1, -16(AX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm4MB_memmove_move_33through64: MOVOU (CX), X0 MOVOU 16(CX), X1 MOVOU -32(CX)(BX*1), X2 MOVOU -16(CX)(BX*1), X3 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(BX*1) MOVOU X3, -16(AX)(BX*1) memmove_end_copy_emit_remainder_encodeBetterBlockAsm4MB: MOVQ DX, AX JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm4MB memmove_long_emit_remainder_encodeBetterBlockAsm4MB: LEAQ (AX)(SI*1), DX MOVL SI, BX // genMemMoveLong MOVOU (CX), X0 MOVOU 16(CX), X1 MOVOU -32(CX)(BX*1), X2 MOVOU -16(CX)(BX*1), X3 MOVQ BX, DI SHRQ $0x05, DI MOVQ AX, SI ANDL $0x0000001f, SI MOVQ $0x00000040, R8 SUBQ SI, R8 DECQ DI JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32 LEAQ -32(CX)(R8*1), SI LEAQ -32(AX)(R8*1), R9 emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_big_loop_back: MOVOU (SI), X4 MOVOU 16(SI), X5 MOVOA X4, (R9) MOVOA X5, 16(R9) ADDQ $0x20, R9 ADDQ $0x20, SI ADDQ $0x20, R8 DECQ DI JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_big_loop_back emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32: MOVOU -32(CX)(R8*1), X4 MOVOU -16(CX)(R8*1), X5 MOVOA X4, -32(AX)(R8*1) MOVOA X5, -16(AX)(R8*1) ADDQ $0x20, R8 CMPQ BX, R8 JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm4MBlarge_forward_sse_loop_32 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(BX*1) MOVOU X3, 
-16(AX)(BX*1) MOVQ DX, AX emit_literal_done_emit_remainder_encodeBetterBlockAsm4MB: MOVQ dst_base+0(FP), CX SUBQ CX, AX MOVQ AX, ret+48(FP) RET // func encodeBetterBlockAsm12B(dst []byte, src []byte) int // Requires: BMI, SSE2 TEXT ·encodeBetterBlockAsm12B(SB), $81944-56 MOVQ dst_base+0(FP), AX MOVQ $0x00000280, CX LEAQ 24(SP), DX PXOR X0, X0 zero_loop_encodeBetterBlockAsm12B: MOVOU X0, (DX) MOVOU X0, 16(DX) MOVOU X0, 32(DX) MOVOU X0, 48(DX) MOVOU X0, 64(DX) MOVOU X0, 80(DX) MOVOU X0, 96(DX) MOVOU X0, 112(DX) ADDQ $0x80, DX DECQ CX JNZ zero_loop_encodeBetterBlockAsm12B MOVL $0x00000000, 12(SP) MOVQ src_len+32(FP), CX LEAQ -6(CX), DX LEAQ -8(CX), BX MOVL BX, 8(SP) SHRQ $0x05, CX SUBL CX, DX LEAQ (AX)(DX*1), DX MOVQ DX, (SP) MOVL $0x00000001, CX MOVL $0x00000000, 16(SP) MOVQ src_base+24(FP), DX search_loop_encodeBetterBlockAsm12B: MOVL CX, BX SUBL 12(SP), BX SHRL $0x06, BX LEAL 1(CX)(BX*1), BX CMPL BX, 8(SP) JGE emit_remainder_encodeBetterBlockAsm12B MOVQ (DX)(CX*1), SI MOVL BX, 20(SP) MOVQ $0x0000cf1bbcdcbf9b, R8 MOVQ $0x9e3779b1, BX MOVQ SI, R9 MOVQ SI, R10 SHLQ $0x10, R9 IMULQ R8, R9 SHRQ $0x32, R9 SHLQ $0x20, R10 IMULQ BX, R10 SHRQ $0x34, R10 MOVL 24(SP)(R9*4), BX MOVL 65560(SP)(R10*4), DI MOVL CX, 24(SP)(R9*4) MOVL CX, 65560(SP)(R10*4) MOVQ (DX)(BX*1), R9 MOVQ (DX)(DI*1), R10 CMPQ R9, SI JEQ candidate_match_encodeBetterBlockAsm12B CMPQ R10, SI JNE no_short_found_encodeBetterBlockAsm12B MOVL DI, BX JMP candidate_match_encodeBetterBlockAsm12B no_short_found_encodeBetterBlockAsm12B: CMPL R9, SI JEQ candidate_match_encodeBetterBlockAsm12B CMPL R10, SI JEQ candidateS_match_encodeBetterBlockAsm12B MOVL 20(SP), CX JMP search_loop_encodeBetterBlockAsm12B candidateS_match_encodeBetterBlockAsm12B: SHRQ $0x08, SI MOVQ SI, R9 SHLQ $0x10, R9 IMULQ R8, R9 SHRQ $0x32, R9 MOVL 24(SP)(R9*4), BX INCL CX MOVL CX, 24(SP)(R9*4) CMPL (DX)(BX*1), SI JEQ candidate_match_encodeBetterBlockAsm12B DECL CX MOVL DI, BX candidate_match_encodeBetterBlockAsm12B: MOVL 12(SP), SI TESTL BX, BX JZ 
match_extend_back_end_encodeBetterBlockAsm12B match_extend_back_loop_encodeBetterBlockAsm12B: CMPL CX, SI JLE match_extend_back_end_encodeBetterBlockAsm12B MOVB -1(DX)(BX*1), DI MOVB -1(DX)(CX*1), R8 CMPB DI, R8 JNE match_extend_back_end_encodeBetterBlockAsm12B LEAL -1(CX), CX DECL BX JZ match_extend_back_end_encodeBetterBlockAsm12B JMP match_extend_back_loop_encodeBetterBlockAsm12B match_extend_back_end_encodeBetterBlockAsm12B: MOVL CX, SI SUBL 12(SP), SI LEAQ 3(AX)(SI*1), SI CMPQ SI, (SP) JL match_dst_size_check_encodeBetterBlockAsm12B MOVQ $0x00000000, ret+48(FP) RET match_dst_size_check_encodeBetterBlockAsm12B: MOVL CX, SI ADDL $0x04, CX ADDL $0x04, BX MOVQ src_len+32(FP), DI SUBL CX, DI LEAQ (DX)(CX*1), R8 LEAQ (DX)(BX*1), R9 // matchLen XORL R11, R11 CMPL DI, $0x08 JL matchlen_match4_match_nolit_encodeBetterBlockAsm12B matchlen_loopback_match_nolit_encodeBetterBlockAsm12B: MOVQ (R8)(R11*1), R10 XORQ (R9)(R11*1), R10 TESTQ R10, R10 JZ matchlen_loop_match_nolit_encodeBetterBlockAsm12B #ifdef GOAMD64_v3 TZCNTQ R10, R10 #else BSFQ R10, R10 #endif SARQ $0x03, R10 LEAL (R11)(R10*1), R11 JMP match_nolit_end_encodeBetterBlockAsm12B matchlen_loop_match_nolit_encodeBetterBlockAsm12B: LEAL -8(DI), DI LEAL 8(R11), R11 CMPL DI, $0x08 JGE matchlen_loopback_match_nolit_encodeBetterBlockAsm12B JZ match_nolit_end_encodeBetterBlockAsm12B matchlen_match4_match_nolit_encodeBetterBlockAsm12B: CMPL DI, $0x04 JL matchlen_match2_match_nolit_encodeBetterBlockAsm12B MOVL (R8)(R11*1), R10 CMPL (R9)(R11*1), R10 JNE matchlen_match2_match_nolit_encodeBetterBlockAsm12B SUBL $0x04, DI LEAL 4(R11), R11 matchlen_match2_match_nolit_encodeBetterBlockAsm12B: CMPL DI, $0x02 JL matchlen_match1_match_nolit_encodeBetterBlockAsm12B MOVW (R8)(R11*1), R10 CMPW (R9)(R11*1), R10 JNE matchlen_match1_match_nolit_encodeBetterBlockAsm12B SUBL $0x02, DI LEAL 2(R11), R11 matchlen_match1_match_nolit_encodeBetterBlockAsm12B: CMPL DI, $0x01 JL match_nolit_end_encodeBetterBlockAsm12B MOVB (R8)(R11*1), R10 CMPB 
(R9)(R11*1), R10 JNE match_nolit_end_encodeBetterBlockAsm12B LEAL 1(R11), R11 match_nolit_end_encodeBetterBlockAsm12B: MOVL CX, DI SUBL BX, DI // Check if repeat CMPL 16(SP), DI JEQ match_is_repeat_encodeBetterBlockAsm12B MOVL DI, 16(SP) MOVL 12(SP), BX CMPL BX, SI JEQ emit_literal_done_match_emit_encodeBetterBlockAsm12B MOVL SI, R8 MOVL SI, 12(SP) LEAQ (DX)(BX*1), R9 SUBL BX, R8 LEAL -1(R8), BX CMPL BX, $0x3c JLT one_byte_match_emit_encodeBetterBlockAsm12B CMPL BX, $0x00000100 JLT two_bytes_match_emit_encodeBetterBlockAsm12B MOVB $0xf4, (AX) MOVW BX, 1(AX) ADDQ $0x03, AX JMP memmove_long_match_emit_encodeBetterBlockAsm12B two_bytes_match_emit_encodeBetterBlockAsm12B: MOVB $0xf0, (AX) MOVB BL, 1(AX) ADDQ $0x02, AX CMPL BX, $0x40 JL memmove_match_emit_encodeBetterBlockAsm12B JMP memmove_long_match_emit_encodeBetterBlockAsm12B one_byte_match_emit_encodeBetterBlockAsm12B: SHLB $0x02, BL MOVB BL, (AX) ADDQ $0x01, AX memmove_match_emit_encodeBetterBlockAsm12B: LEAQ (AX)(R8*1), BX // genMemMoveShort CMPQ R8, $0x04 JLE emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_4 CMPQ R8, $0x08 JB emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_4through7 CMPQ R8, $0x10 JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_8through16 CMPQ R8, $0x20 JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_17through32 JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_33through64 emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_4: MOVL (R9), R10 MOVL R10, (AX) JMP memmove_end_copy_match_emit_encodeBetterBlockAsm12B emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_4through7: MOVL (R9), R10 MOVL -4(R9)(R8*1), R9 MOVL R10, (AX) MOVL R9, -4(AX)(R8*1) JMP memmove_end_copy_match_emit_encodeBetterBlockAsm12B emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_8through16: MOVQ (R9), R10 MOVQ -8(R9)(R8*1), R9 MOVQ R10, (AX) MOVQ R9, -8(AX)(R8*1) JMP 
memmove_end_copy_match_emit_encodeBetterBlockAsm12B emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_17through32: MOVOU (R9), X0 MOVOU -16(R9)(R8*1), X1 MOVOU X0, (AX) MOVOU X1, -16(AX)(R8*1) JMP memmove_end_copy_match_emit_encodeBetterBlockAsm12B emit_lit_memmove_match_emit_encodeBetterBlockAsm12B_memmove_move_33through64: MOVOU (R9), X0 MOVOU 16(R9), X1 MOVOU -32(R9)(R8*1), X2 MOVOU -16(R9)(R8*1), X3 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(R8*1) MOVOU X3, -16(AX)(R8*1) memmove_end_copy_match_emit_encodeBetterBlockAsm12B: MOVQ BX, AX JMP emit_literal_done_match_emit_encodeBetterBlockAsm12B memmove_long_match_emit_encodeBetterBlockAsm12B: LEAQ (AX)(R8*1), BX // genMemMoveLong MOVOU (R9), X0 MOVOU 16(R9), X1 MOVOU -32(R9)(R8*1), X2 MOVOU -16(R9)(R8*1), X3 MOVQ R8, R12 SHRQ $0x05, R12 MOVQ AX, R10 ANDL $0x0000001f, R10 MOVQ $0x00000040, R13 SUBQ R10, R13 DECQ R12 JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_forward_sse_loop_32 LEAQ -32(R9)(R13*1), R10 LEAQ -32(AX)(R13*1), R14 emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_big_loop_back: MOVOU (R10), X4 MOVOU 16(R10), X5 MOVOA X4, (R14) MOVOA X5, 16(R14) ADDQ $0x20, R14 ADDQ $0x20, R10 ADDQ $0x20, R13 DECQ R12 JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_big_loop_back emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_forward_sse_loop_32: MOVOU -32(R9)(R13*1), X4 MOVOU -16(R9)(R13*1), X5 MOVOA X4, -32(AX)(R13*1) MOVOA X5, -16(AX)(R13*1) ADDQ $0x20, R13 CMPQ R8, R13 JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsm12Blarge_forward_sse_loop_32 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(R8*1) MOVOU X3, -16(AX)(R8*1) MOVQ BX, AX emit_literal_done_match_emit_encodeBetterBlockAsm12B: ADDL R11, CX ADDL $0x04, R11 MOVL CX, 12(SP) // emitCopy CMPL R11, $0x40 JLE two_byte_offset_short_match_nolit_encodeBetterBlockAsm12B CMPL DI, $0x00000800 JAE long_offset_short_match_nolit_encodeBetterBlockAsm12B MOVL $0x00000001, BX 
LEAL 16(BX), BX MOVB DI, 1(AX) SHRL $0x08, DI SHLL $0x05, DI ORL DI, BX MOVB BL, (AX) ADDQ $0x02, AX SUBL $0x08, R11 // emitRepeat LEAL -4(R11), R11 JMP cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b MOVL R11, BX LEAL -4(R11), R11 CMPL BX, $0x08 JLE repeat_two_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b CMPL BX, $0x0c JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b CMPL DI, $0x00000800 JLT repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b: CMPL R11, $0x00000104 JLT repeat_three_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b LEAL -256(R11), R11 MOVW $0x0019, (AX) MOVW R11, 2(AX) ADDQ $0x04, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B repeat_three_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b: LEAL -4(R11), R11 MOVW $0x0015, (AX) MOVB R11, 2(AX) ADDQ $0x03, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B repeat_two_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b: SHLL $0x02, R11 ORL $0x01, R11 MOVW R11, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short_2b: XORQ BX, BX LEAL 1(BX)(R11*4), R11 MOVB DI, 1(AX) SARL $0x08, DI SHLL $0x05, DI ORL DI, R11 MOVB R11, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B long_offset_short_match_nolit_encodeBetterBlockAsm12B: MOVB $0xee, (AX) MOVW DI, 1(AX) LEAL -60(R11), R11 ADDQ $0x03, AX // emitRepeat MOVL R11, BX LEAL -4(R11), R11 CMPL BX, $0x08 JLE repeat_two_match_nolit_encodeBetterBlockAsm12B_emit_copy_short CMPL BX, $0x0c JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short CMPL DI, $0x00000800 JLT repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short: CMPL R11, $0x00000104 JLT 
repeat_three_match_nolit_encodeBetterBlockAsm12B_emit_copy_short LEAL -256(R11), R11 MOVW $0x0019, (AX) MOVW R11, 2(AX) ADDQ $0x04, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B repeat_three_match_nolit_encodeBetterBlockAsm12B_emit_copy_short: LEAL -4(R11), R11 MOVW $0x0015, (AX) MOVB R11, 2(AX) ADDQ $0x03, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B repeat_two_match_nolit_encodeBetterBlockAsm12B_emit_copy_short: SHLL $0x02, R11 ORL $0x01, R11 MOVW R11, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B repeat_two_offset_match_nolit_encodeBetterBlockAsm12B_emit_copy_short: XORQ BX, BX LEAL 1(BX)(R11*4), R11 MOVB DI, 1(AX) SARL $0x08, DI SHLL $0x05, DI ORL DI, R11 MOVB R11, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B two_byte_offset_short_match_nolit_encodeBetterBlockAsm12B: MOVL R11, BX SHLL $0x02, BX CMPL R11, $0x0c JGE emit_copy_three_match_nolit_encodeBetterBlockAsm12B CMPL DI, $0x00000800 JGE emit_copy_three_match_nolit_encodeBetterBlockAsm12B LEAL -15(BX), BX MOVB DI, 1(AX) SHRL $0x08, DI SHLL $0x05, DI ORL DI, BX MOVB BL, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B emit_copy_three_match_nolit_encodeBetterBlockAsm12B: LEAL -2(BX), BX MOVB BL, (AX) MOVW DI, 1(AX) ADDQ $0x03, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B match_is_repeat_encodeBetterBlockAsm12B: MOVL 12(SP), BX CMPL BX, SI JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm12B MOVL SI, R8 MOVL SI, 12(SP) LEAQ (DX)(BX*1), R9 SUBL BX, R8 LEAL -1(R8), BX CMPL BX, $0x3c JLT one_byte_match_emit_repeat_encodeBetterBlockAsm12B CMPL BX, $0x00000100 JLT two_bytes_match_emit_repeat_encodeBetterBlockAsm12B MOVB $0xf4, (AX) MOVW BX, 1(AX) ADDQ $0x03, AX JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm12B two_bytes_match_emit_repeat_encodeBetterBlockAsm12B: MOVB $0xf0, (AX) MOVB BL, 1(AX) ADDQ $0x02, AX CMPL BX, $0x40 JL memmove_match_emit_repeat_encodeBetterBlockAsm12B JMP 
memmove_long_match_emit_repeat_encodeBetterBlockAsm12B one_byte_match_emit_repeat_encodeBetterBlockAsm12B: SHLB $0x02, BL MOVB BL, (AX) ADDQ $0x01, AX memmove_match_emit_repeat_encodeBetterBlockAsm12B: LEAQ (AX)(R8*1), BX // genMemMoveShort CMPQ R8, $0x04 JLE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_4 CMPQ R8, $0x08 JB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_4through7 CMPQ R8, $0x10 JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_8through16 CMPQ R8, $0x20 JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_17through32 JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_33through64 emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_4: MOVL (R9), R10 MOVL R10, (AX) JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_4through7: MOVL (R9), R10 MOVL -4(R9)(R8*1), R9 MOVL R10, (AX) MOVL R9, -4(AX)(R8*1) JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_8through16: MOVQ (R9), R10 MOVQ -8(R9)(R8*1), R9 MOVQ R10, (AX) MOVQ R9, -8(AX)(R8*1) JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_17through32: MOVOU (R9), X0 MOVOU -16(R9)(R8*1), X1 MOVOU X0, (AX) MOVOU X1, -16(AX)(R8*1) JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm12B_memmove_move_33through64: MOVOU (R9), X0 MOVOU 16(R9), X1 MOVOU -32(R9)(R8*1), X2 MOVOU -16(R9)(R8*1), X3 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(R8*1) MOVOU X3, -16(AX)(R8*1) memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm12B: MOVQ BX, AX JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm12B memmove_long_match_emit_repeat_encodeBetterBlockAsm12B: 
LEAQ (AX)(R8*1), BX // genMemMoveLong MOVOU (R9), X0 MOVOU 16(R9), X1 MOVOU -32(R9)(R8*1), X2 MOVOU -16(R9)(R8*1), X3 MOVQ R8, R12 SHRQ $0x05, R12 MOVQ AX, R10 ANDL $0x0000001f, R10 MOVQ $0x00000040, R13 SUBQ R10, R13 DECQ R12 JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_forward_sse_loop_32 LEAQ -32(R9)(R13*1), R10 LEAQ -32(AX)(R13*1), R14 emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_big_loop_back: MOVOU (R10), X4 MOVOU 16(R10), X5 MOVOA X4, (R14) MOVOA X5, 16(R14) ADDQ $0x20, R14 ADDQ $0x20, R10 ADDQ $0x20, R13 DECQ R12 JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_big_loop_back emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_forward_sse_loop_32: MOVOU -32(R9)(R13*1), X4 MOVOU -16(R9)(R13*1), X5 MOVOA X4, -32(AX)(R13*1) MOVOA X5, -16(AX)(R13*1) ADDQ $0x20, R13 CMPQ R8, R13 JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm12Blarge_forward_sse_loop_32 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(R8*1) MOVOU X3, -16(AX)(R8*1) MOVQ BX, AX emit_literal_done_match_emit_repeat_encodeBetterBlockAsm12B: ADDL R11, CX ADDL $0x04, R11 MOVL CX, 12(SP) // emitRepeat MOVL R11, BX LEAL -4(R11), R11 CMPL BX, $0x08 JLE repeat_two_match_nolit_repeat_encodeBetterBlockAsm12B CMPL BX, $0x0c JGE cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm12B CMPL DI, $0x00000800 JLT repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm12B cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm12B: CMPL R11, $0x00000104 JLT repeat_three_match_nolit_repeat_encodeBetterBlockAsm12B LEAL -256(R11), R11 MOVW $0x0019, (AX) MOVW R11, 2(AX) ADDQ $0x04, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B repeat_three_match_nolit_repeat_encodeBetterBlockAsm12B: LEAL -4(R11), R11 MOVW $0x0015, (AX) MOVB R11, 2(AX) ADDQ $0x03, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B repeat_two_match_nolit_repeat_encodeBetterBlockAsm12B: SHLL $0x02, R11 ORL $0x01, R11 
MOVW R11, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm12B repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm12B: XORQ BX, BX LEAL 1(BX)(R11*4), R11 MOVB DI, 1(AX) SARL $0x08, DI SHLL $0x05, DI ORL DI, R11 MOVB R11, (AX) ADDQ $0x02, AX match_nolit_emitcopy_end_encodeBetterBlockAsm12B: CMPL CX, 8(SP) JGE emit_remainder_encodeBetterBlockAsm12B CMPQ AX, (SP) JL match_nolit_dst_ok_encodeBetterBlockAsm12B MOVQ $0x00000000, ret+48(FP) RET match_nolit_dst_ok_encodeBetterBlockAsm12B: MOVQ $0x0000cf1bbcdcbf9b, BX MOVQ $0x9e3779b1, DI LEAQ 1(SI), SI LEAQ -2(CX), R8 MOVQ (DX)(SI*1), R9 MOVQ 1(DX)(SI*1), R10 MOVQ (DX)(R8*1), R11 MOVQ 1(DX)(R8*1), R12 SHLQ $0x10, R9 IMULQ BX, R9 SHRQ $0x32, R9 SHLQ $0x20, R10 IMULQ DI, R10 SHRQ $0x34, R10 SHLQ $0x10, R11 IMULQ BX, R11 SHRQ $0x32, R11 SHLQ $0x20, R12 IMULQ DI, R12 SHRQ $0x34, R12 LEAQ 1(SI), DI LEAQ 1(R8), R13 MOVL SI, 24(SP)(R9*4) MOVL R8, 24(SP)(R11*4) MOVL DI, 65560(SP)(R10*4) MOVL R13, 65560(SP)(R12*4) ADDQ $0x01, SI SUBQ $0x01, R8 index_loop_encodeBetterBlockAsm12B: CMPQ SI, R8 JAE search_loop_encodeBetterBlockAsm12B MOVQ (DX)(SI*1), DI MOVQ (DX)(R8*1), R9 SHLQ $0x10, DI IMULQ BX, DI SHRQ $0x32, DI SHLQ $0x10, R9 IMULQ BX, R9 SHRQ $0x32, R9 MOVL SI, 24(SP)(DI*4) MOVL R8, 24(SP)(R9*4) ADDQ $0x02, SI SUBQ $0x02, R8 JMP index_loop_encodeBetterBlockAsm12B emit_remainder_encodeBetterBlockAsm12B: MOVQ src_len+32(FP), CX SUBL 12(SP), CX LEAQ 3(AX)(CX*1), CX CMPQ CX, (SP) JL emit_remainder_ok_encodeBetterBlockAsm12B MOVQ $0x00000000, ret+48(FP) RET emit_remainder_ok_encodeBetterBlockAsm12B: MOVQ src_len+32(FP), CX MOVL 12(SP), BX CMPL BX, CX JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm12B MOVL CX, SI MOVL CX, 12(SP) LEAQ (DX)(BX*1), CX SUBL BX, SI LEAL -1(SI), DX CMPL DX, $0x3c JLT one_byte_emit_remainder_encodeBetterBlockAsm12B CMPL DX, $0x00000100 JLT two_bytes_emit_remainder_encodeBetterBlockAsm12B MOVB $0xf4, (AX) MOVW DX, 1(AX) ADDQ $0x03, AX JMP 
memmove_long_emit_remainder_encodeBetterBlockAsm12B two_bytes_emit_remainder_encodeBetterBlockAsm12B: MOVB $0xf0, (AX) MOVB DL, 1(AX) ADDQ $0x02, AX CMPL DX, $0x40 JL memmove_emit_remainder_encodeBetterBlockAsm12B JMP memmove_long_emit_remainder_encodeBetterBlockAsm12B one_byte_emit_remainder_encodeBetterBlockAsm12B: SHLB $0x02, DL MOVB DL, (AX) ADDQ $0x01, AX memmove_emit_remainder_encodeBetterBlockAsm12B: LEAQ (AX)(SI*1), DX MOVL SI, BX // genMemMoveShort CMPQ BX, $0x03 JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_1or2 JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_3 CMPQ BX, $0x08 JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_4through7 CMPQ BX, $0x10 JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_8through16 CMPQ BX, $0x20 JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_17through32 JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_33through64 emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_1or2: MOVB (CX), SI MOVB -1(CX)(BX*1), CL MOVB SI, (AX) MOVB CL, -1(AX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_3: MOVW (CX), SI MOVB 2(CX), CL MOVW SI, (AX) MOVB CL, 2(AX) JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_4through7: MOVL (CX), SI MOVL -4(CX)(BX*1), CX MOVL SI, (AX) MOVL CX, -4(AX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_8through16: MOVQ (CX), SI MOVQ -8(CX)(BX*1), CX MOVQ SI, (AX) MOVQ CX, -8(AX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_17through32: MOVOU (CX), X0 MOVOU -16(CX)(BX*1), X1 MOVOU X0, (AX) MOVOU X1, -16(AX)(BX*1) JMP 
memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm12B_memmove_move_33through64:
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)

memmove_end_copy_emit_remainder_encodeBetterBlockAsm12B:
	MOVQ DX, AX
	JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm12B

memmove_long_emit_remainder_encodeBetterBlockAsm12B:
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveLong
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	MOVQ BX, DI
	SHRQ $0x05, DI
	MOVQ AX, SI
	ANDL $0x0000001f, SI
	MOVQ $0x00000040, R8
	SUBQ SI, R8
	DECQ DI
	JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_forward_sse_loop_32
	LEAQ -32(CX)(R8*1), SI
	LEAQ -32(AX)(R8*1), R9

emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_big_loop_back:
	MOVOU (SI), X4
	MOVOU 16(SI), X5
	MOVOA X4, (R9)
	MOVOA X5, 16(R9)
	ADDQ $0x20, R9
	ADDQ $0x20, SI
	ADDQ $0x20, R8
	DECQ DI
	JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_big_loop_back

emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_forward_sse_loop_32:
	MOVOU -32(CX)(R8*1), X4
	MOVOU -16(CX)(R8*1), X5
	MOVOA X4, -32(AX)(R8*1)
	MOVOA X5, -16(AX)(R8*1)
	ADDQ $0x20, R8
	CMPQ BX, R8
	JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm12Blarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)
	MOVQ DX, AX

emit_literal_done_emit_remainder_encodeBetterBlockAsm12B:
	MOVQ dst_base+0(FP), CX
	SUBQ CX, AX
	MOVQ AX, ret+48(FP)
	RET

// func encodeBetterBlockAsm10B(dst []byte, src []byte) int
// Requires: BMI, SSE2
//
// encodeBetterBlockAsm10B encodes src into dst and stores the number of bytes
// written in ret. It stores 0 instead whenever the output position would reach
// the limit kept at (SP).
//
// Stack frame ($20504 bytes):
//   (SP)       8-byte dst limit: dst_base + src_len - 6 - (src_len >> 5)
//   8(SP)      src_len - 8, the position at which searching stops
//   12(SP)     src position up to which input has already been emitted
//   16(SP)     offset of the previous match, compared against to detect repeats
//   20(SP)     next position to search from if the current one yields no match
//   24(SP)     4096 x 4-byte positions, indexed by a 12-bit hash of 6 input bytes
//   16408(SP)  1024 x 4-byte positions, indexed by a 10-bit hash of 4 input bytes
//
// Registers held across the main loop: AX = dst write pointer, CX = current src
// position, DX = src base pointer.
TEXT ·encodeBetterBlockAsm10B(SB), $20504-56
	MOVQ dst_base+0(FP), AX

	// Zero both tables: 0xa0 iterations of 128 bytes = 20480 bytes from 24(SP).
	MOVQ $0x000000a0, CX
	LEAQ 24(SP), DX
	PXOR X0, X0

zero_loop_encodeBetterBlockAsm10B:
	MOVOU X0, (DX)
	MOVOU X0, 16(DX)
	MOVOU X0, 32(DX)
	MOVOU X0, 48(DX)
	MOVOU X0, 64(DX)
	MOVOU X0, 80(DX)
	MOVOU X0, 96(DX)
	MOVOU X0, 112(DX)
	ADDQ $0x80, DX
	DECQ CX
	JNZ zero_loop_encodeBetterBlockAsm10B

	// Initialise the bookkeeping slots described in the header comment.
	// Searching starts at src position 1 with no previous match offset.
	MOVL $0x00000000, 12(SP)
	MOVQ src_len+32(FP), CX
	LEAQ -6(CX), DX
	LEAQ -8(CX), BX
	MOVL BX, 8(SP)
	SHRQ $0x05, CX
	SUBL CX, DX
	LEAQ (AX)(DX*1), DX
	MOVQ DX, (SP)
	MOVL $0x00000001, CX
	MOVL $0x00000000, 16(SP)
	MOVQ src_base+24(FP), DX

search_loop_encodeBetterBlockAsm10B:
	// Next search position = CX + 1 + ((CX - 12(SP)) >> 5): the step grows with
	// the number of bytes scanned since the last emit. Stop searching once that
	// position reaches 8(SP).
	MOVL CX, BX
	SUBL 12(SP), BX
	SHRL $0x05, BX
	LEAL 1(CX)(BX*1), BX
	CMPL BX, 8(SP)
	JGE emit_remainder_encodeBetterBlockAsm10B

	// SI = the 8 bytes at the current position.
	MOVQ (DX)(CX*1), SI
	MOVL BX, 20(SP)

	// R9 = 12-bit hash of the low 6 bytes of SI (SHLQ 16 drops the top two),
	// R10 = 10-bit hash of the low 4 bytes of SI.
	MOVQ $0x0000cf1bbcdcbf9b, R8
	MOVQ $0x9e3779b1, BX
	MOVQ SI, R9
	MOVQ SI, R10
	SHLQ $0x10, R9
	IMULQ R8, R9
	SHRQ $0x34, R9
	SHLQ $0x20, R10
	IMULQ BX, R10
	SHRQ $0x36, R10

	// Fetch the previous position stored under each hash (BX from the 6-byte
	// table, DI from the 4-byte table) and record the current position in both.
	MOVL 24(SP)(R9*4), BX
	MOVL 16408(SP)(R10*4), DI
	MOVL CX, 24(SP)(R9*4)
	MOVL CX, 16408(SP)(R10*4)

	// Prefer a candidate whose full 8 bytes equal the current 8 bytes.
	MOVQ (DX)(BX*1), R9
	MOVQ (DX)(DI*1), R10
	CMPQ R9, SI
	JEQ candidate_match_encodeBetterBlockAsm10B
	CMPQ R10, SI
	JNE no_short_found_encodeBetterBlockAsm10B
	MOVL DI, BX
	JMP candidate_match_encodeBetterBlockAsm10B

no_short_found_encodeBetterBlockAsm10B:
	// Fall back to comparing only the low 4 bytes of each candidate.
	CMPL R9, SI
	JEQ candidate_match_encodeBetterBlockAsm10B
	CMPL R10, SI
	JEQ candidateS_match_encodeBetterBlockAsm10B

	// No candidate matched: continue from the saved next position.
	MOVL 20(SP), CX
	JMP search_loop_encodeBetterBlockAsm10B

candidateS_match_encodeBetterBlockAsm10B:
	// The 4-byte-table candidate (DI) matched on 4 bytes. Before accepting it,
	// probe the 6-byte table for position CX+1 (SI shifted right by one byte);
	// if that entry matches on 4 bytes, use it with CX advanced by one,
	// otherwise restore CX and use DI.
	SHRQ $0x08, SI
	MOVQ SI, R9
	SHLQ $0x10, R9
	IMULQ R8, R9
	SHRQ $0x34, R9
	MOVL 24(SP)(R9*4), BX
	INCL CX
	MOVL CX, 24(SP)(R9*4)
	CMPL (DX)(BX*1), SI
	JEQ candidate_match_encodeBetterBlockAsm10B
	DECL CX
	MOVL DI, BX

candidate_match_encodeBetterBlockAsm10B:
	// Extend the match backwards while the preceding bytes are equal, without
	// moving CX below 12(SP) or BX below 0.
	MOVL 12(SP), SI
	TESTL BX, BX
	JZ match_extend_back_end_encodeBetterBlockAsm10B

match_extend_back_loop_encodeBetterBlockAsm10B:
	CMPL CX, SI
	JLE match_extend_back_end_encodeBetterBlockAsm10B
	MOVB -1(DX)(BX*1), DI
	MOVB -1(DX)(CX*1), R8
	CMPB DI, R8
	JNE match_extend_back_end_encodeBetterBlockAsm10B
	LEAL -1(CX), CX
	DECL BX
	JZ match_extend_back_end_encodeBetterBlockAsm10B
	JMP match_extend_back_loop_encodeBetterBlockAsm10B

match_extend_back_end_encodeBetterBlockAsm10B:
	// Return 0 unless AX + (pending literal bytes) + 3 is below the dst limit.
	MOVL CX, SI
	SUBL 12(SP), SI
	LEAQ 3(AX)(SI*1), SI
	CMPQ SI, (SP)
	JL match_dst_size_check_encodeBetterBlockAsm10B
	MOVQ $0x00000000, ret+48(FP)
	RET

match_dst_size_check_encodeBetterBlockAsm10B:
	// SI = match start. Skip 4 bytes on both sides, then measure how much
	// further the two positions agree: DI = bytes left in src, R8/R9 = the two
	// pointers, R11 = number of additional matching bytes.
	MOVL CX, SI
	ADDL $0x04, CX
	ADDL $0x04, BX
	MOVQ src_len+32(FP), DI
	SUBL CX, DI
	LEAQ (DX)(CX*1), R8
	LEAQ (DX)(BX*1), R9

	// matchLen
	XORL R11, R11
	CMPL DI, $0x08
	JL matchlen_match4_match_nolit_encodeBetterBlockAsm10B

matchlen_loopback_match_nolit_encodeBetterBlockAsm10B:
	// Compare 8 bytes at a time; on a difference the index of the lowest set
	// bit of the XOR, divided by 8, is the count of equal leading bytes.
	MOVQ (R8)(R11*1), R10
	XORQ (R9)(R11*1), R10
	TESTQ R10, R10
	JZ matchlen_loop_match_nolit_encodeBetterBlockAsm10B

#ifdef GOAMD64_v3
	TZCNTQ R10, R10

#else
	BSFQ R10, R10

#endif
	SARQ $0x03, R10
	LEAL (R11)(R10*1), R11
	JMP match_nolit_end_encodeBetterBlockAsm10B

matchlen_loop_match_nolit_encodeBetterBlockAsm10B:
	LEAL -8(DI), DI
	LEAL 8(R11), R11
	CMPL DI, $0x08
	JGE matchlen_loopback_match_nolit_encodeBetterBlockAsm10B
	JZ match_nolit_end_encodeBetterBlockAsm10B

matchlen_match4_match_nolit_encodeBetterBlockAsm10B:
	// Fewer than 8 bytes remain: try 4, then 2, then 1.
	CMPL DI, $0x04
	JL matchlen_match2_match_nolit_encodeBetterBlockAsm10B
	MOVL (R8)(R11*1), R10
	CMPL (R9)(R11*1), R10
	JNE matchlen_match2_match_nolit_encodeBetterBlockAsm10B
	SUBL $0x04, DI
	LEAL 4(R11), R11

matchlen_match2_match_nolit_encodeBetterBlockAsm10B:
	CMPL DI, $0x02
	JL matchlen_match1_match_nolit_encodeBetterBlockAsm10B
	MOVW (R8)(R11*1), R10
	CMPW (R9)(R11*1), R10
	JNE matchlen_match1_match_nolit_encodeBetterBlockAsm10B
	SUBL $0x02, DI
	LEAL 2(R11), R11

matchlen_match1_match_nolit_encodeBetterBlockAsm10B:
	CMPL DI, $0x01
	JL match_nolit_end_encodeBetterBlockAsm10B
	MOVB (R8)(R11*1), R10
	CMPB (R9)(R11*1), R10
	JNE match_nolit_end_encodeBetterBlockAsm10B
	LEAL 1(R11), R11

match_nolit_end_encodeBetterBlockAsm10B:
	// DI = match offset (current position minus candidate position).
	MOVL CX, DI
	SUBL BX, DI

	// Check if repeat
	CMPL 16(SP), DI
	JEQ match_is_repeat_encodeBetterBlockAsm10B
	MOVL DI, 16(SP)

	// Emit the pending literal bytes [12(SP), SI), if any. R8 = byte count,
	// R9 = source pointer, BX = count - 1 (the value written in the header).
	MOVL 12(SP), BX
	CMPL BX, SI
	JEQ emit_literal_done_match_emit_encodeBetterBlockAsm10B
	MOVL SI, R8
	MOVL SI, 12(SP)
	LEAQ (DX)(BX*1), R9
	SUBL BX, R8
	LEAL -1(R8), BX

	// Header size depends on count-1: below 60 it is one byte ((count-1)<<2),
	// below 256 it is 0xf0 plus one byte, otherwise 0xf4 plus 16 bits.
	CMPL BX, $0x3c
	JLT one_byte_match_emit_encodeBetterBlockAsm10B
	CMPL BX, $0x00000100
	JLT two_bytes_match_emit_encodeBetterBlockAsm10B
	MOVB $0xf4, (AX)
	MOVW BX, 1(AX)
	ADDQ $0x03, AX
	JMP memmove_long_match_emit_encodeBetterBlockAsm10B

two_bytes_match_emit_encodeBetterBlockAsm10B:
	MOVB $0xf0, (AX)
	MOVB BL, 1(AX)
	ADDQ $0x02, AX
	CMPL BX, $0x40
	JL memmove_match_emit_encodeBetterBlockAsm10B
	JMP memmove_long_match_emit_encodeBetterBlockAsm10B

one_byte_match_emit_encodeBetterBlockAsm10B:
	SHLB $0x02, BL
	MOVB BL, (AX)
	ADDQ $0x01, AX

memmove_match_emit_encodeBetterBlockAsm10B:
	// Copy R8 (at most 64) bytes from (R9) to (AX); BX = dst pointer after the
	// copy. Each size class uses loads/stores anchored at the start and at the
	// end of the range, which may overlap in the middle.
	LEAQ (AX)(R8*1), BX

	// genMemMoveShort
	CMPQ R8, $0x04
	JLE emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_4
	CMPQ R8, $0x08
	JB emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_4through7
	CMPQ R8, $0x10
	JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_8through16
	CMPQ R8, $0x20
	JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_17through32
	JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_33through64

emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_4:
	// A fixed 4 bytes are moved even when R8 is smaller; AX is set from BX
	// afterwards, so only R8 of them count as output.
	MOVL (R9), R10
	MOVL R10, (AX)
	JMP memmove_end_copy_match_emit_encodeBetterBlockAsm10B

emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_4through7:
	MOVL (R9), R10
	MOVL -4(R9)(R8*1), R9
	MOVL R10, (AX)
	MOVL R9, -4(AX)(R8*1)
	JMP memmove_end_copy_match_emit_encodeBetterBlockAsm10B

emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_8through16:
	MOVQ (R9), R10
	MOVQ -8(R9)(R8*1), R9
	MOVQ R10, (AX)
	MOVQ R9, -8(AX)(R8*1)
	JMP memmove_end_copy_match_emit_encodeBetterBlockAsm10B

emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_17through32:
	MOVOU (R9), X0
	MOVOU -16(R9)(R8*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(R8*1)
	JMP memmove_end_copy_match_emit_encodeBetterBlockAsm10B

emit_lit_memmove_match_emit_encodeBetterBlockAsm10B_memmove_move_33through64:
	MOVOU (R9), X0
	MOVOU 16(R9), X1
	MOVOU -32(R9)(R8*1), X2
	MOVOU -16(R9)(R8*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)

memmove_end_copy_match_emit_encodeBetterBlockAsm10B:
	MOVQ BX, AX
	JMP emit_literal_done_match_emit_encodeBetterBlockAsm10B

memmove_long_match_emit_encodeBetterBlockAsm10B:
	// Copy R8 (more than 64 on every path that jumps here) bytes from (R9) to
	// (AX). The first and last 32 bytes are loaded up front and stored last;
	// the middle is moved in 32-byte steps with aligned stores, starting at
	// AX + 32 - (AX & 31), which is 32-byte aligned.
	LEAQ (AX)(R8*1), BX

	// genMemMoveLong
	MOVOU (R9), X0
	MOVOU 16(R9), X1
	MOVOU -32(R9)(R8*1), X2
	MOVOU -16(R9)(R8*1), X3
	MOVQ R8, R12
	SHRQ $0x05, R12
	MOVQ AX, R10
	ANDL $0x0000001f, R10
	MOVQ $0x00000040, R13
	SUBQ R10, R13
	DECQ R12
	JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_forward_sse_loop_32
	LEAQ -32(R9)(R13*1), R10
	LEAQ -32(AX)(R13*1), R14

emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_big_loop_back:
	MOVOU (R10), X4
	MOVOU 16(R10), X5
	MOVOA X4, (R14)
	MOVOA X5, 16(R14)
	ADDQ $0x20, R14
	ADDQ $0x20, R10
	ADDQ $0x20, R13
	DECQ R12
	JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_big_loop_back

emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_forward_sse_loop_32:
	MOVOU -32(R9)(R13*1), X4
	MOVOU -16(R9)(R13*1), X5
	MOVOA X4, -32(AX)(R13*1)
	MOVOA X5, -16(AX)(R13*1)
	ADDQ $0x20, R13
	CMPQ R8, R13
	JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsm10Blarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)
	MOVQ BX, AX

emit_literal_done_match_emit_encodeBetterBlockAsm10B:
	// Advance CX to the end of the match and record it as the new emit
	// position. R11 becomes the full match length (the 4 skipped bytes plus
	// the measured extension); DI still holds the offset.
	ADDL R11, CX
	ADDL $0x04, R11
	MOVL CX, 12(SP)

	// emitCopy
	// Length <= 64 is encoded as a single copy. Longer matches emit a first
	// copy (length 8 with an 11-bit offset, or 0xee + 16-bit offset covering
	// 60 bytes) and encode the rest through the emitRepeat sequences below.
	CMPL R11, $0x40
	JLE two_byte_offset_short_match_nolit_encodeBetterBlockAsm10B
	CMPL DI, $0x00000800
	JAE long_offset_short_match_nolit_encodeBetterBlockAsm10B
	MOVL $0x00000001, BX
	LEAL 16(BX), BX
	MOVB DI, 1(AX)
	SHRL $0x08, DI
	SHLL $0x05, DI
	ORL DI, BX
	MOVB BL, (AX)
	ADDQ $0x02, AX
	SUBL $0x08, R11

	// emitRepeat
	LEAL -4(R11), R11
	JMP cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b

	// The instructions from here to the next label follow an unconditional
	// JMP with no label in between, so they cannot be reached.
	MOVL R11, BX
	LEAL -4(R11), R11
	CMPL BX, $0x08
	JLE repeat_two_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b
	CMPL BX, $0x0c
	JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b
	CMPL DI, $0x00000800
	JLT repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b

cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b:
	// R11 = remaining length - 4. Below 260 a 3-byte form is written (word
	// 0x0015 then one byte holding R11 - 4); otherwise a 4-byte form (word
	// 0x0019 then 16 bits holding R11 - 256).
	CMPL R11, $0x00000104
	JLT repeat_three_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b
	LEAL -256(R11), R11
	MOVW $0x0019, (AX)
	MOVW R11, 2(AX)
	ADDQ $0x04, AX
	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B

repeat_three_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b:
	LEAL -4(R11), R11
	MOVW $0x0015, (AX)
	MOVB R11, 2(AX)
	ADDQ $0x03, AX
	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B

repeat_two_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b:
	SHLL $0x02, R11
	ORL $0x01, R11
	MOVW R11, (AX)
	ADDQ $0x02, AX
	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B

repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short_2b:
	XORQ BX, BX
	LEAL 1(BX)(R11*4), R11
	MOVB DI, 1(AX)
	SARL $0x08, DI
	SHLL $0x05, DI
	ORL DI, R11
	MOVB R11, (AX)
	ADDQ $0x02, AX
	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B

long_offset_short_match_nolit_encodeBetterBlockAsm10B:
	// Length > 64 and offset >= 2048: write 0xee followed by the 16-bit offset,
	// subtract 60 from the length, and encode what is left as a repeat.
	MOVB $0xee, (AX)
	MOVW DI, 1(AX)
	LEAL -60(R11), R11
	ADDQ $0x03, AX

	// emitRepeat
	// BX = remaining length, R11 = remaining length - 4. Length <= 8 uses the
	// 2-byte form; length < 12 with offset < 2048 uses the 2-byte form that
	// also carries the offset; everything else uses the 3- or 4-byte form.
	MOVL R11, BX
	LEAL -4(R11), R11
	CMPL BX, $0x08
	JLE repeat_two_match_nolit_encodeBetterBlockAsm10B_emit_copy_short
	CMPL BX, $0x0c
	JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short
	CMPL DI, $0x00000800
	JLT repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short

cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short:
	CMPL R11, $0x00000104
	JLT repeat_three_match_nolit_encodeBetterBlockAsm10B_emit_copy_short
	LEAL -256(R11), R11
	MOVW $0x0019, (AX)
	MOVW R11, 2(AX)
	ADDQ $0x04, AX
	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B

repeat_three_match_nolit_encodeBetterBlockAsm10B_emit_copy_short:
	LEAL -4(R11), R11
	MOVW $0x0015, (AX)
	MOVB R11, 2(AX)
	ADDQ $0x03, AX
	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B

repeat_two_match_nolit_encodeBetterBlockAsm10B_emit_copy_short:
	SHLL $0x02, R11
	ORL $0x01, R11
	MOVW R11, (AX)
	ADDQ $0x02, AX
	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B

repeat_two_offset_match_nolit_encodeBetterBlockAsm10B_emit_copy_short:
	XORQ BX, BX
	LEAL 1(BX)(R11*4), R11
	MOVB DI, 1(AX)
	SARL $0x08, DI
	SHLL $0x05, DI
	ORL DI, R11
	MOVB R11, (AX)
	ADDQ $0x02, AX
	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B

two_byte_offset_short_match_nolit_encodeBetterBlockAsm10B:
	// Length <= 64. BX = length << 2. With length < 12 and offset < 2048 the
	// copy takes 2 bytes (length and the high offset bits in the first byte,
	// low offset byte second); otherwise 3 bytes (first byte, 16-bit offset).
	MOVL R11, BX
	SHLL $0x02, BX
	CMPL R11, $0x0c
	JGE emit_copy_three_match_nolit_encodeBetterBlockAsm10B
	CMPL DI, $0x00000800
	JGE emit_copy_three_match_nolit_encodeBetterBlockAsm10B
	LEAL -15(BX), BX
	MOVB DI, 1(AX)
	SHRL $0x08, DI
	SHLL $0x05, DI
	ORL DI, BX
	MOVB BL, (AX)
	ADDQ $0x02, AX
	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B

emit_copy_three_match_nolit_encodeBetterBlockAsm10B:
	LEAL -2(BX), BX
	MOVB BL, (AX)
	MOVW DI, 1(AX)
	ADDQ $0x03, AX
	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B

match_is_repeat_encodeBetterBlockAsm10B:
	// The offset equals the previous one (16(SP)). Emit pending literals the
	// same way as on the non-repeat path, then encode the match as a repeat.
	MOVL 12(SP), BX
	CMPL BX, SI
	JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm10B
	MOVL SI, R8
	MOVL SI, 12(SP)
	LEAQ (DX)(BX*1), R9
	SUBL BX, R8
	LEAL -1(R8), BX
	CMPL BX, $0x3c
	JLT one_byte_match_emit_repeat_encodeBetterBlockAsm10B
	CMPL BX, $0x00000100
	JLT two_bytes_match_emit_repeat_encodeBetterBlockAsm10B
	MOVB $0xf4, (AX)
	MOVW BX, 1(AX)
	ADDQ $0x03, AX
	JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm10B

two_bytes_match_emit_repeat_encodeBetterBlockAsm10B:
	MOVB $0xf0, (AX)
	MOVB BL, 1(AX)
	ADDQ $0x02, AX
	CMPL BX, $0x40
	JL memmove_match_emit_repeat_encodeBetterBlockAsm10B
	JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm10B

one_byte_match_emit_repeat_encodeBetterBlockAsm10B:
	SHLB $0x02, BL
	MOVB BL, (AX)
	ADDQ $0x01, AX

memmove_match_emit_repeat_encodeBetterBlockAsm10B:
	// Copy R8 (at most 64) bytes from (R9) to (AX); BX = dst pointer after.
	LEAQ (AX)(R8*1), BX

	// genMemMoveShort
	CMPQ R8, $0x04
	JLE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_4
	CMPQ R8, $0x08
	JB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_4through7
	CMPQ R8, $0x10
	JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_8through16
	CMPQ R8, $0x20
	JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_17through32
	JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_33through64

emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_4:
	MOVL (R9), R10
	MOVL R10, (AX)
	JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B

emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_4through7:
	MOVL (R9), R10
	MOVL -4(R9)(R8*1), R9
	MOVL R10, (AX)
	MOVL R9, -4(AX)(R8*1)
	JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B

emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_8through16:
	MOVQ (R9), R10
	MOVQ -8(R9)(R8*1), R9
	MOVQ R10, (AX)
	MOVQ R9, -8(AX)(R8*1)
	JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B

emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_17through32:
	MOVOU (R9), X0
	MOVOU -16(R9)(R8*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(R8*1)
	JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B

emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm10B_memmove_move_33through64:
	MOVOU (R9), X0
	MOVOU 16(R9), X1
	MOVOU -32(R9)(R8*1), X2
	MOVOU -16(R9)(R8*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)

memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm10B:
	MOVQ BX, AX
	JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm10B

memmove_long_match_emit_repeat_encodeBetterBlockAsm10B:
	// Copy R8 (more than 64) bytes from (R9) to (AX) in 32-byte steps with
	// aligned stores; the unaligned head and tail are stored last.
	LEAQ (AX)(R8*1), BX

	// genMemMoveLong
	MOVOU (R9), X0
	MOVOU 16(R9), X1
	MOVOU -32(R9)(R8*1), X2
	MOVOU -16(R9)(R8*1), X3
	MOVQ R8, R12
	SHRQ $0x05, R12
	MOVQ AX, R10
	ANDL $0x0000001f, R10
	MOVQ $0x00000040, R13
	SUBQ R10, R13
	DECQ R12
	JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_forward_sse_loop_32
	LEAQ -32(R9)(R13*1), R10
	LEAQ -32(AX)(R13*1), R14

emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_big_loop_back:
	MOVOU (R10), X4
	MOVOU 16(R10), X5
	MOVOA X4, (R14)
	MOVOA X5, 16(R14)
	ADDQ $0x20, R14
	ADDQ $0x20, R10
	ADDQ $0x20, R13
	DECQ R12
	JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_big_loop_back

emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_forward_sse_loop_32:
	MOVOU -32(R9)(R13*1), X4
	MOVOU -16(R9)(R13*1), X5
	MOVOA X4, -32(AX)(R13*1)
	MOVOA X5, -16(AX)(R13*1)
	ADDQ $0x20, R13
	CMPQ R8, R13
	JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm10Blarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)
	MOVQ BX, AX

emit_literal_done_match_emit_repeat_encodeBetterBlockAsm10B:
	// Advance CX past the match, record the new emit position, and encode the
	// whole match length (R11 + 4) as a repeat of the previous offset.
	ADDL R11, CX
	ADDL $0x04, R11
	MOVL CX, 12(SP)

	// emitRepeat
	MOVL R11, BX
	LEAL -4(R11), R11
	CMPL BX, $0x08
	JLE repeat_two_match_nolit_repeat_encodeBetterBlockAsm10B
	CMPL BX, $0x0c
	JGE cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm10B
	CMPL DI, $0x00000800
	JLT repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm10B

cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm10B:
	CMPL R11, $0x00000104
	JLT repeat_three_match_nolit_repeat_encodeBetterBlockAsm10B
	LEAL -256(R11), R11
	MOVW $0x0019, (AX)
	MOVW R11, 2(AX)
	ADDQ $0x04, AX
	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B

repeat_three_match_nolit_repeat_encodeBetterBlockAsm10B:
	LEAL -4(R11), R11
	MOVW $0x0015, (AX)
	MOVB R11, 2(AX)
	ADDQ $0x03, AX
	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B

repeat_two_match_nolit_repeat_encodeBetterBlockAsm10B:
	SHLL $0x02, R11
	ORL $0x01, R11
	MOVW R11, (AX)
	ADDQ $0x02, AX
	JMP match_nolit_emitcopy_end_encodeBetterBlockAsm10B

repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm10B:
	XORQ BX, BX
	LEAL 1(BX)(R11*4), R11
	MOVB DI, 1(AX)
	SARL $0x08, DI
	SHLL $0x05, DI
	ORL DI, R11
	MOVB R11, (AX)
	ADDQ $0x02, AX

match_nolit_emitcopy_end_encodeBetterBlockAsm10B:
	// Finish with the remainder if CX reached the search limit; return 0 if
	// the dst pointer reached its limit.
	CMPL CX, 8(SP)
	JGE emit_remainder_encodeBetterBlockAsm10B
	CMPQ AX, (SP)
	JL match_nolit_dst_ok_encodeBetterBlockAsm10B
	MOVQ $0x00000000, ret+48(FP)
	RET

match_nolit_dst_ok_encodeBetterBlockAsm10B:
	// Index positions inside the match just emitted. SI = match start + 1 and
	// R8 = match end - 2 go into the 6-byte-hash table; SI + 1 and R8 + 1 go
	// into the 4-byte-hash table, each hashed from the bytes at that position.
	MOVQ $0x0000cf1bbcdcbf9b, BX
	MOVQ $0x9e3779b1, DI
	LEAQ 1(SI), SI
	LEAQ -2(CX), R8
	MOVQ (DX)(SI*1), R9
	MOVQ 1(DX)(SI*1), R10
	MOVQ (DX)(R8*1), R11
	MOVQ 1(DX)(R8*1), R12
	SHLQ $0x10, R9
	IMULQ BX, R9
	SHRQ $0x34, R9
	SHLQ $0x20, R10
	IMULQ DI, R10
	SHRQ $0x36, R10
	SHLQ $0x10, R11
	IMULQ BX, R11
	SHRQ $0x34, R11
	SHLQ $0x20, R12
	IMULQ DI, R12
	SHRQ $0x36, R12
	LEAQ 1(SI), DI
	LEAQ 1(R8), R13
	MOVL SI, 24(SP)(R9*4)
	MOVL R8, 24(SP)(R11*4)
	MOVL DI, 16408(SP)(R10*4)
	MOVL R13, 16408(SP)(R12*4)
	ADDQ $0x01, SI
	SUBQ $0x01, R8

index_loop_encodeBetterBlockAsm10B:
	// Walk inwards from both ends in steps of 2, adding each position to the
	// 6-byte-hash table, until the two cursors meet; then resume searching.
	CMPQ SI, R8
	JAE search_loop_encodeBetterBlockAsm10B
	MOVQ (DX)(SI*1), DI
	MOVQ (DX)(R8*1), R9
	SHLQ $0x10, DI
	IMULQ BX, DI
	SHRQ $0x34, DI
	SHLQ $0x10, R9
	IMULQ BX, R9
	SHRQ $0x34, R9
	MOVL SI, 24(SP)(DI*4)
	MOVL R8, 24(SP)(R9*4)
	ADDQ $0x02, SI
	SUBQ $0x02, R8
	JMP index_loop_encodeBetterBlockAsm10B

emit_remainder_encodeBetterBlockAsm10B:
	// Return 0 unless AX + (src_len - 12(SP)) + 3 is below the dst limit.
	MOVQ src_len+32(FP), CX
	SUBL 12(SP), CX
	LEAQ 3(AX)(CX*1), CX
	CMPQ CX, (SP)
	JL emit_remainder_ok_encodeBetterBlockAsm10B
	MOVQ $0x00000000, ret+48(FP)
	RET

emit_remainder_ok_encodeBetterBlockAsm10B:
	// Emit everything from 12(SP) to the end of src as one literal run.
	// From here CX is the source pointer, SI the byte count and DX count - 1
	// (DX no longer holds the src base).
	MOVQ src_len+32(FP), CX
	MOVL 12(SP), BX
	CMPL BX, CX
	JEQ emit_literal_done_emit_remainder_encodeBetterBlockAsm10B
	MOVL CX, SI
	MOVL CX, 12(SP)
	LEAQ (DX)(BX*1), CX
	SUBL BX, SI
	LEAL -1(SI), DX
	CMPL DX, $0x3c
	JLT one_byte_emit_remainder_encodeBetterBlockAsm10B
	CMPL DX, $0x00000100
	JLT two_bytes_emit_remainder_encodeBetterBlockAsm10B
	MOVB $0xf4, (AX)
	MOVW DX, 1(AX)
	ADDQ $0x03, AX
	JMP memmove_long_emit_remainder_encodeBetterBlockAsm10B

two_bytes_emit_remainder_encodeBetterBlockAsm10B:
	MOVB $0xf0, (AX)
	MOVB DL, 1(AX)
	ADDQ $0x02, AX
	CMPL DX, $0x40
	JL memmove_emit_remainder_encodeBetterBlockAsm10B
	JMP memmove_long_emit_remainder_encodeBetterBlockAsm10B

one_byte_emit_remainder_encodeBetterBlockAsm10B:
	SHLB $0x02, DL
	MOVB DL, (AX)
	ADDQ $0x01, AX

memmove_emit_remainder_encodeBetterBlockAsm10B:
	// Copy BX (at most 64) bytes from (CX) to (AX); DX = dst pointer after.
	// Unlike the in-loop copies, sizes 1-3 get exact-width cases here.
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveShort
	CMPQ BX, $0x03
	JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_1or2
	JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_3
	CMPQ BX, $0x08
	JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_4through7
	CMPQ BX, $0x10
	JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_8through16
	CMPQ BX, $0x20
	JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_17through32
	JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_33through64

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_1or2:
	MOVB (CX), SI
	MOVB -1(CX)(BX*1), CL
	MOVB SI, (AX)
	MOVB CL, -1(AX)(BX*1)
	JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_3:
	MOVW (CX), SI
	MOVB 2(CX), CL
	MOVW SI, (AX)
	MOVB CL, 2(AX)
	JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_4through7:
	MOVL (CX), SI
	MOVL -4(CX)(BX*1), CX
	MOVL SI, (AX)
	MOVL CX, -4(AX)(BX*1)
	JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_8through16:
	MOVQ (CX), SI
	MOVQ -8(CX)(BX*1), CX
	MOVQ SI, (AX)
	MOVQ CX, -8(AX)(BX*1)
	JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_17through32:
	MOVOU (CX), X0
	MOVOU -16(CX)(BX*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(BX*1)
	JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B

emit_lit_memmove_emit_remainder_encodeBetterBlockAsm10B_memmove_move_33through64:
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)

memmove_end_copy_emit_remainder_encodeBetterBlockAsm10B:
	MOVQ DX, AX
	JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm10B

memmove_long_emit_remainder_encodeBetterBlockAsm10B:
	// Copy BX (more than 64) bytes from (CX) to (AX) in 32-byte steps with
	// aligned stores; the unaligned head and tail are stored last.
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveLong
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	MOVQ BX, DI
	SHRQ $0x05, DI
	MOVQ AX, SI
	ANDL $0x0000001f, SI
	MOVQ $0x00000040, R8
	SUBQ SI, R8
	DECQ DI
	JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_forward_sse_loop_32
	LEAQ -32(CX)(R8*1), SI
	LEAQ -32(AX)(R8*1), R9

emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_big_loop_back:
	MOVOU (SI), X4
	MOVOU 16(SI), X5
	MOVOA X4, (R9)
	MOVOA X5, 16(R9)
	ADDQ $0x20, R9
	ADDQ $0x20, SI
	ADDQ $0x20, R8
	DECQ DI
	JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_big_loop_back

emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_forward_sse_loop_32:
	MOVOU -32(CX)(R8*1), X4
	MOVOU -16(CX)(R8*1), X5
	MOVOA X4, -32(AX)(R8*1)
	MOVOA X5, -16(AX)(R8*1)
	ADDQ $0x20, R8
	CMPQ BX, R8
	JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm10Blarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)
	MOVQ DX, AX

emit_literal_done_emit_remainder_encodeBetterBlockAsm10B:
	// Result = number of bytes written = final dst pointer - dst_base.
	MOVQ dst_base+0(FP), CX
	SUBQ CX, AX
	MOVQ AX, ret+48(FP)
	RET

// func encodeBetterBlockAsm8B(dst []byte, src []byte) int
// Requires: BMI, SSE2
TEXT ·encodeBetterBlockAsm8B(SB), $5144-56
	MOVQ dst_base+0(FP), AX
	MOVQ $0x00000028, CX
	LEAQ 24(SP), DX
	PXOR X0, X0

zero_loop_encodeBetterBlockAsm8B:
	MOVOU X0, (DX)
	MOVOU X0, 16(DX)
	MOVOU X0, 32(DX)
	MOVOU X0, 48(DX)
	MOVOU X0, 64(DX)
	MOVOU X0, 80(DX)
	MOVOU X0, 96(DX)
	MOVOU X0, 112(DX)
	ADDQ $0x80, DX
	DECQ CX
	JNZ zero_loop_encodeBetterBlockAsm8B
	MOVL $0x00000000, 12(SP)
	MOVQ src_len+32(FP), CX
	LEAQ -6(CX), DX
	LEAQ -8(CX), BX
	MOVL BX, 8(SP)
	SHRQ $0x05, CX
	SUBL CX, DX
	LEAQ (AX)(DX*1), DX
	MOVQ DX, (SP)
	MOVL $0x00000001, CX
	MOVL $0x00000000, 16(SP)
	MOVQ src_base+24(FP), DX

search_loop_encodeBetterBlockAsm8B:
	MOVL CX, BX
	SUBL 12(SP), BX
	SHRL $0x04, BX
	LEAL 1(CX)(BX*1), BX
	CMPL BX, 8(SP)
	JGE emit_remainder_encodeBetterBlockAsm8B
	MOVQ (DX)(CX*1), SI
	MOVL BX,
20(SP) MOVQ $0x0000cf1bbcdcbf9b, R8 MOVQ $0x9e3779b1, BX MOVQ SI, R9 MOVQ SI, R10 SHLQ $0x10, R9 IMULQ R8, R9 SHRQ $0x36, R9 SHLQ $0x20, R10 IMULQ BX, R10 SHRQ $0x38, R10 MOVL 24(SP)(R9*4), BX MOVL 4120(SP)(R10*4), DI MOVL CX, 24(SP)(R9*4) MOVL CX, 4120(SP)(R10*4) MOVQ (DX)(BX*1), R9 MOVQ (DX)(DI*1), R10 CMPQ R9, SI JEQ candidate_match_encodeBetterBlockAsm8B CMPQ R10, SI JNE no_short_found_encodeBetterBlockAsm8B MOVL DI, BX JMP candidate_match_encodeBetterBlockAsm8B no_short_found_encodeBetterBlockAsm8B: CMPL R9, SI JEQ candidate_match_encodeBetterBlockAsm8B CMPL R10, SI JEQ candidateS_match_encodeBetterBlockAsm8B MOVL 20(SP), CX JMP search_loop_encodeBetterBlockAsm8B candidateS_match_encodeBetterBlockAsm8B: SHRQ $0x08, SI MOVQ SI, R9 SHLQ $0x10, R9 IMULQ R8, R9 SHRQ $0x36, R9 MOVL 24(SP)(R9*4), BX INCL CX MOVL CX, 24(SP)(R9*4) CMPL (DX)(BX*1), SI JEQ candidate_match_encodeBetterBlockAsm8B DECL CX MOVL DI, BX candidate_match_encodeBetterBlockAsm8B: MOVL 12(SP), SI TESTL BX, BX JZ match_extend_back_end_encodeBetterBlockAsm8B match_extend_back_loop_encodeBetterBlockAsm8B: CMPL CX, SI JLE match_extend_back_end_encodeBetterBlockAsm8B MOVB -1(DX)(BX*1), DI MOVB -1(DX)(CX*1), R8 CMPB DI, R8 JNE match_extend_back_end_encodeBetterBlockAsm8B LEAL -1(CX), CX DECL BX JZ match_extend_back_end_encodeBetterBlockAsm8B JMP match_extend_back_loop_encodeBetterBlockAsm8B match_extend_back_end_encodeBetterBlockAsm8B: MOVL CX, SI SUBL 12(SP), SI LEAQ 3(AX)(SI*1), SI CMPQ SI, (SP) JL match_dst_size_check_encodeBetterBlockAsm8B MOVQ $0x00000000, ret+48(FP) RET match_dst_size_check_encodeBetterBlockAsm8B: MOVL CX, SI ADDL $0x04, CX ADDL $0x04, BX MOVQ src_len+32(FP), DI SUBL CX, DI LEAQ (DX)(CX*1), R8 LEAQ (DX)(BX*1), R9 // matchLen XORL R11, R11 CMPL DI, $0x08 JL matchlen_match4_match_nolit_encodeBetterBlockAsm8B matchlen_loopback_match_nolit_encodeBetterBlockAsm8B: MOVQ (R8)(R11*1), R10 XORQ (R9)(R11*1), R10 TESTQ R10, R10 JZ matchlen_loop_match_nolit_encodeBetterBlockAsm8B #ifdef 
GOAMD64_v3 TZCNTQ R10, R10 #else BSFQ R10, R10 #endif SARQ $0x03, R10 LEAL (R11)(R10*1), R11 JMP match_nolit_end_encodeBetterBlockAsm8B matchlen_loop_match_nolit_encodeBetterBlockAsm8B: LEAL -8(DI), DI LEAL 8(R11), R11 CMPL DI, $0x08 JGE matchlen_loopback_match_nolit_encodeBetterBlockAsm8B JZ match_nolit_end_encodeBetterBlockAsm8B matchlen_match4_match_nolit_encodeBetterBlockAsm8B: CMPL DI, $0x04 JL matchlen_match2_match_nolit_encodeBetterBlockAsm8B MOVL (R8)(R11*1), R10 CMPL (R9)(R11*1), R10 JNE matchlen_match2_match_nolit_encodeBetterBlockAsm8B SUBL $0x04, DI LEAL 4(R11), R11 matchlen_match2_match_nolit_encodeBetterBlockAsm8B: CMPL DI, $0x02 JL matchlen_match1_match_nolit_encodeBetterBlockAsm8B MOVW (R8)(R11*1), R10 CMPW (R9)(R11*1), R10 JNE matchlen_match1_match_nolit_encodeBetterBlockAsm8B SUBL $0x02, DI LEAL 2(R11), R11 matchlen_match1_match_nolit_encodeBetterBlockAsm8B: CMPL DI, $0x01 JL match_nolit_end_encodeBetterBlockAsm8B MOVB (R8)(R11*1), R10 CMPB (R9)(R11*1), R10 JNE match_nolit_end_encodeBetterBlockAsm8B LEAL 1(R11), R11 match_nolit_end_encodeBetterBlockAsm8B: MOVL CX, DI SUBL BX, DI // Check if repeat CMPL 16(SP), DI JEQ match_is_repeat_encodeBetterBlockAsm8B MOVL DI, 16(SP) MOVL 12(SP), BX CMPL BX, SI JEQ emit_literal_done_match_emit_encodeBetterBlockAsm8B MOVL SI, R8 MOVL SI, 12(SP) LEAQ (DX)(BX*1), R9 SUBL BX, R8 LEAL -1(R8), BX CMPL BX, $0x3c JLT one_byte_match_emit_encodeBetterBlockAsm8B CMPL BX, $0x00000100 JLT two_bytes_match_emit_encodeBetterBlockAsm8B MOVB $0xf4, (AX) MOVW BX, 1(AX) ADDQ $0x03, AX JMP memmove_long_match_emit_encodeBetterBlockAsm8B two_bytes_match_emit_encodeBetterBlockAsm8B: MOVB $0xf0, (AX) MOVB BL, 1(AX) ADDQ $0x02, AX CMPL BX, $0x40 JL memmove_match_emit_encodeBetterBlockAsm8B JMP memmove_long_match_emit_encodeBetterBlockAsm8B one_byte_match_emit_encodeBetterBlockAsm8B: SHLB $0x02, BL MOVB BL, (AX) ADDQ $0x01, AX memmove_match_emit_encodeBetterBlockAsm8B: LEAQ (AX)(R8*1), BX // genMemMoveShort CMPQ R8, $0x04 JLE 
emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_4 CMPQ R8, $0x08 JB emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_4through7 CMPQ R8, $0x10 JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_8through16 CMPQ R8, $0x20 JBE emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_17through32 JMP emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_33through64 emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_4: MOVL (R9), R10 MOVL R10, (AX) JMP memmove_end_copy_match_emit_encodeBetterBlockAsm8B emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_4through7: MOVL (R9), R10 MOVL -4(R9)(R8*1), R9 MOVL R10, (AX) MOVL R9, -4(AX)(R8*1) JMP memmove_end_copy_match_emit_encodeBetterBlockAsm8B emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_8through16: MOVQ (R9), R10 MOVQ -8(R9)(R8*1), R9 MOVQ R10, (AX) MOVQ R9, -8(AX)(R8*1) JMP memmove_end_copy_match_emit_encodeBetterBlockAsm8B emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_17through32: MOVOU (R9), X0 MOVOU -16(R9)(R8*1), X1 MOVOU X0, (AX) MOVOU X1, -16(AX)(R8*1) JMP memmove_end_copy_match_emit_encodeBetterBlockAsm8B emit_lit_memmove_match_emit_encodeBetterBlockAsm8B_memmove_move_33through64: MOVOU (R9), X0 MOVOU 16(R9), X1 MOVOU -32(R9)(R8*1), X2 MOVOU -16(R9)(R8*1), X3 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(R8*1) MOVOU X3, -16(AX)(R8*1) memmove_end_copy_match_emit_encodeBetterBlockAsm8B: MOVQ BX, AX JMP emit_literal_done_match_emit_encodeBetterBlockAsm8B memmove_long_match_emit_encodeBetterBlockAsm8B: LEAQ (AX)(R8*1), BX // genMemMoveLong MOVOU (R9), X0 MOVOU 16(R9), X1 MOVOU -32(R9)(R8*1), X2 MOVOU -16(R9)(R8*1), X3 MOVQ R8, R12 SHRQ $0x05, R12 MOVQ AX, R10 ANDL $0x0000001f, R10 MOVQ $0x00000040, R13 SUBQ R10, R13 DECQ R12 JA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_forward_sse_loop_32 LEAQ -32(R9)(R13*1), R10 LEAQ -32(AX)(R13*1), R14 
emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_big_loop_back: MOVOU (R10), X4 MOVOU 16(R10), X5 MOVOA X4, (R14) MOVOA X5, 16(R14) ADDQ $0x20, R14 ADDQ $0x20, R10 ADDQ $0x20, R13 DECQ R12 JNA emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_big_loop_back emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_forward_sse_loop_32: MOVOU -32(R9)(R13*1), X4 MOVOU -16(R9)(R13*1), X5 MOVOA X4, -32(AX)(R13*1) MOVOA X5, -16(AX)(R13*1) ADDQ $0x20, R13 CMPQ R8, R13 JAE emit_lit_memmove_long_match_emit_encodeBetterBlockAsm8Blarge_forward_sse_loop_32 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(R8*1) MOVOU X3, -16(AX)(R8*1) MOVQ BX, AX emit_literal_done_match_emit_encodeBetterBlockAsm8B: ADDL R11, CX ADDL $0x04, R11 MOVL CX, 12(SP) // emitCopy CMPL R11, $0x40 JLE two_byte_offset_short_match_nolit_encodeBetterBlockAsm8B CMPL DI, $0x00000800 JAE long_offset_short_match_nolit_encodeBetterBlockAsm8B MOVL $0x00000001, BX LEAL 16(BX), BX MOVB DI, 1(AX) SHRL $0x08, DI SHLL $0x05, DI ORL DI, BX MOVB BL, (AX) ADDQ $0x02, AX SUBL $0x08, R11 // emitRepeat LEAL -4(R11), R11 JMP cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b MOVL R11, BX LEAL -4(R11), R11 CMPL BX, $0x08 JLE repeat_two_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b CMPL BX, $0x0c JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b: CMPL R11, $0x00000104 JLT repeat_three_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b LEAL -256(R11), R11 MOVW $0x0019, (AX) MOVW R11, 2(AX) ADDQ $0x04, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B repeat_three_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b: LEAL -4(R11), R11 MOVW $0x0015, (AX) MOVB R11, 2(AX) ADDQ $0x03, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B repeat_two_match_nolit_encodeBetterBlockAsm8B_emit_copy_short_2b: SHLL $0x02, R11 ORL $0x01, R11 MOVW R11, (AX) ADDQ $0x02, 
AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B XORQ BX, BX LEAL 1(BX)(R11*4), R11 MOVB DI, 1(AX) SARL $0x08, DI SHLL $0x05, DI ORL DI, R11 MOVB R11, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B long_offset_short_match_nolit_encodeBetterBlockAsm8B: MOVB $0xee, (AX) MOVW DI, 1(AX) LEAL -60(R11), R11 ADDQ $0x03, AX // emitRepeat MOVL R11, BX LEAL -4(R11), R11 CMPL BX, $0x08 JLE repeat_two_match_nolit_encodeBetterBlockAsm8B_emit_copy_short CMPL BX, $0x0c JGE cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm8B_emit_copy_short cant_repeat_two_offset_match_nolit_encodeBetterBlockAsm8B_emit_copy_short: CMPL R11, $0x00000104 JLT repeat_three_match_nolit_encodeBetterBlockAsm8B_emit_copy_short LEAL -256(R11), R11 MOVW $0x0019, (AX) MOVW R11, 2(AX) ADDQ $0x04, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B repeat_three_match_nolit_encodeBetterBlockAsm8B_emit_copy_short: LEAL -4(R11), R11 MOVW $0x0015, (AX) MOVB R11, 2(AX) ADDQ $0x03, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B repeat_two_match_nolit_encodeBetterBlockAsm8B_emit_copy_short: SHLL $0x02, R11 ORL $0x01, R11 MOVW R11, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B XORQ BX, BX LEAL 1(BX)(R11*4), R11 MOVB DI, 1(AX) SARL $0x08, DI SHLL $0x05, DI ORL DI, R11 MOVB R11, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B two_byte_offset_short_match_nolit_encodeBetterBlockAsm8B: MOVL R11, BX SHLL $0x02, BX CMPL R11, $0x0c JGE emit_copy_three_match_nolit_encodeBetterBlockAsm8B LEAL -15(BX), BX MOVB DI, 1(AX) SHRL $0x08, DI SHLL $0x05, DI ORL DI, BX MOVB BL, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B emit_copy_three_match_nolit_encodeBetterBlockAsm8B: LEAL -2(BX), BX MOVB BL, (AX) MOVW DI, 1(AX) ADDQ $0x03, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B match_is_repeat_encodeBetterBlockAsm8B: MOVL 12(SP), BX CMPL BX, SI JEQ emit_literal_done_match_emit_repeat_encodeBetterBlockAsm8B 
MOVL SI, DI MOVL SI, 12(SP) LEAQ (DX)(BX*1), R8 SUBL BX, DI LEAL -1(DI), BX CMPL BX, $0x3c JLT one_byte_match_emit_repeat_encodeBetterBlockAsm8B CMPL BX, $0x00000100 JLT two_bytes_match_emit_repeat_encodeBetterBlockAsm8B MOVB $0xf4, (AX) MOVW BX, 1(AX) ADDQ $0x03, AX JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm8B two_bytes_match_emit_repeat_encodeBetterBlockAsm8B: MOVB $0xf0, (AX) MOVB BL, 1(AX) ADDQ $0x02, AX CMPL BX, $0x40 JL memmove_match_emit_repeat_encodeBetterBlockAsm8B JMP memmove_long_match_emit_repeat_encodeBetterBlockAsm8B one_byte_match_emit_repeat_encodeBetterBlockAsm8B: SHLB $0x02, BL MOVB BL, (AX) ADDQ $0x01, AX memmove_match_emit_repeat_encodeBetterBlockAsm8B: LEAQ (AX)(DI*1), BX // genMemMoveShort CMPQ DI, $0x04 JLE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_4 CMPQ DI, $0x08 JB emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_4through7 CMPQ DI, $0x10 JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_8through16 CMPQ DI, $0x20 JBE emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_17through32 JMP emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_33through64 emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_4: MOVL (R8), R9 MOVL R9, (AX) JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_4through7: MOVL (R8), R9 MOVL -4(R8)(DI*1), R8 MOVL R9, (AX) MOVL R8, -4(AX)(DI*1) JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_8through16: MOVQ (R8), R9 MOVQ -8(R8)(DI*1), R8 MOVQ R9, (AX) MOVQ R8, -8(AX)(DI*1) JMP memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_17through32: MOVOU (R8), X0 MOVOU -16(R8)(DI*1), X1 MOVOU X0, (AX) MOVOU X1, -16(AX)(DI*1) JMP 
memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B emit_lit_memmove_match_emit_repeat_encodeBetterBlockAsm8B_memmove_move_33through64: MOVOU (R8), X0 MOVOU 16(R8), X1 MOVOU -32(R8)(DI*1), X2 MOVOU -16(R8)(DI*1), X3 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(DI*1) MOVOU X3, -16(AX)(DI*1) memmove_end_copy_match_emit_repeat_encodeBetterBlockAsm8B: MOVQ BX, AX JMP emit_literal_done_match_emit_repeat_encodeBetterBlockAsm8B memmove_long_match_emit_repeat_encodeBetterBlockAsm8B: LEAQ (AX)(DI*1), BX // genMemMoveLong MOVOU (R8), X0 MOVOU 16(R8), X1 MOVOU -32(R8)(DI*1), X2 MOVOU -16(R8)(DI*1), X3 MOVQ DI, R10 SHRQ $0x05, R10 MOVQ AX, R9 ANDL $0x0000001f, R9 MOVQ $0x00000040, R12 SUBQ R9, R12 DECQ R10 JA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_forward_sse_loop_32 LEAQ -32(R8)(R12*1), R9 LEAQ -32(AX)(R12*1), R13 emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_big_loop_back: MOVOU (R9), X4 MOVOU 16(R9), X5 MOVOA X4, (R13) MOVOA X5, 16(R13) ADDQ $0x20, R13 ADDQ $0x20, R9 ADDQ $0x20, R12 DECQ R10 JNA emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_big_loop_back emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_forward_sse_loop_32: MOVOU -32(R8)(R12*1), X4 MOVOU -16(R8)(R12*1), X5 MOVOA X4, -32(AX)(R12*1) MOVOA X5, -16(AX)(R12*1) ADDQ $0x20, R12 CMPQ DI, R12 JAE emit_lit_memmove_long_match_emit_repeat_encodeBetterBlockAsm8Blarge_forward_sse_loop_32 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(DI*1) MOVOU X3, -16(AX)(DI*1) MOVQ BX, AX emit_literal_done_match_emit_repeat_encodeBetterBlockAsm8B: ADDL R11, CX ADDL $0x04, R11 MOVL CX, 12(SP) // emitRepeat MOVL R11, BX LEAL -4(R11), R11 CMPL BX, $0x08 JLE repeat_two_match_nolit_repeat_encodeBetterBlockAsm8B CMPL BX, $0x0c JGE cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm8B cant_repeat_two_offset_match_nolit_repeat_encodeBetterBlockAsm8B: CMPL R11, $0x00000104 JLT 
repeat_three_match_nolit_repeat_encodeBetterBlockAsm8B LEAL -256(R11), R11 MOVW $0x0019, (AX) MOVW R11, 2(AX) ADDQ $0x04, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B repeat_three_match_nolit_repeat_encodeBetterBlockAsm8B: LEAL -4(R11), R11 MOVW $0x0015, (AX) MOVB R11, 2(AX) ADDQ $0x03, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B repeat_two_match_nolit_repeat_encodeBetterBlockAsm8B: SHLL $0x02, R11 ORL $0x01, R11 MOVW R11, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeBetterBlockAsm8B XORQ BX, BX LEAL 1(BX)(R11*4), R11 MOVB DI, 1(AX) SARL $0x08, DI SHLL $0x05, DI ORL DI, R11 MOVB R11, (AX) ADDQ $0x02, AX match_nolit_emitcopy_end_encodeBetterBlockAsm8B: CMPL CX, 8(SP) JGE emit_remainder_encodeBetterBlockAsm8B CMPQ AX, (SP) JL match_nolit_dst_ok_encodeBetterBlockAsm8B MOVQ $0x00000000, ret+48(FP) RET match_nolit_dst_ok_encodeBetterBlockAsm8B: MOVQ $0x0000cf1bbcdcbf9b, BX MOVQ $0x9e3779b1, DI LEAQ 1(SI), SI LEAQ -2(CX), R8 MOVQ (DX)(SI*1), R9 MOVQ 1(DX)(SI*1), R10 MOVQ (DX)(R8*1), R11 MOVQ 1(DX)(R8*1), R12 SHLQ $0x10, R9 IMULQ BX, R9 SHRQ $0x36, R9 SHLQ $0x20, R10 IMULQ DI, R10 SHRQ $0x38, R10 SHLQ $0x10, R11 IMULQ BX, R11 SHRQ $0x36, R11 SHLQ $0x20, R12 IMULQ DI, R12 SHRQ $0x38, R12 LEAQ 1(SI), DI LEAQ 1(R8), R13 MOVL SI, 24(SP)(R9*4) MOVL R8, 24(SP)(R11*4) MOVL DI, 4120(SP)(R10*4) MOVL R13, 4120(SP)(R12*4) ADDQ $0x01, SI SUBQ $0x01, R8 index_loop_encodeBetterBlockAsm8B: CMPQ SI, R8 JAE search_loop_encodeBetterBlockAsm8B MOVQ (DX)(SI*1), DI MOVQ (DX)(R8*1), R9 SHLQ $0x10, DI IMULQ BX, DI SHRQ $0x36, DI SHLQ $0x10, R9 IMULQ BX, R9 SHRQ $0x36, R9 MOVL SI, 24(SP)(DI*4) MOVL R8, 24(SP)(R9*4) ADDQ $0x02, SI SUBQ $0x02, R8 JMP index_loop_encodeBetterBlockAsm8B emit_remainder_encodeBetterBlockAsm8B: MOVQ src_len+32(FP), CX SUBL 12(SP), CX LEAQ 3(AX)(CX*1), CX CMPQ CX, (SP) JL emit_remainder_ok_encodeBetterBlockAsm8B MOVQ $0x00000000, ret+48(FP) RET emit_remainder_ok_encodeBetterBlockAsm8B: MOVQ src_len+32(FP), CX MOVL 12(SP), BX CMPL BX, CX JEQ 
emit_literal_done_emit_remainder_encodeBetterBlockAsm8B MOVL CX, SI MOVL CX, 12(SP) LEAQ (DX)(BX*1), CX SUBL BX, SI LEAL -1(SI), DX CMPL DX, $0x3c JLT one_byte_emit_remainder_encodeBetterBlockAsm8B CMPL DX, $0x00000100 JLT two_bytes_emit_remainder_encodeBetterBlockAsm8B MOVB $0xf4, (AX) MOVW DX, 1(AX) ADDQ $0x03, AX JMP memmove_long_emit_remainder_encodeBetterBlockAsm8B two_bytes_emit_remainder_encodeBetterBlockAsm8B: MOVB $0xf0, (AX) MOVB DL, 1(AX) ADDQ $0x02, AX CMPL DX, $0x40 JL memmove_emit_remainder_encodeBetterBlockAsm8B JMP memmove_long_emit_remainder_encodeBetterBlockAsm8B one_byte_emit_remainder_encodeBetterBlockAsm8B: SHLB $0x02, DL MOVB DL, (AX) ADDQ $0x01, AX memmove_emit_remainder_encodeBetterBlockAsm8B: LEAQ (AX)(SI*1), DX MOVL SI, BX // genMemMoveShort CMPQ BX, $0x03 JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_1or2 JE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_3 CMPQ BX, $0x08 JB emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_4through7 CMPQ BX, $0x10 JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_8through16 CMPQ BX, $0x20 JBE emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_17through32 JMP emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_33through64 emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_1or2: MOVB (CX), SI MOVB -1(CX)(BX*1), CL MOVB SI, (AX) MOVB CL, -1(AX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_3: MOVW (CX), SI MOVB 2(CX), CL MOVW SI, (AX) MOVB CL, 2(AX) JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_4through7: MOVL (CX), SI MOVL -4(CX)(BX*1), CX MOVL SI, (AX) MOVL CX, -4(AX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B 
emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_8through16: MOVQ (CX), SI MOVQ -8(CX)(BX*1), CX MOVQ SI, (AX) MOVQ CX, -8(AX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_17through32: MOVOU (CX), X0 MOVOU -16(CX)(BX*1), X1 MOVOU X0, (AX) MOVOU X1, -16(AX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B emit_lit_memmove_emit_remainder_encodeBetterBlockAsm8B_memmove_move_33through64: MOVOU (CX), X0 MOVOU 16(CX), X1 MOVOU -32(CX)(BX*1), X2 MOVOU -16(CX)(BX*1), X3 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(BX*1) MOVOU X3, -16(AX)(BX*1) memmove_end_copy_emit_remainder_encodeBetterBlockAsm8B: MOVQ DX, AX JMP emit_literal_done_emit_remainder_encodeBetterBlockAsm8B memmove_long_emit_remainder_encodeBetterBlockAsm8B: LEAQ (AX)(SI*1), DX MOVL SI, BX // genMemMoveLong MOVOU (CX), X0 MOVOU 16(CX), X1 MOVOU -32(CX)(BX*1), X2 MOVOU -16(CX)(BX*1), X3 MOVQ BX, DI SHRQ $0x05, DI MOVQ AX, SI ANDL $0x0000001f, SI MOVQ $0x00000040, R8 SUBQ SI, R8 DECQ DI JA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_forward_sse_loop_32 LEAQ -32(CX)(R8*1), SI LEAQ -32(AX)(R8*1), R9 emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_big_loop_back: MOVOU (SI), X4 MOVOU 16(SI), X5 MOVOA X4, (R9) MOVOA X5, 16(R9) ADDQ $0x20, R9 ADDQ $0x20, SI ADDQ $0x20, R8 DECQ DI JNA emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_big_loop_back emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_forward_sse_loop_32: MOVOU -32(CX)(R8*1), X4 MOVOU -16(CX)(R8*1), X5 MOVOA X4, -32(AX)(R8*1) MOVOA X5, -16(AX)(R8*1) ADDQ $0x20, R8 CMPQ BX, R8 JAE emit_lit_memmove_long_emit_remainder_encodeBetterBlockAsm8Blarge_forward_sse_loop_32 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(BX*1) MOVOU X3, -16(AX)(BX*1) MOVQ DX, AX emit_literal_done_emit_remainder_encodeBetterBlockAsm8B: MOVQ dst_base+0(FP), CX SUBQ CX, AX MOVQ AX, ret+48(FP) 
RET // func encodeSnappyBlockAsm(dst []byte, src []byte) int // Requires: BMI, SSE2 TEXT ·encodeSnappyBlockAsm(SB), $65560-56 MOVQ dst_base+0(FP), AX MOVQ $0x00000200, CX LEAQ 24(SP), DX PXOR X0, X0 zero_loop_encodeSnappyBlockAsm: MOVOU X0, (DX) MOVOU X0, 16(DX) MOVOU X0, 32(DX) MOVOU X0, 48(DX) MOVOU X0, 64(DX) MOVOU X0, 80(DX) MOVOU X0, 96(DX) MOVOU X0, 112(DX) ADDQ $0x80, DX DECQ CX JNZ zero_loop_encodeSnappyBlockAsm MOVL $0x00000000, 12(SP) MOVQ src_len+32(FP), CX LEAQ -9(CX), DX LEAQ -8(CX), BX MOVL BX, 8(SP) SHRQ $0x05, CX SUBL CX, DX LEAQ (AX)(DX*1), DX MOVQ DX, (SP) MOVL $0x00000001, CX MOVL CX, 16(SP) MOVQ src_base+24(FP), DX search_loop_encodeSnappyBlockAsm: MOVL CX, BX SUBL 12(SP), BX SHRL $0x06, BX LEAL 4(CX)(BX*1), BX CMPL BX, 8(SP) JGE emit_remainder_encodeSnappyBlockAsm MOVQ (DX)(CX*1), SI MOVL BX, 20(SP) MOVQ $0x0000cf1bbcdcbf9b, R8 MOVQ SI, R9 MOVQ SI, R10 SHRQ $0x08, R10 SHLQ $0x10, R9 IMULQ R8, R9 SHRQ $0x32, R9 SHLQ $0x10, R10 IMULQ R8, R10 SHRQ $0x32, R10 MOVL 24(SP)(R9*4), BX MOVL 24(SP)(R10*4), DI MOVL CX, 24(SP)(R9*4) LEAL 1(CX), R9 MOVL R9, 24(SP)(R10*4) MOVQ SI, R9 SHRQ $0x10, R9 SHLQ $0x10, R9 IMULQ R8, R9 SHRQ $0x32, R9 MOVL CX, R8 SUBL 16(SP), R8 MOVL 1(DX)(R8*1), R10 MOVQ SI, R8 SHRQ $0x08, R8 CMPL R8, R10 JNE no_repeat_found_encodeSnappyBlockAsm LEAL 1(CX), SI MOVL 12(SP), BX MOVL SI, DI SUBL 16(SP), DI JZ repeat_extend_back_end_encodeSnappyBlockAsm repeat_extend_back_loop_encodeSnappyBlockAsm: CMPL SI, BX JLE repeat_extend_back_end_encodeSnappyBlockAsm MOVB -1(DX)(DI*1), R8 MOVB -1(DX)(SI*1), R9 CMPB R8, R9 JNE repeat_extend_back_end_encodeSnappyBlockAsm LEAL -1(SI), SI DECL DI JNZ repeat_extend_back_loop_encodeSnappyBlockAsm repeat_extend_back_end_encodeSnappyBlockAsm: MOVL 12(SP), BX CMPL BX, SI JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm MOVL SI, DI MOVL SI, 12(SP) LEAQ (DX)(BX*1), R8 SUBL BX, DI LEAL -1(DI), BX CMPL BX, $0x3c JLT one_byte_repeat_emit_encodeSnappyBlockAsm CMPL BX, $0x00000100 JLT 
two_bytes_repeat_emit_encodeSnappyBlockAsm CMPL BX, $0x00010000 JLT three_bytes_repeat_emit_encodeSnappyBlockAsm CMPL BX, $0x01000000 JLT four_bytes_repeat_emit_encodeSnappyBlockAsm MOVB $0xfc, (AX) MOVL BX, 1(AX) ADDQ $0x05, AX JMP memmove_long_repeat_emit_encodeSnappyBlockAsm four_bytes_repeat_emit_encodeSnappyBlockAsm: MOVL BX, R9 SHRL $0x10, R9 MOVB $0xf8, (AX) MOVW BX, 1(AX) MOVB R9, 3(AX) ADDQ $0x04, AX JMP memmove_long_repeat_emit_encodeSnappyBlockAsm three_bytes_repeat_emit_encodeSnappyBlockAsm: MOVB $0xf4, (AX) MOVW BX, 1(AX) ADDQ $0x03, AX JMP memmove_long_repeat_emit_encodeSnappyBlockAsm two_bytes_repeat_emit_encodeSnappyBlockAsm: MOVB $0xf0, (AX) MOVB BL, 1(AX) ADDQ $0x02, AX CMPL BX, $0x40 JL memmove_repeat_emit_encodeSnappyBlockAsm JMP memmove_long_repeat_emit_encodeSnappyBlockAsm one_byte_repeat_emit_encodeSnappyBlockAsm: SHLB $0x02, BL MOVB BL, (AX) ADDQ $0x01, AX memmove_repeat_emit_encodeSnappyBlockAsm: LEAQ (AX)(DI*1), BX // genMemMoveShort CMPQ DI, $0x08 JLE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_8 CMPQ DI, $0x10 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_8through16 CMPQ DI, $0x20 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_17through32 JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_33through64 emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_8: MOVQ (R8), R9 MOVQ R9, (AX) JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_8through16: MOVQ (R8), R9 MOVQ -8(R8)(DI*1), R8 MOVQ R9, (AX) MOVQ R8, -8(AX)(DI*1) JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_17through32: MOVOU (R8), X0 MOVOU -16(R8)(DI*1), X1 MOVOU X0, (AX) MOVOU X1, -16(AX)(DI*1) JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm_memmove_move_33through64: MOVOU (R8), X0 MOVOU 16(R8), X1 MOVOU 
-32(R8)(DI*1), X2 MOVOU -16(R8)(DI*1), X3 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(DI*1) MOVOU X3, -16(AX)(DI*1) memmove_end_copy_repeat_emit_encodeSnappyBlockAsm: MOVQ BX, AX JMP emit_literal_done_repeat_emit_encodeSnappyBlockAsm memmove_long_repeat_emit_encodeSnappyBlockAsm: LEAQ (AX)(DI*1), BX // genMemMoveLong MOVOU (R8), X0 MOVOU 16(R8), X1 MOVOU -32(R8)(DI*1), X2 MOVOU -16(R8)(DI*1), X3 MOVQ DI, R10 SHRQ $0x05, R10 MOVQ AX, R9 ANDL $0x0000001f, R9 MOVQ $0x00000040, R11 SUBQ R9, R11 DECQ R10 JA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32 LEAQ -32(R8)(R11*1), R9 LEAQ -32(AX)(R11*1), R12 emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_big_loop_back: MOVOU (R9), X4 MOVOU 16(R9), X5 MOVOA X4, (R12) MOVOA X5, 16(R12) ADDQ $0x20, R12 ADDQ $0x20, R9 ADDQ $0x20, R11 DECQ R10 JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_big_loop_back emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32: MOVOU -32(R8)(R11*1), X4 MOVOU -16(R8)(R11*1), X5 MOVOA X4, -32(AX)(R11*1) MOVOA X5, -16(AX)(R11*1) ADDQ $0x20, R11 CMPQ DI, R11 JAE emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(DI*1) MOVOU X3, -16(AX)(DI*1) MOVQ BX, AX emit_literal_done_repeat_emit_encodeSnappyBlockAsm: ADDL $0x05, CX MOVL CX, BX SUBL 16(SP), BX MOVQ src_len+32(FP), DI SUBL CX, DI LEAQ (DX)(CX*1), R8 LEAQ (DX)(BX*1), BX // matchLen XORL R10, R10 CMPL DI, $0x08 JL matchlen_match4_repeat_extend_encodeSnappyBlockAsm matchlen_loopback_repeat_extend_encodeSnappyBlockAsm: MOVQ (R8)(R10*1), R9 XORQ (BX)(R10*1), R9 TESTQ R9, R9 JZ matchlen_loop_repeat_extend_encodeSnappyBlockAsm #ifdef GOAMD64_v3 TZCNTQ R9, R9 #else BSFQ R9, R9 #endif SARQ $0x03, R9 LEAL (R10)(R9*1), R10 JMP repeat_extend_forward_end_encodeSnappyBlockAsm matchlen_loop_repeat_extend_encodeSnappyBlockAsm: LEAL -8(DI), DI LEAL 8(R10), R10 CMPL DI, $0x08 JGE 
matchlen_loopback_repeat_extend_encodeSnappyBlockAsm JZ repeat_extend_forward_end_encodeSnappyBlockAsm matchlen_match4_repeat_extend_encodeSnappyBlockAsm: CMPL DI, $0x04 JL matchlen_match2_repeat_extend_encodeSnappyBlockAsm MOVL (R8)(R10*1), R9 CMPL (BX)(R10*1), R9 JNE matchlen_match2_repeat_extend_encodeSnappyBlockAsm SUBL $0x04, DI LEAL 4(R10), R10 matchlen_match2_repeat_extend_encodeSnappyBlockAsm: CMPL DI, $0x02 JL matchlen_match1_repeat_extend_encodeSnappyBlockAsm MOVW (R8)(R10*1), R9 CMPW (BX)(R10*1), R9 JNE matchlen_match1_repeat_extend_encodeSnappyBlockAsm SUBL $0x02, DI LEAL 2(R10), R10 matchlen_match1_repeat_extend_encodeSnappyBlockAsm: CMPL DI, $0x01 JL repeat_extend_forward_end_encodeSnappyBlockAsm MOVB (R8)(R10*1), R9 CMPB (BX)(R10*1), R9 JNE repeat_extend_forward_end_encodeSnappyBlockAsm LEAL 1(R10), R10 repeat_extend_forward_end_encodeSnappyBlockAsm: ADDL R10, CX MOVL CX, BX SUBL SI, BX MOVL 16(SP), SI // emitCopy CMPL SI, $0x00010000 JL two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm four_bytes_loop_back_repeat_as_copy_encodeSnappyBlockAsm: CMPL BX, $0x40 JLE four_bytes_remain_repeat_as_copy_encodeSnappyBlockAsm MOVB $0xff, (AX) MOVL SI, 1(AX) LEAL -64(BX), BX ADDQ $0x05, AX CMPL BX, $0x04 JL four_bytes_remain_repeat_as_copy_encodeSnappyBlockAsm JMP four_bytes_loop_back_repeat_as_copy_encodeSnappyBlockAsm four_bytes_remain_repeat_as_copy_encodeSnappyBlockAsm: TESTL BX, BX JZ repeat_end_emit_encodeSnappyBlockAsm XORL DI, DI LEAL -1(DI)(BX*4), BX MOVB BL, (AX) MOVL SI, 1(AX) ADDQ $0x05, AX JMP repeat_end_emit_encodeSnappyBlockAsm two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm: CMPL BX, $0x40 JLE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm MOVB $0xee, (AX) MOVW SI, 1(AX) LEAL -60(BX), BX ADDQ $0x03, AX JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm: MOVL BX, DI SHLL $0x02, DI CMPL BX, $0x0c JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm CMPL SI, 
$0x00000800 JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm LEAL -15(DI), DI MOVB SI, 1(AX) SHRL $0x08, SI SHLL $0x05, SI ORL SI, DI MOVB DI, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeSnappyBlockAsm emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm: LEAL -2(DI), DI MOVB DI, (AX) MOVW SI, 1(AX) ADDQ $0x03, AX repeat_end_emit_encodeSnappyBlockAsm: MOVL CX, 12(SP) JMP search_loop_encodeSnappyBlockAsm no_repeat_found_encodeSnappyBlockAsm: CMPL (DX)(BX*1), SI JEQ candidate_match_encodeSnappyBlockAsm SHRQ $0x08, SI MOVL 24(SP)(R9*4), BX LEAL 2(CX), R8 CMPL (DX)(DI*1), SI JEQ candidate2_match_encodeSnappyBlockAsm MOVL R8, 24(SP)(R9*4) SHRQ $0x08, SI CMPL (DX)(BX*1), SI JEQ candidate3_match_encodeSnappyBlockAsm MOVL 20(SP), CX JMP search_loop_encodeSnappyBlockAsm candidate3_match_encodeSnappyBlockAsm: ADDL $0x02, CX JMP candidate_match_encodeSnappyBlockAsm candidate2_match_encodeSnappyBlockAsm: MOVL R8, 24(SP)(R9*4) INCL CX MOVL DI, BX candidate_match_encodeSnappyBlockAsm: MOVL 12(SP), SI TESTL BX, BX JZ match_extend_back_end_encodeSnappyBlockAsm match_extend_back_loop_encodeSnappyBlockAsm: CMPL CX, SI JLE match_extend_back_end_encodeSnappyBlockAsm MOVB -1(DX)(BX*1), DI MOVB -1(DX)(CX*1), R8 CMPB DI, R8 JNE match_extend_back_end_encodeSnappyBlockAsm LEAL -1(CX), CX DECL BX JZ match_extend_back_end_encodeSnappyBlockAsm JMP match_extend_back_loop_encodeSnappyBlockAsm match_extend_back_end_encodeSnappyBlockAsm: MOVL CX, SI SUBL 12(SP), SI LEAQ 5(AX)(SI*1), SI CMPQ SI, (SP) JL match_dst_size_check_encodeSnappyBlockAsm MOVQ $0x00000000, ret+48(FP) RET match_dst_size_check_encodeSnappyBlockAsm: MOVL CX, SI MOVL 12(SP), DI CMPL DI, SI JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm MOVL SI, R8 MOVL SI, 12(SP) LEAQ (DX)(DI*1), SI SUBL DI, R8 LEAL -1(R8), DI CMPL DI, $0x3c JLT one_byte_match_emit_encodeSnappyBlockAsm CMPL DI, $0x00000100 JLT two_bytes_match_emit_encodeSnappyBlockAsm CMPL DI, $0x00010000 JLT three_bytes_match_emit_encodeSnappyBlockAsm CMPL DI, 
$0x01000000 JLT four_bytes_match_emit_encodeSnappyBlockAsm MOVB $0xfc, (AX) MOVL DI, 1(AX) ADDQ $0x05, AX JMP memmove_long_match_emit_encodeSnappyBlockAsm four_bytes_match_emit_encodeSnappyBlockAsm: MOVL DI, R9 SHRL $0x10, R9 MOVB $0xf8, (AX) MOVW DI, 1(AX) MOVB R9, 3(AX) ADDQ $0x04, AX JMP memmove_long_match_emit_encodeSnappyBlockAsm three_bytes_match_emit_encodeSnappyBlockAsm: MOVB $0xf4, (AX) MOVW DI, 1(AX) ADDQ $0x03, AX JMP memmove_long_match_emit_encodeSnappyBlockAsm two_bytes_match_emit_encodeSnappyBlockAsm: MOVB $0xf0, (AX) MOVB DI, 1(AX) ADDQ $0x02, AX CMPL DI, $0x40 JL memmove_match_emit_encodeSnappyBlockAsm JMP memmove_long_match_emit_encodeSnappyBlockAsm one_byte_match_emit_encodeSnappyBlockAsm: SHLB $0x02, DI MOVB DI, (AX) ADDQ $0x01, AX memmove_match_emit_encodeSnappyBlockAsm: LEAQ (AX)(R8*1), DI // genMemMoveShort CMPQ R8, $0x08 JLE emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_8 CMPQ R8, $0x10 JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_8through16 CMPQ R8, $0x20 JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_17through32 JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_33through64 emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_8: MOVQ (SI), R9 MOVQ R9, (AX) JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_8through16: MOVQ (SI), R9 MOVQ -8(SI)(R8*1), SI MOVQ R9, (AX) MOVQ SI, -8(AX)(R8*1) JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_17through32: MOVOU (SI), X0 MOVOU -16(SI)(R8*1), X1 MOVOU X0, (AX) MOVOU X1, -16(AX)(R8*1) JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm emit_lit_memmove_match_emit_encodeSnappyBlockAsm_memmove_move_33through64: MOVOU (SI), X0 MOVOU 16(SI), X1 MOVOU -32(SI)(R8*1), X2 MOVOU -16(SI)(R8*1), X3 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(R8*1) MOVOU X3, -16(AX)(R8*1) 
memmove_end_copy_match_emit_encodeSnappyBlockAsm: MOVQ DI, AX JMP emit_literal_done_match_emit_encodeSnappyBlockAsm memmove_long_match_emit_encodeSnappyBlockAsm: LEAQ (AX)(R8*1), DI // genMemMoveLong MOVOU (SI), X0 MOVOU 16(SI), X1 MOVOU -32(SI)(R8*1), X2 MOVOU -16(SI)(R8*1), X3 MOVQ R8, R10 SHRQ $0x05, R10 MOVQ AX, R9 ANDL $0x0000001f, R9 MOVQ $0x00000040, R11 SUBQ R9, R11 DECQ R10 JA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32 LEAQ -32(SI)(R11*1), R9 LEAQ -32(AX)(R11*1), R12 emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_big_loop_back: MOVOU (R9), X4 MOVOU 16(R9), X5 MOVOA X4, (R12) MOVOA X5, 16(R12) ADDQ $0x20, R12 ADDQ $0x20, R9 ADDQ $0x20, R11 DECQ R10 JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_big_loop_back emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32: MOVOU -32(SI)(R11*1), X4 MOVOU -16(SI)(R11*1), X5 MOVOA X4, -32(AX)(R11*1) MOVOA X5, -16(AX)(R11*1) ADDQ $0x20, R11 CMPQ R8, R11 JAE emit_lit_memmove_long_match_emit_encodeSnappyBlockAsmlarge_forward_sse_loop_32 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(R8*1) MOVOU X3, -16(AX)(R8*1) MOVQ DI, AX emit_literal_done_match_emit_encodeSnappyBlockAsm: match_nolit_loop_encodeSnappyBlockAsm: MOVL CX, SI SUBL BX, SI MOVL SI, 16(SP) ADDL $0x04, CX ADDL $0x04, BX MOVQ src_len+32(FP), SI SUBL CX, SI LEAQ (DX)(CX*1), DI LEAQ (DX)(BX*1), BX // matchLen XORL R9, R9 CMPL SI, $0x08 JL matchlen_match4_match_nolit_encodeSnappyBlockAsm matchlen_loopback_match_nolit_encodeSnappyBlockAsm: MOVQ (DI)(R9*1), R8 XORQ (BX)(R9*1), R8 TESTQ R8, R8 JZ matchlen_loop_match_nolit_encodeSnappyBlockAsm #ifdef GOAMD64_v3 TZCNTQ R8, R8 #else BSFQ R8, R8 #endif SARQ $0x03, R8 LEAL (R9)(R8*1), R9 JMP match_nolit_end_encodeSnappyBlockAsm matchlen_loop_match_nolit_encodeSnappyBlockAsm: LEAL -8(SI), SI LEAL 8(R9), R9 CMPL SI, $0x08 JGE matchlen_loopback_match_nolit_encodeSnappyBlockAsm JZ match_nolit_end_encodeSnappyBlockAsm 
matchlen_match4_match_nolit_encodeSnappyBlockAsm: CMPL SI, $0x04 JL matchlen_match2_match_nolit_encodeSnappyBlockAsm MOVL (DI)(R9*1), R8 CMPL (BX)(R9*1), R8 JNE matchlen_match2_match_nolit_encodeSnappyBlockAsm SUBL $0x04, SI LEAL 4(R9), R9 matchlen_match2_match_nolit_encodeSnappyBlockAsm: CMPL SI, $0x02 JL matchlen_match1_match_nolit_encodeSnappyBlockAsm MOVW (DI)(R9*1), R8 CMPW (BX)(R9*1), R8 JNE matchlen_match1_match_nolit_encodeSnappyBlockAsm SUBL $0x02, SI LEAL 2(R9), R9 matchlen_match1_match_nolit_encodeSnappyBlockAsm: CMPL SI, $0x01 JL match_nolit_end_encodeSnappyBlockAsm MOVB (DI)(R9*1), R8 CMPB (BX)(R9*1), R8 JNE match_nolit_end_encodeSnappyBlockAsm LEAL 1(R9), R9 match_nolit_end_encodeSnappyBlockAsm: ADDL R9, CX MOVL 16(SP), BX ADDL $0x04, R9 MOVL CX, 12(SP) // emitCopy CMPL BX, $0x00010000 JL two_byte_offset_match_nolit_encodeSnappyBlockAsm four_bytes_loop_back_match_nolit_encodeSnappyBlockAsm: CMPL R9, $0x40 JLE four_bytes_remain_match_nolit_encodeSnappyBlockAsm MOVB $0xff, (AX) MOVL BX, 1(AX) LEAL -64(R9), R9 ADDQ $0x05, AX CMPL R9, $0x04 JL four_bytes_remain_match_nolit_encodeSnappyBlockAsm JMP four_bytes_loop_back_match_nolit_encodeSnappyBlockAsm four_bytes_remain_match_nolit_encodeSnappyBlockAsm: TESTL R9, R9 JZ match_nolit_emitcopy_end_encodeSnappyBlockAsm XORL SI, SI LEAL -1(SI)(R9*4), R9 MOVB R9, (AX) MOVL BX, 1(AX) ADDQ $0x05, AX JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm two_byte_offset_match_nolit_encodeSnappyBlockAsm: CMPL R9, $0x40 JLE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm MOVB $0xee, (AX) MOVW BX, 1(AX) LEAL -60(R9), R9 ADDQ $0x03, AX JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm two_byte_offset_short_match_nolit_encodeSnappyBlockAsm: MOVL R9, SI SHLL $0x02, SI CMPL R9, $0x0c JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm CMPL BX, $0x00000800 JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm LEAL -15(SI), SI MOVB BL, 1(AX) SHRL $0x08, BX SHLL $0x05, BX ORL BX, SI MOVB SI, (AX) ADDQ $0x02, AX JMP 
match_nolit_emitcopy_end_encodeSnappyBlockAsm emit_copy_three_match_nolit_encodeSnappyBlockAsm: LEAL -2(SI), SI MOVB SI, (AX) MOVW BX, 1(AX) ADDQ $0x03, AX match_nolit_emitcopy_end_encodeSnappyBlockAsm: CMPL CX, 8(SP) JGE emit_remainder_encodeSnappyBlockAsm MOVQ -2(DX)(CX*1), SI CMPQ AX, (SP) JL match_nolit_dst_ok_encodeSnappyBlockAsm MOVQ $0x00000000, ret+48(FP) RET match_nolit_dst_ok_encodeSnappyBlockAsm: MOVQ $0x0000cf1bbcdcbf9b, R8 MOVQ SI, DI SHRQ $0x10, SI MOVQ SI, BX SHLQ $0x10, DI IMULQ R8, DI SHRQ $0x32, DI SHLQ $0x10, BX IMULQ R8, BX SHRQ $0x32, BX LEAL -2(CX), R8 LEAQ 24(SP)(BX*4), R9 MOVL (R9), BX MOVL R8, 24(SP)(DI*4) MOVL CX, (R9) CMPL (DX)(BX*1), SI JEQ match_nolit_loop_encodeSnappyBlockAsm INCL CX JMP search_loop_encodeSnappyBlockAsm emit_remainder_encodeSnappyBlockAsm: MOVQ src_len+32(FP), CX SUBL 12(SP), CX LEAQ 5(AX)(CX*1), CX CMPQ CX, (SP) JL emit_remainder_ok_encodeSnappyBlockAsm MOVQ $0x00000000, ret+48(FP) RET emit_remainder_ok_encodeSnappyBlockAsm: MOVQ src_len+32(FP), CX MOVL 12(SP), BX CMPL BX, CX JEQ emit_literal_done_emit_remainder_encodeSnappyBlockAsm MOVL CX, SI MOVL CX, 12(SP) LEAQ (DX)(BX*1), CX SUBL BX, SI LEAL -1(SI), DX CMPL DX, $0x3c JLT one_byte_emit_remainder_encodeSnappyBlockAsm CMPL DX, $0x00000100 JLT two_bytes_emit_remainder_encodeSnappyBlockAsm CMPL DX, $0x00010000 JLT three_bytes_emit_remainder_encodeSnappyBlockAsm CMPL DX, $0x01000000 JLT four_bytes_emit_remainder_encodeSnappyBlockAsm MOVB $0xfc, (AX) MOVL DX, 1(AX) ADDQ $0x05, AX JMP memmove_long_emit_remainder_encodeSnappyBlockAsm four_bytes_emit_remainder_encodeSnappyBlockAsm: MOVL DX, BX SHRL $0x10, BX MOVB $0xf8, (AX) MOVW DX, 1(AX) MOVB BL, 3(AX) ADDQ $0x04, AX JMP memmove_long_emit_remainder_encodeSnappyBlockAsm three_bytes_emit_remainder_encodeSnappyBlockAsm: MOVB $0xf4, (AX) MOVW DX, 1(AX) ADDQ $0x03, AX JMP memmove_long_emit_remainder_encodeSnappyBlockAsm two_bytes_emit_remainder_encodeSnappyBlockAsm: MOVB $0xf0, (AX) MOVB DL, 1(AX) ADDQ $0x02, AX CMPL DX, 
$0x40 JL memmove_emit_remainder_encodeSnappyBlockAsm JMP memmove_long_emit_remainder_encodeSnappyBlockAsm one_byte_emit_remainder_encodeSnappyBlockAsm: SHLB $0x02, DL MOVB DL, (AX) ADDQ $0x01, AX memmove_emit_remainder_encodeSnappyBlockAsm: LEAQ (AX)(SI*1), DX MOVL SI, BX // genMemMoveShort CMPQ BX, $0x03 JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_1or2 JE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_3 CMPQ BX, $0x08 JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_4through7 CMPQ BX, $0x10 JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_8through16 CMPQ BX, $0x20 JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_17through32 JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_33through64 emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_1or2: MOVB (CX), SI MOVB -1(CX)(BX*1), CL MOVB SI, (AX) MOVB CL, -1(AX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_3: MOVW (CX), SI MOVB 2(CX), CL MOVW SI, (AX) MOVB CL, 2(AX) JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_4through7: MOVL (CX), SI MOVL -4(CX)(BX*1), CX MOVL SI, (AX) MOVL CX, -4(AX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_8through16: MOVQ (CX), SI MOVQ -8(CX)(BX*1), CX MOVQ SI, (AX) MOVQ CX, -8(AX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_17through32: MOVOU (CX), X0 MOVOU -16(CX)(BX*1), X1 MOVOU X0, (AX) MOVOU X1, -16(AX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm_memmove_move_33through64: MOVOU (CX), X0 MOVOU 16(CX), X1 MOVOU -32(CX)(BX*1), X2 MOVOU -16(CX)(BX*1), X3 MOVOU 
X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(BX*1) MOVOU X3, -16(AX)(BX*1) memmove_end_copy_emit_remainder_encodeSnappyBlockAsm: MOVQ DX, AX JMP emit_literal_done_emit_remainder_encodeSnappyBlockAsm memmove_long_emit_remainder_encodeSnappyBlockAsm: LEAQ (AX)(SI*1), DX MOVL SI, BX // genMemMoveLong MOVOU (CX), X0 MOVOU 16(CX), X1 MOVOU -32(CX)(BX*1), X2 MOVOU -16(CX)(BX*1), X3 MOVQ BX, DI SHRQ $0x05, DI MOVQ AX, SI ANDL $0x0000001f, SI MOVQ $0x00000040, R8 SUBQ SI, R8 DECQ DI JA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_forward_sse_loop_32 LEAQ -32(CX)(R8*1), SI LEAQ -32(AX)(R8*1), R9 emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_big_loop_back: MOVOU (SI), X4 MOVOU 16(SI), X5 MOVOA X4, (R9) MOVOA X5, 16(R9) ADDQ $0x20, R9 ADDQ $0x20, SI ADDQ $0x20, R8 DECQ DI JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_big_loop_back emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_forward_sse_loop_32: MOVOU -32(CX)(R8*1), X4 MOVOU -16(CX)(R8*1), X5 MOVOA X4, -32(AX)(R8*1) MOVOA X5, -16(AX)(R8*1) ADDQ $0x20, R8 CMPQ BX, R8 JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsmlarge_forward_sse_loop_32 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(BX*1) MOVOU X3, -16(AX)(BX*1) MOVQ DX, AX emit_literal_done_emit_remainder_encodeSnappyBlockAsm: MOVQ dst_base+0(FP), CX SUBQ CX, AX MOVQ AX, ret+48(FP) RET

// func encodeSnappyBlockAsm64K(dst []byte, src []byte) int
// Requires: BMI, SSE2
//
// encodeSnappyBlockAsm64K encodes src into dst as a sequence of literal and
// copy elements and stores the number of bytes written (final write pointer
// minus dst_base) in ret. If any of the output-limit checks against (SP)
// fails, it stores 0 in ret and returns immediately.
//
// Frame layout ($65560 bytes):
//   (SP)    8 bytes      dst limit: dst_base + src_len - 9 - (src_len >> 5)
//   8(SP)   4 bytes      search limit: src_len - 8
//   12(SP)  4 bytes      src offset of the first byte not yet emitted to dst
//   16(SP)  4 bytes      current repeat offset (initialised to 1)
//   20(SP)  4 bytes      src offset at which to resume when no candidate matches
//   24(SP)  65536 bytes  hash table: 16384 entries, each a 4-byte src offset
//
// Register roles in the search/match code: AX = dst write pointer,
// CX = current src offset, DX = src base pointer. The final remainder code
// reuses CX and DX as scratch.
TEXT ·encodeSnappyBlockAsm64K(SB), $65560-56
	MOVQ dst_base+0(FP), AX

	// Zero the hash table: 0x200 iterations of 0x80 bytes = 65536 bytes
	// starting at 24(SP).
	MOVQ $0x00000200, CX
	LEAQ 24(SP), DX
	PXOR X0, X0

zero_loop_encodeSnappyBlockAsm64K:
	MOVOU X0, (DX)
	MOVOU X0, 16(DX)
	MOVOU X0, 32(DX)
	MOVOU X0, 48(DX)
	MOVOU X0, 64(DX)
	MOVOU X0, 80(DX)
	MOVOU X0, 96(DX)
	MOVOU X0, 112(DX)
	ADDQ $0x80, DX
	DECQ CX
	JNZ zero_loop_encodeSnappyBlockAsm64K

	// Initialise the scalar slots: nothing emitted yet, search limit
	// src_len-8, dst limit dst_base + src_len - 9 - src_len/32.
	MOVL $0x00000000, 12(SP)
	MOVQ src_len+32(FP), CX
	LEAQ -9(CX), DX
	LEAQ -8(CX), BX
	MOVL BX, 8(SP)
	SHRQ $0x05, CX
	SUBL CX, DX
	LEAQ (AX)(DX*1), DX
	MOVQ DX, (SP)

	// Start at src offset 1 with repeat offset 1; DX becomes the src base.
	MOVL $0x00000001, CX
	MOVL CX, 16(SP)
	MOVQ src_base+24(FP), DX

search_loop_encodeSnappyBlockAsm64K:
	// Next position to try = CX + 4 + ((CX - 12(SP)) >> 6), so the step
	// grows with the distance since the last emit. Past the search limit
	// the rest of src is emitted as a literal.
	MOVL CX, BX
	SUBL 12(SP), BX
	SHRL $0x06, BX
	LEAL 4(CX)(BX*1), BX
	CMPL BX, 8(SP)
	JGE emit_remainder_encodeSnappyBlockAsm64K

	// SI = 8 bytes at src[CX]; remember the next position in 20(SP).
	MOVQ (DX)(CX*1), SI
	MOVL BX, 20(SP)

	// Hash: SHLQ $0x10 drops the top 2 bytes, so the low 6 bytes are
	// multiplied by the constant and the top 14 bits of the product are
	// kept (SHRQ $0x32). R9 hashes src[CX:], R10 hashes src[CX+1:].
	MOVQ $0x0000cf1bbcdcbf9b, R8
	MOVQ SI, R9
	MOVQ SI, R10
	SHRQ $0x08, R10
	SHLQ $0x10, R9
	IMULQ R8, R9
	SHRQ $0x32, R9
	SHLQ $0x10, R10
	IMULQ R8, R10
	SHRQ $0x32, R10

	// Fetch the previous table entries (BX for CX, DI for CX+1) and
	// replace them with CX and CX+1.
	MOVL 24(SP)(R9*4), BX
	MOVL 24(SP)(R10*4), DI
	MOVL CX, 24(SP)(R9*4)
	LEAL 1(CX), R9
	MOVL R9, 24(SP)(R10*4)

	// R9 = hash of src[CX+2:], used by the third candidate check below.
	MOVQ SI, R9
	SHRQ $0x10, R9
	SHLQ $0x10, R9
	IMULQ R8, R9
	SHRQ $0x32, R9

	// Repeat check: compare 4 bytes at src[CX+1] with the 4 bytes one
	// repeat offset (16(SP)) earlier.
	MOVL CX, R8
	SUBL 16(SP), R8
	MOVL 1(DX)(R8*1), R10
	MOVQ SI, R8
	SHRQ $0x08, R8
	CMPL R8, R10
	JNE no_repeat_found_encodeSnappyBlockAsm64K

	// Repeat found at SI = CX+1. Extend it backwards while the preceding
	// bytes match, without going below 12(SP) or before src offset 0 on
	// the candidate side (DI).
	LEAL 1(CX), SI
	MOVL 12(SP), BX
	MOVL SI, DI
	SUBL 16(SP), DI
	JZ repeat_extend_back_end_encodeSnappyBlockAsm64K

repeat_extend_back_loop_encodeSnappyBlockAsm64K:
	CMPL SI, BX
	JLE repeat_extend_back_end_encodeSnappyBlockAsm64K
	MOVB -1(DX)(DI*1), R8
	MOVB -1(DX)(SI*1), R9
	CMPB R8, R9
	JNE repeat_extend_back_end_encodeSnappyBlockAsm64K
	LEAL -1(SI), SI
	DECL DI
	JNZ repeat_extend_back_loop_encodeSnappyBlockAsm64K

repeat_extend_back_end_encodeSnappyBlockAsm64K:
	// Emit src[12(SP):SI] as a literal (skipped when empty).
	// DI = literal length, R8 = literal source pointer, BX = length-1.
	MOVL 12(SP), BX
	CMPL BX, SI
	JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm64K
	MOVL SI, DI
	MOVL SI, 12(SP)
	LEAQ (DX)(BX*1), R8
	SUBL BX, DI
	LEAL -1(DI), BX

	// Literal header: length-1 < 60 uses one byte, < 256 uses 0xf0 plus
	// one length byte, otherwise 0xf4 plus a 16-bit length.
	CMPL BX, $0x3c
	JLT one_byte_repeat_emit_encodeSnappyBlockAsm64K
	CMPL BX, $0x00000100
	JLT two_bytes_repeat_emit_encodeSnappyBlockAsm64K
	MOVB $0xf4, (AX)
	MOVW BX, 1(AX)
	ADDQ $0x03, AX
	JMP memmove_long_repeat_emit_encodeSnappyBlockAsm64K

two_bytes_repeat_emit_encodeSnappyBlockAsm64K:
	MOVB $0xf0, (AX)
	MOVB BL, 1(AX)
	ADDQ $0x02, AX

	// length-1 < 64 still fits the short copy; otherwise use the long one.
	CMPL BX, $0x40
	JL memmove_repeat_emit_encodeSnappyBlockAsm64K
	JMP memmove_long_repeat_emit_encodeSnappyBlockAsm64K

one_byte_repeat_emit_encodeSnappyBlockAsm64K:
	// Header byte is (length-1) << 2.
	SHLB $0x02, BL
	MOVB BL, (AX)
	ADDQ $0x01, AX

memmove_repeat_emit_encodeSnappyBlockAsm64K:
	// BX = dst pointer after the literal body.
	LEAQ (AX)(DI*1), BX

	// genMemMoveShort
	// Lengths up to 8 use a single 8-byte load/store, which may touch
	// bytes beyond the literal; AX is then set from BX, i.e. advanced by
	// the true length only.
	CMPQ DI, $0x08
	JLE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_8
	CMPQ DI, $0x10
	JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_8through16
	CMPQ DI, $0x20
	JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_17through32
	JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_33through64

emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_8:
	MOVQ (R8), R9
	MOVQ R9, (AX)
	JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm64K

emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_8through16:
	// Overlapping head and tail 8-byte moves cover any length in range.
	MOVQ (R8), R9
	MOVQ -8(R8)(DI*1), R8
	MOVQ R9, (AX)
	MOVQ R8, -8(AX)(DI*1)
	JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm64K

emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_17through32:
	MOVOU (R8), X0
	MOVOU -16(R8)(DI*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(DI*1)
	JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm64K

emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm64K_memmove_move_33through64:
	MOVOU (R8), X0
	MOVOU 16(R8), X1
	MOVOU -32(R8)(DI*1), X2
	MOVOU -16(R8)(DI*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(DI*1)
	MOVOU X3, -16(AX)(DI*1)

memmove_end_copy_repeat_emit_encodeSnappyBlockAsm64K:
	MOVQ BX, AX
	JMP emit_literal_done_repeat_emit_encodeSnappyBlockAsm64K

memmove_long_repeat_emit_encodeSnappyBlockAsm64K:
	// BX = dst pointer after the literal body.
	LEAQ (AX)(DI*1), BX

	// genMemMoveLong
	// The first and last 32 bytes are loaded up front and stored last with
	// unaligned moves. R11 = 64 - (AX & 31), so -32(AX)(R11*1) is 32-byte
	// aligned for the MOVOA stores in the loops.
	MOVOU (R8), X0
	MOVOU 16(R8), X1
	MOVOU -32(R8)(DI*1), X2
	MOVOU -16(R8)(DI*1), X3
	MOVQ DI, R10
	SHRQ $0x05, R10
	MOVQ AX, R9
	ANDL $0x0000001f, R9
	MOVQ $0x00000040, R11
	SUBQ R9, R11
	DECQ R10
	JA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32
	LEAQ -32(R8)(R11*1), R9
	LEAQ -32(AX)(R11*1), R12

emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_big_loop_back:
	MOVOU (R9), X4
	MOVOU 16(R9), X5
	MOVOA X4, (R12)
	MOVOA X5, 16(R12)
	ADDQ $0x20, R12
	ADDQ $0x20, R9
	ADDQ $0x20, R11
	DECQ R10
	JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_big_loop_back

emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32:
	// Copy 32 bytes per iteration while DI >= R11.
	MOVOU -32(R8)(R11*1), X4
	MOVOU -16(R8)(R11*1), X5
	MOVOA X4, -32(AX)(R11*1)
	MOVOA X5, -16(AX)(R11*1)
	ADDQ $0x20, R11
	CMPQ DI, R11
	JAE emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(DI*1)
	MOVOU X3, -16(AX)(DI*1)
	MOVQ BX, AX

emit_literal_done_repeat_emit_encodeSnappyBlockAsm64K:
	// Skip the 4 bytes already compared (they start at CX+1) and extend
	// the repeat forwards. DI = bytes left in src, R8 = pointer to
	// src[CX], BX = pointer one repeat offset earlier.
	ADDL $0x05, CX
	MOVL CX, BX
	SUBL 16(SP), BX
	MOVQ src_len+32(FP), DI
	SUBL CX, DI
	LEAQ (DX)(CX*1), R8
	LEAQ (DX)(BX*1), BX

	// matchLen
	// R10 accumulates the number of matching bytes.
	XORL R10, R10
	CMPL DI, $0x08
	JL matchlen_match4_repeat_extend_encodeSnappyBlockAsm64K

matchlen_loopback_repeat_extend_encodeSnappyBlockAsm64K:
	// Compare 8 bytes at a time; a non-zero XOR marks the first mismatch.
	MOVQ (R8)(R10*1), R9
	XORQ (BX)(R10*1), R9
	TESTQ R9, R9
	JZ matchlen_loop_repeat_extend_encodeSnappyBlockAsm64K

	// Index of the lowest set bit, divided by 8, is the number of equal
	// leading bytes within this 8-byte word.
#ifdef GOAMD64_v3
	TZCNTQ R9, R9

#else
	BSFQ R9, R9

#endif
	SARQ $0x03, R9
	LEAL (R10)(R9*1), R10
	JMP repeat_extend_forward_end_encodeSnappyBlockAsm64K

matchlen_loop_repeat_extend_encodeSnappyBlockAsm64K:
	LEAL -8(DI), DI
	LEAL 8(R10), R10
	CMPL DI, $0x08
	JGE matchlen_loopback_repeat_extend_encodeSnappyBlockAsm64K

	// NOTE(review): the CMPL sets ZF only when DI == 8, which already
	// takes the JGE above, so this JZ appears unreachable - confirm
	// against the generator.
	JZ repeat_extend_forward_end_encodeSnappyBlockAsm64K

matchlen_match4_repeat_extend_encodeSnappyBlockAsm64K:
	// Fewer than 8 bytes left: try 4, then 2, then 1.
	CMPL DI, $0x04
	JL matchlen_match2_repeat_extend_encodeSnappyBlockAsm64K
	MOVL (R8)(R10*1), R9
	CMPL (BX)(R10*1), R9
	JNE matchlen_match2_repeat_extend_encodeSnappyBlockAsm64K
	SUBL $0x04, DI
	LEAL 4(R10), R10

matchlen_match2_repeat_extend_encodeSnappyBlockAsm64K:
	CMPL DI, $0x02
	JL matchlen_match1_repeat_extend_encodeSnappyBlockAsm64K
	MOVW (R8)(R10*1), R9
	CMPW (BX)(R10*1), R9
	JNE matchlen_match1_repeat_extend_encodeSnappyBlockAsm64K
	SUBL $0x02, DI
	LEAL 2(R10), R10

matchlen_match1_repeat_extend_encodeSnappyBlockAsm64K:
	CMPL DI, $0x01
	JL repeat_extend_forward_end_encodeSnappyBlockAsm64K
	MOVB (R8)(R10*1), R9
	CMPB (BX)(R10*1), R9
	JNE repeat_extend_forward_end_encodeSnappyBlockAsm64K
	LEAL 1(R10), R10

repeat_extend_forward_end_encodeSnappyBlockAsm64K:
	// CX = end of the match; BX = CX - SI is the total match length
	// (SI is the match start after backward extension); SI is then
	// reloaded with the repeat offset.
	ADDL R10, CX
	MOVL CX, BX
	SUBL SI, BX
	MOVL 16(SP), SI

	// emitCopy
two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm64K:
	// While more than 64 bytes remain, write tag 0xee plus the 16-bit
	// offset and take 60 off the remaining length.
	CMPL BX, $0x40
	JLE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm64K
	MOVB $0xee, (AX)
	MOVW SI, 1(AX)
	LEAL -60(BX), BX
	ADDQ $0x03, AX
	JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm64K

two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm64K:
	// DI = length << 2. Length >= 12 or offset >= 2048 needs the 3-byte
	// form; otherwise the 2-byte form is written: tag =
	// ((length<<2) - 15) | ((offset>>8) << 5), then the low offset byte.
	MOVL BX, DI
	SHLL $0x02, DI
	CMPL BX, $0x0c
	JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm64K
	CMPL SI, $0x00000800
	JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm64K
	LEAL -15(DI), DI
	MOVB SI, 1(AX)
	SHRL $0x08, SI
	SHLL $0x05, SI
	ORL SI, DI
	MOVB DI, (AX)
	ADDQ $0x02, AX
	JMP repeat_end_emit_encodeSnappyBlockAsm64K

emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm64K:
	// 3-byte form: tag = (length<<2) - 2, then the 16-bit offset.
	LEAL -2(DI), DI
	MOVB DI, (AX)
	MOVW SI, 1(AX)
	ADDQ $0x03, AX

repeat_end_emit_encodeSnappyBlockAsm64K:
	// Everything up to CX has been emitted; continue searching.
	MOVL CX, 12(SP)
	JMP search_loop_encodeSnappyBlockAsm64K

no_repeat_found_encodeSnappyBlockAsm64K:
	// Candidate 1: table entry BX against the 4 bytes at src[CX].
	CMPL (DX)(BX*1), SI
	JEQ candidate_match_encodeSnappyBlockAsm64K

	// Candidate 2: table entry DI against the 4 bytes at src[CX+1].
	// BX is reloaded with the table entry for the CX+2 hash first.
	SHRQ $0x08, SI
	MOVL 24(SP)(R9*4), BX
	LEAL 2(CX), R8
	CMPL (DX)(DI*1), SI
	JEQ candidate2_match_encodeSnappyBlockAsm64K

	// Candidate 3: record CX+2 in the table, then compare its previous
	// entry against the 4 bytes at src[CX+2].
	MOVL R8, 24(SP)(R9*4)
	SHRQ $0x08, SI
	CMPL (DX)(BX*1), SI
	JEQ candidate3_match_encodeSnappyBlockAsm64K

	// No match: jump ahead to the position saved in 20(SP).
	MOVL 20(SP), CX
	JMP search_loop_encodeSnappyBlockAsm64K

candidate3_match_encodeSnappyBlockAsm64K:
	ADDL $0x02, CX
	JMP candidate_match_encodeSnappyBlockAsm64K

candidate2_match_encodeSnappyBlockAsm64K:
	// Record CX+2 in the table, then continue with CX+1 and candidate DI.
	MOVL R8, 24(SP)(R9*4)
	INCL CX
	MOVL DI, BX

candidate_match_encodeSnappyBlockAsm64K:
	// Extend the match backwards while the preceding bytes are equal,
	// without going below 12(SP) (SI) or before src offset 0 on the
	// candidate side (BX).
	MOVL 12(SP), SI
	TESTL BX, BX
	JZ match_extend_back_end_encodeSnappyBlockAsm64K

match_extend_back_loop_encodeSnappyBlockAsm64K:
	CMPL CX, SI
	JLE match_extend_back_end_encodeSnappyBlockAsm64K
	MOVB -1(DX)(BX*1), DI
	MOVB -1(DX)(CX*1), R8
	CMPB DI, R8
	JNE match_extend_back_end_encodeSnappyBlockAsm64K
	LEAL -1(CX), CX
	DECL BX
	JZ match_extend_back_end_encodeSnappyBlockAsm64K
	JMP match_extend_back_loop_encodeSnappyBlockAsm64K

match_extend_back_end_encodeSnappyBlockAsm64K:
	// Output check: AX + pending literal length + 3 must be below the
	// dst limit in (SP); otherwise return 0.
	MOVL CX, SI
	SUBL 12(SP), SI
	LEAQ 3(AX)(SI*1), SI
	CMPQ SI, (SP)
	JL match_dst_size_check_encodeSnappyBlockAsm64K
	MOVQ $0x00000000, ret+48(FP)
	RET

match_dst_size_check_encodeSnappyBlockAsm64K:
	// Emit src[12(SP):CX] as a literal (skipped when empty).
	// R8 = literal length, SI = literal source pointer, DI = length-1.
	MOVL CX, SI
	MOVL 12(SP), DI
	CMPL DI, SI
	JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm64K
	MOVL SI, R8
	MOVL SI, 12(SP)
	LEAQ (DX)(DI*1), SI
	SUBL DI, R8
	LEAL -1(R8), DI

	// Literal header: length-1 < 60 uses one byte, < 256 uses 0xf0 plus
	// one length byte, otherwise 0xf4 plus a 16-bit length.
	CMPL DI, $0x3c
	JLT one_byte_match_emit_encodeSnappyBlockAsm64K
	CMPL DI, $0x00000100
	JLT two_bytes_match_emit_encodeSnappyBlockAsm64K
	MOVB $0xf4, (AX)
	MOVW DI, 1(AX)
	ADDQ $0x03, AX
	JMP memmove_long_match_emit_encodeSnappyBlockAsm64K

two_bytes_match_emit_encodeSnappyBlockAsm64K:
	MOVB $0xf0, (AX)
	MOVB DI, 1(AX)
	ADDQ $0x02, AX

	// length-1 < 64 still fits the short copy; otherwise use the long one.
	CMPL DI, $0x40
	JL memmove_match_emit_encodeSnappyBlockAsm64K
	JMP memmove_long_match_emit_encodeSnappyBlockAsm64K

one_byte_match_emit_encodeSnappyBlockAsm64K:
	// Header byte is (length-1) << 2.
	SHLB $0x02, DI
	MOVB DI, (AX)
	ADDQ $0x01, AX

memmove_match_emit_encodeSnappyBlockAsm64K:
	// DI = dst pointer after the literal body.
	LEAQ (AX)(R8*1), DI

	// genMemMoveShort
	// Lengths up to 8 use a single 8-byte load/store, which may touch
	// bytes beyond the literal; AX is then set from DI, i.e. advanced by
	// the true length only.
	CMPQ R8, $0x08
	JLE emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_8
	CMPQ R8, $0x10
	JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_8through16
	CMPQ R8, $0x20
	JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_17through32
	JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_33through64

emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_8:
	MOVQ (SI), R9
	MOVQ R9, (AX)
	JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm64K

emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_8through16:
	// Overlapping head and tail 8-byte moves cover any length in range.
	MOVQ (SI), R9
	MOVQ -8(SI)(R8*1), SI
	MOVQ R9, (AX)
	MOVQ SI, -8(AX)(R8*1)
	JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm64K

emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_17through32:
	MOVOU (SI), X0
	MOVOU -16(SI)(R8*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(R8*1)
	JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm64K

emit_lit_memmove_match_emit_encodeSnappyBlockAsm64K_memmove_move_33through64:
	MOVOU (SI), X0
	MOVOU 16(SI), X1
	MOVOU -32(SI)(R8*1), X2
	MOVOU -16(SI)(R8*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)

memmove_end_copy_match_emit_encodeSnappyBlockAsm64K:
	MOVQ DI, AX
	JMP emit_literal_done_match_emit_encodeSnappyBlockAsm64K

memmove_long_match_emit_encodeSnappyBlockAsm64K:
	// DI = dst pointer after the literal body.
	LEAQ (AX)(R8*1), DI

	// genMemMoveLong
	// The first and last 32 bytes are loaded up front and stored last with
	// unaligned moves. R11 = 64 - (AX & 31), so -32(AX)(R11*1) is 32-byte
	// aligned for the MOVOA stores in the loops.
	MOVOU (SI), X0
	MOVOU 16(SI), X1
	MOVOU -32(SI)(R8*1), X2
	MOVOU -16(SI)(R8*1), X3
	MOVQ R8, R10
	SHRQ $0x05, R10
	MOVQ AX, R9
	ANDL $0x0000001f, R9
	MOVQ $0x00000040, R11
	SUBQ R9, R11
	DECQ R10
	JA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32
	LEAQ -32(SI)(R11*1), R9
	LEAQ -32(AX)(R11*1), R12

emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_big_loop_back:
	MOVOU (R9), X4
	MOVOU 16(R9), X5
	MOVOA X4, (R12)
	MOVOA X5, 16(R12)
	ADDQ $0x20, R12
	ADDQ $0x20, R9
	ADDQ $0x20, R11
	DECQ R10
	JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_big_loop_back

emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32:
	// Copy 32 bytes per iteration while R8 >= R11.
	MOVOU -32(SI)(R11*1), X4
	MOVOU -16(SI)(R11*1), X5
	MOVOA X4, -32(AX)(R11*1)
	MOVOA X5, -16(AX)(R11*1)
	ADDQ $0x20, R11
	CMPQ R8, R11
	JAE emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)
	MOVQ DI, AX

emit_literal_done_match_emit_encodeSnappyBlockAsm64K:
match_nolit_loop_encodeSnappyBlockAsm64K:
	// Match at CX against candidate BX: the distance CX - BX becomes the
	// new repeat offset. Skip the 4 bytes already compared and extend
	// forwards. SI = bytes left in src, DI/BX = pointers to compare.
	MOVL CX, SI
	SUBL BX, SI
	MOVL SI, 16(SP)
	ADDL $0x04, CX
	ADDL $0x04, BX
	MOVQ src_len+32(FP), SI
	SUBL CX, SI
	LEAQ (DX)(CX*1), DI
	LEAQ (DX)(BX*1), BX

	// matchLen
	// R9 accumulates the number of matching bytes.
	XORL R9, R9
	CMPL SI, $0x08
	JL matchlen_match4_match_nolit_encodeSnappyBlockAsm64K

matchlen_loopback_match_nolit_encodeSnappyBlockAsm64K:
	// Compare 8 bytes at a time; a non-zero XOR marks the first mismatch.
	MOVQ (DI)(R9*1), R8
	XORQ (BX)(R9*1), R8
	TESTQ R8, R8
	JZ matchlen_loop_match_nolit_encodeSnappyBlockAsm64K

	// Index of the lowest set bit, divided by 8, is the number of equal
	// leading bytes within this 8-byte word.
#ifdef GOAMD64_v3
	TZCNTQ R8, R8

#else
	BSFQ R8, R8

#endif
	SARQ $0x03, R8
	LEAL (R9)(R8*1), R9
	JMP match_nolit_end_encodeSnappyBlockAsm64K

matchlen_loop_match_nolit_encodeSnappyBlockAsm64K:
	LEAL -8(SI), SI
	LEAL 8(R9), R9
	CMPL SI, $0x08
	JGE matchlen_loopback_match_nolit_encodeSnappyBlockAsm64K

	// NOTE(review): the CMPL sets ZF only when SI == 8, which already
	// takes the JGE above, so this JZ appears unreachable - confirm
	// against the generator.
	JZ match_nolit_end_encodeSnappyBlockAsm64K

matchlen_match4_match_nolit_encodeSnappyBlockAsm64K:
	// Fewer than 8 bytes left: try 4, then 2, then 1.
	CMPL SI, $0x04
	JL matchlen_match2_match_nolit_encodeSnappyBlockAsm64K
	MOVL (DI)(R9*1), R8
	CMPL (BX)(R9*1), R8
	JNE matchlen_match2_match_nolit_encodeSnappyBlockAsm64K
	SUBL $0x04, SI
	LEAL 4(R9), R9

matchlen_match2_match_nolit_encodeSnappyBlockAsm64K:
	CMPL SI, $0x02
	JL matchlen_match1_match_nolit_encodeSnappyBlockAsm64K
	MOVW (DI)(R9*1), R8
	CMPW (BX)(R9*1), R8
	JNE matchlen_match1_match_nolit_encodeSnappyBlockAsm64K
	SUBL $0x02, SI
	LEAL 2(R9), R9

matchlen_match1_match_nolit_encodeSnappyBlockAsm64K:
	CMPL SI, $0x01
	JL match_nolit_end_encodeSnappyBlockAsm64K
	MOVB (DI)(R9*1), R8
	CMPB (BX)(R9*1), R8
	JNE match_nolit_end_encodeSnappyBlockAsm64K
	LEAL 1(R9), R9

match_nolit_end_encodeSnappyBlockAsm64K:
	// CX = end of the match and the new emit position; BX = offset;
	// R9 = total length, including the 4 bytes skipped earlier.
	ADDL R9, CX
	MOVL 16(SP), BX
	ADDL $0x04, R9
	MOVL CX, 12(SP)

	// emitCopy
two_byte_offset_match_nolit_encodeSnappyBlockAsm64K:
	// While more than 64 bytes remain, write tag 0xee plus the 16-bit
	// offset and take 60 off the remaining length.
	CMPL R9, $0x40
	JLE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm64K
	MOVB $0xee, (AX)
	MOVW BX, 1(AX)
	LEAL -60(R9), R9
	ADDQ $0x03, AX
	JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm64K

two_byte_offset_short_match_nolit_encodeSnappyBlockAsm64K:
	// SI = length << 2. Length >= 12 or offset >= 2048 needs the 3-byte
	// form; otherwise the 2-byte form is written: tag =
	// ((length<<2) - 15) | ((offset>>8) << 5), then the low offset byte.
	MOVL R9, SI
	SHLL $0x02, SI
	CMPL R9, $0x0c
	JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm64K
	CMPL BX, $0x00000800
	JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm64K
	LEAL -15(SI), SI
	MOVB BL, 1(AX)
	SHRL $0x08, BX
	SHLL $0x05, BX
	ORL BX, SI
	MOVB SI, (AX)
	ADDQ $0x02, AX
	JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm64K

emit_copy_three_match_nolit_encodeSnappyBlockAsm64K:
	// 3-byte form: tag = (length<<2) - 2, then the 16-bit offset.
	LEAL -2(SI), SI
	MOVB SI, (AX)
	MOVW BX, 1(AX)
	ADDQ $0x03, AX

match_nolit_emitcopy_end_encodeSnappyBlockAsm64K:
	// Past the search limit: emit what is left. Otherwise load 8 bytes at
	// src[CX-2] and return 0 if AX has reached the dst limit.
	CMPL CX, 8(SP)
	JGE emit_remainder_encodeSnappyBlockAsm64K
	MOVQ -2(DX)(CX*1), SI
	CMPQ AX, (SP)
	JL match_nolit_dst_ok_encodeSnappyBlockAsm64K
	MOVQ $0x00000000, ret+48(FP)
	RET

match_nolit_dst_ok_encodeSnappyBlockAsm64K:
	// DI = hash of src[CX-2:], BX = hash of src[CX:]; SI is shifted so
	// that its low 4 bytes are src[CX:CX+4].
	MOVQ $0x0000cf1bbcdcbf9b, R8
	MOVQ SI, DI
	SHRQ $0x10, SI
	MOVQ SI, BX
	SHLQ $0x10, DI
	IMULQ R8, DI
	SHRQ $0x32, DI
	SHLQ $0x10, BX
	IMULQ R8, BX
	SHRQ $0x32, BX

	// Record CX-2 and CX in the table, keeping the previous entry for CX
	// in BX as the next candidate.
	LEAL -2(CX), R8
	LEAQ 24(SP)(BX*4), R9
	MOVL (R9), BX
	MOVL R8, 24(SP)(DI*4)
	MOVL CX, (R9)

	// If the candidate matches 4 bytes at CX, start another match at once
	// with no literal in between; otherwise resume searching at CX+1.
	CMPL (DX)(BX*1), SI
	JEQ match_nolit_loop_encodeSnappyBlockAsm64K
	INCL CX
	JMP search_loop_encodeSnappyBlockAsm64K

emit_remainder_encodeSnappyBlockAsm64K:
	// Output check: AX + (src_len - 12(SP)) + 3 must be below the dst
	// limit in (SP); otherwise return 0.
	MOVQ src_len+32(FP), CX
	SUBL 12(SP), CX
	LEAQ 3(AX)(CX*1), CX
	CMPQ CX, (SP)
	JL emit_remainder_ok_encodeSnappyBlockAsm64K
	MOVQ $0x00000000, ret+48(FP)
	RET

emit_remainder_ok_encodeSnappyBlockAsm64K:
	// Emit src[12(SP):src_len] as the final literal (skipped when empty).
	// From here CX is the literal source pointer and DX is scratch
	// (length-1, then the dst end pointer); SI = literal length.
	MOVQ src_len+32(FP), CX
	MOVL 12(SP), BX
	CMPL BX, CX
	JEQ emit_literal_done_emit_remainder_encodeSnappyBlockAsm64K
	MOVL CX, SI
	MOVL CX, 12(SP)
	LEAQ (DX)(BX*1), CX
	SUBL BX, SI
	LEAL -1(SI), DX

	// Literal header: length-1 < 60 uses one byte, < 256 uses 0xf0 plus
	// one length byte, otherwise 0xf4 plus a 16-bit length.
	CMPL DX, $0x3c
	JLT one_byte_emit_remainder_encodeSnappyBlockAsm64K
	CMPL DX, $0x00000100
	JLT two_bytes_emit_remainder_encodeSnappyBlockAsm64K
	MOVB $0xf4, (AX)
	MOVW DX, 1(AX)
	ADDQ $0x03, AX
	JMP memmove_long_emit_remainder_encodeSnappyBlockAsm64K

two_bytes_emit_remainder_encodeSnappyBlockAsm64K:
	MOVB $0xf0, (AX)
	MOVB DL, 1(AX)
	ADDQ $0x02, AX

	// length-1 < 64 still fits the short copy; otherwise use the long one.
	CMPL DX, $0x40
	JL memmove_emit_remainder_encodeSnappyBlockAsm64K
	JMP memmove_long_emit_remainder_encodeSnappyBlockAsm64K

one_byte_emit_remainder_encodeSnappyBlockAsm64K:
	// Header byte is (length-1) << 2.
	SHLB $0x02, DL
	MOVB DL, (AX)
	ADDQ $0x01, AX

memmove_emit_remainder_encodeSnappyBlockAsm64K:
	// DX = dst pointer after the literal body, BX = length.
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveShort
	// Unlike the in-loop literal copies, this one has separate cases for
	// 1-2, 3 and 4-7 bytes, so no access goes beyond the literal's length.
	CMPQ BX, $0x03
	JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_1or2
	JE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_3
	CMPQ BX, $0x08
	JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_4through7
	CMPQ BX, $0x10
	JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_8through16
	CMPQ BX, $0x20
	JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_17through32
	JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_33through64

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_1or2:
	// First and last byte; for length 1 they are the same byte.
	MOVB (CX), SI
	MOVB -1(CX)(BX*1), CL
	MOVB SI, (AX)
	MOVB CL, -1(AX)(BX*1)
	JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_3:
	MOVW (CX), SI
	MOVB 2(CX), CL
	MOVW SI, (AX)
	MOVB CL, 2(AX)
	JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_4through7:
	// Overlapping head and tail 4-byte moves.
	MOVL (CX), SI
	MOVL -4(CX)(BX*1), CX
	MOVL SI, (AX)
	MOVL CX, -4(AX)(BX*1)
	JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_8through16:
	// Overlapping head and tail 8-byte moves.
	MOVQ (CX), SI
	MOVQ -8(CX)(BX*1), CX
	MOVQ SI, (AX)
	MOVQ CX, -8(AX)(BX*1)
	JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_17through32:
	MOVOU (CX), X0
	MOVOU -16(CX)(BX*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(BX*1)
	JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm64K_memmove_move_33through64:
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)

memmove_end_copy_emit_remainder_encodeSnappyBlockAsm64K:
	MOVQ DX, AX
	JMP emit_literal_done_emit_remainder_encodeSnappyBlockAsm64K

memmove_long_emit_remainder_encodeSnappyBlockAsm64K:
	// DX = dst pointer after the literal body, BX = length.
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveLong
	// The first and last 32 bytes are loaded up front and stored last with
	// unaligned moves. R8 = 64 - (AX & 31), so -32(AX)(R8*1) is 32-byte
	// aligned for the MOVOA stores in the loops.
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	MOVQ BX, DI
	SHRQ $0x05, DI
	MOVQ AX, SI
	ANDL $0x0000001f, SI
	MOVQ $0x00000040, R8
	SUBQ SI, R8
	DECQ DI
	JA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32
	LEAQ -32(CX)(R8*1), SI
	LEAQ -32(AX)(R8*1), R9

emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_big_loop_back:
	MOVOU (SI), X4
	MOVOU 16(SI), X5
	MOVOA X4, (R9)
	MOVOA X5, 16(R9)
	ADDQ $0x20, R9
	ADDQ $0x20, SI
	ADDQ $0x20, R8
	DECQ DI
	JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_big_loop_back

emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32:
	// Copy 32 bytes per iteration while BX >= R8.
	MOVOU -32(CX)(R8*1), X4
	MOVOU -16(CX)(R8*1), X5
	MOVOA X4, -32(AX)(R8*1)
	MOVOA X5, -16(AX)(R8*1)
	ADDQ $0x20, R8
	CMPQ BX, R8
	JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm64Klarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)
	MOVQ DX, AX

emit_literal_done_emit_remainder_encodeSnappyBlockAsm64K:
	// Return the number of bytes written: AX - dst_base.
	MOVQ dst_base+0(FP), CX
	SUBQ CX, AX
	MOVQ AX, ret+48(FP)
	RET

// func encodeSnappyBlockAsm12B(dst []byte, src []byte) int // Requires: BMI, SSE2 TEXT ·encodeSnappyBlockAsm12B(SB), $16408-56 MOVQ dst_base+0(FP), AX MOVQ $0x00000080, CX LEAQ 24(SP), DX PXOR X0, X0 zero_loop_encodeSnappyBlockAsm12B: MOVOU X0, (DX) MOVOU X0, 16(DX) MOVOU X0, 32(DX) MOVOU X0, 48(DX) MOVOU X0, 64(DX) MOVOU X0, 80(DX) MOVOU X0, 96(DX) MOVOU X0, 112(DX) ADDQ $0x80, DX DECQ CX JNZ zero_loop_encodeSnappyBlockAsm12B MOVL $0x00000000, 12(SP) MOVQ src_len+32(FP), CX LEAQ -9(CX), DX LEAQ -8(CX), BX MOVL BX, 8(SP) SHRQ $0x05, CX SUBL CX, DX LEAQ (AX)(DX*1), DX MOVQ DX, (SP) MOVL $0x00000001, CX MOVL CX, 16(SP) MOVQ src_base+24(FP), DX search_loop_encodeSnappyBlockAsm12B: MOVL CX, BX SUBL 12(SP), BX SHRL $0x05, BX LEAL 4(CX)(BX*1), BX CMPL BX, 8(SP) JGE emit_remainder_encodeSnappyBlockAsm12B MOVQ (DX)(CX*1), SI MOVL BX, 20(SP) MOVQ $0x000000cf1bbcdcbb, R8 MOVQ SI, R9 MOVQ SI, R10 SHRQ $0x08, R10 SHLQ $0x18, R9 IMULQ R8, R9 SHRQ $0x34, R9 SHLQ $0x18, R10 IMULQ R8, R10 SHRQ $0x34, R10 MOVL 24(SP)(R9*4), BX MOVL 24(SP)(R10*4), DI MOVL CX, 24(SP)(R9*4) LEAL 1(CX), R9 MOVL R9, 24(SP)(R10*4) MOVQ SI, R9 SHRQ $0x10, R9 SHLQ $0x18, R9 IMULQ R8, R9 SHRQ $0x34, R9 MOVL CX, R8 SUBL 16(SP), R8 MOVL 1(DX)(R8*1), R10 MOVQ SI, R8 SHRQ $0x08, R8 CMPL R8, R10 JNE
no_repeat_found_encodeSnappyBlockAsm12B LEAL 1(CX), SI MOVL 12(SP), BX MOVL SI, DI SUBL 16(SP), DI JZ repeat_extend_back_end_encodeSnappyBlockAsm12B repeat_extend_back_loop_encodeSnappyBlockAsm12B: CMPL SI, BX JLE repeat_extend_back_end_encodeSnappyBlockAsm12B MOVB -1(DX)(DI*1), R8 MOVB -1(DX)(SI*1), R9 CMPB R8, R9 JNE repeat_extend_back_end_encodeSnappyBlockAsm12B LEAL -1(SI), SI DECL DI JNZ repeat_extend_back_loop_encodeSnappyBlockAsm12B repeat_extend_back_end_encodeSnappyBlockAsm12B: MOVL 12(SP), BX CMPL BX, SI JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm12B MOVL SI, DI MOVL SI, 12(SP) LEAQ (DX)(BX*1), R8 SUBL BX, DI LEAL -1(DI), BX CMPL BX, $0x3c JLT one_byte_repeat_emit_encodeSnappyBlockAsm12B CMPL BX, $0x00000100 JLT two_bytes_repeat_emit_encodeSnappyBlockAsm12B MOVB $0xf4, (AX) MOVW BX, 1(AX) ADDQ $0x03, AX JMP memmove_long_repeat_emit_encodeSnappyBlockAsm12B two_bytes_repeat_emit_encodeSnappyBlockAsm12B: MOVB $0xf0, (AX) MOVB BL, 1(AX) ADDQ $0x02, AX CMPL BX, $0x40 JL memmove_repeat_emit_encodeSnappyBlockAsm12B JMP memmove_long_repeat_emit_encodeSnappyBlockAsm12B one_byte_repeat_emit_encodeSnappyBlockAsm12B: SHLB $0x02, BL MOVB BL, (AX) ADDQ $0x01, AX memmove_repeat_emit_encodeSnappyBlockAsm12B: LEAQ (AX)(DI*1), BX // genMemMoveShort CMPQ DI, $0x08 JLE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_8 CMPQ DI, $0x10 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_8through16 CMPQ DI, $0x20 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_17through32 JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_33through64 emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_8: MOVQ (R8), R9 MOVQ R9, (AX) JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_8through16: MOVQ (R8), R9 MOVQ -8(R8)(DI*1), R8 MOVQ R9, (AX) MOVQ R8, -8(AX)(DI*1) JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B 
emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_17through32: MOVOU (R8), X0 MOVOU -16(R8)(DI*1), X1 MOVOU X0, (AX) MOVOU X1, -16(AX)(DI*1) JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm12B_memmove_move_33through64: MOVOU (R8), X0 MOVOU 16(R8), X1 MOVOU -32(R8)(DI*1), X2 MOVOU -16(R8)(DI*1), X3 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(DI*1) MOVOU X3, -16(AX)(DI*1) memmove_end_copy_repeat_emit_encodeSnappyBlockAsm12B: MOVQ BX, AX JMP emit_literal_done_repeat_emit_encodeSnappyBlockAsm12B memmove_long_repeat_emit_encodeSnappyBlockAsm12B: LEAQ (AX)(DI*1), BX // genMemMoveLong MOVOU (R8), X0 MOVOU 16(R8), X1 MOVOU -32(R8)(DI*1), X2 MOVOU -16(R8)(DI*1), X3 MOVQ DI, R10 SHRQ $0x05, R10 MOVQ AX, R9 ANDL $0x0000001f, R9 MOVQ $0x00000040, R11 SUBQ R9, R11 DECQ R10 JA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32 LEAQ -32(R8)(R11*1), R9 LEAQ -32(AX)(R11*1), R12 emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_big_loop_back: MOVOU (R9), X4 MOVOU 16(R9), X5 MOVOA X4, (R12) MOVOA X5, 16(R12) ADDQ $0x20, R12 ADDQ $0x20, R9 ADDQ $0x20, R11 DECQ R10 JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_big_loop_back emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32: MOVOU -32(R8)(R11*1), X4 MOVOU -16(R8)(R11*1), X5 MOVOA X4, -32(AX)(R11*1) MOVOA X5, -16(AX)(R11*1) ADDQ $0x20, R11 CMPQ DI, R11 JAE emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(DI*1) MOVOU X3, -16(AX)(DI*1) MOVQ BX, AX emit_literal_done_repeat_emit_encodeSnappyBlockAsm12B: ADDL $0x05, CX MOVL CX, BX SUBL 16(SP), BX MOVQ src_len+32(FP), DI SUBL CX, DI LEAQ (DX)(CX*1), R8 LEAQ (DX)(BX*1), BX // matchLen XORL R10, R10 CMPL DI, $0x08 JL matchlen_match4_repeat_extend_encodeSnappyBlockAsm12B matchlen_loopback_repeat_extend_encodeSnappyBlockAsm12B: MOVQ 
(R8)(R10*1), R9 XORQ (BX)(R10*1), R9 TESTQ R9, R9 JZ matchlen_loop_repeat_extend_encodeSnappyBlockAsm12B #ifdef GOAMD64_v3 TZCNTQ R9, R9 #else BSFQ R9, R9 #endif SARQ $0x03, R9 LEAL (R10)(R9*1), R10 JMP repeat_extend_forward_end_encodeSnappyBlockAsm12B matchlen_loop_repeat_extend_encodeSnappyBlockAsm12B: LEAL -8(DI), DI LEAL 8(R10), R10 CMPL DI, $0x08 JGE matchlen_loopback_repeat_extend_encodeSnappyBlockAsm12B JZ repeat_extend_forward_end_encodeSnappyBlockAsm12B matchlen_match4_repeat_extend_encodeSnappyBlockAsm12B: CMPL DI, $0x04 JL matchlen_match2_repeat_extend_encodeSnappyBlockAsm12B MOVL (R8)(R10*1), R9 CMPL (BX)(R10*1), R9 JNE matchlen_match2_repeat_extend_encodeSnappyBlockAsm12B SUBL $0x04, DI LEAL 4(R10), R10 matchlen_match2_repeat_extend_encodeSnappyBlockAsm12B: CMPL DI, $0x02 JL matchlen_match1_repeat_extend_encodeSnappyBlockAsm12B MOVW (R8)(R10*1), R9 CMPW (BX)(R10*1), R9 JNE matchlen_match1_repeat_extend_encodeSnappyBlockAsm12B SUBL $0x02, DI LEAL 2(R10), R10 matchlen_match1_repeat_extend_encodeSnappyBlockAsm12B: CMPL DI, $0x01 JL repeat_extend_forward_end_encodeSnappyBlockAsm12B MOVB (R8)(R10*1), R9 CMPB (BX)(R10*1), R9 JNE repeat_extend_forward_end_encodeSnappyBlockAsm12B LEAL 1(R10), R10 repeat_extend_forward_end_encodeSnappyBlockAsm12B: ADDL R10, CX MOVL CX, BX SUBL SI, BX MOVL 16(SP), SI // emitCopy two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm12B: CMPL BX, $0x40 JLE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm12B MOVB $0xee, (AX) MOVW SI, 1(AX) LEAL -60(BX), BX ADDQ $0x03, AX JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm12B two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm12B: MOVL BX, DI SHLL $0x02, DI CMPL BX, $0x0c JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm12B CMPL SI, $0x00000800 JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm12B LEAL -15(DI), DI MOVB SI, 1(AX) SHRL $0x08, SI SHLL $0x05, SI ORL SI, DI MOVB DI, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeSnappyBlockAsm12B 
emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm12B: LEAL -2(DI), DI MOVB DI, (AX) MOVW SI, 1(AX) ADDQ $0x03, AX repeat_end_emit_encodeSnappyBlockAsm12B: MOVL CX, 12(SP) JMP search_loop_encodeSnappyBlockAsm12B no_repeat_found_encodeSnappyBlockAsm12B: CMPL (DX)(BX*1), SI JEQ candidate_match_encodeSnappyBlockAsm12B SHRQ $0x08, SI MOVL 24(SP)(R9*4), BX LEAL 2(CX), R8 CMPL (DX)(DI*1), SI JEQ candidate2_match_encodeSnappyBlockAsm12B MOVL R8, 24(SP)(R9*4) SHRQ $0x08, SI CMPL (DX)(BX*1), SI JEQ candidate3_match_encodeSnappyBlockAsm12B MOVL 20(SP), CX JMP search_loop_encodeSnappyBlockAsm12B candidate3_match_encodeSnappyBlockAsm12B: ADDL $0x02, CX JMP candidate_match_encodeSnappyBlockAsm12B candidate2_match_encodeSnappyBlockAsm12B: MOVL R8, 24(SP)(R9*4) INCL CX MOVL DI, BX candidate_match_encodeSnappyBlockAsm12B: MOVL 12(SP), SI TESTL BX, BX JZ match_extend_back_end_encodeSnappyBlockAsm12B match_extend_back_loop_encodeSnappyBlockAsm12B: CMPL CX, SI JLE match_extend_back_end_encodeSnappyBlockAsm12B MOVB -1(DX)(BX*1), DI MOVB -1(DX)(CX*1), R8 CMPB DI, R8 JNE match_extend_back_end_encodeSnappyBlockAsm12B LEAL -1(CX), CX DECL BX JZ match_extend_back_end_encodeSnappyBlockAsm12B JMP match_extend_back_loop_encodeSnappyBlockAsm12B match_extend_back_end_encodeSnappyBlockAsm12B: MOVL CX, SI SUBL 12(SP), SI LEAQ 3(AX)(SI*1), SI CMPQ SI, (SP) JL match_dst_size_check_encodeSnappyBlockAsm12B MOVQ $0x00000000, ret+48(FP) RET match_dst_size_check_encodeSnappyBlockAsm12B: MOVL CX, SI MOVL 12(SP), DI CMPL DI, SI JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm12B MOVL SI, R8 MOVL SI, 12(SP) LEAQ (DX)(DI*1), SI SUBL DI, R8 LEAL -1(R8), DI CMPL DI, $0x3c JLT one_byte_match_emit_encodeSnappyBlockAsm12B CMPL DI, $0x00000100 JLT two_bytes_match_emit_encodeSnappyBlockAsm12B MOVB $0xf4, (AX) MOVW DI, 1(AX) ADDQ $0x03, AX JMP memmove_long_match_emit_encodeSnappyBlockAsm12B two_bytes_match_emit_encodeSnappyBlockAsm12B: MOVB $0xf0, (AX) MOVB DI, 1(AX) ADDQ $0x02, AX CMPL DI, $0x40 JL 
memmove_match_emit_encodeSnappyBlockAsm12B JMP memmove_long_match_emit_encodeSnappyBlockAsm12B one_byte_match_emit_encodeSnappyBlockAsm12B: SHLB $0x02, DI MOVB DI, (AX) ADDQ $0x01, AX memmove_match_emit_encodeSnappyBlockAsm12B: LEAQ (AX)(R8*1), DI // genMemMoveShort CMPQ R8, $0x08 JLE emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_8 CMPQ R8, $0x10 JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_8through16 CMPQ R8, $0x20 JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_17through32 JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_33through64 emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_8: MOVQ (SI), R9 MOVQ R9, (AX) JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm12B emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_8through16: MOVQ (SI), R9 MOVQ -8(SI)(R8*1), SI MOVQ R9, (AX) MOVQ SI, -8(AX)(R8*1) JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm12B emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_17through32: MOVOU (SI), X0 MOVOU -16(SI)(R8*1), X1 MOVOU X0, (AX) MOVOU X1, -16(AX)(R8*1) JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm12B emit_lit_memmove_match_emit_encodeSnappyBlockAsm12B_memmove_move_33through64: MOVOU (SI), X0 MOVOU 16(SI), X1 MOVOU -32(SI)(R8*1), X2 MOVOU -16(SI)(R8*1), X3 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(R8*1) MOVOU X3, -16(AX)(R8*1) memmove_end_copy_match_emit_encodeSnappyBlockAsm12B: MOVQ DI, AX JMP emit_literal_done_match_emit_encodeSnappyBlockAsm12B memmove_long_match_emit_encodeSnappyBlockAsm12B: LEAQ (AX)(R8*1), DI // genMemMoveLong MOVOU (SI), X0 MOVOU 16(SI), X1 MOVOU -32(SI)(R8*1), X2 MOVOU -16(SI)(R8*1), X3 MOVQ R8, R10 SHRQ $0x05, R10 MOVQ AX, R9 ANDL $0x0000001f, R9 MOVQ $0x00000040, R11 SUBQ R9, R11 DECQ R10 JA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32 LEAQ -32(SI)(R11*1), R9 LEAQ -32(AX)(R11*1), R12 
emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_big_loop_back: MOVOU (R9), X4 MOVOU 16(R9), X5 MOVOA X4, (R12) MOVOA X5, 16(R12) ADDQ $0x20, R12 ADDQ $0x20, R9 ADDQ $0x20, R11 DECQ R10 JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_big_loop_back emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32: MOVOU -32(SI)(R11*1), X4 MOVOU -16(SI)(R11*1), X5 MOVOA X4, -32(AX)(R11*1) MOVOA X5, -16(AX)(R11*1) ADDQ $0x20, R11 CMPQ R8, R11 JAE emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(R8*1) MOVOU X3, -16(AX)(R8*1) MOVQ DI, AX emit_literal_done_match_emit_encodeSnappyBlockAsm12B: match_nolit_loop_encodeSnappyBlockAsm12B: MOVL CX, SI SUBL BX, SI MOVL SI, 16(SP) ADDL $0x04, CX ADDL $0x04, BX MOVQ src_len+32(FP), SI SUBL CX, SI LEAQ (DX)(CX*1), DI LEAQ (DX)(BX*1), BX // matchLen XORL R9, R9 CMPL SI, $0x08 JL matchlen_match4_match_nolit_encodeSnappyBlockAsm12B matchlen_loopback_match_nolit_encodeSnappyBlockAsm12B: MOVQ (DI)(R9*1), R8 XORQ (BX)(R9*1), R8 TESTQ R8, R8 JZ matchlen_loop_match_nolit_encodeSnappyBlockAsm12B #ifdef GOAMD64_v3 TZCNTQ R8, R8 #else BSFQ R8, R8 #endif SARQ $0x03, R8 LEAL (R9)(R8*1), R9 JMP match_nolit_end_encodeSnappyBlockAsm12B matchlen_loop_match_nolit_encodeSnappyBlockAsm12B: LEAL -8(SI), SI LEAL 8(R9), R9 CMPL SI, $0x08 JGE matchlen_loopback_match_nolit_encodeSnappyBlockAsm12B JZ match_nolit_end_encodeSnappyBlockAsm12B matchlen_match4_match_nolit_encodeSnappyBlockAsm12B: CMPL SI, $0x04 JL matchlen_match2_match_nolit_encodeSnappyBlockAsm12B MOVL (DI)(R9*1), R8 CMPL (BX)(R9*1), R8 JNE matchlen_match2_match_nolit_encodeSnappyBlockAsm12B SUBL $0x04, SI LEAL 4(R9), R9 matchlen_match2_match_nolit_encodeSnappyBlockAsm12B: CMPL SI, $0x02 JL matchlen_match1_match_nolit_encodeSnappyBlockAsm12B MOVW (DI)(R9*1), R8 CMPW (BX)(R9*1), R8 JNE matchlen_match1_match_nolit_encodeSnappyBlockAsm12B SUBL $0x02, SI LEAL 2(R9), R9 
matchlen_match1_match_nolit_encodeSnappyBlockAsm12B: CMPL SI, $0x01 JL match_nolit_end_encodeSnappyBlockAsm12B MOVB (DI)(R9*1), R8 CMPB (BX)(R9*1), R8 JNE match_nolit_end_encodeSnappyBlockAsm12B LEAL 1(R9), R9 match_nolit_end_encodeSnappyBlockAsm12B: ADDL R9, CX MOVL 16(SP), BX ADDL $0x04, R9 MOVL CX, 12(SP) // emitCopy two_byte_offset_match_nolit_encodeSnappyBlockAsm12B: CMPL R9, $0x40 JLE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm12B MOVB $0xee, (AX) MOVW BX, 1(AX) LEAL -60(R9), R9 ADDQ $0x03, AX JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm12B two_byte_offset_short_match_nolit_encodeSnappyBlockAsm12B: MOVL R9, SI SHLL $0x02, SI CMPL R9, $0x0c JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm12B CMPL BX, $0x00000800 JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm12B LEAL -15(SI), SI MOVB BL, 1(AX) SHRL $0x08, BX SHLL $0x05, BX ORL BX, SI MOVB SI, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm12B emit_copy_three_match_nolit_encodeSnappyBlockAsm12B: LEAL -2(SI), SI MOVB SI, (AX) MOVW BX, 1(AX) ADDQ $0x03, AX match_nolit_emitcopy_end_encodeSnappyBlockAsm12B: CMPL CX, 8(SP) JGE emit_remainder_encodeSnappyBlockAsm12B MOVQ -2(DX)(CX*1), SI CMPQ AX, (SP) JL match_nolit_dst_ok_encodeSnappyBlockAsm12B MOVQ $0x00000000, ret+48(FP) RET match_nolit_dst_ok_encodeSnappyBlockAsm12B: MOVQ $0x000000cf1bbcdcbb, R8 MOVQ SI, DI SHRQ $0x10, SI MOVQ SI, BX SHLQ $0x18, DI IMULQ R8, DI SHRQ $0x34, DI SHLQ $0x18, BX IMULQ R8, BX SHRQ $0x34, BX LEAL -2(CX), R8 LEAQ 24(SP)(BX*4), R9 MOVL (R9), BX MOVL R8, 24(SP)(DI*4) MOVL CX, (R9) CMPL (DX)(BX*1), SI JEQ match_nolit_loop_encodeSnappyBlockAsm12B INCL CX JMP search_loop_encodeSnappyBlockAsm12B emit_remainder_encodeSnappyBlockAsm12B: MOVQ src_len+32(FP), CX SUBL 12(SP), CX LEAQ 3(AX)(CX*1), CX CMPQ CX, (SP) JL emit_remainder_ok_encodeSnappyBlockAsm12B MOVQ $0x00000000, ret+48(FP) RET emit_remainder_ok_encodeSnappyBlockAsm12B: MOVQ src_len+32(FP), CX MOVL 12(SP), BX CMPL BX, CX JEQ 
emit_literal_done_emit_remainder_encodeSnappyBlockAsm12B MOVL CX, SI MOVL CX, 12(SP) LEAQ (DX)(BX*1), CX SUBL BX, SI LEAL -1(SI), DX CMPL DX, $0x3c JLT one_byte_emit_remainder_encodeSnappyBlockAsm12B CMPL DX, $0x00000100 JLT two_bytes_emit_remainder_encodeSnappyBlockAsm12B MOVB $0xf4, (AX) MOVW DX, 1(AX) ADDQ $0x03, AX JMP memmove_long_emit_remainder_encodeSnappyBlockAsm12B two_bytes_emit_remainder_encodeSnappyBlockAsm12B: MOVB $0xf0, (AX) MOVB DL, 1(AX) ADDQ $0x02, AX CMPL DX, $0x40 JL memmove_emit_remainder_encodeSnappyBlockAsm12B JMP memmove_long_emit_remainder_encodeSnappyBlockAsm12B one_byte_emit_remainder_encodeSnappyBlockAsm12B: SHLB $0x02, DL MOVB DL, (AX) ADDQ $0x01, AX memmove_emit_remainder_encodeSnappyBlockAsm12B: LEAQ (AX)(SI*1), DX MOVL SI, BX // genMemMoveShort CMPQ BX, $0x03 JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_1or2 JE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_3 CMPQ BX, $0x08 JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_4through7 CMPQ BX, $0x10 JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_8through16 CMPQ BX, $0x20 JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_17through32 JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_33through64 emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_1or2: MOVB (CX), SI MOVB -1(CX)(BX*1), CL MOVB SI, (AX) MOVB CL, -1(AX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_3: MOVW (CX), SI MOVB 2(CX), CL MOVW SI, (AX) MOVB CL, 2(AX) JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_4through7: MOVL (CX), SI MOVL -4(CX)(BX*1), CX MOVL SI, (AX) MOVL CX, -4(AX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B 
emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_8through16: MOVQ (CX), SI MOVQ -8(CX)(BX*1), CX MOVQ SI, (AX) MOVQ CX, -8(AX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_17through32: MOVOU (CX), X0 MOVOU -16(CX)(BX*1), X1 MOVOU X0, (AX) MOVOU X1, -16(AX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm12B_memmove_move_33through64: MOVOU (CX), X0 MOVOU 16(CX), X1 MOVOU -32(CX)(BX*1), X2 MOVOU -16(CX)(BX*1), X3 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(BX*1) MOVOU X3, -16(AX)(BX*1) memmove_end_copy_emit_remainder_encodeSnappyBlockAsm12B: MOVQ DX, AX JMP emit_literal_done_emit_remainder_encodeSnappyBlockAsm12B memmove_long_emit_remainder_encodeSnappyBlockAsm12B: LEAQ (AX)(SI*1), DX MOVL SI, BX // genMemMoveLong MOVOU (CX), X0 MOVOU 16(CX), X1 MOVOU -32(CX)(BX*1), X2 MOVOU -16(CX)(BX*1), X3 MOVQ BX, DI SHRQ $0x05, DI MOVQ AX, SI ANDL $0x0000001f, SI MOVQ $0x00000040, R8 SUBQ SI, R8 DECQ DI JA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32 LEAQ -32(CX)(R8*1), SI LEAQ -32(AX)(R8*1), R9 emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_big_loop_back: MOVOU (SI), X4 MOVOU 16(SI), X5 MOVOA X4, (R9) MOVOA X5, 16(R9) ADDQ $0x20, R9 ADDQ $0x20, SI ADDQ $0x20, R8 DECQ DI JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_big_loop_back emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32: MOVOU -32(CX)(R8*1), X4 MOVOU -16(CX)(R8*1), X5 MOVOA X4, -32(AX)(R8*1) MOVOA X5, -16(AX)(R8*1) ADDQ $0x20, R8 CMPQ BX, R8 JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm12Blarge_forward_sse_loop_32 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(BX*1) MOVOU X3, -16(AX)(BX*1) MOVQ DX, AX emit_literal_done_emit_remainder_encodeSnappyBlockAsm12B: MOVQ dst_base+0(FP), CX SUBQ CX, AX MOVQ 
AX, ret+48(FP)
	RET

// func encodeSnappyBlockAsm10B(dst []byte, src []byte) int
// Requires: BMI, SSE2
//
// Encodes src into dst and stores in ret the number of bytes written
// (final write pointer minus dst_base). ret is set to 0 instead when one of
// the output-limit checks against 0(SP) fails.
// NOTE(review): the tag bytes written below look like Snappy literal/copy
// elements, as the function name suggests; confirm against the generator.
//
// Frame layout, as established by the stores in the prologue and loop:
//   0(SP)   8 bytes     output limit: dst_base + src_len - 9 - (src_len >> 5)
//   8(SP)   4 bytes     search limit: src_len - 8
//   12(SP)  4 bytes     index of the first src byte not yet emitted
//   16(SP)  4 bytes     current repeat offset (initialised to 1)
//   20(SP)  4 bytes     next search position, used when no candidate matches
//   24(SP)  4096 bytes  table of 1024 32-bit src indexes, indexed by a 10-bit hash
//
// Register use in the main loop: AX = dst write pointer, CX = current src
// index, DX = src base pointer.
TEXT ·encodeSnappyBlockAsm10B(SB), $4120-56
	MOVQ dst_base+0(FP), AX
	// 0x20 iterations of 0x80 bytes clear the 4096-byte table at 24(SP).
	MOVQ $0x00000020, CX
	LEAQ 24(SP), DX
	PXOR X0, X0

zero_loop_encodeSnappyBlockAsm10B:
	MOVOU X0, (DX)
	MOVOU X0, 16(DX)
	MOVOU X0, 32(DX)
	MOVOU X0, 48(DX)
	MOVOU X0, 64(DX)
	MOVOU X0, 80(DX)
	MOVOU X0, 96(DX)
	MOVOU X0, 112(DX)
	ADDQ $0x80, DX
	DECQ CX
	JNZ zero_loop_encodeSnappyBlockAsm10B
	// Nothing emitted yet.
	MOVL $0x00000000, 12(SP)
	MOVQ src_len+32(FP), CX
	LEAQ -9(CX), DX
	LEAQ -8(CX), BX
	// Search limit = src_len - 8.
	MOVL BX, 8(SP)
	// Output limit = dst_base + (src_len - 9) - (src_len >> 5).
	SHRQ $0x05, CX
	SUBL CX, DX
	LEAQ (AX)(DX*1), DX
	MOVQ DX, (SP)
	// Start searching at index 1 with a repeat offset of 1.
	MOVL $0x00000001, CX
	MOVL CX, 16(SP)
	MOVQ src_base+24(FP), DX

search_loop_encodeSnappyBlockAsm10B:
	// Next position = CX + 4 + ((CX - pending start) >> 5); stop searching
	// once it reaches the search limit.
	MOVL CX, BX
	SUBL 12(SP), BX
	SHRL $0x05, BX
	LEAL 4(CX)(BX*1), BX
	CMPL BX, 8(SP)
	JGE emit_remainder_encodeSnappyBlockAsm10B
	// SI = 8 bytes of src at CX.
	MOVQ (DX)(CX*1), SI
	MOVL BX, 20(SP)
	// Hash the low 4 bytes at CX (R9) and at CX+1 (R10):
	// ((v << 32) * 0x9e3779b1) >> 54 leaves a 10-bit table index.
	MOVQ $0x9e3779b1, R8
	MOVQ SI, R9
	MOVQ SI, R10
	SHRQ $0x08, R10
	SHLQ $0x20, R9
	IMULQ R8, R9
	SHRQ $0x36, R9
	SHLQ $0x20, R10
	IMULQ R8, R10
	SHRQ $0x36, R10
	// Fetch the previous table entries (BX, DI), then record CX and CX+1.
	MOVL 24(SP)(R9*4), BX
	MOVL 24(SP)(R10*4), DI
	MOVL CX, 24(SP)(R9*4)
	LEAL 1(CX), R9
	MOVL R9, 24(SP)(R10*4)
	// R9 = hash of the 4 bytes at CX+2, used in no_repeat_found below.
	MOVQ SI, R9
	SHRQ $0x10, R9
	SHLQ $0x20, R9
	IMULQ R8, R9
	SHRQ $0x36, R9
	// Repeat check: compare the 4 bytes at CX+1 with the 4 bytes at
	// CX+1 - repeat offset.
	MOVL CX, R8
	SUBL 16(SP), R8
	MOVL 1(DX)(R8*1), R10
	MOVQ SI, R8
	SHRQ $0x08, R8
	CMPL R8, R10
	JNE no_repeat_found_encodeSnappyBlockAsm10B
	// Repeat found at CX+1. Extend it backwards: SI walks down from CX+1 and
	// DI from CX+1 - offset while the preceding bytes match, SI stays above
	// the pending start (BX) and DI stays above 0.
	LEAL 1(CX), SI
	MOVL 12(SP), BX
	MOVL SI, DI
	SUBL 16(SP), DI
	JZ repeat_extend_back_end_encodeSnappyBlockAsm10B

repeat_extend_back_loop_encodeSnappyBlockAsm10B:
	CMPL SI, BX
	JLE repeat_extend_back_end_encodeSnappyBlockAsm10B
	MOVB -1(DX)(DI*1), R8
	MOVB -1(DX)(SI*1), R9
	CMPB R8, R9
	JNE repeat_extend_back_end_encodeSnappyBlockAsm10B
	LEAL -1(SI), SI
	DECL DI
	JNZ repeat_extend_back_loop_encodeSnappyBlockAsm10B

repeat_extend_back_end_encodeSnappyBlockAsm10B:
	// Emit the pending bytes src[12(SP):SI] as a literal (skipped if empty).
	// DI = length, R8 = source pointer, BX = length - 1 for the tag.
	MOVL 12(SP), BX
	CMPL BX, SI
	JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm10B
	MOVL SI, DI
	MOVL SI, 12(SP)
	LEAQ (DX)(BX*1), R8
	SUBL BX, DI
	LEAL -1(DI), BX
	// Header size depends on length-1: < 60 one byte, < 256 two bytes,
	// otherwise three bytes (tag 0xf4 + 16-bit length-1).
	CMPL BX, $0x3c
	JLT one_byte_repeat_emit_encodeSnappyBlockAsm10B
	CMPL BX, $0x00000100
	JLT two_bytes_repeat_emit_encodeSnappyBlockAsm10B
	MOVB $0xf4, (AX)
	MOVW BX, 1(AX)
	ADDQ $0x03, AX
	JMP memmove_long_repeat_emit_encodeSnappyBlockAsm10B

two_bytes_repeat_emit_encodeSnappyBlockAsm10B:
	// Tag 0xf0 followed by one byte holding length-1.
	MOVB $0xf0, (AX)
	MOVB BL, 1(AX)
	ADDQ $0x02, AX
	// Up to 64 bytes still fits the short copy below.
	CMPL BX, $0x40
	JL memmove_repeat_emit_encodeSnappyBlockAsm10B
	JMP memmove_long_repeat_emit_encodeSnappyBlockAsm10B

one_byte_repeat_emit_encodeSnappyBlockAsm10B:
	// Single tag byte: (length-1) << 2.
	SHLB $0x02, BL
	MOVB BL, (AX)
	ADDQ $0x01, AX

memmove_repeat_emit_encodeSnappyBlockAsm10B:
	// BX = write pointer after the literal body.
	LEAQ (AX)(DI*1), BX

	// genMemMoveShort
	// Copies DI bytes from (R8) to (AX) using overlapping head/tail moves.
	CMPQ DI, $0x08
	JLE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_8
	CMPQ DI, $0x10
	JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_8through16
	CMPQ DI, $0x20
	JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_17through32
	JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_33through64

emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_8:
	// Lengths <= 8 always move a full 8 bytes; AX still advances by DI only.
	// NOTE(review): presumably safe because of the src_len-8 search limit and
	// the output margin — confirm.
	MOVQ (R8), R9
	MOVQ R9, (AX)
	JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B

emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_8through16:
	MOVQ (R8), R9
	MOVQ -8(R8)(DI*1), R8
	MOVQ R9, (AX)
	MOVQ R8, -8(AX)(DI*1)
	JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B

emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_17through32:
	MOVOU (R8), X0
	MOVOU -16(R8)(DI*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(DI*1)
	JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B

emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm10B_memmove_move_33through64:
	MOVOU (R8), X0
	MOVOU 16(R8), X1
	MOVOU -32(R8)(DI*1), X2
	MOVOU -16(R8)(DI*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(DI*1)
	MOVOU X3, -16(AX)(DI*1)

memmove_end_copy_repeat_emit_encodeSnappyBlockAsm10B:
	MOVQ BX, AX
	JMP emit_literal_done_repeat_emit_encodeSnappyBlockAsm10B

memmove_long_repeat_emit_encodeSnappyBlockAsm10B:
	// BX = write pointer after the literal body.
	LEAQ (AX)(DI*1), BX

	// genMemMoveLong
	// The first and last 32 bytes are loaded into X0-X3 up front and stored
	// last; the middle is copied in 32-byte steps. R11 = 64 - (AX & 31), so
	// the MOVOA stores at AX+R11-32 land on 32-byte aligned addresses.
	MOVOU (R8), X0
	MOVOU 16(R8), X1
	MOVOU -32(R8)(DI*1), X2
	MOVOU -16(R8)(DI*1), X3
	MOVQ DI, R10
	SHRQ $0x05, R10
	MOVQ AX, R9
	ANDL $0x0000001f, R9
	MOVQ $0x00000040, R11
	SUBQ R9, R11
	DECQ R10
	JA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32
	LEAQ -32(R8)(R11*1), R9
	LEAQ -32(AX)(R11*1), R12

emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_big_loop_back:
	MOVOU (R9), X4
	MOVOU 16(R9), X5
	MOVOA X4, (R12)
	MOVOA X5, 16(R12)
	ADDQ $0x20, R12
	ADDQ $0x20, R9
	ADDQ $0x20, R11
	DECQ R10
	JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_big_loop_back

emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32:
	MOVOU -32(R8)(R11*1), X4
	MOVOU -16(R8)(R11*1), X5
	MOVOA X4, -32(AX)(R11*1)
	MOVOA X5, -16(AX)(R11*1)
	ADDQ $0x20, R11
	CMPQ DI, R11
	JAE emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32
	// Store the saved head and tail, then advance the write pointer.
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(DI*1)
	MOVOU X3, -16(AX)(DI*1)
	MOVQ BX, AX

emit_literal_done_repeat_emit_encodeSnappyBlockAsm10B:
	// Skip the 4 bytes already compared (CX+1 .. CX+4) and extend forward.
	// DI = bytes left in src, R8 = pointer at CX, BX = pointer at CX - offset.
	ADDL $0x05, CX
	MOVL CX, BX
	SUBL 16(SP), BX
	MOVQ src_len+32(FP), DI
	SUBL CX, DI
	LEAQ (DX)(CX*1), R8
	LEAQ (DX)(BX*1), BX

	// matchLen
	// R10 counts matching bytes; 8 bytes are compared per iteration.
	XORL R10, R10
	CMPL DI, $0x08
	JL matchlen_match4_repeat_extend_encodeSnappyBlockAsm10B

matchlen_loopback_repeat_extend_encodeSnappyBlockAsm10B:
	MOVQ (R8)(R10*1), R9
	XORQ (BX)(R10*1), R9
	TESTQ R9, R9
	JZ matchlen_loop_repeat_extend_encodeSnappyBlockAsm10B

	// Index of the lowest differing bit, divided by 8, is the number of
	// equal leading bytes in this 8-byte word.
#ifdef GOAMD64_v3
	TZCNTQ R9, R9

#else
	BSFQ R9, R9

#endif
	SARQ $0x03, R9
	LEAL (R10)(R9*1), R10
	JMP repeat_extend_forward_end_encodeSnappyBlockAsm10B

matchlen_loop_repeat_extend_encodeSnappyBlockAsm10B:
	LEAL -8(DI), DI
	LEAL 8(R10), R10
	CMPL DI, $0x08
	JGE matchlen_loopback_repeat_extend_encodeSnappyBlockAsm10B
	// NOTE(review): reached only when the JGE above is not taken (DI < 8), so
	// ZF from the same CMPL is clear and this JZ looks unreachable — confirm
	// against the generator.
	JZ repeat_extend_forward_end_encodeSnappyBlockAsm10B

matchlen_match4_repeat_extend_encodeSnappyBlockAsm10B:
	// Fewer than 8 bytes left: try 4, then 2, then 1.
	CMPL DI, $0x04
	JL matchlen_match2_repeat_extend_encodeSnappyBlockAsm10B
	MOVL (R8)(R10*1), R9
	CMPL (BX)(R10*1), R9
	JNE matchlen_match2_repeat_extend_encodeSnappyBlockAsm10B
	SUBL $0x04, DI
	LEAL 4(R10), R10

matchlen_match2_repeat_extend_encodeSnappyBlockAsm10B:
	CMPL DI, $0x02
	JL matchlen_match1_repeat_extend_encodeSnappyBlockAsm10B
	MOVW (R8)(R10*1), R9
	CMPW (BX)(R10*1), R9
	JNE matchlen_match1_repeat_extend_encodeSnappyBlockAsm10B
	SUBL $0x02, DI
	LEAL 2(R10), R10

matchlen_match1_repeat_extend_encodeSnappyBlockAsm10B:
	CMPL DI, $0x01
	JL repeat_extend_forward_end_encodeSnappyBlockAsm10B
	MOVB (R8)(R10*1), R9
	CMPB (BX)(R10*1), R9
	JNE repeat_extend_forward_end_encodeSnappyBlockAsm10B
	LEAL 1(R10), R10

repeat_extend_forward_end_encodeSnappyBlockAsm10B:
	// CX = end of match; BX = CX - SI = match length (SI is the start after
	// the backward extension); SI is then reloaded with the repeat offset.
	ADDL R10, CX
	MOVL CX, BX
	SUBL SI, BX
	MOVL 16(SP), SI

	// emitCopy
	// BX = length, SI = offset.
two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm10B:
	// While length > 64: write tag 0xee + 16-bit offset and take 60 off the
	// length.
	CMPL BX, $0x40
	JLE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm10B
	MOVB $0xee, (AX)
	MOVW SI, 1(AX)
	LEAL -60(BX), BX
	ADDQ $0x03, AX
	JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm10B

two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm10B:
	// DI = length << 2. Lengths >= 12 or offsets >= 2048 use the 3-byte form.
	MOVL BX, DI
	SHLL $0x02, DI
	CMPL BX, $0x0c
	JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm10B
	CMPL SI, $0x00000800
	JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm10B
	// 2-byte form: tag = ((length << 2) - 15) | ((offset >> 8) << 5), then
	// the low offset byte.
	LEAL -15(DI), DI
	MOVB SI, 1(AX)
	SHRL $0x08, SI
	SHLL $0x05, SI
	ORL SI, DI
	MOVB DI, (AX)
	ADDQ $0x02, AX
	JMP repeat_end_emit_encodeSnappyBlockAsm10B

emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm10B:
	// 3-byte form: tag = (length << 2) - 2, then the 16-bit offset.
	LEAL -2(DI), DI
	MOVB DI, (AX)
	MOVW SI, 1(AX)
	ADDQ $0x03, AX

repeat_end_emit_encodeSnappyBlockAsm10B:
	// Everything up to CX is emitted; resume searching there.
	MOVL CX, 12(SP)
	JMP search_loop_encodeSnappyBlockAsm10B

no_repeat_found_encodeSnappyBlockAsm10B:
	// Candidate 1: table entry for the hash at CX (BX) against bytes at CX.
	CMPL (DX)(BX*1), SI
	JEQ candidate_match_encodeSnappyBlockAsm10B
	// Candidate 2: table entry for the hash at CX+1 (DI) against bytes at
	// CX+1. BX is reloaded with the entry for the hash at CX+2; R8 = CX+2.
	SHRQ $0x08, SI
	MOVL 24(SP)(R9*4), BX
	LEAL 2(CX), R8
	CMPL (DX)(DI*1), SI
	JEQ candidate2_match_encodeSnappyBlockAsm10B
	// Candidate 3: record CX+2 in the table, then test the old entry against
	// bytes at CX+2.
	MOVL R8, 24(SP)(R9*4)
	SHRQ $0x08, SI
	CMPL (DX)(BX*1), SI
	JEQ candidate3_match_encodeSnappyBlockAsm10B
	// No match: continue at the saved next search position.
	MOVL 20(SP), CX
	JMP search_loop_encodeSnappyBlockAsm10B

candidate3_match_encodeSnappyBlockAsm10B:
	ADDL $0x02, CX
	JMP candidate_match_encodeSnappyBlockAsm10B

candidate2_match_encodeSnappyBlockAsm10B:
	MOVL R8, 24(SP)(R9*4)
	INCL CX
	MOVL DI, BX

candidate_match_encodeSnappyBlockAsm10B:
	// Match at CX against candidate BX. Extend backwards while the preceding
	// bytes match, CX stays above the pending start (SI) and BX stays above 0.
	MOVL 12(SP), SI
	TESTL BX, BX
	JZ match_extend_back_end_encodeSnappyBlockAsm10B

match_extend_back_loop_encodeSnappyBlockAsm10B:
	CMPL CX, SI
	JLE match_extend_back_end_encodeSnappyBlockAsm10B
	MOVB -1(DX)(BX*1), DI
	MOVB -1(DX)(CX*1), R8
	CMPB DI, R8
	JNE match_extend_back_end_encodeSnappyBlockAsm10B
	LEAL -1(CX), CX
	DECL BX
	JZ match_extend_back_end_encodeSnappyBlockAsm10B
	JMP match_extend_back_loop_encodeSnappyBlockAsm10B

match_extend_back_end_encodeSnappyBlockAsm10B:
	// Return 0 unless AX + pending literal length + 3 is below the output
	// limit.
	MOVL CX, SI
	SUBL 12(SP), SI
	LEAQ 3(AX)(SI*1), SI
	CMPQ SI, (SP)
	JL match_dst_size_check_encodeSnappyBlockAsm10B
	MOVQ $0x00000000, ret+48(FP)
	RET

match_dst_size_check_encodeSnappyBlockAsm10B:
	// Emit the pending bytes src[12(SP):CX] as a literal (skipped if empty).
	// R8 = length, SI = source pointer, DI = length - 1 for the tag.
	MOVL CX, SI
	MOVL 12(SP), DI
	CMPL DI, SI
	JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm10B
	MOVL SI, R8
	MOVL SI, 12(SP)
	LEAQ (DX)(DI*1), SI
	SUBL DI, R8
	LEAL -1(R8), DI
	// Header size depends on length-1: < 60 one byte, < 256 two bytes,
	// otherwise three bytes (tag 0xf4 + 16-bit length-1).
	CMPL DI, $0x3c
	JLT one_byte_match_emit_encodeSnappyBlockAsm10B
	CMPL DI, $0x00000100
	JLT two_bytes_match_emit_encodeSnappyBlockAsm10B
	MOVB $0xf4, (AX)
	MOVW DI, 1(AX)
	ADDQ $0x03, AX
	JMP memmove_long_match_emit_encodeSnappyBlockAsm10B

two_bytes_match_emit_encodeSnappyBlockAsm10B:
	// Tag 0xf0 followed by one byte holding length-1.
	MOVB $0xf0, (AX)
	MOVB DI, 1(AX)
	ADDQ $0x02, AX
	// Up to 64 bytes still fits the short copy below.
	CMPL DI, $0x40
	JL memmove_match_emit_encodeSnappyBlockAsm10B
	JMP memmove_long_match_emit_encodeSnappyBlockAsm10B

one_byte_match_emit_encodeSnappyBlockAsm10B:
	// Single tag byte: (length-1) << 2.
	SHLB $0x02, DI
	MOVB DI, (AX)
	ADDQ $0x01, AX

memmove_match_emit_encodeSnappyBlockAsm10B:
	// DI = write pointer after the literal body.
	LEAQ (AX)(R8*1), DI

	// genMemMoveShort
	// Copies R8 bytes from (SI) to (AX) using overlapping head/tail moves.
	CMPQ R8, $0x08
	JLE emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_8
	CMPQ R8, $0x10
	JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_8through16
	CMPQ R8, $0x20
	JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_17through32
	JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_33through64

emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_8:
	// Lengths <= 8 always move a full 8 bytes; AX still advances by R8 only.
	MOVQ (SI), R9
	MOVQ R9, (AX)
	JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm10B

emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_8through16:
	MOVQ (SI), R9
	MOVQ -8(SI)(R8*1), SI
	MOVQ R9, (AX)
	MOVQ SI, -8(AX)(R8*1)
	JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm10B

emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_17through32:
	MOVOU (SI), X0
	MOVOU -16(SI)(R8*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(R8*1)
	JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm10B

emit_lit_memmove_match_emit_encodeSnappyBlockAsm10B_memmove_move_33through64:
	MOVOU (SI), X0
	MOVOU 16(SI), X1
	MOVOU -32(SI)(R8*1), X2
	MOVOU -16(SI)(R8*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)

memmove_end_copy_match_emit_encodeSnappyBlockAsm10B:
	MOVQ DI, AX
	JMP emit_literal_done_match_emit_encodeSnappyBlockAsm10B

memmove_long_match_emit_encodeSnappyBlockAsm10B:
	// DI = write pointer after the literal body.
	LEAQ (AX)(R8*1), DI

	// genMemMoveLong
	// Same scheme as the long copy above: head/tail in X0-X3 stored last,
	// middle copied in 32-byte steps with aligned stores.
	MOVOU (SI), X0
	MOVOU 16(SI), X1
	MOVOU -32(SI)(R8*1), X2
	MOVOU -16(SI)(R8*1), X3
	MOVQ R8, R10
	SHRQ $0x05, R10
	MOVQ AX, R9
	ANDL $0x0000001f, R9
	MOVQ $0x00000040, R11
	SUBQ R9, R11
	DECQ R10
	JA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32
	LEAQ -32(SI)(R11*1), R9
	LEAQ -32(AX)(R11*1), R12

emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_big_loop_back:
	MOVOU (R9), X4
	MOVOU 16(R9), X5
	MOVOA X4, (R12)
	MOVOA X5, 16(R12)
	ADDQ $0x20, R12
	ADDQ $0x20, R9
	ADDQ $0x20, R11
	DECQ R10
	JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_big_loop_back

emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32:
	MOVOU -32(SI)(R11*1), X4
	MOVOU -16(SI)(R11*1), X5
	MOVOA X4, -32(AX)(R11*1)
	MOVOA X5, -16(AX)(R11*1)
	ADDQ $0x20, R11
	CMPQ R8, R11
	JAE emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)
	MOVQ DI, AX

emit_literal_done_match_emit_encodeSnappyBlockAsm10B:
match_nolit_loop_encodeSnappyBlockAsm10B:
	// Match at CX against candidate BX with no pending literal. Save the
	// offset CX - BX as the repeat offset, skip the 4 bytes already compared
	// and measure how far the match continues.
	MOVL CX, SI
	SUBL BX, SI
	MOVL SI, 16(SP)
	ADDL $0x04, CX
	ADDL $0x04, BX
	MOVQ src_len+32(FP), SI
	SUBL CX, SI
	LEAQ (DX)(CX*1), DI
	LEAQ (DX)(BX*1), BX

	// matchLen
	// R9 counts matching bytes; SI = bytes left in src.
	XORL R9, R9
	CMPL SI, $0x08
	JL matchlen_match4_match_nolit_encodeSnappyBlockAsm10B

matchlen_loopback_match_nolit_encodeSnappyBlockAsm10B:
	MOVQ (DI)(R9*1), R8
	XORQ (BX)(R9*1), R8
	TESTQ R8, R8
	JZ matchlen_loop_match_nolit_encodeSnappyBlockAsm10B

	// Index of the lowest differing bit, divided by 8, is the number of
	// equal leading bytes in this 8-byte word.
#ifdef GOAMD64_v3
	TZCNTQ R8, R8

#else
	BSFQ R8, R8

#endif
	SARQ $0x03, R8
	LEAL (R9)(R8*1), R9
	JMP match_nolit_end_encodeSnappyBlockAsm10B

matchlen_loop_match_nolit_encodeSnappyBlockAsm10B:
	LEAL -8(SI), SI
	LEAL 8(R9), R9
	CMPL SI, $0x08
	JGE matchlen_loopback_match_nolit_encodeSnappyBlockAsm10B
	// NOTE(review): reached only when the JGE above is not taken (SI < 8), so
	// ZF from the same CMPL is clear and this JZ looks unreachable — confirm
	// against the generator.
	JZ match_nolit_end_encodeSnappyBlockAsm10B

matchlen_match4_match_nolit_encodeSnappyBlockAsm10B:
	// Fewer than 8 bytes left: try 4, then 2, then 1.
	CMPL SI, $0x04
	JL matchlen_match2_match_nolit_encodeSnappyBlockAsm10B
	MOVL (DI)(R9*1), R8
	CMPL (BX)(R9*1), R8
	JNE matchlen_match2_match_nolit_encodeSnappyBlockAsm10B
	SUBL $0x04, SI
	LEAL 4(R9), R9

matchlen_match2_match_nolit_encodeSnappyBlockAsm10B:
	CMPL SI, $0x02
	JL matchlen_match1_match_nolit_encodeSnappyBlockAsm10B
	MOVW (DI)(R9*1), R8
	CMPW (BX)(R9*1), R8
	JNE matchlen_match1_match_nolit_encodeSnappyBlockAsm10B
	SUBL $0x02, SI
	LEAL 2(R9), R9

matchlen_match1_match_nolit_encodeSnappyBlockAsm10B:
	CMPL SI, $0x01
	JL match_nolit_end_encodeSnappyBlockAsm10B
	MOVB (DI)(R9*1), R8
	CMPB (BX)(R9*1), R8
	JNE match_nolit_end_encodeSnappyBlockAsm10B
	LEAL 1(R9), R9

match_nolit_end_encodeSnappyBlockAsm10B:
	// CX = end of match, BX = offset, R9 = total length (the extension plus
	// the 4 bytes skipped above); everything up to CX counts as emitted.
	ADDL R9, CX
	MOVL 16(SP), BX
	ADDL $0x04, R9
	MOVL CX, 12(SP)

	// emitCopy
	// R9 = length, BX = offset.
two_byte_offset_match_nolit_encodeSnappyBlockAsm10B:
	// While length > 64: write tag 0xee + 16-bit offset and take 60 off the
	// length.
	CMPL R9, $0x40
	JLE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm10B
	MOVB $0xee, (AX)
	MOVW BX, 1(AX)
	LEAL -60(R9), R9
	ADDQ $0x03, AX
	JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm10B

two_byte_offset_short_match_nolit_encodeSnappyBlockAsm10B:
	// SI = length << 2. Lengths >= 12 or offsets >= 2048 use the 3-byte form.
	MOVL R9, SI
	SHLL $0x02, SI
	CMPL R9, $0x0c
	JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm10B
	CMPL BX, $0x00000800
	JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm10B
	// 2-byte form: tag = ((length << 2) - 15) | ((offset >> 8) << 5), then
	// the low offset byte.
	LEAL -15(SI), SI
	MOVB BL, 1(AX)
	SHRL $0x08, BX
	SHLL $0x05, BX
	ORL BX, SI
	MOVB SI, (AX)
	ADDQ $0x02, AX
	JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm10B

emit_copy_three_match_nolit_encodeSnappyBlockAsm10B:
	// 3-byte form: tag = (length << 2) - 2, then the 16-bit offset.
	LEAL -2(SI), SI
	MOVB SI, (AX)
	MOVW BX, 1(AX)
	ADDQ $0x03, AX

match_nolit_emitcopy_end_encodeSnappyBlockAsm10B:
	// Past the search limit: flush what is left. Otherwise load the 8 bytes
	// at CX-2 and return 0 if the write pointer reached the output limit.
	CMPL CX, 8(SP)
	JGE emit_remainder_encodeSnappyBlockAsm10B
	MOVQ -2(DX)(CX*1), SI
	CMPQ AX, (SP)
	JL match_nolit_dst_ok_encodeSnappyBlockAsm10B
	MOVQ $0x00000000, ret+48(FP)
	RET

match_nolit_dst_ok_encodeSnappyBlockAsm10B:
	// DI = hash of the 4 bytes at CX-2, BX = hash of the 4 bytes at CX
	// (SI is shifted down 16 bits so its low 4 bytes are the bytes at CX).
	MOVQ $0x9e3779b1, R8
	MOVQ SI, DI
	SHRQ $0x10, SI
	MOVQ SI, BX
	SHLQ $0x20, DI
	IMULQ R8, DI
	SHRQ $0x36, DI
	SHLQ $0x20, BX
	IMULQ R8, BX
	SHRQ $0x36, BX
	// Record CX-2 and CX in the table; BX becomes the previous entry for the
	// hash at CX.
	LEAL -2(CX), R8
	LEAQ 24(SP)(BX*4), R9
	MOVL (R9), BX
	MOVL R8, 24(SP)(DI*4)
	MOVL CX, (R9)
	// If that entry's 4 bytes equal the bytes at CX, encode another match
	// immediately; otherwise step one byte and search again.
	CMPL (DX)(BX*1), SI
	JEQ match_nolit_loop_encodeSnappyBlockAsm10B
	INCL CX
	JMP search_loop_encodeSnappyBlockAsm10B

emit_remainder_encodeSnappyBlockAsm10B:
	// Return 0 unless AX + remaining length + 3 is below the output limit.
	MOVQ src_len+32(FP), CX
	SUBL 12(SP), CX
	LEAQ 3(AX)(CX*1), CX
	CMPQ CX, (SP)
	JL emit_remainder_ok_encodeSnappyBlockAsm10B
	MOVQ $0x00000000, ret+48(FP)
	RET

emit_remainder_ok_encodeSnappyBlockAsm10B:
	// Emit src[12(SP):src_len] as a final literal (skipped if empty).
	// From here CX is the source pointer, SI the length and DX is reused,
	// first for length - 1 and then for the end-of-output pointer.
	MOVQ src_len+32(FP), CX
	MOVL 12(SP), BX
	CMPL BX, CX
	JEQ emit_literal_done_emit_remainder_encodeSnappyBlockAsm10B
	MOVL CX, SI
	MOVL CX, 12(SP)
	LEAQ (DX)(BX*1), CX
	SUBL BX, SI
	LEAL -1(SI), DX
	// Header size depends on length-1: < 60 one byte, < 256 two bytes,
	// otherwise three bytes (tag 0xf4 + 16-bit length-1).
	CMPL DX, $0x3c
	JLT one_byte_emit_remainder_encodeSnappyBlockAsm10B
	CMPL DX, $0x00000100
	JLT two_bytes_emit_remainder_encodeSnappyBlockAsm10B
	MOVB $0xf4, (AX)
	MOVW DX, 1(AX)
	ADDQ $0x03, AX
	JMP memmove_long_emit_remainder_encodeSnappyBlockAsm10B

two_bytes_emit_remainder_encodeSnappyBlockAsm10B:
	// Tag 0xf0 followed by one byte holding length-1.
	MOVB $0xf0, (AX)
	MOVB DL, 1(AX)
	ADDQ $0x02, AX
	// Up to 64 bytes still fits the short copy below.
	CMPL DX, $0x40
	JL memmove_emit_remainder_encodeSnappyBlockAsm10B
	JMP memmove_long_emit_remainder_encodeSnappyBlockAsm10B

one_byte_emit_remainder_encodeSnappyBlockAsm10B:
	// Single tag byte: (length-1) << 2.
	SHLB $0x02, DL
	MOVB DL, (AX)
	ADDQ $0x01, AX

memmove_emit_remainder_encodeSnappyBlockAsm10B:
	// DX = write pointer after the literal body, BX = length.
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveShort
	// Unlike the in-loop copies, this variant has exact-size cases for
	// 1-2, 3 and 4-7 bytes, so it never moves more than BX bytes.
	CMPQ BX, $0x03
	JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_1or2
	JE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_3
	CMPQ BX, $0x08
	JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_4through7
	CMPQ BX, $0x10
	JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_8through16
	CMPQ BX, $0x20
	JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_17through32
	JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_33through64

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_1or2:
	MOVB (CX), SI
	MOVB -1(CX)(BX*1), CL
	MOVB SI, (AX)
	MOVB CL, -1(AX)(BX*1)
	JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_3:
	MOVW (CX), SI
	MOVB 2(CX), CL
	MOVW SI, (AX)
	MOVB CL, 2(AX)
	JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_4through7:
	MOVL (CX), SI
	MOVL -4(CX)(BX*1), CX
	MOVL SI, (AX)
	MOVL CX, -4(AX)(BX*1)
	JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_8through16:
	MOVQ (CX), SI
	MOVQ -8(CX)(BX*1), CX
	MOVQ SI, (AX)
	MOVQ CX, -8(AX)(BX*1)
	JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_17through32:
	MOVOU (CX), X0
	MOVOU -16(CX)(BX*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(BX*1)
	JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B

emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm10B_memmove_move_33through64:
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)

memmove_end_copy_emit_remainder_encodeSnappyBlockAsm10B:
	MOVQ DX, AX
	JMP emit_literal_done_emit_remainder_encodeSnappyBlockAsm10B

memmove_long_emit_remainder_encodeSnappyBlockAsm10B:
	// DX = write pointer after the literal body, BX = length.
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveLong
	// Same scheme as the long copies above: head/tail in X0-X3 stored last,
	// middle copied in 32-byte steps with aligned stores.
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	MOVQ BX, DI
	SHRQ $0x05, DI
	MOVQ AX, SI
	ANDL $0x0000001f, SI
	MOVQ $0x00000040, R8
	SUBQ SI, R8
	DECQ DI
	JA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32
	LEAQ -32(CX)(R8*1), SI
	LEAQ -32(AX)(R8*1), R9

emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_big_loop_back:
	MOVOU (SI), X4
	MOVOU 16(SI), X5
	MOVOA X4, (R9)
	MOVOA X5, 16(R9)
	ADDQ $0x20, R9
	ADDQ $0x20, SI
	ADDQ $0x20, R8
	DECQ DI
	JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_big_loop_back

emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32:
	MOVOU -32(CX)(R8*1), X4
	MOVOU -16(CX)(R8*1), X5
	MOVOA X4, -32(AX)(R8*1)
	MOVOA X5, -16(AX)(R8*1)
	ADDQ $0x20, R8
	CMPQ BX, R8
	JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm10Blarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)
	MOVQ DX, AX

emit_literal_done_emit_remainder_encodeSnappyBlockAsm10B:
	// Return the number of bytes written: write pointer - dst_base.
	MOVQ dst_base+0(FP), CX
	SUBQ CX, AX
	MOVQ AX, ret+48(FP)
	RET

// func encodeSnappyBlockAsm8B(dst []byte, src []byte) int
// Requires: BMI, SSE2
TEXT ·encodeSnappyBlockAsm8B(SB), $1048-56
	MOVQ dst_base+0(FP), AX
	MOVQ $0x00000008, CX
	LEAQ 24(SP), DX
	PXOR X0, X0

zero_loop_encodeSnappyBlockAsm8B:
	MOVOU X0, (DX)
	MOVOU X0, 16(DX)
	MOVOU X0, 32(DX)
	MOVOU X0, 48(DX)
	MOVOU X0, 64(DX)
	MOVOU X0, 80(DX)
	MOVOU X0, 96(DX)
	MOVOU X0, 112(DX)
	ADDQ $0x80, DX
	DECQ CX
	JNZ zero_loop_encodeSnappyBlockAsm8B
	MOVL $0x00000000, 12(SP)
	MOVQ src_len+32(FP), CX
	LEAQ -9(CX), DX
	LEAQ -8(CX), BX
	MOVL BX, 8(SP)
	SHRQ $0x05, CX
	SUBL CX, DX
	LEAQ (AX)(DX*1), DX
	MOVQ DX, (SP)
	MOVL $0x00000001, CX
	MOVL CX, 16(SP)
	MOVQ src_base+24(FP), DX

search_loop_encodeSnappyBlockAsm8B:
	MOVL CX, BX
	SUBL 12(SP), BX
	SHRL
$0x04, BX LEAL 4(CX)(BX*1), BX CMPL BX, 8(SP) JGE emit_remainder_encodeSnappyBlockAsm8B MOVQ (DX)(CX*1), SI MOVL BX, 20(SP) MOVQ $0x9e3779b1, R8 MOVQ SI, R9 MOVQ SI, R10 SHRQ $0x08, R10 SHLQ $0x20, R9 IMULQ R8, R9 SHRQ $0x38, R9 SHLQ $0x20, R10 IMULQ R8, R10 SHRQ $0x38, R10 MOVL 24(SP)(R9*4), BX MOVL 24(SP)(R10*4), DI MOVL CX, 24(SP)(R9*4) LEAL 1(CX), R9 MOVL R9, 24(SP)(R10*4) MOVQ SI, R9 SHRQ $0x10, R9 SHLQ $0x20, R9 IMULQ R8, R9 SHRQ $0x38, R9 MOVL CX, R8 SUBL 16(SP), R8 MOVL 1(DX)(R8*1), R10 MOVQ SI, R8 SHRQ $0x08, R8 CMPL R8, R10 JNE no_repeat_found_encodeSnappyBlockAsm8B LEAL 1(CX), SI MOVL 12(SP), BX MOVL SI, DI SUBL 16(SP), DI JZ repeat_extend_back_end_encodeSnappyBlockAsm8B repeat_extend_back_loop_encodeSnappyBlockAsm8B: CMPL SI, BX JLE repeat_extend_back_end_encodeSnappyBlockAsm8B MOVB -1(DX)(DI*1), R8 MOVB -1(DX)(SI*1), R9 CMPB R8, R9 JNE repeat_extend_back_end_encodeSnappyBlockAsm8B LEAL -1(SI), SI DECL DI JNZ repeat_extend_back_loop_encodeSnappyBlockAsm8B repeat_extend_back_end_encodeSnappyBlockAsm8B: MOVL 12(SP), BX CMPL BX, SI JEQ emit_literal_done_repeat_emit_encodeSnappyBlockAsm8B MOVL SI, DI MOVL SI, 12(SP) LEAQ (DX)(BX*1), R8 SUBL BX, DI LEAL -1(DI), BX CMPL BX, $0x3c JLT one_byte_repeat_emit_encodeSnappyBlockAsm8B CMPL BX, $0x00000100 JLT two_bytes_repeat_emit_encodeSnappyBlockAsm8B MOVB $0xf4, (AX) MOVW BX, 1(AX) ADDQ $0x03, AX JMP memmove_long_repeat_emit_encodeSnappyBlockAsm8B two_bytes_repeat_emit_encodeSnappyBlockAsm8B: MOVB $0xf0, (AX) MOVB BL, 1(AX) ADDQ $0x02, AX CMPL BX, $0x40 JL memmove_repeat_emit_encodeSnappyBlockAsm8B JMP memmove_long_repeat_emit_encodeSnappyBlockAsm8B one_byte_repeat_emit_encodeSnappyBlockAsm8B: SHLB $0x02, BL MOVB BL, (AX) ADDQ $0x01, AX memmove_repeat_emit_encodeSnappyBlockAsm8B: LEAQ (AX)(DI*1), BX // genMemMoveShort CMPQ DI, $0x08 JLE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_8 CMPQ DI, $0x10 JBE emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_8through16 CMPQ DI, $0x20 JBE 
emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_17through32 JMP emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_33through64 emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_8: MOVQ (R8), R9 MOVQ R9, (AX) JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_8through16: MOVQ (R8), R9 MOVQ -8(R8)(DI*1), R8 MOVQ R9, (AX) MOVQ R8, -8(AX)(DI*1) JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_17through32: MOVOU (R8), X0 MOVOU -16(R8)(DI*1), X1 MOVOU X0, (AX) MOVOU X1, -16(AX)(DI*1) JMP memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B emit_lit_memmove_repeat_emit_encodeSnappyBlockAsm8B_memmove_move_33through64: MOVOU (R8), X0 MOVOU 16(R8), X1 MOVOU -32(R8)(DI*1), X2 MOVOU -16(R8)(DI*1), X3 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(DI*1) MOVOU X3, -16(AX)(DI*1) memmove_end_copy_repeat_emit_encodeSnappyBlockAsm8B: MOVQ BX, AX JMP emit_literal_done_repeat_emit_encodeSnappyBlockAsm8B memmove_long_repeat_emit_encodeSnappyBlockAsm8B: LEAQ (AX)(DI*1), BX // genMemMoveLong MOVOU (R8), X0 MOVOU 16(R8), X1 MOVOU -32(R8)(DI*1), X2 MOVOU -16(R8)(DI*1), X3 MOVQ DI, R10 SHRQ $0x05, R10 MOVQ AX, R9 ANDL $0x0000001f, R9 MOVQ $0x00000040, R11 SUBQ R9, R11 DECQ R10 JA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32 LEAQ -32(R8)(R11*1), R9 LEAQ -32(AX)(R11*1), R12 emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_big_loop_back: MOVOU (R9), X4 MOVOU 16(R9), X5 MOVOA X4, (R12) MOVOA X5, 16(R12) ADDQ $0x20, R12 ADDQ $0x20, R9 ADDQ $0x20, R11 DECQ R10 JNA emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_big_loop_back emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32: MOVOU -32(R8)(R11*1), X4 MOVOU -16(R8)(R11*1), X5 MOVOA X4, -32(AX)(R11*1) MOVOA X5, -16(AX)(R11*1) ADDQ $0x20, R11 CMPQ DI, R11 JAE 
emit_lit_memmove_long_repeat_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(DI*1) MOVOU X3, -16(AX)(DI*1) MOVQ BX, AX emit_literal_done_repeat_emit_encodeSnappyBlockAsm8B: ADDL $0x05, CX MOVL CX, BX SUBL 16(SP), BX MOVQ src_len+32(FP), DI SUBL CX, DI LEAQ (DX)(CX*1), R8 LEAQ (DX)(BX*1), BX // matchLen XORL R10, R10 CMPL DI, $0x08 JL matchlen_match4_repeat_extend_encodeSnappyBlockAsm8B matchlen_loopback_repeat_extend_encodeSnappyBlockAsm8B: MOVQ (R8)(R10*1), R9 XORQ (BX)(R10*1), R9 TESTQ R9, R9 JZ matchlen_loop_repeat_extend_encodeSnappyBlockAsm8B #ifdef GOAMD64_v3 TZCNTQ R9, R9 #else BSFQ R9, R9 #endif SARQ $0x03, R9 LEAL (R10)(R9*1), R10 JMP repeat_extend_forward_end_encodeSnappyBlockAsm8B matchlen_loop_repeat_extend_encodeSnappyBlockAsm8B: LEAL -8(DI), DI LEAL 8(R10), R10 CMPL DI, $0x08 JGE matchlen_loopback_repeat_extend_encodeSnappyBlockAsm8B JZ repeat_extend_forward_end_encodeSnappyBlockAsm8B matchlen_match4_repeat_extend_encodeSnappyBlockAsm8B: CMPL DI, $0x04 JL matchlen_match2_repeat_extend_encodeSnappyBlockAsm8B MOVL (R8)(R10*1), R9 CMPL (BX)(R10*1), R9 JNE matchlen_match2_repeat_extend_encodeSnappyBlockAsm8B SUBL $0x04, DI LEAL 4(R10), R10 matchlen_match2_repeat_extend_encodeSnappyBlockAsm8B: CMPL DI, $0x02 JL matchlen_match1_repeat_extend_encodeSnappyBlockAsm8B MOVW (R8)(R10*1), R9 CMPW (BX)(R10*1), R9 JNE matchlen_match1_repeat_extend_encodeSnappyBlockAsm8B SUBL $0x02, DI LEAL 2(R10), R10 matchlen_match1_repeat_extend_encodeSnappyBlockAsm8B: CMPL DI, $0x01 JL repeat_extend_forward_end_encodeSnappyBlockAsm8B MOVB (R8)(R10*1), R9 CMPB (BX)(R10*1), R9 JNE repeat_extend_forward_end_encodeSnappyBlockAsm8B LEAL 1(R10), R10 repeat_extend_forward_end_encodeSnappyBlockAsm8B: ADDL R10, CX MOVL CX, BX SUBL SI, BX MOVL 16(SP), SI // emitCopy two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm8B: CMPL BX, $0x40 JLE two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm8B MOVB $0xee, (AX) MOVW SI, 1(AX) LEAL 
-60(BX), BX ADDQ $0x03, AX JMP two_byte_offset_repeat_as_copy_encodeSnappyBlockAsm8B two_byte_offset_short_repeat_as_copy_encodeSnappyBlockAsm8B: MOVL BX, DI SHLL $0x02, DI CMPL BX, $0x0c JGE emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm8B LEAL -15(DI), DI MOVB SI, 1(AX) SHRL $0x08, SI SHLL $0x05, SI ORL SI, DI MOVB DI, (AX) ADDQ $0x02, AX JMP repeat_end_emit_encodeSnappyBlockAsm8B emit_copy_three_repeat_as_copy_encodeSnappyBlockAsm8B: LEAL -2(DI), DI MOVB DI, (AX) MOVW SI, 1(AX) ADDQ $0x03, AX repeat_end_emit_encodeSnappyBlockAsm8B: MOVL CX, 12(SP) JMP search_loop_encodeSnappyBlockAsm8B no_repeat_found_encodeSnappyBlockAsm8B: CMPL (DX)(BX*1), SI JEQ candidate_match_encodeSnappyBlockAsm8B SHRQ $0x08, SI MOVL 24(SP)(R9*4), BX LEAL 2(CX), R8 CMPL (DX)(DI*1), SI JEQ candidate2_match_encodeSnappyBlockAsm8B MOVL R8, 24(SP)(R9*4) SHRQ $0x08, SI CMPL (DX)(BX*1), SI JEQ candidate3_match_encodeSnappyBlockAsm8B MOVL 20(SP), CX JMP search_loop_encodeSnappyBlockAsm8B candidate3_match_encodeSnappyBlockAsm8B: ADDL $0x02, CX JMP candidate_match_encodeSnappyBlockAsm8B candidate2_match_encodeSnappyBlockAsm8B: MOVL R8, 24(SP)(R9*4) INCL CX MOVL DI, BX candidate_match_encodeSnappyBlockAsm8B: MOVL 12(SP), SI TESTL BX, BX JZ match_extend_back_end_encodeSnappyBlockAsm8B match_extend_back_loop_encodeSnappyBlockAsm8B: CMPL CX, SI JLE match_extend_back_end_encodeSnappyBlockAsm8B MOVB -1(DX)(BX*1), DI MOVB -1(DX)(CX*1), R8 CMPB DI, R8 JNE match_extend_back_end_encodeSnappyBlockAsm8B LEAL -1(CX), CX DECL BX JZ match_extend_back_end_encodeSnappyBlockAsm8B JMP match_extend_back_loop_encodeSnappyBlockAsm8B match_extend_back_end_encodeSnappyBlockAsm8B: MOVL CX, SI SUBL 12(SP), SI LEAQ 3(AX)(SI*1), SI CMPQ SI, (SP) JL match_dst_size_check_encodeSnappyBlockAsm8B MOVQ $0x00000000, ret+48(FP) RET match_dst_size_check_encodeSnappyBlockAsm8B: MOVL CX, SI MOVL 12(SP), DI CMPL DI, SI JEQ emit_literal_done_match_emit_encodeSnappyBlockAsm8B MOVL SI, R8 MOVL SI, 12(SP) LEAQ (DX)(DI*1), SI SUBL DI, R8 
LEAL -1(R8), DI CMPL DI, $0x3c JLT one_byte_match_emit_encodeSnappyBlockAsm8B CMPL DI, $0x00000100 JLT two_bytes_match_emit_encodeSnappyBlockAsm8B MOVB $0xf4, (AX) MOVW DI, 1(AX) ADDQ $0x03, AX JMP memmove_long_match_emit_encodeSnappyBlockAsm8B two_bytes_match_emit_encodeSnappyBlockAsm8B: MOVB $0xf0, (AX) MOVB DI, 1(AX) ADDQ $0x02, AX CMPL DI, $0x40 JL memmove_match_emit_encodeSnappyBlockAsm8B JMP memmove_long_match_emit_encodeSnappyBlockAsm8B one_byte_match_emit_encodeSnappyBlockAsm8B: SHLB $0x02, DI MOVB DI, (AX) ADDQ $0x01, AX memmove_match_emit_encodeSnappyBlockAsm8B: LEAQ (AX)(R8*1), DI // genMemMoveShort CMPQ R8, $0x08 JLE emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_8 CMPQ R8, $0x10 JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_8through16 CMPQ R8, $0x20 JBE emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_17through32 JMP emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_33through64 emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_8: MOVQ (SI), R9 MOVQ R9, (AX) JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm8B emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_8through16: MOVQ (SI), R9 MOVQ -8(SI)(R8*1), SI MOVQ R9, (AX) MOVQ SI, -8(AX)(R8*1) JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm8B emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_17through32: MOVOU (SI), X0 MOVOU -16(SI)(R8*1), X1 MOVOU X0, (AX) MOVOU X1, -16(AX)(R8*1) JMP memmove_end_copy_match_emit_encodeSnappyBlockAsm8B emit_lit_memmove_match_emit_encodeSnappyBlockAsm8B_memmove_move_33through64: MOVOU (SI), X0 MOVOU 16(SI), X1 MOVOU -32(SI)(R8*1), X2 MOVOU -16(SI)(R8*1), X3 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(R8*1) MOVOU X3, -16(AX)(R8*1) memmove_end_copy_match_emit_encodeSnappyBlockAsm8B: MOVQ DI, AX JMP emit_literal_done_match_emit_encodeSnappyBlockAsm8B memmove_long_match_emit_encodeSnappyBlockAsm8B: LEAQ (AX)(R8*1), DI // genMemMoveLong MOVOU (SI), X0 MOVOU 
16(SI), X1 MOVOU -32(SI)(R8*1), X2 MOVOU -16(SI)(R8*1), X3 MOVQ R8, R10 SHRQ $0x05, R10 MOVQ AX, R9 ANDL $0x0000001f, R9 MOVQ $0x00000040, R11 SUBQ R9, R11 DECQ R10 JA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32 LEAQ -32(SI)(R11*1), R9 LEAQ -32(AX)(R11*1), R12 emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_big_loop_back: MOVOU (R9), X4 MOVOU 16(R9), X5 MOVOA X4, (R12) MOVOA X5, 16(R12) ADDQ $0x20, R12 ADDQ $0x20, R9 ADDQ $0x20, R11 DECQ R10 JNA emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_big_loop_back emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32: MOVOU -32(SI)(R11*1), X4 MOVOU -16(SI)(R11*1), X5 MOVOA X4, -32(AX)(R11*1) MOVOA X5, -16(AX)(R11*1) ADDQ $0x20, R11 CMPQ R8, R11 JAE emit_lit_memmove_long_match_emit_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(R8*1) MOVOU X3, -16(AX)(R8*1) MOVQ DI, AX emit_literal_done_match_emit_encodeSnappyBlockAsm8B: match_nolit_loop_encodeSnappyBlockAsm8B: MOVL CX, SI SUBL BX, SI MOVL SI, 16(SP) ADDL $0x04, CX ADDL $0x04, BX MOVQ src_len+32(FP), SI SUBL CX, SI LEAQ (DX)(CX*1), DI LEAQ (DX)(BX*1), BX // matchLen XORL R9, R9 CMPL SI, $0x08 JL matchlen_match4_match_nolit_encodeSnappyBlockAsm8B matchlen_loopback_match_nolit_encodeSnappyBlockAsm8B: MOVQ (DI)(R9*1), R8 XORQ (BX)(R9*1), R8 TESTQ R8, R8 JZ matchlen_loop_match_nolit_encodeSnappyBlockAsm8B #ifdef GOAMD64_v3 TZCNTQ R8, R8 #else BSFQ R8, R8 #endif SARQ $0x03, R8 LEAL (R9)(R8*1), R9 JMP match_nolit_end_encodeSnappyBlockAsm8B matchlen_loop_match_nolit_encodeSnappyBlockAsm8B: LEAL -8(SI), SI LEAL 8(R9), R9 CMPL SI, $0x08 JGE matchlen_loopback_match_nolit_encodeSnappyBlockAsm8B JZ match_nolit_end_encodeSnappyBlockAsm8B matchlen_match4_match_nolit_encodeSnappyBlockAsm8B: CMPL SI, $0x04 JL matchlen_match2_match_nolit_encodeSnappyBlockAsm8B MOVL (DI)(R9*1), R8 CMPL (BX)(R9*1), R8 JNE 
matchlen_match2_match_nolit_encodeSnappyBlockAsm8B SUBL $0x04, SI LEAL 4(R9), R9 matchlen_match2_match_nolit_encodeSnappyBlockAsm8B: CMPL SI, $0x02 JL matchlen_match1_match_nolit_encodeSnappyBlockAsm8B MOVW (DI)(R9*1), R8 CMPW (BX)(R9*1), R8 JNE matchlen_match1_match_nolit_encodeSnappyBlockAsm8B SUBL $0x02, SI LEAL 2(R9), R9 matchlen_match1_match_nolit_encodeSnappyBlockAsm8B: CMPL SI, $0x01 JL match_nolit_end_encodeSnappyBlockAsm8B MOVB (DI)(R9*1), R8 CMPB (BX)(R9*1), R8 JNE match_nolit_end_encodeSnappyBlockAsm8B LEAL 1(R9), R9 match_nolit_end_encodeSnappyBlockAsm8B: ADDL R9, CX MOVL 16(SP), BX ADDL $0x04, R9 MOVL CX, 12(SP) // emitCopy two_byte_offset_match_nolit_encodeSnappyBlockAsm8B: CMPL R9, $0x40 JLE two_byte_offset_short_match_nolit_encodeSnappyBlockAsm8B MOVB $0xee, (AX) MOVW BX, 1(AX) LEAL -60(R9), R9 ADDQ $0x03, AX JMP two_byte_offset_match_nolit_encodeSnappyBlockAsm8B two_byte_offset_short_match_nolit_encodeSnappyBlockAsm8B: MOVL R9, SI SHLL $0x02, SI CMPL R9, $0x0c JGE emit_copy_three_match_nolit_encodeSnappyBlockAsm8B LEAL -15(SI), SI MOVB BL, 1(AX) SHRL $0x08, BX SHLL $0x05, BX ORL BX, SI MOVB SI, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeSnappyBlockAsm8B emit_copy_three_match_nolit_encodeSnappyBlockAsm8B: LEAL -2(SI), SI MOVB SI, (AX) MOVW BX, 1(AX) ADDQ $0x03, AX match_nolit_emitcopy_end_encodeSnappyBlockAsm8B: CMPL CX, 8(SP) JGE emit_remainder_encodeSnappyBlockAsm8B MOVQ -2(DX)(CX*1), SI CMPQ AX, (SP) JL match_nolit_dst_ok_encodeSnappyBlockAsm8B MOVQ $0x00000000, ret+48(FP) RET match_nolit_dst_ok_encodeSnappyBlockAsm8B: MOVQ $0x9e3779b1, R8 MOVQ SI, DI SHRQ $0x10, SI MOVQ SI, BX SHLQ $0x20, DI IMULQ R8, DI SHRQ $0x38, DI SHLQ $0x20, BX IMULQ R8, BX SHRQ $0x38, BX LEAL -2(CX), R8 LEAQ 24(SP)(BX*4), R9 MOVL (R9), BX MOVL R8, 24(SP)(DI*4) MOVL CX, (R9) CMPL (DX)(BX*1), SI JEQ match_nolit_loop_encodeSnappyBlockAsm8B INCL CX JMP search_loop_encodeSnappyBlockAsm8B emit_remainder_encodeSnappyBlockAsm8B: MOVQ src_len+32(FP), CX SUBL 12(SP), CX 
LEAQ 3(AX)(CX*1), CX CMPQ CX, (SP) JL emit_remainder_ok_encodeSnappyBlockAsm8B MOVQ $0x00000000, ret+48(FP) RET emit_remainder_ok_encodeSnappyBlockAsm8B: MOVQ src_len+32(FP), CX MOVL 12(SP), BX CMPL BX, CX JEQ emit_literal_done_emit_remainder_encodeSnappyBlockAsm8B MOVL CX, SI MOVL CX, 12(SP) LEAQ (DX)(BX*1), CX SUBL BX, SI LEAL -1(SI), DX CMPL DX, $0x3c JLT one_byte_emit_remainder_encodeSnappyBlockAsm8B CMPL DX, $0x00000100 JLT two_bytes_emit_remainder_encodeSnappyBlockAsm8B MOVB $0xf4, (AX) MOVW DX, 1(AX) ADDQ $0x03, AX JMP memmove_long_emit_remainder_encodeSnappyBlockAsm8B two_bytes_emit_remainder_encodeSnappyBlockAsm8B: MOVB $0xf0, (AX) MOVB DL, 1(AX) ADDQ $0x02, AX CMPL DX, $0x40 JL memmove_emit_remainder_encodeSnappyBlockAsm8B JMP memmove_long_emit_remainder_encodeSnappyBlockAsm8B one_byte_emit_remainder_encodeSnappyBlockAsm8B: SHLB $0x02, DL MOVB DL, (AX) ADDQ $0x01, AX memmove_emit_remainder_encodeSnappyBlockAsm8B: LEAQ (AX)(SI*1), DX MOVL SI, BX // genMemMoveShort CMPQ BX, $0x03 JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_1or2 JE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_3 CMPQ BX, $0x08 JB emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_4through7 CMPQ BX, $0x10 JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_8through16 CMPQ BX, $0x20 JBE emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_17through32 JMP emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_33through64 emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_1or2: MOVB (CX), SI MOVB -1(CX)(BX*1), CL MOVB SI, (AX) MOVB CL, -1(AX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_3: MOVW (CX), SI MOVB 2(CX), CL MOVW SI, (AX) MOVB CL, 2(AX) JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B 
emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_4through7: MOVL (CX), SI MOVL -4(CX)(BX*1), CX MOVL SI, (AX) MOVL CX, -4(AX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_8through16: MOVQ (CX), SI MOVQ -8(CX)(BX*1), CX MOVQ SI, (AX) MOVQ CX, -8(AX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_17through32: MOVOU (CX), X0 MOVOU -16(CX)(BX*1), X1 MOVOU X0, (AX) MOVOU X1, -16(AX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B emit_lit_memmove_emit_remainder_encodeSnappyBlockAsm8B_memmove_move_33through64: MOVOU (CX), X0 MOVOU 16(CX), X1 MOVOU -32(CX)(BX*1), X2 MOVOU -16(CX)(BX*1), X3 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(BX*1) MOVOU X3, -16(AX)(BX*1) memmove_end_copy_emit_remainder_encodeSnappyBlockAsm8B: MOVQ DX, AX JMP emit_literal_done_emit_remainder_encodeSnappyBlockAsm8B memmove_long_emit_remainder_encodeSnappyBlockAsm8B: LEAQ (AX)(SI*1), DX MOVL SI, BX // genMemMoveLong MOVOU (CX), X0 MOVOU 16(CX), X1 MOVOU -32(CX)(BX*1), X2 MOVOU -16(CX)(BX*1), X3 MOVQ BX, DI SHRQ $0x05, DI MOVQ AX, SI ANDL $0x0000001f, SI MOVQ $0x00000040, R8 SUBQ SI, R8 DECQ DI JA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32 LEAQ -32(CX)(R8*1), SI LEAQ -32(AX)(R8*1), R9 emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_big_loop_back: MOVOU (SI), X4 MOVOU 16(SI), X5 MOVOA X4, (R9) MOVOA X5, 16(R9) ADDQ $0x20, R9 ADDQ $0x20, SI ADDQ $0x20, R8 DECQ DI JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_big_loop_back emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32: MOVOU -32(CX)(R8*1), X4 MOVOU -16(CX)(R8*1), X5 MOVOA X4, -32(AX)(R8*1) MOVOA X5, -16(AX)(R8*1) ADDQ $0x20, R8 CMPQ BX, R8 JAE 
emit_lit_memmove_long_emit_remainder_encodeSnappyBlockAsm8Blarge_forward_sse_loop_32
	// Tail of encodeSnappyBlockAsm8B: store the first and last 32 bytes that
	// were preloaded into X0..X3, then advance the output pointer to DX.
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)
	MOVQ DX, AX

emit_literal_done_emit_remainder_encodeSnappyBlockAsm8B:
	// Return the number of bytes written: output pointer minus dst base.
	MOVQ dst_base+0(FP), CX
	SUBQ CX, AX
	MOVQ AX, ret+48(FP)
	RET

// func encodeSnappyBetterBlockAsm(dst []byte, src []byte) int
// Requires: BMI, SSE2
//
// Encodes src into dst and returns the number of bytes written to dst, or 0
// when the output would reach the limit computed in the prologue
// (dst_base + src_len - 9 - src_len/32).
//
// Stack frame layout, as used below:
//   (SP)        output limit pointer (8 bytes)
//   8(SP)       src_len - 8; the search stops once the next position reaches it
//   12(SP)      source offset up to which output has already been emitted
//   16(SP)      offset of the last accepted match (written here, not read in this function)
//   20(SP)      next search position to use when the current one yields no match
//   24(SP)      table indexed by a 17-bit hash of 7 input bytes (131072 x 4 bytes)
//   524312(SP)  table indexed by a 14-bit hash of 4 input bytes (16384 x 4 bytes)
//
// Register roles in the main loop: AX = output pointer, DX = src base,
// CX = current source offset.
TEXT ·encodeSnappyBetterBlockAsm(SB), $589848-56
	MOVQ dst_base+0(FP), AX

	// Zero both tables: 0x1200 iterations x 128 bytes = 589824 bytes from 24(SP).
	MOVQ $0x00001200, CX
	LEAQ 24(SP), DX
	PXOR X0, X0

zero_loop_encodeSnappyBetterBlockAsm:
	MOVOU X0, (DX)
	MOVOU X0, 16(DX)
	MOVOU X0, 32(DX)
	MOVOU X0, 48(DX)
	MOVOU X0, 64(DX)
	MOVOU X0, 80(DX)
	MOVOU X0, 96(DX)
	MOVOU X0, 112(DX)
	ADDQ $0x80, DX
	DECQ CX
	JNZ zero_loop_encodeSnappyBetterBlockAsm

	// Nothing emitted yet.
	MOVL $0x00000000, 12(SP)

	// 8(SP) = src_len - 8; (SP) = dst + (src_len - 9 - src_len>>5).
	MOVQ src_len+32(FP), CX
	LEAQ -9(CX), DX
	LEAQ -8(CX), BX
	MOVL BX, 8(SP)
	SHRQ $0x05, CX
	SUBL CX, DX
	LEAQ (AX)(DX*1), DX
	MOVQ DX, (SP)

	// Start searching at source offset 1; last offset = 0; DX = src base.
	MOVL $0x00000001, CX
	MOVL $0x00000000, 16(SP)
	MOVQ src_base+24(FP), DX

search_loop_encodeSnappyBetterBlockAsm:
	// Compute the next position: step = 1 + ((CX - 12(SP)) >> 7), but when the
	// shifted distance exceeds 99 the step is fixed at 100.
	MOVL CX, BX
	SUBL 12(SP), BX
	SHRL $0x07, BX
	CMPL BX, $0x63
	JLE check_maxskip_ok_encodeSnappyBetterBlockAsm
	LEAL 100(CX), BX
	JMP check_maxskip_cont_encodeSnappyBetterBlockAsm

check_maxskip_ok_encodeSnappyBetterBlockAsm:
	LEAL 1(CX)(BX*1), BX

check_maxskip_cont_encodeSnappyBetterBlockAsm:
	// Stop searching once the next position reaches src_len - 8.
	CMPL BX, 8(SP)
	JGE emit_remainder_encodeSnappyBetterBlockAsm

	// SI = 8 source bytes at CX; remember the next position in 20(SP).
	MOVQ (DX)(CX*1), SI
	MOVL BX, 20(SP)

	// R9 = 17-bit hash of the low 7 bytes, R10 = 14-bit hash of the low 4 bytes.
	MOVQ $0x00cf1bbcdcbfa563, R8
	MOVQ $0x9e3779b1, BX
	MOVQ SI, R9
	MOVQ SI, R10
	SHLQ $0x08, R9
	IMULQ R8, R9
	SHRQ $0x2f, R9
	SHLQ $0x20, R10
	IMULQ BX, R10
	SHRQ $0x32, R10

	// Fetch the previous entries (BX from the 7-byte table, DI from the 4-byte
	// table) and replace both with the current offset.
	MOVL 24(SP)(R9*4), BX
	MOVL 524312(SP)(R10*4), DI
	MOVL CX, 24(SP)(R9*4)
	MOVL CX, 524312(SP)(R10*4)

	// Load 8 bytes at each candidate and prefer a full 8-byte match.
	MOVQ (DX)(BX*1), R9
	MOVQ (DX)(DI*1), R10
	CMPQ R9, SI
	JEQ candidate_match_encodeSnappyBetterBlockAsm
	CMPQ R10, SI
	JNE no_short_found_encodeSnappyBetterBlockAsm
	MOVL DI, BX
	JMP candidate_match_encodeSnappyBetterBlockAsm

no_short_found_encodeSnappyBetterBlockAsm:
	// No 8-byte match: fall back to comparing only the low 4 bytes.
	CMPL R9, SI
	JEQ candidate_match_encodeSnappyBetterBlockAsm
	CMPL R10, SI
	JEQ candidateS_match_encodeSnappyBetterBlockAsm

	// No match at all: continue at the saved next position.
	MOVL 20(SP), CX
	JMP search_loop_encodeSnappyBetterBlockAsm

candidateS_match_encodeSnappyBetterBlockAsm:
	// Only the 4-byte-table candidate (DI) matched. Probe the 7-byte table for
	// position CX+1 (SI shifted right one byte), recording CX+1 there; use that
	// candidate if its first 4 bytes match, otherwise step back to CX and use DI.
	SHRQ $0x08, SI
	MOVQ SI, R9
	SHLQ $0x08, R9
	IMULQ R8, R9
	SHRQ $0x2f, R9
	MOVL 24(SP)(R9*4), BX
	INCL CX
	MOVL CX, 24(SP)(R9*4)
	CMPL (DX)(BX*1), SI
	JEQ candidate_match_encodeSnappyBetterBlockAsm
	DECL CX
	MOVL DI, BX

candidate_match_encodeSnappyBetterBlockAsm:
	// Extend the match backwards while the preceding bytes are equal, without
	// going below the already-emitted position (SI) or candidate offset 0.
	MOVL 12(SP), SI
	TESTL BX, BX
	JZ match_extend_back_end_encodeSnappyBetterBlockAsm

match_extend_back_loop_encodeSnappyBetterBlockAsm:
	CMPL CX, SI
	JLE match_extend_back_end_encodeSnappyBetterBlockAsm
	MOVB -1(DX)(BX*1), DI
	MOVB -1(DX)(CX*1), R8
	CMPB DI, R8
	JNE match_extend_back_end_encodeSnappyBetterBlockAsm
	LEAL -1(CX), CX
	DECL BX
	JZ match_extend_back_end_encodeSnappyBetterBlockAsm
	JMP match_extend_back_loop_encodeSnappyBetterBlockAsm

match_extend_back_end_encodeSnappyBetterBlockAsm:
	// Output space check: AX + pending literal length + 5 must stay below the
	// limit in (SP); otherwise return 0.
	MOVL CX, SI
	SUBL 12(SP), SI
	LEAQ 5(AX)(SI*1), SI
	CMPQ SI, (SP)
	JL match_dst_size_check_encodeSnappyBetterBlockAsm
	MOVQ $0x00000000, ret+48(FP)
	RET

match_dst_size_check_encodeSnappyBetterBlockAsm:
	// SI = match start. Skip the first 4 bytes on both sides and set up
	// R8 = &src[CX], R9 = &src[BX], DI = bytes remaining after CX.
	MOVL CX, SI
	ADDL $0x04, CX
	ADDL $0x04, BX
	MOVQ src_len+32(FP), DI
	SUBL CX, DI
	LEAQ (DX)(CX*1), R8
	LEAQ (DX)(BX*1), R9

	// matchLen
	// R11 counts additional matching bytes beyond the first 4.
	XORL R11, R11
	CMPL DI, $0x08
	JL matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm

matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm:
	// Compare 8 bytes at a time; on a difference, the index of the lowest set
	// bit of the XOR divided by 8 gives the number of equal leading bytes.
	MOVQ (R8)(R11*1), R10
	XORQ (R9)(R11*1), R10
	TESTQ R10, R10
	JZ matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm

#ifdef GOAMD64_v3
	TZCNTQ R10, R10

#else
	BSFQ R10, R10

#endif
	SARQ $0x03, R10
	LEAL (R11)(R10*1), R11
	JMP match_nolit_end_encodeSnappyBetterBlockAsm

matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm:
	LEAL -8(DI), DI
	LEAL 8(R11), R11
	CMPL DI, $0x08
	JGE matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm
	JZ match_nolit_end_encodeSnappyBetterBlockAsm

matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm:
	// Fewer than 8 bytes remain: try 4, then 2, then 1 byte.
	CMPL DI, $0x04
	JL matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm
	MOVL (R8)(R11*1), R10
	CMPL (R9)(R11*1), R10
	JNE matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm
	SUBL $0x04, DI
	LEAL 4(R11), R11

matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm:
	CMPL DI, $0x02
	JL matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm
	MOVW (R8)(R11*1), R10
	CMPW (R9)(R11*1), R10
	JNE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm
	SUBL $0x02, DI
	LEAL 2(R11), R11

matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm:
	CMPL DI, $0x01
	JL match_nolit_end_encodeSnappyBetterBlockAsm
	MOVB (R8)(R11*1), R10
	CMPB (R9)(R11*1), R10
	JNE match_nolit_end_encodeSnappyBetterBlockAsm
	LEAL 1(R11), R11

match_nolit_end_encodeSnappyBetterBlockAsm:
	// DI = match offset (CX - BX; both were advanced by 4 above).
	MOVL CX, DI
	SUBL BX, DI

	// Check if repeat
	// Reject the match when at most 1 extra byte matched (total length <= 5)
	// and the offset exceeds 65535; resume one past the saved next position.
	CMPL R11, $0x01
	JG match_length_ok_encodeSnappyBetterBlockAsm
	CMPL DI, $0x0000ffff
	JLE match_length_ok_encodeSnappyBetterBlockAsm
	MOVL 20(SP), CX
	INCL CX
	JMP search_loop_encodeSnappyBetterBlockAsm

match_length_ok_encodeSnappyBetterBlockAsm:
	// Record the offset, then emit any pending literals src[12(SP):SI].
	MOVL DI, 16(SP)
	MOVL 12(SP), BX
	CMPL BX, SI
	JEQ emit_literal_done_match_emit_encodeSnappyBetterBlockAsm

	// R9 = literal source, R8 = literal length, BX = length - 1.
	MOVL SI, R8
	MOVL SI, 12(SP)
	LEAQ (DX)(BX*1), R9
	SUBL BX, R8
	LEAL -1(R8), BX

	// Pick the literal header size from length-1: <60 fits in the tag byte,
	// otherwise tag 0xf0/0xf4/0xf8/0xfc is followed by 1/2/3/4 length bytes.
	CMPL BX, $0x3c
	JLT one_byte_match_emit_encodeSnappyBetterBlockAsm
	CMPL BX, $0x00000100
	JLT two_bytes_match_emit_encodeSnappyBetterBlockAsm
	CMPL BX, $0x00010000
	JLT three_bytes_match_emit_encodeSnappyBetterBlockAsm
	CMPL BX, $0x01000000
	JLT four_bytes_match_emit_encodeSnappyBetterBlockAsm
	MOVB $0xfc, (AX)
	MOVL BX, 1(AX)
	ADDQ $0x05, AX
	JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm

four_bytes_match_emit_encodeSnappyBetterBlockAsm:
	MOVL BX, R10
	SHRL $0x10, R10
	MOVB $0xf8, (AX)
	MOVW BX, 1(AX)
	MOVB R10, 3(AX)
	ADDQ $0x04, AX
	JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm

three_bytes_match_emit_encodeSnappyBetterBlockAsm:
	MOVB $0xf4, (AX)
	MOVW BX, 1(AX)
	ADDQ $0x03, AX
	JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm

two_bytes_match_emit_encodeSnappyBetterBlockAsm:
	MOVB $0xf0, (AX)
	MOVB BL, 1(AX)
	ADDQ $0x02, AX

	// Lengths up to 64 (length-1 < 0x40) use the short copy, others the long one.
	CMPL BX, $0x40
	JL memmove_match_emit_encodeSnappyBetterBlockAsm
	JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm

one_byte_match_emit_encodeSnappyBetterBlockAsm:
	// Tag byte = (length-1) << 2.
	SHLB $0x02, BL
	MOVB BL, (AX)
	ADDQ $0x01, AX

memmove_match_emit_encodeSnappyBetterBlockAsm:
	// BX = output pointer after the literal bytes.
	LEAQ (AX)(R8*1), BX

	// genMemMoveShort
	// Copy R8 (<= 64) bytes using overlapping head/tail loads and stores.
	CMPQ R8, $0x08
	JLE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_8
	CMPQ R8, $0x10
	JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_8through16
	CMPQ R8, $0x20
	JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_17through32
	JMP emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_33through64

emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_8:
	// Always moves a full 8 bytes even when R8 < 8; AX still advances by R8 only.
	// NOTE(review): presumably safe because the search loop stops 8 bytes
	// before the end of src and the output limit leaves slack - confirm.
	MOVQ (R9), R10
	MOVQ R10, (AX)
	JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm

emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_8through16:
	MOVQ (R9), R10
	MOVQ -8(R9)(R8*1), R9
	MOVQ R10, (AX)
	MOVQ R9, -8(AX)(R8*1)
	JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm

emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_17through32:
	MOVOU (R9), X0
	MOVOU -16(R9)(R8*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(R8*1)
	JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm

emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm_memmove_move_33through64:
	MOVOU (R9), X0
	MOVOU 16(R9), X1
	MOVOU -32(R9)(R8*1), X2
	MOVOU -16(R9)(R8*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)

memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm:
	MOVQ BX, AX
	JMP emit_literal_done_match_emit_encodeSnappyBetterBlockAsm

memmove_long_match_emit_encodeSnappyBetterBlockAsm:
	// BX = output pointer after the literal bytes.
	LEAQ (AX)(R8*1), BX

	// genMemMoveLong
	// Preload the first and last 32 bytes into X0..X3; they are stored last
	// with unaligned stores. The middle is copied 32 bytes per iteration with
	// aligned stores: R13 = 64 - (AX & 31), so -32(AX)(R13) is 32-byte aligned.
	MOVOU (R9), X0
	MOVOU 16(R9), X1
	MOVOU -32(R9)(R8*1), X2
	MOVOU -16(R9)(R8*1), X3
	MOVQ R8, R12
	SHRQ $0x05, R12
	MOVQ AX, R10
	ANDL $0x0000001f, R10
	MOVQ $0x00000040, R13
	SUBQ R10, R13
	DECQ R12
	JA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32
	LEAQ -32(R9)(R13*1), R10
	LEAQ -32(AX)(R13*1), R14

emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_big_loop_back:
	MOVOU (R10), X4
	MOVOU 16(R10), X5
	MOVOA X4, (R14)
	MOVOA X5, 16(R14)
	ADDQ $0x20, R14
	ADDQ $0x20, R10
	ADDQ $0x20, R13
	DECQ R12
	JNA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_big_loop_back

emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32:
	MOVOU -32(R9)(R13*1), X4
	MOVOU -16(R9)(R13*1), X5
	MOVOA X4, -32(AX)(R13*1)
	MOVOA X5, -16(AX)(R13*1)
	ADDQ $0x20, R13
	CMPQ R8, R13
	JAE emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)
	MOVQ BX, AX

emit_literal_done_match_emit_encodeSnappyBetterBlockAsm:
	// CX = end of match, R11 = total match length, 12(SP) = new emitted position.
	ADDL R11, CX
	ADDL $0x04, R11
	MOVL CX, 12(SP)

	// emitCopy
	// Offsets >= 65536 use 5-byte copies (tag + 4-byte offset); smaller offsets
	// use the 2- or 3-byte forms below.
	CMPL DI, $0x00010000
	JL two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm

four_bytes_loop_back_match_nolit_encodeSnappyBetterBlockAsm:
	// While more than 64 bytes remain, emit tag 0xff + 4-byte offset and
	// subtract 64 from the remaining length.
	CMPL R11, $0x40
	JLE four_bytes_remain_match_nolit_encodeSnappyBetterBlockAsm
	MOVB $0xff, (AX)
	MOVL DI, 1(AX)
	LEAL -64(R11), R11
	ADDQ $0x05, AX
	CMPL R11, $0x04
	JL four_bytes_remain_match_nolit_encodeSnappyBetterBlockAsm
	JMP four_bytes_loop_back_match_nolit_encodeSnappyBetterBlockAsm

four_bytes_remain_match_nolit_encodeSnappyBetterBlockAsm:
	// Emit the remaining length (if any): tag = length*4 - 1, then 4-byte offset.
	TESTL R11, R11
	JZ match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm
	XORL BX, BX
	LEAL -1(BX)(R11*4), R11
	MOVB R11, (AX)
	MOVL DI, 1(AX)
	ADDQ $0x05, AX
	JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm

two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm:
	// While more than 64 bytes remain, emit tag 0xee + 2-byte offset and
	// subtract 60 from the remaining length.
	CMPL R11, $0x40
	JLE two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm
	MOVB $0xee, (AX)
	MOVW DI, 1(AX)
	LEAL -60(R11), R11
	ADDQ $0x03, AX
	JMP two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm

two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm:
	// Length < 12 and offset < 2048 use the 2-byte form: tag = length*4 - 15
	// OR-ed with the offset's high bits shifted into bits 5 and up, followed by
	// the low offset byte. Everything else uses the 3-byte form.
	MOVL R11, BX
	SHLL $0x02, BX
	CMPL R11, $0x0c
	JGE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm
	CMPL DI, $0x00000800
	JGE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm
	LEAL -15(BX), BX
	MOVB DI, 1(AX)
	SHRL $0x08, DI
	SHLL $0x05, DI
	ORL DI, BX
	MOVB BL, (AX)
	ADDQ $0x02, AX
	JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm

emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm:
	// 3-byte form: tag = length*4 - 2, then 2-byte offset.
	LEAL -2(BX), BX
	MOVB BL, (AX)
	MOVW DI, 1(AX)
	ADDQ $0x03, AX

match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm:
	// Finish if the match ended at or past src_len - 8; return 0 if the output
	// pointer reached the limit.
	CMPL CX, 8(SP)
	JGE emit_remainder_encodeSnappyBetterBlockAsm
	CMPQ AX, (SP)
	JL match_nolit_dst_ok_encodeSnappyBetterBlockAsm
	MOVQ $0x00000000, ret+48(FP)
	RET

match_nolit_dst_ok_encodeSnappyBetterBlockAsm:
	// Index positions inside the match just emitted. With s = match start (SI)
	// and e = match end (CX): the 7-byte table gets s+1 and e-2, the 4-byte
	// table gets s+2 and e-1.
	MOVQ $0x00cf1bbcdcbfa563, BX
	MOVQ $0x9e3779b1, DI
	LEAQ 1(SI), SI
	LEAQ -2(CX), R8
	MOVQ (DX)(SI*1), R9
	MOVQ 1(DX)(SI*1), R10
	MOVQ (DX)(R8*1), R11
	MOVQ 1(DX)(R8*1), R12
	SHLQ $0x08, R9
	IMULQ BX, R9
	SHRQ $0x2f, R9
	SHLQ $0x20, R10
	IMULQ DI, R10
	SHRQ $0x32, R10
	SHLQ $0x08, R11
	IMULQ BX, R11
	SHRQ $0x2f, R11
	SHLQ $0x20, R12
	IMULQ DI, R12
	SHRQ $0x32, R12
	LEAQ 1(SI), DI
	LEAQ 1(R8), R13
	MOVL SI, 24(SP)(R9*4)
	MOVL R8, 24(SP)(R11*4)
	MOVL DI, 524312(SP)(R10*4)
	MOVL R13, 524312(SP)(R12*4)
	ADDQ $0x01, SI
	SUBQ $0x01, R8

index_loop_encodeSnappyBetterBlockAsm:
	// Walk inwards from both ends in steps of 2, adding each position to the
	// 7-byte table, until the two cursors meet; then resume searching at CX.
	CMPQ SI, R8
	JAE search_loop_encodeSnappyBetterBlockAsm
	MOVQ (DX)(SI*1), DI
	MOVQ (DX)(R8*1), R9
	SHLQ $0x08, DI
	IMULQ BX, DI
	SHRQ $0x2f, DI
	SHLQ $0x08, R9
	IMULQ BX, R9
	SHRQ $0x2f, R9
	MOVL SI, 24(SP)(DI*4)
	MOVL R8, 24(SP)(R9*4)
	ADDQ $0x02, SI
	SUBQ $0x02, R8
	JMP index_loop_encodeSnappyBetterBlockAsm

emit_remainder_encodeSnappyBetterBlockAsm:
	// Output space check for the trailing literals: AX + (src_len - 12(SP)) + 5
	// must stay below the limit in (SP); otherwise return 0.
	MOVQ src_len+32(FP), CX
	SUBL 12(SP), CX
	LEAQ 5(AX)(CX*1), CX
	CMPQ CX, (SP)
	JL emit_remainder_ok_encodeSnappyBetterBlockAsm
	MOVQ $0x00000000, ret+48(FP)
	RET

emit_remainder_ok_encodeSnappyBetterBlockAsm:
	// Emit src[12(SP):src_len] as one literal, if non-empty.
	MOVQ src_len+32(FP), CX
	MOVL 12(SP), BX
	CMPL BX, CX
	JEQ emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm

	// CX = literal source, SI = literal length, DX = length - 1 (the src base
	// in DX is no longer needed past this point).
	MOVL CX, SI
	MOVL CX, 12(SP)
	LEAQ (DX)(BX*1), CX
	SUBL BX, SI
	LEAL -1(SI), DX

	// Same literal header selection as in the match path above.
	CMPL DX, $0x3c
	JLT one_byte_emit_remainder_encodeSnappyBetterBlockAsm
	CMPL DX, $0x00000100
	JLT two_bytes_emit_remainder_encodeSnappyBetterBlockAsm
	CMPL DX, $0x00010000
	JLT three_bytes_emit_remainder_encodeSnappyBetterBlockAsm
	CMPL DX, $0x01000000
	JLT four_bytes_emit_remainder_encodeSnappyBetterBlockAsm
	MOVB $0xfc, (AX)
	MOVL DX, 1(AX)
	ADDQ $0x05, AX
	JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm

four_bytes_emit_remainder_encodeSnappyBetterBlockAsm:
	MOVL DX, BX
	SHRL $0x10, BX
	MOVB $0xf8, (AX)
	MOVW DX, 1(AX)
	MOVB BL, 3(AX)
	ADDQ $0x04, AX
	JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm

three_bytes_emit_remainder_encodeSnappyBetterBlockAsm:
	MOVB $0xf4, (AX)
	MOVW DX, 1(AX)
	ADDQ $0x03, AX
	JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm

two_bytes_emit_remainder_encodeSnappyBetterBlockAsm:
	MOVB $0xf0, (AX)
	MOVB DL, 1(AX)
	ADDQ $0x02, AX
	CMPL DX, $0x40
	JL memmove_emit_remainder_encodeSnappyBetterBlockAsm
	JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm

one_byte_emit_remainder_encodeSnappyBetterBlockAsm:
	SHLB $0x02, DL
	MOVB DL, (AX)
	ADDQ $0x01, AX

memmove_emit_remainder_encodeSnappyBetterBlockAsm:
	// DX = output pointer after the literal bytes, BX = length.
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveShort
	// Exact-size copy: unlike the match path, lengths below 8 are split into
	// 1-2, 3 and 4-7 byte cases so no more than BX bytes are read or written.
	CMPQ BX, $0x03
	JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_1or2
	JE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_3
	CMPQ BX, $0x08
	JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_4through7
	CMPQ BX, $0x10
	JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_8through16
	CMPQ BX, $0x20
	JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_17through32
	JMP emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_33through64

emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_1or2:
	MOVB (CX), SI
	MOVB -1(CX)(BX*1), CL
	MOVB SI, (AX)
	MOVB CL, -1(AX)(BX*1)
	JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm

emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_3:
	MOVW (CX), SI
	MOVB 2(CX), CL
	MOVW SI, (AX)
	MOVB CL, 2(AX)
	JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm

emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_4through7:
	MOVL (CX), SI
	MOVL -4(CX)(BX*1), CX
	MOVL SI, (AX)
	MOVL CX, -4(AX)(BX*1)
	JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm

emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_8through16:
	MOVQ (CX), SI
	MOVQ -8(CX)(BX*1), CX
	MOVQ SI, (AX)
	MOVQ CX, -8(AX)(BX*1)
	JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm

emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_17through32:
	MOVOU (CX), X0
	MOVOU -16(CX)(BX*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(BX*1)
	JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm

emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm_memmove_move_33through64:
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)

memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm:
	MOVQ DX, AX
	JMP emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm

memmove_long_emit_remainder_encodeSnappyBetterBlockAsm:
	// DX = output pointer after the literal bytes, BX = length.
	LEAQ (AX)(SI*1), DX
	MOVL SI, BX

	// genMemMoveLong
	// Same scheme as the long copy in the match path: head and tail preloaded
	// in X0..X3, middle copied in 32-byte chunks with aligned stores.
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(BX*1), X2
	MOVOU -16(CX)(BX*1), X3
	MOVQ BX, DI
	SHRQ $0x05, DI
	MOVQ AX, SI
	ANDL $0x0000001f, SI
	MOVQ $0x00000040, R8
	SUBQ SI, R8
	DECQ DI
	JA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32
	LEAQ -32(CX)(R8*1), SI
	LEAQ -32(AX)(R8*1), R9

emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_big_loop_back:
	MOVOU (SI), X4
	MOVOU 16(SI), X5
	MOVOA X4, (R9)
	MOVOA X5, 16(R9)
	ADDQ $0x20, R9
	ADDQ $0x20, SI
	ADDQ $0x20, R8
	DECQ DI
	JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_big_loop_back

emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32:
	MOVOU -32(CX)(R8*1), X4
	MOVOU -16(CX)(R8*1), X5
	MOVOA X4, -32(AX)(R8*1)
	MOVOA X5, -16(AX)(R8*1)
	ADDQ $0x20, R8
	CMPQ BX, R8
	JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsmlarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(BX*1)
	MOVOU X3, -16(AX)(BX*1)
	MOVQ DX, AX

emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm:
	// Return the number of bytes written: output pointer minus dst base.
	MOVQ dst_base+0(FP), CX
	SUBQ CX, AX
	MOVQ AX, ret+48(FP)
	RET

// func encodeSnappyBetterBlockAsm64K(dst []byte, src []byte) int
// Requires: BMI, SSE2
TEXT ·encodeSnappyBetterBlockAsm64K(SB), $327704-56
	MOVQ dst_base+0(FP), AX

	// Zero the tables: 0xa00 iterations x 128 bytes = 327680 bytes from 24(SP).
	MOVQ $0x00000a00, CX
	LEAQ 24(SP), DX
	PXOR X0, X0

zero_loop_encodeSnappyBetterBlockAsm64K:
	MOVOU X0, (DX)
	MOVOU X0, 16(DX)
	MOVOU X0, 32(DX)
	MOVOU X0, 48(DX)
	MOVOU X0, 64(DX)
	MOVOU X0, 80(DX)
	MOVOU X0, 96(DX)
	MOVOU X0, 112(DX)
	ADDQ $0x80, DX
	DECQ CX
	JNZ zero_loop_encodeSnappyBetterBlockAsm64K
	MOVL $0x00000000, 12(SP)
	MOVQ src_len+32(FP), CX
	LEAQ -9(CX), DX
	LEAQ -8(CX), BX
	MOVL BX, 8(SP)
	SHRQ $0x05, CX
	SUBL CX, DX
	LEAQ (AX)(DX*1), DX
	MOVQ DX, (SP)
	MOVL $0x00000001, CX
	MOVL $0x00000000, 16(SP)
	MOVQ src_base+24(FP), DX

search_loop_encodeSnappyBetterBlockAsm64K:
	MOVL CX, BX
	SUBL 12(SP), BX
	SHRL $0x07, BX
	LEAL 1(CX)(BX*1), BX
	CMPL BX, 8(SP)
	JGE emit_remainder_encodeSnappyBetterBlockAsm64K
	MOVQ (DX)(CX*1), SI
	MOVL BX, 20(SP)
	MOVQ $0x00cf1bbcdcbfa563, R8
	MOVQ $0x9e3779b1, BX
	MOVQ SI, R9
	MOVQ SI, R10
	SHLQ $0x08, R9
	IMULQ R8, R9
	SHRQ $0x30, R9
	SHLQ $0x20, R10
	IMULQ BX, R10
	SHRQ $0x32, R10
	MOVL 24(SP)(R9*4), BX
	MOVL 262168(SP)(R10*4), DI
	MOVL CX, 24(SP)(R9*4)
	MOVL CX, 262168(SP)(R10*4)
	MOVQ (DX)(BX*1), R9
	MOVQ (DX)(DI*1), R10
	CMPQ R9, SI
	JEQ candidate_match_encodeSnappyBetterBlockAsm64K
	CMPQ R10, SI
	JNE no_short_found_encodeSnappyBetterBlockAsm64K
	MOVL DI, BX
	JMP candidate_match_encodeSnappyBetterBlockAsm64K

no_short_found_encodeSnappyBetterBlockAsm64K:
	CMPL R9, SI
	JEQ candidate_match_encodeSnappyBetterBlockAsm64K
	CMPL R10, SI
	JEQ candidateS_match_encodeSnappyBetterBlockAsm64K
	MOVL 20(SP), CX
	JMP search_loop_encodeSnappyBetterBlockAsm64K

candidateS_match_encodeSnappyBetterBlockAsm64K:
	SHRQ $0x08, SI
	MOVQ SI, R9
	SHLQ
$0x08, R9 IMULQ R8, R9 SHRQ $0x30, R9 MOVL 24(SP)(R9*4), BX INCL CX MOVL CX, 24(SP)(R9*4) CMPL (DX)(BX*1), SI JEQ candidate_match_encodeSnappyBetterBlockAsm64K DECL CX MOVL DI, BX candidate_match_encodeSnappyBetterBlockAsm64K: MOVL 12(SP), SI TESTL BX, BX JZ match_extend_back_end_encodeSnappyBetterBlockAsm64K match_extend_back_loop_encodeSnappyBetterBlockAsm64K: CMPL CX, SI JLE match_extend_back_end_encodeSnappyBetterBlockAsm64K MOVB -1(DX)(BX*1), DI MOVB -1(DX)(CX*1), R8 CMPB DI, R8 JNE match_extend_back_end_encodeSnappyBetterBlockAsm64K LEAL -1(CX), CX DECL BX JZ match_extend_back_end_encodeSnappyBetterBlockAsm64K JMP match_extend_back_loop_encodeSnappyBetterBlockAsm64K match_extend_back_end_encodeSnappyBetterBlockAsm64K: MOVL CX, SI SUBL 12(SP), SI LEAQ 3(AX)(SI*1), SI CMPQ SI, (SP) JL match_dst_size_check_encodeSnappyBetterBlockAsm64K MOVQ $0x00000000, ret+48(FP) RET match_dst_size_check_encodeSnappyBetterBlockAsm64K: MOVL CX, SI ADDL $0x04, CX ADDL $0x04, BX MOVQ src_len+32(FP), DI SUBL CX, DI LEAQ (DX)(CX*1), R8 LEAQ (DX)(BX*1), R9 // matchLen XORL R11, R11 CMPL DI, $0x08 JL matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm64K matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm64K: MOVQ (R8)(R11*1), R10 XORQ (R9)(R11*1), R10 TESTQ R10, R10 JZ matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm64K #ifdef GOAMD64_v3 TZCNTQ R10, R10 #else BSFQ R10, R10 #endif SARQ $0x03, R10 LEAL (R11)(R10*1), R11 JMP match_nolit_end_encodeSnappyBetterBlockAsm64K matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm64K: LEAL -8(DI), DI LEAL 8(R11), R11 CMPL DI, $0x08 JGE matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm64K JZ match_nolit_end_encodeSnappyBetterBlockAsm64K matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm64K: CMPL DI, $0x04 JL matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm64K MOVL (R8)(R11*1), R10 CMPL (R9)(R11*1), R10 JNE matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm64K SUBL $0x04, DI LEAL 4(R11), R11 
matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm64K: CMPL DI, $0x02 JL matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm64K MOVW (R8)(R11*1), R10 CMPW (R9)(R11*1), R10 JNE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm64K SUBL $0x02, DI LEAL 2(R11), R11 matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm64K: CMPL DI, $0x01 JL match_nolit_end_encodeSnappyBetterBlockAsm64K MOVB (R8)(R11*1), R10 CMPB (R9)(R11*1), R10 JNE match_nolit_end_encodeSnappyBetterBlockAsm64K LEAL 1(R11), R11 match_nolit_end_encodeSnappyBetterBlockAsm64K: MOVL CX, DI SUBL BX, DI // Check if repeat MOVL DI, 16(SP) MOVL 12(SP), BX CMPL BX, SI JEQ emit_literal_done_match_emit_encodeSnappyBetterBlockAsm64K MOVL SI, R8 MOVL SI, 12(SP) LEAQ (DX)(BX*1), R9 SUBL BX, R8 LEAL -1(R8), BX CMPL BX, $0x3c JLT one_byte_match_emit_encodeSnappyBetterBlockAsm64K CMPL BX, $0x00000100 JLT two_bytes_match_emit_encodeSnappyBetterBlockAsm64K MOVB $0xf4, (AX) MOVW BX, 1(AX) ADDQ $0x03, AX JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm64K two_bytes_match_emit_encodeSnappyBetterBlockAsm64K: MOVB $0xf0, (AX) MOVB BL, 1(AX) ADDQ $0x02, AX CMPL BX, $0x40 JL memmove_match_emit_encodeSnappyBetterBlockAsm64K JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm64K one_byte_match_emit_encodeSnappyBetterBlockAsm64K: SHLB $0x02, BL MOVB BL, (AX) ADDQ $0x01, AX memmove_match_emit_encodeSnappyBetterBlockAsm64K: LEAQ (AX)(R8*1), BX // genMemMoveShort CMPQ R8, $0x08 JLE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_8 CMPQ R8, $0x10 JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_8through16 CMPQ R8, $0x20 JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_17through32 JMP emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_33through64 emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_8: MOVQ (R9), R10 MOVQ R10, (AX) JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm64K 
emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_8through16: MOVQ (R9), R10 MOVQ -8(R9)(R8*1), R9 MOVQ R10, (AX) MOVQ R9, -8(AX)(R8*1) JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm64K emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_17through32: MOVOU (R9), X0 MOVOU -16(R9)(R8*1), X1 MOVOU X0, (AX) MOVOU X1, -16(AX)(R8*1) JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm64K emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm64K_memmove_move_33through64: MOVOU (R9), X0 MOVOU 16(R9), X1 MOVOU -32(R9)(R8*1), X2 MOVOU -16(R9)(R8*1), X3 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(R8*1) MOVOU X3, -16(AX)(R8*1) memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm64K: MOVQ BX, AX JMP emit_literal_done_match_emit_encodeSnappyBetterBlockAsm64K memmove_long_match_emit_encodeSnappyBetterBlockAsm64K: LEAQ (AX)(R8*1), BX // genMemMoveLong MOVOU (R9), X0 MOVOU 16(R9), X1 MOVOU -32(R9)(R8*1), X2 MOVOU -16(R9)(R8*1), X3 MOVQ R8, R12 SHRQ $0x05, R12 MOVQ AX, R10 ANDL $0x0000001f, R10 MOVQ $0x00000040, R13 SUBQ R10, R13 DECQ R12 JA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32 LEAQ -32(R9)(R13*1), R10 LEAQ -32(AX)(R13*1), R14 emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_big_loop_back: MOVOU (R10), X4 MOVOU 16(R10), X5 MOVOA X4, (R14) MOVOA X5, 16(R14) ADDQ $0x20, R14 ADDQ $0x20, R10 ADDQ $0x20, R13 DECQ R12 JNA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_big_loop_back emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32: MOVOU -32(R9)(R13*1), X4 MOVOU -16(R9)(R13*1), X5 MOVOA X4, -32(AX)(R13*1) MOVOA X5, -16(AX)(R13*1) ADDQ $0x20, R13 CMPQ R8, R13 JAE emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(R8*1) MOVOU X3, -16(AX)(R8*1) MOVQ BX, AX emit_literal_done_match_emit_encodeSnappyBetterBlockAsm64K: 
ADDL R11, CX ADDL $0x04, R11 MOVL CX, 12(SP) // emitCopy two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm64K: CMPL R11, $0x40 JLE two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm64K MOVB $0xee, (AX) MOVW DI, 1(AX) LEAL -60(R11), R11 ADDQ $0x03, AX JMP two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm64K two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm64K: MOVL R11, BX SHLL $0x02, BX CMPL R11, $0x0c JGE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm64K CMPL DI, $0x00000800 JGE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm64K LEAL -15(BX), BX MOVB DI, 1(AX) SHRL $0x08, DI SHLL $0x05, DI ORL DI, BX MOVB BL, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm64K emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm64K: LEAL -2(BX), BX MOVB BL, (AX) MOVW DI, 1(AX) ADDQ $0x03, AX match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm64K: CMPL CX, 8(SP) JGE emit_remainder_encodeSnappyBetterBlockAsm64K CMPQ AX, (SP) JL match_nolit_dst_ok_encodeSnappyBetterBlockAsm64K MOVQ $0x00000000, ret+48(FP) RET match_nolit_dst_ok_encodeSnappyBetterBlockAsm64K: MOVQ $0x00cf1bbcdcbfa563, BX MOVQ $0x9e3779b1, DI LEAQ 1(SI), SI LEAQ -2(CX), R8 MOVQ (DX)(SI*1), R9 MOVQ 1(DX)(SI*1), R10 MOVQ (DX)(R8*1), R11 MOVQ 1(DX)(R8*1), R12 SHLQ $0x08, R9 IMULQ BX, R9 SHRQ $0x30, R9 SHLQ $0x20, R10 IMULQ DI, R10 SHRQ $0x32, R10 SHLQ $0x08, R11 IMULQ BX, R11 SHRQ $0x30, R11 SHLQ $0x20, R12 IMULQ DI, R12 SHRQ $0x32, R12 LEAQ 1(SI), DI LEAQ 1(R8), R13 MOVL SI, 24(SP)(R9*4) MOVL R8, 24(SP)(R11*4) MOVL DI, 262168(SP)(R10*4) MOVL R13, 262168(SP)(R12*4) ADDQ $0x01, SI SUBQ $0x01, R8 index_loop_encodeSnappyBetterBlockAsm64K: CMPQ SI, R8 JAE search_loop_encodeSnappyBetterBlockAsm64K MOVQ (DX)(SI*1), DI MOVQ (DX)(R8*1), R9 SHLQ $0x08, DI IMULQ BX, DI SHRQ $0x30, DI SHLQ $0x08, R9 IMULQ BX, R9 SHRQ $0x30, R9 MOVL SI, 24(SP)(DI*4) MOVL R8, 24(SP)(R9*4) ADDQ $0x02, SI SUBQ $0x02, R8 JMP index_loop_encodeSnappyBetterBlockAsm64K 
emit_remainder_encodeSnappyBetterBlockAsm64K: MOVQ src_len+32(FP), CX SUBL 12(SP), CX LEAQ 3(AX)(CX*1), CX CMPQ CX, (SP) JL emit_remainder_ok_encodeSnappyBetterBlockAsm64K MOVQ $0x00000000, ret+48(FP) RET emit_remainder_ok_encodeSnappyBetterBlockAsm64K: MOVQ src_len+32(FP), CX MOVL 12(SP), BX CMPL BX, CX JEQ emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm64K MOVL CX, SI MOVL CX, 12(SP) LEAQ (DX)(BX*1), CX SUBL BX, SI LEAL -1(SI), DX CMPL DX, $0x3c JLT one_byte_emit_remainder_encodeSnappyBetterBlockAsm64K CMPL DX, $0x00000100 JLT two_bytes_emit_remainder_encodeSnappyBetterBlockAsm64K MOVB $0xf4, (AX) MOVW DX, 1(AX) ADDQ $0x03, AX JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64K two_bytes_emit_remainder_encodeSnappyBetterBlockAsm64K: MOVB $0xf0, (AX) MOVB DL, 1(AX) ADDQ $0x02, AX CMPL DX, $0x40 JL memmove_emit_remainder_encodeSnappyBetterBlockAsm64K JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64K one_byte_emit_remainder_encodeSnappyBetterBlockAsm64K: SHLB $0x02, DL MOVB DL, (AX) ADDQ $0x01, AX memmove_emit_remainder_encodeSnappyBetterBlockAsm64K: LEAQ (AX)(SI*1), DX MOVL SI, BX // genMemMoveShort CMPQ BX, $0x03 JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_1or2 JE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_3 CMPQ BX, $0x08 JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_4through7 CMPQ BX, $0x10 JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_8through16 CMPQ BX, $0x20 JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_17through32 JMP emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_33through64 emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_1or2: MOVB (CX), SI MOVB -1(CX)(BX*1), CL MOVB SI, (AX) MOVB CL, -1(AX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K 
emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_3: MOVW (CX), SI MOVB 2(CX), CL MOVW SI, (AX) MOVB CL, 2(AX) JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_4through7: MOVL (CX), SI MOVL -4(CX)(BX*1), CX MOVL SI, (AX) MOVL CX, -4(AX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_8through16: MOVQ (CX), SI MOVQ -8(CX)(BX*1), CX MOVQ SI, (AX) MOVQ CX, -8(AX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_17through32: MOVOU (CX), X0 MOVOU -16(CX)(BX*1), X1 MOVOU X0, (AX) MOVOU X1, -16(AX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm64K_memmove_move_33through64: MOVOU (CX), X0 MOVOU 16(CX), X1 MOVOU -32(CX)(BX*1), X2 MOVOU -16(CX)(BX*1), X3 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(BX*1) MOVOU X3, -16(AX)(BX*1) memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm64K: MOVQ DX, AX JMP emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm64K memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64K: LEAQ (AX)(SI*1), DX MOVL SI, BX // genMemMoveLong MOVOU (CX), X0 MOVOU 16(CX), X1 MOVOU -32(CX)(BX*1), X2 MOVOU -16(CX)(BX*1), X3 MOVQ BX, DI SHRQ $0x05, DI MOVQ AX, SI ANDL $0x0000001f, SI MOVQ $0x00000040, R8 SUBQ SI, R8 DECQ DI JA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32 LEAQ -32(CX)(R8*1), SI LEAQ -32(AX)(R8*1), R9 emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_big_loop_back: MOVOU (SI), X4 MOVOU 16(SI), X5 MOVOA X4, (R9) MOVOA X5, 16(R9) ADDQ $0x20, R9 ADDQ $0x20, SI ADDQ $0x20, R8 DECQ DI JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_big_loop_back 
emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32: MOVOU -32(CX)(R8*1), X4 MOVOU -16(CX)(R8*1), X5 MOVOA X4, -32(AX)(R8*1) MOVOA X5, -16(AX)(R8*1) ADDQ $0x20, R8 CMPQ BX, R8 JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm64Klarge_forward_sse_loop_32 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(BX*1) MOVOU X3, -16(AX)(BX*1) MOVQ DX, AX emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm64K: MOVQ dst_base+0(FP), CX SUBQ CX, AX MOVQ AX, ret+48(FP) RET // func encodeSnappyBetterBlockAsm12B(dst []byte, src []byte) int // Requires: BMI, SSE2 TEXT ·encodeSnappyBetterBlockAsm12B(SB), $81944-56 MOVQ dst_base+0(FP), AX MOVQ $0x00000280, CX LEAQ 24(SP), DX PXOR X0, X0 zero_loop_encodeSnappyBetterBlockAsm12B: MOVOU X0, (DX) MOVOU X0, 16(DX) MOVOU X0, 32(DX) MOVOU X0, 48(DX) MOVOU X0, 64(DX) MOVOU X0, 80(DX) MOVOU X0, 96(DX) MOVOU X0, 112(DX) ADDQ $0x80, DX DECQ CX JNZ zero_loop_encodeSnappyBetterBlockAsm12B MOVL $0x00000000, 12(SP) MOVQ src_len+32(FP), CX LEAQ -9(CX), DX LEAQ -8(CX), BX MOVL BX, 8(SP) SHRQ $0x05, CX SUBL CX, DX LEAQ (AX)(DX*1), DX MOVQ DX, (SP) MOVL $0x00000001, CX MOVL $0x00000000, 16(SP) MOVQ src_base+24(FP), DX search_loop_encodeSnappyBetterBlockAsm12B: MOVL CX, BX SUBL 12(SP), BX SHRL $0x06, BX LEAL 1(CX)(BX*1), BX CMPL BX, 8(SP) JGE emit_remainder_encodeSnappyBetterBlockAsm12B MOVQ (DX)(CX*1), SI MOVL BX, 20(SP) MOVQ $0x0000cf1bbcdcbf9b, R8 MOVQ $0x9e3779b1, BX MOVQ SI, R9 MOVQ SI, R10 SHLQ $0x10, R9 IMULQ R8, R9 SHRQ $0x32, R9 SHLQ $0x20, R10 IMULQ BX, R10 SHRQ $0x34, R10 MOVL 24(SP)(R9*4), BX MOVL 65560(SP)(R10*4), DI MOVL CX, 24(SP)(R9*4) MOVL CX, 65560(SP)(R10*4) MOVQ (DX)(BX*1), R9 MOVQ (DX)(DI*1), R10 CMPQ R9, SI JEQ candidate_match_encodeSnappyBetterBlockAsm12B CMPQ R10, SI JNE no_short_found_encodeSnappyBetterBlockAsm12B MOVL DI, BX JMP candidate_match_encodeSnappyBetterBlockAsm12B no_short_found_encodeSnappyBetterBlockAsm12B: CMPL R9, SI JEQ 
candidate_match_encodeSnappyBetterBlockAsm12B CMPL R10, SI JEQ candidateS_match_encodeSnappyBetterBlockAsm12B MOVL 20(SP), CX JMP search_loop_encodeSnappyBetterBlockAsm12B candidateS_match_encodeSnappyBetterBlockAsm12B: SHRQ $0x08, SI MOVQ SI, R9 SHLQ $0x10, R9 IMULQ R8, R9 SHRQ $0x32, R9 MOVL 24(SP)(R9*4), BX INCL CX MOVL CX, 24(SP)(R9*4) CMPL (DX)(BX*1), SI JEQ candidate_match_encodeSnappyBetterBlockAsm12B DECL CX MOVL DI, BX candidate_match_encodeSnappyBetterBlockAsm12B: MOVL 12(SP), SI TESTL BX, BX JZ match_extend_back_end_encodeSnappyBetterBlockAsm12B match_extend_back_loop_encodeSnappyBetterBlockAsm12B: CMPL CX, SI JLE match_extend_back_end_encodeSnappyBetterBlockAsm12B MOVB -1(DX)(BX*1), DI MOVB -1(DX)(CX*1), R8 CMPB DI, R8 JNE match_extend_back_end_encodeSnappyBetterBlockAsm12B LEAL -1(CX), CX DECL BX JZ match_extend_back_end_encodeSnappyBetterBlockAsm12B JMP match_extend_back_loop_encodeSnappyBetterBlockAsm12B match_extend_back_end_encodeSnappyBetterBlockAsm12B: MOVL CX, SI SUBL 12(SP), SI LEAQ 3(AX)(SI*1), SI CMPQ SI, (SP) JL match_dst_size_check_encodeSnappyBetterBlockAsm12B MOVQ $0x00000000, ret+48(FP) RET match_dst_size_check_encodeSnappyBetterBlockAsm12B: MOVL CX, SI ADDL $0x04, CX ADDL $0x04, BX MOVQ src_len+32(FP), DI SUBL CX, DI LEAQ (DX)(CX*1), R8 LEAQ (DX)(BX*1), R9 // matchLen XORL R11, R11 CMPL DI, $0x08 JL matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm12B matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm12B: MOVQ (R8)(R11*1), R10 XORQ (R9)(R11*1), R10 TESTQ R10, R10 JZ matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm12B #ifdef GOAMD64_v3 TZCNTQ R10, R10 #else BSFQ R10, R10 #endif SARQ $0x03, R10 LEAL (R11)(R10*1), R11 JMP match_nolit_end_encodeSnappyBetterBlockAsm12B matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm12B: LEAL -8(DI), DI LEAL 8(R11), R11 CMPL DI, $0x08 JGE matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm12B JZ match_nolit_end_encodeSnappyBetterBlockAsm12B 
matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm12B: CMPL DI, $0x04 JL matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm12B MOVL (R8)(R11*1), R10 CMPL (R9)(R11*1), R10 JNE matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm12B SUBL $0x04, DI LEAL 4(R11), R11 matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm12B: CMPL DI, $0x02 JL matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm12B MOVW (R8)(R11*1), R10 CMPW (R9)(R11*1), R10 JNE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm12B SUBL $0x02, DI LEAL 2(R11), R11 matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm12B: CMPL DI, $0x01 JL match_nolit_end_encodeSnappyBetterBlockAsm12B MOVB (R8)(R11*1), R10 CMPB (R9)(R11*1), R10 JNE match_nolit_end_encodeSnappyBetterBlockAsm12B LEAL 1(R11), R11 match_nolit_end_encodeSnappyBetterBlockAsm12B: MOVL CX, DI SUBL BX, DI // Check if repeat MOVL DI, 16(SP) MOVL 12(SP), BX CMPL BX, SI JEQ emit_literal_done_match_emit_encodeSnappyBetterBlockAsm12B MOVL SI, R8 MOVL SI, 12(SP) LEAQ (DX)(BX*1), R9 SUBL BX, R8 LEAL -1(R8), BX CMPL BX, $0x3c JLT one_byte_match_emit_encodeSnappyBetterBlockAsm12B CMPL BX, $0x00000100 JLT two_bytes_match_emit_encodeSnappyBetterBlockAsm12B MOVB $0xf4, (AX) MOVW BX, 1(AX) ADDQ $0x03, AX JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm12B two_bytes_match_emit_encodeSnappyBetterBlockAsm12B: MOVB $0xf0, (AX) MOVB BL, 1(AX) ADDQ $0x02, AX CMPL BX, $0x40 JL memmove_match_emit_encodeSnappyBetterBlockAsm12B JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm12B one_byte_match_emit_encodeSnappyBetterBlockAsm12B: SHLB $0x02, BL MOVB BL, (AX) ADDQ $0x01, AX memmove_match_emit_encodeSnappyBetterBlockAsm12B: LEAQ (AX)(R8*1), BX // genMemMoveShort CMPQ R8, $0x08 JLE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_8 CMPQ R8, $0x10 JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_8through16 CMPQ R8, $0x20 JBE 
emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_17through32 JMP emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_33through64 emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_8: MOVQ (R9), R10 MOVQ R10, (AX) JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm12B emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_8through16: MOVQ (R9), R10 MOVQ -8(R9)(R8*1), R9 MOVQ R10, (AX) MOVQ R9, -8(AX)(R8*1) JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm12B emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_17through32: MOVOU (R9), X0 MOVOU -16(R9)(R8*1), X1 MOVOU X0, (AX) MOVOU X1, -16(AX)(R8*1) JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm12B emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm12B_memmove_move_33through64: MOVOU (R9), X0 MOVOU 16(R9), X1 MOVOU -32(R9)(R8*1), X2 MOVOU -16(R9)(R8*1), X3 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(R8*1) MOVOU X3, -16(AX)(R8*1) memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm12B: MOVQ BX, AX JMP emit_literal_done_match_emit_encodeSnappyBetterBlockAsm12B memmove_long_match_emit_encodeSnappyBetterBlockAsm12B: LEAQ (AX)(R8*1), BX // genMemMoveLong MOVOU (R9), X0 MOVOU 16(R9), X1 MOVOU -32(R9)(R8*1), X2 MOVOU -16(R9)(R8*1), X3 MOVQ R8, R12 SHRQ $0x05, R12 MOVQ AX, R10 ANDL $0x0000001f, R10 MOVQ $0x00000040, R13 SUBQ R10, R13 DECQ R12 JA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32 LEAQ -32(R9)(R13*1), R10 LEAQ -32(AX)(R13*1), R14 emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_big_loop_back: MOVOU (R10), X4 MOVOU 16(R10), X5 MOVOA X4, (R14) MOVOA X5, 16(R14) ADDQ $0x20, R14 ADDQ $0x20, R10 ADDQ $0x20, R13 DECQ R12 JNA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_big_loop_back emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32: MOVOU -32(R9)(R13*1), X4 MOVOU 
-16(R9)(R13*1), X5 MOVOA X4, -32(AX)(R13*1) MOVOA X5, -16(AX)(R13*1) ADDQ $0x20, R13 CMPQ R8, R13 JAE emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(R8*1) MOVOU X3, -16(AX)(R8*1) MOVQ BX, AX emit_literal_done_match_emit_encodeSnappyBetterBlockAsm12B: ADDL R11, CX ADDL $0x04, R11 MOVL CX, 12(SP) // emitCopy two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm12B: CMPL R11, $0x40 JLE two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm12B MOVB $0xee, (AX) MOVW DI, 1(AX) LEAL -60(R11), R11 ADDQ $0x03, AX JMP two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm12B two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm12B: MOVL R11, BX SHLL $0x02, BX CMPL R11, $0x0c JGE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm12B CMPL DI, $0x00000800 JGE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm12B LEAL -15(BX), BX MOVB DI, 1(AX) SHRL $0x08, DI SHLL $0x05, DI ORL DI, BX MOVB BL, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm12B emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm12B: LEAL -2(BX), BX MOVB BL, (AX) MOVW DI, 1(AX) ADDQ $0x03, AX match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm12B: CMPL CX, 8(SP) JGE emit_remainder_encodeSnappyBetterBlockAsm12B CMPQ AX, (SP) JL match_nolit_dst_ok_encodeSnappyBetterBlockAsm12B MOVQ $0x00000000, ret+48(FP) RET match_nolit_dst_ok_encodeSnappyBetterBlockAsm12B: MOVQ $0x0000cf1bbcdcbf9b, BX MOVQ $0x9e3779b1, DI LEAQ 1(SI), SI LEAQ -2(CX), R8 MOVQ (DX)(SI*1), R9 MOVQ 1(DX)(SI*1), R10 MOVQ (DX)(R8*1), R11 MOVQ 1(DX)(R8*1), R12 SHLQ $0x10, R9 IMULQ BX, R9 SHRQ $0x32, R9 SHLQ $0x20, R10 IMULQ DI, R10 SHRQ $0x34, R10 SHLQ $0x10, R11 IMULQ BX, R11 SHRQ $0x32, R11 SHLQ $0x20, R12 IMULQ DI, R12 SHRQ $0x34, R12 LEAQ 1(SI), DI LEAQ 1(R8), R13 MOVL SI, 24(SP)(R9*4) MOVL R8, 24(SP)(R11*4) MOVL DI, 65560(SP)(R10*4) MOVL R13, 65560(SP)(R12*4) ADDQ $0x01, SI SUBQ $0x01, R8 
index_loop_encodeSnappyBetterBlockAsm12B: CMPQ SI, R8 JAE search_loop_encodeSnappyBetterBlockAsm12B MOVQ (DX)(SI*1), DI MOVQ (DX)(R8*1), R9 SHLQ $0x10, DI IMULQ BX, DI SHRQ $0x32, DI SHLQ $0x10, R9 IMULQ BX, R9 SHRQ $0x32, R9 MOVL SI, 24(SP)(DI*4) MOVL R8, 24(SP)(R9*4) ADDQ $0x02, SI SUBQ $0x02, R8 JMP index_loop_encodeSnappyBetterBlockAsm12B emit_remainder_encodeSnappyBetterBlockAsm12B: MOVQ src_len+32(FP), CX SUBL 12(SP), CX LEAQ 3(AX)(CX*1), CX CMPQ CX, (SP) JL emit_remainder_ok_encodeSnappyBetterBlockAsm12B MOVQ $0x00000000, ret+48(FP) RET emit_remainder_ok_encodeSnappyBetterBlockAsm12B: MOVQ src_len+32(FP), CX MOVL 12(SP), BX CMPL BX, CX JEQ emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm12B MOVL CX, SI MOVL CX, 12(SP) LEAQ (DX)(BX*1), CX SUBL BX, SI LEAL -1(SI), DX CMPL DX, $0x3c JLT one_byte_emit_remainder_encodeSnappyBetterBlockAsm12B CMPL DX, $0x00000100 JLT two_bytes_emit_remainder_encodeSnappyBetterBlockAsm12B MOVB $0xf4, (AX) MOVW DX, 1(AX) ADDQ $0x03, AX JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12B two_bytes_emit_remainder_encodeSnappyBetterBlockAsm12B: MOVB $0xf0, (AX) MOVB DL, 1(AX) ADDQ $0x02, AX CMPL DX, $0x40 JL memmove_emit_remainder_encodeSnappyBetterBlockAsm12B JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12B one_byte_emit_remainder_encodeSnappyBetterBlockAsm12B: SHLB $0x02, DL MOVB DL, (AX) ADDQ $0x01, AX memmove_emit_remainder_encodeSnappyBetterBlockAsm12B: LEAQ (AX)(SI*1), DX MOVL SI, BX // genMemMoveShort CMPQ BX, $0x03 JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_1or2 JE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_3 CMPQ BX, $0x08 JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_4through7 CMPQ BX, $0x10 JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_8through16 CMPQ BX, $0x20 JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_17through32 JMP 
emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_33through64 emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_1or2: MOVB (CX), SI MOVB -1(CX)(BX*1), CL MOVB SI, (AX) MOVB CL, -1(AX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_3: MOVW (CX), SI MOVB 2(CX), CL MOVW SI, (AX) MOVB CL, 2(AX) JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_4through7: MOVL (CX), SI MOVL -4(CX)(BX*1), CX MOVL SI, (AX) MOVL CX, -4(AX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_8through16: MOVQ (CX), SI MOVQ -8(CX)(BX*1), CX MOVQ SI, (AX) MOVQ CX, -8(AX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_17through32: MOVOU (CX), X0 MOVOU -16(CX)(BX*1), X1 MOVOU X0, (AX) MOVOU X1, -16(AX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm12B_memmove_move_33through64: MOVOU (CX), X0 MOVOU 16(CX), X1 MOVOU -32(CX)(BX*1), X2 MOVOU -16(CX)(BX*1), X3 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(BX*1) MOVOU X3, -16(AX)(BX*1) memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm12B: MOVQ DX, AX JMP emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm12B memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12B: LEAQ (AX)(SI*1), DX MOVL SI, BX // genMemMoveLong MOVOU (CX), X0 MOVOU 16(CX), X1 MOVOU -32(CX)(BX*1), X2 MOVOU -16(CX)(BX*1), X3 MOVQ BX, DI SHRQ $0x05, DI MOVQ AX, SI ANDL $0x0000001f, SI MOVQ $0x00000040, R8 SUBQ SI, R8 DECQ DI JA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32 LEAQ -32(CX)(R8*1), SI LEAQ 
-32(AX)(R8*1), R9 emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_big_loop_back: MOVOU (SI), X4 MOVOU 16(SI), X5 MOVOA X4, (R9) MOVOA X5, 16(R9) ADDQ $0x20, R9 ADDQ $0x20, SI ADDQ $0x20, R8 DECQ DI JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_big_loop_back emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32: MOVOU -32(CX)(R8*1), X4 MOVOU -16(CX)(R8*1), X5 MOVOA X4, -32(AX)(R8*1) MOVOA X5, -16(AX)(R8*1) ADDQ $0x20, R8 CMPQ BX, R8 JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm12Blarge_forward_sse_loop_32 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(BX*1) MOVOU X3, -16(AX)(BX*1) MOVQ DX, AX emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm12B: MOVQ dst_base+0(FP), CX SUBQ CX, AX MOVQ AX, ret+48(FP) RET // func encodeSnappyBetterBlockAsm10B(dst []byte, src []byte) int // Requires: BMI, SSE2 TEXT ·encodeSnappyBetterBlockAsm10B(SB), $20504-56 MOVQ dst_base+0(FP), AX MOVQ $0x000000a0, CX LEAQ 24(SP), DX PXOR X0, X0 zero_loop_encodeSnappyBetterBlockAsm10B: MOVOU X0, (DX) MOVOU X0, 16(DX) MOVOU X0, 32(DX) MOVOU X0, 48(DX) MOVOU X0, 64(DX) MOVOU X0, 80(DX) MOVOU X0, 96(DX) MOVOU X0, 112(DX) ADDQ $0x80, DX DECQ CX JNZ zero_loop_encodeSnappyBetterBlockAsm10B MOVL $0x00000000, 12(SP) MOVQ src_len+32(FP), CX LEAQ -9(CX), DX LEAQ -8(CX), BX MOVL BX, 8(SP) SHRQ $0x05, CX SUBL CX, DX LEAQ (AX)(DX*1), DX MOVQ DX, (SP) MOVL $0x00000001, CX MOVL $0x00000000, 16(SP) MOVQ src_base+24(FP), DX search_loop_encodeSnappyBetterBlockAsm10B: MOVL CX, BX SUBL 12(SP), BX SHRL $0x05, BX LEAL 1(CX)(BX*1), BX CMPL BX, 8(SP) JGE emit_remainder_encodeSnappyBetterBlockAsm10B MOVQ (DX)(CX*1), SI MOVL BX, 20(SP) MOVQ $0x0000cf1bbcdcbf9b, R8 MOVQ $0x9e3779b1, BX MOVQ SI, R9 MOVQ SI, R10 SHLQ $0x10, R9 IMULQ R8, R9 SHRQ $0x34, R9 SHLQ $0x20, R10 IMULQ BX, R10 SHRQ $0x36, R10 MOVL 24(SP)(R9*4), BX MOVL 16408(SP)(R10*4), DI MOVL CX, 24(SP)(R9*4) MOVL CX, 16408(SP)(R10*4) MOVQ 
(DX)(BX*1), R9 MOVQ (DX)(DI*1), R10 CMPQ R9, SI JEQ candidate_match_encodeSnappyBetterBlockAsm10B CMPQ R10, SI JNE no_short_found_encodeSnappyBetterBlockAsm10B MOVL DI, BX JMP candidate_match_encodeSnappyBetterBlockAsm10B no_short_found_encodeSnappyBetterBlockAsm10B: CMPL R9, SI JEQ candidate_match_encodeSnappyBetterBlockAsm10B CMPL R10, SI JEQ candidateS_match_encodeSnappyBetterBlockAsm10B MOVL 20(SP), CX JMP search_loop_encodeSnappyBetterBlockAsm10B candidateS_match_encodeSnappyBetterBlockAsm10B: SHRQ $0x08, SI MOVQ SI, R9 SHLQ $0x10, R9 IMULQ R8, R9 SHRQ $0x34, R9 MOVL 24(SP)(R9*4), BX INCL CX MOVL CX, 24(SP)(R9*4) CMPL (DX)(BX*1), SI JEQ candidate_match_encodeSnappyBetterBlockAsm10B DECL CX MOVL DI, BX candidate_match_encodeSnappyBetterBlockAsm10B: MOVL 12(SP), SI TESTL BX, BX JZ match_extend_back_end_encodeSnappyBetterBlockAsm10B match_extend_back_loop_encodeSnappyBetterBlockAsm10B: CMPL CX, SI JLE match_extend_back_end_encodeSnappyBetterBlockAsm10B MOVB -1(DX)(BX*1), DI MOVB -1(DX)(CX*1), R8 CMPB DI, R8 JNE match_extend_back_end_encodeSnappyBetterBlockAsm10B LEAL -1(CX), CX DECL BX JZ match_extend_back_end_encodeSnappyBetterBlockAsm10B JMP match_extend_back_loop_encodeSnappyBetterBlockAsm10B match_extend_back_end_encodeSnappyBetterBlockAsm10B: MOVL CX, SI SUBL 12(SP), SI LEAQ 3(AX)(SI*1), SI CMPQ SI, (SP) JL match_dst_size_check_encodeSnappyBetterBlockAsm10B MOVQ $0x00000000, ret+48(FP) RET match_dst_size_check_encodeSnappyBetterBlockAsm10B: MOVL CX, SI ADDL $0x04, CX ADDL $0x04, BX MOVQ src_len+32(FP), DI SUBL CX, DI LEAQ (DX)(CX*1), R8 LEAQ (DX)(BX*1), R9 // matchLen XORL R11, R11 CMPL DI, $0x08 JL matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm10B matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm10B: MOVQ (R8)(R11*1), R10 XORQ (R9)(R11*1), R10 TESTQ R10, R10 JZ matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm10B #ifdef GOAMD64_v3 TZCNTQ R10, R10 #else BSFQ R10, R10 #endif SARQ $0x03, R10 LEAL (R11)(R10*1), R11 JMP 
match_nolit_end_encodeSnappyBetterBlockAsm10B matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm10B: LEAL -8(DI), DI LEAL 8(R11), R11 CMPL DI, $0x08 JGE matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm10B JZ match_nolit_end_encodeSnappyBetterBlockAsm10B matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm10B: CMPL DI, $0x04 JL matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm10B MOVL (R8)(R11*1), R10 CMPL (R9)(R11*1), R10 JNE matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm10B SUBL $0x04, DI LEAL 4(R11), R11 matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm10B: CMPL DI, $0x02 JL matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm10B MOVW (R8)(R11*1), R10 CMPW (R9)(R11*1), R10 JNE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm10B SUBL $0x02, DI LEAL 2(R11), R11 matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm10B: CMPL DI, $0x01 JL match_nolit_end_encodeSnappyBetterBlockAsm10B MOVB (R8)(R11*1), R10 CMPB (R9)(R11*1), R10 JNE match_nolit_end_encodeSnappyBetterBlockAsm10B LEAL 1(R11), R11 match_nolit_end_encodeSnappyBetterBlockAsm10B: MOVL CX, DI SUBL BX, DI // Check if repeat MOVL DI, 16(SP) MOVL 12(SP), BX CMPL BX, SI JEQ emit_literal_done_match_emit_encodeSnappyBetterBlockAsm10B MOVL SI, R8 MOVL SI, 12(SP) LEAQ (DX)(BX*1), R9 SUBL BX, R8 LEAL -1(R8), BX CMPL BX, $0x3c JLT one_byte_match_emit_encodeSnappyBetterBlockAsm10B CMPL BX, $0x00000100 JLT two_bytes_match_emit_encodeSnappyBetterBlockAsm10B MOVB $0xf4, (AX) MOVW BX, 1(AX) ADDQ $0x03, AX JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm10B two_bytes_match_emit_encodeSnappyBetterBlockAsm10B: MOVB $0xf0, (AX) MOVB BL, 1(AX) ADDQ $0x02, AX CMPL BX, $0x40 JL memmove_match_emit_encodeSnappyBetterBlockAsm10B JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm10B one_byte_match_emit_encodeSnappyBetterBlockAsm10B: SHLB $0x02, BL MOVB BL, (AX) ADDQ $0x01, AX memmove_match_emit_encodeSnappyBetterBlockAsm10B: LEAQ (AX)(R8*1), BX // genMemMoveShort CMPQ R8, $0x08 JLE 
emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_8 CMPQ R8, $0x10 JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_8through16 CMPQ R8, $0x20 JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_17through32 JMP emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_33through64 emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_8: MOVQ (R9), R10 MOVQ R10, (AX) JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm10B emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_8through16: MOVQ (R9), R10 MOVQ -8(R9)(R8*1), R9 MOVQ R10, (AX) MOVQ R9, -8(AX)(R8*1) JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm10B emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_17through32: MOVOU (R9), X0 MOVOU -16(R9)(R8*1), X1 MOVOU X0, (AX) MOVOU X1, -16(AX)(R8*1) JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm10B emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm10B_memmove_move_33through64: MOVOU (R9), X0 MOVOU 16(R9), X1 MOVOU -32(R9)(R8*1), X2 MOVOU -16(R9)(R8*1), X3 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(R8*1) MOVOU X3, -16(AX)(R8*1) memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm10B: MOVQ BX, AX JMP emit_literal_done_match_emit_encodeSnappyBetterBlockAsm10B memmove_long_match_emit_encodeSnappyBetterBlockAsm10B: LEAQ (AX)(R8*1), BX // genMemMoveLong MOVOU (R9), X0 MOVOU 16(R9), X1 MOVOU -32(R9)(R8*1), X2 MOVOU -16(R9)(R8*1), X3 MOVQ R8, R12 SHRQ $0x05, R12 MOVQ AX, R10 ANDL $0x0000001f, R10 MOVQ $0x00000040, R13 SUBQ R10, R13 DECQ R12 JA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32 LEAQ -32(R9)(R13*1), R10 LEAQ -32(AX)(R13*1), R14 emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_big_loop_back: MOVOU (R10), X4 MOVOU 16(R10), X5 MOVOA X4, (R14) MOVOA X5, 16(R14) ADDQ $0x20, R14 ADDQ $0x20, R10 ADDQ $0x20, R13 DECQ R12 JNA 
emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_big_loop_back emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32: MOVOU -32(R9)(R13*1), X4 MOVOU -16(R9)(R13*1), X5 MOVOA X4, -32(AX)(R13*1) MOVOA X5, -16(AX)(R13*1) ADDQ $0x20, R13 CMPQ R8, R13 JAE emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(R8*1) MOVOU X3, -16(AX)(R8*1) MOVQ BX, AX emit_literal_done_match_emit_encodeSnappyBetterBlockAsm10B: ADDL R11, CX ADDL $0x04, R11 MOVL CX, 12(SP) // emitCopy two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm10B: CMPL R11, $0x40 JLE two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm10B MOVB $0xee, (AX) MOVW DI, 1(AX) LEAL -60(R11), R11 ADDQ $0x03, AX JMP two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm10B two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm10B: MOVL R11, BX SHLL $0x02, BX CMPL R11, $0x0c JGE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm10B CMPL DI, $0x00000800 JGE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm10B LEAL -15(BX), BX MOVB DI, 1(AX) SHRL $0x08, DI SHLL $0x05, DI ORL DI, BX MOVB BL, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm10B emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm10B: LEAL -2(BX), BX MOVB BL, (AX) MOVW DI, 1(AX) ADDQ $0x03, AX match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm10B: CMPL CX, 8(SP) JGE emit_remainder_encodeSnappyBetterBlockAsm10B CMPQ AX, (SP) JL match_nolit_dst_ok_encodeSnappyBetterBlockAsm10B MOVQ $0x00000000, ret+48(FP) RET match_nolit_dst_ok_encodeSnappyBetterBlockAsm10B: MOVQ $0x0000cf1bbcdcbf9b, BX MOVQ $0x9e3779b1, DI LEAQ 1(SI), SI LEAQ -2(CX), R8 MOVQ (DX)(SI*1), R9 MOVQ 1(DX)(SI*1), R10 MOVQ (DX)(R8*1), R11 MOVQ 1(DX)(R8*1), R12 SHLQ $0x10, R9 IMULQ BX, R9 SHRQ $0x34, R9 SHLQ $0x20, R10 IMULQ DI, R10 SHRQ $0x36, R10 SHLQ $0x10, R11 IMULQ BX, R11 SHRQ $0x34, R11 SHLQ $0x20, R12 IMULQ 
DI, R12 SHRQ $0x36, R12 LEAQ 1(SI), DI LEAQ 1(R8), R13 MOVL SI, 24(SP)(R9*4) MOVL R8, 24(SP)(R11*4) MOVL DI, 16408(SP)(R10*4) MOVL R13, 16408(SP)(R12*4) ADDQ $0x01, SI SUBQ $0x01, R8 index_loop_encodeSnappyBetterBlockAsm10B: CMPQ SI, R8 JAE search_loop_encodeSnappyBetterBlockAsm10B MOVQ (DX)(SI*1), DI MOVQ (DX)(R8*1), R9 SHLQ $0x10, DI IMULQ BX, DI SHRQ $0x34, DI SHLQ $0x10, R9 IMULQ BX, R9 SHRQ $0x34, R9 MOVL SI, 24(SP)(DI*4) MOVL R8, 24(SP)(R9*4) ADDQ $0x02, SI SUBQ $0x02, R8 JMP index_loop_encodeSnappyBetterBlockAsm10B emit_remainder_encodeSnappyBetterBlockAsm10B: MOVQ src_len+32(FP), CX SUBL 12(SP), CX LEAQ 3(AX)(CX*1), CX CMPQ CX, (SP) JL emit_remainder_ok_encodeSnappyBetterBlockAsm10B MOVQ $0x00000000, ret+48(FP) RET emit_remainder_ok_encodeSnappyBetterBlockAsm10B: MOVQ src_len+32(FP), CX MOVL 12(SP), BX CMPL BX, CX JEQ emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm10B MOVL CX, SI MOVL CX, 12(SP) LEAQ (DX)(BX*1), CX SUBL BX, SI LEAL -1(SI), DX CMPL DX, $0x3c JLT one_byte_emit_remainder_encodeSnappyBetterBlockAsm10B CMPL DX, $0x00000100 JLT two_bytes_emit_remainder_encodeSnappyBetterBlockAsm10B MOVB $0xf4, (AX) MOVW DX, 1(AX) ADDQ $0x03, AX JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10B two_bytes_emit_remainder_encodeSnappyBetterBlockAsm10B: MOVB $0xf0, (AX) MOVB DL, 1(AX) ADDQ $0x02, AX CMPL DX, $0x40 JL memmove_emit_remainder_encodeSnappyBetterBlockAsm10B JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10B one_byte_emit_remainder_encodeSnappyBetterBlockAsm10B: SHLB $0x02, DL MOVB DL, (AX) ADDQ $0x01, AX memmove_emit_remainder_encodeSnappyBetterBlockAsm10B: LEAQ (AX)(SI*1), DX MOVL SI, BX // genMemMoveShort CMPQ BX, $0x03 JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_1or2 JE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_3 CMPQ BX, $0x08 JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_4through7 CMPQ BX, $0x10 JBE 
emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_8through16 CMPQ BX, $0x20 JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_17through32 JMP emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_33through64 emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_1or2: MOVB (CX), SI MOVB -1(CX)(BX*1), CL MOVB SI, (AX) MOVB CL, -1(AX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_3: MOVW (CX), SI MOVB 2(CX), CL MOVW SI, (AX) MOVB CL, 2(AX) JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_4through7: MOVL (CX), SI MOVL -4(CX)(BX*1), CX MOVL SI, (AX) MOVL CX, -4(AX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_8through16: MOVQ (CX), SI MOVQ -8(CX)(BX*1), CX MOVQ SI, (AX) MOVQ CX, -8(AX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_17through32: MOVOU (CX), X0 MOVOU -16(CX)(BX*1), X1 MOVOU X0, (AX) MOVOU X1, -16(AX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm10B_memmove_move_33through64: MOVOU (CX), X0 MOVOU 16(CX), X1 MOVOU -32(CX)(BX*1), X2 MOVOU -16(CX)(BX*1), X3 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(BX*1) MOVOU X3, -16(AX)(BX*1) memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm10B: MOVQ DX, AX JMP emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm10B memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10B: LEAQ (AX)(SI*1), DX MOVL SI, BX // genMemMoveLong MOVOU (CX), X0 MOVOU 16(CX), X1 MOVOU -32(CX)(BX*1), X2 MOVOU -16(CX)(BX*1), X3 MOVQ BX, DI SHRQ $0x05, DI MOVQ AX, 
SI ANDL $0x0000001f, SI MOVQ $0x00000040, R8 SUBQ SI, R8 DECQ DI JA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32 LEAQ -32(CX)(R8*1), SI LEAQ -32(AX)(R8*1), R9 emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_big_loop_back: MOVOU (SI), X4 MOVOU 16(SI), X5 MOVOA X4, (R9) MOVOA X5, 16(R9) ADDQ $0x20, R9 ADDQ $0x20, SI ADDQ $0x20, R8 DECQ DI JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_big_loop_back emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32: MOVOU -32(CX)(R8*1), X4 MOVOU -16(CX)(R8*1), X5 MOVOA X4, -32(AX)(R8*1) MOVOA X5, -16(AX)(R8*1) ADDQ $0x20, R8 CMPQ BX, R8 JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm10Blarge_forward_sse_loop_32 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(BX*1) MOVOU X3, -16(AX)(BX*1) MOVQ DX, AX emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm10B: MOVQ dst_base+0(FP), CX SUBQ CX, AX MOVQ AX, ret+48(FP) RET
// encodeSnappyBetterBlockAsm8B encodes src into dst and returns the number of
// bytes written (the final output pointer in AX minus dst_base). It returns 0
// whenever the projected output position reaches the limit stored at (SP).
//
// Frame layout set up by the prologue and used by the search loop:
//   (SP)      output limit: dst_base + src_len - 9 - (src_len >> 5)
//   8(SP)     src_len - 8; the search ends once the next position is >= this
//   12(SP)    src offset of the first byte not yet emitted (starts at 0)
//   16(SP)    offset of the most recent match (starts at 0)
//   20(SP)    position to resume searching at when no candidate matches
//   24(SP)    table of 32-bit src positions, indexed by a 10-bit hash (4096 bytes)
//   4120(SP)  table of 32-bit src positions, indexed by an 8-bit hash (1024 bytes)
// Both tables (0x28 * 0x80 = 5120 bytes) are zeroed before the search starts.
// NOTE(review): dst_len is never read here; presumably the Go caller guarantees
// that dst is large enough - confirm against the caller.
// func encodeSnappyBetterBlockAsm8B(dst []byte, src []byte) int // Requires: BMI, SSE2 TEXT ·encodeSnappyBetterBlockAsm8B(SB), $5144-56 MOVQ dst_base+0(FP), AX MOVQ $0x00000028, CX LEAQ 24(SP), DX PXOR X0, X0 zero_loop_encodeSnappyBetterBlockAsm8B: MOVOU X0, (DX) MOVOU X0, 16(DX) MOVOU X0, 32(DX) MOVOU X0, 48(DX) MOVOU X0, 64(DX) MOVOU X0, 80(DX) MOVOU X0, 96(DX) MOVOU X0, 112(DX) ADDQ $0x80, DX DECQ CX JNZ zero_loop_encodeSnappyBetterBlockAsm8B MOVL $0x00000000, 12(SP) MOVQ src_len+32(FP), CX LEAQ -9(CX), DX LEAQ -8(CX), BX MOVL BX, 8(SP) SHRQ $0x05, CX SUBL CX, DX LEAQ (AX)(DX*1), DX MOVQ DX, (SP) MOVL $0x00000001, CX MOVL $0x00000000, 16(SP) MOVQ src_base+24(FP), DX search_loop_encodeSnappyBetterBlockAsm8B: MOVL CX, BX SUBL 12(SP), BX SHRL $0x04, BX LEAL 1(CX)(BX*1), BX CMPL BX, 8(SP) JGE emit_remainder_encodeSnappyBetterBlockAsm8B MOVQ (DX)(CX*1), SI MOVL BX, 20(SP) MOVQ $0x0000cf1bbcdcbf9b, R8 MOVQ $0x9e3779b1, BX MOVQ SI, R9 MOVQ SI, R10 SHLQ $0x10, 
R9 IMULQ R8, R9 SHRQ $0x36, R9 SHLQ $0x20, R10 IMULQ BX, R10 SHRQ $0x38, R10 MOVL 24(SP)(R9*4), BX MOVL 4120(SP)(R10*4), DI MOVL CX, 24(SP)(R9*4) MOVL CX, 4120(SP)(R10*4) MOVQ (DX)(BX*1), R9 MOVQ (DX)(DI*1), R10 CMPQ R9, SI JEQ candidate_match_encodeSnappyBetterBlockAsm8B CMPQ R10, SI JNE no_short_found_encodeSnappyBetterBlockAsm8B MOVL DI, BX JMP candidate_match_encodeSnappyBetterBlockAsm8B no_short_found_encodeSnappyBetterBlockAsm8B: CMPL R9, SI JEQ candidate_match_encodeSnappyBetterBlockAsm8B CMPL R10, SI JEQ candidateS_match_encodeSnappyBetterBlockAsm8B MOVL 20(SP), CX JMP search_loop_encodeSnappyBetterBlockAsm8B candidateS_match_encodeSnappyBetterBlockAsm8B: SHRQ $0x08, SI MOVQ SI, R9 SHLQ $0x10, R9 IMULQ R8, R9 SHRQ $0x36, R9 MOVL 24(SP)(R9*4), BX INCL CX MOVL CX, 24(SP)(R9*4) CMPL (DX)(BX*1), SI JEQ candidate_match_encodeSnappyBetterBlockAsm8B DECL CX MOVL DI, BX candidate_match_encodeSnappyBetterBlockAsm8B: MOVL 12(SP), SI TESTL BX, BX JZ match_extend_back_end_encodeSnappyBetterBlockAsm8B match_extend_back_loop_encodeSnappyBetterBlockAsm8B: CMPL CX, SI JLE match_extend_back_end_encodeSnappyBetterBlockAsm8B MOVB -1(DX)(BX*1), DI MOVB -1(DX)(CX*1), R8 CMPB DI, R8 JNE match_extend_back_end_encodeSnappyBetterBlockAsm8B LEAL -1(CX), CX DECL BX JZ match_extend_back_end_encodeSnappyBetterBlockAsm8B JMP match_extend_back_loop_encodeSnappyBetterBlockAsm8B match_extend_back_end_encodeSnappyBetterBlockAsm8B: MOVL CX, SI SUBL 12(SP), SI LEAQ 3(AX)(SI*1), SI CMPQ SI, (SP) JL match_dst_size_check_encodeSnappyBetterBlockAsm8B MOVQ $0x00000000, ret+48(FP) RET match_dst_size_check_encodeSnappyBetterBlockAsm8B: MOVL CX, SI ADDL $0x04, CX ADDL $0x04, BX MOVQ src_len+32(FP), DI SUBL CX, DI LEAQ (DX)(CX*1), R8 LEAQ (DX)(BX*1), R9 // matchLen XORL R11, R11 CMPL DI, $0x08 JL matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm8B matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm8B: MOVQ (R8)(R11*1), R10 XORQ (R9)(R11*1), R10 TESTQ R10, R10 JZ 
matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm8B #ifdef GOAMD64_v3 TZCNTQ R10, R10 #else BSFQ R10, R10 #endif SARQ $0x03, R10 LEAL (R11)(R10*1), R11 JMP match_nolit_end_encodeSnappyBetterBlockAsm8B matchlen_loop_match_nolit_encodeSnappyBetterBlockAsm8B: LEAL -8(DI), DI LEAL 8(R11), R11 CMPL DI, $0x08 JGE matchlen_loopback_match_nolit_encodeSnappyBetterBlockAsm8B JZ match_nolit_end_encodeSnappyBetterBlockAsm8B matchlen_match4_match_nolit_encodeSnappyBetterBlockAsm8B: CMPL DI, $0x04 JL matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm8B MOVL (R8)(R11*1), R10 CMPL (R9)(R11*1), R10 JNE matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm8B SUBL $0x04, DI LEAL 4(R11), R11 matchlen_match2_match_nolit_encodeSnappyBetterBlockAsm8B: CMPL DI, $0x02 JL matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm8B MOVW (R8)(R11*1), R10 CMPW (R9)(R11*1), R10 JNE matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm8B SUBL $0x02, DI LEAL 2(R11), R11 matchlen_match1_match_nolit_encodeSnappyBetterBlockAsm8B: CMPL DI, $0x01 JL match_nolit_end_encodeSnappyBetterBlockAsm8B MOVB (R8)(R11*1), R10 CMPB (R9)(R11*1), R10 JNE match_nolit_end_encodeSnappyBetterBlockAsm8B LEAL 1(R11), R11 match_nolit_end_encodeSnappyBetterBlockAsm8B: MOVL CX, DI SUBL BX, DI // Check if repeat MOVL DI, 16(SP) MOVL 12(SP), BX CMPL BX, SI JEQ emit_literal_done_match_emit_encodeSnappyBetterBlockAsm8B MOVL SI, R8 MOVL SI, 12(SP) LEAQ (DX)(BX*1), R9 SUBL BX, R8 LEAL -1(R8), BX CMPL BX, $0x3c JLT one_byte_match_emit_encodeSnappyBetterBlockAsm8B CMPL BX, $0x00000100 JLT two_bytes_match_emit_encodeSnappyBetterBlockAsm8B MOVB $0xf4, (AX) MOVW BX, 1(AX) ADDQ $0x03, AX JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm8B two_bytes_match_emit_encodeSnappyBetterBlockAsm8B: MOVB $0xf0, (AX) MOVB BL, 1(AX) ADDQ $0x02, AX CMPL BX, $0x40 JL memmove_match_emit_encodeSnappyBetterBlockAsm8B JMP memmove_long_match_emit_encodeSnappyBetterBlockAsm8B one_byte_match_emit_encodeSnappyBetterBlockAsm8B: SHLB $0x02, BL MOVB 
BL, (AX) ADDQ $0x01, AX memmove_match_emit_encodeSnappyBetterBlockAsm8B: LEAQ (AX)(R8*1), BX // genMemMoveShort CMPQ R8, $0x08 JLE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_8 CMPQ R8, $0x10 JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_8through16 CMPQ R8, $0x20 JBE emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_17through32 JMP emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_33through64 emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_8: MOVQ (R9), R10 MOVQ R10, (AX) JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm8B emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_8through16: MOVQ (R9), R10 MOVQ -8(R9)(R8*1), R9 MOVQ R10, (AX) MOVQ R9, -8(AX)(R8*1) JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm8B emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_17through32: MOVOU (R9), X0 MOVOU -16(R9)(R8*1), X1 MOVOU X0, (AX) MOVOU X1, -16(AX)(R8*1) JMP memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm8B emit_lit_memmove_match_emit_encodeSnappyBetterBlockAsm8B_memmove_move_33through64: MOVOU (R9), X0 MOVOU 16(R9), X1 MOVOU -32(R9)(R8*1), X2 MOVOU -16(R9)(R8*1), X3 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(R8*1) MOVOU X3, -16(AX)(R8*1) memmove_end_copy_match_emit_encodeSnappyBetterBlockAsm8B: MOVQ BX, AX JMP emit_literal_done_match_emit_encodeSnappyBetterBlockAsm8B memmove_long_match_emit_encodeSnappyBetterBlockAsm8B: LEAQ (AX)(R8*1), BX // genMemMoveLong MOVOU (R9), X0 MOVOU 16(R9), X1 MOVOU -32(R9)(R8*1), X2 MOVOU -16(R9)(R8*1), X3 MOVQ R8, R12 SHRQ $0x05, R12 MOVQ AX, R10 ANDL $0x0000001f, R10 MOVQ $0x00000040, R13 SUBQ R10, R13 DECQ R12 JA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32 LEAQ -32(R9)(R13*1), R10 LEAQ -32(AX)(R13*1), R14 emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_big_loop_back: MOVOU (R10), X4 MOVOU 
16(R10), X5 MOVOA X4, (R14) MOVOA X5, 16(R14) ADDQ $0x20, R14 ADDQ $0x20, R10 ADDQ $0x20, R13 DECQ R12 JNA emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_big_loop_back emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32: MOVOU -32(R9)(R13*1), X4 MOVOU -16(R9)(R13*1), X5 MOVOA X4, -32(AX)(R13*1) MOVOA X5, -16(AX)(R13*1) ADDQ $0x20, R13 CMPQ R8, R13 JAE emit_lit_memmove_long_match_emit_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(R8*1) MOVOU X3, -16(AX)(R8*1) MOVQ BX, AX emit_literal_done_match_emit_encodeSnappyBetterBlockAsm8B: ADDL R11, CX ADDL $0x04, R11 MOVL CX, 12(SP) // emitCopy two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm8B: CMPL R11, $0x40 JLE two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm8B MOVB $0xee, (AX) MOVW DI, 1(AX) LEAL -60(R11), R11 ADDQ $0x03, AX JMP two_byte_offset_match_nolit_encodeSnappyBetterBlockAsm8B two_byte_offset_short_match_nolit_encodeSnappyBetterBlockAsm8B: MOVL R11, BX SHLL $0x02, BX CMPL R11, $0x0c JGE emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm8B LEAL -15(BX), BX MOVB DI, 1(AX) SHRL $0x08, DI SHLL $0x05, DI ORL DI, BX MOVB BL, (AX) ADDQ $0x02, AX JMP match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm8B emit_copy_three_match_nolit_encodeSnappyBetterBlockAsm8B: LEAL -2(BX), BX MOVB BL, (AX) MOVW DI, 1(AX) ADDQ $0x03, AX match_nolit_emitcopy_end_encodeSnappyBetterBlockAsm8B: CMPL CX, 8(SP) JGE emit_remainder_encodeSnappyBetterBlockAsm8B CMPQ AX, (SP) JL match_nolit_dst_ok_encodeSnappyBetterBlockAsm8B MOVQ $0x00000000, ret+48(FP) RET match_nolit_dst_ok_encodeSnappyBetterBlockAsm8B: MOVQ $0x0000cf1bbcdcbf9b, BX MOVQ $0x9e3779b1, DI LEAQ 1(SI), SI LEAQ -2(CX), R8 MOVQ (DX)(SI*1), R9 MOVQ 1(DX)(SI*1), R10 MOVQ (DX)(R8*1), R11 MOVQ 1(DX)(R8*1), R12 SHLQ $0x10, R9 IMULQ BX, R9 SHRQ $0x36, R9 SHLQ $0x20, R10 IMULQ DI, R10 SHRQ $0x38, R10 SHLQ $0x10, R11 IMULQ BX, R11 SHRQ $0x36, R11 SHLQ $0x20, 
R12 IMULQ DI, R12 SHRQ $0x38, R12 LEAQ 1(SI), DI LEAQ 1(R8), R13 MOVL SI, 24(SP)(R9*4) MOVL R8, 24(SP)(R11*4) MOVL DI, 4120(SP)(R10*4) MOVL R13, 4120(SP)(R12*4) ADDQ $0x01, SI SUBQ $0x01, R8 index_loop_encodeSnappyBetterBlockAsm8B: CMPQ SI, R8 JAE search_loop_encodeSnappyBetterBlockAsm8B MOVQ (DX)(SI*1), DI MOVQ (DX)(R8*1), R9 SHLQ $0x10, DI IMULQ BX, DI SHRQ $0x36, DI SHLQ $0x10, R9 IMULQ BX, R9 SHRQ $0x36, R9 MOVL SI, 24(SP)(DI*4) MOVL R8, 24(SP)(R9*4) ADDQ $0x02, SI SUBQ $0x02, R8 JMP index_loop_encodeSnappyBetterBlockAsm8B emit_remainder_encodeSnappyBetterBlockAsm8B: MOVQ src_len+32(FP), CX SUBL 12(SP), CX LEAQ 3(AX)(CX*1), CX CMPQ CX, (SP) JL emit_remainder_ok_encodeSnappyBetterBlockAsm8B MOVQ $0x00000000, ret+48(FP) RET emit_remainder_ok_encodeSnappyBetterBlockAsm8B: MOVQ src_len+32(FP), CX MOVL 12(SP), BX CMPL BX, CX JEQ emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm8B MOVL CX, SI MOVL CX, 12(SP) LEAQ (DX)(BX*1), CX SUBL BX, SI LEAL -1(SI), DX CMPL DX, $0x3c JLT one_byte_emit_remainder_encodeSnappyBetterBlockAsm8B CMPL DX, $0x00000100 JLT two_bytes_emit_remainder_encodeSnappyBetterBlockAsm8B MOVB $0xf4, (AX) MOVW DX, 1(AX) ADDQ $0x03, AX JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8B two_bytes_emit_remainder_encodeSnappyBetterBlockAsm8B: MOVB $0xf0, (AX) MOVB DL, 1(AX) ADDQ $0x02, AX CMPL DX, $0x40 JL memmove_emit_remainder_encodeSnappyBetterBlockAsm8B JMP memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8B one_byte_emit_remainder_encodeSnappyBetterBlockAsm8B: SHLB $0x02, DL MOVB DL, (AX) ADDQ $0x01, AX memmove_emit_remainder_encodeSnappyBetterBlockAsm8B: LEAQ (AX)(SI*1), DX MOVL SI, BX // genMemMoveShort CMPQ BX, $0x03 JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_1or2 JE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_3 CMPQ BX, $0x08 JB emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_4through7 CMPQ BX, $0x10 JBE 
emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_8through16 CMPQ BX, $0x20 JBE emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_17through32 JMP emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_33through64 emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_1or2: MOVB (CX), SI MOVB -1(CX)(BX*1), CL MOVB SI, (AX) MOVB CL, -1(AX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_3: MOVW (CX), SI MOVB 2(CX), CL MOVW SI, (AX) MOVB CL, 2(AX) JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_4through7: MOVL (CX), SI MOVL -4(CX)(BX*1), CX MOVL SI, (AX) MOVL CX, -4(AX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_8through16: MOVQ (CX), SI MOVQ -8(CX)(BX*1), CX MOVQ SI, (AX) MOVQ CX, -8(AX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_17through32: MOVOU (CX), X0 MOVOU -16(CX)(BX*1), X1 MOVOU X0, (AX) MOVOU X1, -16(AX)(BX*1) JMP memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B emit_lit_memmove_emit_remainder_encodeSnappyBetterBlockAsm8B_memmove_move_33through64: MOVOU (CX), X0 MOVOU 16(CX), X1 MOVOU -32(CX)(BX*1), X2 MOVOU -16(CX)(BX*1), X3 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(BX*1) MOVOU X3, -16(AX)(BX*1) memmove_end_copy_emit_remainder_encodeSnappyBetterBlockAsm8B: MOVQ DX, AX JMP emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm8B memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8B: LEAQ (AX)(SI*1), DX MOVL SI, BX // genMemMoveLong MOVOU (CX), X0 MOVOU 16(CX), X1 MOVOU -32(CX)(BX*1), X2 MOVOU -16(CX)(BX*1), X3 MOVQ BX, DI SHRQ $0x05, DI MOVQ AX, SI ANDL 
$0x0000001f, SI MOVQ $0x00000040, R8 SUBQ SI, R8 DECQ DI JA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32 LEAQ -32(CX)(R8*1), SI LEAQ -32(AX)(R8*1), R9 emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_big_loop_back: MOVOU (SI), X4 MOVOU 16(SI), X5 MOVOA X4, (R9) MOVOA X5, 16(R9) ADDQ $0x20, R9 ADDQ $0x20, SI ADDQ $0x20, R8 DECQ DI JNA emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_big_loop_back emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32: MOVOU -32(CX)(R8*1), X4 MOVOU -16(CX)(R8*1), X5 MOVOA X4, -32(AX)(R8*1) MOVOA X5, -16(AX)(R8*1) ADDQ $0x20, R8 CMPQ BX, R8 JAE emit_lit_memmove_long_emit_remainder_encodeSnappyBetterBlockAsm8Blarge_forward_sse_loop_32 MOVOU X0, (AX) MOVOU X1, 16(AX) MOVOU X2, -32(AX)(BX*1) MOVOU X3, -16(AX)(BX*1) MOVQ DX, AX emit_literal_done_emit_remainder_encodeSnappyBetterBlockAsm8B: MOVQ dst_base+0(FP), CX SUBQ CX, AX MOVQ AX, ret+48(FP) RET
// calcBlockSize runs the match search over src but writes no output: AX starts
// at zero and is only advanced by the size of each literal run (header bytes
// plus literal bytes) and of each copy. The final AX is returned, or 0 as soon
// as the running size reaches the limit stored at (SP).
//
// Frame layout set up by the prologue and used by the search loop:
//   (SP)      size limit: src_len - 9 - (src_len >> 5)
//   8(SP)     src_len - 8; the search ends once the next position is >= this
//   12(SP)    src offset of the first byte not yet accounted for (starts at 0)
//   16(SP)    offset of the previous match, used for the repeat check (starts at 1)
//   20(SP)    position to resume searching at when no candidate matches
//   24(SP)    table of 32-bit src positions, indexed by a 13-bit hash
// The table (0x100 * 0x80 = 32768 bytes) is zeroed before the search starts.
// func calcBlockSize(src []byte) int // Requires: BMI, SSE2 TEXT ·calcBlockSize(SB), $32792-32 XORQ AX, AX MOVQ $0x00000100, CX LEAQ 24(SP), DX PXOR X0, X0 zero_loop_calcBlockSize: MOVOU X0, (DX) MOVOU X0, 16(DX) MOVOU X0, 32(DX) MOVOU X0, 48(DX) MOVOU X0, 64(DX) MOVOU X0, 80(DX) MOVOU X0, 96(DX) MOVOU X0, 112(DX) ADDQ $0x80, DX DECQ CX JNZ zero_loop_calcBlockSize MOVL $0x00000000, 12(SP) MOVQ src_len+8(FP), CX LEAQ -9(CX), DX LEAQ -8(CX), BX MOVL BX, 8(SP) SHRQ $0x05, CX SUBL CX, DX LEAQ (AX)(DX*1), DX MOVQ DX, (SP) MOVL $0x00000001, CX MOVL CX, 16(SP) MOVQ src_base+0(FP), DX search_loop_calcBlockSize: MOVL CX, BX SUBL 12(SP), BX SHRL $0x05, BX LEAL 4(CX)(BX*1), BX CMPL BX, 8(SP) JGE emit_remainder_calcBlockSize MOVQ (DX)(CX*1), SI MOVL BX, 20(SP) MOVQ $0x0000cf1bbcdcbf9b, R8 MOVQ SI, R9 MOVQ SI, R10 SHRQ $0x08, R10 SHLQ $0x10, R9 IMULQ R8, R9 SHRQ $0x33, R9 SHLQ $0x10, R10 IMULQ R8, R10 SHRQ $0x33, R10 MOVL 24(SP)(R9*4), BX MOVL 24(SP)(R10*4), DI MOVL CX, 24(SP)(R9*4) 
LEAL 1(CX), R9 MOVL R9, 24(SP)(R10*4) MOVQ SI, R9 SHRQ $0x10, R9 SHLQ $0x10, R9 IMULQ R8, R9 SHRQ $0x33, R9 MOVL CX, R8 SUBL 16(SP), R8 MOVL 1(DX)(R8*1), R10 MOVQ SI, R8 SHRQ $0x08, R8 CMPL R8, R10 JNE no_repeat_found_calcBlockSize LEAL 1(CX), SI MOVL 12(SP), BX MOVL SI, DI SUBL 16(SP), DI JZ repeat_extend_back_end_calcBlockSize repeat_extend_back_loop_calcBlockSize: CMPL SI, BX JLE repeat_extend_back_end_calcBlockSize MOVB -1(DX)(DI*1), R8 MOVB -1(DX)(SI*1), R9 CMPB R8, R9 JNE repeat_extend_back_end_calcBlockSize LEAL -1(SI), SI DECL DI JNZ repeat_extend_back_loop_calcBlockSize repeat_extend_back_end_calcBlockSize: MOVL 12(SP), BX CMPL BX, SI JEQ emit_literal_done_repeat_emit_calcBlockSize MOVL SI, DI MOVL SI, 12(SP) LEAQ (DX)(BX*1), R8 SUBL BX, DI LEAL -1(DI), BX CMPL BX, $0x3c JLT one_byte_repeat_emit_calcBlockSize CMPL BX, $0x00000100 JLT two_bytes_repeat_emit_calcBlockSize CMPL BX, $0x00010000 JLT three_bytes_repeat_emit_calcBlockSize CMPL BX, $0x01000000 JLT four_bytes_repeat_emit_calcBlockSize ADDQ $0x05, AX JMP memmove_long_repeat_emit_calcBlockSize four_bytes_repeat_emit_calcBlockSize: ADDQ $0x04, AX JMP memmove_long_repeat_emit_calcBlockSize three_bytes_repeat_emit_calcBlockSize: ADDQ $0x03, AX JMP memmove_long_repeat_emit_calcBlockSize two_bytes_repeat_emit_calcBlockSize: ADDQ $0x02, AX CMPL BX, $0x40 JL memmove_repeat_emit_calcBlockSize JMP memmove_long_repeat_emit_calcBlockSize one_byte_repeat_emit_calcBlockSize: ADDQ $0x01, AX memmove_repeat_emit_calcBlockSize: LEAQ (AX)(DI*1), AX JMP emit_literal_done_repeat_emit_calcBlockSize memmove_long_repeat_emit_calcBlockSize: LEAQ (AX)(DI*1), AX emit_literal_done_repeat_emit_calcBlockSize: ADDL $0x05, CX MOVL CX, BX SUBL 16(SP), BX MOVQ src_len+8(FP), DI SUBL CX, DI LEAQ (DX)(CX*1), R8 LEAQ (DX)(BX*1), BX // matchLen XORL R10, R10 CMPL DI, $0x08 JL matchlen_match4_repeat_extend_calcBlockSize matchlen_loopback_repeat_extend_calcBlockSize: MOVQ (R8)(R10*1), R9 XORQ (BX)(R10*1), R9 TESTQ R9, R9 JZ 
matchlen_loop_repeat_extend_calcBlockSize #ifdef GOAMD64_v3 TZCNTQ R9, R9 #else BSFQ R9, R9 #endif SARQ $0x03, R9 LEAL (R10)(R9*1), R10 JMP repeat_extend_forward_end_calcBlockSize matchlen_loop_repeat_extend_calcBlockSize: LEAL -8(DI), DI LEAL 8(R10), R10 CMPL DI, $0x08 JGE matchlen_loopback_repeat_extend_calcBlockSize JZ repeat_extend_forward_end_calcBlockSize matchlen_match4_repeat_extend_calcBlockSize: CMPL DI, $0x04 JL matchlen_match2_repeat_extend_calcBlockSize MOVL (R8)(R10*1), R9 CMPL (BX)(R10*1), R9 JNE matchlen_match2_repeat_extend_calcBlockSize SUBL $0x04, DI LEAL 4(R10), R10 matchlen_match2_repeat_extend_calcBlockSize: CMPL DI, $0x02 JL matchlen_match1_repeat_extend_calcBlockSize MOVW (R8)(R10*1), R9 CMPW (BX)(R10*1), R9 JNE matchlen_match1_repeat_extend_calcBlockSize SUBL $0x02, DI LEAL 2(R10), R10 matchlen_match1_repeat_extend_calcBlockSize: CMPL DI, $0x01 JL repeat_extend_forward_end_calcBlockSize MOVB (R8)(R10*1), R9 CMPB (BX)(R10*1), R9 JNE repeat_extend_forward_end_calcBlockSize LEAL 1(R10), R10 repeat_extend_forward_end_calcBlockSize: ADDL R10, CX MOVL CX, BX SUBL SI, BX MOVL 16(SP), SI // emitCopy CMPL SI, $0x00010000 JL two_byte_offset_repeat_as_copy_calcBlockSize four_bytes_loop_back_repeat_as_copy_calcBlockSize: CMPL BX, $0x40 JLE four_bytes_remain_repeat_as_copy_calcBlockSize LEAL -64(BX), BX ADDQ $0x05, AX CMPL BX, $0x04 JL four_bytes_remain_repeat_as_copy_calcBlockSize JMP four_bytes_loop_back_repeat_as_copy_calcBlockSize four_bytes_remain_repeat_as_copy_calcBlockSize: TESTL BX, BX JZ repeat_end_emit_calcBlockSize XORL BX, BX ADDQ $0x05, AX JMP repeat_end_emit_calcBlockSize two_byte_offset_repeat_as_copy_calcBlockSize: CMPL BX, $0x40 JLE two_byte_offset_short_repeat_as_copy_calcBlockSize LEAL -60(BX), BX ADDQ $0x03, AX JMP two_byte_offset_repeat_as_copy_calcBlockSize two_byte_offset_short_repeat_as_copy_calcBlockSize: MOVL BX, DI SHLL $0x02, DI CMPL BX, $0x0c JGE emit_copy_three_repeat_as_copy_calcBlockSize CMPL SI, $0x00000800 JGE 
emit_copy_three_repeat_as_copy_calcBlockSize ADDQ $0x02, AX JMP repeat_end_emit_calcBlockSize emit_copy_three_repeat_as_copy_calcBlockSize: ADDQ $0x03, AX repeat_end_emit_calcBlockSize: MOVL CX, 12(SP) JMP search_loop_calcBlockSize no_repeat_found_calcBlockSize: CMPL (DX)(BX*1), SI JEQ candidate_match_calcBlockSize SHRQ $0x08, SI MOVL 24(SP)(R9*4), BX LEAL 2(CX), R8 CMPL (DX)(DI*1), SI JEQ candidate2_match_calcBlockSize MOVL R8, 24(SP)(R9*4) SHRQ $0x08, SI CMPL (DX)(BX*1), SI JEQ candidate3_match_calcBlockSize MOVL 20(SP), CX JMP search_loop_calcBlockSize candidate3_match_calcBlockSize: ADDL $0x02, CX JMP candidate_match_calcBlockSize candidate2_match_calcBlockSize: MOVL R8, 24(SP)(R9*4) INCL CX MOVL DI, BX candidate_match_calcBlockSize: MOVL 12(SP), SI TESTL BX, BX JZ match_extend_back_end_calcBlockSize match_extend_back_loop_calcBlockSize: CMPL CX, SI JLE match_extend_back_end_calcBlockSize MOVB -1(DX)(BX*1), DI MOVB -1(DX)(CX*1), R8 CMPB DI, R8 JNE match_extend_back_end_calcBlockSize LEAL -1(CX), CX DECL BX JZ match_extend_back_end_calcBlockSize JMP match_extend_back_loop_calcBlockSize match_extend_back_end_calcBlockSize: MOVL CX, SI SUBL 12(SP), SI LEAQ 5(AX)(SI*1), SI CMPQ SI, (SP) JL match_dst_size_check_calcBlockSize MOVQ $0x00000000, ret+24(FP) RET match_dst_size_check_calcBlockSize: MOVL CX, SI MOVL 12(SP), DI CMPL DI, SI JEQ emit_literal_done_match_emit_calcBlockSize MOVL SI, R8 MOVL SI, 12(SP) LEAQ (DX)(DI*1), SI SUBL DI, R8 LEAL -1(R8), SI CMPL SI, $0x3c JLT one_byte_match_emit_calcBlockSize CMPL SI, $0x00000100 JLT two_bytes_match_emit_calcBlockSize CMPL SI, $0x00010000 JLT three_bytes_match_emit_calcBlockSize CMPL SI, $0x01000000 JLT four_bytes_match_emit_calcBlockSize ADDQ $0x05, AX JMP memmove_long_match_emit_calcBlockSize four_bytes_match_emit_calcBlockSize: ADDQ $0x04, AX JMP memmove_long_match_emit_calcBlockSize three_bytes_match_emit_calcBlockSize: ADDQ $0x03, AX JMP memmove_long_match_emit_calcBlockSize two_bytes_match_emit_calcBlockSize: ADDQ 
$0x02, AX CMPL SI, $0x40 JL memmove_match_emit_calcBlockSize JMP memmove_long_match_emit_calcBlockSize one_byte_match_emit_calcBlockSize: ADDQ $0x01, AX memmove_match_emit_calcBlockSize: LEAQ (AX)(R8*1), AX JMP emit_literal_done_match_emit_calcBlockSize memmove_long_match_emit_calcBlockSize: LEAQ (AX)(R8*1), AX emit_literal_done_match_emit_calcBlockSize: match_nolit_loop_calcBlockSize: MOVL CX, SI SUBL BX, SI MOVL SI, 16(SP) ADDL $0x04, CX ADDL $0x04, BX MOVQ src_len+8(FP), SI SUBL CX, SI LEAQ (DX)(CX*1), DI LEAQ (DX)(BX*1), BX // matchLen XORL R9, R9 CMPL SI, $0x08 JL matchlen_match4_match_nolit_calcBlockSize matchlen_loopback_match_nolit_calcBlockSize: MOVQ (DI)(R9*1), R8 XORQ (BX)(R9*1), R8 TESTQ R8, R8 JZ matchlen_loop_match_nolit_calcBlockSize #ifdef GOAMD64_v3 TZCNTQ R8, R8 #else BSFQ R8, R8 #endif SARQ $0x03, R8 LEAL (R9)(R8*1), R9 JMP match_nolit_end_calcBlockSize matchlen_loop_match_nolit_calcBlockSize: LEAL -8(SI), SI LEAL 8(R9), R9 CMPL SI, $0x08 JGE matchlen_loopback_match_nolit_calcBlockSize JZ match_nolit_end_calcBlockSize matchlen_match4_match_nolit_calcBlockSize: CMPL SI, $0x04 JL matchlen_match2_match_nolit_calcBlockSize MOVL (DI)(R9*1), R8 CMPL (BX)(R9*1), R8 JNE matchlen_match2_match_nolit_calcBlockSize SUBL $0x04, SI LEAL 4(R9), R9 matchlen_match2_match_nolit_calcBlockSize: CMPL SI, $0x02 JL matchlen_match1_match_nolit_calcBlockSize MOVW (DI)(R9*1), R8 CMPW (BX)(R9*1), R8 JNE matchlen_match1_match_nolit_calcBlockSize SUBL $0x02, SI LEAL 2(R9), R9 matchlen_match1_match_nolit_calcBlockSize: CMPL SI, $0x01 JL match_nolit_end_calcBlockSize MOVB (DI)(R9*1), R8 CMPB (BX)(R9*1), R8 JNE match_nolit_end_calcBlockSize LEAL 1(R9), R9 match_nolit_end_calcBlockSize: ADDL R9, CX MOVL 16(SP), BX ADDL $0x04, R9 MOVL CX, 12(SP) // emitCopy CMPL BX, $0x00010000 JL two_byte_offset_match_nolit_calcBlockSize four_bytes_loop_back_match_nolit_calcBlockSize: CMPL R9, $0x40 JLE four_bytes_remain_match_nolit_calcBlockSize LEAL -64(R9), R9 ADDQ $0x05, AX CMPL R9, $0x04 JL 
four_bytes_remain_match_nolit_calcBlockSize JMP four_bytes_loop_back_match_nolit_calcBlockSize four_bytes_remain_match_nolit_calcBlockSize: TESTL R9, R9 JZ match_nolit_emitcopy_end_calcBlockSize XORL BX, BX ADDQ $0x05, AX JMP match_nolit_emitcopy_end_calcBlockSize two_byte_offset_match_nolit_calcBlockSize: CMPL R9, $0x40 JLE two_byte_offset_short_match_nolit_calcBlockSize LEAL -60(R9), R9 ADDQ $0x03, AX JMP two_byte_offset_match_nolit_calcBlockSize two_byte_offset_short_match_nolit_calcBlockSize: MOVL R9, SI SHLL $0x02, SI CMPL R9, $0x0c JGE emit_copy_three_match_nolit_calcBlockSize CMPL BX, $0x00000800 JGE emit_copy_three_match_nolit_calcBlockSize ADDQ $0x02, AX JMP match_nolit_emitcopy_end_calcBlockSize emit_copy_three_match_nolit_calcBlockSize: ADDQ $0x03, AX match_nolit_emitcopy_end_calcBlockSize: CMPL CX, 8(SP) JGE emit_remainder_calcBlockSize MOVQ -2(DX)(CX*1), SI CMPQ AX, (SP) JL match_nolit_dst_ok_calcBlockSize MOVQ $0x00000000, ret+24(FP) RET match_nolit_dst_ok_calcBlockSize: MOVQ $0x0000cf1bbcdcbf9b, R8 MOVQ SI, DI SHRQ $0x10, SI MOVQ SI, BX SHLQ $0x10, DI IMULQ R8, DI SHRQ $0x33, DI SHLQ $0x10, BX IMULQ R8, BX SHRQ $0x33, BX LEAL -2(CX), R8 LEAQ 24(SP)(BX*4), R9 MOVL (R9), BX MOVL R8, 24(SP)(DI*4) MOVL CX, (R9) CMPL (DX)(BX*1), SI JEQ match_nolit_loop_calcBlockSize INCL CX JMP search_loop_calcBlockSize emit_remainder_calcBlockSize: MOVQ src_len+8(FP), CX SUBL 12(SP), CX LEAQ 5(AX)(CX*1), CX CMPQ CX, (SP) JL emit_remainder_ok_calcBlockSize MOVQ $0x00000000, ret+24(FP) RET emit_remainder_ok_calcBlockSize: MOVQ src_len+8(FP), CX MOVL 12(SP), BX CMPL BX, CX JEQ emit_literal_done_emit_remainder_calcBlockSize MOVL CX, SI MOVL CX, 12(SP) LEAQ (DX)(BX*1), CX SUBL BX, SI LEAL -1(SI), CX CMPL CX, $0x3c JLT one_byte_emit_remainder_calcBlockSize CMPL CX, $0x00000100 JLT two_bytes_emit_remainder_calcBlockSize CMPL CX, $0x00010000 JLT three_bytes_emit_remainder_calcBlockSize CMPL CX, $0x01000000 JLT four_bytes_emit_remainder_calcBlockSize ADDQ $0x05, AX JMP 
memmove_long_emit_remainder_calcBlockSize four_bytes_emit_remainder_calcBlockSize: ADDQ $0x04, AX JMP memmove_long_emit_remainder_calcBlockSize three_bytes_emit_remainder_calcBlockSize: ADDQ $0x03, AX JMP memmove_long_emit_remainder_calcBlockSize two_bytes_emit_remainder_calcBlockSize: ADDQ $0x02, AX CMPL CX, $0x40 JL memmove_emit_remainder_calcBlockSize JMP memmove_long_emit_remainder_calcBlockSize one_byte_emit_remainder_calcBlockSize: ADDQ $0x01, AX memmove_emit_remainder_calcBlockSize: LEAQ (AX)(SI*1), AX JMP emit_literal_done_emit_remainder_calcBlockSize memmove_long_emit_remainder_calcBlockSize: LEAQ (AX)(SI*1), AX emit_literal_done_emit_remainder_calcBlockSize: MOVQ AX, ret+24(FP) RET
// calcBlockSizeSmall runs the match search over src but writes no output: AX
// starts at zero and is only advanced by the size of each literal run (a
// header counted as 1, 2 or 3 bytes plus the literal bytes) and of each copy
// (counted as 2 or 3 bytes). The final AX is returned, or 0 as soon as the
// running size reaches the limit stored at (SP).
//
// Frame layout set up by the prologue and used by the search loop:
//   (SP)      size limit: src_len - 9 - (src_len >> 5)
//   8(SP)     src_len - 8; the search ends once the next position is >= this
//   12(SP)    src offset of the first byte not yet accounted for (starts at 0)
//   16(SP)    offset of the previous match, used for the repeat check (starts at 1)
//   20(SP)    position to resume searching at when no candidate matches
//   24(SP)    table of 32-bit src positions, indexed by a 9-bit hash
// The table (0x10 * 0x80 = 2048 bytes) is zeroed before the search starts.
// NOTE(review): only 16-bit copy offsets and literal headers of up to 3 bytes
// are counted, so this presumably expects short inputs - confirm the size
// bound against the caller.
// func calcBlockSizeSmall(src []byte) int // Requires: BMI, SSE2 TEXT ·calcBlockSizeSmall(SB), $2072-32 XORQ AX, AX MOVQ $0x00000010, CX LEAQ 24(SP), DX PXOR X0, X0 zero_loop_calcBlockSizeSmall: MOVOU X0, (DX) MOVOU X0, 16(DX) MOVOU X0, 32(DX) MOVOU X0, 48(DX) MOVOU X0, 64(DX) MOVOU X0, 80(DX) MOVOU X0, 96(DX) MOVOU X0, 112(DX) ADDQ $0x80, DX DECQ CX JNZ zero_loop_calcBlockSizeSmall MOVL $0x00000000, 12(SP) MOVQ src_len+8(FP), CX LEAQ -9(CX), DX LEAQ -8(CX), BX MOVL BX, 8(SP) SHRQ $0x05, CX SUBL CX, DX LEAQ (AX)(DX*1), DX MOVQ DX, (SP) MOVL $0x00000001, CX MOVL CX, 16(SP) MOVQ src_base+0(FP), DX search_loop_calcBlockSizeSmall: MOVL CX, BX SUBL 12(SP), BX SHRL $0x04, BX LEAL 4(CX)(BX*1), BX CMPL BX, 8(SP) JGE emit_remainder_calcBlockSizeSmall MOVQ (DX)(CX*1), SI MOVL BX, 20(SP) MOVQ $0x9e3779b1, R8 MOVQ SI, R9 MOVQ SI, R10 SHRQ $0x08, R10 SHLQ $0x20, R9 IMULQ R8, R9 SHRQ $0x37, R9 SHLQ $0x20, R10 IMULQ R8, R10 SHRQ $0x37, R10 MOVL 24(SP)(R9*4), BX MOVL 24(SP)(R10*4), DI MOVL CX, 24(SP)(R9*4) LEAL 1(CX), R9 MOVL R9, 24(SP)(R10*4) MOVQ SI, R9 SHRQ $0x10, R9 SHLQ $0x20, R9 IMULQ R8, R9 SHRQ $0x37, R9 MOVL CX, R8 SUBL 16(SP), R8 MOVL 1(DX)(R8*1), R10 MOVQ SI, R8 SHRQ $0x08, R8 CMPL R8, R10 JNE no_repeat_found_calcBlockSizeSmall LEAL 1(CX), SI MOVL 12(SP), BX MOVL SI, DI SUBL 
16(SP), DI JZ repeat_extend_back_end_calcBlockSizeSmall repeat_extend_back_loop_calcBlockSizeSmall: CMPL SI, BX JLE repeat_extend_back_end_calcBlockSizeSmall MOVB -1(DX)(DI*1), R8 MOVB -1(DX)(SI*1), R9 CMPB R8, R9 JNE repeat_extend_back_end_calcBlockSizeSmall LEAL -1(SI), SI DECL DI JNZ repeat_extend_back_loop_calcBlockSizeSmall repeat_extend_back_end_calcBlockSizeSmall: MOVL 12(SP), BX CMPL BX, SI JEQ emit_literal_done_repeat_emit_calcBlockSizeSmall MOVL SI, DI MOVL SI, 12(SP) LEAQ (DX)(BX*1), R8 SUBL BX, DI LEAL -1(DI), BX CMPL BX, $0x3c JLT one_byte_repeat_emit_calcBlockSizeSmall CMPL BX, $0x00000100 JLT two_bytes_repeat_emit_calcBlockSizeSmall ADDQ $0x03, AX JMP memmove_long_repeat_emit_calcBlockSizeSmall two_bytes_repeat_emit_calcBlockSizeSmall: ADDQ $0x02, AX CMPL BX, $0x40 JL memmove_repeat_emit_calcBlockSizeSmall JMP memmove_long_repeat_emit_calcBlockSizeSmall one_byte_repeat_emit_calcBlockSizeSmall: ADDQ $0x01, AX memmove_repeat_emit_calcBlockSizeSmall: LEAQ (AX)(DI*1), AX JMP emit_literal_done_repeat_emit_calcBlockSizeSmall memmove_long_repeat_emit_calcBlockSizeSmall: LEAQ (AX)(DI*1), AX emit_literal_done_repeat_emit_calcBlockSizeSmall: ADDL $0x05, CX MOVL CX, BX SUBL 16(SP), BX MOVQ src_len+8(FP), DI SUBL CX, DI LEAQ (DX)(CX*1), R8 LEAQ (DX)(BX*1), BX // matchLen XORL R10, R10 CMPL DI, $0x08 JL matchlen_match4_repeat_extend_calcBlockSizeSmall matchlen_loopback_repeat_extend_calcBlockSizeSmall: MOVQ (R8)(R10*1), R9 XORQ (BX)(R10*1), R9 TESTQ R9, R9 JZ matchlen_loop_repeat_extend_calcBlockSizeSmall #ifdef GOAMD64_v3 TZCNTQ R9, R9 #else BSFQ R9, R9 #endif SARQ $0x03, R9 LEAL (R10)(R9*1), R10 JMP repeat_extend_forward_end_calcBlockSizeSmall matchlen_loop_repeat_extend_calcBlockSizeSmall: LEAL -8(DI), DI LEAL 8(R10), R10 CMPL DI, $0x08 JGE matchlen_loopback_repeat_extend_calcBlockSizeSmall JZ repeat_extend_forward_end_calcBlockSizeSmall matchlen_match4_repeat_extend_calcBlockSizeSmall: CMPL DI, $0x04 JL matchlen_match2_repeat_extend_calcBlockSizeSmall MOVL 
(R8)(R10*1), R9 CMPL (BX)(R10*1), R9 JNE matchlen_match2_repeat_extend_calcBlockSizeSmall SUBL $0x04, DI LEAL 4(R10), R10 matchlen_match2_repeat_extend_calcBlockSizeSmall: CMPL DI, $0x02 JL matchlen_match1_repeat_extend_calcBlockSizeSmall MOVW (R8)(R10*1), R9 CMPW (BX)(R10*1), R9 JNE matchlen_match1_repeat_extend_calcBlockSizeSmall SUBL $0x02, DI LEAL 2(R10), R10 matchlen_match1_repeat_extend_calcBlockSizeSmall: CMPL DI, $0x01 JL repeat_extend_forward_end_calcBlockSizeSmall MOVB (R8)(R10*1), R9 CMPB (BX)(R10*1), R9 JNE repeat_extend_forward_end_calcBlockSizeSmall LEAL 1(R10), R10 repeat_extend_forward_end_calcBlockSizeSmall: ADDL R10, CX MOVL CX, BX SUBL SI, BX MOVL 16(SP), SI // emitCopy two_byte_offset_repeat_as_copy_calcBlockSizeSmall: CMPL BX, $0x40 JLE two_byte_offset_short_repeat_as_copy_calcBlockSizeSmall LEAL -60(BX), BX ADDQ $0x03, AX JMP two_byte_offset_repeat_as_copy_calcBlockSizeSmall two_byte_offset_short_repeat_as_copy_calcBlockSizeSmall: MOVL BX, SI SHLL $0x02, SI CMPL BX, $0x0c JGE emit_copy_three_repeat_as_copy_calcBlockSizeSmall ADDQ $0x02, AX JMP repeat_end_emit_calcBlockSizeSmall emit_copy_three_repeat_as_copy_calcBlockSizeSmall: ADDQ $0x03, AX repeat_end_emit_calcBlockSizeSmall: MOVL CX, 12(SP) JMP search_loop_calcBlockSizeSmall no_repeat_found_calcBlockSizeSmall: CMPL (DX)(BX*1), SI JEQ candidate_match_calcBlockSizeSmall SHRQ $0x08, SI MOVL 24(SP)(R9*4), BX LEAL 2(CX), R8 CMPL (DX)(DI*1), SI JEQ candidate2_match_calcBlockSizeSmall MOVL R8, 24(SP)(R9*4) SHRQ $0x08, SI CMPL (DX)(BX*1), SI JEQ candidate3_match_calcBlockSizeSmall MOVL 20(SP), CX JMP search_loop_calcBlockSizeSmall candidate3_match_calcBlockSizeSmall: ADDL $0x02, CX JMP candidate_match_calcBlockSizeSmall candidate2_match_calcBlockSizeSmall: MOVL R8, 24(SP)(R9*4) INCL CX MOVL DI, BX candidate_match_calcBlockSizeSmall: MOVL 12(SP), SI TESTL BX, BX JZ match_extend_back_end_calcBlockSizeSmall match_extend_back_loop_calcBlockSizeSmall: CMPL CX, SI JLE 
match_extend_back_end_calcBlockSizeSmall MOVB -1(DX)(BX*1), DI MOVB -1(DX)(CX*1), R8 CMPB DI, R8 JNE match_extend_back_end_calcBlockSizeSmall LEAL -1(CX), CX DECL BX JZ match_extend_back_end_calcBlockSizeSmall JMP match_extend_back_loop_calcBlockSizeSmall match_extend_back_end_calcBlockSizeSmall: MOVL CX, SI SUBL 12(SP), SI LEAQ 3(AX)(SI*1), SI CMPQ SI, (SP) JL match_dst_size_check_calcBlockSizeSmall MOVQ $0x00000000, ret+24(FP) RET match_dst_size_check_calcBlockSizeSmall: MOVL CX, SI MOVL 12(SP), DI CMPL DI, SI JEQ emit_literal_done_match_emit_calcBlockSizeSmall MOVL SI, R8 MOVL SI, 12(SP) LEAQ (DX)(DI*1), SI SUBL DI, R8 LEAL -1(R8), SI CMPL SI, $0x3c JLT one_byte_match_emit_calcBlockSizeSmall CMPL SI, $0x00000100 JLT two_bytes_match_emit_calcBlockSizeSmall ADDQ $0x03, AX JMP memmove_long_match_emit_calcBlockSizeSmall two_bytes_match_emit_calcBlockSizeSmall: ADDQ $0x02, AX CMPL SI, $0x40 JL memmove_match_emit_calcBlockSizeSmall JMP memmove_long_match_emit_calcBlockSizeSmall one_byte_match_emit_calcBlockSizeSmall: ADDQ $0x01, AX memmove_match_emit_calcBlockSizeSmall: LEAQ (AX)(R8*1), AX JMP emit_literal_done_match_emit_calcBlockSizeSmall memmove_long_match_emit_calcBlockSizeSmall: LEAQ (AX)(R8*1), AX emit_literal_done_match_emit_calcBlockSizeSmall: match_nolit_loop_calcBlockSizeSmall: MOVL CX, SI SUBL BX, SI MOVL SI, 16(SP) ADDL $0x04, CX ADDL $0x04, BX MOVQ src_len+8(FP), SI SUBL CX, SI LEAQ (DX)(CX*1), DI LEAQ (DX)(BX*1), BX // matchLen XORL R9, R9 CMPL SI, $0x08 JL matchlen_match4_match_nolit_calcBlockSizeSmall matchlen_loopback_match_nolit_calcBlockSizeSmall: MOVQ (DI)(R9*1), R8 XORQ (BX)(R9*1), R8 TESTQ R8, R8 JZ matchlen_loop_match_nolit_calcBlockSizeSmall #ifdef GOAMD64_v3 TZCNTQ R8, R8 #else BSFQ R8, R8 #endif SARQ $0x03, R8 LEAL (R9)(R8*1), R9 JMP match_nolit_end_calcBlockSizeSmall matchlen_loop_match_nolit_calcBlockSizeSmall: LEAL -8(SI), SI LEAL 8(R9), R9 CMPL SI, $0x08 JGE matchlen_loopback_match_nolit_calcBlockSizeSmall JZ 
match_nolit_end_calcBlockSizeSmall matchlen_match4_match_nolit_calcBlockSizeSmall: CMPL SI, $0x04 JL matchlen_match2_match_nolit_calcBlockSizeSmall MOVL (DI)(R9*1), R8 CMPL (BX)(R9*1), R8 JNE matchlen_match2_match_nolit_calcBlockSizeSmall SUBL $0x04, SI LEAL 4(R9), R9 matchlen_match2_match_nolit_calcBlockSizeSmall: CMPL SI, $0x02 JL matchlen_match1_match_nolit_calcBlockSizeSmall MOVW (DI)(R9*1), R8 CMPW (BX)(R9*1), R8 JNE matchlen_match1_match_nolit_calcBlockSizeSmall SUBL $0x02, SI LEAL 2(R9), R9 matchlen_match1_match_nolit_calcBlockSizeSmall: CMPL SI, $0x01 JL match_nolit_end_calcBlockSizeSmall MOVB (DI)(R9*1), R8 CMPB (BX)(R9*1), R8 JNE match_nolit_end_calcBlockSizeSmall LEAL 1(R9), R9 match_nolit_end_calcBlockSizeSmall: ADDL R9, CX MOVL 16(SP), BX ADDL $0x04, R9 MOVL CX, 12(SP) // emitCopy two_byte_offset_match_nolit_calcBlockSizeSmall: CMPL R9, $0x40 JLE two_byte_offset_short_match_nolit_calcBlockSizeSmall LEAL -60(R9), R9 ADDQ $0x03, AX JMP two_byte_offset_match_nolit_calcBlockSizeSmall two_byte_offset_short_match_nolit_calcBlockSizeSmall: MOVL R9, BX SHLL $0x02, BX CMPL R9, $0x0c JGE emit_copy_three_match_nolit_calcBlockSizeSmall ADDQ $0x02, AX JMP match_nolit_emitcopy_end_calcBlockSizeSmall emit_copy_three_match_nolit_calcBlockSizeSmall: ADDQ $0x03, AX match_nolit_emitcopy_end_calcBlockSizeSmall: CMPL CX, 8(SP) JGE emit_remainder_calcBlockSizeSmall MOVQ -2(DX)(CX*1), SI CMPQ AX, (SP) JL match_nolit_dst_ok_calcBlockSizeSmall MOVQ $0x00000000, ret+24(FP) RET match_nolit_dst_ok_calcBlockSizeSmall: MOVQ $0x9e3779b1, R8 MOVQ SI, DI SHRQ $0x10, SI MOVQ SI, BX SHLQ $0x20, DI IMULQ R8, DI SHRQ $0x37, DI SHLQ $0x20, BX IMULQ R8, BX SHRQ $0x37, BX LEAL -2(CX), R8 LEAQ 24(SP)(BX*4), R9 MOVL (R9), BX MOVL R8, 24(SP)(DI*4) MOVL CX, (R9) CMPL (DX)(BX*1), SI JEQ match_nolit_loop_calcBlockSizeSmall INCL CX JMP search_loop_calcBlockSizeSmall emit_remainder_calcBlockSizeSmall: MOVQ src_len+8(FP), CX SUBL 12(SP), CX LEAQ 3(AX)(CX*1), CX CMPQ CX, (SP) JL 
emit_remainder_ok_calcBlockSizeSmall MOVQ $0x00000000, ret+24(FP) RET emit_remainder_ok_calcBlockSizeSmall: MOVQ src_len+8(FP), CX MOVL 12(SP), BX CMPL BX, CX JEQ emit_literal_done_emit_remainder_calcBlockSizeSmall MOVL CX, SI MOVL CX, 12(SP) LEAQ (DX)(BX*1), CX SUBL BX, SI LEAL -1(SI), CX CMPL CX, $0x3c JLT one_byte_emit_remainder_calcBlockSizeSmall CMPL CX, $0x00000100 JLT two_bytes_emit_remainder_calcBlockSizeSmall ADDQ $0x03, AX JMP memmove_long_emit_remainder_calcBlockSizeSmall two_bytes_emit_remainder_calcBlockSizeSmall: ADDQ $0x02, AX CMPL CX, $0x40 JL memmove_emit_remainder_calcBlockSizeSmall JMP memmove_long_emit_remainder_calcBlockSizeSmall one_byte_emit_remainder_calcBlockSizeSmall: ADDQ $0x01, AX memmove_emit_remainder_calcBlockSizeSmall: LEAQ (AX)(SI*1), AX JMP emit_literal_done_emit_remainder_calcBlockSizeSmall memmove_long_emit_remainder_calcBlockSizeSmall: LEAQ (AX)(SI*1), AX emit_literal_done_emit_remainder_calcBlockSizeSmall: MOVQ AX, ret+24(FP) RET

// func emitLiteral(dst []byte, lit []byte) int
// Requires: SSE2
//
// emitLiteral writes a length header followed by the bytes of lit to dst and
// returns the number of bytes written (header size + len(lit)). It returns 0
// without touching dst when lit is empty.
//
// Register use: AX = dst write pointer, CX = lit read pointer, DX = len(lit),
// SI = len(lit)-1 (the value stored in the header), BX = return value.
//
// Header selection on n = len(lit)-1:
//   n < 60       : 1 byte,  n<<2
//   n < 1<<8     : 2 bytes, 0xf0 then n as 1 byte
//   n < 1<<16    : 3 bytes, 0xf4 then n as 2 bytes
//   n < 1<<24    : 4 bytes, 0xf8 then n as 3 bytes
//   otherwise    : 5 bytes, 0xfc then n as 4 bytes
// No bounds check on dst is performed here.
TEXT ·emitLiteral(SB), NOSPLIT, $0-56
	MOVQ lit_len+32(FP), DX
	MOVQ dst_base+0(FP), AX
	MOVQ lit_base+24(FP), CX
	TESTQ DX, DX
	JZ emit_literal_end_standalone_skip
	MOVL DX, BX
	LEAL -1(DX), SI
	CMPL SI, $0x3c
	JLT one_byte_standalone
	CMPL SI, $0x00000100
	JLT two_bytes_standalone
	CMPL SI, $0x00010000
	JLT three_bytes_standalone
	CMPL SI, $0x01000000
	JLT four_bytes_standalone
	MOVB $0xfc, (AX)
	MOVL SI, 1(AX)
	ADDQ $0x05, BX
	ADDQ $0x05, AX
	JMP memmove_long_standalone

four_bytes_standalone:
	MOVL SI, DI
	SHRL $0x10, DI
	MOVB $0xf8, (AX)
	MOVW SI, 1(AX)
	MOVB DI, 3(AX)
	ADDQ $0x04, BX
	ADDQ $0x04, AX
	JMP memmove_long_standalone

three_bytes_standalone:
	MOVB $0xf4, (AX)
	MOVW SI, 1(AX)
	ADDQ $0x03, BX
	ADDQ $0x03, AX
	JMP memmove_long_standalone

two_bytes_standalone:
	MOVB $0xf0, (AX)
	MOVB SI, 1(AX)
	ADDQ $0x02, BX
	ADDQ $0x02, AX
	// n in [60, 63] still fits the short (<= 64 byte) copy below.
	CMPL SI, $0x40
	JL memmove_standalone
	JMP memmove_long_standalone

one_byte_standalone:
	SHLB $0x02, SI
	MOVB SI, (AX)
	ADDQ $0x01, BX
	ADDQ $0x01, AX

memmove_standalone:
	// genMemMoveShort
	// Copies DX (1..64) bytes from CX to AX. Each size class loads a head
	// and a tail chunk that may overlap, so no byte loop is needed.
	CMPQ DX, $0x03
	JB emit_lit_memmove_standalone_memmove_move_1or2
	JE emit_lit_memmove_standalone_memmove_move_3
	CMPQ DX, $0x08
	JB emit_lit_memmove_standalone_memmove_move_4through7
	CMPQ DX, $0x10
	JBE emit_lit_memmove_standalone_memmove_move_8through16
	CMPQ DX, $0x20
	JBE emit_lit_memmove_standalone_memmove_move_17through32
	JMP emit_lit_memmove_standalone_memmove_move_33through64

emit_lit_memmove_standalone_memmove_move_1or2:
	MOVB (CX), SI
	MOVB -1(CX)(DX*1), CL
	MOVB SI, (AX)
	MOVB CL, -1(AX)(DX*1)
	JMP emit_literal_end_standalone

emit_lit_memmove_standalone_memmove_move_3:
	MOVW (CX), SI
	MOVB 2(CX), CL
	MOVW SI, (AX)
	MOVB CL, 2(AX)
	JMP emit_literal_end_standalone

emit_lit_memmove_standalone_memmove_move_4through7:
	MOVL (CX), SI
	MOVL -4(CX)(DX*1), CX
	MOVL SI, (AX)
	MOVL CX, -4(AX)(DX*1)
	JMP emit_literal_end_standalone

emit_lit_memmove_standalone_memmove_move_8through16:
	MOVQ (CX), SI
	MOVQ -8(CX)(DX*1), CX
	MOVQ SI, (AX)
	MOVQ CX, -8(AX)(DX*1)
	JMP emit_literal_end_standalone

emit_lit_memmove_standalone_memmove_move_17through32:
	MOVOU (CX), X0
	MOVOU -16(CX)(DX*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(DX*1)
	JMP emit_literal_end_standalone

emit_lit_memmove_standalone_memmove_move_33through64:
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(DX*1), X2
	MOVOU -16(CX)(DX*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(DX*1)
	MOVOU X3, -16(AX)(DX*1)
	JMP emit_literal_end_standalone
	JMP emit_literal_end_standalone

memmove_long_standalone:
	// genMemMoveLong
	// The first and last 32 bytes of the source are loaded into X0..X3 up
	// front and stored last. In between, 32-byte chunks are copied with
	// aligned stores (MOVOA): R8 = 64 - (AX & 31), so AX+R8-32 is a
	// 32-byte-aligned destination address.
	MOVOU (CX), X0
	MOVOU 16(CX), X1
	MOVOU -32(CX)(DX*1), X2
	MOVOU -16(CX)(DX*1), X3
	MOVQ DX, DI
	SHRQ $0x05, DI
	MOVQ AX, SI
	ANDL $0x0000001f, SI
	MOVQ $0x00000040, R8
	SUBQ SI, R8
	DECQ DI
	JA emit_lit_memmove_long_standalonelarge_forward_sse_loop_32
	LEAQ -32(CX)(R8*1), SI
	LEAQ -32(AX)(R8*1), R9

emit_lit_memmove_long_standalonelarge_big_loop_back:
	MOVOU (SI), X4
	MOVOU 16(SI), X5
	MOVOA X4, (R9)
	MOVOA X5, 16(R9)
	ADDQ $0x20, R9
	ADDQ $0x20, SI
	ADDQ $0x20, R8
	DECQ DI
	JNA emit_lit_memmove_long_standalonelarge_big_loop_back

emit_lit_memmove_long_standalonelarge_forward_sse_loop_32:
	MOVOU -32(CX)(R8*1), X4
	MOVOU -16(CX)(R8*1), X5
	MOVOA X4, -32(AX)(R8*1)
	MOVOA X5, -16(AX)(R8*1)
	ADDQ $0x20, R8
	CMPQ DX, R8
	JAE emit_lit_memmove_long_standalonelarge_forward_sse_loop_32
	// Store the saved head and tail; these cover the unaligned edges.
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(DX*1)
	MOVOU X3, -16(AX)(DX*1)
	JMP emit_literal_end_standalone
	JMP emit_literal_end_standalone

emit_literal_end_standalone_skip:
	// Empty literal: return 0.
	XORQ BX, BX

emit_literal_end_standalone:
	MOVQ BX, ret+48(FP)
	RET

// func emitRepeat(dst []byte, offset int, length int) int
//
// emitRepeat writes one or more repeat codes for the given length to dst and
// returns the number of bytes written.
//
// Register use: AX = dst write pointer, BX = bytes written (return value),
// CX = offset, DX = remaining length, SI = scratch.
//
// Each pass computes L = length and D = length-4, then picks an encoding:
//   L <= 8                  : 2 bytes, 16-bit (D<<2)|1
//   L < 12 && offset < 2048 : 2 bytes, byte0 = 1 | D<<2 | (offset>>8)<<5,
//                             byte1 = low byte of offset
//   D < 260                 : 3 bytes, 0x0015 then (D-4) as 1 byte
//   D < 65792               : 4 bytes, 0x0019 then (D-256) as 2 bytes
//   D < 0x0100ffff          : 5 bytes, 0x001d then (D-65536) as 3 bytes
//   otherwise               : 5 bytes 1d 00 fb ff ff, D -= 16842747, loop
TEXT ·emitRepeat(SB), NOSPLIT, $0-48
	XORQ BX, BX
	MOVQ dst_base+0(FP), AX
	MOVQ offset+24(FP), CX
	MOVQ length+32(FP), DX

	// emitRepeat
emit_repeat_again_standalone:
	MOVL DX, SI
	LEAL -4(DX), DX
	CMPL SI, $0x08
	JLE repeat_two_standalone
	CMPL SI, $0x0c
	JGE cant_repeat_two_offset_standalone
	CMPL CX, $0x00000800
	JLT repeat_two_offset_standalone

cant_repeat_two_offset_standalone:
	CMPL DX, $0x00000104
	JLT repeat_three_standalone
	CMPL DX, $0x00010100
	JLT repeat_four_standalone
	CMPL DX, $0x0100ffff
	JLT repeat_five_standalone
	LEAL -16842747(DX), DX
	MOVL $0xfffb001d, (AX)
	MOVB $0xff, 4(AX)
	ADDQ $0x05, AX
	ADDQ $0x05, BX
	JMP emit_repeat_again_standalone

repeat_five_standalone:
	LEAL -65536(DX), DX
	MOVL DX, CX
	MOVW $0x001d, (AX)
	MOVW DX, 2(AX)
	SARL $0x10, CX
	MOVB CL, 4(AX)
	ADDQ $0x05, BX
	ADDQ $0x05, AX
	JMP gen_emit_repeat_end

repeat_four_standalone:
	LEAL -256(DX), DX
	MOVW $0x0019, (AX)
	MOVW DX, 2(AX)
	ADDQ $0x04, BX
	ADDQ $0x04, AX
	JMP gen_emit_repeat_end

repeat_three_standalone:
	LEAL -4(DX), DX
	MOVW $0x0015, (AX)
	MOVB DL, 2(AX)
	ADDQ $0x03, BX
	ADDQ $0x03, AX
	JMP gen_emit_repeat_end

repeat_two_standalone:
	SHLL $0x02, DX
	ORL $0x01, DX
	MOVW DX, (AX)
	ADDQ $0x02, BX
	ADDQ $0x02, AX
	JMP gen_emit_repeat_end

repeat_two_offset_standalone:
	XORQ SI, SI
	LEAL 1(SI)(DX*4), DX
	MOVB CL, 1(AX)
	SARL $0x08, CX
	SHLL $0x05, CX
	ORL CX, DX
	MOVB DL, (AX)
	ADDQ $0x02, BX
	ADDQ $0x02, AX

gen_emit_repeat_end:
	MOVQ BX, ret+40(FP)
	RET

// func emitCopy(dst []byte, offset int, length int) int
//
// emitCopy writes a copy operation for (offset, length) to dst and returns
// the number of bytes written. When length exceeds what a single copy tag
// holds, the remainder is emitted with the repeat encodings (the same code
// as emitRepeat, inlined three times below with distinct label suffixes).
//
// Register use: AX = dst write pointer, BX = bytes written (return value),
// CX = offset, DX = remaining length, SI/DI = scratch.
//
// Paths:
//   offset >= 65536:
//     length > 64 : 0xff + 4-byte offset (accounts for 64 bytes), then the
//                   remainder as a repeat, or as a final 5-byte copy if
//                   fewer than 4 bytes remain.
//     otherwise   : 5 bytes, tag 4*length-1 + 4-byte offset.
//   offset < 65536:
//     length > 64, offset >= 2048 : 0xee + 2-byte offset (60 bytes), repeat.
//     length > 64, offset <  2048 : 2-byte copy (8 bytes), then repeat.
//     length <= 64                : 2-byte form if length < 12 and
//                                   offset < 2048, else 3-byte form.
TEXT ·emitCopy(SB), NOSPLIT, $0-48
	XORQ BX, BX
	MOVQ dst_base+0(FP), AX
	MOVQ offset+24(FP), CX
	MOVQ length+32(FP), DX

	// emitCopy
	CMPL CX, $0x00010000
	JL two_byte_offset_standalone
	CMPL DX, $0x40
	JLE four_bytes_remain_standalone
	MOVB $0xff, (AX)
	MOVL CX, 1(AX)
	LEAL -64(DX), DX
	ADDQ $0x05, BX
	ADDQ $0x05, AX
	CMPL DX, $0x04
	JL four_bytes_remain_standalone

	// emitRepeat
emit_repeat_again_standalone_emit_copy:
	MOVL DX, SI
	LEAL -4(DX), DX
	CMPL SI, $0x08
	JLE repeat_two_standalone_emit_copy
	CMPL SI, $0x0c
	JGE cant_repeat_two_offset_standalone_emit_copy
	CMPL CX, $0x00000800
	JLT repeat_two_offset_standalone_emit_copy

cant_repeat_two_offset_standalone_emit_copy:
	CMPL DX, $0x00000104
	JLT repeat_three_standalone_emit_copy
	CMPL DX, $0x00010100
	JLT repeat_four_standalone_emit_copy
	CMPL DX, $0x0100ffff
	JLT repeat_five_standalone_emit_copy
	LEAL -16842747(DX), DX
	MOVL $0xfffb001d, (AX)
	MOVB $0xff, 4(AX)
	ADDQ $0x05, AX
	ADDQ $0x05, BX
	JMP emit_repeat_again_standalone_emit_copy

repeat_five_standalone_emit_copy:
	LEAL -65536(DX), DX
	MOVL DX, CX
	MOVW $0x001d, (AX)
	MOVW DX, 2(AX)
	SARL $0x10, CX
	MOVB CL, 4(AX)
	ADDQ $0x05, BX
	ADDQ $0x05, AX
	JMP gen_emit_copy_end

repeat_four_standalone_emit_copy:
	LEAL -256(DX), DX
	MOVW $0x0019, (AX)
	MOVW DX, 2(AX)
	ADDQ $0x04, BX
	ADDQ $0x04, AX
	JMP gen_emit_copy_end

repeat_three_standalone_emit_copy:
	LEAL -4(DX), DX
	MOVW $0x0015, (AX)
	MOVB DL, 2(AX)
	ADDQ $0x03, BX
	ADDQ $0x03, AX
	JMP gen_emit_copy_end

repeat_two_standalone_emit_copy:
	SHLL $0x02, DX
	ORL $0x01, DX
	MOVW DX, (AX)
	ADDQ $0x02, BX
	ADDQ $0x02, AX
	JMP gen_emit_copy_end

repeat_two_offset_standalone_emit_copy:
	XORQ SI, SI
	LEAL 1(SI)(DX*4), DX
	MOVB CL, 1(AX)
	SARL $0x08, CX
	SHLL $0x05, CX
	ORL CX, DX
	MOVB DL, (AX)
	ADDQ $0x02, BX
	ADDQ $0x02, AX
	JMP gen_emit_copy_end

four_bytes_remain_standalone:
	// Nothing left to emit when the remaining length is zero.
	TESTL DX, DX
	JZ gen_emit_copy_end
	// Tag = 4*length - 1, followed by the 4-byte offset.
	XORL SI, SI
	LEAL -1(SI)(DX*4), DX
	MOVB DL, (AX)
	MOVL CX, 1(AX)
	ADDQ $0x05, BX
	ADDQ $0x05, AX
	JMP gen_emit_copy_end

two_byte_offset_standalone:
	CMPL DX, $0x40
	JLE two_byte_offset_short_standalone
	CMPL CX, $0x00000800
	JAE long_offset_short_standalone
	// offset < 2048 and length > 64: emit a 2-byte copy whose tag is
	// 17 | (offset>>8)<<5, subtract 8 from length, then continue with the
	// repeat encodings of three bytes or more.
	MOVL $0x00000001, SI
	LEAL 16(SI), SI
	MOVB CL, 1(AX)
	MOVL CX, DI
	SHRL $0x08, DI
	SHLL $0x05, DI
	ORL DI, SI
	MOVB SI, (AX)
	ADDQ $0x02, BX
	ADDQ $0x02, AX
	SUBL $0x08, DX

	// emitRepeat
	LEAL -4(DX), DX
	JMP cant_repeat_two_offset_standalone_emit_copy_short_2b

emit_repeat_again_standalone_emit_copy_short_2b:
	MOVL DX, SI
	LEAL -4(DX), DX
	CMPL SI, $0x08
	JLE repeat_two_standalone_emit_copy_short_2b
	CMPL SI, $0x0c
	JGE cant_repeat_two_offset_standalone_emit_copy_short_2b
	CMPL CX, $0x00000800
	JLT repeat_two_offset_standalone_emit_copy_short_2b

cant_repeat_two_offset_standalone_emit_copy_short_2b:
	CMPL DX, $0x00000104
	JLT repeat_three_standalone_emit_copy_short_2b
	CMPL DX, $0x00010100
	JLT repeat_four_standalone_emit_copy_short_2b
	CMPL DX, $0x0100ffff
	JLT repeat_five_standalone_emit_copy_short_2b
	LEAL -16842747(DX), DX
	MOVL $0xfffb001d, (AX)
	MOVB $0xff, 4(AX)
	ADDQ $0x05, AX
	ADDQ $0x05, BX
	JMP emit_repeat_again_standalone_emit_copy_short_2b

repeat_five_standalone_emit_copy_short_2b:
	LEAL -65536(DX), DX
	MOVL DX, CX
	MOVW $0x001d, (AX)
	MOVW DX, 2(AX)
	SARL $0x10, CX
	MOVB CL, 4(AX)
	ADDQ $0x05, BX
	ADDQ $0x05, AX
	JMP gen_emit_copy_end

repeat_four_standalone_emit_copy_short_2b:
	LEAL -256(DX), DX
	MOVW $0x0019, (AX)
	MOVW DX, 2(AX)
	ADDQ $0x04, BX
	ADDQ $0x04, AX
	JMP gen_emit_copy_end

repeat_three_standalone_emit_copy_short_2b:
	LEAL -4(DX), DX
	MOVW $0x0015, (AX)
	MOVB DL, 2(AX)
	ADDQ $0x03, BX
	ADDQ $0x03, AX
	JMP gen_emit_copy_end

repeat_two_standalone_emit_copy_short_2b:
	SHLL $0x02, DX
	ORL $0x01, DX
	MOVW DX, (AX)
	ADDQ $0x02, BX
	ADDQ $0x02, AX
	JMP gen_emit_copy_end

repeat_two_offset_standalone_emit_copy_short_2b:
	XORQ SI, SI
	LEAL 1(SI)(DX*4), DX
	MOVB CL, 1(AX)
	SARL $0x08, CX
	SHLL $0x05, CX
	ORL CX, DX
	MOVB DL, (AX)
	ADDQ $0x02, BX
	ADDQ $0x02, AX
	JMP gen_emit_copy_end

long_offset_short_standalone:
	// 0xee + 2-byte offset, accounting for 60 bytes of the length.
	MOVB $0xee, (AX)
	MOVW CX, 1(AX)
	LEAL -60(DX), DX
	ADDQ $0x03, AX
	ADDQ $0x03, BX

	// emitRepeat
emit_repeat_again_standalone_emit_copy_short:
	MOVL DX, SI
	LEAL -4(DX), DX
	CMPL SI, $0x08
	JLE repeat_two_standalone_emit_copy_short
	CMPL SI, $0x0c
	JGE cant_repeat_two_offset_standalone_emit_copy_short
	CMPL CX, $0x00000800
	JLT repeat_two_offset_standalone_emit_copy_short

cant_repeat_two_offset_standalone_emit_copy_short:
	CMPL DX, $0x00000104
	JLT repeat_three_standalone_emit_copy_short
	CMPL DX, $0x00010100
	JLT repeat_four_standalone_emit_copy_short
	CMPL DX, $0x0100ffff
	JLT repeat_five_standalone_emit_copy_short
	LEAL -16842747(DX), DX
	MOVL $0xfffb001d, (AX)
	MOVB $0xff, 4(AX)
	ADDQ $0x05, AX
	ADDQ $0x05, BX
	JMP emit_repeat_again_standalone_emit_copy_short

repeat_five_standalone_emit_copy_short:
	LEAL -65536(DX), DX
	MOVL DX, CX
	MOVW $0x001d, (AX)
	MOVW DX, 2(AX)
	SARL $0x10, CX
	MOVB CL, 4(AX)
	ADDQ $0x05, BX
	ADDQ $0x05, AX
	JMP gen_emit_copy_end

repeat_four_standalone_emit_copy_short:
	LEAL -256(DX), DX
	MOVW $0x0019, (AX)
	MOVW DX, 2(AX)
	ADDQ $0x04, BX
	ADDQ $0x04, AX
	JMP gen_emit_copy_end

repeat_three_standalone_emit_copy_short:
	LEAL -4(DX), DX
	MOVW $0x0015, (AX)
	MOVB DL, 2(AX)
	ADDQ $0x03, BX
	ADDQ $0x03, AX
	JMP gen_emit_copy_end

repeat_two_standalone_emit_copy_short:
	SHLL $0x02, DX
	ORL $0x01, DX
	MOVW DX, (AX)
	ADDQ $0x02, BX
	ADDQ $0x02, AX
	JMP gen_emit_copy_end

repeat_two_offset_standalone_emit_copy_short:
	XORQ SI, SI
	LEAL 1(SI)(DX*4), DX
	MOVB CL, 1(AX)
	SARL $0x08, CX
	SHLL $0x05, CX
	ORL CX, DX
	MOVB DL, (AX)
	ADDQ $0x02, BX
	ADDQ $0x02, AX
	JMP gen_emit_copy_end

two_byte_offset_short_standalone:
	// length <= 64. SI = length<<2 is the base of the tag byte.
	MOVL DX, SI
	SHLL $0x02, SI
	CMPL DX, $0x0c
	JGE emit_copy_three_standalone
	CMPL CX, $0x00000800
	JGE emit_copy_three_standalone
	// 2-byte form: tag = (length<<2 - 15) | (offset>>8)<<5, then the low
	// byte of offset.
	LEAL -15(SI), SI
	MOVB CL, 1(AX)
	SHRL $0x08, CX
	SHLL $0x05, CX
	ORL CX, SI
	MOVB SI, (AX)
	ADDQ $0x02, BX
	ADDQ $0x02, AX
	JMP gen_emit_copy_end

emit_copy_three_standalone:
	// 3-byte form: tag = length<<2 - 2, then the 16-bit offset.
	LEAL -2(SI), SI
	MOVB SI, (AX)
	MOVW CX, 1(AX)
	ADDQ $0x03, BX
	ADDQ $0x03, AX

gen_emit_copy_end:
	MOVQ BX, ret+40(FP)
	RET

// func emitCopyNoRepeat(dst []byte, offset int, length int) int
//
// emitCopyNoRepeat writes copy operations for (offset, length) to dst using
// only copy tags (no repeat codes) and returns the number of bytes written.
// Lengths above 64 are split into several copies in a loop.
//
// Register use: AX = dst write pointer, BX = bytes written (return value),
// CX = offset, DX = remaining length, SI = scratch.
TEXT ·emitCopyNoRepeat(SB), NOSPLIT, $0-48
	XORQ BX, BX
	MOVQ dst_base+0(FP), AX
	MOVQ offset+24(FP), CX
	MOVQ length+32(FP), DX

	// emitCopy
	CMPL CX, $0x00010000
	JL two_byte_offset_standalone_snappy

four_bytes_loop_back_standalone_snappy:
	// offset >= 65536: emit 0xff + 4-byte offset per 64 bytes of length.
	CMPL DX, $0x40
	JLE four_bytes_remain_standalone_snappy
	MOVB $0xff, (AX)
	MOVL CX, 1(AX)
	LEAL -64(DX), DX
	ADDQ $0x05, BX
	ADDQ $0x05, AX
	CMPL DX, $0x04
	JL four_bytes_remain_standalone_snappy
	JMP four_bytes_loop_back_standalone_snappy

four_bytes_remain_standalone_snappy:
	TESTL DX, DX
	JZ gen_emit_copy_end_snappy
	// Tag = 4*length - 1, followed by the 4-byte offset.
	XORL SI, SI
	LEAL -1(SI)(DX*4), DX
	MOVB DL, (AX)
	MOVL CX, 1(AX)
	ADDQ $0x05, BX
	ADDQ $0x05, AX
	JMP gen_emit_copy_end_snappy

two_byte_offset_standalone_snappy:
	// offset < 65536: emit 0xee + 2-byte offset per 60 bytes while the
	// remaining length is above 64.
	CMPL DX, $0x40
	JLE two_byte_offset_short_standalone_snappy
	MOVB $0xee, (AX)
	MOVW CX, 1(AX)
	LEAL -60(DX), DX
	ADDQ $0x03, AX
	ADDQ $0x03, BX
	JMP two_byte_offset_standalone_snappy

two_byte_offset_short_standalone_snappy:
	MOVL DX, SI
	SHLL $0x02, SI
	CMPL DX, $0x0c
	JGE emit_copy_three_standalone_snappy
	CMPL CX, $0x00000800
	JGE emit_copy_three_standalone_snappy
	// 2-byte form: tag = (length<<2 - 15) | (offset>>8)<<5, then the low
	// byte of offset.
	LEAL -15(SI), SI
	MOVB CL, 1(AX)
	SHRL $0x08, CX
	SHLL $0x05, CX
	ORL CX, SI
	MOVB SI, (AX)
	ADDQ $0x02, BX
	ADDQ $0x02, AX
	JMP gen_emit_copy_end_snappy

emit_copy_three_standalone_snappy:
	// 3-byte form: tag = length<<2 - 2, then the 16-bit offset.
	LEAL -2(SI), SI
	MOVB SI, (AX)
	MOVW CX, 1(AX)
	ADDQ $0x03, BX
	ADDQ $0x03, AX

gen_emit_copy_end_snappy:
	MOVQ BX, ret+40(FP)
	RET

// func matchLen(a []byte, b []byte) int
// Requires: BMI
//
// matchLen returns the number of leading bytes that are equal in a and b.
// Only len(a) is read as the bound; len(b) is never loaded, so the caller
// presumably guarantees len(b) >= len(a) (NOTE(review): confirm at call
// sites). The length is compared with 32-bit instructions.
//
// Register use: AX = a, CX = b, DX = bytes left to compare, SI = matched
// length (return value), BX = scratch.
TEXT ·matchLen(SB), NOSPLIT, $0-56
	MOVQ a_base+0(FP), AX
	MOVQ b_base+24(FP), CX
	MOVQ a_len+8(FP), DX

	// matchLen
	XORL SI, SI
	CMPL DX, $0x08
	JL matchlen_match4_standalone

matchlen_loopback_standalone:
	// Compare 8 bytes at once; a non-zero XOR marks the first difference.
	MOVQ (AX)(SI*1), BX
	XORQ (CX)(SI*1), BX
	TESTQ BX, BX
	JZ matchlen_loop_standalone

	// Index of the lowest set bit, divided by 8, is the number of equal
	// low-order (first in memory) bytes.
#ifdef GOAMD64_v3
	TZCNTQ BX, BX

#else
	BSFQ BX, BX

#endif
	SARQ $0x03, BX
	LEAL (SI)(BX*1), SI
	JMP gen_match_len_end

matchlen_loop_standalone:
	LEAL -8(DX), DX
	LEAL 8(SI), SI
	CMPL DX, $0x08
	JGE matchlen_loopback_standalone
	JZ gen_match_len_end

matchlen_match4_standalone:
	// Fewer than 8 bytes left: try 4, then 2, then 1.
	CMPL DX, $0x04
	JL matchlen_match2_standalone
	MOVL (AX)(SI*1), BX
	CMPL (CX)(SI*1), BX
	JNE matchlen_match2_standalone
	SUBL $0x04, DX
	LEAL 4(SI), SI

matchlen_match2_standalone:
	CMPL DX, $0x02
	JL matchlen_match1_standalone
	MOVW (AX)(SI*1), BX
	CMPW (CX)(SI*1), BX
	JNE matchlen_match1_standalone
	SUBL $0x02, DX
	LEAL 2(SI), SI

matchlen_match1_standalone:
	CMPL DX, $0x01
	JL gen_match_len_end
	MOVB (AX)(SI*1), BL
	CMPB (CX)(SI*1), BL
	JNE gen_match_len_end
	LEAL 1(SI), SI

gen_match_len_end:
	MOVQ SI, ret+48(FP)
	RET

// func cvtLZ4BlockAsm(dst []byte, src []byte) (uncompressed int, dstUsed int)
// Requires: SSE2
//
// cvtLZ4BlockAsm walks the LZ4 sequences in src (token byte, optional
// extended literal length, literals, 16-bit offset, optional extended match
// length) and re-emits them into dst as literal, copy and repeat operations.
// A match whose offset equals the previous match offset is emitted as a
// repeat.
//
// Results:
//   success       : uncompressed = total literal+match bytes described,
//                   dstUsed = bytes written to dst.
//   corrupt input : uncompressed = -1 (dstUsed is not written).
//   dst too small : uncompressed = -2 (dstUsed is not written).
//
// Register use: AX = dst write pointer, CX = dst_base+len(dst)-10 (write
// limit), DX = src read pointer, BX = src end, SI = uncompressed byte count,
// DI = previous match offset (initially 0), R9 = literal length and later
// the match offset, R10 = match length.
TEXT ·cvtLZ4BlockAsm(SB), NOSPLIT, $0-64
	XORQ SI, SI
	MOVQ dst_base+0(FP), AX
	MOVQ dst_len+8(FP), CX
	MOVQ src_base+24(FP), DX
	MOVQ src_len+32(FP), BX
	LEAQ (DX)(BX*1), BX
	LEAQ -10(AX)(CX*1), CX
	XORQ DI, DI

lz4_s2_loop:
	CMPQ DX, BX
	JAE lz4_s2_corrupt
	CMPQ AX, CX
	JAE lz4_s2_dstfull
	// Token: high nibble = literal length, low nibble = match length.
	MOVBQZX (DX), R8
	MOVQ R8, R9
	MOVQ R8, R10
	SHRQ $0x04, R9
	ANDQ $0x0f, R10
	CMPQ R8, $0xf0
	JB lz4_s2_ll_end

lz4_s2_ll_loop:
	// Literal length nibble was 15: add following bytes until one != 0xff.
	INCQ DX
	CMPQ DX, BX
	JAE lz4_s2_corrupt
	MOVBQZX (DX), R8
	ADDQ R8, R9
	CMPQ R8, $0xff
	JEQ lz4_s2_ll_loop

lz4_s2_ll_end:
	// R8 = src position after the literals; match length gets its +4 bias.
	LEAQ (DX)(R9*1), R8
	ADDQ $0x04, R10
	CMPQ R8, BX
	JAE lz4_s2_corrupt
	INCQ DX
	INCQ R8
	TESTQ R9, R9
	JZ lz4_s2_lits_done
	LEAQ (AX)(R9*1), R11
	CMPQ R11, CX
	JAE lz4_s2_dstfull
	ADDQ R9, SI
	// Literal header, chosen on R11 = length-1 (same layout as emitLiteral).
	LEAL -1(R9), R11
	CMPL R11, $0x3c
	JLT one_byte_lz4_s2
	CMPL R11, $0x00000100
	JLT two_bytes_lz4_s2
	CMPL R11, $0x00010000
	JLT three_bytes_lz4_s2
	CMPL R11, $0x01000000
	JLT four_bytes_lz4_s2
	MOVB $0xfc, (AX)
	MOVL R11, 1(AX)
	ADDQ $0x05, AX
	JMP memmove_long_lz4_s2

four_bytes_lz4_s2:
	MOVL R11, R12
	SHRL $0x10, R12
	MOVB $0xf8, (AX)
	MOVW R11, 1(AX)
	MOVB R12, 3(AX)
	ADDQ $0x04, AX
	JMP memmove_long_lz4_s2

three_bytes_lz4_s2:
	MOVB $0xf4, (AX)
	MOVW R11, 1(AX)
	ADDQ $0x03, AX
	JMP memmove_long_lz4_s2

two_bytes_lz4_s2:
	MOVB $0xf0, (AX)
	MOVB R11, 1(AX)
	ADDQ $0x02, AX
	CMPL R11, $0x40
	JL memmove_lz4_s2
	JMP memmove_long_lz4_s2

one_byte_lz4_s2:
	SHLB $0x02, R11
	MOVB R11, (AX)
	ADDQ $0x01, AX

memmove_lz4_s2:
	// R11 = dst pointer after the literal bytes.
	LEAQ (AX)(R9*1), R11

	// genMemMoveShort
	// Lengths of 8 or less are copied with a single 8-byte load/store,
	// which may read and write past the literal.
	CMPQ R9, $0x08
	JLE emit_lit_memmove_lz4_s2_memmove_move_8
	CMPQ R9, $0x10
	JBE emit_lit_memmove_lz4_s2_memmove_move_8through16
	CMPQ R9, $0x20
	JBE emit_lit_memmove_lz4_s2_memmove_move_17through32
	JMP emit_lit_memmove_lz4_s2_memmove_move_33through64

emit_lit_memmove_lz4_s2_memmove_move_8:
	MOVQ (DX), R12
	MOVQ R12, (AX)
	JMP memmove_end_copy_lz4_s2

emit_lit_memmove_lz4_s2_memmove_move_8through16:
	MOVQ (DX), R12
	MOVQ -8(DX)(R9*1), DX
	MOVQ R12, (AX)
	MOVQ DX, -8(AX)(R9*1)
	JMP memmove_end_copy_lz4_s2

emit_lit_memmove_lz4_s2_memmove_move_17through32:
	MOVOU (DX), X0
	MOVOU -16(DX)(R9*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(R9*1)
	JMP memmove_end_copy_lz4_s2

emit_lit_memmove_lz4_s2_memmove_move_33through64:
	MOVOU (DX), X0
	MOVOU 16(DX), X1
	MOVOU -32(DX)(R9*1), X2
	MOVOU -16(DX)(R9*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R9*1)
	MOVOU X3, -16(AX)(R9*1)

memmove_end_copy_lz4_s2:
	MOVQ R11, AX
	JMP lz4_s2_lits_emit_done

memmove_long_lz4_s2:
	LEAQ (AX)(R9*1), R11

	// genMemMoveLong
	// Head and tail (32 bytes each) are saved in X0..X3 and stored last;
	// the middle is copied in 32-byte chunks with aligned stores.
	MOVOU (DX), X0
	MOVOU 16(DX), X1
	MOVOU -32(DX)(R9*1), X2
	MOVOU -16(DX)(R9*1), X3
	MOVQ R9, R13
	SHRQ $0x05, R13
	MOVQ AX, R12
	ANDL $0x0000001f, R12
	MOVQ $0x00000040, R14
	SUBQ R12, R14
	DECQ R13
	JA emit_lit_memmove_long_lz4_s2large_forward_sse_loop_32
	LEAQ -32(DX)(R14*1), R12
	LEAQ -32(AX)(R14*1), R15

emit_lit_memmove_long_lz4_s2large_big_loop_back:
	MOVOU (R12), X4
	MOVOU 16(R12), X5
	MOVOA X4, (R15)
	MOVOA X5, 16(R15)
	ADDQ $0x20, R15
	ADDQ $0x20, R12
	ADDQ $0x20, R14
	DECQ R13
	JNA emit_lit_memmove_long_lz4_s2large_big_loop_back

emit_lit_memmove_long_lz4_s2large_forward_sse_loop_32:
	MOVOU -32(DX)(R14*1), X4
	MOVOU -16(DX)(R14*1), X5
	MOVOA X4, -32(AX)(R14*1)
	MOVOA X5, -16(AX)(R14*1)
	ADDQ $0x20, R14
	CMPQ R9, R14
	JAE emit_lit_memmove_long_lz4_s2large_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R9*1)
	MOVOU X3, -16(AX)(R9*1)
	MOVQ R11, AX

lz4_s2_lits_emit_done:
	// Advance src past the literals.
	MOVQ R8, DX

lz4_s2_lits_done:
	// Reaching the end of src here is valid only if the match-length
	// nibble was zero (R10 == 4 after the bias); otherwise it is corrupt.
	CMPQ DX, BX
	JNE lz4_s2_match
	CMPQ R10, $0x04
	JEQ lz4_s2_done
	JMP lz4_s2_corrupt

lz4_s2_match:
	// Read the 16-bit offset; reject 0 and offsets beyond the output so far.
	LEAQ 2(DX), R8
	CMPQ R8, BX
	JAE lz4_s2_corrupt
	MOVWQZX (DX), R9
	MOVQ R8, DX
	TESTQ R9, R9
	JZ lz4_s2_corrupt
	CMPQ R9, SI
	JA lz4_s2_corrupt
	// 0x13 = 15 + 4: the match-length nibble was 15, so it is extended.
	CMPQ R10, $0x13
	JNE lz4_s2_ml_done

lz4_s2_ml_loop:
	MOVBQZX (DX), R8
	INCQ DX
	ADDQ R8, R10
	CMPQ DX, BX
	JAE lz4_s2_corrupt
	CMPQ R8, $0xff
	JEQ lz4_s2_ml_loop

lz4_s2_ml_done:
	ADDQ R10, SI
	// Same offset as the previous match: emit a repeat instead of a copy.
	CMPQ R9, DI
	JNE lz4_s2_docopy

	// emitRepeat
emit_repeat_again_lz4_s2:
	MOVL R10, R8
	LEAL -4(R10), R10
	CMPL R8, $0x08
	JLE repeat_two_lz4_s2
	CMPL R8, $0x0c
	JGE cant_repeat_two_offset_lz4_s2
	CMPL R9, $0x00000800
	JLT repeat_two_offset_lz4_s2

cant_repeat_two_offset_lz4_s2:
	CMPL R10, $0x00000104
	JLT repeat_three_lz4_s2
	CMPL R10, $0x00010100
	JLT repeat_four_lz4_s2
	CMPL R10, $0x0100ffff
	JLT repeat_five_lz4_s2
	LEAL -16842747(R10), R10
	MOVL $0xfffb001d, (AX)
	MOVB $0xff, 4(AX)
	ADDQ $0x05, AX
	JMP emit_repeat_again_lz4_s2

repeat_five_lz4_s2:
	LEAL -65536(R10), R10
	MOVL R10, R9
	MOVW $0x001d, (AX)
	MOVW R10, 2(AX)
	SARL $0x10, R9
	MOVB R9, 4(AX)
	ADDQ $0x05, AX
	JMP lz4_s2_loop

repeat_four_lz4_s2:
	LEAL -256(R10), R10
	MOVW $0x0019, (AX)
	MOVW R10, 2(AX)
	ADDQ $0x04, AX
	JMP lz4_s2_loop

repeat_three_lz4_s2:
	LEAL -4(R10), R10
	MOVW $0x0015, (AX)
	MOVB R10, 2(AX)
	ADDQ $0x03, AX
	JMP lz4_s2_loop

repeat_two_lz4_s2:
	SHLL $0x02, R10
	ORL $0x01, R10
	MOVW R10, (AX)
	ADDQ $0x02, AX
	JMP lz4_s2_loop

repeat_two_offset_lz4_s2:
	XORQ R8, R8
	LEAL 1(R8)(R10*4), R10
	MOVB R9, 1(AX)
	SARL $0x08, R9
	SHLL $0x05, R9
	ORL R9, R10
	MOVB R10, (AX)
	ADDQ $0x02, AX
	JMP lz4_s2_loop

lz4_s2_docopy:
	// Remember this offset for repeat detection on the next match.
	MOVQ R9, DI

	// emitCopy
	CMPL R10, $0x40
	JLE two_byte_offset_short_lz4_s2
	CMPL R9, $0x00000800
	JAE long_offset_short_lz4_s2
	// offset < 2048 and length > 64: 2-byte copy (8 bytes), then repeat.
	MOVL $0x00000001, R8
	LEAL 16(R8), R8
	MOVB R9, 1(AX)
	MOVL R9, R11
	SHRL $0x08, R11
	SHLL $0x05, R11
	ORL R11, R8
	MOVB R8, (AX)
	ADDQ $0x02, AX
	SUBL $0x08, R10

	// emitRepeat
	LEAL -4(R10), R10
	JMP cant_repeat_two_offset_lz4_s2_emit_copy_short_2b

emit_repeat_again_lz4_s2_emit_copy_short_2b:
	MOVL R10, R8
	LEAL -4(R10), R10
	CMPL R8, $0x08
	JLE repeat_two_lz4_s2_emit_copy_short_2b
	CMPL R8, $0x0c
	JGE cant_repeat_two_offset_lz4_s2_emit_copy_short_2b
	CMPL R9, $0x00000800
	JLT repeat_two_offset_lz4_s2_emit_copy_short_2b

cant_repeat_two_offset_lz4_s2_emit_copy_short_2b:
	CMPL R10, $0x00000104
	JLT repeat_three_lz4_s2_emit_copy_short_2b
	CMPL R10, $0x00010100
	JLT repeat_four_lz4_s2_emit_copy_short_2b
	CMPL R10, $0x0100ffff
	JLT repeat_five_lz4_s2_emit_copy_short_2b
	LEAL -16842747(R10), R10
	MOVL $0xfffb001d, (AX)
	MOVB $0xff, 4(AX)
	ADDQ $0x05, AX
	JMP emit_repeat_again_lz4_s2_emit_copy_short_2b

repeat_five_lz4_s2_emit_copy_short_2b:
	LEAL -65536(R10), R10
	MOVL R10, R9
	MOVW $0x001d, (AX)
	MOVW R10, 2(AX)
	SARL $0x10, R9
	MOVB R9, 4(AX)
	ADDQ $0x05, AX
	JMP lz4_s2_loop

repeat_four_lz4_s2_emit_copy_short_2b:
	LEAL -256(R10), R10
	MOVW $0x0019, (AX)
	MOVW R10, 2(AX)
	ADDQ $0x04, AX
	JMP lz4_s2_loop

repeat_three_lz4_s2_emit_copy_short_2b:
	LEAL -4(R10), R10
	MOVW $0x0015, (AX)
	MOVB R10, 2(AX)
	ADDQ $0x03, AX
	JMP lz4_s2_loop

repeat_two_lz4_s2_emit_copy_short_2b:
	SHLL $0x02, R10
	ORL $0x01, R10
	MOVW R10, (AX)
	ADDQ $0x02, AX
	JMP lz4_s2_loop

repeat_two_offset_lz4_s2_emit_copy_short_2b:
	XORQ R8, R8
	LEAL 1(R8)(R10*4), R10
	MOVB R9, 1(AX)
	SARL $0x08, R9
	SHLL $0x05, R9
	ORL R9, R10
	MOVB R10, (AX)
	ADDQ $0x02, AX
	JMP lz4_s2_loop

long_offset_short_lz4_s2:
	// 0xee + 2-byte offset (60 bytes), then the remainder as a repeat.
	MOVB $0xee, (AX)
	MOVW R9, 1(AX)
	LEAL -60(R10), R10
	ADDQ $0x03, AX

	// emitRepeat
emit_repeat_again_lz4_s2_emit_copy_short:
	MOVL R10, R8
	LEAL -4(R10), R10
	CMPL R8, $0x08
	JLE repeat_two_lz4_s2_emit_copy_short
	CMPL R8, $0x0c
	JGE cant_repeat_two_offset_lz4_s2_emit_copy_short
	CMPL R9, $0x00000800
	JLT repeat_two_offset_lz4_s2_emit_copy_short

cant_repeat_two_offset_lz4_s2_emit_copy_short:
	CMPL R10, $0x00000104
	JLT repeat_three_lz4_s2_emit_copy_short
	CMPL R10, $0x00010100
	JLT repeat_four_lz4_s2_emit_copy_short
	CMPL R10, $0x0100ffff
	JLT repeat_five_lz4_s2_emit_copy_short
	LEAL -16842747(R10), R10
	MOVL $0xfffb001d, (AX)
	MOVB $0xff, 4(AX)
	ADDQ $0x05, AX
	JMP emit_repeat_again_lz4_s2_emit_copy_short

repeat_five_lz4_s2_emit_copy_short:
	LEAL -65536(R10), R10
	MOVL R10, R9
	MOVW $0x001d, (AX)
	MOVW R10, 2(AX)
	SARL $0x10, R9
	MOVB R9, 4(AX)
	ADDQ $0x05, AX
	JMP lz4_s2_loop

repeat_four_lz4_s2_emit_copy_short:
	LEAL -256(R10), R10
	MOVW $0x0019, (AX)
	MOVW R10, 2(AX)
	ADDQ $0x04, AX
	JMP lz4_s2_loop

repeat_three_lz4_s2_emit_copy_short:
	LEAL -4(R10), R10
	MOVW $0x0015, (AX)
	MOVB R10, 2(AX)
	ADDQ $0x03, AX
	JMP lz4_s2_loop

repeat_two_lz4_s2_emit_copy_short:
	SHLL $0x02, R10
	ORL $0x01, R10
	MOVW R10, (AX)
	ADDQ $0x02, AX
	JMP lz4_s2_loop

repeat_two_offset_lz4_s2_emit_copy_short:
	XORQ R8, R8
	LEAL 1(R8)(R10*4), R10
	MOVB R9, 1(AX)
	SARL $0x08, R9
	SHLL $0x05, R9
	ORL R9, R10
	MOVB R10, (AX)
	ADDQ $0x02, AX
	JMP lz4_s2_loop

two_byte_offset_short_lz4_s2:
	// length <= 64: 2-byte form if length < 12 and offset < 2048,
	// otherwise the 3-byte form.
	MOVL R10, R8
	SHLL $0x02, R8
	CMPL R10, $0x0c
	JGE emit_copy_three_lz4_s2
	CMPL R9, $0x00000800
	JGE emit_copy_three_lz4_s2
	LEAL -15(R8), R8
	MOVB R9, 1(AX)
	SHRL $0x08, R9
	SHLL $0x05, R9
	ORL R9, R8
	MOVB R8, (AX)
	ADDQ $0x02, AX
	JMP lz4_s2_loop

emit_copy_three_lz4_s2:
	LEAL -2(R8), R8
	MOVB R8, (AX)
	MOVW R9, 1(AX)
	ADDQ $0x03, AX
	JMP lz4_s2_loop

lz4_s2_done:
	// dstUsed = current write pointer - dst base.
	MOVQ dst_base+0(FP), CX
	SUBQ CX, AX
	MOVQ SI, uncompressed+48(FP)
	MOVQ AX, dstUsed+56(FP)
	RET

lz4_s2_corrupt:
	// uncompressed = -1
	XORQ AX, AX
	LEAQ -1(AX), SI
	MOVQ SI, uncompressed+48(FP)
	RET

lz4_s2_dstfull:
	// uncompressed = -2
	XORQ AX, AX
	LEAQ -2(AX), SI
	MOVQ SI, uncompressed+48(FP)
	RET

// func cvtLZ4BlockSnappyAsm(dst []byte, src []byte) (uncompressed int, dstUsed int)
// Requires: SSE2
//
// cvtLZ4BlockSnappyAsm walks the LZ4 sequences in src and re-emits them into
// dst as literal and copy operations only; unlike cvtLZ4BlockAsm it tracks
// no previous offset and emits no repeat codes.
//
// Results:
//   success       : uncompressed = total literal+match bytes described,
//                   dstUsed = bytes written to dst.
//   corrupt input : uncompressed = -1 (dstUsed is not written).
//   dst too small : uncompressed = -2 (dstUsed is not written).
//
// Register use: AX = dst write pointer, CX = dst_base+len(dst)-10 (write
// limit), DX = src read pointer, BX = src end, SI = uncompressed byte count,
// R8 = literal length and later the match offset, R9 = match length.
TEXT ·cvtLZ4BlockSnappyAsm(SB), NOSPLIT, $0-64
	XORQ SI, SI
	MOVQ dst_base+0(FP), AX
	MOVQ dst_len+8(FP), CX
	MOVQ src_base+24(FP), DX
	MOVQ src_len+32(FP), BX
	LEAQ (DX)(BX*1), BX
	LEAQ -10(AX)(CX*1), CX

lz4_snappy_loop:
	CMPQ DX, BX
	JAE lz4_snappy_corrupt
	CMPQ AX, CX
	JAE lz4_snappy_dstfull
	// Token: high nibble = literal length, low nibble = match length.
	MOVBQZX (DX), DI
	MOVQ DI, R8
	MOVQ DI, R9
	SHRQ $0x04, R8
	ANDQ $0x0f, R9
	CMPQ DI, $0xf0
	JB lz4_snappy_ll_end

lz4_snappy_ll_loop:
	// Literal length nibble was 15: add following bytes until one != 0xff.
	INCQ DX
	CMPQ DX, BX
	JAE lz4_snappy_corrupt
	MOVBQZX (DX), DI
	ADDQ DI, R8
	CMPQ DI, $0xff
	JEQ lz4_snappy_ll_loop

lz4_snappy_ll_end:
	// DI = src position after the literals; match length gets its +4 bias.
	LEAQ (DX)(R8*1), DI
	ADDQ $0x04, R9
	CMPQ DI, BX
	JAE lz4_snappy_corrupt
	INCQ DX
	INCQ DI
	TESTQ R8, R8
	JZ lz4_snappy_lits_done
	LEAQ (AX)(R8*1), R10
	CMPQ R10, CX
	JAE lz4_snappy_dstfull
	ADDQ R8, SI
	// Literal header, chosen on R10 = length-1 (same layout as emitLiteral).
	LEAL -1(R8), R10
	CMPL R10, $0x3c
	JLT one_byte_lz4_snappy
	CMPL R10, $0x00000100
	JLT two_bytes_lz4_snappy
	CMPL R10, $0x00010000
	JLT three_bytes_lz4_snappy
	CMPL R10, $0x01000000
	JLT four_bytes_lz4_snappy
	MOVB $0xfc, (AX)
	MOVL R10, 1(AX)
	ADDQ $0x05, AX
	JMP memmove_long_lz4_snappy

four_bytes_lz4_snappy:
	MOVL R10, R11
	SHRL $0x10, R11
	MOVB $0xf8, (AX)
	MOVW R10, 1(AX)
	MOVB R11, 3(AX)
	ADDQ $0x04, AX
	JMP memmove_long_lz4_snappy

three_bytes_lz4_snappy:
	MOVB $0xf4, (AX)
	MOVW R10, 1(AX)
	ADDQ $0x03, AX
	JMP memmove_long_lz4_snappy

two_bytes_lz4_snappy:
	MOVB $0xf0, (AX)
	MOVB R10, 1(AX)
	ADDQ $0x02, AX
	CMPL R10, $0x40
	JL memmove_lz4_snappy
	JMP memmove_long_lz4_snappy

one_byte_lz4_snappy:
	SHLB $0x02, R10
	MOVB R10, (AX)
	ADDQ $0x01, AX

memmove_lz4_snappy:
	// R10 = dst pointer after the literal bytes.
	LEAQ (AX)(R8*1), R10

	// genMemMoveShort
	// Lengths of 8 or less are copied with a single 8-byte load/store,
	// which may read and write past the literal.
	CMPQ R8, $0x08
	JLE emit_lit_memmove_lz4_snappy_memmove_move_8
	CMPQ R8, $0x10
	JBE emit_lit_memmove_lz4_snappy_memmove_move_8through16
	CMPQ R8, $0x20
	JBE emit_lit_memmove_lz4_snappy_memmove_move_17through32
	JMP emit_lit_memmove_lz4_snappy_memmove_move_33through64

emit_lit_memmove_lz4_snappy_memmove_move_8:
	MOVQ (DX), R11
	MOVQ R11, (AX)
	JMP memmove_end_copy_lz4_snappy

emit_lit_memmove_lz4_snappy_memmove_move_8through16:
	MOVQ (DX), R11
	MOVQ -8(DX)(R8*1), DX
	MOVQ R11, (AX)
	MOVQ DX, -8(AX)(R8*1)
	JMP memmove_end_copy_lz4_snappy

emit_lit_memmove_lz4_snappy_memmove_move_17through32:
	MOVOU (DX), X0
	MOVOU -16(DX)(R8*1), X1
	MOVOU X0, (AX)
	MOVOU X1, -16(AX)(R8*1)
	JMP memmove_end_copy_lz4_snappy

emit_lit_memmove_lz4_snappy_memmove_move_33through64:
	MOVOU (DX), X0
	MOVOU 16(DX), X1
	MOVOU -32(DX)(R8*1), X2
	MOVOU -16(DX)(R8*1), X3
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)

memmove_end_copy_lz4_snappy:
	MOVQ R10, AX
	JMP lz4_snappy_lits_emit_done

memmove_long_lz4_snappy:
	LEAQ (AX)(R8*1), R10

	// genMemMoveLong
	// Head and tail (32 bytes each) are saved in X0..X3 and stored last;
	// the middle is copied in 32-byte chunks with aligned stores.
	MOVOU (DX), X0
	MOVOU 16(DX), X1
	MOVOU -32(DX)(R8*1), X2
	MOVOU -16(DX)(R8*1), X3
	MOVQ R8, R12
	SHRQ $0x05, R12
	MOVQ AX, R11
	ANDL $0x0000001f, R11
	MOVQ $0x00000040, R13
	SUBQ R11, R13
	DECQ R12
	JA emit_lit_memmove_long_lz4_snappylarge_forward_sse_loop_32
	LEAQ -32(DX)(R13*1), R11
	LEAQ -32(AX)(R13*1), R14

emit_lit_memmove_long_lz4_snappylarge_big_loop_back:
	MOVOU (R11), X4
	MOVOU 16(R11), X5
	MOVOA X4, (R14)
	MOVOA X5, 16(R14)
	ADDQ $0x20, R14
	ADDQ $0x20, R11
	ADDQ $0x20, R13
	DECQ R12
	JNA emit_lit_memmove_long_lz4_snappylarge_big_loop_back

emit_lit_memmove_long_lz4_snappylarge_forward_sse_loop_32:
	MOVOU -32(DX)(R13*1), X4
	MOVOU -16(DX)(R13*1), X5
	MOVOA X4, -32(AX)(R13*1)
	MOVOA X5, -16(AX)(R13*1)
	ADDQ $0x20, R13
	CMPQ R8, R13
	JAE emit_lit_memmove_long_lz4_snappylarge_forward_sse_loop_32
	MOVOU X0, (AX)
	MOVOU X1, 16(AX)
	MOVOU X2, -32(AX)(R8*1)
	MOVOU X3, -16(AX)(R8*1)
	MOVQ R10, AX

lz4_snappy_lits_emit_done:
	// Advance src past the literals.
	MOVQ DI, DX

lz4_snappy_lits_done:
	// Reaching the end of src here is valid only if the match-length
	// nibble was zero (R9 == 4 after the bias); otherwise it is corrupt.
	CMPQ DX, BX
	JNE lz4_snappy_match
	CMPQ R9, $0x04
	JEQ lz4_snappy_done
	JMP lz4_snappy_corrupt

lz4_snappy_match:
	// Read the 16-bit offset; reject 0 and offsets beyond the output so far.
	LEAQ 2(DX), DI
	CMPQ DI, BX
	JAE lz4_snappy_corrupt
	MOVWQZX (DX), R8
	MOVQ DI, DX
	TESTQ R8, R8
	JZ lz4_snappy_corrupt
	CMPQ R8, SI
	JA lz4_snappy_corrupt
	// 0x13 = 15 + 4: the match-length nibble was 15, so it is extended.
	CMPQ R9, $0x13
	JNE lz4_snappy_ml_done

lz4_snappy_ml_loop:
	MOVBQZX (DX), DI
	INCQ DX
	ADDQ DI, R9
	CMPQ DX, BX
	JAE lz4_snappy_corrupt
	CMPQ DI, $0xff
	JEQ lz4_snappy_ml_loop

lz4_snappy_ml_done:
	ADDQ R9, SI

	// emitCopy
two_byte_offset_lz4_s2:
	// While length > 64, emit 0xee + 2-byte offset per 60 bytes. If the
	// write pointer reaches the limit, control returns to the main loop,
	// whose checks then run before anything else is written.
	CMPL R9, $0x40
	JLE two_byte_offset_short_lz4_s2
	MOVB $0xee, (AX)
	MOVW R8, 1(AX)
	LEAL -60(R9), R9
	ADDQ $0x03, AX
	CMPQ AX, CX
	JAE lz4_snappy_loop
	JMP two_byte_offset_lz4_s2

two_byte_offset_short_lz4_s2:
	// length <= 64: 2-byte form if length < 12 and offset < 2048,
	// otherwise the 3-byte form.
	MOVL R9, DI
	SHLL $0x02, DI
	CMPL R9, $0x0c
	JGE emit_copy_three_lz4_s2
	CMPL R8, $0x00000800
	JGE emit_copy_three_lz4_s2
	LEAL -15(DI), DI
	MOVB R8, 1(AX)
	SHRL $0x08, R8
	SHLL $0x05, R8
	ORL R8, DI
	MOVB DI, (AX)
	ADDQ $0x02, AX
	JMP lz4_snappy_loop

emit_copy_three_lz4_s2:
	LEAL -2(DI), DI
	MOVB DI, (AX)
	MOVW R8, 1(AX)
	ADDQ $0x03, AX
	JMP lz4_snappy_loop

lz4_snappy_done:
	// dstUsed = current write pointer - dst base.
	MOVQ dst_base+0(FP), CX
	SUBQ CX, AX
	MOVQ SI, uncompressed+48(FP)
	MOVQ AX, dstUsed+56(FP)
	RET

lz4_snappy_corrupt:
	// uncompressed = -1
	XORQ AX, AX
	LEAQ -1(AX), SI
	MOVQ SI, uncompressed+48(FP)
	RET

lz4_snappy_dstfull:
	// uncompressed = -2
	XORQ AX, AX
	LEAQ -2(AX), SI
	MOVQ SI, uncompressed+48(FP)
	RET