// Code generated by command: go run gen.go -out ../seqdec_amd64.s -pkg=zstd. DO NOT EDIT.

//go:build !appengine && !noasm && gc

// func sequenceDecs_decode_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
// Requires: CMOV
TEXT ·sequenceDecs_decode_amd64(SB), $8-32
    MOVQ br+8(FP), AX
    MOVQ 32(AX), DX
    MOVBQZX 40(AX), BX
    MOVQ 24(AX), SI
    MOVQ (AX), AX
    ADDQ SI, AX
    MOVQ AX, (SP)
    MOVQ ctx+16(FP), AX
    MOVQ 72(AX), DI
    MOVQ 80(AX), R8
    MOVQ 88(AX), R9
    MOVQ 104(AX), R10
    MOVQ s+0(FP), AX
    MOVQ 144(AX), R11
    MOVQ 152(AX), R12
    MOVQ 160(AX), R13

sequenceDecs_decode_amd64_main_loop:
    MOVQ (SP), R14

    // Fill bitreader to have enough for the offset and match length.
    CMPQ SI, $0x08
    JL sequenceDecs_decode_amd64_fill_byte_by_byte
    MOVQ BX, AX
    SHRQ $0x03, AX
    SUBQ AX, R14
    MOVQ (R14), DX
    SUBQ AX, SI
    ANDQ $0x07, BX
    JMP sequenceDecs_decode_amd64_fill_end

sequenceDecs_decode_amd64_fill_byte_by_byte:
    CMPQ SI, $0x00
    JLE sequenceDecs_decode_amd64_fill_end
    CMPQ BX, $0x07
    JLE sequenceDecs_decode_amd64_fill_end
    SHLQ $0x08, DX
    SUBQ $0x01, R14
    SUBQ $0x01, SI
    SUBQ $0x08, BX
    MOVBQZX (R14), AX
    ORQ AX, DX
    JMP sequenceDecs_decode_amd64_fill_byte_by_byte

sequenceDecs_decode_amd64_fill_end:
    // Update offset
    MOVQ R9, AX
    MOVQ BX, CX
    MOVQ DX, R15
    SHLQ CL, R15
    MOVB AH, CL
    SHRQ $0x20, AX
    TESTQ CX, CX
    JZ sequenceDecs_decode_amd64_of_update_zero
    ADDQ CX, BX
    CMPQ BX, $0x40
    JA sequenceDecs_decode_amd64_of_update_zero
    CMPQ CX, $0x40
    JAE sequenceDecs_decode_amd64_of_update_zero
    NEGQ CX
    SHRQ CL, R15
    ADDQ R15, AX

sequenceDecs_decode_amd64_of_update_zero:
    MOVQ AX, 16(R10)

    // Update match length
    MOVQ R8, AX
    MOVQ BX, CX
    MOVQ DX, R15
    SHLQ CL, R15
    MOVB AH, CL
    SHRQ $0x20, AX
    TESTQ CX, CX
    JZ sequenceDecs_decode_amd64_ml_update_zero
    ADDQ CX, BX
    CMPQ BX, $0x40
    JA sequenceDecs_decode_amd64_ml_update_zero
    CMPQ CX, $0x40
    JAE sequenceDecs_decode_amd64_ml_update_zero
    NEGQ CX
    SHRQ CL, R15
    ADDQ R15, AX

sequenceDecs_decode_amd64_ml_update_zero:
    MOVQ AX, 8(R10)

    // Fill bitreader to have enough for the remaining
    CMPQ SI, $0x08
    JL sequenceDecs_decode_amd64_fill_2_byte_by_byte
    MOVQ BX, AX
    SHRQ $0x03, AX
    SUBQ AX, R14
    MOVQ (R14), DX
    SUBQ AX, SI
    ANDQ $0x07, BX
    JMP sequenceDecs_decode_amd64_fill_2_end

sequenceDecs_decode_amd64_fill_2_byte_by_byte:
    CMPQ SI, $0x00
    JLE sequenceDecs_decode_amd64_fill_2_end
    CMPQ BX, $0x07
    JLE sequenceDecs_decode_amd64_fill_2_end
    SHLQ $0x08, DX
    SUBQ $0x01, R14
    SUBQ $0x01, SI
    SUBQ $0x08, BX
    MOVBQZX (R14), AX
    ORQ AX, DX
    JMP sequenceDecs_decode_amd64_fill_2_byte_by_byte

sequenceDecs_decode_amd64_fill_2_end:
    // Update literal length
    MOVQ DI, AX
    MOVQ BX, CX
    MOVQ DX, R15
    SHLQ CL, R15
    MOVB AH, CL
    SHRQ $0x20, AX
    TESTQ CX, CX
    JZ sequenceDecs_decode_amd64_ll_update_zero
    ADDQ CX, BX
    CMPQ BX, $0x40
    JA sequenceDecs_decode_amd64_ll_update_zero
    CMPQ CX, $0x40
    JAE sequenceDecs_decode_amd64_ll_update_zero
    NEGQ CX
    SHRQ CL, R15
    ADDQ R15, AX

sequenceDecs_decode_amd64_ll_update_zero:
    MOVQ AX, (R10)

    // Fill bitreader for state updates
    MOVQ R14, (SP)
    MOVQ R9, AX
    SHRQ $0x08, AX
    MOVBQZX AL, AX
    MOVQ ctx+16(FP), CX
    CMPQ 96(CX), $0x00
    JZ sequenceDecs_decode_amd64_skip_update

    // Update Literal Length State
    MOVBQZX DI, R14
    SHRQ $0x10, DI
    MOVWQZX DI, DI
    LEAQ (BX)(R14*1), CX
    MOVQ DX, R15
    MOVQ CX, BX
    ROLQ CL, R15
    MOVL $0x00000001, BP
    MOVB R14, CL
    SHLL CL, BP
    DECL BP
    ANDQ BP, R15
    ADDQ R15, DI

    // Load ctx.llTable
    MOVQ ctx+16(FP), CX
    MOVQ (CX), CX
    MOVQ (CX)(DI*8), DI

    // Update Match Length State
    MOVBQZX R8, R14
    SHRQ $0x10, R8
    MOVWQZX R8, R8
    LEAQ (BX)(R14*1), CX
    MOVQ DX, R15
    MOVQ CX, BX
    ROLQ CL, R15
    MOVL $0x00000001, BP
    MOVB R14, CL
    SHLL CL, BP
    DECL BP
    ANDQ BP, R15
    ADDQ R15, R8

    // Load ctx.mlTable
    MOVQ ctx+16(FP), CX
    MOVQ 24(CX), CX
    MOVQ (CX)(R8*8), R8

    // Update Offset State
    MOVBQZX R9, R14
    SHRQ $0x10, R9
    MOVWQZX R9, R9
    LEAQ (BX)(R14*1), CX
    MOVQ DX, R15
    MOVQ CX, BX
    ROLQ CL, R15
    MOVL $0x00000001, BP
    MOVB R14, CL
    SHLL CL, BP
    DECL BP
    ANDQ BP, R15
    ADDQ R15, R9

    // Load ctx.ofTable
    MOVQ ctx+16(FP), CX
    MOVQ 48(CX), CX
    MOVQ (CX)(R9*8), R9

sequenceDecs_decode_amd64_skip_update:
    // Adjust offset
    MOVQ 16(R10), CX
    CMPQ AX, $0x01
    JBE sequenceDecs_decode_amd64_adjust_offsetB_1_or_0
    MOVQ R12, R13
    MOVQ R11, R12
    MOVQ CX, R11
    JMP sequenceDecs_decode_amd64_after_adjust

sequenceDecs_decode_amd64_adjust_offsetB_1_or_0:
    CMPQ (R10), $0x00000000
    JNE sequenceDecs_decode_amd64_adjust_offset_maybezero
    INCQ CX
    JMP sequenceDecs_decode_amd64_adjust_offset_nonzero

sequenceDecs_decode_amd64_adjust_offset_maybezero:
    TESTQ CX, CX
    JNZ sequenceDecs_decode_amd64_adjust_offset_nonzero
    MOVQ R11, CX
    JMP sequenceDecs_decode_amd64_after_adjust

sequenceDecs_decode_amd64_adjust_offset_nonzero:
    CMPQ CX, $0x01
    JB sequenceDecs_decode_amd64_adjust_zero
    JEQ sequenceDecs_decode_amd64_adjust_one
    CMPQ CX, $0x02
    JA sequenceDecs_decode_amd64_adjust_three
    JMP sequenceDecs_decode_amd64_adjust_two

sequenceDecs_decode_amd64_adjust_zero:
    MOVQ R11, AX
    JMP sequenceDecs_decode_amd64_adjust_test_temp_valid

sequenceDecs_decode_amd64_adjust_one:
    MOVQ R12, AX
    JMP sequenceDecs_decode_amd64_adjust_test_temp_valid

sequenceDecs_decode_amd64_adjust_two:
    MOVQ R13, AX
    JMP sequenceDecs_decode_amd64_adjust_test_temp_valid

sequenceDecs_decode_amd64_adjust_three:
    LEAQ -1(R11), AX

sequenceDecs_decode_amd64_adjust_test_temp_valid:
    TESTQ AX, AX
    JNZ sequenceDecs_decode_amd64_adjust_temp_valid
    MOVQ $0x00000001, AX

sequenceDecs_decode_amd64_adjust_temp_valid:
    CMPQ CX, $0x01
    CMOVQNE R12, R13
    MOVQ R11, R12
    MOVQ AX, R11
    MOVQ AX, CX

sequenceDecs_decode_amd64_after_adjust:
    MOVQ CX, 16(R10)

    // Check values
    MOVQ 8(R10), AX
    MOVQ (R10), R14
    LEAQ (AX)(R14*1), R15
    MOVQ s+0(FP), BP
    ADDQ R15, 256(BP)
    MOVQ ctx+16(FP), R15
    SUBQ R14, 128(R15)
    JS error_not_enough_literals
    CMPQ AX, $0x00020002
    JA sequenceDecs_decode_amd64_error_match_len_too_big
    TESTQ CX, CX
    JNZ sequenceDecs_decode_amd64_match_len_ofs_ok
    TESTQ AX, AX
    JNZ sequenceDecs_decode_amd64_error_match_len_ofs_mismatch

sequenceDecs_decode_amd64_match_len_ofs_ok:
    ADDQ $0x18, R10
    MOVQ ctx+16(FP), AX
    DECQ 96(AX)
    JNS sequenceDecs_decode_amd64_main_loop
    MOVQ s+0(FP), AX
    MOVQ R11, 144(AX)
    MOVQ R12, 152(AX)
    MOVQ R13, 160(AX)
    MOVQ br+8(FP), AX
    MOVQ DX, 32(AX)
    MOVB BL, 40(AX)
    MOVQ SI, 24(AX)

    // Return success
    MOVQ $0x00000000, ret+24(FP)
    RET

    // Return with match length error
sequenceDecs_decode_amd64_error_match_len_ofs_mismatch:
    MOVQ $0x00000001, ret+24(FP)
    RET

    // Return with match too long error
sequenceDecs_decode_amd64_error_match_len_too_big:
    MOVQ $0x00000002, ret+24(FP)
    RET

    // Return with match offset too long error
    MOVQ $0x00000003, ret+24(FP)
    RET

    // Return with not enough literals error
error_not_enough_literals:
    MOVQ $0x00000004, ret+24(FP)
    RET

    // Return with not enough output space error
    MOVQ $0x00000005, ret+24(FP)
    RET

// func sequenceDecs_decode_56_amd64(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
// Requires: CMOV
TEXT ·sequenceDecs_decode_56_amd64(SB), $8-32
    MOVQ br+8(FP), AX
    MOVQ 32(AX), DX
    MOVBQZX 40(AX), BX
    MOVQ 24(AX), SI
    MOVQ (AX), AX
    ADDQ SI, AX
    MOVQ AX, (SP)
    MOVQ ctx+16(FP), AX
    MOVQ 72(AX), DI
    MOVQ 80(AX), R8
    MOVQ 88(AX), R9
    MOVQ 104(AX), R10
    MOVQ s+0(FP), AX
    MOVQ 144(AX), R11
    MOVQ 152(AX), R12
    MOVQ 160(AX), R13

sequenceDecs_decode_56_amd64_main_loop:
    MOVQ (SP), R14

    // Fill bitreader to have enough for the offset and match length.
    CMPQ SI, $0x08
    JL sequenceDecs_decode_56_amd64_fill_byte_by_byte
    MOVQ BX, AX
    SHRQ $0x03, AX
    SUBQ AX, R14
    MOVQ (R14), DX
    SUBQ AX, SI
    ANDQ $0x07, BX
    JMP sequenceDecs_decode_56_amd64_fill_end

sequenceDecs_decode_56_amd64_fill_byte_by_byte:
    CMPQ SI, $0x00
    JLE sequenceDecs_decode_56_amd64_fill_end
    CMPQ BX, $0x07
    JLE sequenceDecs_decode_56_amd64_fill_end
    SHLQ $0x08, DX
    SUBQ $0x01, R14
    SUBQ $0x01, SI
    SUBQ $0x08, BX
    MOVBQZX (R14), AX
    ORQ AX, DX
    JMP sequenceDecs_decode_56_amd64_fill_byte_by_byte

sequenceDecs_decode_56_amd64_fill_end:
    // Update offset
    MOVQ R9, AX
    MOVQ BX, CX
    MOVQ DX, R15
    SHLQ CL, R15
    MOVB AH, CL
    SHRQ $0x20, AX
    TESTQ CX, CX
    JZ sequenceDecs_decode_56_amd64_of_update_zero
    ADDQ CX, BX
    CMPQ BX, $0x40
    JA sequenceDecs_decode_56_amd64_of_update_zero
    CMPQ CX, $0x40
    JAE sequenceDecs_decode_56_amd64_of_update_zero
    NEGQ CX
    SHRQ CL, R15
    ADDQ R15, AX

sequenceDecs_decode_56_amd64_of_update_zero:
    MOVQ AX, 16(R10)

    // Update match length
    MOVQ R8, AX
    MOVQ BX, CX
    MOVQ DX, R15
    SHLQ CL, R15
    MOVB AH, CL
    SHRQ $0x20, AX
    TESTQ CX, CX
    JZ sequenceDecs_decode_56_amd64_ml_update_zero
    ADDQ CX, BX
    CMPQ BX, $0x40
    JA sequenceDecs_decode_56_amd64_ml_update_zero
    CMPQ CX, $0x40
    JAE sequenceDecs_decode_56_amd64_ml_update_zero
    NEGQ CX
    SHRQ CL, R15
    ADDQ R15, AX

sequenceDecs_decode_56_amd64_ml_update_zero:
    MOVQ AX, 8(R10)

    // Update literal length
    MOVQ DI, AX
    MOVQ BX, CX
    MOVQ DX, R15
    SHLQ CL, R15
    MOVB AH, CL
    SHRQ $0x20, AX
    TESTQ CX, CX
    JZ sequenceDecs_decode_56_amd64_ll_update_zero
    ADDQ CX, BX
    CMPQ BX, $0x40
    JA sequenceDecs_decode_56_amd64_ll_update_zero
    CMPQ CX, $0x40
    JAE sequenceDecs_decode_56_amd64_ll_update_zero
    NEGQ CX
    SHRQ CL, R15
    ADDQ R15, AX

sequenceDecs_decode_56_amd64_ll_update_zero:
    MOVQ AX, (R10)

    // Fill bitreader for state updates
    MOVQ R14, (SP)
    MOVQ R9, AX
    SHRQ $0x08, AX
    MOVBQZX AL, AX
    MOVQ ctx+16(FP), CX
    CMPQ 96(CX), $0x00
    JZ sequenceDecs_decode_56_amd64_skip_update

    // Update Literal Length State
    MOVBQZX DI, R14
    SHRQ $0x10, DI
    MOVWQZX DI, DI
    LEAQ (BX)(R14*1), CX
    MOVQ DX, R15
    MOVQ CX, BX
    ROLQ CL, R15
    MOVL $0x00000001, BP
    MOVB R14, CL
    SHLL CL, BP
    DECL BP
    ANDQ BP, R15
    ADDQ R15, DI

    // Load ctx.llTable
    MOVQ ctx+16(FP), CX
    MOVQ (CX), CX
    MOVQ (CX)(DI*8), DI

    // Update Match Length State
    MOVBQZX R8, R14
    SHRQ $0x10, R8
    MOVWQZX R8, R8
    LEAQ (BX)(R14*1), CX
    MOVQ DX, R15
    MOVQ CX, BX
    ROLQ CL, R15
    MOVL $0x00000001, BP
    MOVB R14, CL
    SHLL CL, BP
    DECL BP
    ANDQ BP, R15
    ADDQ R15, R8

    // Load ctx.mlTable
    MOVQ ctx+16(FP), CX
    MOVQ 24(CX), CX
    MOVQ (CX)(R8*8), R8

    // Update Offset State
    MOVBQZX R9, R14
    SHRQ $0x10, R9
    MOVWQZX R9, R9
    LEAQ (BX)(R14*1), CX
    MOVQ DX, R15
    MOVQ CX, BX
    ROLQ CL, R15
    MOVL $0x00000001, BP
    MOVB R14, CL
    SHLL CL, BP
    DECL BP
    ANDQ BP, R15
    ADDQ R15, R9

    // Load ctx.ofTable
    MOVQ ctx+16(FP), CX
    MOVQ 48(CX), CX
    MOVQ (CX)(R9*8), R9

sequenceDecs_decode_56_amd64_skip_update:
    // Adjust offset
    MOVQ 16(R10), CX
    CMPQ AX, $0x01
    JBE sequenceDecs_decode_56_amd64_adjust_offsetB_1_or_0
    MOVQ R12, R13
    MOVQ R11, R12
    MOVQ CX, R11
    JMP sequenceDecs_decode_56_amd64_after_adjust

sequenceDecs_decode_56_amd64_adjust_offsetB_1_or_0:
    CMPQ (R10), $0x00000000
    JNE sequenceDecs_decode_56_amd64_adjust_offset_maybezero
    INCQ CX
    JMP sequenceDecs_decode_56_amd64_adjust_offset_nonzero

sequenceDecs_decode_56_amd64_adjust_offset_maybezero:
    TESTQ CX, CX
    JNZ sequenceDecs_decode_56_amd64_adjust_offset_nonzero
    MOVQ R11, CX
    JMP sequenceDecs_decode_56_amd64_after_adjust

sequenceDecs_decode_56_amd64_adjust_offset_nonzero:
    CMPQ CX, $0x01
    JB sequenceDecs_decode_56_amd64_adjust_zero
    JEQ sequenceDecs_decode_56_amd64_adjust_one
    CMPQ CX, $0x02
    JA sequenceDecs_decode_56_amd64_adjust_three
    JMP sequenceDecs_decode_56_amd64_adjust_two

sequenceDecs_decode_56_amd64_adjust_zero:
    MOVQ R11, AX
    JMP sequenceDecs_decode_56_amd64_adjust_test_temp_valid

sequenceDecs_decode_56_amd64_adjust_one:
    MOVQ R12, AX
    JMP sequenceDecs_decode_56_amd64_adjust_test_temp_valid

sequenceDecs_decode_56_amd64_adjust_two:
    MOVQ R13, AX
    JMP sequenceDecs_decode_56_amd64_adjust_test_temp_valid

sequenceDecs_decode_56_amd64_adjust_three:
    LEAQ -1(R11), AX

sequenceDecs_decode_56_amd64_adjust_test_temp_valid:
    TESTQ AX, AX
    JNZ sequenceDecs_decode_56_amd64_adjust_temp_valid
    MOVQ $0x00000001, AX

sequenceDecs_decode_56_amd64_adjust_temp_valid:
    CMPQ CX, $0x01
    CMOVQNE R12, R13
    MOVQ R11, R12
    MOVQ AX, R11
    MOVQ AX, CX

sequenceDecs_decode_56_amd64_after_adjust:
    MOVQ CX, 16(R10)

    // Check values
    MOVQ 8(R10), AX
    MOVQ (R10), R14
    LEAQ (AX)(R14*1), R15
    MOVQ s+0(FP), BP
    ADDQ R15, 256(BP)
    MOVQ ctx+16(FP), R15
    SUBQ R14, 128(R15)
    JS error_not_enough_literals
    CMPQ AX, $0x00020002
    JA sequenceDecs_decode_56_amd64_error_match_len_too_big
    TESTQ CX, CX
    JNZ sequenceDecs_decode_56_amd64_match_len_ofs_ok
    TESTQ AX, AX
    JNZ sequenceDecs_decode_56_amd64_error_match_len_ofs_mismatch

sequenceDecs_decode_56_amd64_match_len_ofs_ok:
    ADDQ $0x18, R10
    MOVQ ctx+16(FP), AX
    DECQ 96(AX)
    JNS sequenceDecs_decode_56_amd64_main_loop
    MOVQ s+0(FP), AX
    MOVQ R11, 144(AX)
    MOVQ R12, 152(AX)
    MOVQ R13, 160(AX)
    MOVQ br+8(FP), AX
    MOVQ DX, 32(AX)
    MOVB BL, 40(AX)
    MOVQ SI, 24(AX)

    // Return success
    MOVQ $0x00000000, ret+24(FP)
    RET

    // Return with match length error
sequenceDecs_decode_56_amd64_error_match_len_ofs_mismatch:
    MOVQ $0x00000001, ret+24(FP)
    RET

    // Return with match too long error
sequenceDecs_decode_56_amd64_error_match_len_too_big:
    MOVQ $0x00000002, ret+24(FP)
    RET

    // Return with match offset too long error
    MOVQ $0x00000003, ret+24(FP)
    RET

    // Return with not enough literals error
error_not_enough_literals:
    MOVQ $0x00000004, ret+24(FP)
    RET

    // Return with not enough output space error
    MOVQ $0x00000005, ret+24(FP)
    RET

// func sequenceDecs_decode_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
// Requires: BMI, BMI2, CMOV
TEXT ·sequenceDecs_decode_bmi2(SB), $8-32
    MOVQ br+8(FP), CX
    MOVQ 32(CX), AX
    MOVBQZX 40(CX), DX
    MOVQ 24(CX), BX
    MOVQ (CX), CX
    ADDQ BX, CX
    MOVQ CX, (SP)
    MOVQ ctx+16(FP), CX
    MOVQ 72(CX), SI
    MOVQ 80(CX), DI
    MOVQ 88(CX), R8
    MOVQ 104(CX), R9
    MOVQ s+0(FP), CX
    MOVQ 144(CX), R10
    MOVQ 152(CX), R11
    MOVQ 160(CX), R12

sequenceDecs_decode_bmi2_main_loop:
    MOVQ (SP), R13

    // Fill bitreader to have enough for the offset and match length.
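    // A rough Go equivalent of the fast fill path below, with assumed field
    // names modeled on the package's backwards bitReader (here AX is the
    // 64-bit bit container, DX the consumed-bit count, BX the remaining
    // input byte count and R13 the read cursor just past the unread bytes).
    // This is a sketch, not the package's actual Go source:
    //
    //	if br.off >= 8 {
    //		c := br.bitsRead >> 3       // whole bytes already consumed
    //		br.cursor -= uintptr(c)     // step the cursor back
    //		br.value = le64(br.cursor)  // reload a full 64-bit window
    //		br.off -= uint(c)
    //		br.bitsRead &= 7
    //	}
    //
    // The byte-by-byte fallback shifts in one byte at a time until fewer
    // than 8 consumed bits remain or the input runs out.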
    CMPQ BX, $0x08
    JL sequenceDecs_decode_bmi2_fill_byte_by_byte
    MOVQ DX, CX
    SHRQ $0x03, CX
    SUBQ CX, R13
    MOVQ (R13), AX
    SUBQ CX, BX
    ANDQ $0x07, DX
    JMP sequenceDecs_decode_bmi2_fill_end

sequenceDecs_decode_bmi2_fill_byte_by_byte:
    CMPQ BX, $0x00
    JLE sequenceDecs_decode_bmi2_fill_end
    CMPQ DX, $0x07
    JLE sequenceDecs_decode_bmi2_fill_end
    SHLQ $0x08, AX
    SUBQ $0x01, R13
    SUBQ $0x01, BX
    SUBQ $0x08, DX
    MOVBQZX (R13), CX
    ORQ CX, AX
    JMP sequenceDecs_decode_bmi2_fill_byte_by_byte

sequenceDecs_decode_bmi2_fill_end:
    // Update offset
    MOVQ $0x00000808, CX
    BEXTRQ CX, R8, R14
    MOVQ AX, R15
    LEAQ (DX)(R14*1), CX
    ROLQ CL, R15
    BZHIQ R14, R15, R15
    MOVQ CX, DX
    MOVQ R8, CX
    SHRQ $0x20, CX
    ADDQ R15, CX
    MOVQ CX, 16(R9)

    // Update match length
    MOVQ $0x00000808, CX
    BEXTRQ CX, DI, R14
    MOVQ AX, R15
    LEAQ (DX)(R14*1), CX
    ROLQ CL, R15
    BZHIQ R14, R15, R15
    MOVQ CX, DX
    MOVQ DI, CX
    SHRQ $0x20, CX
    ADDQ R15, CX
    MOVQ CX, 8(R9)

    // Fill bitreader to have enough for the remaining
    CMPQ BX, $0x08
    JL sequenceDecs_decode_bmi2_fill_2_byte_by_byte
    MOVQ DX, CX
    SHRQ $0x03, CX
    SUBQ CX, R13
    MOVQ (R13), AX
    SUBQ CX, BX
    ANDQ $0x07, DX
    JMP sequenceDecs_decode_bmi2_fill_2_end

sequenceDecs_decode_bmi2_fill_2_byte_by_byte:
    CMPQ BX, $0x00
    JLE sequenceDecs_decode_bmi2_fill_2_end
    CMPQ DX, $0x07
    JLE sequenceDecs_decode_bmi2_fill_2_end
    SHLQ $0x08, AX
    SUBQ $0x01, R13
    SUBQ $0x01, BX
    SUBQ $0x08, DX
    MOVBQZX (R13), CX
    ORQ CX, AX
    JMP sequenceDecs_decode_bmi2_fill_2_byte_by_byte

sequenceDecs_decode_bmi2_fill_2_end:
    // Update literal length
    MOVQ $0x00000808, CX
    BEXTRQ CX, SI, R14
    MOVQ AX, R15
    LEAQ (DX)(R14*1), CX
    ROLQ CL, R15
    BZHIQ R14, R15, R15
    MOVQ CX, DX
    MOVQ SI, CX
    SHRQ $0x20, CX
    ADDQ R15, CX
    MOVQ CX, (R9)

    // Fill bitreader for state updates
    MOVQ R13, (SP)
    MOVQ $0x00000808, CX
    BEXTRQ CX, R8, R13
    MOVQ ctx+16(FP), CX
    CMPQ 96(CX), $0x00
    JZ sequenceDecs_decode_bmi2_skip_update
    LEAQ (SI)(DI*1), R14
    ADDQ R8, R14
    MOVBQZX R14, R14
    LEAQ (DX)(R14*1), CX
    MOVQ AX, R15
    MOVQ CX, DX
    ROLQ CL, R15
    BZHIQ R14, R15, R15

    // Update Offset State
    BZHIQ R8, R15, CX
    SHRXQ R8, R15, R15
    MOVQ $0x00001010, R14
    BEXTRQ R14, R8, R8
    ADDQ CX, R8

    // Load ctx.ofTable
    MOVQ ctx+16(FP), CX
    MOVQ 48(CX), CX
    MOVQ (CX)(R8*8), R8

    // Update Match Length State
    BZHIQ DI, R15, CX
    SHRXQ DI, R15, R15
    MOVQ $0x00001010, R14
    BEXTRQ R14, DI, DI
    ADDQ CX, DI

    // Load ctx.mlTable
    MOVQ ctx+16(FP), CX
    MOVQ 24(CX), CX
    MOVQ (CX)(DI*8), DI

    // Update Literal Length State
    BZHIQ SI, R15, CX
    MOVQ $0x00001010, R14
    BEXTRQ R14, SI, SI
    ADDQ CX, SI

    // Load ctx.llTable
    MOVQ ctx+16(FP), CX
    MOVQ (CX), CX
    MOVQ (CX)(SI*8), SI

sequenceDecs_decode_bmi2_skip_update:
    // Adjust offset
    MOVQ 16(R9), CX
    CMPQ R13, $0x01
    JBE sequenceDecs_decode_bmi2_adjust_offsetB_1_or_0
    MOVQ R11, R12
    MOVQ R10, R11
    MOVQ CX, R10
    JMP sequenceDecs_decode_bmi2_after_adjust

sequenceDecs_decode_bmi2_adjust_offsetB_1_or_0:
    CMPQ (R9), $0x00000000
    JNE sequenceDecs_decode_bmi2_adjust_offset_maybezero
    INCQ CX
    JMP sequenceDecs_decode_bmi2_adjust_offset_nonzero

sequenceDecs_decode_bmi2_adjust_offset_maybezero:
    TESTQ CX, CX
    JNZ sequenceDecs_decode_bmi2_adjust_offset_nonzero
    MOVQ R10, CX
    JMP sequenceDecs_decode_bmi2_after_adjust

sequenceDecs_decode_bmi2_adjust_offset_nonzero:
    CMPQ CX, $0x01
    JB sequenceDecs_decode_bmi2_adjust_zero
    JEQ sequenceDecs_decode_bmi2_adjust_one
    CMPQ CX, $0x02
    JA sequenceDecs_decode_bmi2_adjust_three
    JMP sequenceDecs_decode_bmi2_adjust_two

sequenceDecs_decode_bmi2_adjust_zero:
    MOVQ R10, R13
    JMP sequenceDecs_decode_bmi2_adjust_test_temp_valid

sequenceDecs_decode_bmi2_adjust_one:
    MOVQ R11, R13
    JMP sequenceDecs_decode_bmi2_adjust_test_temp_valid

sequenceDecs_decode_bmi2_adjust_two:
    MOVQ R12, R13
    JMP sequenceDecs_decode_bmi2_adjust_test_temp_valid

sequenceDecs_decode_bmi2_adjust_three:
    LEAQ -1(R10), R13

sequenceDecs_decode_bmi2_adjust_test_temp_valid:
    TESTQ R13, R13
    JNZ sequenceDecs_decode_bmi2_adjust_temp_valid
    MOVQ $0x00000001, R13

sequenceDecs_decode_bmi2_adjust_temp_valid:
    CMPQ CX, $0x01
    CMOVQNE R11, R12
    MOVQ R10, R11
    MOVQ R13, R10
    MOVQ R13, CX

sequenceDecs_decode_bmi2_after_adjust:
    MOVQ CX, 16(R9)

    // Check values
    MOVQ 8(R9), R13
    MOVQ (R9), R14
    LEAQ (R13)(R14*1), R15
    MOVQ s+0(FP), BP
    ADDQ R15, 256(BP)
    MOVQ ctx+16(FP), R15
    SUBQ R14, 128(R15)
    JS error_not_enough_literals
    CMPQ R13, $0x00020002
    JA sequenceDecs_decode_bmi2_error_match_len_too_big
    TESTQ CX, CX
    JNZ sequenceDecs_decode_bmi2_match_len_ofs_ok
    TESTQ R13, R13
    JNZ sequenceDecs_decode_bmi2_error_match_len_ofs_mismatch

sequenceDecs_decode_bmi2_match_len_ofs_ok:
    ADDQ $0x18, R9
    MOVQ ctx+16(FP), CX
    DECQ 96(CX)
    JNS sequenceDecs_decode_bmi2_main_loop
    MOVQ s+0(FP), CX
    MOVQ R10, 144(CX)
    MOVQ R11, 152(CX)
    MOVQ R12, 160(CX)
    MOVQ br+8(FP), CX
    MOVQ AX, 32(CX)
    MOVB DL, 40(CX)
    MOVQ BX, 24(CX)

    // Return success
    MOVQ $0x00000000, ret+24(FP)
    RET

    // Return with match length error
sequenceDecs_decode_bmi2_error_match_len_ofs_mismatch:
    MOVQ $0x00000001, ret+24(FP)
    RET

    // Return with match too long error
sequenceDecs_decode_bmi2_error_match_len_too_big:
    MOVQ $0x00000002, ret+24(FP)
    RET

    // Return with match offset too long error
    MOVQ $0x00000003, ret+24(FP)
    RET

    // Return with not enough literals error
error_not_enough_literals:
    MOVQ $0x00000004, ret+24(FP)
    RET

    // Return with not enough output space error
    MOVQ $0x00000005, ret+24(FP)
    RET

// func sequenceDecs_decode_56_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeAsmContext) int
// Requires: BMI, BMI2, CMOV
TEXT ·sequenceDecs_decode_56_bmi2(SB), $8-32
    MOVQ br+8(FP), CX
    MOVQ 32(CX), AX
    MOVBQZX 40(CX), DX
    MOVQ 24(CX), BX
    MOVQ (CX), CX
    ADDQ BX, CX
    MOVQ CX, (SP)
    MOVQ ctx+16(FP), CX
    MOVQ 72(CX), SI
    MOVQ 80(CX), DI
    MOVQ 88(CX), R8
    MOVQ 104(CX), R9
    MOVQ s+0(FP), CX
    MOVQ 144(CX), R10
    MOVQ 152(CX), R11
    MOVQ 160(CX), R12

sequenceDecs_decode_56_bmi2_main_loop:
    MOVQ (SP), R13

    // Fill bitreader to have enough for the offset and match length.
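    // Unlike sequenceDecs_decode_bmi2 above, the _56 variants perform only
    // this single refill per sequence: a refill leaves at most 7 consumed
    // bits, i.e. at least 57 usable ones, so the offset, match length,
    // literal length and the three state updates can all be read without a
    // second refill. They are presumably selected when the current tables
    // guarantee at most 56 bits per sequence.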
    CMPQ BX, $0x08
    JL sequenceDecs_decode_56_bmi2_fill_byte_by_byte
    MOVQ DX, CX
    SHRQ $0x03, CX
    SUBQ CX, R13
    MOVQ (R13), AX
    SUBQ CX, BX
    ANDQ $0x07, DX
    JMP sequenceDecs_decode_56_bmi2_fill_end

sequenceDecs_decode_56_bmi2_fill_byte_by_byte:
    CMPQ BX, $0x00
    JLE sequenceDecs_decode_56_bmi2_fill_end
    CMPQ DX, $0x07
    JLE sequenceDecs_decode_56_bmi2_fill_end
    SHLQ $0x08, AX
    SUBQ $0x01, R13
    SUBQ $0x01, BX
    SUBQ $0x08, DX
    MOVBQZX (R13), CX
    ORQ CX, AX
    JMP sequenceDecs_decode_56_bmi2_fill_byte_by_byte

sequenceDecs_decode_56_bmi2_fill_end:
    // Update offset
    MOVQ $0x00000808, CX
    BEXTRQ CX, R8, R14
    MOVQ AX, R15
    LEAQ (DX)(R14*1), CX
    ROLQ CL, R15
    BZHIQ R14, R15, R15
    MOVQ CX, DX
    MOVQ R8, CX
    SHRQ $0x20, CX
    ADDQ R15, CX
    MOVQ CX, 16(R9)

    // Update match length
    MOVQ $0x00000808, CX
    BEXTRQ CX, DI, R14
    MOVQ AX, R15
    LEAQ (DX)(R14*1), CX
    ROLQ CL, R15
    BZHIQ R14, R15, R15
    MOVQ CX, DX
    MOVQ DI, CX
    SHRQ $0x20, CX
    ADDQ R15, CX
    MOVQ CX, 8(R9)

    // Update literal length
    MOVQ $0x00000808, CX
    BEXTRQ CX, SI, R14
    MOVQ AX, R15
    LEAQ (DX)(R14*1), CX
    ROLQ CL, R15
    BZHIQ R14, R15, R15
    MOVQ CX, DX
    MOVQ SI, CX
    SHRQ $0x20, CX
    ADDQ R15, CX
    MOVQ CX, (R9)

    // Fill bitreader for state updates
    MOVQ R13, (SP)
    MOVQ $0x00000808, CX
    BEXTRQ CX, R8, R13
    MOVQ ctx+16(FP), CX
    CMPQ 96(CX), $0x00
    JZ sequenceDecs_decode_56_bmi2_skip_update
    LEAQ (SI)(DI*1), R14
    ADDQ R8, R14
    MOVBQZX R14, R14
    LEAQ (DX)(R14*1), CX
    MOVQ AX, R15
    MOVQ CX, DX
    ROLQ CL, R15
    BZHIQ R14, R15, R15

    // Update Offset State
    BZHIQ R8, R15, CX
    SHRXQ R8, R15, R15
    MOVQ $0x00001010, R14
    BEXTRQ R14, R8, R8
    ADDQ CX, R8

    // Load ctx.ofTable
    MOVQ ctx+16(FP), CX
    MOVQ 48(CX), CX
    MOVQ (CX)(R8*8), R8

    // Update Match Length State
    BZHIQ DI, R15, CX
    SHRXQ DI, R15, R15
    MOVQ $0x00001010, R14
    BEXTRQ R14, DI, DI
    ADDQ CX, DI

    // Load ctx.mlTable
    MOVQ ctx+16(FP), CX
    MOVQ 24(CX), CX
    MOVQ (CX)(DI*8), DI

    // Update Literal Length State
    BZHIQ SI, R15, CX
    MOVQ $0x00001010, R14
    BEXTRQ R14, SI, SI
    ADDQ CX, SI

    // Load ctx.llTable
    MOVQ ctx+16(FP), CX
    MOVQ (CX), CX
    MOVQ (CX)(SI*8), SI

sequenceDecs_decode_56_bmi2_skip_update:
    // Adjust offset
    MOVQ 16(R9), CX
    CMPQ R13, $0x01
    JBE sequenceDecs_decode_56_bmi2_adjust_offsetB_1_or_0
    MOVQ R11, R12
    MOVQ R10, R11
    MOVQ CX, R10
    JMP sequenceDecs_decode_56_bmi2_after_adjust

sequenceDecs_decode_56_bmi2_adjust_offsetB_1_or_0:
    CMPQ (R9), $0x00000000
    JNE sequenceDecs_decode_56_bmi2_adjust_offset_maybezero
    INCQ CX
    JMP sequenceDecs_decode_56_bmi2_adjust_offset_nonzero

sequenceDecs_decode_56_bmi2_adjust_offset_maybezero:
    TESTQ CX, CX
    JNZ sequenceDecs_decode_56_bmi2_adjust_offset_nonzero
    MOVQ R10, CX
    JMP sequenceDecs_decode_56_bmi2_after_adjust

sequenceDecs_decode_56_bmi2_adjust_offset_nonzero:
    CMPQ CX, $0x01
    JB sequenceDecs_decode_56_bmi2_adjust_zero
    JEQ sequenceDecs_decode_56_bmi2_adjust_one
    CMPQ CX, $0x02
    JA sequenceDecs_decode_56_bmi2_adjust_three
    JMP sequenceDecs_decode_56_bmi2_adjust_two

sequenceDecs_decode_56_bmi2_adjust_zero:
    MOVQ R10, R13
    JMP sequenceDecs_decode_56_bmi2_adjust_test_temp_valid

sequenceDecs_decode_56_bmi2_adjust_one:
    MOVQ R11, R13
    JMP sequenceDecs_decode_56_bmi2_adjust_test_temp_valid

sequenceDecs_decode_56_bmi2_adjust_two:
    MOVQ R12, R13
    JMP sequenceDecs_decode_56_bmi2_adjust_test_temp_valid

sequenceDecs_decode_56_bmi2_adjust_three:
    LEAQ -1(R10), R13

sequenceDecs_decode_56_bmi2_adjust_test_temp_valid:
    TESTQ R13, R13
    JNZ sequenceDecs_decode_56_bmi2_adjust_temp_valid
    MOVQ $0x00000001, R13

sequenceDecs_decode_56_bmi2_adjust_temp_valid:
    CMPQ CX, $0x01
    CMOVQNE R11, R12
    MOVQ R10, R11
    MOVQ R13, R10
    MOVQ R13, CX

sequenceDecs_decode_56_bmi2_after_adjust:
    MOVQ CX, 16(R9)

    // Check values
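    // The checks below: ll+ml is accumulated into the decoded-size counter
    // in s, ll is deducted from the remaining-literals counter in ctx
    // (going negative means the literal buffer ran out), a match length
    // above $0x00020002 (131074, the zstd maximum) is rejected, and a zero
    // offset is only legal when the match length is zero as well.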
    MOVQ 8(R9), R13
    MOVQ (R9), R14
    LEAQ (R13)(R14*1), R15
    MOVQ s+0(FP), BP
    ADDQ R15, 256(BP)
    MOVQ ctx+16(FP), R15
    SUBQ R14, 128(R15)
    JS error_not_enough_literals
    CMPQ R13, $0x00020002
    JA sequenceDecs_decode_56_bmi2_error_match_len_too_big
    TESTQ CX, CX
    JNZ sequenceDecs_decode_56_bmi2_match_len_ofs_ok
    TESTQ R13, R13
    JNZ sequenceDecs_decode_56_bmi2_error_match_len_ofs_mismatch

sequenceDecs_decode_56_bmi2_match_len_ofs_ok:
    ADDQ $0x18, R9
    MOVQ ctx+16(FP), CX
    DECQ 96(CX)
    JNS sequenceDecs_decode_56_bmi2_main_loop
    MOVQ s+0(FP), CX
    MOVQ R10, 144(CX)
    MOVQ R11, 152(CX)
    MOVQ R12, 160(CX)
    MOVQ br+8(FP), CX
    MOVQ AX, 32(CX)
    MOVB DL, 40(CX)
    MOVQ BX, 24(CX)

    // Return success
    MOVQ $0x00000000, ret+24(FP)
    RET

    // Return with match length error
sequenceDecs_decode_56_bmi2_error_match_len_ofs_mismatch:
    MOVQ $0x00000001, ret+24(FP)
    RET

    // Return with match too long error
sequenceDecs_decode_56_bmi2_error_match_len_too_big:
    MOVQ $0x00000002, ret+24(FP)
    RET

    // Return with match offset too long error
    MOVQ $0x00000003, ret+24(FP)
    RET

    // Return with not enough literals error
error_not_enough_literals:
    MOVQ $0x00000004, ret+24(FP)
    RET

    // Return with not enough output space error
    MOVQ $0x00000005, ret+24(FP)
    RET

// func sequenceDecs_executeSimple_amd64(ctx *executeAsmContext) bool
// Requires: SSE
TEXT ·sequenceDecs_executeSimple_amd64(SB), $8-9
    MOVQ ctx+0(FP), R10
    MOVQ 8(R10), CX
    TESTQ CX, CX
    JZ empty_seqs
    MOVQ (R10), AX
    MOVQ 24(R10), DX
    MOVQ 32(R10), BX
    MOVQ 80(R10), SI
    MOVQ 104(R10), DI
    MOVQ 120(R10), R8
    MOVQ 56(R10), R9
    MOVQ 64(R10), R10
    ADDQ R10, R9

    // seqsBase += 24 * seqIndex
    LEAQ (DX)(DX*2), R11
    SHLQ $0x03, R11
    ADDQ R11, AX

    // outBase += outPosition
    ADDQ DI, BX

main_loop:
    MOVQ (AX), R11
    MOVQ 16(AX), R12
    MOVQ 8(AX), R13

    // Copy literals
    TESTQ R11, R11
    JZ check_offset
    XORQ R14, R14

copy_1:
    MOVUPS (SI)(R14*1), X0
    MOVUPS X0, (BX)(R14*1)
    ADDQ $0x10, R14
    CMPQ R14, R11
    JB copy_1
    ADDQ R11, SI
    ADDQ R11, BX
    ADDQ R11, DI

    // Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize
check_offset:
    LEAQ (DI)(R10*1), R11
    CMPQ R12, R11
    JG error_match_off_too_big
    CMPQ R12, R8
    JG error_match_off_too_big

    // Copy match from history
    MOVQ R12, R11
    SUBQ DI, R11
    JLS copy_match
    MOVQ R9, R14
    SUBQ R11, R14
    CMPQ R13, R11
    JG copy_all_from_history
    MOVQ R13, R11
    SUBQ $0x10, R11
    JB copy_4_small

copy_4_loop:
    MOVUPS (R14), X0
    MOVUPS X0, (BX)
    ADDQ $0x10, R14
    ADDQ $0x10, BX
    SUBQ $0x10, R11
    JAE copy_4_loop
    LEAQ 16(R14)(R11*1), R14
    LEAQ 16(BX)(R11*1), BX
    MOVUPS -16(R14), X0
    MOVUPS X0, -16(BX)
    JMP copy_4_end

copy_4_small:
    CMPQ R13, $0x03
    JE copy_4_move_3
    CMPQ R13, $0x08
    JB copy_4_move_4through7
    JMP copy_4_move_8through16

copy_4_move_3:
    MOVW (R14), R11
    MOVB 2(R14), R12
    MOVW R11, (BX)
    MOVB R12, 2(BX)
    ADDQ R13, R14
    ADDQ R13, BX
    JMP copy_4_end

copy_4_move_4through7:
    MOVL (R14), R11
    MOVL -4(R14)(R13*1), R12
    MOVL R11, (BX)
    MOVL R12, -4(BX)(R13*1)
    ADDQ R13, R14
    ADDQ R13, BX
    JMP copy_4_end

copy_4_move_8through16:
    MOVQ (R14), R11
    MOVQ -8(R14)(R13*1), R12
    MOVQ R11, (BX)
    MOVQ R12, -8(BX)(R13*1)
    ADDQ R13, R14
    ADDQ R13, BX

copy_4_end:
    ADDQ R13, DI
    ADDQ $0x18, AX
    INCQ DX
    CMPQ DX, CX
    JB main_loop
    JMP loop_finished

copy_all_from_history:
    MOVQ R11, R15
    SUBQ $0x10, R15
    JB copy_5_small

copy_5_loop:
    MOVUPS (R14), X0
    MOVUPS X0, (BX)
    ADDQ $0x10, R14
    ADDQ $0x10, BX
    SUBQ $0x10, R15
    JAE copy_5_loop
    LEAQ 16(R14)(R15*1), R14
    LEAQ 16(BX)(R15*1), BX
    MOVUPS -16(R14), X0
    MOVUPS X0, -16(BX)
    JMP copy_5_end

copy_5_small:
    CMPQ R11, $0x03
    JE copy_5_move_3
    JB copy_5_move_1or2
    CMPQ R11, $0x08
    JB copy_5_move_4through7
    JMP copy_5_move_8through16

copy_5_move_1or2:
    MOVB (R14), R15
    MOVB -1(R14)(R11*1), BP
    MOVB R15, (BX)
    MOVB BP, -1(BX)(R11*1)
    ADDQ R11, R14
    ADDQ R11, BX
    JMP copy_5_end

copy_5_move_3:
    MOVW (R14), R15
    MOVB 2(R14), BP
    MOVW R15, (BX)
    MOVB BP, 2(BX)
    ADDQ R11, R14
    ADDQ R11, BX
    JMP copy_5_end

copy_5_move_4through7:
    MOVL (R14), R15
    MOVL -4(R14)(R11*1), BP
    MOVL R15, (BX)
    MOVL BP, -4(BX)(R11*1)
    ADDQ R11, R14
    ADDQ R11, BX
    JMP copy_5_end

copy_5_move_8through16:
    MOVQ (R14), R15
    MOVQ -8(R14)(R11*1), BP
    MOVQ R15, (BX)
    MOVQ BP, -8(BX)(R11*1)
    ADDQ R11, R14
    ADDQ R11, BX

copy_5_end:
    ADDQ R11, DI
    SUBQ R11, R13

    // Copy match from the current buffer
copy_match:
    MOVQ BX, R11
    SUBQ R12, R11

    // ml <= mo
    CMPQ R13, R12
    JA copy_overlapping_match

    // Copy non-overlapping match
    ADDQ R13, DI
    MOVQ BX, R12
    ADDQ R13, BX

copy_2:
    MOVUPS (R11), X0
    MOVUPS X0, (R12)
    ADDQ $0x10, R11
    ADDQ $0x10, R12
    SUBQ $0x10, R13
    JHI copy_2
    JMP handle_loop

    // Copy overlapping match
copy_overlapping_match:
    ADDQ R13, DI

copy_slow_3:
    MOVB (R11), R12
    MOVB R12, (BX)
    INCQ R11
    INCQ BX
    DECQ R13
    JNZ copy_slow_3

handle_loop:
    ADDQ $0x18, AX
    INCQ DX
    CMPQ DX, CX
    JB main_loop

loop_finished:
    // Return value
    MOVB $0x01, ret+8(FP)

    // Update the context
    MOVQ ctx+0(FP), AX
    MOVQ DX, 24(AX)
    MOVQ DI, 104(AX)
    MOVQ 80(AX), CX
    SUBQ CX, SI
    MOVQ SI, 112(AX)
    RET

error_match_off_too_big:
    // Return value
    MOVB $0x00, ret+8(FP)

    // Update the context
    MOVQ ctx+0(FP), AX
    MOVQ DX, 24(AX)
    MOVQ DI, 104(AX)
    MOVQ 80(AX), CX
    SUBQ CX, SI
    MOVQ SI, 112(AX)
    RET

empty_seqs:
    // Return value
    MOVB $0x01, ret+8(FP)
    RET

// func sequenceDecs_executeSimple_safe_amd64(ctx *executeAsmContext) bool
// Requires: SSE
TEXT ·sequenceDecs_executeSimple_safe_amd64(SB), $8-9
    MOVQ ctx+0(FP), R10
    MOVQ 8(R10), CX
    TESTQ CX, CX
    JZ empty_seqs
    MOVQ (R10), AX
    MOVQ 24(R10), DX
    MOVQ 32(R10), BX
    MOVQ 80(R10), SI
    MOVQ 104(R10), DI
    MOVQ 120(R10), R8
    MOVQ 56(R10), R9
    MOVQ 64(R10), R10
    ADDQ R10, R9

    // seqsBase += 24 * seqIndex
    LEAQ (DX)(DX*2), R11
    SHLQ $0x03, R11
    ADDQ R11, AX

    // outBase += outPosition
    ADDQ DI, BX

main_loop:
    MOVQ (AX), R11
    MOVQ 16(AX), R12
    MOVQ 8(AX), R13

    // Copy literals
    TESTQ R11, R11
    JZ check_offset
    MOVQ R11, R14
    SUBQ $0x10, R14
    JB copy_1_small

copy_1_loop:
    MOVUPS (SI), X0
    MOVUPS X0, (BX)
    ADDQ $0x10, SI
    ADDQ $0x10, BX
    SUBQ $0x10, R14
    JAE copy_1_loop
    LEAQ 16(SI)(R14*1), SI
    LEAQ 16(BX)(R14*1), BX
    MOVUPS -16(SI), X0
    MOVUPS X0, -16(BX)
    JMP copy_1_end

copy_1_small:
    CMPQ R11, $0x03
    JE copy_1_move_3
    JB copy_1_move_1or2
    CMPQ R11, $0x08
    JB copy_1_move_4through7
    JMP copy_1_move_8through16

copy_1_move_1or2:
    MOVB (SI), R14
    MOVB -1(SI)(R11*1), R15
    MOVB R14, (BX)
    MOVB R15, -1(BX)(R11*1)
    ADDQ R11, SI
    ADDQ R11, BX
    JMP copy_1_end

copy_1_move_3:
    MOVW (SI), R14
    MOVB 2(SI), R15
    MOVW R14, (BX)
    MOVB R15, 2(BX)
    ADDQ R11, SI
    ADDQ R11, BX
    JMP copy_1_end

copy_1_move_4through7:
    MOVL (SI), R14
    MOVL -4(SI)(R11*1), R15
    MOVL R14, (BX)
    MOVL R15, -4(BX)(R11*1)
    ADDQ R11, SI
    ADDQ R11, BX
    JMP copy_1_end

copy_1_move_8through16:
    MOVQ (SI), R14
    MOVQ -8(SI)(R11*1), R15
    MOVQ R14, (BX)
    MOVQ R15, -8(BX)(R11*1)
    ADDQ R11, SI
    ADDQ R11, BX

copy_1_end:
    ADDQ R11, DI

    // Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize
check_offset:
    LEAQ (DI)(R10*1), R11
    CMPQ R12, R11
    JG error_match_off_too_big
    CMPQ R12, R8
    JG error_match_off_too_big

    // Copy match from history
    MOVQ R12, R11
    SUBQ DI, R11
    JLS copy_match
    MOVQ R9, R14
    SUBQ R11, R14
    CMPQ R13, R11
    JG copy_all_from_history
    MOVQ R13, R11
    SUBQ $0x10, R11
    JB copy_4_small

copy_4_loop:
    MOVUPS (R14), X0
    MOVUPS X0, (BX)
    ADDQ $0x10, R14
    ADDQ $0x10, BX
    SUBQ $0x10, R11
    JAE copy_4_loop
    LEAQ 16(R14)(R11*1), R14
    LEAQ 16(BX)(R11*1), BX
    MOVUPS -16(R14), X0
    MOVUPS X0, -16(BX)
    JMP copy_4_end

copy_4_small:
    CMPQ R13, $0x03
    JE copy_4_move_3
    CMPQ R13, $0x08
    JB copy_4_move_4through7
    JMP copy_4_move_8through16

copy_4_move_3:
    MOVW (R14), R11
    MOVB 2(R14), R12
    MOVW R11, (BX)
    MOVB R12, 2(BX)
    ADDQ R13, R14
    ADDQ R13, BX
    JMP copy_4_end

copy_4_move_4through7:
    MOVL (R14), R11
    MOVL -4(R14)(R13*1), R12
    MOVL R11, (BX)
    MOVL R12, -4(BX)(R13*1)
    ADDQ R13, R14
    ADDQ R13, BX
    JMP copy_4_end

copy_4_move_8through16:
    MOVQ (R14), R11
    MOVQ -8(R14)(R13*1), R12
    MOVQ R11, (BX)
    MOVQ R12, -8(BX)(R13*1)
    ADDQ R13, R14
    ADDQ R13, BX

copy_4_end:
    ADDQ R13, DI
    ADDQ $0x18, AX
    INCQ DX
    CMPQ DX, CX
    JB main_loop
    JMP loop_finished

copy_all_from_history:
    MOVQ R11, R15
    SUBQ $0x10, R15
    JB copy_5_small

copy_5_loop:
    MOVUPS (R14), X0
    MOVUPS X0, (BX)
    ADDQ $0x10, R14
    ADDQ $0x10, BX
    SUBQ $0x10, R15
    JAE copy_5_loop
    LEAQ 16(R14)(R15*1), R14
    LEAQ 16(BX)(R15*1), BX
    MOVUPS -16(R14), X0
    MOVUPS X0, -16(BX)
    JMP copy_5_end

copy_5_small:
    CMPQ R11, $0x03
    JE copy_5_move_3
    JB copy_5_move_1or2
    CMPQ R11, $0x08
    JB copy_5_move_4through7
    JMP copy_5_move_8through16

copy_5_move_1or2:
    MOVB (R14), R15
    MOVB -1(R14)(R11*1), BP
    MOVB R15, (BX)
    MOVB BP, -1(BX)(R11*1)
    ADDQ R11, R14
    ADDQ R11, BX
    JMP copy_5_end

copy_5_move_3:
    MOVW (R14), R15
    MOVB 2(R14), BP
    MOVW R15, (BX)
    MOVB BP, 2(BX)
    ADDQ R11, R14
    ADDQ R11, BX
    JMP copy_5_end

copy_5_move_4through7:
    MOVL (R14), R15
    MOVL -4(R14)(R11*1), BP
    MOVL R15, (BX)
    MOVL BP, -4(BX)(R11*1)
    ADDQ R11, R14
    ADDQ R11, BX
    JMP copy_5_end

copy_5_move_8through16:
    MOVQ (R14), R15
    MOVQ -8(R14)(R11*1), BP
    MOVQ R15, (BX)
    MOVQ BP, -8(BX)(R11*1)
    ADDQ R11, R14
    ADDQ R11, BX

copy_5_end:
    ADDQ R11, DI
    SUBQ R11, R13

    // Copy match from the current buffer
copy_match:
    MOVQ BX, R11
    SUBQ R12, R11

    // ml <= mo
    CMPQ R13, R12
    JA copy_overlapping_match

    // Copy non-overlapping match
    ADDQ R13, DI
    MOVQ R13, R12
    SUBQ $0x10, R12
    JB copy_2_small

copy_2_loop:
    MOVUPS (R11), X0
    MOVUPS X0, (BX)
    ADDQ $0x10, R11
    ADDQ $0x10, BX
    SUBQ $0x10, R12
    JAE copy_2_loop
    LEAQ 16(R11)(R12*1), R11
    LEAQ 16(BX)(R12*1), BX
    MOVUPS -16(R11), X0
    MOVUPS X0, -16(BX)
    JMP copy_2_end

copy_2_small:
    CMPQ R13, $0x03
    JE copy_2_move_3
    JB copy_2_move_1or2
    CMPQ R13, $0x08
    JB copy_2_move_4through7
    JMP copy_2_move_8through16

copy_2_move_1or2:
    MOVB (R11), R12
    MOVB -1(R11)(R13*1), R14
    MOVB R12, (BX)
    MOVB R14, -1(BX)(R13*1)
    ADDQ R13, R11
    ADDQ R13, BX
    JMP copy_2_end

copy_2_move_3:
    MOVW (R11), R12
    MOVB 2(R11), R14
    MOVW R12, (BX)
    MOVB R14, 2(BX)
    ADDQ R13, R11
    ADDQ R13, BX
    JMP copy_2_end

copy_2_move_4through7:
    MOVL (R11), R12
    MOVL -4(R11)(R13*1), R14
    MOVL R12, (BX)
    MOVL R14, -4(BX)(R13*1)
    ADDQ R13, R11
    ADDQ R13, BX
    JMP copy_2_end

copy_2_move_8through16:
    MOVQ (R11), R12
    MOVQ -8(R11)(R13*1), R14
    MOVQ R12, (BX)
    MOVQ R14, -8(BX)(R13*1)
    ADDQ R13, R11
    ADDQ R13, BX

copy_2_end:
    JMP handle_loop

    // Copy overlapping match
copy_overlapping_match:
    ADDQ R13, DI

copy_slow_3:
    MOVB (R11), R12
    MOVB R12, (BX)
    INCQ R11
    INCQ BX
    DECQ R13
    JNZ copy_slow_3

handle_loop:
    ADDQ $0x18, AX
    INCQ DX
    CMPQ DX, CX
    JB main_loop

loop_finished:
    // Return value
    MOVB $0x01, ret+8(FP)

    // Update the context
    MOVQ ctx+0(FP), AX
    MOVQ DX, 24(AX)
    MOVQ DI, 104(AX)
    MOVQ 80(AX), CX
    SUBQ CX, SI
    MOVQ SI, 112(AX)
    RET

error_match_off_too_big:
    // Return value
    MOVB $0x00, ret+8(FP)

    // Update the context
    MOVQ ctx+0(FP), AX
    MOVQ DX, 24(AX)
    MOVQ DI, 104(AX)
    MOVQ 80(AX), CX
    SUBQ CX, SI
    MOVQ SI, 112(AX)
    RET

empty_seqs:
    // Return value
    MOVB $0x01, ret+8(FP)
    RET

// func sequenceDecs_decodeSync_amd64(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
// Requires: CMOV, SSE
TEXT ·sequenceDecs_decodeSync_amd64(SB), $64-32
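    // decodeSync fuses decoding and execution: each sequence is decoded and
    // immediately executed against s.out, with the three recent offsets
    // kept in s (144/152/160) instead of in registers. Judging by the loads
    // below, the frame holds the current offset/match/literal lengths at
    // 8/16/24(SP), a pointer past s.out's capacity at 32(SP), and the
    // history length, history end and window size at 40/48/56(SP).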
    MOVQ br+8(FP), AX
    MOVQ 32(AX), DX
    MOVBQZX 40(AX), BX
    MOVQ 24(AX), SI
    MOVQ (AX), AX
    ADDQ SI, AX
    MOVQ AX, (SP)
    MOVQ ctx+16(FP), AX
    MOVQ 72(AX), DI
    MOVQ 80(AX), R8
    MOVQ 88(AX), R9
    XORQ CX, CX
    MOVQ CX, 8(SP)
    MOVQ CX, 16(SP)
    MOVQ CX, 24(SP)
    MOVQ 112(AX), R10
    MOVQ 128(AX), CX
    MOVQ CX, 32(SP)
    MOVQ 144(AX), R11
    MOVQ 136(AX), R12
    MOVQ 200(AX), CX
    MOVQ CX, 56(SP)
    MOVQ 176(AX), CX
    MOVQ CX, 48(SP)
    MOVQ 184(AX), AX
    MOVQ AX, 40(SP)
    MOVQ 40(SP), AX
    ADDQ AX, 48(SP)

    // Calculate pointer to s.out[cap(s.out)] (a past-end pointer)
    ADDQ R10, 32(SP)

    // outBase += outPosition
    ADDQ R12, R10

sequenceDecs_decodeSync_amd64_main_loop:
    MOVQ (SP), R13

    // Fill bitreader to have enough for the offset and match length.
    CMPQ SI, $0x08
    JL sequenceDecs_decodeSync_amd64_fill_byte_by_byte
    MOVQ BX, AX
    SHRQ $0x03, AX
    SUBQ AX, R13
    MOVQ (R13), DX
    SUBQ AX, SI
    ANDQ $0x07, BX
    JMP sequenceDecs_decodeSync_amd64_fill_end

sequenceDecs_decodeSync_amd64_fill_byte_by_byte:
    CMPQ SI, $0x00
    JLE sequenceDecs_decodeSync_amd64_fill_end
    CMPQ BX, $0x07
    JLE sequenceDecs_decodeSync_amd64_fill_end
    SHLQ $0x08, DX
    SUBQ $0x01, R13
    SUBQ $0x01, SI
    SUBQ $0x08, BX
    MOVBQZX (R13), AX
    ORQ AX, DX
    JMP sequenceDecs_decodeSync_amd64_fill_byte_by_byte

sequenceDecs_decodeSync_amd64_fill_end:
    // Update offset
    MOVQ R9, AX
    MOVQ BX, CX
    MOVQ DX, R14
    SHLQ CL, R14
    MOVB AH, CL
    SHRQ $0x20, AX
    TESTQ CX, CX
    JZ sequenceDecs_decodeSync_amd64_of_update_zero
    ADDQ CX, BX
    CMPQ BX, $0x40
    JA sequenceDecs_decodeSync_amd64_of_update_zero
    CMPQ CX, $0x40
    JAE sequenceDecs_decodeSync_amd64_of_update_zero
    NEGQ CX
    SHRQ CL, R14
    ADDQ R14, AX

sequenceDecs_decodeSync_amd64_of_update_zero:
    MOVQ AX, 8(SP)

    // Update match length
    MOVQ R8, AX
    MOVQ BX, CX
    MOVQ DX, R14
    SHLQ CL, R14
    MOVB AH, CL
    SHRQ $0x20, AX
    TESTQ CX, CX
    JZ sequenceDecs_decodeSync_amd64_ml_update_zero
    ADDQ CX, BX
    CMPQ BX, $0x40
    JA sequenceDecs_decodeSync_amd64_ml_update_zero
    CMPQ CX, $0x40
    JAE sequenceDecs_decodeSync_amd64_ml_update_zero
    NEGQ CX
    SHRQ CL, R14
    ADDQ R14, AX

sequenceDecs_decodeSync_amd64_ml_update_zero:
    MOVQ AX, 16(SP)

    // Fill bitreader to have enough for the remaining
    CMPQ SI, $0x08
    JL sequenceDecs_decodeSync_amd64_fill_2_byte_by_byte
    MOVQ BX, AX
    SHRQ $0x03, AX
    SUBQ AX, R13
    MOVQ (R13), DX
    SUBQ AX, SI
    ANDQ $0x07, BX
    JMP sequenceDecs_decodeSync_amd64_fill_2_end

sequenceDecs_decodeSync_amd64_fill_2_byte_by_byte:
    CMPQ SI, $0x00
    JLE sequenceDecs_decodeSync_amd64_fill_2_end
    CMPQ BX, $0x07
    JLE sequenceDecs_decodeSync_amd64_fill_2_end
    SHLQ $0x08, DX
    SUBQ $0x01, R13
    SUBQ $0x01, SI
    SUBQ $0x08, BX
    MOVBQZX (R13), AX
    ORQ AX, DX
    JMP sequenceDecs_decodeSync_amd64_fill_2_byte_by_byte

sequenceDecs_decodeSync_amd64_fill_2_end:
    // Update literal length
    MOVQ DI, AX
    MOVQ BX, CX
    MOVQ DX, R14
    SHLQ CL, R14
    MOVB AH, CL
    SHRQ $0x20, AX
    TESTQ CX, CX
    JZ sequenceDecs_decodeSync_amd64_ll_update_zero
    ADDQ CX, BX
    CMPQ BX, $0x40
    JA sequenceDecs_decodeSync_amd64_ll_update_zero
    CMPQ CX, $0x40
    JAE sequenceDecs_decodeSync_amd64_ll_update_zero
    NEGQ CX
    SHRQ CL, R14
    ADDQ R14, AX

sequenceDecs_decodeSync_amd64_ll_update_zero:
    MOVQ AX, 24(SP)

    // Fill bitreader for state updates
    MOVQ R13, (SP)
    MOVQ R9, AX
    SHRQ $0x08, AX
    MOVBQZX AL, AX
    MOVQ ctx+16(FP), CX
    CMPQ 96(CX), $0x00
    JZ sequenceDecs_decodeSync_amd64_skip_update

    // Update Literal Length State
    MOVBQZX DI, R13
    SHRQ $0x10, DI
    MOVWQZX DI, DI
    LEAQ (BX)(R13*1), CX
    MOVQ DX, R14
    MOVQ CX, BX
    ROLQ CL, R14
    MOVL $0x00000001, R15
    MOVB R13, CL
    SHLL CL, R15
    DECL R15
    ANDQ R15, R14
    ADDQ R14, DI

    // Load ctx.llTable
    MOVQ ctx+16(FP), CX
    MOVQ (CX), CX
    MOVQ (CX)(DI*8), DI

    // Update Match Length State
    MOVBQZX R8, R13
    SHRQ $0x10, R8
    MOVWQZX R8, R8
    LEAQ (BX)(R13*1), CX
    MOVQ DX, R14
    MOVQ CX, BX
    ROLQ CL, R14
    MOVL $0x00000001, R15
    MOVB R13, CL
    SHLL CL, R15
    DECL R15
    ANDQ R15, R14
    ADDQ R14, R8

    // Load ctx.mlTable
    MOVQ ctx+16(FP), CX
    MOVQ 24(CX), CX
    MOVQ (CX)(R8*8), R8

    // Update Offset State
    MOVBQZX R9, R13
    SHRQ $0x10, R9
    MOVWQZX R9, R9
    LEAQ (BX)(R13*1), CX
    MOVQ DX, R14
    MOVQ CX, BX
    ROLQ CL, R14
    MOVL $0x00000001, R15
    MOVB R13, CL
    SHLL CL, R15
    DECL R15
    ANDQ R15, R14
    ADDQ R14, R9

    // Load ctx.ofTable
    MOVQ ctx+16(FP), CX
    MOVQ 48(CX), CX
    MOVQ (CX)(R9*8), R9

sequenceDecs_decodeSync_amd64_skip_update:
    // Adjust offset
    MOVQ s+0(FP), CX
    MOVQ 8(SP), R13
    CMPQ AX, $0x01
    JBE sequenceDecs_decodeSync_amd64_adjust_offsetB_1_or_0
    MOVUPS 144(CX), X0
    MOVQ R13, 144(CX)
    MOVUPS X0, 152(CX)
    JMP sequenceDecs_decodeSync_amd64_after_adjust

sequenceDecs_decodeSync_amd64_adjust_offsetB_1_or_0:
    CMPQ 24(SP), $0x00000000
    JNE sequenceDecs_decodeSync_amd64_adjust_offset_maybezero
    INCQ R13
    JMP sequenceDecs_decodeSync_amd64_adjust_offset_nonzero

sequenceDecs_decodeSync_amd64_adjust_offset_maybezero:
    TESTQ R13, R13
    JNZ sequenceDecs_decodeSync_amd64_adjust_offset_nonzero
    MOVQ 144(CX), R13
    JMP sequenceDecs_decodeSync_amd64_after_adjust

sequenceDecs_decodeSync_amd64_adjust_offset_nonzero:
    MOVQ R13, AX
    XORQ R14, R14
    MOVQ $-1, R15
    CMPQ R13, $0x03
    CMOVQEQ R14, AX
    CMOVQEQ R15, R14
    ADDQ 144(CX)(AX*8), R14
    JNZ sequenceDecs_decodeSync_amd64_adjust_temp_valid
    MOVQ $0x00000001, R14

sequenceDecs_decodeSync_amd64_adjust_temp_valid:
    CMPQ R13, $0x01
    JZ sequenceDecs_decodeSync_amd64_adjust_skip
    MOVQ 152(CX), AX
    MOVQ AX, 160(CX)

sequenceDecs_decodeSync_amd64_adjust_skip:
    MOVQ 144(CX), AX
    MOVQ AX, 152(CX)
    MOVQ R14, 144(CX)
    MOVQ R14, R13

sequenceDecs_decodeSync_amd64_after_adjust:
    MOVQ R13, 8(SP)

    // Check values
    MOVQ 16(SP), AX
    MOVQ 24(SP), CX
    LEAQ (AX)(CX*1), R14
    MOVQ s+0(FP), R15
    ADDQ R14, 256(R15)
    MOVQ ctx+16(FP), R14
    SUBQ CX, 104(R14)
    JS error_not_enough_literals
    CMPQ AX, $0x00020002
    JA sequenceDecs_decodeSync_amd64_error_match_len_too_big
    TESTQ R13, R13
    JNZ sequenceDecs_decodeSync_amd64_match_len_ofs_ok
    TESTQ AX, AX
    JNZ sequenceDecs_decodeSync_amd64_error_match_len_ofs_mismatch

sequenceDecs_decodeSync_amd64_match_len_ofs_ok:
    MOVQ 24(SP), AX
    MOVQ 8(SP), CX
    MOVQ 16(SP), R13

    // Check if we have enough space in s.out
    LEAQ (AX)(R13*1), R14
    ADDQ R10, R14
    CMPQ R14, 32(SP)
    JA error_not_enough_space

    // Copy literals
    TESTQ AX, AX
    JZ check_offset
    XORQ R14, R14

copy_1:
    MOVUPS (R11)(R14*1), X0
    MOVUPS X0, (R10)(R14*1)
    ADDQ $0x10, R14
    CMPQ R14, AX
    JB copy_1
    ADDQ AX, R11
    ADDQ AX, R10
    ADDQ AX, R12

    // Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize
check_offset:
    MOVQ R12, AX
    ADDQ 40(SP), AX
    CMPQ CX, AX
    JG error_match_off_too_big
    CMPQ CX, 56(SP)
    JG error_match_off_too_big

    // Copy match from history
    MOVQ CX, AX
    SUBQ R12, AX
    JLS copy_match
    MOVQ 48(SP), R14
    SUBQ AX, R14
    CMPQ R13, AX
    JG copy_all_from_history
    MOVQ R13, AX
    SUBQ $0x10, AX
    JB copy_4_small

copy_4_loop:
    MOVUPS (R14), X0
    MOVUPS X0, (R10)
    ADDQ $0x10, R14
    ADDQ $0x10, R10
    SUBQ $0x10, AX
    JAE copy_4_loop
    LEAQ 16(R14)(AX*1), R14
    LEAQ 16(R10)(AX*1), R10
    MOVUPS -16(R14), X0
    MOVUPS X0, -16(R10)
    JMP copy_4_end

copy_4_small:
    CMPQ R13, $0x03
    JE copy_4_move_3
    CMPQ R13, $0x08
    JB copy_4_move_4through7
    JMP copy_4_move_8through16

copy_4_move_3:
    MOVW (R14), AX
    MOVB 2(R14), CL
    MOVW AX, (R10)
    MOVB CL, 2(R10)
    ADDQ R13, R14
    ADDQ R13, R10
    JMP copy_4_end

copy_4_move_4through7:
    MOVL (R14), AX
    MOVL -4(R14)(R13*1), CX
    MOVL AX, (R10)
    MOVL CX, -4(R10)(R13*1)
    ADDQ R13, R14
    ADDQ R13, R10
    JMP copy_4_end

copy_4_move_8through16:
    MOVQ (R14), AX
    MOVQ -8(R14)(R13*1), CX
    MOVQ AX, (R10)
    MOVQ CX, -8(R10)(R13*1)
    ADDQ R13, R14
    ADDQ R13, R10

copy_4_end:
    ADDQ R13, R12
    JMP handle_loop
    JMP loop_finished

copy_all_from_history:
    MOVQ AX, R15
    SUBQ $0x10, R15
    JB copy_5_small

copy_5_loop:
    MOVUPS (R14), X0
    MOVUPS X0, (R10)
    ADDQ $0x10, R14
    ADDQ $0x10, R10
    SUBQ $0x10, R15
    JAE copy_5_loop
    LEAQ 16(R14)(R15*1), R14
    LEAQ 16(R10)(R15*1), R10
    MOVUPS -16(R14), X0
    MOVUPS X0, -16(R10)
    JMP copy_5_end

copy_5_small:
    CMPQ AX, $0x03
    JE copy_5_move_3
    JB copy_5_move_1or2
    CMPQ AX, $0x08
    JB copy_5_move_4through7
    JMP copy_5_move_8through16

copy_5_move_1or2:
    MOVB (R14), R15
    MOVB -1(R14)(AX*1), BP
    MOVB R15, (R10)
    MOVB BP, -1(R10)(AX*1)
    ADDQ AX, R14
    ADDQ AX, R10
    JMP copy_5_end

copy_5_move_3:
    MOVW (R14), R15
    MOVB 2(R14), BP
    MOVW R15, (R10)
    MOVB BP, 2(R10)
    ADDQ AX, R14
    ADDQ AX, R10
    JMP copy_5_end

copy_5_move_4through7:
    MOVL (R14), R15
    MOVL -4(R14)(AX*1), BP
    MOVL R15, (R10)
    MOVL BP, -4(R10)(AX*1)
    ADDQ AX, R14
    ADDQ AX, R10
    JMP copy_5_end

copy_5_move_8through16:
    MOVQ (R14), R15
    MOVQ -8(R14)(AX*1), BP
    MOVQ R15, (R10)
    MOVQ BP, -8(R10)(AX*1)
    ADDQ AX, R14
    ADDQ AX, R10

copy_5_end:
    ADDQ AX, R12
    SUBQ AX, R13

    // Copy match from the current buffer
copy_match:
    MOVQ R10, AX
    SUBQ CX, AX

    // ml <= mo
    CMPQ R13, CX
    JA copy_overlapping_match

    // Copy non-overlapping match
    ADDQ R13, R12
    MOVQ R10, CX
    ADDQ R13, R10

copy_2:
    MOVUPS (AX), X0
    MOVUPS X0, (CX)
    ADDQ $0x10, AX
    ADDQ $0x10, CX
    SUBQ $0x10, R13
    JHI copy_2
    JMP handle_loop

    // Copy overlapping match
copy_overlapping_match:
    ADDQ R13, R12

copy_slow_3:
    MOVB (AX), CL
    MOVB CL, (R10)
    INCQ AX
    INCQ R10
    DECQ R13
    JNZ copy_slow_3

handle_loop:
    MOVQ ctx+16(FP), AX
    DECQ 96(AX)
    JNS sequenceDecs_decodeSync_amd64_main_loop

loop_finished:
    MOVQ br+8(FP), AX
    MOVQ DX, 32(AX)
    MOVB BL, 40(AX)
    MOVQ SI, 24(AX)

    // Update the context
    MOVQ ctx+16(FP), AX
    MOVQ R12, 136(AX)
    MOVQ 144(AX), CX
    SUBQ CX, R11
    MOVQ R11, 168(AX)

    // Return success
    MOVQ $0x00000000, ret+24(FP)
    RET

    // Return with match length error
sequenceDecs_decodeSync_amd64_error_match_len_ofs_mismatch:
    MOVQ 16(SP), AX
    MOVQ ctx+16(FP), CX
    MOVQ AX, 216(CX)
    MOVQ $0x00000001, ret+24(FP)
    RET

    // Return with match too long error
sequenceDecs_decodeSync_amd64_error_match_len_too_big:
    MOVQ ctx+16(FP), AX
    MOVQ 16(SP), CX
    MOVQ CX, 216(AX)
    MOVQ $0x00000002, ret+24(FP)
    RET

    // Return with match offset too long error
error_match_off_too_big:
    MOVQ ctx+16(FP), AX
    MOVQ 8(SP), CX
    MOVQ CX, 224(AX)
    MOVQ R12, 136(AX)
    MOVQ $0x00000003, ret+24(FP)
    RET

    // Return with not enough literals error
error_not_enough_literals:
    MOVQ ctx+16(FP), AX
    MOVQ 24(SP), CX
    MOVQ CX, 208(AX)
    MOVQ $0x00000004, ret+24(FP)
    RET

    // Return with not enough output space error
error_not_enough_space:
    MOVQ ctx+16(FP), AX
    MOVQ 24(SP), CX
    MOVQ CX, 208(AX)
    MOVQ 16(SP), CX
    MOVQ CX, 216(AX)
    MOVQ R12, 136(AX)
    MOVQ $0x00000005, ret+24(FP)
    RET

// func sequenceDecs_decodeSync_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
// Requires: BMI, BMI2, CMOV, SSE
TEXT ·sequenceDecs_decodeSync_bmi2(SB), $64-32
    MOVQ br+8(FP), CX
    MOVQ 32(CX), AX
    MOVBQZX 40(CX), DX
    MOVQ 24(CX), BX
    MOVQ (CX), CX
    ADDQ BX, CX
    MOVQ CX, (SP)
    MOVQ ctx+16(FP), CX
    MOVQ 72(CX), SI
    MOVQ 80(CX), DI
    MOVQ 88(CX), R8
    XORQ R9, R9
    MOVQ R9, 8(SP)
    MOVQ R9, 16(SP)
    MOVQ R9, 24(SP)
    MOVQ 112(CX), R9
    MOVQ 128(CX), R10
    MOVQ R10, 32(SP)
    MOVQ 144(CX), R10
    MOVQ 136(CX), R11
    MOVQ 200(CX), R12
    MOVQ R12, 56(SP)
    MOVQ 176(CX), R12
    MOVQ R12, 48(SP)
    MOVQ 184(CX), CX
    MOVQ CX, 40(SP)
    MOVQ 40(SP), CX
    ADDQ CX, 48(SP)

    // Calculate pointer to s.out[cap(s.out)] (a past-end pointer)
    ADDQ R9, 32(SP)

    // outBase += outPosition
    ADDQ R11, R9
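    // Per iteration: refill the bitreader, read the offset, match length
    // and literal length, update the three FSE states (skipped once
    // 96(ctx), the remaining-sequence count, hits zero), resolve repeat
    // offsets, validate, then copy the literals and the match into s.out.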
sequenceDecs_decodeSync_bmi2_main_loop:
    MOVQ (SP), R12

    // Fill bitreader to have enough for the offset and match length.
    CMPQ BX, $0x08
    JL sequenceDecs_decodeSync_bmi2_fill_byte_by_byte
    MOVQ DX, CX
    SHRQ $0x03, CX
    SUBQ CX, R12
    MOVQ (R12), AX
    SUBQ CX, BX
    ANDQ $0x07, DX
    JMP sequenceDecs_decodeSync_bmi2_fill_end

sequenceDecs_decodeSync_bmi2_fill_byte_by_byte:
    CMPQ BX, $0x00
    JLE sequenceDecs_decodeSync_bmi2_fill_end
    CMPQ DX, $0x07
    JLE sequenceDecs_decodeSync_bmi2_fill_end
    SHLQ $0x08, AX
    SUBQ $0x01, R12
    SUBQ $0x01, BX
    SUBQ $0x08, DX
    MOVBQZX (R12), CX
    ORQ CX, AX
    JMP sequenceDecs_decodeSync_bmi2_fill_byte_by_byte

sequenceDecs_decodeSync_bmi2_fill_end:
    // Update offset
    MOVQ $0x00000808, CX
    BEXTRQ CX, R8, R13
    MOVQ AX, R14
    LEAQ (DX)(R13*1), CX
    ROLQ CL, R14
    BZHIQ R13, R14, R14
    MOVQ CX, DX
    MOVQ R8, CX
    SHRQ $0x20, CX
    ADDQ R14, CX
    MOVQ CX, 8(SP)

    // Update match length
    MOVQ $0x00000808, CX
    BEXTRQ CX, DI, R13
    MOVQ AX, R14
    LEAQ (DX)(R13*1), CX
    ROLQ CL, R14
    BZHIQ R13, R14, R14
    MOVQ CX, DX
    MOVQ DI, CX
    SHRQ $0x20, CX
    ADDQ R14, CX
    MOVQ CX, 16(SP)

    // Fill bitreader to have enough for the remaining
    CMPQ BX, $0x08
    JL sequenceDecs_decodeSync_bmi2_fill_2_byte_by_byte
    MOVQ DX, CX
    SHRQ $0x03, CX
    SUBQ CX, R12
    MOVQ (R12), AX
    SUBQ CX, BX
    ANDQ $0x07, DX
    JMP sequenceDecs_decodeSync_bmi2_fill_2_end

sequenceDecs_decodeSync_bmi2_fill_2_byte_by_byte:
    CMPQ BX, $0x00
    JLE sequenceDecs_decodeSync_bmi2_fill_2_end
    CMPQ DX, $0x07
    JLE sequenceDecs_decodeSync_bmi2_fill_2_end
    SHLQ $0x08, AX
    SUBQ $0x01, R12
    SUBQ $0x01, BX
    SUBQ $0x08, DX
    MOVBQZX (R12), CX
    ORQ CX, AX
    JMP sequenceDecs_decodeSync_bmi2_fill_2_byte_by_byte

sequenceDecs_decodeSync_bmi2_fill_2_end:
    // Update literal length
    MOVQ $0x00000808, CX
    BEXTRQ CX, SI, R13
    MOVQ AX, R14
    LEAQ (DX)(R13*1), CX
    ROLQ CL, R14
    BZHIQ R13, R14, R14
    MOVQ CX, DX
    MOVQ SI, CX
    SHRQ $0x20, CX
    ADDQ R14, CX
    MOVQ CX, 24(SP)

    // Fill bitreader for state updates
    MOVQ R12, (SP)
    MOVQ $0x00000808, CX
    BEXTRQ CX, R8, R12
    MOVQ ctx+16(FP), CX
    CMPQ 96(CX), $0x00
    JZ sequenceDecs_decodeSync_bmi2_skip_update
    LEAQ (SI)(DI*1), R13
    ADDQ R8, R13
    MOVBQZX R13, R13
    LEAQ (DX)(R13*1), CX
    MOVQ AX, R14
    MOVQ CX, DX
    ROLQ CL, R14
    BZHIQ R13, R14, R14

    // Update Offset State
    BZHIQ R8, R14, CX
    SHRXQ R8, R14, R14
    MOVQ $0x00001010, R13
    BEXTRQ R13, R8, R8
    ADDQ CX, R8

    // Load ctx.ofTable
    MOVQ ctx+16(FP), CX
    MOVQ 48(CX), CX
    MOVQ (CX)(R8*8), R8

    // Update Match Length State
    BZHIQ DI, R14, CX
    SHRXQ DI, R14, R14
    MOVQ $0x00001010, R13
    BEXTRQ R13, DI, DI
    ADDQ CX, DI

    // Load ctx.mlTable
    MOVQ ctx+16(FP), CX
    MOVQ 24(CX), CX
    MOVQ (CX)(DI*8), DI

    // Update Literal Length State
    BZHIQ SI, R14, CX
    MOVQ $0x00001010, R13
    BEXTRQ R13, SI, SI
    ADDQ CX, SI

    // Load ctx.llTable
    MOVQ ctx+16(FP), CX
    MOVQ (CX), CX
    MOVQ (CX)(SI*8), SI

sequenceDecs_decodeSync_bmi2_skip_update:
    // Adjust offset
    MOVQ s+0(FP), CX
    MOVQ 8(SP), R13
    CMPQ R12, $0x01
    JBE sequenceDecs_decodeSync_bmi2_adjust_offsetB_1_or_0
    MOVUPS 144(CX), X0
    MOVQ R13, 144(CX)
    MOVUPS X0, 152(CX)
    JMP sequenceDecs_decodeSync_bmi2_after_adjust

sequenceDecs_decodeSync_bmi2_adjust_offsetB_1_or_0:
    CMPQ 24(SP), $0x00000000
    JNE sequenceDecs_decodeSync_bmi2_adjust_offset_maybezero
    INCQ R13
    JMP sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero

sequenceDecs_decodeSync_bmi2_adjust_offset_maybezero:
    TESTQ R13, R13
    JNZ sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero
    MOVQ 144(CX), R13
    JMP sequenceDecs_decodeSync_bmi2_after_adjust

sequenceDecs_decodeSync_bmi2_adjust_offset_nonzero:
    MOVQ R13, R12
    XORQ R14, R14
    MOVQ $-1, R15
    CMPQ R13, $0x03
    CMOVQEQ R14, R12
    CMOVQEQ R15, R14
    ADDQ 144(CX)(R12*8), R14
    JNZ sequenceDecs_decodeSync_bmi2_adjust_temp_valid
    MOVQ $0x00000001, R14

sequenceDecs_decodeSync_bmi2_adjust_temp_valid:
    CMPQ R13, $0x01
    JZ sequenceDecs_decodeSync_bmi2_adjust_skip
    MOVQ 152(CX), R12
    MOVQ R12, 160(CX)

sequenceDecs_decodeSync_bmi2_adjust_skip:
    MOVQ 144(CX), R12
    MOVQ R12, 152(CX)
    MOVQ R14, 144(CX)
    MOVQ R14, R13

sequenceDecs_decodeSync_bmi2_after_adjust:
    MOVQ R13, 8(SP)

    // Check values
    MOVQ 16(SP), CX
    MOVQ 24(SP), R12
    LEAQ (CX)(R12*1), R14
    MOVQ s+0(FP), R15
    ADDQ R14, 256(R15)
    MOVQ ctx+16(FP), R14
    SUBQ R12, 104(R14)
    JS error_not_enough_literals
    CMPQ CX, $0x00020002
    JA sequenceDecs_decodeSync_bmi2_error_match_len_too_big
    TESTQ R13, R13
    JNZ sequenceDecs_decodeSync_bmi2_match_len_ofs_ok
    TESTQ CX, CX
    JNZ sequenceDecs_decodeSync_bmi2_error_match_len_ofs_mismatch

sequenceDecs_decodeSync_bmi2_match_len_ofs_ok:
    MOVQ 24(SP), CX
    MOVQ 8(SP), R12
    MOVQ 16(SP), R13

    // Check if we have enough space in s.out
    LEAQ (CX)(R13*1), R14
    ADDQ R9, R14
    CMPQ R14, 32(SP)
    JA error_not_enough_space

    // Copy literals
    TESTQ CX, CX
    JZ check_offset
    XORQ R14, R14

copy_1:
    MOVUPS (R10)(R14*1), X0
    MOVUPS X0, (R9)(R14*1)
    ADDQ $0x10, R14
    CMPQ R14, CX
    JB copy_1
    ADDQ CX, R10
    ADDQ CX, R9
    ADDQ CX, R11

    // Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize
check_offset:
    MOVQ R11, CX
    ADDQ 40(SP), CX
    CMPQ R12, CX
    JG error_match_off_too_big
    CMPQ R12, 56(SP)
    JG error_match_off_too_big

    // Copy match from history
    MOVQ R12, CX
    SUBQ R11, CX
    JLS copy_match
    MOVQ 48(SP), R14
    SUBQ CX, R14
    CMPQ R13, CX
    JG copy_all_from_history
    MOVQ R13, CX
    SUBQ $0x10, CX
    JB copy_4_small

copy_4_loop:
    MOVUPS (R14), X0
    MOVUPS X0, (R9)
    ADDQ $0x10, R14
    ADDQ $0x10, R9
    SUBQ $0x10, CX
    JAE copy_4_loop
    LEAQ 16(R14)(CX*1), R14
    LEAQ 16(R9)(CX*1), R9
    MOVUPS -16(R14), X0
    MOVUPS X0, -16(R9)
    JMP copy_4_end

copy_4_small:
    CMPQ R13, $0x03
    JE copy_4_move_3
    CMPQ R13, $0x08
    JB copy_4_move_4through7
    JMP copy_4_move_8through16

copy_4_move_3:
    MOVW (R14), CX
    MOVB 2(R14), R12
    MOVW CX, (R9)
    MOVB R12, 2(R9)
    ADDQ R13, R14
    ADDQ R13, R9
    JMP copy_4_end

copy_4_move_4through7:
    MOVL (R14), CX
    MOVL -4(R14)(R13*1), R12
    MOVL CX, (R9)
    MOVL R12, -4(R9)(R13*1)
    ADDQ R13, R14
    ADDQ R13, R9
    JMP copy_4_end

copy_4_move_8through16:
    MOVQ (R14), CX
    MOVQ -8(R14)(R13*1), R12
    MOVQ CX, (R9)
    MOVQ R12, -8(R9)(R13*1)
    ADDQ R13, R14
    ADDQ R13, R9

copy_4_end:
    ADDQ R13, R11
    JMP handle_loop
    JMP loop_finished

copy_all_from_history:
    MOVQ CX, R15
    SUBQ $0x10, R15
    JB copy_5_small

copy_5_loop:
    MOVUPS (R14), X0
    MOVUPS X0, (R9)
    ADDQ $0x10, R14
    ADDQ $0x10, R9
    SUBQ $0x10, R15
    JAE copy_5_loop
    LEAQ 16(R14)(R15*1), R14
    LEAQ 16(R9)(R15*1), R9
    MOVUPS -16(R14), X0
    MOVUPS X0, -16(R9)
    JMP copy_5_end

copy_5_small:
    CMPQ CX, $0x03
    JE copy_5_move_3
    JB copy_5_move_1or2
    CMPQ CX, $0x08
    JB copy_5_move_4through7
    JMP copy_5_move_8through16

copy_5_move_1or2:
    MOVB (R14), R15
    MOVB -1(R14)(CX*1), BP
    MOVB R15, (R9)
    MOVB BP, -1(R9)(CX*1)
    ADDQ CX, R14
    ADDQ CX, R9
    JMP copy_5_end

copy_5_move_3:
    MOVW (R14), R15
    MOVB 2(R14), BP
    MOVW R15, (R9)
    MOVB BP, 2(R9)
    ADDQ CX, R14
    ADDQ CX, R9
    JMP copy_5_end

copy_5_move_4through7:
    MOVL (R14), R15
    MOVL -4(R14)(CX*1), BP
    MOVL R15, (R9)
    MOVL BP, -4(R9)(CX*1)
    ADDQ CX, R14
    ADDQ CX, R9
    JMP copy_5_end

copy_5_move_8through16:
    MOVQ (R14), R15
    MOVQ -8(R14)(CX*1), BP
    MOVQ R15, (R9)
    MOVQ BP, -8(R9)(CX*1)
    ADDQ CX, R14
    ADDQ CX, R9

copy_5_end:
    ADDQ CX, R11
    SUBQ CX, R13

    // Copy match from the current buffer
copy_match:
    MOVQ R9, CX
    SUBQ R12, CX

    // ml <= mo
    CMPQ R13, R12
    JA copy_overlapping_match

    // Copy non-overlapping match
    ADDQ R13, R11
    MOVQ R9, R12
    ADDQ R13, R9

copy_2:
    MOVUPS (CX), X0
    MOVUPS X0, (R12)
    ADDQ $0x10, CX
    ADDQ $0x10, R12
    SUBQ $0x10, R13
    JHI copy_2
    JMP handle_loop

    // Copy overlapping match
copy_overlapping_match:
    ADDQ R13, R11

copy_slow_3:
    MOVB (CX), R12
    MOVB R12, (R9)
    INCQ CX
    INCQ R9
    DECQ R13
    JNZ copy_slow_3

handle_loop:
    MOVQ ctx+16(FP), CX
    DECQ 96(CX)
    JNS sequenceDecs_decodeSync_bmi2_main_loop

loop_finished:
    MOVQ br+8(FP), CX
    MOVQ AX, 32(CX)
    MOVB DL, 40(CX)
    MOVQ BX, 24(CX)

    // Update the context
    MOVQ ctx+16(FP), AX
    MOVQ R11, 136(AX)
    MOVQ 144(AX), CX
    SUBQ CX, R10
    MOVQ R10, 168(AX)

    // Return success
    MOVQ $0x00000000, ret+24(FP)
    RET

    // Return with match length error
sequenceDecs_decodeSync_bmi2_error_match_len_ofs_mismatch:
    MOVQ 16(SP), AX
    MOVQ ctx+16(FP), CX
    MOVQ AX, 216(CX)
    MOVQ $0x00000001, ret+24(FP)
    RET

    // Return with match too long error
sequenceDecs_decodeSync_bmi2_error_match_len_too_big:
    MOVQ ctx+16(FP), AX
    MOVQ 16(SP), CX
    MOVQ CX, 216(AX)
    MOVQ $0x00000002, ret+24(FP)
    RET

    // Return with match offset too long error
error_match_off_too_big:
    MOVQ ctx+16(FP), AX
    MOVQ 8(SP), CX
    MOVQ CX, 224(AX)
    MOVQ R11, 136(AX)
    MOVQ $0x00000003, ret+24(FP)
    RET

    // Return with not enough literals error
error_not_enough_literals:
    MOVQ ctx+16(FP), AX
    MOVQ 24(SP), CX
    MOVQ CX, 208(AX)
    MOVQ $0x00000004, ret+24(FP)
    RET

    // Return with not enough output space error
error_not_enough_space:
    MOVQ ctx+16(FP), AX
    MOVQ 24(SP), CX
    MOVQ CX, 208(AX)
    MOVQ 16(SP), CX
    MOVQ CX, 216(AX)
    MOVQ R11, 136(AX)
    MOVQ $0x00000005, ret+24(FP)
    RET

// func sequenceDecs_decodeSync_safe_amd64(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
// Requires: CMOV, SSE
TEXT ·sequenceDecs_decodeSync_safe_amd64(SB), $64-32
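    // Same flow as sequenceDecs_decodeSync_amd64, but every copy either
    // rounds its vector loop down and finishes with exact-size moves or
    // uses the copy_*_move paths directly, so it never touches bytes past
    // the requested length; presumably chosen when s.out is too close to
    // its capacity for the over-copying variant.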
    MOVQ br+8(FP), AX
    MOVQ 32(AX), DX
    MOVBQZX 40(AX), BX
    MOVQ 24(AX), SI
    MOVQ (AX), AX
    ADDQ SI, AX
    MOVQ AX, (SP)
    MOVQ ctx+16(FP), AX
    MOVQ 72(AX), DI
    MOVQ 80(AX), R8
    MOVQ 88(AX), R9
    XORQ CX, CX
    MOVQ CX, 8(SP)
    MOVQ CX, 16(SP)
    MOVQ CX, 24(SP)
    MOVQ 112(AX), R10
    MOVQ 128(AX), CX
    MOVQ CX, 32(SP)
    MOVQ 144(AX), R11
    MOVQ 136(AX), R12
    MOVQ 200(AX), CX
    MOVQ CX, 56(SP)
    MOVQ 176(AX), CX
    MOVQ CX, 48(SP)
    MOVQ 184(AX), AX
    MOVQ AX, 40(SP)
    MOVQ 40(SP), AX
    ADDQ AX, 48(SP)

    // Calculate pointer to s.out[cap(s.out)] (a past-end pointer)
    ADDQ R10, 32(SP)

    // outBase += outPosition
    ADDQ R12, R10

sequenceDecs_decodeSync_safe_amd64_main_loop:
    MOVQ (SP), R13

    // Fill bitreader to have enough for the offset and match length.
    CMPQ SI, $0x08
    JL sequenceDecs_decodeSync_safe_amd64_fill_byte_by_byte
    MOVQ BX, AX
    SHRQ $0x03, AX
    SUBQ AX, R13
    MOVQ (R13), DX
    SUBQ AX, SI
    ANDQ $0x07, BX
    JMP sequenceDecs_decodeSync_safe_amd64_fill_end

sequenceDecs_decodeSync_safe_amd64_fill_byte_by_byte:
    CMPQ SI, $0x00
    JLE sequenceDecs_decodeSync_safe_amd64_fill_end
    CMPQ BX, $0x07
    JLE sequenceDecs_decodeSync_safe_amd64_fill_end
    SHLQ $0x08, DX
    SUBQ $0x01, R13
    SUBQ $0x01, SI
    SUBQ $0x08, BX
    MOVBQZX (R13), AX
    ORQ AX, DX
    JMP sequenceDecs_decodeSync_safe_amd64_fill_byte_by_byte

sequenceDecs_decodeSync_safe_amd64_fill_end:
    // Update offset
    MOVQ R9, AX
    MOVQ BX, CX
    MOVQ DX, R14
    SHLQ CL, R14
    MOVB AH, CL
    SHRQ $0x20, AX
    TESTQ CX, CX
    JZ sequenceDecs_decodeSync_safe_amd64_of_update_zero
    ADDQ CX, BX
    CMPQ BX, $0x40
    JA sequenceDecs_decodeSync_safe_amd64_of_update_zero
    CMPQ CX, $0x40
    JAE sequenceDecs_decodeSync_safe_amd64_of_update_zero
    NEGQ CX
    SHRQ CL, R14
    ADDQ R14, AX

sequenceDecs_decodeSync_safe_amd64_of_update_zero:
    MOVQ AX, 8(SP)

    // Update match length
    MOVQ R8, AX
    MOVQ BX, CX
    MOVQ DX, R14
    SHLQ CL, R14
    MOVB AH, CL
    SHRQ $0x20, AX
    TESTQ CX, CX
    JZ sequenceDecs_decodeSync_safe_amd64_ml_update_zero
    ADDQ CX, BX
    CMPQ BX, $0x40
    JA sequenceDecs_decodeSync_safe_amd64_ml_update_zero
    CMPQ CX, $0x40
    JAE sequenceDecs_decodeSync_safe_amd64_ml_update_zero
    NEGQ CX
    SHRQ CL, R14
    ADDQ R14, AX

sequenceDecs_decodeSync_safe_amd64_ml_update_zero:
    MOVQ AX, 16(SP)

    // Fill bitreader to have enough for the remaining
    CMPQ SI, $0x08
    JL sequenceDecs_decodeSync_safe_amd64_fill_2_byte_by_byte
    MOVQ BX, AX
    SHRQ $0x03, AX
    SUBQ AX, R13
    MOVQ (R13), DX
    SUBQ AX, SI
    ANDQ $0x07, BX
    JMP sequenceDecs_decodeSync_safe_amd64_fill_2_end

sequenceDecs_decodeSync_safe_amd64_fill_2_byte_by_byte:
    CMPQ SI, $0x00
    JLE sequenceDecs_decodeSync_safe_amd64_fill_2_end
    CMPQ BX, $0x07
    JLE sequenceDecs_decodeSync_safe_amd64_fill_2_end
    SHLQ $0x08, DX
    SUBQ $0x01, R13
    SUBQ $0x01, SI
    SUBQ $0x08, BX
    MOVBQZX (R13), AX
    ORQ AX, DX
    JMP sequenceDecs_decodeSync_safe_amd64_fill_2_byte_by_byte

sequenceDecs_decodeSync_safe_amd64_fill_2_end:
    // Update literal length
    MOVQ DI, AX
    MOVQ BX, CX
    MOVQ DX, R14
    SHLQ CL, R14
    MOVB AH, CL
    SHRQ $0x20, AX
    TESTQ CX, CX
    JZ sequenceDecs_decodeSync_safe_amd64_ll_update_zero
    ADDQ CX, BX
    CMPQ BX, $0x40
    JA sequenceDecs_decodeSync_safe_amd64_ll_update_zero
    CMPQ CX, $0x40
    JAE sequenceDecs_decodeSync_safe_amd64_ll_update_zero
    NEGQ CX
    SHRQ CL, R14
    ADDQ R14, AX

sequenceDecs_decodeSync_safe_amd64_ll_update_zero:
    MOVQ AX, 24(SP)

    // Fill bitreader for state updates
    MOVQ R13, (SP)
    MOVQ R9, AX
    SHRQ $0x08, AX
    MOVBQZX AL, AX
    MOVQ ctx+16(FP), CX
    CMPQ 96(CX), $0x00
    JZ sequenceDecs_decodeSync_safe_amd64_skip_update

    // Update Literal Length State
    MOVBQZX DI, R13
    SHRQ $0x10, DI
    MOVWQZX DI, DI
    LEAQ (BX)(R13*1), CX
    MOVQ DX, R14
    MOVQ CX, BX
    ROLQ CL, R14
    MOVL $0x00000001, R15
    MOVB R13, CL
    SHLL CL, R15
    DECL R15
    ANDQ R15, R14
    ADDQ R14, DI

    // Load ctx.llTable
    MOVQ ctx+16(FP), CX
    MOVQ (CX), CX
    MOVQ (CX)(DI*8), DI

    // Update Match Length State
    MOVBQZX R8, R13
    SHRQ $0x10, R8
    MOVWQZX R8, R8
    LEAQ (BX)(R13*1), CX
    MOVQ DX, R14
    MOVQ CX, BX
    ROLQ CL, R14
    MOVL $0x00000001, R15
    MOVB R13, CL
    SHLL CL, R15
    DECL R15
    ANDQ R15, R14
    ADDQ R14, R8

    // Load ctx.mlTable
    MOVQ ctx+16(FP), CX
    MOVQ 24(CX), CX
    MOVQ (CX)(R8*8), R8

    // Update Offset State
    MOVBQZX R9, R13
    SHRQ $0x10, R9
    MOVWQZX R9, R9
    LEAQ (BX)(R13*1), CX
    MOVQ DX, R14
    MOVQ CX, BX
    ROLQ CL, R14
    MOVL $0x00000001, R15
    MOVB R13, CL
    SHLL CL, R15
    DECL R15
    ANDQ R15, R14
    ADDQ R14, R9

    // Load ctx.ofTable
    MOVQ ctx+16(FP), CX
    MOVQ 48(CX), CX
    MOVQ (CX)(R9*8), R9
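    // The "Adjust offset" block below implements zstd repeat offsets: an
    // offset code with more than one extra bit installs a brand-new offset
    // and shifts the three-entry history, while small codes select a
    // recent offset. A zero literal length shifts the selection by one
    // (repeat-3 with ll == 0 becomes offset1-1), and a zero result is
    // clamped to 1 so the offset stays valid.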
sequenceDecs_decodeSync_safe_amd64_skip_update: // Adjust offset MOVQ s+0(FP), CX MOVQ 8(SP), R13 CMPQ AX, $0x01 JBE sequenceDecs_decodeSync_safe_amd64_adjust_offsetB_1_or_0 MOVUPS 144(CX), X0 MOVQ R13, 144(CX) MOVUPS X0, 152(CX) JMP sequenceDecs_decodeSync_safe_amd64_after_adjust sequenceDecs_decodeSync_safe_amd64_adjust_offsetB_1_or_0: CMPQ 24(SP), $0x00000000 JNE sequenceDecs_decodeSync_safe_amd64_adjust_offset_maybezero INCQ R13 JMP sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero sequenceDecs_decodeSync_safe_amd64_adjust_offset_maybezero: TESTQ R13, R13 JNZ sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero MOVQ 144(CX), R13 JMP sequenceDecs_decodeSync_safe_amd64_after_adjust sequenceDecs_decodeSync_safe_amd64_adjust_offset_nonzero: MOVQ R13, AX XORQ R14, R14 MOVQ $-1, R15 CMPQ R13, $0x03 CMOVQEQ R14, AX CMOVQEQ R15, R14 ADDQ 144(CX)(AX*8), R14 JNZ sequenceDecs_decodeSync_safe_amd64_adjust_temp_valid MOVQ $0x00000001, R14 sequenceDecs_decodeSync_safe_amd64_adjust_temp_valid: CMPQ R13, $0x01 JZ sequenceDecs_decodeSync_safe_amd64_adjust_skip MOVQ 152(CX), AX MOVQ AX, 160(CX) sequenceDecs_decodeSync_safe_amd64_adjust_skip: MOVQ 144(CX), AX MOVQ AX, 152(CX) MOVQ R14, 144(CX) MOVQ R14, R13 sequenceDecs_decodeSync_safe_amd64_after_adjust: MOVQ R13, 8(SP) // Check values MOVQ 16(SP), AX MOVQ 24(SP), CX LEAQ (AX)(CX*1), R14 MOVQ s+0(FP), R15 ADDQ R14, 256(R15) MOVQ ctx+16(FP), R14 SUBQ CX, 104(R14) JS error_not_enough_literals CMPQ AX, $0x00020002 JA sequenceDecs_decodeSync_safe_amd64_error_match_len_too_big TESTQ R13, R13 JNZ sequenceDecs_decodeSync_safe_amd64_match_len_ofs_ok TESTQ AX, AX JNZ sequenceDecs_decodeSync_safe_amd64_error_match_len_ofs_mismatch sequenceDecs_decodeSync_safe_amd64_match_len_ofs_ok: MOVQ 24(SP), AX MOVQ 8(SP), CX MOVQ 16(SP), R13 // Check if we have enough space in s.out LEAQ (AX)(R13*1), R14 ADDQ R10, R14 CMPQ R14, 32(SP) JA error_not_enough_space // Copy literals TESTQ AX, AX JZ check_offset MOVQ AX, R14 SUBQ $0x10, R14 JB copy_1_small copy_1_loop: MOVUPS (R11), X0 MOVUPS X0, (R10) ADDQ $0x10, R11 ADDQ $0x10, R10 SUBQ $0x10, R14 JAE copy_1_loop LEAQ 16(R11)(R14*1), R11 LEAQ 16(R10)(R14*1), R10 MOVUPS -16(R11), X0 MOVUPS X0, -16(R10) JMP copy_1_end copy_1_small: CMPQ AX, $0x03 JE copy_1_move_3 JB copy_1_move_1or2 CMPQ AX, $0x08 JB copy_1_move_4through7 JMP copy_1_move_8through16 copy_1_move_1or2: MOVB (R11), R14 MOVB -1(R11)(AX*1), R15 MOVB R14, (R10) MOVB R15, -1(R10)(AX*1) ADDQ AX, R11 ADDQ AX, R10 JMP copy_1_end copy_1_move_3: MOVW (R11), R14 MOVB 2(R11), R15 MOVW R14, (R10) MOVB R15, 2(R10) ADDQ AX, R11 ADDQ AX, R10 JMP copy_1_end copy_1_move_4through7: MOVL (R11), R14 MOVL -4(R11)(AX*1), R15 MOVL R14, (R10) MOVL R15, -4(R10)(AX*1) ADDQ AX, R11 ADDQ AX, R10 JMP copy_1_end copy_1_move_8through16: MOVQ (R11), R14 MOVQ -8(R11)(AX*1), R15 MOVQ R14, (R10) MOVQ R15, -8(R10)(AX*1) ADDQ AX, R11 ADDQ AX, R10 copy_1_end: ADDQ AX, R12 // Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize) check_offset: MOVQ R12, AX ADDQ 40(SP), AX CMPQ CX, AX JG error_match_off_too_big CMPQ CX, 56(SP) JG error_match_off_too_big // Copy match from history MOVQ CX, AX SUBQ R12, AX JLS copy_match MOVQ 48(SP), R14 SUBQ AX, R14 CMPQ R13, AX JG copy_all_from_history MOVQ R13, AX SUBQ $0x10, AX JB copy_4_small copy_4_loop: MOVUPS (R14), X0 MOVUPS X0, (R10) ADDQ $0x10, R14 ADDQ $0x10, R10 SUBQ $0x10, AX JAE copy_4_loop LEAQ 16(R14)(AX*1), R14 LEAQ 16(R10)(AX*1), R10 MOVUPS -16(R14), X0 MOVUPS X0, -16(R10) JMP copy_4_end copy_4_small: CMPQ R13, $0x03 JE copy_4_move_3 
copy_4_small:
	CMPQ R13, $0x03
	JE copy_4_move_3
	CMPQ R13, $0x08
	JB copy_4_move_4through7
	JMP copy_4_move_8through16

copy_4_move_3:
	MOVW (R14), AX
	MOVB 2(R14), CL
	MOVW AX, (R10)
	MOVB CL, 2(R10)
	ADDQ R13, R14
	ADDQ R13, R10
	JMP copy_4_end

copy_4_move_4through7:
	MOVL (R14), AX
	MOVL -4(R14)(R13*1), CX
	MOVL AX, (R10)
	MOVL CX, -4(R10)(R13*1)
	ADDQ R13, R14
	ADDQ R13, R10
	JMP copy_4_end

copy_4_move_8through16:
	MOVQ (R14), AX
	MOVQ -8(R14)(R13*1), CX
	MOVQ AX, (R10)
	MOVQ CX, -8(R10)(R13*1)
	ADDQ R13, R14
	ADDQ R13, R10

copy_4_end:
	ADDQ R13, R12
	JMP handle_loop
	JMP loop_finished

copy_all_from_history:
	MOVQ AX, R15
	SUBQ $0x10, R15
	JB copy_5_small

copy_5_loop:
	MOVUPS (R14), X0
	MOVUPS X0, (R10)
	ADDQ $0x10, R14
	ADDQ $0x10, R10
	SUBQ $0x10, R15
	JAE copy_5_loop
	LEAQ 16(R14)(R15*1), R14
	LEAQ 16(R10)(R15*1), R10
	MOVUPS -16(R14), X0
	MOVUPS X0, -16(R10)
	JMP copy_5_end

copy_5_small:
	CMPQ AX, $0x03
	JE copy_5_move_3
	JB copy_5_move_1or2
	CMPQ AX, $0x08
	JB copy_5_move_4through7
	JMP copy_5_move_8through16

copy_5_move_1or2:
	MOVB (R14), R15
	MOVB -1(R14)(AX*1), BP
	MOVB R15, (R10)
	MOVB BP, -1(R10)(AX*1)
	ADDQ AX, R14
	ADDQ AX, R10
	JMP copy_5_end

copy_5_move_3:
	MOVW (R14), R15
	MOVB 2(R14), BP
	MOVW R15, (R10)
	MOVB BP, 2(R10)
	ADDQ AX, R14
	ADDQ AX, R10
	JMP copy_5_end

copy_5_move_4through7:
	MOVL (R14), R15
	MOVL -4(R14)(AX*1), BP
	MOVL R15, (R10)
	MOVL BP, -4(R10)(AX*1)
	ADDQ AX, R14
	ADDQ AX, R10
	JMP copy_5_end

copy_5_move_8through16:
	MOVQ (R14), R15
	MOVQ -8(R14)(AX*1), BP
	MOVQ R15, (R10)
	MOVQ BP, -8(R10)(AX*1)
	ADDQ AX, R14
	ADDQ AX, R10

copy_5_end:
	ADDQ AX, R12
	SUBQ AX, R13

	// Copy match from the current buffer
copy_match:
	MOVQ R10, AX
	SUBQ CX, AX

	// ml <= mo
	CMPQ R13, CX
	JA copy_overlapping_match

	// Copy non-overlapping match
	ADDQ R13, R12
	MOVQ R13, CX
	SUBQ $0x10, CX
	JB copy_2_small

copy_2_loop:
	MOVUPS (AX), X0
	MOVUPS X0, (R10)
	ADDQ $0x10, AX
	ADDQ $0x10, R10
	SUBQ $0x10, CX
	JAE copy_2_loop
	LEAQ 16(AX)(CX*1), AX
	LEAQ 16(R10)(CX*1), R10
	MOVUPS -16(AX), X0
	MOVUPS X0, -16(R10)
	JMP copy_2_end

copy_2_small:
	CMPQ R13, $0x03
	JE copy_2_move_3
	JB copy_2_move_1or2
	CMPQ R13, $0x08
	JB copy_2_move_4through7
	JMP copy_2_move_8through16

copy_2_move_1or2:
	MOVB (AX), CL
	MOVB -1(AX)(R13*1), R14
	MOVB CL, (R10)
	MOVB R14, -1(R10)(R13*1)
	ADDQ R13, AX
	ADDQ R13, R10
	JMP copy_2_end

copy_2_move_3:
	MOVW (AX), CX
	MOVB 2(AX), R14
	MOVW CX, (R10)
	MOVB R14, 2(R10)
	ADDQ R13, AX
	ADDQ R13, R10
	JMP copy_2_end

copy_2_move_4through7:
	MOVL (AX), CX
	MOVL -4(AX)(R13*1), R14
	MOVL CX, (R10)
	MOVL R14, -4(R10)(R13*1)
	ADDQ R13, AX
	ADDQ R13, R10
	JMP copy_2_end

copy_2_move_8through16:
	MOVQ (AX), CX
	MOVQ -8(AX)(R13*1), R14
	MOVQ CX, (R10)
	MOVQ R14, -8(R10)(R13*1)
	ADDQ R13, AX
	ADDQ R13, R10

copy_2_end:
	JMP handle_loop

	// Copy overlapping match
copy_overlapping_match:
	ADDQ R13, R12

copy_slow_3:
	MOVB (AX), CL
	MOVB CL, (R10)
	INCQ AX
	INCQ R10
	DECQ R13
	JNZ copy_slow_3

handle_loop:
	MOVQ ctx+16(FP), AX
	DECQ 96(AX)
	JNS sequenceDecs_decodeSync_safe_amd64_main_loop

loop_finished:
	MOVQ br+8(FP), AX
	MOVQ DX, 32(AX)
	MOVB BL, 40(AX)
	MOVQ SI, 24(AX)

	// Update the context
	MOVQ ctx+16(FP), AX
	MOVQ R12, 136(AX)
	MOVQ 144(AX), CX
	SUBQ CX, R11
	MOVQ R11, 168(AX)

	// Return success
	MOVQ $0x00000000, ret+24(FP)
	RET

	// Return with match length error
sequenceDecs_decodeSync_safe_amd64_error_match_len_ofs_mismatch:
	MOVQ 16(SP), AX
	MOVQ ctx+16(FP), CX
	MOVQ AX, 216(CX)
	MOVQ $0x00000001, ret+24(FP)
	RET

	// Return with match too long error
sequenceDecs_decodeSync_safe_amd64_error_match_len_too_big:
	MOVQ ctx+16(FP), AX
	MOVQ 16(SP), CX
	MOVQ CX, 216(AX)
	MOVQ $0x00000002, ret+24(FP)
	RET
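	// Return codes written to ret+24(FP) by this function:
	//   0 - success
	//   1 - non-zero match length with a zero match offset
	//   2 - match length above the $0x00020002 limit
	//   3 - match offset beyond history and window size
	//   4 - literal budget exhausted
	//   5 - not enough space left in s.out
	// Each error path also stores the offending value(s) back into the
	// context so the Go caller can build a descriptive error.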
	// Return with match offset too long error
error_match_off_too_big:
	MOVQ ctx+16(FP), AX
	MOVQ 8(SP), CX
	MOVQ CX, 224(AX)
	MOVQ R12, 136(AX)
	MOVQ $0x00000003, ret+24(FP)
	RET

	// Return with not enough literals error
error_not_enough_literals:
	MOVQ ctx+16(FP), AX
	MOVQ 24(SP), CX
	MOVQ CX, 208(AX)
	MOVQ $0x00000004, ret+24(FP)
	RET

	// Return with not enough output space error
error_not_enough_space:
	MOVQ ctx+16(FP), AX
	MOVQ 24(SP), CX
	MOVQ CX, 208(AX)
	MOVQ 16(SP), CX
	MOVQ CX, 216(AX)
	MOVQ R12, 136(AX)
	MOVQ $0x00000005, ret+24(FP)
	RET

// func sequenceDecs_decodeSync_safe_bmi2(s *sequenceDecs, br *bitReader, ctx *decodeSyncAsmContext) int
// Requires: BMI, BMI2, CMOV, SSE
TEXT ·sequenceDecs_decodeSync_safe_bmi2(SB), $64-32
	MOVQ br+8(FP), CX
	MOVQ 32(CX), AX
	MOVBQZX 40(CX), DX
	MOVQ 24(CX), BX
	MOVQ (CX), CX
	ADDQ BX, CX
	MOVQ CX, (SP)
	MOVQ ctx+16(FP), CX
	MOVQ 72(CX), SI
	MOVQ 80(CX), DI
	MOVQ 88(CX), R8
	XORQ R9, R9
	MOVQ R9, 8(SP)
	MOVQ R9, 16(SP)
	MOVQ R9, 24(SP)
	MOVQ 112(CX), R9
	MOVQ 128(CX), R10
	MOVQ R10, 32(SP)
	MOVQ 144(CX), R10
	MOVQ 136(CX), R11
	MOVQ 200(CX), R12
	MOVQ R12, 56(SP)
	MOVQ 176(CX), R12
	MOVQ R12, 48(SP)
	MOVQ 184(CX), CX
	MOVQ CX, 40(SP)
	MOVQ 40(SP), CX
	ADDQ CX, 48(SP)

	// Calculate pointer to s.out[cap(s.out)] (a past-end pointer)
	ADDQ R9, 32(SP)

	// outBase += outPosition
	ADDQ R11, R9

sequenceDecs_decodeSync_safe_bmi2_main_loop:
	MOVQ (SP), R12

	// Fill bitreader to have enough for the offset and match length.
	CMPQ BX, $0x08
	JL sequenceDecs_decodeSync_safe_bmi2_fill_byte_by_byte
	MOVQ DX, CX
	SHRQ $0x03, CX
	SUBQ CX, R12
	MOVQ (R12), AX
	SUBQ CX, BX
	ANDQ $0x07, DX
	JMP sequenceDecs_decodeSync_safe_bmi2_fill_end

sequenceDecs_decodeSync_safe_bmi2_fill_byte_by_byte:
	CMPQ BX, $0x00
	JLE sequenceDecs_decodeSync_safe_bmi2_fill_end
	CMPQ DX, $0x07
	JLE sequenceDecs_decodeSync_safe_bmi2_fill_end
	SHLQ $0x08, AX
	SUBQ $0x01, R12
	SUBQ $0x01, BX
	SUBQ $0x08, DX
	MOVBQZX (R12), CX
	ORQ CX, AX
	JMP sequenceDecs_decodeSync_safe_bmi2_fill_byte_by_byte

sequenceDecs_decodeSync_safe_bmi2_fill_end:
	// Update offset
	MOVQ $0x00000808, CX
	BEXTRQ CX, R8, R13
	MOVQ AX, R14
	LEAQ (DX)(R13*1), CX
	ROLQ CL, R14
	BZHIQ R13, R14, R14
	MOVQ CX, DX
	MOVQ R8, CX
	SHRQ $0x20, CX
	ADDQ R14, CX
	MOVQ CX, 8(SP)

	// Update match length
	MOVQ $0x00000808, CX
	BEXTRQ CX, DI, R13
	MOVQ AX, R14
	LEAQ (DX)(R13*1), CX
	ROLQ CL, R14
	BZHIQ R13, R14, R14
	MOVQ CX, DX
	MOVQ DI, CX
	SHRQ $0x20, CX
	ADDQ R14, CX
	MOVQ CX, 16(SP)

	// Fill bitreader to have enough for the remaining
	CMPQ BX, $0x08
	JL sequenceDecs_decodeSync_safe_bmi2_fill_2_byte_by_byte
	MOVQ DX, CX
	SHRQ $0x03, CX
	SUBQ CX, R12
	MOVQ (R12), AX
	SUBQ CX, BX
	ANDQ $0x07, DX
	JMP sequenceDecs_decodeSync_safe_bmi2_fill_2_end

sequenceDecs_decodeSync_safe_bmi2_fill_2_byte_by_byte:
	CMPQ BX, $0x00
	JLE sequenceDecs_decodeSync_safe_bmi2_fill_2_end
	CMPQ DX, $0x07
	JLE sequenceDecs_decodeSync_safe_bmi2_fill_2_end
	SHLQ $0x08, AX
	SUBQ $0x01, R12
	SUBQ $0x01, BX
	SUBQ $0x08, DX
	MOVBQZX (R12), CX
	ORQ CX, AX
	JMP sequenceDecs_decodeSync_safe_bmi2_fill_2_byte_by_byte

sequenceDecs_decodeSync_safe_bmi2_fill_2_end:
	// Update literal length
	MOVQ $0x00000808, CX
	BEXTRQ CX, SI, R13
	MOVQ AX, R14
	LEAQ (DX)(R13*1), CX
	ROLQ CL, R14
	BZHIQ R13, R14, R14
	MOVQ CX, DX
	MOVQ SI, CX
	SHRQ $0x20, CX
	ADDQ R14, CX
	MOVQ CX, 24(SP)
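	// The BMI2 variant replaces the shift/mask/branch sequences of the
	// generic version with branch-free bit-field instructions: BEXTRQ
	// with control word $0x00000808 extracts the 8-bit "extra bits"
	// count stored at bit 8 of a state word, and BZHIQ keeps only the
	// requested low bits of the rotated bit buffer.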
	// Fill bitreader for state updates
	MOVQ R12, (SP)
	MOVQ $0x00000808, CX
	BEXTRQ CX, R8, R12
	MOVQ ctx+16(FP), CX
	CMPQ 96(CX), $0x00
	JZ sequenceDecs_decodeSync_safe_bmi2_skip_update
	LEAQ (SI)(DI*1), R13
	ADDQ R8, R13
	MOVBQZX R13, R13
	LEAQ (DX)(R13*1), CX
	MOVQ AX, R14
	MOVQ CX, DX
	ROLQ CL, R14
	BZHIQ R13, R14, R14

	// Update Offset State
	BZHIQ R8, R14, CX
	SHRXQ R8, R14, R14
	MOVQ $0x00001010, R13
	BEXTRQ R13, R8, R8
	ADDQ CX, R8

	// Load ctx.ofTable
	MOVQ ctx+16(FP), CX
	MOVQ 48(CX), CX
	MOVQ (CX)(R8*8), R8

	// Update Match Length State
	BZHIQ DI, R14, CX
	SHRXQ DI, R14, R14
	MOVQ $0x00001010, R13
	BEXTRQ R13, DI, DI
	ADDQ CX, DI

	// Load ctx.mlTable
	MOVQ ctx+16(FP), CX
	MOVQ 24(CX), CX
	MOVQ (CX)(DI*8), DI

	// Update Literal Length State
	BZHIQ SI, R14, CX
	MOVQ $0x00001010, R13
	BEXTRQ R13, SI, SI
	ADDQ CX, SI

	// Load ctx.llTable
	MOVQ ctx+16(FP), CX
	MOVQ (CX), CX
	MOVQ (CX)(SI*8), SI

sequenceDecs_decodeSync_safe_bmi2_skip_update:
	// Adjust offset
	MOVQ s+0(FP), CX
	MOVQ 8(SP), R13
	CMPQ R12, $0x01
	JBE sequenceDecs_decodeSync_safe_bmi2_adjust_offsetB_1_or_0
	MOVUPS 144(CX), X0
	MOVQ R13, 144(CX)
	MOVUPS X0, 152(CX)
	JMP sequenceDecs_decodeSync_safe_bmi2_after_adjust

sequenceDecs_decodeSync_safe_bmi2_adjust_offsetB_1_or_0:
	CMPQ 24(SP), $0x00000000
	JNE sequenceDecs_decodeSync_safe_bmi2_adjust_offset_maybezero
	INCQ R13
	JMP sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero

sequenceDecs_decodeSync_safe_bmi2_adjust_offset_maybezero:
	TESTQ R13, R13
	JNZ sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero
	MOVQ 144(CX), R13
	JMP sequenceDecs_decodeSync_safe_bmi2_after_adjust

sequenceDecs_decodeSync_safe_bmi2_adjust_offset_nonzero:
	MOVQ R13, R12
	XORQ R14, R14
	MOVQ $-1, R15
	CMPQ R13, $0x03
	CMOVQEQ R14, R12
	CMOVQEQ R15, R14
	ADDQ 144(CX)(R12*8), R14
	JNZ sequenceDecs_decodeSync_safe_bmi2_adjust_temp_valid
	MOVQ $0x00000001, R14

sequenceDecs_decodeSync_safe_bmi2_adjust_temp_valid:
	CMPQ R13, $0x01
	JZ sequenceDecs_decodeSync_safe_bmi2_adjust_skip
	MOVQ 152(CX), R12
	MOVQ R12, 160(CX)

sequenceDecs_decodeSync_safe_bmi2_adjust_skip:
	MOVQ 144(CX), R12
	MOVQ R12, 152(CX)
	MOVQ R14, 144(CX)
	MOVQ R14, R13

sequenceDecs_decodeSync_safe_bmi2_after_adjust:
	MOVQ R13, 8(SP)

	// Check values
	MOVQ 16(SP), CX
	MOVQ 24(SP), R12
	LEAQ (CX)(R12*1), R14
	MOVQ s+0(FP), R15
	ADDQ R14, 256(R15)
	MOVQ ctx+16(FP), R14
	SUBQ R12, 104(R14)
	JS error_not_enough_literals
	CMPQ CX, $0x00020002
	JA sequenceDecs_decodeSync_safe_bmi2_error_match_len_too_big
	TESTQ R13, R13
	JNZ sequenceDecs_decodeSync_safe_bmi2_match_len_ofs_ok
	TESTQ CX, CX
	JNZ sequenceDecs_decodeSync_safe_bmi2_error_match_len_ofs_mismatch

sequenceDecs_decodeSync_safe_bmi2_match_len_ofs_ok:
	MOVQ 24(SP), CX
	MOVQ 8(SP), R12
	MOVQ 16(SP), R13

	// Check if we have enough space in s.out
	LEAQ (CX)(R13*1), R14
	ADDQ R9, R14
	CMPQ R14, 32(SP)
	JA error_not_enough_space

	// Copy literals
	TESTQ CX, CX
	JZ check_offset
	MOVQ CX, R14
	SUBQ $0x10, R14
	JB copy_1_small

copy_1_loop:
	MOVUPS (R10), X0
	MOVUPS X0, (R9)
	ADDQ $0x10, R10
	ADDQ $0x10, R9
	SUBQ $0x10, R14
	JAE copy_1_loop
	LEAQ 16(R10)(R14*1), R10
	LEAQ 16(R9)(R14*1), R9
	MOVUPS -16(R10), X0
	MOVUPS X0, -16(R9)
	JMP copy_1_end

copy_1_small:
	CMPQ CX, $0x03
	JE copy_1_move_3
	JB copy_1_move_1or2
	CMPQ CX, $0x08
	JB copy_1_move_4through7
	JMP copy_1_move_8through16

copy_1_move_1or2:
	MOVB (R10), R14
	MOVB -1(R10)(CX*1), R15
	MOVB R14, (R9)
	MOVB R15, -1(R9)(CX*1)
	ADDQ CX, R10
	ADDQ CX, R9
	JMP copy_1_end

copy_1_move_3:
	MOVW (R10), R14
	MOVB 2(R10), R15
	MOVW R14, (R9)
	MOVB R15, 2(R9)
	ADDQ CX, R10
	ADDQ CX, R9
	JMP copy_1_end

copy_1_move_4through7:
	MOVL (R10), R14
	MOVL -4(R10)(CX*1), R15
	MOVL R14, (R9)
	MOVL R15, -4(R9)(CX*1)
	ADDQ CX, R10
	ADDQ CX, R9
	JMP copy_1_end

copy_1_move_8through16:
	MOVQ (R10), R14
	MOVQ -8(R10)(CX*1), R15
	MOVQ R14, (R9)
	MOVQ R15, -8(R9)(CX*1)
	ADDQ CX, R10
	ADDQ CX, R9

copy_1_end:
	ADDQ CX, R11

	// Malformed input if seq.mo > t+len(hist) || seq.mo > s.windowSize
check_offset:
	MOVQ R11, CX
	ADDQ 40(SP), CX
	CMPQ R12, CX
	JG error_match_off_too_big
	CMPQ R12, 56(SP)
	JG error_match_off_too_big
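	// The match source is chosen next: if the offset reaches back past
	// the start of the current output, the leading part of the match
	// (or all of it) comes from the history buffer and any remainder
	// from the output produced so far. Roughly, as a Go sketch
	// (variable names are illustrative, not the register mapping):
	//
	//	if mo > outPos { // match starts in history
	//		n := mo - outPos
	//		start := len(hist) - n
	//		if ml <= n { // entire match is in history
	//			copy(out[outPos:], hist[start:start+ml])
	//		} else {
	//			copy(out[outPos:], hist[start:])
	//			outPos, ml = outPos+n, ml-n
	//			// remainder is copied from out itself below
	//		}
	//	}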
	// Copy match from history
	MOVQ R12, CX
	SUBQ R11, CX
	JLS copy_match
	MOVQ 48(SP), R14
	SUBQ CX, R14
	CMPQ R13, CX
	JG copy_all_from_history
	MOVQ R13, CX
	SUBQ $0x10, CX
	JB copy_4_small

copy_4_loop:
	MOVUPS (R14), X0
	MOVUPS X0, (R9)
	ADDQ $0x10, R14
	ADDQ $0x10, R9
	SUBQ $0x10, CX
	JAE copy_4_loop
	LEAQ 16(R14)(CX*1), R14
	LEAQ 16(R9)(CX*1), R9
	MOVUPS -16(R14), X0
	MOVUPS X0, -16(R9)
	JMP copy_4_end

copy_4_small:
	CMPQ R13, $0x03
	JE copy_4_move_3
	CMPQ R13, $0x08
	JB copy_4_move_4through7
	JMP copy_4_move_8through16

copy_4_move_3:
	MOVW (R14), CX
	MOVB 2(R14), R12
	MOVW CX, (R9)
	MOVB R12, 2(R9)
	ADDQ R13, R14
	ADDQ R13, R9
	JMP copy_4_end

copy_4_move_4through7:
	MOVL (R14), CX
	MOVL -4(R14)(R13*1), R12
	MOVL CX, (R9)
	MOVL R12, -4(R9)(R13*1)
	ADDQ R13, R14
	ADDQ R13, R9
	JMP copy_4_end

copy_4_move_8through16:
	MOVQ (R14), CX
	MOVQ -8(R14)(R13*1), R12
	MOVQ CX, (R9)
	MOVQ R12, -8(R9)(R13*1)
	ADDQ R13, R14
	ADDQ R13, R9

copy_4_end:
	ADDQ R13, R11
	JMP handle_loop
	JMP loop_finished

copy_all_from_history:
	MOVQ CX, R15
	SUBQ $0x10, R15
	JB copy_5_small

copy_5_loop:
	MOVUPS (R14), X0
	MOVUPS X0, (R9)
	ADDQ $0x10, R14
	ADDQ $0x10, R9
	SUBQ $0x10, R15
	JAE copy_5_loop
	LEAQ 16(R14)(R15*1), R14
	LEAQ 16(R9)(R15*1), R9
	MOVUPS -16(R14), X0
	MOVUPS X0, -16(R9)
	JMP copy_5_end

copy_5_small:
	CMPQ CX, $0x03
	JE copy_5_move_3
	JB copy_5_move_1or2
	CMPQ CX, $0x08
	JB copy_5_move_4through7
	JMP copy_5_move_8through16

copy_5_move_1or2:
	MOVB (R14), R15
	MOVB -1(R14)(CX*1), BP
	MOVB R15, (R9)
	MOVB BP, -1(R9)(CX*1)
	ADDQ CX, R14
	ADDQ CX, R9
	JMP copy_5_end

copy_5_move_3:
	MOVW (R14), R15
	MOVB 2(R14), BP
	MOVW R15, (R9)
	MOVB BP, 2(R9)
	ADDQ CX, R14
	ADDQ CX, R9
	JMP copy_5_end

copy_5_move_4through7:
	MOVL (R14), R15
	MOVL -4(R14)(CX*1), BP
	MOVL R15, (R9)
	MOVL BP, -4(R9)(CX*1)
	ADDQ CX, R14
	ADDQ CX, R9
	JMP copy_5_end

copy_5_move_8through16:
	MOVQ (R14), R15
	MOVQ -8(R14)(CX*1), BP
	MOVQ R15, (R9)
	MOVQ BP, -8(R9)(CX*1)
	ADDQ CX, R14
	ADDQ CX, R9

copy_5_end:
	ADDQ CX, R11
	SUBQ CX, R13

	// Copy match from the current buffer
copy_match:
	MOVQ R9, CX
	SUBQ R12, CX

	// ml <= mo
	CMPQ R13, R12
	JA copy_overlapping_match

	// Copy non-overlapping match
	ADDQ R13, R11
	MOVQ R13, R12
	SUBQ $0x10, R12
	JB copy_2_small

copy_2_loop:
	MOVUPS (CX), X0
	MOVUPS X0, (R9)
	ADDQ $0x10, CX
	ADDQ $0x10, R9
	SUBQ $0x10, R12
	JAE copy_2_loop
	LEAQ 16(CX)(R12*1), CX
	LEAQ 16(R9)(R12*1), R9
	MOVUPS -16(CX), X0
	MOVUPS X0, -16(R9)
	JMP copy_2_end

copy_2_small:
	CMPQ R13, $0x03
	JE copy_2_move_3
	JB copy_2_move_1or2
	CMPQ R13, $0x08
	JB copy_2_move_4through7
	JMP copy_2_move_8through16

copy_2_move_1or2:
	MOVB (CX), R12
	MOVB -1(CX)(R13*1), R14
	MOVB R12, (R9)
	MOVB R14, -1(R9)(R13*1)
	ADDQ R13, CX
	ADDQ R13, R9
	JMP copy_2_end

copy_2_move_3:
	MOVW (CX), R12
	MOVB 2(CX), R14
	MOVW R12, (R9)
	MOVB R14, 2(R9)
	ADDQ R13, CX
	ADDQ R13, R9
	JMP copy_2_end

copy_2_move_4through7:
	MOVL (CX), R12
	MOVL -4(CX)(R13*1), R14
	MOVL R12, (R9)
	MOVL R14, -4(R9)(R13*1)
	ADDQ R13, CX
	ADDQ R13, R9
	JMP copy_2_end

copy_2_move_8through16:
	MOVQ (CX), R12
	MOVQ -8(CX)(R13*1), R14
	MOVQ R12, (R9)
	MOVQ R14, -8(R9)(R13*1)
	ADDQ R13, CX
	ADDQ R13, R9

copy_2_end:
	JMP handle_loop

	// Copy overlapping match
copy_overlapping_match:
	ADDQ R13, R11

copy_slow_3:
	MOVB (CX), R12
	MOVB R12, (R9)
	INCQ CX
	INCQ R9
	DECQ R13
	JNZ copy_slow_3
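	// Overlapping matches (ml > mo) must be copied one byte at a time:
	// each store feeds later loads, which is how zstd expresses
	// run-length-style repeats, so the wide SSE copies above would read
	// bytes that have not been written yet.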
handle_loop:
	MOVQ ctx+16(FP), CX
	DECQ 96(CX)
	JNS sequenceDecs_decodeSync_safe_bmi2_main_loop

loop_finished:
	MOVQ br+8(FP), CX
	MOVQ AX, 32(CX)
	MOVB DL, 40(CX)
	MOVQ BX, 24(CX)

	// Update the context
	MOVQ ctx+16(FP), AX
	MOVQ R11, 136(AX)
	MOVQ 144(AX), CX
	SUBQ CX, R10
	MOVQ R10, 168(AX)

	// Return success
	MOVQ $0x00000000, ret+24(FP)
	RET

	// Return with match length error
sequenceDecs_decodeSync_safe_bmi2_error_match_len_ofs_mismatch:
	MOVQ 16(SP), AX
	MOVQ ctx+16(FP), CX
	MOVQ AX, 216(CX)
	MOVQ $0x00000001, ret+24(FP)
	RET

	// Return with match too long error
sequenceDecs_decodeSync_safe_bmi2_error_match_len_too_big:
	MOVQ ctx+16(FP), AX
	MOVQ 16(SP), CX
	MOVQ CX, 216(AX)
	MOVQ $0x00000002, ret+24(FP)
	RET

	// Return with match offset too long error
error_match_off_too_big:
	MOVQ ctx+16(FP), AX
	MOVQ 8(SP), CX
	MOVQ CX, 224(AX)
	MOVQ R11, 136(AX)
	MOVQ $0x00000003, ret+24(FP)
	RET

	// Return with not enough literals error
error_not_enough_literals:
	MOVQ ctx+16(FP), AX
	MOVQ 24(SP), CX
	MOVQ CX, 208(AX)
	MOVQ $0x00000004, ret+24(FP)
	RET

	// Return with not enough output space error
error_not_enough_space:
	MOVQ ctx+16(FP), AX
	MOVQ 24(SP), CX
	MOVQ CX, 208(AX)
	MOVQ 16(SP), CX
	MOVQ CX, 216(AX)
	MOVQ R11, 136(AX)
	MOVQ $0x00000005, ret+24(FP)
	RET