// Code generated by command: go run gen.go -out ../decompress_amd64.s -pkg=huff0. DO NOT EDIT.

//go:build amd64 && !appengine && !noasm && gc

// func decompress4x_main_loop_amd64(ctx *decompress4xContext)
//
// Decodes four bit streams in lock step, producing two output bytes per
// stream per loop iteration.
//
// Offsets read from ctx:
//    0  pointer to the four bit readers (consecutive, 48 bytes apart)
//    8  peekBits (one byte)
//   16  output start pointer
//   24  dstEvery, the distance between the four output regions
//   32  decode table (2-byte entries)
//   48  output limit pointer
// Offset written to ctx:
//   40  4 * (bytes written per stream)
//
// Offsets used inside each bit reader:
//    0  input base pointer
//   24  off
//   32  value
//   40  bitsRead (one byte)
//
// DL is the "exhausted" flag. It is overwritten at the top of every
// iteration with (out >= limit) and then OR-ed with (off < 4) for every
// stream that refilled. It is only tested at the bottom of the loop, so the
// iteration in which it becomes non-zero still runs to completion.
TEXT ·decompress4x_main_loop_amd64(SB), $0-8
	// Clear all of DX once; only DL is written afterwards.
	XORQ DX, DX

	// Preload values
	MOVQ    ctx+0(FP), AX
	MOVBQZX 8(AX), DI
	MOVQ    16(AX), SI
	MOVQ    48(AX), BX
	MOVQ    24(AX), R9
	MOVQ    32(AX), R10
	MOVQ    (AX), R11

	// Main loop
main_loop:
	// R8 walks the four output regions; SI stays at the stream-0 position.
	MOVQ  SI, R8
	CMPQ  R8, BX
	SETGE DL

	// br0.fillFast32()
	MOVQ    32(R11), R12
	MOVBQZX 40(R11), R13
	CMPQ    R13, $0x20
	JBE     skip_fill0
	MOVQ    24(R11), AX
	SUBQ    $0x20, R13
	SUBQ    $0x04, AX
	MOVQ    (R11), R14

	// b.value |= uint64(low) << (b.bitsRead & 63)
	MOVL (AX)(R14*1), R14
	MOVQ R13, CX
	SHLQ CL, R14
	MOVQ AX, 24(R11)
	ORQ  R14, R12

	// exhausted = exhausted || (br0.off < 4)
	// AX has already been stored back, so AL is free to hold the SETLT result.
	CMPQ  AX, $0x04
	SETLT AL
	ORB   AL, DL

skip_fill0:
	// val0 := br0.peekTopBits(peekBits)
	MOVQ R12, R14
	MOVQ DI, CX
	SHRQ CL, R14

	// v0 := table[val0&mask]
	MOVW (R10)(R14*2), CX

	// br0.advance(uint8(v0.entry))
	// CL (low byte of the entry) is the bit count, CH (high byte) the output byte.
	MOVB CH, AL
	SHLQ CL, R12
	ADDB CL, R13

	// val1 := br0.peekTopBits(peekBits)
	MOVQ DI, CX
	MOVQ R12, R14
	SHRQ CL, R14

	// v1 := table[val1&mask]
	MOVW (R10)(R14*2), CX

	// br0.advance(uint8(v1.entry))
	MOVB CH, AH
	SHLQ CL, R12
	ADDB CL, R13

	// these two writes get coalesced
	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
	MOVW AX, (R8)

	// update the bitreader structure
	MOVQ R12, 32(R11)
	MOVB R13, 40(R11)
	ADDQ R9, R8

	// br1.fillFast32()
	MOVQ    80(R11), R12
	MOVBQZX 88(R11), R13
	CMPQ    R13, $0x20
	JBE     skip_fill1
	MOVQ    72(R11), AX
	SUBQ    $0x20, R13
	SUBQ    $0x04, AX
	MOVQ    48(R11), R14

	// b.value |= uint64(low) << (b.bitsRead & 63)
	MOVL (AX)(R14*1), R14
	MOVQ R13, CX
	SHLQ CL, R14
	MOVQ AX, 72(R11)
	ORQ  R14, R12

	// exhausted = exhausted || (br1.off < 4)
	CMPQ  AX, $0x04
	SETLT AL
	ORB   AL, DL

skip_fill1:
	// val0 := br1.peekTopBits(peekBits)
	MOVQ R12, R14
	MOVQ DI, CX
	SHRQ CL, R14

	// v0 := table[val0&mask]
	MOVW (R10)(R14*2), CX

	// br1.advance(uint8(v0.entry))
	MOVB CH, AL
	SHLQ CL, R12
	ADDB CL, R13

	// val1 := br1.peekTopBits(peekBits)
	MOVQ DI, CX
	MOVQ R12, R14
	SHRQ CL, R14

	// v1 := table[val1&mask]
	MOVW (R10)(R14*2), CX

	// br1.advance(uint8(v1.entry))
	MOVB CH, AH
	SHLQ CL, R12
	ADDB CL, R13

	// these two writes get coalesced
	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
	MOVW AX, (R8)

	// update the bitreader structure
	MOVQ R12, 80(R11)
	MOVB R13, 88(R11)
	ADDQ R9, R8

	// br2.fillFast32()
	MOVQ    128(R11), R12
	MOVBQZX 136(R11), R13
	CMPQ    R13, $0x20
	JBE     skip_fill2
	MOVQ    120(R11), AX
	SUBQ    $0x20, R13
	SUBQ    $0x04, AX
	MOVQ    96(R11), R14

	// b.value |= uint64(low) << (b.bitsRead & 63)
	MOVL (AX)(R14*1), R14
	MOVQ R13, CX
	SHLQ CL, R14
	MOVQ AX, 120(R11)
	ORQ  R14, R12

	// exhausted = exhausted || (br2.off < 4)
	CMPQ  AX, $0x04
	SETLT AL
	ORB   AL, DL

skip_fill2:
	// val0 := br2.peekTopBits(peekBits)
	MOVQ R12, R14
	MOVQ DI, CX
	SHRQ CL, R14

	// v0 := table[val0&mask]
	MOVW (R10)(R14*2), CX

	// br2.advance(uint8(v0.entry))
	MOVB CH, AL
	SHLQ CL, R12
	ADDB CL, R13

	// val1 := br2.peekTopBits(peekBits)
	MOVQ DI, CX
	MOVQ R12, R14
	SHRQ CL, R14

	// v1 := table[val1&mask]
	MOVW (R10)(R14*2), CX

	// br2.advance(uint8(v1.entry))
	MOVB CH, AH
	SHLQ CL, R12
	ADDB CL, R13

	// these two writes get coalesced
	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
	MOVW AX, (R8)

	// update the bitreader structure
	MOVQ R12, 128(R11)
	MOVB R13, 136(R11)
	ADDQ R9, R8

	// br3.fillFast32()
	MOVQ    176(R11), R12
	MOVBQZX 184(R11), R13
	CMPQ    R13, $0x20
	JBE     skip_fill3
	MOVQ    168(R11), AX
	SUBQ    $0x20, R13
	SUBQ    $0x04, AX
	MOVQ    144(R11), R14

	// b.value |= uint64(low) << (b.bitsRead & 63)
	MOVL (AX)(R14*1), R14
	MOVQ R13, CX
	SHLQ CL, R14
	MOVQ AX, 168(R11)
	ORQ  R14, R12

	// exhausted = exhausted || (br3.off < 4)
	CMPQ  AX, $0x04
	SETLT AL
	ORB   AL, DL

skip_fill3:
	// val0 := br3.peekTopBits(peekBits)
	MOVQ R12, R14
	MOVQ DI, CX
	SHRQ CL, R14

	// v0 := table[val0&mask]
	MOVW (R10)(R14*2), CX

	// br3.advance(uint8(v0.entry))
	MOVB CH, AL
	SHLQ CL, R12
	ADDB CL, R13

	// val1 := br3.peekTopBits(peekBits)
	MOVQ DI, CX
	MOVQ R12, R14
	SHRQ CL, R14

	// v1 := table[val1&mask]
	MOVW (R10)(R14*2), CX

	// br3.advance(uint8(v1.entry))
	MOVB CH, AH
	SHLQ CL, R12
	ADDB CL, R13

	// these two writes get coalesced
	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
	MOVW AX, (R8)

	// update the bitreader structure
	MOVQ  R12, 176(R11)
	MOVB  R13, 184(R11)

	// Two bytes were written per stream; loop until the exhausted flag is set.
	ADDQ  $0x02, SI
	TESTB DL, DL
	JZ    main_loop

	// Store 4 * (current out - out start) at offset 40 of ctx.
	MOVQ  ctx+0(FP), AX
	SUBQ  16(AX), SI
	SHLQ  $0x02, SI
	MOVQ  SI, 40(AX)
	RET

// func decompress4x_8b_main_loop_amd64(ctx *decompress4xContext)
//
// Same structure as decompress4x_main_loop_amd64, but four bytes are decoded
// per stream between refills and stored with a single 32-bit write. The ctx
// and bit reader offsets used are identical; the value stored at offset 40 of
// ctx on return is again 4 * (bytes written per stream).
//
// The four output bytes are assembled in AX: v0 and v1 go into AL/AH, BSWAPL
// moves them to the top two bytes, v2 and v3 go into AH/AL, and a second
// BSWAPL leaves v0..v3 in memory order.
TEXT ·decompress4x_8b_main_loop_amd64(SB), $0-8
	// Clear all of DX once; only DL is written afterwards.
	XORQ DX, DX

	// Preload values
	MOVQ    ctx+0(FP), CX
	MOVBQZX 8(CX), DI
	MOVQ    16(CX), BX
	MOVQ    48(CX), SI
	MOVQ    24(CX), R9
	MOVQ    32(CX), R10
	MOVQ    (CX), R11

	// Main loop
main_loop:
	// R8 walks the four output regions; BX stays at the stream-0 position.
	MOVQ  BX, R8
	CMPQ  R8, SI
	SETGE DL

	// br0.fillFast32()
	MOVQ    32(R11), R12
	MOVBQZX 40(R11), R13
	CMPQ    R13, $0x20
	JBE     skip_fill0
	MOVQ    24(R11), R14
	SUBQ    $0x20, R13
	SUBQ    $0x04, R14
	MOVQ    (R11), R15

	// b.value |= uint64(low) << (b.bitsRead & 63)
	MOVL (R14)(R15*1), R15
	MOVQ R13, CX
	SHLQ CL, R15
	MOVQ R14, 24(R11)
	ORQ  R15, R12

	// exhausted = exhausted || (br0.off < 4)
	CMPQ  R14, $0x04
	SETLT AL
	ORB   AL, DL

skip_fill0:
	// val0 := br0.peekTopBits(peekBits)
	MOVQ R12, R14
	MOVQ DI, CX
	SHRQ CL, R14

	// v0 := table[val0&mask]
	MOVW (R10)(R14*2), CX

	// br0.advance(uint8(v0.entry))
	MOVB CH, AL
	SHLQ CL, R12
	ADDB CL, R13

	// val1 := br0.peekTopBits(peekBits)
	MOVQ R12, R14
	MOVQ DI, CX
	SHRQ CL, R14

	// v1 := table[val1&mask]
	MOVW (R10)(R14*2), CX

	// br0.advance(uint8(v1.entry))
	MOVB   CH, AH
	SHLQ   CL, R12
	ADDB   CL, R13
	BSWAPL AX

	// val2 := br0.peekTopBits(peekBits)
	MOVQ R12, R14
	MOVQ DI, CX
	SHRQ CL, R14

	// v2 := table[val2&mask]
	MOVW (R10)(R14*2), CX

	// br0.advance(uint8(v2.entry))
	MOVB CH, AH
	SHLQ CL, R12
	ADDB CL, R13

	// val3 := br0.peekTopBits(peekBits)
	MOVQ R12, R14
	MOVQ DI, CX
	SHRQ CL, R14

	// v3 := table[val3&mask]
	MOVW (R10)(R14*2), CX

	// br0.advance(uint8(v3.entry))
	MOVB   CH, AL
	SHLQ   CL, R12
	ADDB   CL, R13
	BSWAPL AX

	// these four writes get coalesced
	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
	// out[id * dstEvery + 2] = uint8(v2.entry >> 8)
	// out[id * dstEvery + 3] = uint8(v3.entry >> 8)
	MOVL AX, (R8)

	// update the bitreader structure
	MOVQ R12, 32(R11)
	MOVB R13, 40(R11)
	ADDQ R9, R8

	// br1.fillFast32()
	MOVQ    80(R11), R12
	MOVBQZX 88(R11), R13
	CMPQ    R13, $0x20
	JBE     skip_fill1
	MOVQ    72(R11), R14
	SUBQ    $0x20, R13
	SUBQ    $0x04, R14
	MOVQ    48(R11), R15

	// b.value |= uint64(low) << (b.bitsRead & 63)
	MOVL (R14)(R15*1), R15
	MOVQ R13, CX
	SHLQ CL, R15
	MOVQ R14, 72(R11)
	ORQ  R15, R12

	// exhausted = exhausted || (br1.off < 4)
	CMPQ  R14, $0x04
	SETLT AL
	ORB   AL, DL

skip_fill1:
	// val0 := br1.peekTopBits(peekBits)
	MOVQ R12, R14
	MOVQ DI, CX
	SHRQ CL, R14

	// v0 := table[val0&mask]
	MOVW (R10)(R14*2), CX

	// br1.advance(uint8(v0.entry))
	MOVB CH, AL
	SHLQ CL, R12
	ADDB CL, R13

	// val1 := br1.peekTopBits(peekBits)
	MOVQ R12, R14
	MOVQ DI, CX
	SHRQ CL, R14

	// v1 := table[val1&mask]
	MOVW (R10)(R14*2), CX

	// br1.advance(uint8(v1.entry))
	MOVB   CH, AH
	SHLQ   CL, R12
	ADDB   CL, R13
	BSWAPL AX

	// val2 := br1.peekTopBits(peekBits)
	MOVQ R12, R14
	MOVQ DI, CX
	SHRQ CL, R14

	// v2 := table[val2&mask]
	MOVW (R10)(R14*2), CX

	// br1.advance(uint8(v2.entry))
	MOVB CH, AH
	SHLQ CL, R12
	ADDB CL, R13

	// val3 := br1.peekTopBits(peekBits)
	MOVQ R12, R14
	MOVQ DI, CX
	SHRQ CL, R14

	// v3 := table[val3&mask]
	MOVW (R10)(R14*2), CX

	// br1.advance(uint8(v3.entry))
	MOVB   CH, AL
	SHLQ   CL, R12
	ADDB   CL, R13
	BSWAPL AX

	// these four writes get coalesced
	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
	// out[id * dstEvery + 2] = uint8(v2.entry >> 8)
	// out[id * dstEvery + 3] = uint8(v3.entry >> 8)
	MOVL AX, (R8)

	// update the bitreader structure
	MOVQ R12, 80(R11)
	MOVB R13, 88(R11)
	ADDQ R9, R8

	// br2.fillFast32()
	MOVQ    128(R11), R12
	MOVBQZX 136(R11), R13
	CMPQ    R13, $0x20
	JBE     skip_fill2
	MOVQ    120(R11), R14
	SUBQ    $0x20, R13
	SUBQ    $0x04, R14
	MOVQ    96(R11), R15

	// b.value |= uint64(low) << (b.bitsRead & 63)
	MOVL (R14)(R15*1), R15
	MOVQ R13, CX
	SHLQ CL, R15
	MOVQ R14, 120(R11)
	ORQ  R15, R12

	// exhausted = exhausted || (br2.off < 4)
	CMPQ  R14, $0x04
	SETLT AL
	ORB   AL, DL

skip_fill2:
	// val0 := br2.peekTopBits(peekBits)
	MOVQ R12, R14
	MOVQ DI, CX
	SHRQ CL, R14

	// v0 := table[val0&mask]
	MOVW (R10)(R14*2), CX

	// br2.advance(uint8(v0.entry))
	MOVB CH, AL
	SHLQ CL, R12
	ADDB CL, R13

	// val1 := br2.peekTopBits(peekBits)
	MOVQ R12, R14
	MOVQ DI, CX
	SHRQ CL, R14

	// v1 := table[val1&mask]
	MOVW (R10)(R14*2), CX

	// br2.advance(uint8(v1.entry))
	MOVB   CH, AH
	SHLQ   CL, R12
	ADDB   CL, R13
	BSWAPL AX

	// val2 := br2.peekTopBits(peekBits)
	MOVQ R12, R14
	MOVQ DI, CX
	SHRQ CL, R14

	// v2 := table[val2&mask]
	MOVW (R10)(R14*2), CX

	// br2.advance(uint8(v2.entry))
	MOVB CH, AH
	SHLQ CL, R12
	ADDB CL, R13

	// val3 := br2.peekTopBits(peekBits)
	MOVQ R12, R14
	MOVQ DI, CX
	SHRQ CL, R14

	// v3 := table[val3&mask]
	MOVW (R10)(R14*2), CX

	// br2.advance(uint8(v3.entry))
	MOVB   CH, AL
	SHLQ   CL, R12
	ADDB   CL, R13
	BSWAPL AX

	// these four writes get coalesced
	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
	// out[id * dstEvery + 2] = uint8(v2.entry >> 8)
	// out[id * dstEvery + 3] = uint8(v3.entry >> 8)
	MOVL AX, (R8)

	// update the bitreader structure
	MOVQ R12, 128(R11)
	MOVB R13, 136(R11)
	ADDQ R9, R8

	// br3.fillFast32()
	MOVQ    176(R11), R12
	MOVBQZX 184(R11), R13
	CMPQ    R13, $0x20
	JBE     skip_fill3
	MOVQ    168(R11), R14
	SUBQ    $0x20, R13
	SUBQ    $0x04, R14
	MOVQ    144(R11), R15

	// b.value |= uint64(low) << (b.bitsRead & 63)
	MOVL (R14)(R15*1), R15
	MOVQ R13, CX
	SHLQ CL, R15
	MOVQ R14, 168(R11)
	ORQ  R15, R12

	// exhausted = exhausted || (br3.off < 4)
	CMPQ  R14, $0x04
	SETLT AL
	ORB   AL, DL

skip_fill3:
	// val0 := br3.peekTopBits(peekBits)
	MOVQ R12, R14
	MOVQ DI, CX
	SHRQ CL, R14

	// v0 := table[val0&mask]
	MOVW (R10)(R14*2), CX

	// br3.advance(uint8(v0.entry))
	MOVB CH, AL
	SHLQ CL, R12
	ADDB CL, R13

	// val1 := br3.peekTopBits(peekBits)
	MOVQ R12, R14
	MOVQ DI, CX
	SHRQ CL, R14

	// v1 := table[val1&mask]
	MOVW (R10)(R14*2), CX

	// br3.advance(uint8(v1.entry))
	MOVB   CH, AH
	SHLQ   CL, R12
	ADDB   CL, R13
	BSWAPL AX

	// val2 := br3.peekTopBits(peekBits)
	MOVQ R12, R14
	MOVQ DI, CX
	SHRQ CL, R14

	// v2 := table[val2&mask]
	MOVW (R10)(R14*2), CX

	// br3.advance(uint8(v2.entry))
	MOVB CH, AH
	SHLQ CL, R12
	ADDB CL, R13

	// val3 := br3.peekTopBits(peekBits)
	MOVQ R12, R14
	MOVQ DI, CX
	SHRQ CL, R14

	// v3 := table[val3&mask]
	MOVW (R10)(R14*2), CX

	// br3.advance(uint8(v3.entry))
	MOVB   CH, AL
	SHLQ   CL, R12
	ADDB   CL, R13
	BSWAPL AX

	// these four writes get coalesced
	// out[id * dstEvery + 0] = uint8(v0.entry >> 8)
	// out[id * dstEvery + 1] = uint8(v1.entry >> 8)
	// out[id * dstEvery + 2] = uint8(v2.entry >> 8)
	// out[id * dstEvery + 3] = uint8(v3.entry >> 8)
	MOVL AX, (R8)

	// update the bitreader structure
	MOVQ  R12, 176(R11)
	MOVB  R13, 184(R11)

	// Four bytes were written per stream; loop until the exhausted flag is set.
	ADDQ  $0x04, BX
	TESTB DL, DL
	JZ    main_loop

	// Store 4 * (current out - out start) at offset 40 of ctx.
	MOVQ  ctx+0(FP), AX
	SUBQ  16(AX), BX
	SHLQ  $0x02, BX
	MOVQ  BX, 40(AX)
	RET

// func decompress1x_main_loop_amd64(ctx *decompress1xContext)
//
// Decodes a single bit stream, four output bytes per loop iteration, keeping
// the bit reader state in registers for the whole loop.
//
// Offsets read from ctx:
//    0  pointer to the bit reader
//    8  peek shift (one byte), used as the SHRQ count for table lookups
//   16  output start pointer
//   24  output size in bytes
//   32  decode table (2-byte entries)
// Offset written to ctx:
//   40  number of bytes written, or -1 on error
//
// Bit reader fields (same offsets as in the 4x routines; names presumed to
// match — TODO confirm against the Go struct): 0 = input base pointer,
// 24 = off, 32 = value, 40 = bitsRead. The last three are written back on the
// normal exit path only.
//
// Register use: DX = output cursor, BX = output end, R8 = input base,
// R9 = off, R10 = value, R11 = bitsRead, SI = table, DI = peek shift.
TEXT ·decompress1x_main_loop_amd64(SB), $0-8
	MOVQ    ctx+0(FP), CX
	MOVQ    16(CX), DX
	MOVQ    24(CX), BX

	// An output size below 4 is reported as an error before any decoding.
	CMPQ    BX, $0x04
	JB      error_max_decoded_size_exeeded

	// BX = output start + output size.
	LEAQ    (DX)(BX*1), BX
	MOVQ    (CX), SI
	MOVQ    (SI), R8
	MOVQ    24(SI), R9
	MOVQ    32(SI), R10
	MOVBQZX 40(SI), R11
	MOVQ    32(CX), SI
	MOVBQZX 8(CX), DI
	JMP     loop_condition

main_loop:
	// Check if we have room for 4 bytes in the output buffer
	// (the error path is taken when out+4 >= end, signed compare).
	LEAQ 4(DX), CX
	CMPQ CX, BX
	JGE  error_max_decoded_size_exeeded

	// Decode 4 values
	// Refill: when bitsRead >= 32, step off back by 4, load 4 input bytes
	// and OR them into value shifted left by the new bitsRead.
	CMPQ R11, $0x20
	JL   bitReader_fillFast_1_end
	SUBQ $0x20, R11
	SUBQ $0x04, R9
	MOVL (R8)(R9*1), R12
	MOVQ R11, CX
	SHLQ CL, R12
	ORQ  R12, R10

bitReader_fillFast_1_end:
	// Value 0: index the table with value >> peek shift. The entry's low
	// byte (CL) is the bit count to consume, its high byte (CH) the output.
	MOVQ    DI, CX
	MOVQ    R10, R12
	SHRQ    CL, R12
	MOVW    (SI)(R12*2), CX
	MOVB    CH, AL
	MOVBQZX CL, CX
	ADDQ    CX, R11
	SHLQ    CL, R10

	// Value 1 -> AH.
	MOVQ    DI, CX
	MOVQ    R10, R12
	SHRQ    CL, R12
	MOVW    (SI)(R12*2), CX
	MOVB    CH, AH
	MOVBQZX CL, CX
	ADDQ    CX, R11
	SHLQ    CL, R10

	// Move values 0 and 1 to the top two bytes of AX.
	BSWAPL  AX

	// Second refill, same as the first.
	CMPQ    R11, $0x20
	JL      bitReader_fillFast_2_end
	SUBQ    $0x20, R11
	SUBQ    $0x04, R9
	MOVL    (R8)(R9*1), R12
	MOVQ    R11, CX
	SHLQ    CL, R12
	ORQ     R12, R10

bitReader_fillFast_2_end:
	// Value 2 -> AH.
	MOVQ    DI, CX
	MOVQ    R10, R12
	SHRQ    CL, R12
	MOVW    (SI)(R12*2), CX
	MOVB    CH, AH
	MOVBQZX CL, CX
	ADDQ    CX, R11
	SHLQ    CL, R10

	// Value 3 -> AL.
	MOVQ    DI, CX
	MOVQ    R10, R12
	SHRQ    CL, R12
	MOVW    (SI)(R12*2), CX
	MOVB    CH, AL
	MOVBQZX CL, CX
	ADDQ    CX, R11
	SHLQ    CL, R10

	// Second swap puts values 0..3 into memory order.
	BSWAPL  AX

	// Store the decoded values
	MOVL AX, (DX)
	ADDQ $0x04, DX

loop_condition:
	// Keep looping while off >= 8 (signed compare).
	CMPQ R9, $0x08
	JGE  main_loop

	// Update ctx structure
	// Bytes written go to offset 40 of ctx; off, value and bitsRead are
	// written back to the bit reader.
	MOVQ ctx+0(FP), AX
	SUBQ 16(AX), DX
	MOVQ DX, 40(AX)
	MOVQ (AX), AX
	MOVQ R9, 24(AX)
	MOVQ R10, 32(AX)
	MOVB R11, 40(AX)
	RET

	// Report error
	// Stores -1 at offset 40 of ctx; the bit reader is not written back.
error_max_decoded_size_exeeded:
	MOVQ ctx+0(FP), AX
	MOVQ $-1, CX
	MOVQ CX, 40(AX)
	RET

// func decompress1x_main_loop_bmi2(ctx *decompress1xContext)
// Requires: BMI2
//
// BMI2 variant of decompress1x_main_loop_amd64: same ctx and bit reader
// offsets, same register assignment, same exit and error paths. The variable
// shifts use SHLXQ/SHRXQ, which take the count from any register, so CX is
// used as scratch rather than being reserved for the shift count.
TEXT ·decompress1x_main_loop_bmi2(SB), $0-8
	MOVQ    ctx+0(FP), CX
	MOVQ    16(CX), DX
	MOVQ    24(CX), BX

	// An output size below 4 is reported as an error before any decoding.
	CMPQ    BX, $0x04
	JB      error_max_decoded_size_exeeded

	// BX = output start + output size.
	LEAQ    (DX)(BX*1), BX
	MOVQ    (CX), SI
	MOVQ    (SI), R8
	MOVQ    24(SI), R9
	MOVQ    32(SI), R10
	MOVBQZX 40(SI), R11
	MOVQ    32(CX), SI
	MOVBQZX 8(CX), DI
	JMP     loop_condition

main_loop:
	// Check if we have room for 4 bytes in the output buffer
	// (the error path is taken when out+4 >= end, signed compare).
	LEAQ 4(DX), CX
	CMPQ CX, BX
	JGE  error_max_decoded_size_exeeded

	// Decode 4 values
	// Refill: when bitsRead >= 32, step off back by 4, load 4 input bytes
	// and OR them into value shifted left by the new bitsRead.
	CMPQ  R11, $0x20
	JL    bitReader_fillFast_1_end
	SUBQ  $0x20, R11
	SUBQ  $0x04, R9
	MOVL  (R8)(R9*1), CX
	SHLXQ R11, CX, CX
	ORQ   CX, R10

bitReader_fillFast_1_end:
	// Value 0: index the table with value >> peek shift. The entry's low
	// byte (CL) is the bit count to consume, its high byte (CH) the output.
	SHRXQ   DI, R10, CX
	MOVW    (SI)(CX*2), CX
	MOVB    CH, AL
	MOVBQZX CL, CX
	ADDQ    CX, R11
	SHLXQ   CX, R10, R10

	// Value 1 -> AH.
	SHRXQ   DI, R10, CX
	MOVW    (SI)(CX*2), CX
	MOVB    CH, AH
	MOVBQZX CL, CX
	ADDQ    CX, R11
	SHLXQ   CX, R10, R10

	// Move values 0 and 1 to the top two bytes of AX.
	BSWAPL  AX

	// Second refill, same as the first.
	CMPQ    R11, $0x20
	JL      bitReader_fillFast_2_end
	SUBQ    $0x20, R11
	SUBQ    $0x04, R9
	MOVL    (R8)(R9*1), CX
	SHLXQ   R11, CX, CX
	ORQ     CX, R10

bitReader_fillFast_2_end:
	// Value 2 -> AH.
	SHRXQ   DI, R10, CX
	MOVW    (SI)(CX*2), CX
	MOVB    CH, AH
	MOVBQZX CL, CX
	ADDQ    CX, R11
	SHLXQ   CX, R10, R10

	// Value 3 -> AL.
	SHRXQ   DI, R10, CX
	MOVW    (SI)(CX*2), CX
	MOVB    CH, AL
	MOVBQZX CL, CX
	ADDQ    CX, R11
	SHLXQ   CX, R10, R10

	// Second swap puts values 0..3 into memory order.
	BSWAPL  AX

	// Store the decoded values
	MOVL AX, (DX)
	ADDQ $0x04, DX

loop_condition:
	// Keep looping while off >= 8 (signed compare).
	CMPQ R9, $0x08
	JGE  main_loop

	// Update ctx structure
	// Bytes written go to offset 40 of ctx; off, value and bitsRead are
	// written back to the bit reader.
	MOVQ ctx+0(FP), AX
	SUBQ 16(AX), DX
	MOVQ DX, 40(AX)
	MOVQ (AX), AX
	MOVQ R9, 24(AX)
	MOVQ R10, 32(AX)
	MOVB R11, 40(AX)
	RET

	// Report error
	// Stores -1 at offset 40 of ctx; the bit reader is not written back.
error_max_decoded_size_exeeded:
	MOVQ ctx+0(FP), AX
	MOVQ $-1, CX
	MOVQ CX, 40(AX)
	RET