// +build !appengine
// +build gc
// +build !noasm

#include "textflag.h"
#include "funcdata.h"
#include "go_asm.h"

// GOAMD64_v4 implies GOAMD64_v3: make sure the v3 code paths below are
// selected when only the v4 macro is defined.
#ifdef GOAMD64_v4
#ifndef GOAMD64_v3
#define GOAMD64_v3
#endif
#endif

// bufoff is both the loop bound for the output offset and the distance
// between the per-stream output areas (0, 256, 512, 768) used below.
#define bufoff 256 // see decompress.go, we're using [4][256]byte table

// decompress4x_main_loop_x86 decodes symbols from four bit readers in
// lock-step. Every loop iteration decodes two symbols per stream and writes
// them as one 16-bit store to buf + stream*bufoff + off, then advances off
// by 2. The loop stops when off reaches bufoff or when, after a refill, any
// reader's offset has dropped below 4.
//
// Arguments (frame offsets as used by the loads below):
//   pbr0..pbr3 +0..+24(FP)  pointers to the four bitReaderShifted structs;
//                           bitsRead, value and off are read at the top of
//                           each per-stream step and written back at its end
//   peekBits   +32(FP)      byte; shift count applied to value to obtain the
//                           table index
//   buf        +40(FP)      base of the output area
//   tbl        +48(FP)      base of the decoding table (2-byte entries)
//
// Result: the low byte of off is stored at ret+56(FP). When the loop runs to
// completion off equals bufoff (256), so the stored byte is 0 in that case.
//
// NOTE(review): the commented prototype below declares (int, bool), but the
// code stores a single byte and never returns the exhausted flag -- the
// prototype comment looks stale; confirm against the Go declaration.
//
// Register use: AX and CX are scratch (CX holds shift counts), BX collects
// the two decoded bytes (BL first, BH second), DL is scratch for SETLT and
// DH accumulates the "exhausted" flag. BP is used as br3, so it is saved in
// the 8-byte frame on entry and restored before returning.
//
//func decompress4x_main_loop_x86(pbr0, pbr1, pbr2, pbr3 *bitReaderShifted,
//	peekBits uint8, buf *byte, tbl *dEntrySingle) (int, bool)
TEXT ·decompress4x_main_loop_x86(SB), NOSPLIT, $8
#define off R8
#define buffer DI
#define table SI
#define br_bits_read R9
#define br_value R10
#define br_offset R11
#define peek_bits R12
#define exhausted DX
#define br0 R13
#define br1 R14
#define br2 R15
#define br3 BP

	// BP doubles as br3; keep the caller's value in the local frame.
	MOVQ BP, 0(SP)

	XORQ exhausted, exhausted // exhausted = false
	XORQ off, off // off = 0

	MOVBQZX peekBits+32(FP), peek_bits
	MOVQ buf+40(FP), buffer
	MOVQ tbl+48(FP), table

	MOVQ pbr0+0(FP), br0
	MOVQ pbr1+8(FP), br1
	MOVQ pbr2+16(FP), br2
	MOVQ pbr3+24(FP), br3

main_loop:
{{ define "decode_2_values_x86" }}
	// Per-stream step: load the reader state, refill it if needed, decode
	// two symbols, store them, and write the reader state back.
	// The template variable "id" selects the reader register and the
	// skip_fill label; "bufofs" is the byte offset of this stream's output.

	// const stream = {{ var "id" }}
	// br{{ var "id"}}.fillFast()
	MOVBQZX bitReaderShifted_bitsRead(br{{ var "id" }}), br_bits_read
	MOVQ bitReaderShifted_value(br{{ var "id" }}), br_value
	MOVQ bitReaderShifted_off(br{{ var "id" }}), br_offset

	// We must have at least 2 * max tablelog left
	// (unsigned compare: the refill is skipped while bits_read <= 42).
	CMPQ br_bits_read, $64-22
	JBE skip_fill{{ var "id" }}

	SUBQ $32, br_bits_read // b.bitsRead -= 32
	SUBQ $4, br_offset // b.off -= 4

	// v := b.in[b.off-4 : b.off]
	// v = v[:4]
	// low := (uint32(v[0])) | (uint32(v[1]) << 8) | (uint32(v[2]) << 16) | (uint32(v[3]) << 24)
	// (b.off in the pseudo-code above is the value before the SUBQ; the
	// load below uses the already decremented br_offset.)
	MOVQ bitReaderShifted_in(br{{ var "id" }}), AX

	// b.value |= uint64(low) << (b.bitsRead & 63)
#ifdef GOAMD64_v3
	// NOTE(review): SHLXQ with a memory source performs a 64-bit load, so
	// this path reads 8 bytes starting at b.in[b.off], whereas the #else
	// path loads 4 bytes with MOVL. Confirm the extra high bytes cannot
	// disturb br_value and that the wider read stays inside b.in.
	SHLXQ br_bits_read, 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4]) << (b.bitsRead & 63)
#else
	MOVL 0(br_offset)(AX*1), AX // AX = uint32(b.in[b.off:b.off+4])
	MOVQ br_bits_read, CX
	SHLQ CL, AX
#endif

	ORQ AX, br_value

	// exhausted = exhausted || (br{{ var "id"}}.off < 4)
	// SETLT is a signed comparison; the result lands in DL and is folded
	// into DH, which is only cleared once on function entry.
	CMPQ br_offset, $4
	SETLT DL
	ORB DL, DH
	// }
skip_fill{{ var "id" }}:

	// val0 := br{{ var "id"}}.peekTopBits(peekBits)
#ifdef GOAMD64_v3
	SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
#else
	MOVQ br_value, AX
	MOVQ peek_bits, CX
	SHRQ CL, AX // AX = (value >> peek_bits) & mask
#endif

	// v0 := table[val0&mask]
	// Each table entry is 2 bytes: the low byte (AL) is the number of bits
	// to consume, the high byte (AH) is the decoded output byte.
	MOVW 0(table)(AX*2), AX // AX - v0

	// br{{ var "id"}}.advance(uint8(v0.entry))
	MOVB AH, BL // BL = uint8(v0.entry >> 8)

#ifdef GOAMD64_v3
	// CX = n is still needed for the ADDQ below. The shift takes its count
	// from AX directly; this relies on SHLX using only the low 6 bits of
	// the count register, i.e. the low bits of the length byte in AL.
	MOVBQZX AL, CX
	SHLXQ AX, br_value, br_value // value <<= n
#else
	MOVBQZX AL, CX
	SHLQ CL, br_value // value <<= n
#endif

	ADDQ CX, br_bits_read // bits_read += n

	// Second symbol of this stream: same peek / lookup / advance sequence.
#ifdef GOAMD64_v3
	SHRXQ peek_bits, br_value, AX // AX = (value >> peek_bits) & mask
#else
	// val1 := br{{ var "id"}}.peekTopBits(peekBits)
	MOVQ peek_bits, CX
	MOVQ br_value, AX
	SHRQ CL, AX // AX = (value >> peek_bits) & mask
#endif

	// v1 := table[val1&mask]
	MOVW 0(table)(AX*2), AX // AX - v1

	// br{{ var "id"}}.advance(uint8(v1.entry))
	MOVB AH, BH // BH = uint8(v1.entry >> 8)

#ifdef GOAMD64_v3
	MOVBQZX AL, CX
	SHLXQ AX, br_value, br_value // value <<= n
#else
	MOVBQZX AL, CX
	SHLQ CL, br_value // value <<= n
#endif

	ADDQ CX, br_bits_read // bits_read += n

	// these two writes get coalesced
	// buf[stream][off] = uint8(v0.entry >> 8)
	// buf[stream][off+1] = uint8(v1.entry >> 8)
	// (BL and BH are the low and high bytes of BX, so one 16-bit store
	// writes both decoded bytes.)
	MOVW BX, {{ var "bufofs" }}(buffer)(off*1)

	// update the bit reader structure
	MOVB br_bits_read, bitReaderShifted_bitsRead(br{{ var "id" }})
	MOVQ br_value, bitReaderShifted_value(br{{ var "id" }})
	MOVQ br_offset, bitReaderShifted_off(br{{ var "id" }})
{{ end }}

	// Instantiate the per-stream step once for each of the four readers.
	// NOTE(review): "ofs" is set for every stream but is not referenced by
	// the sub-template above -- possibly a leftover; confirm before removing.
	{{ set "id" "0" }}
	{{ set "ofs" "0" }}
	{{ set "bufofs" "0" }} {{/* id * bufoff */}}
	{{ template "decode_2_values_x86" . }}

	{{ set "id" "1" }}
	{{ set "ofs" "8" }}
	{{ set "bufofs" "256" }}
	{{ template "decode_2_values_x86" . }}

	{{ set "id" "2" }}
	{{ set "ofs" "16" }}
	{{ set "bufofs" "512" }}
	{{ template "decode_2_values_x86" . }}

	{{ set "id" "3" }}
	{{ set "ofs" "24" }}
	{{ set "bufofs" "768" }}
	{{ template "decode_2_values_x86" . }}

	ADDQ $2, off // off += 2

	// Leave as soon as any refill in this iteration flagged a reader whose
	// offset dropped below 4.
	TESTB DH, DH // any br[i].ofs < 4?
	JNZ end

	// Otherwise keep going until off reaches bufoff.
	CMPQ off, $bufoff
	JL main_loop
end:
	// Restore the caller's BP saved on entry.
	MOVQ 0(SP), BP

	// Only the low byte of off is returned; off == bufoff (256) is stored
	// as 0.
	MOVB off, ret+56(FP)
	RET
#undef off
#undef buffer
#undef table
#undef br_bits_read
#undef br_value
#undef br_offset
#undef peek_bits
#undef exhausted
#undef br0
#undef br1
#undef br2
#undef br3