//go:build amd64 && !appengine && !noasm && gc
// +build amd64,!appengine,!noasm,gc

// This file contains the specialisation of Decoder.Decompress4X
// and Decoder.Decompress1X that use an asm implementation of their main loops.
package huff0

import (
	"errors"
	"fmt"

	"github.com/klauspost/compress/internal/cpuinfo"
)

// decompress4x_main_loop_amd64 is an x86 assembler implementation
// of the Decompress4X main loop, used when tablelog > 8.
// It reads its inputs from ctx and reports progress in ctx.decoded.
//
//go:noescape
func decompress4x_main_loop_amd64(ctx *decompress4xContext)

// decompress4x_8b_main_loop_amd64 is an x86 assembler implementation
// of the Decompress4X main loop, used when tablelog <= 8, which decodes
// 4 entries per loop.
// It reads its inputs from ctx and reports progress in ctx.decoded.
//
//go:noescape
func decompress4x_8b_main_loop_amd64(ctx *decompress4xContext)

// fallback8BitSize is the size where using Go version is faster.
// Decompress4X compares it against cap(dst): when the table log is <= 8 and
// the destination capacity is below this value, decoding is delegated to
// decompress4X8bit instead of the assembler loops.
const fallback8BitSize = 800

// decompress4xContext carries the state shared between Decompress4X and the
// assembler main loops declared above.
//
// NOTE(review): the assembly presumably addresses these fields by offset, so
// field order, types and sizes should not be changed without updating the
// .s file - confirm against the assembly before editing.
type decompress4xContext struct {
	// pbr points at the four bit readers, one per encoded stream.
	pbr *[4]bitReaderShifted
	// peekBits is set by Decompress4X to (64 - actualTableLog) & 63;
	// see bitReaderShifted.peekBitsFast().
	peekBits uint8
	// out points at the first byte of the destination buffer.
	out *byte
	// dstEvery is the distance in bytes between the start of consecutive
	// output quarters, computed as (dstSize + 3) / 4.
	dstEvery int
	// tbl points at the first entry of the single-symbol decoding table.
	tbl *dEntrySingle
	// decoded is written by the main loop. Decompress4X treats it as the
	// total number of bytes produced across all four streams and advances
	// each stream's output position by decoded/4.
	decoded int
	// limit points at out[dstEvery-4]; decoding stops when the first
	// stream's output reaches it to avoid writing out of bounds on the last.
	limit *byte
}

// Decompress4X will decompress a 4X encoded stream.
// The length of the supplied input must match the end of a block exactly.
// The *capacity* of the dst slice must match the destination size of
// the uncompressed data exactly.
func (d *Decoder) Decompress4X(dst, src []byte) ([]byte, error) { if len(d.dt.single) == 0 { return nil, errors.New("no table loaded") } if len(src) < 6+(4*1) { return nil, errors.New("input too small") } use8BitTables := d.actualTableLog <= 8 if cap(dst) < fallback8BitSize && use8BitTables { return d.decompress4X8bit(dst, src) } var br [4]bitReaderShifted // Decode "jump table" start := 6 for i := 0; i < 3; i++ { length := int(src[i*2]) | (int(src[i*2+1]) << 8) if start+length >= len(src) { return nil, errors.New("truncated input (or invalid offset)") } err := br[i].init(src[start : start+length]) if err != nil { return nil, err } start += length } err := br[3].init(src[start:]) if err != nil { return nil, err } // destination, offset to match first output dstSize := cap(dst) dst = dst[:dstSize] out := dst dstEvery := (dstSize + 3) / 4 const tlSize = 1 << tableLogMax const tlMask = tlSize - 1 single := d.dt.single[:tlSize] var decoded int if len(out) > 4*4 && !(br[0].off < 4 || br[1].off < 4 || br[2].off < 4 || br[3].off < 4) { ctx := decompress4xContext{ pbr: &br, peekBits: uint8((64 - d.actualTableLog) & 63), // see: bitReaderShifted.peekBitsFast() out: &out[0], dstEvery: dstEvery, tbl: &single[0], limit: &out[dstEvery-4], // Always stop decoding when first buffer gets here to avoid writing OOB on last. } if use8BitTables { decompress4x_8b_main_loop_amd64(&ctx) } else { decompress4x_main_loop_amd64(&ctx) } decoded = ctx.decoded out = out[decoded/4:] } // Decode remaining. remainBytes := dstEvery - (decoded / 4) for i := range br { offset := dstEvery * i endsAt := offset + remainBytes if endsAt > len(out) { endsAt = len(out) } br := &br[i] bitsLeft := br.remaining() for bitsLeft > 0 { br.fill() if offset >= endsAt { return nil, errors.New("corruption detected: stream overrun 4") } // Read value and increment offset. 
val := br.peekBitsFast(d.actualTableLog) v := single[val&tlMask].entry nBits := uint8(v) br.advance(nBits) bitsLeft -= uint(nBits) out[offset] = uint8(v >> 8) offset++ } if offset != endsAt { return nil, fmt.Errorf("corruption detected: short output block %d, end %d != %d", i, offset, endsAt) } decoded += offset - dstEvery*i err = br.close() if err != nil { return nil, err } } if dstSize != decoded { return nil, errors.New("corruption detected: short output block") } return dst, nil } // decompress4x_main_loop_x86 is an x86 assembler implementation // of Decompress1X when tablelog > 8. // //go:noescape func decompress1x_main_loop_amd64(ctx *decompress1xContext) // decompress4x_main_loop_x86 is an x86 with BMI2 assembler implementation // of Decompress1X when tablelog > 8. // //go:noescape func decompress1x_main_loop_bmi2(ctx *decompress1xContext) type decompress1xContext struct { pbr *bitReaderShifted peekBits uint8 out *byte outCap int tbl *dEntrySingle decoded int } // Error reported by asm implementations const error_max_decoded_size_exeeded = -1 // Decompress1X will decompress a 1X encoded stream. // The cap of the output buffer will be the maximum decompressed size. // The length of the supplied input must match the end of a block exactly. 
func (d *Decoder) Decompress1X(dst, src []byte) ([]byte, error) { if len(d.dt.single) == 0 { return nil, errors.New("no table loaded") } var br bitReaderShifted err := br.init(src) if err != nil { return dst, err } maxDecodedSize := cap(dst) dst = dst[:maxDecodedSize] const tlSize = 1 << tableLogMax const tlMask = tlSize - 1 if maxDecodedSize >= 4 { ctx := decompress1xContext{ pbr: &br, out: &dst[0], outCap: maxDecodedSize, peekBits: uint8((64 - d.actualTableLog) & 63), // see: bitReaderShifted.peekBitsFast() tbl: &d.dt.single[0], } if cpuinfo.HasBMI2() { decompress1x_main_loop_bmi2(&ctx) } else { decompress1x_main_loop_amd64(&ctx) } if ctx.decoded == error_max_decoded_size_exeeded { return nil, ErrMaxDecodedSizeExceeded } dst = dst[:ctx.decoded] } // br < 8, so uint8 is fine bitsLeft := uint8(br.off)*8 + 64 - br.bitsRead for bitsLeft > 0 { br.fill() if len(dst) >= maxDecodedSize { br.close() return nil, ErrMaxDecodedSizeExceeded } v := d.dt.single[br.peekBitsFast(d.actualTableLog)&tlMask] nBits := uint8(v.entry) br.advance(nBits) bitsLeft -= nBits dst = append(dst, uint8(v.entry>>8)) } return dst, br.close() }