// Copyright 2014-2021 Ulrich Kunitz. All rights reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. package lzma import ( "errors" "fmt" "github.com/ulikunitz/xz/internal/hash" ) /* For compression we need to find byte sequences that match the byte * sequence at the dictionary head. A hash table is a simple method to * provide this capability. */ // maxMatches limits the number of matches requested from the Matches // function. This controls the speed of the overall encoding. const maxMatches = 16 // shortDists defines the number of short distances supported by the // implementation. const shortDists = 8 // The minimum is somehow arbitrary but the maximum is limited by the // memory requirements of the hash table. const ( minTableExponent = 9 maxTableExponent = 20 ) // newRoller contains the function used to create an instance of the // hash.Roller. var newRoller = func(n int) hash.Roller { return hash.NewCyclicPoly(n) } // hashTable stores the hash table including the rolling hash method. // // We implement chained hashing into a circular buffer. Each entry in // the circular buffer stores the delta distance to the next position with a // word that has the same hash value. type hashTable struct { dict *encoderDict // actual hash table t []int64 // circular list data with the offset to the next word data []uint32 front int // mask for computing the index for the hash table mask uint64 // hash offset; initial value is -int64(wordLen) hoff int64 // length of the hashed word wordLen int // hash roller for computing the hash values for the Write // method wr hash.Roller // hash roller for computing arbitrary hashes hr hash.Roller // preallocated slices p [maxMatches]int64 distances [maxMatches + shortDists]int } // hashTableExponent derives the hash table exponent from the dictionary // capacity. func hashTableExponent(n uint32) int { e := 30 - nlz32(n) switch { case e < minTableExponent: e = minTableExponent case e > maxTableExponent: e = maxTableExponent } return e } // newHashTable creates a new hash table for words of length wordLen func newHashTable(capacity int, wordLen int) (t *hashTable, err error) { if !(0 < capacity) { return nil, errors.New( "newHashTable: capacity must not be negative") } exp := hashTableExponent(uint32(capacity)) if !(1 <= wordLen && wordLen <= 4) { return nil, errors.New("newHashTable: " + "argument wordLen out of range") } n := 1 << uint(exp) if n <= 0 { panic("newHashTable: exponent is too large") } t = &hashTable{ t: make([]int64, n), data: make([]uint32, capacity), mask: (uint64(1) << uint(exp)) - 1, hoff: -int64(wordLen), wordLen: wordLen, wr: newRoller(wordLen), hr: newRoller(wordLen), } return t, nil } func (t *hashTable) SetDict(d *encoderDict) { t.dict = d } // buffered returns the number of bytes that are currently hashed. func (t *hashTable) buffered() int { n := t.hoff + 1 switch { case n <= 0: return 0 case n >= int64(len(t.data)): return len(t.data) } return int(n) } // addIndex adds n to an index ensuring that is stays inside the // circular buffer for the hash chain. func (t *hashTable) addIndex(i, n int) int { i += n - len(t.data) if i < 0 { i += len(t.data) } return i } // putDelta puts the delta instance at the current front of the circular // chain buffer. func (t *hashTable) putDelta(delta uint32) { t.data[t.front] = delta t.front = t.addIndex(t.front, 1) } // putEntry puts a new entry into the hash table. If there is already a // value stored it is moved into the circular chain buffer. func (t *hashTable) putEntry(h uint64, pos int64) { if pos < 0 { return } i := h & t.mask old := t.t[i] - 1 t.t[i] = pos + 1 var delta int64 if old >= 0 { delta = pos - old if delta > 1<<32-1 || delta > int64(t.buffered()) { delta = 0 } } t.putDelta(uint32(delta)) } // WriteByte converts a single byte into a hash and puts them into the hash // table. func (t *hashTable) WriteByte(b byte) error { h := t.wr.RollByte(b) t.hoff++ t.putEntry(h, t.hoff) return nil } // Write converts the bytes provided into hash tables and stores the // abbreviated offsets into the hash table. The method will never return an // error. func (t *hashTable) Write(p []byte) (n int, err error) { for _, b := range p { // WriteByte doesn't generate an error. t.WriteByte(b) } return len(p), nil } // getMatches the matches for a specific hash. The functions returns the // number of positions found. // // TODO: Make a getDistances because that we are actually interested in. func (t *hashTable) getMatches(h uint64, positions []int64) (n int) { if t.hoff < 0 || len(positions) == 0 { return 0 } buffered := t.buffered() tailPos := t.hoff + 1 - int64(buffered) rear := t.front - buffered if rear >= 0 { rear -= len(t.data) } // get the slot for the hash pos := t.t[h&t.mask] - 1 delta := pos - tailPos for { if delta < 0 { return n } positions[n] = tailPos + delta n++ if n >= len(positions) { return n } i := rear + int(delta) if i < 0 { i += len(t.data) } u := t.data[i] if u == 0 { return n } delta -= int64(u) } } // hash computes the rolling hash for the word stored in p. For correct // results its length must be equal to t.wordLen. func (t *hashTable) hash(p []byte) uint64 { var h uint64 for _, b := range p { h = t.hr.RollByte(b) } return h } // Matches fills the positions slice with potential matches. The // functions returns the number of positions filled into positions. The // byte slice p must have word length of the hash table. func (t *hashTable) Matches(p []byte, positions []int64) int { if len(p) != t.wordLen { panic(fmt.Errorf( "byte slice must have length %d", t.wordLen)) } h := t.hash(p) return t.getMatches(h, positions) } // NextOp identifies the next operation using the hash table. // // TODO: Use all repetitions to find matches. func (t *hashTable) NextOp(rep [4]uint32) operation { // get positions data := t.dict.data[:maxMatchLen] n, _ := t.dict.buf.Peek(data) data = data[:n] var p []int64 if n < t.wordLen { p = t.p[:0] } else { p = t.p[:maxMatches] n = t.Matches(data[:t.wordLen], p) p = p[:n] } // convert positions in potential distances head := t.dict.head dists := append(t.distances[:0], 1, 2, 3, 4, 5, 6, 7, 8) for _, pos := range p { dis := int(head - pos) if dis > shortDists { dists = append(dists, dis) } } // check distances var m match dictLen := t.dict.DictLen() for _, dist := range dists { if dist > dictLen { continue } // Here comes a trick. We are only interested in matches // that are longer than the matches we have been found // before. So before we test the whole byte sequence at // the given distance, we test the first byte that would // make the match longer. If it doesn't match the byte // to match, we don't to care any longer. i := t.dict.buf.rear - dist + m.n if i < 0 { i += len(t.dict.buf.data) } if t.dict.buf.data[i] != data[m.n] { // We can't get a longer match. Jump to the next // distance. continue } n := t.dict.buf.matchLen(dist, data) switch n { case 0: continue case 1: if uint32(dist-minDistance) != rep[0] { continue } } if n > m.n { m = match{int64(dist), n} if n == len(data) { // No better match will be found. break } } } if m.n == 0 { return lit{data[0]} } return m }