Skip to content

Commit 94867dd

Browse files
authored
huff0: Add size estimation function. (#405)
1 parent e9c9800 commit 94867dd

File tree

3 files changed

+134
-3
lines changed

3 files changed

+134
-3
lines changed

huff0/compress.go

Lines changed: 64 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -161,6 +161,70 @@ func compress(in []byte, s *Scratch, compressor func(src []byte) ([]byte, error)
161161
return s.Out, false, nil
162162
}
163163

164+
// EstimateSizes will estimate the data sizes
165+
func EstimateSizes(in []byte, s *Scratch) (tableSz, dataSz, reuseSz int, err error) {
166+
s, err = s.prepare(in)
167+
if err != nil {
168+
return 0, 0, 0, err
169+
}
170+
171+
// Create histogram, if none was provided.
172+
tableSz, dataSz, reuseSz = -1, -1, -1
173+
maxCount := s.maxCount
174+
var canReuse = false
175+
if maxCount == 0 {
176+
maxCount, canReuse = s.countSimple(in)
177+
} else {
178+
canReuse = s.canUseTable(s.prevTable)
179+
}
180+
181+
// We want the output size to be less than this:
182+
wantSize := len(in)
183+
if s.WantLogLess > 0 {
184+
wantSize -= wantSize >> s.WantLogLess
185+
}
186+
187+
// Reset for next run.
188+
s.clearCount = true
189+
s.maxCount = 0
190+
if maxCount >= len(in) {
191+
if maxCount > len(in) {
192+
return 0, 0, 0, fmt.Errorf("maxCount (%d) > length (%d)", maxCount, len(in))
193+
}
194+
if len(in) == 1 {
195+
return 0, 0, 0, ErrIncompressible
196+
}
197+
// One symbol, use RLE
198+
return 0, 0, 0, ErrUseRLE
199+
}
200+
if maxCount == 1 || maxCount < (len(in)>>7) {
201+
// Each symbol present maximum once or too well distributed.
202+
return 0, 0, 0, ErrIncompressible
203+
}
204+
205+
// Calculate new table.
206+
err = s.buildCTable()
207+
if err != nil {
208+
return 0, 0, 0, err
209+
}
210+
211+
if false && !s.canUseTable(s.cTable) {
212+
panic("invalid table generated")
213+
}
214+
215+
tableSz, err = s.cTable.estTableSize(s)
216+
if err != nil {
217+
return 0, 0, 0, err
218+
}
219+
if canReuse {
220+
reuseSz = s.prevTable.estimateSize(s.count[:s.symbolLen])
221+
}
222+
dataSz = s.cTable.estimateSize(s.count[:s.symbolLen])
223+
224+
// Restore
225+
return tableSz, dataSz, reuseSz, nil
226+
}
227+
164228
// compress1X forwards src to compress1xDo, passing s.Out as the first
// argument, and returns that call's results unchanged.
func (s *Scratch) compress1X(src []byte) ([]byte, error) {
	out, err := s.compress1xDo(s.Out, src)
	return out, err
}

huff0/compress_test.go

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -231,6 +231,7 @@ func TestCompress1X(t *testing.T) {
231231
if len(buf0) > BlockSizeMax {
232232
buf0 = buf0[:BlockSizeMax]
233233
}
234+
tbSz, dSz, reSz, _ := EstimateSizes(buf0, &s)
234235
b, re, err := Compress1X(buf0, &s)
235236
if err != test.err1X {
236237
t.Errorf("want error %v (%T), got %v (%T)", test.err1X, test.err1X, err, err)
@@ -256,6 +257,7 @@ func TestCompress1X(t *testing.T) {
256257
if len(s.OutData) == 0 {
257258
t.Error("got no data output")
258259
}
260+
t.Logf("Estimate: table %d, got %d, data %d, got %d, reuse: %d", tbSz, len(s.OutTable), dSz, len(s.OutData), reSz)
259261
t.Logf("%s: %d -> %d bytes (%.2f:1) re:%t (table: %d bytes)", test.name, len(buf0), len(b), float64(len(buf0))/float64(len(b)), re, len(s.OutTable))
260262
s.Out = nil
261263
bRe, _, err := Compress1X(b, &s)
@@ -406,7 +408,7 @@ func TestCompress4XReuse(t *testing.T) {
406408
for j := range buf0 {
407409
buf0[j] = byte(int64(i) + (rng.Int63() & 3))
408410
}
409-
411+
tbSz, dSz, reSz, _ := EstimateSizes(buf0, &s)
410412
b, re, err := Compress4X(buf0, &s)
411413
if err != nil {
412414
t.Fatal(err)
@@ -421,7 +423,7 @@ func TestCompress4XReuse(t *testing.T) {
421423
if re {
422424
t.Error("claimed to have re-used. Unlikely.")
423425
}
424-
426+
t.Logf("Estimate: table %d, got %d, data %d, got %d, reuse: %d", tbSz, len(s.OutTable), dSz, len(s.OutData), reSz)
425427
t.Logf("%s: %d -> %d bytes (%.2f:1) %t (table: %d bytes)", t.Name(), len(buf0), len(b), float64(len(buf0))/float64(len(b)), re, len(s.OutTable))
426428
})
427429
}
@@ -441,6 +443,7 @@ func TestCompress4XReuseActually(t *testing.T) {
441443
buf0[j] = byte(rng.Int63() & 7)
442444
}
443445

446+
tbSz, dSz, reSz, _ := EstimateSizes(buf0, &s)
444447
b, re, err := Compress4X(buf0, &s)
445448
if err != nil {
446449
t.Fatal(err)
@@ -458,7 +461,7 @@ func TestCompress4XReuseActually(t *testing.T) {
458461
if !re && i > 0 {
459462
t.Error("Expected table to be reused")
460463
}
461-
464+
t.Logf("Estimate: table %d, got %d, data %d, got %d, reuse: %d", tbSz, len(s.OutTable), dSz, len(s.OutData), reSz)
462465
t.Logf("%s: %d -> %d bytes (%.2f:1) %t (table: %d bytes)", t.Name(), len(buf0), len(b), float64(len(buf0))/float64(len(b)), re, len(s.OutTable))
463466
})
464467
}
@@ -488,6 +491,7 @@ func TestCompress1XReuse(t *testing.T) {
488491
}
489492
firstData := len(s.OutData)
490493
s.Reuse = ReusePolicyAllow
494+
tbSz, dSz, reSz, _ := EstimateSizes(buf0, &s)
491495
b, re, err := Compress1X(buf0, &s)
492496
if err != nil {
493497
t.Errorf("got secondary error %v (%T)", err, err)
@@ -505,6 +509,7 @@ func TestCompress1XReuse(t *testing.T) {
505509
if len(b) != firstData {
506510
t.Errorf("data length did not match first: %d, second:%d", firstData, len(b))
507511
}
512+
t.Logf("Estimate: table %d, got %d, data %d, got %d, reuse: %d", tbSz, len(s.OutTable), dSz, len(s.OutData), reSz)
508513
t.Logf("%s: %d -> %d bytes (%.2f:1) %t", test.name, len(buf0), len(b), float64(len(buf0))/float64(len(b)), re)
509514
})
510515
}

huff0/huff0.go

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -245,6 +245,68 @@ func (c cTable) write(s *Scratch) error {
245245
return nil
246246
}
247247

248+
func (c cTable) estTableSize(s *Scratch) (sz int, err error) {
249+
var (
250+
// precomputed conversion table
251+
bitsToWeight [tableLogMax + 1]byte
252+
huffLog = s.actualTableLog
253+
// last weight is not saved.
254+
maxSymbolValue = uint8(s.symbolLen - 1)
255+
huffWeight = s.huffWeight[:256]
256+
)
257+
const (
258+
maxFSETableLog = 6
259+
)
260+
// convert to weight
261+
bitsToWeight[0] = 0
262+
for n := uint8(1); n < huffLog+1; n++ {
263+
bitsToWeight[n] = huffLog + 1 - n
264+
}
265+
266+
// Acquire histogram for FSE.
267+
hist := s.fse.Histogram()
268+
hist = hist[:256]
269+
for i := range hist[:16] {
270+
hist[i] = 0
271+
}
272+
for n := uint8(0); n < maxSymbolValue; n++ {
273+
v := bitsToWeight[c[n].nBits] & 15
274+
huffWeight[n] = v
275+
hist[v]++
276+
}
277+
278+
// FSE compress if feasible.
279+
if maxSymbolValue >= 2 {
280+
huffMaxCnt := uint32(0)
281+
huffMax := uint8(0)
282+
for i, v := range hist[:16] {
283+
if v == 0 {
284+
continue
285+
}
286+
huffMax = byte(i)
287+
if v > huffMaxCnt {
288+
huffMaxCnt = v
289+
}
290+
}
291+
s.fse.HistogramFinished(huffMax, int(huffMaxCnt))
292+
s.fse.TableLog = maxFSETableLog
293+
b, err := fse.Compress(huffWeight[:maxSymbolValue], s.fse)
294+
if err == nil && len(b) < int(s.symbolLen>>1) {
295+
sz += 1 + len(b)
296+
return sz, nil
297+
}
298+
// Unable to compress (RLE/uncompressible)
299+
}
300+
// write raw values as 4-bits (max : 15)
301+
if maxSymbolValue > (256 - 128) {
302+
// should not happen : likely means source cannot be compressed
303+
return 0, ErrIncompressible
304+
}
305+
// special case, pack weights 4 bits/weight.
306+
sz += 1 + int(maxSymbolValue/2)
307+
return sz, nil
308+
}
309+
248310
// estimateSize returns the estimated size in bytes of the input represented in the
249311
// histogram supplied.
250312
func (c cTable) estimateSize(hist []uint32) int {

0 commit comments

Comments
 (0)