|
|
@ -71,7 +71,7 @@ Results on the [Silesia compression corpus](http://sun.aei.polsl.pl/~sdeor/index |
|
|
|
This software is available under 2 licenses -- choose whichever you prefer. |
|
|
|
------------------------------------------------------------------------------ |
|
|
|
ALTERNATIVE A - MIT License |
|
|
|
Copyright (c) 2020-2023 Micha Mettke |
|
|
|
Copyright (c) 2020 Micha Mettke |
|
|
|
Permission is hereby granted, free of charge, to any person obtaining a copy of |
|
|
|
this software and associated documentation files (the "Software"), to deal in |
|
|
|
the Software without restriction, including without limitation the rights to |
|
|
@ -125,7 +125,7 @@ extern "C" { |
|
|
|
|
|
|
|
#define SDEFL_MIN_MATCH 4 |
|
|
|
#define SDEFL_BLK_MAX (256*1024) |
|
|
|
#define SDEFL_SEQ_SIZ ((SDEFL_BLK_MAX+2)/3) |
|
|
|
#define SDEFL_SEQ_SIZ ((SDEFL_BLK_MAX + SDEFL_MIN_MATCH)/SDEFL_MIN_MATCH) |
|
|
|
|
|
|
|
#define SDEFL_SYM_MAX (288) |
|
|
|
#define SDEFL_OFF_MAX (32) |
|
|
@ -185,7 +185,6 @@ extern int zsdeflate(struct sdefl *s, void *o, const void *i, int n, int lvl); |
|
|
|
#define SDEFL_MAX_CODE_LEN (15) |
|
|
|
#define SDEFL_SYM_BITS (10u) |
|
|
|
#define SDEFL_SYM_MSK ((1u << SDEFL_SYM_BITS)-1u) |
|
|
|
#define SDEFL_RAW_BLK_SIZE (65535) |
|
|
|
#define SDEFL_LIT_LEN_CODES (14) |
|
|
|
#define SDEFL_OFF_CODES (15) |
|
|
|
#define SDEFL_PRE_CODES (7) |
|
|
@ -193,7 +192,6 @@ extern int zsdeflate(struct sdefl *s, void *o, const void *i, int n, int lvl); |
|
|
|
#define SDEFL_EOB (256) |
|
|
|
|
|
|
|
#define sdefl_npow2(n) (1 << (sdefl_ilog2((n)-1) + 1)) |
|
|
|
#define sdefl_div_round_up(n,d) (((n)+((d)-1))/(d)) |
|
|
|
|
|
|
|
static int |
|
|
|
sdefl_ilog2(int n) { |
|
|
@ -440,12 +438,12 @@ sdefl_precode(struct sdefl_symcnt *cnt, unsigned *freqs, unsigned *items, |
|
|
|
} while (run_start != total); |
|
|
|
cnt->items = (int)(at - items); |
|
|
|
} |
|
|
|
struct sdefl_match_codest { |
|
|
|
struct sdefl_match_codes { |
|
|
|
int ls, lc; |
|
|
|
int dc, dx; |
|
|
|
}; |
|
|
|
static void |
|
|
|
sdefl_match_codes(struct sdefl_match_codest *cod, int dist, int len) { |
|
|
|
sdefl_match_codes(struct sdefl_match_codes *cod, int dist, int len) { |
|
|
|
static const short dxmax[] = {0,6,12,24,48,96,192,384,768,1536,3072,6144,12288,24576}; |
|
|
|
static const unsigned char lslot[258+1] = { |
|
|
|
0, 0, 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 8, 9, 9, 10, 10, 11, 11, 12, 12, 12, |
|
|
@ -473,44 +471,6 @@ sdefl_match_codes(struct sdefl_match_codest *cod, int dist, int len) { |
|
|
|
cod->dx = sdefl_ilog2(sdefl_npow2(dist) >> 2); |
|
|
|
cod->dc = cod->dx ? ((cod->dx + 1) << 1) + (dist > dxmax[cod->dx]) : dist-1; |
|
|
|
} |
|
|
|
/* Kind of deflate block selected when a block is flushed. */
enum sdefl_blk_type {
  SDEFL_BLK_UCOMPR, /* emit the data as stored (uncompressed) blocks */
  SDEFL_BLK_DYN     /* emit the data as one dynamic huffman block */
};
|
|
|
static enum sdefl_blk_type |
|
|
|
sdefl_blk_type(const struct sdefl *s, int blk_len, int pre_item_len, |
|
|
|
const unsigned *pre_freq, const unsigned char *pre_len) { |
|
|
|
static const unsigned char x_pre_bits[] = { 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,2,3,7}; |
|
|
|
static const unsigned char x_len_bits[] = {0,0,0,0,0,0,0,0, 1,1,1,1,2,2,2,2, |
|
|
|
3,3,3,3,4,4,4,4, 5,5,5,5,0}; |
|
|
|
static const unsigned char x_off_bits[] = {0,0,0,0,1,1,2,2, 3,3,4,4,5,5,6,6, |
|
|
|
7,7,8,8,9,9,10,10, 11,11,12,12,13,13}; |
|
|
|
|
|
|
|
int dyn_cost = 0; |
|
|
|
int fix_cost = 0; |
|
|
|
int sym = 0; |
|
|
|
|
|
|
|
dyn_cost += 5 + 5 + 4 + (3 * pre_item_len); |
|
|
|
for (sym = 0; sym < SDEFL_PRE_MAX; sym++) |
|
|
|
dyn_cost += pre_freq[sym] * (x_pre_bits[sym] + pre_len[sym]); |
|
|
|
for (sym = 0; sym < 256; sym++) |
|
|
|
dyn_cost += s->freq.lit[sym] * s->cod.len.lit[sym]; |
|
|
|
dyn_cost += s->cod.len.lit[SDEFL_EOB]; |
|
|
|
for (sym = 257; sym < 286; sym++) |
|
|
|
dyn_cost += s->freq.lit[sym] * (x_len_bits[sym - 257] + s->cod.len.lit[sym]); |
|
|
|
for (sym = 0; sym < 30; sym++) |
|
|
|
dyn_cost += s->freq.off[sym] * (x_off_bits[sym] + s->cod.len.off[sym]); |
|
|
|
|
|
|
|
fix_cost += 8*(5 * sdefl_div_round_up(blk_len, SDEFL_RAW_BLK_SIZE) + blk_len + 1 + 2); |
|
|
|
return (dyn_cost < fix_cost) ? SDEFL_BLK_DYN : SDEFL_BLK_UCOMPR; |
|
|
|
} |
|
|
|
/*
 * Write the low 16 bits of `x` to the output cursor, least significant byte
 * first, and advance the cursor by two bytes.
 *
 *  dst - in/out pointer to the current write position; must have room for
 *        two bytes
 *  x   - value to store
 */
static void
sdefl_put16(unsigned char **dst, unsigned short x) {
  unsigned char *out = *dst;
  out[0] = (unsigned char)(x & 0xff);
  out[1] = (unsigned char)((x >> 8) & 0xff);
  *dst = out + 2;
}
|
|
|
static void |
|
|
|
sdefl_match(unsigned char **dst, struct sdefl *s, int dist, int len) { |
|
|
|
static const char lxn[] = {0,0,0,0,0,0,0,0,1,1,1,1,2,2,2,2,3,3,3,3,4,4,4,4,5,5,5,5,0}; |
|
|
@ -519,7 +479,7 @@ sdefl_match(unsigned char **dst, struct sdefl *s, int dist, int len) { |
|
|
|
static const short dmin[] = {1,2,3,4,5,7,9,13,17,25,33,49,65,97,129,193,257, |
|
|
|
385,513,769,1025,1537,2049,3073,4097,6145,8193,12289,16385,24577}; |
|
|
|
|
|
|
|
struct sdefl_match_codest cod; |
|
|
|
struct sdefl_match_codes cod; |
|
|
|
sdefl_match_codes(&cod, dist, len); |
|
|
|
sdefl_put(dst, s, (int)s->cod.word.lit[cod.lc], s->cod.len.lit[cod.lc]); |
|
|
|
sdefl_put(dst, s, len - lmin[cod.ls], lxn[cod.ls]); |
|
|
@ -528,8 +488,7 @@ sdefl_match(unsigned char **dst, struct sdefl *s, int dist, int len) { |
|
|
|
} |
|
|
|
static void |
|
|
|
sdefl_flush(unsigned char **dst, struct sdefl *s, int is_last, |
|
|
|
const unsigned char *in, int blk_begin, int blk_end) { |
|
|
|
int blk_len = blk_end - blk_begin; |
|
|
|
const unsigned char *in) { |
|
|
|
int j, i = 0, item_cnt = 0; |
|
|
|
struct sdefl_symcnt symcnt = {0}; |
|
|
|
unsigned codes[SDEFL_PRE_MAX]; |
|
|
@ -539,7 +498,7 @@ sdefl_flush(unsigned char **dst, struct sdefl *s, int is_last, |
|
|
|
static const unsigned char perm[SDEFL_PRE_MAX] = {16,17,18,0,8,7,9,6,10,5,11, |
|
|
|
4,12,3,13,2,14,1,15}; |
|
|
|
|
|
|
|
/* calculate huffman codes */ |
|
|
|
/* huffman codes */ |
|
|
|
s->freq.lit[SDEFL_EOB]++; |
|
|
|
sdefl_huff(s->cod.len.lit, s->cod.word.lit, s->freq.lit, SDEFL_SYM_MAX, SDEFL_LIT_LEN_CODES); |
|
|
|
sdefl_huff(s->cod.len.off, s->cod.word.off, s->freq.off, SDEFL_OFF_MAX, SDEFL_OFF_CODES); |
|
|
@ -550,58 +509,35 @@ sdefl_flush(unsigned char **dst, struct sdefl *s, int is_last, |
|
|
|
break; |
|
|
|
} |
|
|
|
} |
|
|
|
/* write block */ |
|
|
|
switch (sdefl_blk_type(s, blk_len, item_cnt, freqs, lens)) { |
|
|
|
case SDEFL_BLK_UCOMPR: { |
|
|
|
/* uncompressed blocks */ |
|
|
|
int n = sdefl_div_round_up(blk_len, SDEFL_RAW_BLK_SIZE); |
|
|
|
for (i = 0; i < n; ++i) { |
|
|
|
int fin = is_last && (i + 1 == n); |
|
|
|
int amount = blk_len < SDEFL_RAW_BLK_SIZE ? blk_len : SDEFL_RAW_BLK_SIZE; |
|
|
|
sdefl_put(dst, s, !!fin, 1); /* block */ |
|
|
|
sdefl_put(dst, s, 0x00, 2); /* stored block */ |
|
|
|
if (s->bitcnt) { |
|
|
|
sdefl_put(dst, s, 0x00, 8 - s->bitcnt); |
|
|
|
} |
|
|
|
assert(s->bitcnt == 0); |
|
|
|
sdefl_put16(dst, (unsigned short)amount); |
|
|
|
sdefl_put16(dst, ~(unsigned short)amount); |
|
|
|
memcpy(*dst, in + blk_begin + i * SDEFL_RAW_BLK_SIZE, amount); |
|
|
|
*dst = *dst + amount; |
|
|
|
blk_len -= amount; |
|
|
|
} |
|
|
|
} break; |
|
|
|
case SDEFL_BLK_DYN: { |
|
|
|
/* dynamic huffman block */ |
|
|
|
sdefl_put(dst, s, !!is_last, 1); /* block */ |
|
|
|
sdefl_put(dst, s, 0x02, 2); /* dynamic huffman */ |
|
|
|
sdefl_put(dst, s, symcnt.lit - 257, 5); |
|
|
|
sdefl_put(dst, s, symcnt.off - 1, 5); |
|
|
|
sdefl_put(dst, s, item_cnt - 4, 4); |
|
|
|
for (i = 0; i < item_cnt; ++i) { |
|
|
|
sdefl_put(dst, s, lens[perm[i]], 3); |
|
|
|
} |
|
|
|
for (i = 0; i < symcnt.items; ++i) { |
|
|
|
unsigned sym = items[i] & 0x1F; |
|
|
|
sdefl_put(dst, s, (int)codes[sym], lens[sym]); |
|
|
|
if (sym < 16) continue; |
|
|
|
if (sym == 16) sdefl_put(dst, s, items[i] >> 5, 2); |
|
|
|
else if(sym == 17) sdefl_put(dst, s, items[i] >> 5, 3); |
|
|
|
else sdefl_put(dst, s, items[i] >> 5, 7); |
|
|
|
} |
|
|
|
/* block sequences */ |
|
|
|
for (i = 0; i < s->seq_cnt; ++i) { |
|
|
|
if (s->seq[i].off >= 0) { |
|
|
|
for (j = 0; j < s->seq[i].len; ++j) { |
|
|
|
int c = in[s->seq[i].off + j]; |
|
|
|
sdefl_put(dst, s, (int)s->cod.word.lit[c], s->cod.len.lit[c]); |
|
|
|
} |
|
|
|
} else { |
|
|
|
sdefl_match(dst, s, -s->seq[i].off, s->seq[i].len); |
|
|
|
/* block header */ |
|
|
|
sdefl_put(dst, s, is_last ? 0x01 : 0x00, 1); /* block */ |
|
|
|
sdefl_put(dst, s, 0x02, 2); /* dynamic huffman */ |
|
|
|
sdefl_put(dst, s, symcnt.lit - 257, 5); |
|
|
|
sdefl_put(dst, s, symcnt.off - 1, 5); |
|
|
|
sdefl_put(dst, s, item_cnt - 4, 4); |
|
|
|
for (i = 0; i < item_cnt; ++i) { |
|
|
|
sdefl_put(dst, s, lens[perm[i]], 3); |
|
|
|
} |
|
|
|
for (i = 0; i < symcnt.items; ++i) { |
|
|
|
unsigned sym = items[i] & 0x1F; |
|
|
|
sdefl_put(dst, s, (int)codes[sym], lens[sym]); |
|
|
|
if (sym < 16) continue; |
|
|
|
if (sym == 16) sdefl_put(dst, s, items[i] >> 5, 2); |
|
|
|
else if(sym == 17) sdefl_put(dst, s, items[i] >> 5, 3); |
|
|
|
else sdefl_put(dst, s, items[i] >> 5, 7); |
|
|
|
} |
|
|
|
/* block sequences */ |
|
|
|
for (i = 0; i < s->seq_cnt; ++i) { |
|
|
|
if (s->seq[i].off >= 0) { |
|
|
|
for (j = 0; j < s->seq[i].len; ++j) { |
|
|
|
int c = in[s->seq[i].off + j]; |
|
|
|
sdefl_put(dst, s, (int)s->cod.word.lit[c], s->cod.len.lit[c]); |
|
|
|
} |
|
|
|
} else { |
|
|
|
sdefl_match(dst, s, -s->seq[i].off, s->seq[i].len); |
|
|
|
} |
|
|
|
sdefl_put(dst, s, (int)(s)->cod.word.lit[SDEFL_EOB], (s)->cod.len.lit[SDEFL_EOB]); |
|
|
|
p">} break;} |
|
|
|
} |
|
|
|
sdefl_put(dst, s, (int)(s)->cod.word.lit[SDEFL_EOB], (s)->cod.len.lit[SDEFL_EOB]); |
|
|
|
memset(&s->freq, 0, sizeof(s->freq)); |
|
|
|
s->seq_cnt = 0; |
|
|
|
} |
|
|
@ -614,12 +550,8 @@ sdefl_seq(struct sdefl *s, int off, int len) { |
|
|
|
} |
|
|
|
static void |
|
|
|
sdefl_reg_match(struct sdefl *s, int off, int len) { |
|
|
|
struct sdefl_match_codest cod; |
|
|
|
struct sdefl_match_codes cod; |
|
|
|
sdefl_match_codes(&cod, off, len); |
|
|
|
|
|
|
|
assert(cod.lc < SDEFL_SYM_MAX); |
|
|
|
assert(cod.dc < SDEFL_OFF_MAX); |
|
|
|
|
|
|
|
s->freq.lit[cod.lc]++; |
|
|
|
s->freq.off[cod.dc]++; |
|
|
|
} |
|
|
@ -628,35 +560,22 @@ struct sdefl_match { |
|
|
|
int len; |
|
|
|
}; |
|
|
|
static void |
|
|
|
sdefl_fnd(struct sdefl_match *m, const struct sdefl *s, int chain_len, |
|
|
|
int max_match, const unsigned char *in, int p, int e) { |
|
|
|
int i = s->tbl[sdefl_hash32(in + p)]; |
|
|
|
int limit = ((p - SDEFL_WIN_SIZ) < SDEFL_NIL) ? SDEFL_NIL : (p-SDEFL_WIN_SIZ); |
|
|
|
|
|
|
|
assert(p < e); |
|
|
|
assert(p + max_match <= e); |
|
|
|
sdefl_fnd(struct sdefl_match *m, const struct sdefl *s, |
|
|
|
int chain_len, int max_match, const unsigned char *in, int p) { |
|
|
|
int i = s->tbl[sdefl_hash32(&in[p])]; |
|
|
|
int limit = ((p-SDEFL_WIN_SIZ)<SDEFL_NIL)?SDEFL_NIL:(p-SDEFL_WIN_SIZ); |
|
|
|
while (i > limit) { |
|
|
|
assert(i + m->len < e); |
|
|
|
assert(p + m->len < e); |
|
|
|
assert(i + SDEFL_MIN_MATCH < e); |
|
|
|
assert(p + SDEFL_MIN_MATCH < e); |
|
|
|
|
|
|
|
if (in[i + m->len] == in[p + m->len] && |
|
|
|
(sdefl_uload32(&in[i]) == sdefl_uload32(&in[p]))) { |
|
|
|
if (in[i+m->len] == in[p+m->len] && |
|
|
|
(sdefl_uload32(&in[i]) == sdefl_uload32(&in[p]))){ |
|
|
|
int n = SDEFL_MIN_MATCH; |
|
|
|
while (n < max_match && in[i + n] == in[p + n]) { |
|
|
|
assert(i + n < e); |
|
|
|
assert(p + n < e); |
|
|
|
n++; |
|
|
|
} |
|
|
|
while (n < max_match && in[i+n] == in[p+n]) n++; |
|
|
|
if (n > m->len) { |
|
|
|
m->len = n, m->off = p - i; |
|
|
|
if (n == max_match) |
|
|
|
break; |
|
|
|
if (n == max_match) break; |
|
|
|
} |
|
|
|
} |
|
|
|
if (!(--chain_len)) break; |
|
|
|
i = s->prv[i & SDEFL_WIN_MSK]; |
|
|
|
i = s->prv[i&SDEFL_WIN_MSK]; |
|
|
|
} |
|
|
|
} |
|
|
|
static int |
|
|
@ -669,20 +588,19 @@ sdefl_compr(struct sdefl *s, unsigned char *out, const unsigned char *in, |
|
|
|
for (n = 0; n < SDEFL_HASH_SIZ; ++n) { |
|
|
|
s->tbl[n] = SDEFL_NIL; |
|
|
|
} |
|
|
|
do {int blk_begin = i; |
|
|
|
int blk_end = ((i + SDEFL_BLK_MAX) < in_len) ? (i + SDEFL_BLK_MAX) : in_len; |
|
|
|
do {int blk_end = ((i + SDEFL_BLK_MAX) < in_len) ? (i + SDEFL_BLK_MAX) : in_len; |
|
|
|
while (i < blk_end) { |
|
|
|
struct sdefl_match m = {0}; |
|
|
|
int left = blk_end - i; |
|
|
|
int max_match = (left > SDEFL_MAX_MATCH) ? SDEFL_MAX_MATCH : left; |
|
|
|
int max_match = (left >= SDEFL_MAX_MATCH) ? SDEFL_MAX_MATCH : left; |
|
|
|
int nice_match = pref[lvl] < max_match ? pref[lvl] : max_match; |
|
|
|
int run = 1, inc = 1, run_inc = 0; |
|
|
|
if (max_match > SDEFL_MIN_MATCH) { |
|
|
|
sdefl_fnd(&m, s, max_chain, max_match, in, i, in_len); |
|
|
|
sdefl_fnd(&m, s, max_chain, max_match, in, i); |
|
|
|
} |
|
|
|
if (lvl >= 5 && m.len >= SDEFL_MIN_MATCH && m.len + 1 < nice_match){ |
|
|
|
if (lvl >= 5 && m.len >= SDEFL_MIN_MATCH && m.len < nice_match){ |
|
|
|
struct sdefl_match m2 = {0}; |
|
|
|
sdefl_fnd(&m2, s, max_chain, m.len + 1, in, i + 1, in_len); |
|
|
|
sdefl_fnd(&m2, s, max_chain, m.len+1, in, i+1); |
|
|
|
m.len = (m2.len > m.len) ? 0 : m.len; |
|
|
|
} |
|
|
|
if (m.len >= SDEFL_MIN_MATCH) { |
|
|
@ -718,12 +636,12 @@ sdefl_compr(struct sdefl *s, unsigned char *out, const unsigned char *in, |
|
|
|
sdefl_seq(s, i - litlen, litlen); |
|
|
|
litlen = 0; |
|
|
|
} |
|
|
|
sdefl_flush(&q, s, blk_end == in_len, in, blk_begin, blk_end); |
|
|
|
sdefl_flush(&q, s, blk_end == in_len, in); |
|
|
|
} while (i < in_len); |
|
|
|
if (s->bitcnt) { |
|
|
|
|
|
|
|
if (s->bitcnt > 0) |
|
|
|
sdefl_put(&q, s, 0x00, 8 - s->bitcnt); |
|
|
|
} |
|
|
|
assert(s->bitcnt == 0); |
|
|
|
|
|
|
|
return (int)(q - out); |
|
|
|
} |
|
|
|
extern int |
|
|
@ -783,8 +701,9 @@ zsdeflate(struct sdefl *s, void *out, const void *in, int n, int lvl) { |
|
|
|
} |
|
|
|
extern int |
|
|
|
sdefl_bound(int len) { |
|
|
|
int max_blocks = 1 + sdefl_div_round_up(len, SDEFL_RAW_BLK_SIZE); |
|
|
|
int bound = 5 * max_blocks + len + 1 + 4 + 8; |
|
|
|
return n">bound; |
|
|
|
int a = 128 + (len * 110) / 100; |
|
|
|
int b = 128 + len + ((len / (31 * 1024)) + 1) * 5; |
|
|
|
return p">(a > b) ? a : b; |
|
|
|
} |
|
|
|
#endif /* SDEFL_IMPLEMENTATION */ |
|
|
|
|