/*
* Extraordinarily brute force Lempel & Ziv-like
* compressor. The input file must fit in memory
* during compression, and the output file will
* be reconstructed in memory during decompression.
* We search for large common sequences and use a
* greedy algorithm to choose which sequence gets
* compressed first.
*
* Files begin with "BLZ\n" and a 32-bit uncompressed file length.
*
* Output format is a series of blocks followed by
* a raw data section. Each block begins with a 32-bit big-endian
* number. The top bit is type and the next 31 bits
* are uncompressed size. Type is one of
* 0 - use raw data for this length
* 1 - a 32-bit offset follows
* After the blocks come the raw data. (The end of the blocks can be
* noted by summing block lengths until you reach the file length.)
*/
#include <u.h>
#include <libc.h>
#include <bio.h>
#define malloc sbrk
int minrun = 16;
int win = 16;
ulong outn;
int verbose;
int mindist;
enum { Prime = 16777213 }; /* smallest prime < 2^24 (so p*256+256 < 2^32) */
enum { NOFF = 3 };
Biobuf bout;
ulong length;
uchar *data;
ulong sum32(ulong, void*, long);
uchar *odat;
int nodat;
int nraw;
int rawstart;
int acct;
int zlength;
int maxchain;
int maxrle[256];
int nnew;
typedef struct Node Node;
struct Node {
Node *link;
ulong key;
ulong offset[NOFF];
};
Node *nodepool;
int nnodepool;
Node **hash;
uint nhash;
uint maxlen;
uint maxsame;
uint replacesame = 8*1024*1024;
Node *freelist, **freeend;
uint nalloc;
Node*
allocnode(void)
{
int i;
Node *n;
if(nnodepool == 0){
nnodepool = 256*1024;
nodepool = malloc(sizeof(Node)*nnodepool);
}
if(freelist){
n = freelist;
freelist = n->link;
return n;
}
assert(nnodepool > 0);
nalloc++;
n = &nodepool[--nnodepool];
for(i=0; i<NOFF; i++)
n->offset[i] = -1;
return n;
}
void
freenode(Node *n)
{
if(freelist == nil)
freelist = n;
else
*freeend = n;
freeend = &n->link;
n->link = nil;
}
Node**
llookup(ulong key)
{
uint c;
Node **l, **top, *n;
if(nhash == 0){
uint x;
x = length/8;
for(nhash=1; nhash<x; nhash<<=1)
;
hash = sbrk(sizeof(Node*)*nhash);
}
top = &hash[key&(nhash-1)];
c = 0;
for(l=top; *l; l=&(*l)->link){
c++;
if((*l)->key == key){
/* move to front */
n = *l;
*l = n->link;
n->link = *top;
*top = n;
return top;
}
}
if(c > maxlen)
maxlen = c;
return l;
}
Node*
lookup(ulong key)
{
return *llookup(key);
}
void
insertnode(ulong key, ulong offset)
{
int i;
Node *n, **l;
l = llookup(key);
if(*l == nil){
if(l==&hash[key&(nhash-1)])
nnew++;
*l = allocnode();
(*l)->key = key;
}
n = *l;
/* add or replace last */
for(i=0; i<NOFF-1 && n->offset[i]!=-1; i++)
;
n->offset[i] = offset;
}
void
Bputint(Biobufhdr *b, int n)
{
uchar p[4];
p[0] = n>>24;
p[1] = n>>16;
p[2] = n>>8;
p[3] = n;
Bwrite(b, p, 4);
}
void
flushraw(void)
{
if(nraw){
if(verbose)
fprint(2, "Raw %d+%d\n", rawstart, nraw);
zlength += 4+nraw;
Bputint(&bout, (1<<31)|nraw);
memmove(odat+nodat, data+rawstart, nraw);
nodat += nraw;
nraw = 0;
}
}
int
rawbyte(int i)
{
assert(acct == i);
if(nraw == 0)
rawstart = i;
acct++;
nraw++;
return 1;
}
int
refblock(int i, int len, int off)
{
assert(acct == i);
acct += len;
if(nraw)
flushraw();
if(verbose)
fprint(2, "Copy %d+%d from %d\n", i, len, off);
Bputint(&bout, len);
Bputint(&bout, off);
zlength += 4+4;
return len;
}
int
cmprun(uchar *a, uchar *b, int len)
{
int i;
if(a==b)
return 0;
for(i=0; i<len && a[i]==b[i]; i++)
;
return i;
}
int
countrle(uchar *a)
{
uchar a0;
uchar *p;
p = a;
a0 = *p;
while(*++p == a0)
;
return p - a;
}
void
compress(void)
{
int best, i, j, o, rle, run, maxrun, maxoff;
ulong sum;
Node *n;
sum = 0;
for(i=0; i<win && i<length; i++)
sum = (sum*256+data[i])%Prime;
for(i=0; i<length-win; ){
maxrun = 0;
maxoff = 0;
if(verbose)
fprint(2, "look %.6lux\n", sum);
n = lookup(sum);
if(n){
best = -1;
for(o=0; o<NOFF; o++){
if(n->offset[o] == -1)
break;
run = cmprun(data+i, data+n->offset[o], length-i);
if(run > maxrun && n->offset[o]+mindist < i){
maxrun = run;
maxoff = n->offset[o];
best = o;
}
}
if(best > 0){
o = n->offset[best];
for(j=best; j>0; j--)
n->offset[j] = n->offset[j-1];
n->offset[0] = o;
}
}
if(maxrun >= minrun)
j = i+refblock(i, maxrun, maxoff);
else
j = i+rawbyte(i);
for(; i<j; i++){
/* avoid huge chains from large runs of same byte */
rle = countrle(data+i);
if(rle<4)
insertnode(sum, i);
else if(rle>maxrle[data[i]]){
maxrle[data[i]] = rle;
insertnode(sum, i);
}
sum = (sum*256+data[i+win]) % Prime;
sum = (sum + data[i]*outn) % Prime;
}
}
/* could do better here */
for(; i<length; i++)
rawbyte(i);
flushraw();
}
void
usage(void)
{
fprint(2, "usage: bflz [-n winsize] [file]\n");
exits("usage");
}
void
main(int argc, char **argv)
{
int fd, i, n;
char buf[10485760];
ARGBEGIN{
case 'd':
verbose = 1;
break;
case 's':
replacesame = atoi(EARGF(usage()));
break;
case 'm':
mindist = atoi(EARGF(usage()));
break;
case 'n':
win = atoi(EARGF(usage()));
minrun = win;
break;
default:
usage();
}ARGEND
switch(argc){
default:
usage();
case 0:
fd = 0;
break;
case 1:
if((fd = open(argv[0], OREAD)) < 0)
sysfatal("open %s: %r", argv[0]);
break;
}
while((n = readn(fd, buf, sizeof buf)) > 0){
data = realloc(data, length+n);
if(data == nil)
sysfatal("realloc: %r");
memmove(data+length, buf, n);
length += n;
if(n < sizeof buf)
break;
}
odat = malloc(length);
if(odat == nil)
sysfatal("malloc: %r");
Binit(&bout, 1, OWRITE);
Bprint(&bout, "BLZ\n");
Bputint(&bout, length);
outn = 1;
for(i=0; i<win; i++)
outn = (outn * 256) % Prime;
if(verbose)
fprint(2, "256^%d = %.6lux\n", win, outn);
outn = Prime - outn;
if(verbose)
fprint(2, "outn = %.6lux\n", outn);
compress();
Bwrite(&bout, odat, nodat);
Bterm(&bout);
fprint(2, "brk %p\n", sbrk(1));
fprint(2, "%d nodes used; %d of %d hash slots used\n", nalloc, nnew, nhash);
exits(nil);
}
|