#include <u.h>
#include <libc.h>
#include <bio.h>
#include <ctype.h>
#include "code.h"
/* read an annotated spelling list in the form
word <tab> affixcode [ , affixcode ] ...
and write a re-encoded binary version on standard output;
the output layout is described in the comment above pdict
*/
typedef struct Dict Dict;
/* One dictionary entry, filled in by readinput and sorted by word in main. */
struct Dict
{
char* word;	/* NUL-terminated word text; points into space[] */
int encode;	/* value returned by typecode() for this word's affix codes */
};
Dict words[200000];	/* entries collected by readinput; the first nwords are in use */
char space[500000];	/* word strings, stored back to back, each NUL-terminated */
long encodes[4094];	/* distinct affix bit masks seen so far; typecode() returns an index into this */
long nspace;	/* number of bytes of space[] in use */
long nwords;	/* number of entries of words[] in use */
int ncodes;	/* number of entries of encodes[] in use */
Biobuf bout;	/* output buffer; main attaches it to file descriptor 1 */
void readinput(int f);
long typecode(char *str);
int wcmp(void*, void*);
void pdict(void);
void sput(int);
/*
 * Read every word list named on the command line (or file descriptor 0
 * when no file is named), report the table sizes on file descriptor 2,
 * sort the collected words and write the encoded dictionary through bout.
 * Exits with status "open" if a named file cannot be opened.
 */
void
main(int argc, char *argv[])
{
	int f;

	Binit(&bout, 1, OWRITE);
	nwords = 0;
	nspace = 0;
	ncodes = 0;
	if(argc <= 1)
		readinput(0);
	while(argc > 1) {
		f = open(argv[1], 0);
		if(f < 0) {
			fprint(2, "Cannot open %s\n", argv[1]);
			exits("open");
		}
		readinput(f);
		/*
		 * readinput attaches its Biobuf with Binit, so its Bterm leaves
		 * the descriptor open; close it here so that a long argument
		 * list does not accumulate open files.
		 */
		close(f);
		argc--;
		argv++;
	}
	fprint(2, "words = %ld; space = %ld; codes = %d\n",
		nwords, nspace, ncodes);
	qsort(words, nwords, sizeof(words[0]), wcmp);
	pdict();
	exits(0);
}
/*
 * Comparison function handed to qsort in main: orders two Dict entries
 * by their word strings, returning strcmp's result.
 * The return type is spelled out so the definition matches the
 * prototype instead of relying on implicit int.
 */
int
wcmp(void *a, void *b)
{
	return strcmp(((Dict*)a)->word, ((Dict*)b)->word);
}
/*
 * Read lines of the form "word <white space> codes" from file
 * descriptor f.  For each line the word is copied into space[], and a
 * words[] entry is added whose encode field is typecode() of the text
 * following the word.
 *
 * Lines that contain no word (empty or all white space) are skipped.
 * Exits with status "words" or "space" when the corresponding array
 * cannot hold the next entry; both checks are made before anything is
 * stored, so neither array is ever written past its end.
 * The descriptor itself is left open for the caller.
 */
void
readinput(int f)
{
	long i;
	char *code, *line, *bword;
	Biobuf buf;

	Binit(&buf, f, OREAD);
	while((line = Brdline(&buf, '\n')) != 0) {
		/* overwrite the newline delimiter so the line is a C string */
		line[Blinelen(&buf)-1] = 0;

		/* the word runs from the first non-blank to the next blank */
		code = line;
		while(isspace((unsigned char)*code))
			code++;
		bword = code;
		while(*code && !isspace((unsigned char)*code))
			code++;
		i = code-bword;
		if(i == 0)
			continue;	/* blank line: there is no word to record */

		/* make sure both tables have room before storing anything */
		if(nwords >= sizeof(words)/sizeof(words[0])) {
			fprint(2, "words array too small\n");
			exits("words");
		}
		if(nspace+i+1 > sizeof(space)/sizeof(space[0])) {
			fprint(2, "space array too small\n");
			exits("space");
		}

		/* copy the word and its terminating NUL into space[] */
		memmove(space+nspace, bword, i);
		words[nwords].word = space+nspace;
		nspace += i;
		space[nspace] = 0;
		nspace++;

		/* step over the separator to reach the affix codes, if any */
		if(*code) {
			*code++ = 0;
			while(isspace((unsigned char)*code))
				code++;
		}
		words[nwords].encode = typecode(code);
		nwords++;
	}
	Bterm(&buf);
}
typedef struct Class Class;
/* One affix code name and the bits it contributes in typecode(). */
struct Class
{
char* codename;	/* name as written in the input; 0 marks the end of a table */
long bits;	/* value OR-ed into the code mask when the name matches */
};
/*
 * Affix code names grouped by first letter, one table per letter that
 * has any names.  Each table ends with an entry whose codename is 0.
 * typecode() scans a table in order and takes the first entry that
 * matches, so order within a table matters.
 * The bit constants (ADJ, ADV, ...) are not defined in this file;
 * presumably they come from code.h.
 */
Class codea[] =
{
{ "a", ADJ },
{ "adv", ADV },
0
};
Class codec[] =
{
{ "comp", COMP },
0
};
Class coded[] =
{
{ "d", DONT_TOUCH},
0
};
Class codee[] =
{
{ "ed", ED },
{ "er", ACTOR },
0
};
Class codei[] =
{
{ "in", IN },
{ "ion", ION },
0
};
Class codem[] =
{
{ "man", MAN },
{ "ms", MONO },
0
};
Class coden[] =
{
{ "n", NOUN },
{ "na", N_AFFIX },
{ "nopref", NOPREF },
0
};
Class codep[] =
{
{ "pc", PROP_COLLECT },
0
};
Class codes[] =
{
{ "s", STOP },
0
};
Class codev[] =
{
{ "v", VERB },
{ "va", V_AFFIX },
{ "vi", V_IRREG },
0
};
Class codey[] =
{
{ "y", _Y },
0
};
/* empty table, used for every letter that starts no code name */
Class codez[] =
{
0
};
/*
 * Table of per-letter class tables, indexed by (first letter - 'a').
 * There are exactly 26 slots, one for each of 'a' through 'z'.
 */
Class* codetab[] =
{
codea,	/* a */
codez,	/* b */
codec,	/* c */
coded,	/* d */
codee,	/* e */
codez,	/* f */
codez,	/* g */
codez,	/* h */
codei,	/* i */
codez,	/* j */
codez,	/* k */
codez,	/* l */
codem,	/* m */
coden,	/* n */
codez,	/* o */
codep,	/* p */
codez,	/* q */
codez,	/* r */
codes,	/* s */
codez,	/* t */
codez,	/* u */
codev,	/* v */
codez,	/* w */
codez,	/* x */
codey,	/* y */
codez,	/* z */
};
/*
 * Translate a comma-separated list of affix code names into an index
 * into encodes[].
 *
 * Each name is looked up in the class table selected by its first
 * letter, and the bits of the matching entries are OR-ed together.
 * A name matches an entry when it equals the leading characters of the
 * entry's codename, so a prefix of a codename is accepted; the first
 * matching entry in table order wins.  The combined mask is then
 * searched for in encodes[] and appended if it is new.
 *
 * Returns the index of the mask in encodes[].  If a name is not
 * recognised (including an empty name or one that does not start with
 * a lower-case letter) a message is printed on file descriptor 2 and 0
 * is returned.  Exits with status "encodes" if encodes[] is full.
 *
 * NOTE(review): the format comment above pdict reserves 11 bits
 * (0x07ff) for this index, which is fewer values than encodes[] can
 * hold -- confirm whether the array bound or the format is the
 * intended limit.
 */
long
typecode(char *str)
{
	Class *p;
	long code;
	int n, i;
	char *s, *sp, *st;

	code = 0;

loop:
	/* s stops at the end of the current name: a ',' or the final NUL */
	for(s=str; *s != 0 && *s != ','; s++)
		;
	/*
	 * codetab only has slots for 'a' through 'z'; anything else
	 * (empty name, upper case, digit, stray comma) would index
	 * outside the table.
	 */
	if(*str < 'a' || *str > 'z')
		goto unknown;
	for(p = codetab[*str-'a']; (sp = p->codename) != 0; p++) {
		st = str;
		/* compare the s-str characters of the name with the codename */
		for(n=s-str;; st++,sp++) {
			if(*st != *sp)
				goto cont;
			n--;
			if(n == 0)
				break;
		}
		code |= p->bits;
		if(*s == 0)
			goto out;
		/* more names follow the comma */
		str = s+1;
		goto loop;
	cont:;
	}

unknown:
	fprint(2, "Unknown affix code \"%s\"\n", str);
	return 0;

out:
	for(i=0; i<ncodes; i++)
		if(encodes[i] == code)
			return i;
	/* new mask: i == ncodes here; make sure there is a free slot */
	if(ncodes >= sizeof(encodes)/sizeof(encodes[0])) {
		fprint(2, "encodes array too small\n");
		exits("encodes");
	}
	encodes[i] = code;
	ncodes++;
	return i;
}
/*
 * Emit s on bout as two Bputc calls: first s shifted right by 8,
 * then s itself (most significant byte first).
 */
void
sput(int s)
{
	int hi;

	hi = s>>8;
	Bputc(&bout, hi);
	Bputc(&bout, s);
}
/*
 * Emit l on bout as four Bputc calls, shifting right by 24, 16, 8
 * and 0 bits in that order (most significant byte first).
 */
void
lput(long l)
{
	int shift;

	for(shift=24; shift>=0; shift-=8)
		Bputc(&bout, l>>shift);
}
/*
* spit out the encoded dictionary
* all numbers are encoded big-endian.
* struct
* {
* short ncodes;
* long encodes[ncodes];
* struct
* {
* short encode;
* char word[*];
* } words[*];
* };
* 0x8000 flag for code word
* 0x7800 count of number of common bytes with previous word
* 0x07ff index into codes array for affixes
*/
void
pdict(void)
{
long i, count;
int encode, j, c;
char *lastword, *thisword, *word;
sput(ncodes);
for(i=0; i<ncodes; i++)
lput(encodes[i]);
count = ncodes*4 + 2;
lastword = "";
for(i=0; i<nwords; i++) {
word = words[i].word;
thisword = word;
for(j=0; *thisword == *lastword; j++) {
if(*thisword == 0) {
fprint(2, "identical words: %s\n", word);
break;
}
thisword++;
lastword++;
}
if(j > 15)
j = 15;
encode = words[i].encode;
c = (1<<15) | (j<<11) | encode;
sput(c);
count += 2;
for(thisword=word+j; c = *thisword; thisword++) {
Bputc(&bout, c);
count++;
}
lastword = word;
}
fprint(2, "output bytes = %ld\n", count);
}
|