/*
* suggest.c - generate suggestions for spelling errors.
*
* term% spell doc.ms | suggest
*
* The triagram permute validation seems a good idea, however in
* testing it rejected only 20% of the initial suggestions. This
* is comperable in cpu terms to the overhead in generating the
* probability tables in the first place.
*
%A V. I. Levenshtein
%T Binary codes capable of correcting deletions, insertions, and reversals
%J Soviet Physics Doklady 10
%D 1966
%V 707710
%T Hanging on the Metaphone
%A Lawrence Philips
%J Computer Language
%D December 1990
%P 38
*/
#include <u.h>
#include <libc.h>
#include <ctype.h>
#include <bio.h>
#include <pool.h>
/*
* These constants are rather arbitary but
* to work for my (rather small) corpus.
*/
enum{
Idx = 3, // apply Dist below after this many suggestions
Dist = 2, // stop offering suggestions if levenshtein is above this
Maxidx = 8, // give up after this many suggestions
// triagramme table
Trilen = 28, // side length
Begword = 0, // code for begining of word
Endword = 26, // code for end of word
Nalpha = 27, // code for non alpha character
Ncode = 4 // length of metaphone hash used
};
char *Wordfile = "/lib/words";
Biobuf *Words = nil;
uvlong Unknown = 0; // stats
uvlong Rejected = 0;
uvlong Generated = 0;
uvlong Ntri = 0; // number of triagrammes
uint ***Tri = nil; // triagramme counts
typedef struct Table Table;
typedef struct Result Result;
struct Table {
char code[Ncode];
vlong off;
};
struct Result {
Result *next;
char code[Ncode];
char *word;
int dist;
};
Table *Tab = nil;
int Ntab = 0;
void
metaphone(char *name, char *buf, int len)
{
int ii, jj, silent, hard, Lng, lastChr;
char curLtr, prevLtr, nextLtr, nextLtr2, nextLtr3;
int vowelAfter, vowelBefore, frontvAfter;
char *p, *p1;
char wname[60];
char *ename = wname;
static char *VOWELS = "AEIOU";
static char *FRONTV = "EIY"; /* special cases for letters in FRONT of these */
static char *VARSON = "CSPTG"; /* variable sound--those modified by adding an "h" */
static char *DOUBLE = "."; /* let these double letters through */
static char *excpPAIR = "AGKPW"; /* exceptions "ae-", "gn-", "kn-", "pn-", "wr-" */
static char *nextLTR = "ENNNR";
memset(buf, 0, len);
jj = 0;
for(ii = 0; name[ii] != '\0'; ii++) {
if(isalpha(name[ii])) {
ename[jj] = toupper(name[ii]);
jj++;
}
}
ename[jj] = '\0';
if(strlen(ename) == 0)
return;
/* if ae, gn, kn, pn, wr then drop the first letter */
if((p = strchr(excpPAIR, ename[0])) != nil) {
p1 = nextLTR + (p - excpPAIR);
if(*p1 == ename[1])
strcpy(ename, &ename[1]);
}
/* change x to s */
if(ename[0] == 'X')
ename[0] = 'S';
/* get rid of the "h" in "wh" */
if(strncmp(ename, "WH", 2) == 0)
strcpy(&ename[1], &ename[2]);
Lng = strlen(ename);
lastChr = Lng - 1; /* index to last character in string makes code easier */
/* Remove an S from the end of the string */
if(ename[lastChr] == 'S') {
ename[lastChr] = '\0';
Lng = strlen(ename);
lastChr = Lng - 1;
}
for(ii = 0; ((strlen(buf) < len) && (ii < Lng)); ii++) {
curLtr = ename[ii];
vowelBefore = 0;
prevLtr = ' ';
if(ii > 0) {
prevLtr = ename[ii - 1];
if(strchr(VOWELS, prevLtr) != nil)
vowelBefore = 1;
}
/* if first letter is a vowel KEEP it */
if(ii == 0 && (strchr(VOWELS, curLtr) != nil)) {
strncat(buf, &curLtr, 1);
continue;
}
vowelAfter = 0;
frontvAfter = 0;
nextLtr = ' ';
if(ii < lastChr) {
nextLtr = ename[ii + 1];
if(strchr(VOWELS, nextLtr) != nil)
vowelAfter = 1;
if(strchr(FRONTV, nextLtr) != nil)
frontvAfter = 1;
}
/* skip double letters except ones in list */
if(curLtr == nextLtr && (strchr(DOUBLE, nextLtr) == nil))
continue;
nextLtr2 = ' ';
if(ii < (lastChr - 1))
nextLtr2 = ename[ii + 2];
nextLtr3 = ' ';
if(ii < (lastChr - 2))
nextLtr3 = ename[ii + 3];
switch (curLtr) {
case 'B':
silent = 0;
if(ii == lastChr && prevLtr == 'M')
silent = 1;
if(!silent)
strncat(buf, &curLtr, 1);
break;
/* silent -sci-,-sce-,-scy-; sci-, etc OK */
case 'C':
if(!(ii > 1 && prevLtr == 'S' && frontvAfter))
if(ii > 0 && nextLtr == 'I' && nextLtr2 == 'A')
strncat(buf, "X", 1);
else if(frontvAfter)
strncat(buf, "S", 1);
else if(ii > 1 && prevLtr == 'S' && nextLtr == 'H')
strncat(buf, "K", 1);
else if(nextLtr == 'H')
if(ii == 0 && (strchr(VOWELS, nextLtr2) == nil))
strncat(buf, "K", 1);
else
strncat(buf, "X", 1);
else if(prevLtr == 'C')
strncat(buf, "C", 1);
else
strncat(buf, "K", 1);
break;
case 'D':
if(nextLtr == 'G' && (strchr(FRONTV, nextLtr2) != nil))
strncat(buf, "J", 1);
else
strncat(buf, "T", 1);
break;
case 'G':
silent = 0;
/* SILENT -gh- except for -gh and no vowel after h */
if((ii < (lastChr - 1) && nextLtr == 'H')
&& (strchr(VOWELS, nextLtr2) == nil))
silent = 1;
if((ii == (lastChr - 3))
&& nextLtr == 'N' && nextLtr2 == 'E' && nextLtr3 == 'D')
silent = 1;
else if((ii == (lastChr - 1)) && nextLtr == 'N')
silent = 1;
if(prevLtr == 'D' && frontvAfter)
silent = 1;
if(prevLtr == 'G')
hard = 1;
else
hard = 0;
if(!silent)
if(frontvAfter && (!hard))
strncat(buf, "J", 1);
else
strncat(buf, "K", 1);
break;
case 'H':
silent = 0;
if(strchr(VARSON, prevLtr) != nil)
silent = 1;
if(vowelBefore && !vowelAfter)
silent = 1;
if(!silent)
strncat(buf, &curLtr, 1);
break;
case 'F':
case 'J':
case 'L':
case 'M':
case 'N':
case 'R':
strncat(buf, &curLtr, 1);
break;
case 'K':
if(prevLtr != 'C')
strncat(buf, &curLtr, 1);
break;
case 'P':
if(nextLtr == 'H')
strncat(buf, "F", 1);
else
strncat(buf, "P", 1);
break;
case 'Q':
strncat(buf, "K", 1);
break;
case 'S':
if(ii > 1 && nextLtr == 'I' && (nextLtr2 == 'O' || nextLtr2 == 'A'))
strncat(buf, "X", 1);
else if(nextLtr == 'H')
strncat(buf, "X", 1);
else
strncat(buf, "S", 1);
break;
case 'T':
if(ii > 1 && nextLtr == 'I' && (nextLtr2 == 'O' || nextLtr2 == 'A'))
strncat(buf, "X", 1);
else if(nextLtr == 'H') /* The=0, Tho=T, Withrow=0 */
if(ii > 0 || (strchr(VOWELS, nextLtr2) != nil))
strncat(buf, "0", 1);
else
strncat(buf, "T", 1);
else if(!(ii < (lastChr - 2) && nextLtr == 'C' && nextLtr2 == 'H'))
strncat(buf, "T", 1);
break;
case 'V':
strncat(buf, "F", 1);
break;
case 'W':
case 'Y':
if(ii < lastChr && vowelAfter)
strncat(buf, &curLtr, 1);
break;
case 'X':
strncat(buf, "KS", 2);
break;
case 'Z':
strncat(buf, "S", 1);
break;
}
}
/*
* DON'T DO THIS NOW, REMOVING "S" IN BEGINNING HAS the same effect
* with plurals, in addition imbedded S's in the Metaphone are included
*
* Lng = strlen(buf);
* lastChr = Lng -1; if (buf[lastChr] == 'S' && Lng >= 3 )
* buf[lastChr] = '\0';
*/
}
/**********************************************************/
/*
* Lifted from /sys/src/ape/lib/ap/gen/bsearch.c
* and stripped of it's ANSI-isms
*/
void*
bsearch(void* key, void* base, int nmemb, int size, int (*compar)(void*, void*))
{
long i, bot, top, new;
void *p;
bot = 0;
top = bot + nmemb - 1;
while(bot <= top){
new = (top + bot)/2;
p = (char *)base+new*size;
i = (*compar)(key, p);
if(i == 0)
return p;
if(i > 0)
bot = new + 1;
else
top = new - 1;
}
return 0;
}
/**********************************************************/
int
min3(int a, int b, int c)
{
int min = a;
if(b < min)
min = b;
if(c < min)
min = c;
return min;
}
int
levenshtein(char *s, char *t)
{
static int *d = nil;
static int alloc = 0;
int k, i, j, n, m, cost, distance;
n = strlen(s);
m = strlen(t);
if(n == 0 || m == 0)
return -1;
/*
* This leaks once on exit, but who cares
*/
if((m+1)*(n+1) > alloc){
free(d);
alloc = (m+1)*(n+1)+32;
if((d = malloc(sizeof(int)*alloc)) == nil)
sysfatal("no memory\n");
}
m++;
n++;
for(k = 0; k < n; k++)
d[k] = k;
for(k = 0; k < m; k++)
d[k*n] = k;
for(i = 1; i < n; i++)
for(j = 1; j < m; j++) {
cost = (s[i-1] == t[j-1])? 0: 1;
d[j*n+i] = min3(d[(j-1)*n+i]+1, d[j*n+i-1]+1, d[(j -1)*n+i-1]+cost);
#ifdef NOT_DEFINED
if(i>=2 && j>=2 && (d[j*n+i]-d[(j-2)*n+i-2]==2) && (s[i-2]==t[j-1]) && (s[i-1]==t[j-2]))
d[j*n+i]--;
#endif
}
distance = d[n*m-1];
return distance;
}
/**********************************************************/
int
enc(char c)
{
if(! isalpha(c))
return Nalpha;
if(isupper(c))
c = tolower(c);
return c - 'a';
}
void
triagrams(char *word)
{
char *p;
if(word[0] == 0 || word[1] == 0) // too short
return;
Ntri++;
p = word;
Tri[Begword][enc(p[0])][enc(p[1])]++;
for(; p[0] && p[1] && p[2]; p++)
Tri[enc(p[0])][enc(p[1])][enc(p[2])]++;
Tri[enc(p[0])][enc(p[1])][Endword]++;
}
double
triprob(char *p, int l, int i)
{
double n;
if(i > l-2)
return 0.5; // too short
if(i == 0)
n = Tri[Begword][enc(p[0])][enc(p[1])];
else
if(i == l-1)
n = Tri[enc(p[i])][enc(p[i+1])][Endword];
else
n = Tri[enc(p[i-1])][enc(p[i+0])][enc(p[i+1])];
return n / (double)Ntri;
}
/**********************************************************/
int
tabcmp(void *a, void *b)
{
Table *x = (Table *)a;
Table *y = (Table *)b;
return memcmp(x->code, y->code, Ncode);
}
int
rescmp(void *a, void *b)
{
Result **x = (Result **)a;
Result **y = (Result **)b;
if((*x)->dist == (*y)->dist)
return strcmp((*x)->word, (*y)->word);
return (*x)->dist - (*y)->dist;
}
void
ingest(void)
{
char *p;
vlong off;
int alloc;
off = 0;
Tab = nil;
Ntab = alloc = 0;
while((p = Brdline(Words, '\n')) != nil){
p[Blinelen(Words) -1] = 0;
triagrams(p);
if(Ntab >= alloc){
alloc += 128;
if((Tab = realloc(Tab, alloc*sizeof(Table))) == nil)
sysfatal("No memory: %r\n");
}
metaphone(p, Tab[Ntab].code, Ncode);
Tab[Ntab].off = off;
Ntab++;
off = Bseek(Words, 0, 1);
}
qsort(Tab, Ntab, sizeof(Table), tabcmp);
}
void
results(char *word, Result *root)
{
int n, i;
char *prev;
Result **idx, *r;
n = 0;
for(r = root; r; r = r->next)
n++;
if((idx = malloc(n * sizeof(Result *))) == nil)
sysfatal("No memory: %r\n");
i = 0;
for(r = root; r; r = r->next)
idx[i++] = r;
qsort(idx, n, sizeof(Result *), rescmp);
print("%-16s → ", word);
prev = "";
for(i = 0; i < n; i++){
r = idx[i];
if(strcmp(r->word, prev) == 0)
continue;
print("%s ", r->word);
if(r->dist == 0)
break;
if((i > Idx && r->dist > Dist) || i > Maxidx)
break;
prev = r->word;
}
print("\n");
free(idx);
}
int
lookup(Result **root, char *word, Table *tp)
{
Result *r;
char l, *p;
Table *first, *last;
for(r = *root; r; r = r->next)
if(memcmp(r->code, tp->code, Ncode) == 0) // already tested
return 0;
for(first = tp; tabcmp(tp, first) == 0 && first >= Tab; first--)
continue;
first++;
for(last = tp; tabcmp(tp, last) == 0; last++)
continue;
for(tp = first; tp < last; tp++){
Bseek(Words, tp->off, 0);
if((p = Brdline(Words, '\n')) == nil){
fprint(2, "%s - read failed at off=%lld: %r\n", Wordfile, tp->off);
continue;
}
l = Blinelen(Words) -2;
p[l+1] = 0;
if((r = malloc(sizeof(Result)+strlen(p)+1)) == nil)
sysfatal("No memory");
r->dist = levenshtein(word, p);
memcpy(r->code, tp->code, Ncode);
r->word = ((char *)r)+sizeof(Result);
strcpy(r->word, p);
r->next = *root;
*root = r;
if(r->dist == 0)
return 1;
/*
* Workaround for feature of deroff where it strips
* trailing periods from all words, even abbreviations.
*/
if(p[l] == '.' && strncmp(p, word, l-1) == 0){
r->dist = 0;
return 1;
}
}
return 0;
}
void
permute(char *word)
{
char *buf;
int i, j, l;
Table t, *tp;
Result *r, *nr, *root;
root = nil;
l = strlen(word);
if((buf = malloc(l+2)) == nil) // 2 == 1 insertion + string terminator
sysfatal("No memory %r\n");
/* unchanged */
metaphone(word, t.code, Ncode);
if((tp = bsearch(&t, Tab, Ntab, sizeof(Table), tabcmp)) != nil){
Generated++;
if(lookup(&root, word, tp) == 1)
Unknown++;
}
/* deletion */
for(i = 0; i < l; i++){
Generated++;
strncpy(buf, word, i);
strcpy(buf+i, word+i+1);
if(triprob(buf, l, i) == 0){
Rejected++;
continue;
}
metaphone(buf, t.code, Ncode);
if((tp = bsearch(&t, Tab, Ntab, sizeof(Table), tabcmp)) != nil)
lookup(&root, word, tp);
Unknown++;
}
/* modification */
for(j = 'a'; j <= 'z'; j++){
for(i = 0; i < l; i++){
Generated++;
strcpy(buf, word);
buf[i] = j;
if(triprob(buf, l, i) == 0){
Rejected++;
continue;
}
metaphone(buf, t.code, Ncode);
if((tp = bsearch(&t, Tab, Ntab, sizeof(Table), tabcmp)) != nil)
lookup(&root, word, tp);
Unknown++;
}
}
/* transposition */
for(j = 0; j < l-1; j++){
for(i = j+1; i < l; i++){
Generated++;
strcpy(buf, word);
buf[i] = word[j];
buf[j] = word[i];
if(triprob(buf, l, i) == 0 || triprob(buf, l, j) == 0){
Rejected++;
continue;
}
metaphone(buf, t.code, Ncode);
if((tp = bsearch(&t, Tab, Ntab, sizeof(Table), tabcmp)) != nil)
lookup(&root, word, tp);
Unknown++;
}
}
/* insertion */
for(j = 'a'; j <= 'z'; j++){
for(i = 0; i < l; i++){
Generated++;
strncpy(buf, word, i);
buf[i+1] = j;
strcpy(buf+i+1, word+i);
if(triprob(buf, l, i+1) == 0){
Rejected++;
continue;
}
metaphone(buf, t.code, Ncode);
if((tp = bsearch(&t, Tab, Ntab, sizeof(Table), tabcmp)) != nil)
lookup(&root, word, tp);
Unknown++;
}
}
results(word, root);
free(buf);
for(r = root; r; r = nr){
nr = r->next;
free(r);
}
}
void
usage(void)
{
fprint(2, "usage: %s [-s] [-d dict] [words]\n", argv0);
fprint(2, " -d dict use dict rather than %s\n", Wordfile);
fprint(2, " -s print stats at exit\n");
exits("usage");
}
void
main(int argc, char *argv[])
{
char *p;
Biobuf bi;
int i, j, stats;
stats = 0;
ARGBEGIN{
case 's':
stats++;
break;
case 'd':
Wordfile = EARGF(usage());
break;
default:
usage();
break;
}ARGEND;
if((Words = Bopen(Wordfile, OREAD)) == nil)
sysfatal("%s - cannot open: %r\n", Wordfile);
if((Tri = mallocz(sizeof(Tri[0][0][0]) * Trilen, 1)) == nil)
sysfatal("No memory: %r\n");
for(i = 0; i < Trilen; i++){
if((Tri[i] = mallocz(sizeof(Tri[0][0][0]) * Trilen, 1)) == nil)
sysfatal("No memory: %r\n");
for(j = 0; j < Trilen; j++)
if((Tri[i][j] = mallocz(sizeof(Tri[0][0][0]) * Trilen, 1)) == nil)
sysfatal("No memory: %r\n");
}
ingest();
if(argc){
for(i = 0; i < argc; i++)
permute(argv[i]);
}
else{
Binit(&bi, OREAD, 0);
while((p = Brdline(&bi, '\n')) != nil){
for(i = Blinelen(&bi) -1; isspace(p[i]) && i >= 0; i--)
p[i] = 0;
for(; isspace(*p); p++)
continue;
if(*p == 0)
continue;
permute(p);
}
Bterm(&bi);
}
free(Tab);
Bterm(Words);
if(stats)
print("generated: %llud rejected: %llud unknown: %llud\n",
Generated, Rejected, Unknown);
exits(0);
}
|