/*
 * /usr/web/sources/contrib/tristan/dict/gerlexpro.c
 *
 * Plan 9 from Bell Labs’s /usr/web/sources/contrib/tristan/dict/gerlexpro.c
 */

#include <u.h>
#include <libc.h>
#include <bio.h>
#include "dict.h"

/* Dictionaries from the Germanic Lexicon Project:
  * "An Anglo-Saxon Dictionary" (Bosworth + Toller)
  * "An Icelandic English Dictionary" (Cleasby + Vigfusson)
  * http://lexicon.ff.cuni.cz/
  */
/* Sizes of the scratch buffers used while parsing an entry */
enum {
	Buflen	= 1000,	/* bytes in each name/value buffer */
	Maxaux	= 5,	/* auxiliary (attribute) slots kept per tag */
};

/* Possible tags */
/*
 * Tag codes: the values stored in tagtab and switched on
 * when an entry is printed.  Ntag counts them.
 */
enum {
	B = 0,		/* <B>: bold text, also marks the headword */
	Header,		/* <HEADER>: page header */
	I,		/* <I>: italics */
	Intro,		/* <INTRODUCTION>: introduction */
	Letter,		/* <letterheader>: letter header */
	Page,		/* <PAGE>: page */
	Table,		/* <TABLE>: table */
	Td,		/* <TD>: table data */
	Th,		/* <TH>: table header */
	Tr,		/* <TR>: table row */
	Ntag		/* number of tag codes */
};

/* Assoc tables must be sorted on first field */

static Assoc tagtab[] = {
	{"B",				B},
	{"b",				B},
	{"HEADER",		Header},
	{"I",				I},
	{"i",				I},
	{"INTRODUCTION",	Intro},
	{"letterheader",		Letter},
	{"PAGE",			Page},
	{"TABLE",			Table},
	{"TD",			Td},
	{"TH",			Th},
	{"TR",			Tr},
};

/*
 * Maps the name of a special character (the text between '&' and ';')
 * to the rune printed for it.  Like every Assoc table it must stay
 * sorted on the name, in byte order (upper case before lower case,
 * '-' before letters).
 */
static Assoc spectab[] = {
	{"AElig",	L'Æ'},
	{"AElig-acute",	L'Ǽ'},
	{"Aacute",	L'Á'},
	{"Aring",	L'Å'},
	{"Beta",	L'Β'},
	{"ETH",		L'Ð'},
	{"Eacute",	L'É'},
	{"FINGER",	L'☞'},
	{"Iacute",	L'Í'},
	{"OElig",	L'Œ'},
	{"Oacute",	L'Ó'},
	{"Ouml",	L'Ö'},
	{"THORN",	L'Þ'},
	{"Uacute",	L'Ú'},
	{"Uuml",	L'Ü'},
	{"Yacute",	L'Ý'},
	{"a-long",	L'ā'},
	{"a-short",	L'ă'},		/* a with breve */
	{"aacute",	L'á'},
	{"acirc",	L'â'},
	{"aelig",	L'æ'},
	{"aelig-acute",	L'ǽ'},
	{"aelig-circ",	L'æ'},		/* fixme: circumflex is dropped */
	{"aelig-long",	L'ǣ'},
	{"alpha",	L'α'},
	{"alpha-tonos",	L'ά'},
	{"amp",		L'&'},
	{"aolig-acute",	L'ꜵ'},		/* fixme: acute is dropped */
	{"aring",	L'å'},
	{"auml",	L'ä'},
	{"b-bar",	L'ƀ'},		/* b with bar U+0180 */
	{"b-rune",	L'b'},		/* fixme: should be rune */
	{"beta",	L'β'},
	{"bull",	L'•'},
	{"c-rune",	L'c'},		/* fixme: should be rune */
	{"chi",		L'χ'},
	{"d-bar",	L'ð'},
	{"dash-uncertain",	L'-'},	/* fixme: the literal here was empty (not valid C); plain hyphen is a stand-in */
	{"delta",	L'δ'},
	{"e-hook",	L'ẻ'},
	{"e-long",	L'ē'},
	{"e-rune",	L'e'},		/* fixme: should be rune */
	{"e-short",	L'ĕ'},		/* e with breve */
	{"eacute",	L'é'},
	{"ecirc",	L'ê'},
	{"egrave",	L'è'},
	{"epsilon",	L'ε'},
	{"epsilon-tonos",	L'έ'},
	{"eta",		L'η'},
	{"eta-tonos",	L'ή'},
	{"eth",		L'ð'},
	{"euml",	L'ë'},
	{"f-rune",	L'f'},		/* fixme: should be rune */
	{"frac12",	L'½'},
	{"gamma",	L'γ'},
	{"hand",	L'☞'},
	{"i-long",	L'ī'},
	{"i-short",	L'ĭ'},		/* i with breve */
	{"iacute",	L'í'},
	{"icirc",	L'î'},
	{"igrave",	L'ì'},
	{"iota",	L'ι'},
	{"iota-oxia",	L'ί'},
	{"iota-tonos",	L'ί'},
	{"iuml",	L'ï'},
	{"kappa",	L'κ'},
	{"l-bar",	L'ł'},
	{"l-rune",	L'l'},		/* fixme: should be rune */
	{"lambda",	L'λ'},
	{"mdash",	L'—'},
	{"mu",		L'μ'},
	{"n-long",	L'n'},		/* um? */
	{"n-rune",	L'n'},		/* fixme: should be rune */
	{"ntilde",	L'ñ'},
	{"nu",		L'ν'},
	{"o-long",	L'ō'},
	{"o-short",	L'ŏ'},		/* o with breve */
	{"oacute",	L'ó'},
	{"obar",	L'ø'},
	{"ocirc",	L'ô'},
	{"oelig",	L'œ'},
	{"oelig-acute",	L'œ'},		/* fixme: acute is dropped */
	{"ograve",	L'ò'},
	{"omega",	L'ω'},
	{"omega-tonos",	L'ώ'},
	{"omicron",	L'ο'},
	{"omicron-tonos",	L'ό'},
	{"oslash",	L'ø'},
	{"ouml",	L'ö'},
	{"para",	L'¶'},
	{"phi",		L'φ'},
	{"pi",		L'π'},
	{"pound",	L'£'},
	{"psi",		L'ψ'},
	{"r-long",	L'r'},		/* with macron? or ɼ */
	{"r-udot",	L'·'},		/* fixme: possibly r with dot below (ṛ); confirm against the dictionary text */
	{"rho",		L'ρ'},
	{"sect",	L'§'},
	{"sigma",	L'σ'},
	{"sigmaf",	L'ς'},
	{"szlig",	L'ß'},		/* fixme? */
	{"tau",		L'τ'},
	{"theta",	L'θ'},
	{"thorn",	L'þ'},
	{"tilde",	L'~'},		/* fixme, should these be x + tilde */
	{"u-long",	L'ū'},
	{"u-rune",	L'u'},		/* fixme: should be rune */
	{"u-short",	L'ŭ'},		/* u with breve */
	{"uacute",	L'ú'},
	{"ucirc",	L'û'},
	{"ugrave",	L'ù'},
	{"upsilon",	L'υ'},
	{"upsilon-tonos",	L'ύ'},
	{"uuml",	L'ü'},
	{"w-rune",	L'w'},		/* fixme: should be rune */
	{"xi",		L'ξ'},
	{"y-long",	L'ȳ'},		/* y with macron */
	{"y-rune",	L'y'},		/* fixme: should be rune */
	{"yacute",	L'ý'},
	{"yogh",	L'ʒ'},
	{"zeta",	L'ζ'},
};

/*
 * Translation of each 7-bit input byte, indexed by byte value.
 * Control bytes and DEL map to NONE; '&' maps to SPCS, '<' to TAGS
 * and '>' to TAGE; every other byte maps to itself.
 */
static Rune normtab[128] = {
	/* 0x00 */ NONE,  NONE,  NONE,  NONE,  NONE,  NONE,  NONE,  NONE,
	/* 0x08 */ NONE,  NONE,  NONE,  NONE,  NONE,  NONE,  NONE,  NONE,
	/* 0x10 */ NONE,  NONE,  NONE,  NONE,  NONE,  NONE,  NONE,  NONE,
	/* 0x18 */ NONE,  NONE,  NONE,  NONE,  NONE,  NONE,  NONE,  NONE,
	/* 0x20 */ L' ',  L'!',  L'"',  L'#',  L'$',  L'%',  SPCS,  L'\'',
	/* 0x28 */ L'(',  L')',  L'*',  L'+',  L',',  L'-',  L'.',  L'/',
	/* 0x30 */ L'0',  L'1',  L'2',  L'3',  L'4',  L'5',  L'6',  L'7',
	/* 0x38 */ L'8',  L'9',  L':',  L';',  TAGS,  L'=',  TAGE,  L'?',
	/* 0x40 */ L'@',  L'A',  L'B',  L'C',  L'D',  L'E',  L'F',  L'G',
	/* 0x48 */ L'H',  L'I',  L'J',  L'K',  L'L',  L'M',  L'N',  L'O',
	/* 0x50 */ L'P',  L'Q',  L'R',  L'S',  L'T',  L'U',  L'V',  L'W',
	/* 0x58 */ L'X',  L'Y',  L'Z',  L'[',  L'\\', L']',  L'^',  L'_',
	/* 0x60 */ L'`',  L'a',  L'b',  L'c',  L'd',  L'e',  L'f',  L'g',
	/* 0x68 */ L'h',  L'i',  L'j',  L'k',  L'l',  L'm',  L'n',  L'o',
	/* 0x70 */ L'p',  L'q',  L'r',  L's',  L't',  L'u',  L'v',  L'w',
	/* 0x78 */ L'x',  L'y',  L'z',  L'{',  L'|',  L'}',  L'~',  NONE,
};

/* Parser state shared by gettag, getspec and gerlexproprintentry */
static int	tagstarts;	/* set by gettag: 1 for an opening tag, 0 for a closing one */
static char	tag[Buflen];	/* name of the tag most recently read by gettag */
static int	naux;	/* auxiliary-pair counter maintained by gettag */
static char	auxname[Maxaux][Buflen];	/* auxiliary names filled in by gettag */
static char	auxval[Maxaux][Buflen];	/* auxiliary values filled in by gettag */
static char	spec[Buflen];	/* special-character name most recently read by getspec */
static Entry	curentry;	/* entry being printed; set by gerlexproprintentry */
/* length of the current entry: end pointer minus start pointer */
#define cursize (curentry.end-curentry.start)

/* parsing helpers, defined at the end of this file */
static char	*getspec(char *, char *);
static char	*gettag(char *, char *);

/*
 * Print the entry e.
 * cmd is one of:
 *    'p': normal print
 *    'h': just print the headword
 *    'r': print raw (every byte of the entry is copied out unchanged)
 *
 * In the non-raw modes each byte is masked to 7 bits and translated
 * through normtab: ordinary characters are printed, '&' starts a
 * special-character name that is looked up in spectab, '<' starts a
 * tag that is looked up in tagtab, and everything else is dropped.
 * Unknown specials print as L'�'; unknown tags are skipped.  Both are
 * reported through err() when debug is set.
 */
void
gerlexproprintentry(Entry e, int cmd)
{
	char *p, *pe;
	int t, headword;
	long r;

	p = e.start;
	pe = e.end;
	changett(0, 0, 0);
	headword = 1;	/* still inside the headword */
	curentry = e;

	if(cmd == 'r') {
		while(p < pe)
			outchar(*p++);
		outnl(0);
		return;
	}

	while(p < pe) {
		r = normtab[(*p++)&0x7F];
		if(r < NONE) {
			outrune(r);
			continue;
		}
		if(r == SPCS) {
			/* Start of special character name */
			p = getspec(p, pe);
			r = lookassoc(spectab, asize(spectab), spec);
			if(r == -1) {
				/*
				 * cursize is a pointer difference; cast it so the
				 * argument always matches the %ld verb.
				 */
				if(debug)
					err("spec %ld %ld %s",
						e.doff, (long)cursize, spec);
				r = L'�';
			}
			outrune(r);
			continue;
		}
		if(r != TAGS)
			continue;

		/* Start of tag name */
		p = gettag(p, pe);
		t = lookassoc(tagtab, asize(tagtab), tag);
		if(t == -1) {
			if(debug)
				err("tag %ld %ld %s",
					e.doff, (long)cursize, tag);
			continue;
		}
		switch(t){
		case B:
		case Letter:
			/* the first closing bold/letter tag ends the headword */
			if(!tagstarts && headword){
				outnl(0);
				headword = 0;
				if(cmd == 'h')
					return;
			}
			break;
		case Header:
			/* inhibit output between <HEADER> and </HEADER> */
			outinhibit = tagstarts;
			break;
		case Table:
			/*
			 * Todo: gather columns, justify them, etc.
			 * For now, just let columns come out as rows
			 */
			if(!tagstarts)
				outnl(0);
			break;
		case Tr:
			if(tagstarts)
				outnl(0);
			break;
		default:
			/* the remaining tags have no effect on the output */
			break;
		}
	}
	outnl(0);
}

/*
 * Return offset into bdict where next entry after fromoff starts.
 * Entries start with \n<B> or \n<letterheader>; only the first two
 * characters after '<' ("B>" or "le") are examined.
 * Returns -1 if the seek fails, and the offset reached at end of
 * input if no further entry start is found.
 */
long
gerlexpronextoff(long fromoff)
{
	int c;

	if(Bseek(bdict, fromoff, 0) < 0)
		return -1;

	c = Bgetc(bdict);
	while(c >= 0) {
		if(c != '\n') {
			c = Bgetc(bdict);
			continue;
		}
		/*
		 * After each failed comparison c holds the character just
		 * read, so the loop re-examines it (it may be another '\n').
		 */
		c = Bgetc(bdict);
		if(c != '<')
			continue;
		c = Bgetc(bdict);
		if(c == 'B') {
			c = Bgetc(bdict);
			if(c == '>')
				return Boffset(bdict) - 3;	/* back up to the '<' */
		} else if(c == 'l') {
			/*
			 * Kept as a separate branch so that "<Bl" followed by
			 * 'e' is not mistaken for "<le".
			 */
			c = Bgetc(bdict);
			if(c == 'e')
				return Boffset(bdict) - 3;	/* back up to the '<' */
		}
	}
	return Boffset(bdict);
}

/*
 * Text written out by gerlexproprintkey: a key to the phonetic
 * symbols (consonants, vowels and diphthongs, stress marks), each
 * shown with example words.
 */
static char *prkey =
"KEY TO THE PRONUNCIATION\n"
"\n"
"I. CONSONANTS\n"
"b, d, f, k, l, m, n, p, t, v, z: usual English values\n"
"\n"
"g as in go (gəʊ)\n"
"h  ...  ho! (həʊ)\n"
"r  ...  run (rʌn), terrier (ˈtɛriə(r))\n"
"(r)...  her (hɜː(r))\n"
"s  ...  see (siː), success (səkˈsɜs)\n"
"w  ...  wear (wɛə(r))\n"
"hw ...  when (hwɛn)\n"
"j  ...  yes (jɛs)\n"
"θ  ...  thin (θin), bath (bɑːθ)\n"
"ð  ...  then (ðɛn), bathe (beɪð)\n"
"ʃ  ...  shop (ʃɒp), dish (dɪʃ)\n"
"tʃ ...  chop (tʃɒp), ditch (dɪtʃ)\n"
"ʒ  ...  vision (ˈvɪʒən), déjeuner (deʒøne)\n"
"dʒ ...  judge (dʒʌdʒ)\n"
"ŋ  ...  singing (ˈsɪŋɪŋ), think (θiŋk)\n"
"ŋg ...  finger (ˈfiŋgə(r))\n"
"\n"
"Foreign\n"
"ʎ as in It. seraglio (serˈraʎo)\n"
"ɲ  ...  Fr. cognac (kɔɲak)\n"
"x  ...  Ger. ach (ax), Sc. loch (lɒx)\n"
"ç  ...  Ger. ich (ɪç), Sc. nicht (nɪçt)\n"
"ɣ  ...  North Ger. sagen (ˈzaːɣən)\n"
"c  ...  Afrikaans baardmannetjie (ˈbaːrtmanəci)\n"
"ɥ  ...  Fr. cuisine (kɥizin)\n"
"\n"
"II. VOWELS AND DIPTHONGS\n"
"\n"
"Short\n"
"ɪ as in pit (pɪt), -ness (-nɪs)\n"
"ɛ  ...  pet (pɛt), Fr. sept (sɛt)\n"
"æ  ...  pat (pæt)\n"
"ʌ  ...  putt (pʌt)\n"
"ɒ  ...  pot (pɒt)\n"
"ʊ  ...  put (pʊt)\n"
"ə  ...  another (əˈnʌðə(r))\n"
"(ə)...  beaten (ˈbiːt(ə)n)\n"
"i  ...  Fr. si (si)\n"
"e  ...  Fr. bébé (bebe)\n"
"a  ...  Fr. mari (mari)\n"
"ɑ  ...  Fr. bâtiment (bɑtimã)\n"
"ɔ  ...  Fr. homme (ɔm)\n"
"o  ...  Fr. eau (o)\n"
"ø  ...  Fr. peu (pø)\n"
"œ  ...  Fr. boeuf (bœf), coeur (kœr)\n"
"u  ...  Fr. douce (dus)\n"
"ʏ  ...  Ger. Müller (ˈmʏlər)\n"
"y  ...  Fr. du (dy)\n"
"\n"
"Long\n"
"iː as in bean (biːn)\n"
"ɑː ...  barn (bɑːn)\n"
"ɔː ...  born (bɔːn)\n"
"uː ...  boon (buːn)\n"
"ɜː ...  burn (bɜːn)\n"
"eː ...  Ger. Schnee (ʃneː)\n"
"ɛː ...  Ger. Fähre (ˈfɛːrə)\n"
"aː ...  Ger. Tag (taːk)\n"
"oː ...  Ger. Sohn (zoːn)\n"
"øː ...  Ger. Goethe (gøːtə)\n"
"yː ...  Ger. grün (gryːn)\n"
"\n"
"Nasal\n"
"ɛ˜, æ˜ as in Fr. fin (fɛ˜, fæ˜)\n"
"ã  ...  Fr. franc (frã)\n"
"ɔ˜ ...  Fr. bon (bɔ˜n)\n"
"œ˜ ...  Fr. un (œ˜)\n"
"\n"
"Dipthongs, etc.\n"
"eɪ as in bay (beɪ)\n"
"aɪ ...  buy (baɪ)\n"
"ɔɪ ...  boy (bɔɪ)\n"
"əʊ ...  no (nəʊ)\n"
"aʊ ...  now (naʊ)\n"
"ɪə ...  peer (pɪə(r))\n"
"ɛə ...  pair (pɛə(r))\n"
"ʊə ...  tour (tʊə(r))\n"
"ɔə ...  boar (bɔə(r))\n"
"\n"
"III. STRESS\n"
"\n"
"Main stress: ˈ preceding stressed syllable\n"
"Secondary stress: ˌ preceding stressed syllable\n"
"\n"
"E.g.: pronunciation (prəˌnʌnsɪˈeɪʃ(ə)n)\n";
/* TODO: find transcriptions of foreign consonants, œ, ʏ, nasals */

/*
 * Print the pronunciation key (the prkey text) on bout.
 */
void
gerlexproprintkey(void)
{
	/* "%s" keeps any '%' in the text from being read as a format verb */
	Bprint(bout, "%s", prkey);
}

/*
 * f points just after a '&', fe points at end of entry.
 * Accumulate the special name, starting after the &
 * and continuing until the next ';', in spec[].
 * The name is cut short if the entry ends first or if
 * spec[] fills up; spec[] is always NUL-terminated.
 * Return pointer to char after ';' (or to where scanning
 * stopped; never beyond fe).
 */
static char *
getspec(char *f, char *fe)
{
	char *t;
	int c, i;

	t = spec;
	i = sizeof spec;
	/*
	 * Test f against fe before reading so nothing at or past
	 * fe is touched, even when the '&' was the entry's last byte.
	 */
	while(f < fe && --i > 0) {
		c = *f++;
		if(c == ';')
			break;
		*t++ = c;
	}
	*t = 0;
	return f;
}

/*
 * f points just after '<'; fe points at end of entry.
 * Expect next characters from bin to match:
 *  [/][^ >]+( [^>=]+=[^ >]+)*>
 *      tag   auxname auxval
 * Accumulate the tag and its auxiliary information in
 * tag[], auxname[][] and auxval[][].
 * Set tagstarts=1 if the tag is 'starting' (has no '/'), else 0.
 * Set naux to the number of aux pairs found (at most Maxaux;
 * any further pairs reuse the last slot).
 * Scanning stops early if the entry ends or the buffer being
 * filled is full; the buffer being filled is always NUL-terminated.
 * Return pointer to after final '>' (or to where scanning
 * stopped; never beyond fe).
 */
static char *
gettag(char *f, char *fe)
{
	char *t;
	int c, i;

	t = tag;
	i = Buflen;
	naux = 0;
	tagstarts = 1;
	/* a leading '/' marks a closing tag and is not part of the name */
	if(f < fe && *f == '/') {
		tagstarts = 0;
		f++;
	}
	/*
	 * i counts the room left in the buffer t points into; every
	 * character, including the first, goes through this bounded
	 * loop so a buffer can never be overrun and nothing at or
	 * past fe is read.
	 */
	while(f < fe && --i > 0) {
		c = *f++;
		if(c == '>')
			break;
		if(c == ' ') {
			/* end of the tag name or of the previous aux value */
			*t = 0;
			if(naux < Maxaux)
				naux++;
			t = auxname[naux-1];
			i = Buflen;
		} else if(naux && c == '=') {
			/* the value goes in the slot paired with this name */
			*t = 0;
			t = auxval[naux-1];
			i = Buflen;
		} else
			*t++ = c;
	}
	*t = 0;
	return f;
}
/* (Return to Plan 9 Home Page) */