# when raw index has a lot of entries like
# 1578324 problematico, a, ci, che
# apply this algorithm:
# treat things after comma as suffixes
# for each suffix:
# if single letter, replace last letter
# else search backwards for beginning of suffix
# and if it leads to an old suffix of approximately
# the same length, replace that suffix
# This will still leave some commas to fix by hand
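# Usage: awk -F'\t' -f comfix.awk rawindex > newrawindex
# (fields must be tab-separated: with a space separator the entry
# "problematico, a, ci, che" would be split into several fields and
# the NF == 2 rule would never see the comma list)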
# Pass-through rule: any record that does not have exactly two fields,
# or whose second field contains no comma, is copied unchanged.
NF != 2 || index($2, ",") == 0 {
print
}

# Expansion rule: the second field is "word, suf1, suf2, ...".
# Emit the base word on its own line, then one line per suffix, each
# keyed by the same first field and separated from it by a tab.
NF == 2 && index($2, ",") > 0 {
# A comma plus any run of following spaces separates the pieces.
count = split($2, piece, /,[ ]*/)
base = piece[1]
printf "%s\t%s\n", $1, base

for (j = 2; j <= count; j++) {
ending = piece[j]
# Number of trailing characters of the base word to replace,
# or 0 when no plausible old suffix was found.
cut = matchsuflen(base, ending)
if (!cut) {
# Could not resolve: keep "word, suffix" for fixing by hand.
printf "%s\t%s\n", $1, base ", " ending
continue
}
expanded = substr(base, 1, length(base) - cut) ending
printf "%s\t%s\n", $1, expanded
}
}
# matchsuflen(w, suf)
#   Decide how many trailing characters of word w should be replaced
#   by the suffix suf.
#
#   Returns:
#     1   when suf is a single character (replace the last letter);
#     k   the length of the tail of w that starts at the last
#         occurrence of suf's first character, provided that length
#         differs from length(suf) by at most 3;
#     0   when suf's first character does not occur in w, or the tail
#         length differs from length(suf) by more than 3.
#
#   The names after the extra spaces are local variables.
function matchsuflen(w, suf,    first, pos, tail, gap)
{
# One-letter suffix: always swap just the final letter.
if (length(suf) == 1)
return 1

# Scan from the end of the word towards its start, looking for the
# character that begins the new suffix.
first = substr(suf, 1, 1)
for (pos = length(w); pos >= 1; pos--)
if (substr(w, pos, 1) == first)
break

# Ran off the front of the word: the character is not present.
if (pos < 1)
return 0

# Length of the candidate old suffix (from pos to the end of w).
tail = length(w) - pos + 1

# Accept only if old and new suffix are of roughly the same length.
gap = tail - length(suf)
if (gap > 3 || gap < -3)
return 0

return tail
}
|