/*
* sherlock.c -
* Originally written by Loki from Rob Pike's
* sig and comp programs.
* Ported to Plan 9 by Akshat Kumar.
*/
#include <u.h>
#include <libc.h>
#include <bio.h>
int Ntoken = 3;
int Zerobits = 4;
ulong zeromask;
char ** token;
int Thresh = 20;
/* characters to ignore at start and end of each word */
char * Ignore = " \t\n";
/* characters to treat as word-separators or words on their own */
char * Punct_full = ",.<>/?;:'\"`~[]{}\\|!@#$%^&*()-+_=";
char * Punct = "";
typedef struct Sig Sig;
struct Sig
{
int nval;
ulong *val;
};
Sig * signature(Biobuf *);
int compare(Sig *, Sig *);
void usage(void)
{
fprint(2, "usage: %s [-t thresh] [-z zbits] [-n ntoks]"
" file1 file2 ...\n", argv0);
exits("usage");
}
void main(int argc, char *argv[])
{
int f, i, j, percent;
Biobuf bin;
Sig **sig;
char *err;
ARGBEGIN {
case 't':
Thresh = atoi(EARGF(usage()));
break;
case 'z':
Zerobits = atoi(EARGF(usage()));
break;
case 'n':
Ntoken = atoi(EARGF(usage()));
break;
default:
usage();
} ARGEND;
if (Thresh < 0 || Thresh > 100) {
fprint(2, "%s: threshold must be between 0 and 100\n", argv0);
exits("threshold");
}
if (Zerobits < 0 || Zerobits > 31) {
fprint(2, "%s: zerobits must be between 0 and 31\n", argv0);
exits("zerobits");
}
if (Ntoken <= 0) {
fprint(2, "%s: Ntoken must be greater than 0\n", argv0);
exits("ntoken");
}
if (argc < 2)
usage();
token = mallocz(Ntoken * sizeof(*token), 1);
zeromask = (1<<Zerobits)-1;
sig = mallocz(argc * sizeof(*sig), 1);
err = nil;
for (i=0; i < argc; i++) {
f = open(argv[i], OREAD);
if (f < 0) {
fprint(2, "%s: can't open %s: %r\n", argv0, argv[i]);
err = "open";
} else {
Binit(&bin, f, OREAD);
sig[i] = signature(&bin);
Bterm(&bin);
}
}
/* compare each signature pair-wise */
for (i=0; i < argc; i++) {
for (j=i+1; j < argc; j++) {
if (sig[i] == nil || sig[j] == nil)
continue;
percent = compare(sig[i], sig[j]);
if (percent >= Thresh)
print("%s and %s: %d%%\n",
argv[i], argv[j], percent);
}
}
exits(err);
}
char * read_word(Biobuf *bin, int *length, char *ignore, char *punct)
{
long max;
char *word;
long pos;
char *c;
int ch, is_ignore, is_punct;
pos = 0;
max = 128;
word = malloc(sizeof(*word) * max);
c = &word[pos];
if (!ignore)
ignore = "";
if (!punct)
punct = "";
/* read characters into the buffer, resizing it if necessary */
while ((ch = Bgetc(bin)) >= 0) {
is_ignore = (strchr(ignore, ch) != nil);
if (pos == 0) {
if (is_ignore)
continue;
}
if (is_ignore)
break;
is_punct = (strchr(punct, ch) != nil);
if (is_punct && (pos > 0)) {
Bungetc(bin);
break;
}
*c = ch;
c++;
pos++;
if (is_punct)
break;
if (pos == max) {
max += max;
word = realloc(word, max);
c = & word[pos];
}
}
/* set length and check for EOF condition */
*length = pos;
if (pos == 0) {
free(word);
return nil;
}
/* terminate the string and shrink to smallest required space */
*c = '\0';
word = realloc(word, pos+1);
return word;
}
/* ulcmp: compare *p1 and *p2 */
int ulcmp(const void *p1, const void *p2)
{
ulong v1, v2;
v1 = *(ulong *) p1;
v2 = *(ulong *) p2;
if (v1 < v2)
return -1;
else if (v1 == v2)
return 0;
else
return 1;
}
ulong hash(char *tok[])
{
ulong h;
uchar *s;
int i;
h = 0;
for (i=0; i < Ntoken; i++)
for (s=(uchar*)tok[i]; *s; s++)
h = h*31 + *s;
return h;
}
Sig * signature(Biobuf *bin)
{
int nv, na;
ulong *v, h;
char *str;
int i, ntoken;
Sig *sig;
v = nil;
na = 0;
nv = 0;
ntoken = 0;
while ((str = read_word(bin, &i, Ignore, Punct)) != nil)
{
free(token[0]);
for (i=0; i < Ntoken-1; i++)
token[i] = token[i+1];
token[Ntoken-1] = str;
ntoken++;
if (ntoken < Ntoken)
continue;
h = hash(token);
if ((h & zeromask) != 0)
continue;
h = h >> Zerobits;
if (nv == na) {
na += 100;
v = realloc(v, na*sizeof(ulong));
}
v[nv++] = h;
}
/* sort the array of hash values for speed */
qsort(v, nv, sizeof(v[0]), ulcmp);
sig = malloc(sizeof(Sig));
sig->nval = nv;
sig->val = v;
return sig;
}
int compare(Sig *s0, Sig *s1)
{
int i0, i1, nboth, nsimilar;
ulong v;
i0 = 0;
i1 = 0;
nboth = 0;
while (i0 < s0->nval && i1 < s1->nval) {
if (s0->val[i0] == s1->val[i1]) {
v = s0->val[i0];
while (i0 < s0->nval && v == s0->val[i0]) {
i0++;
nboth++;
}
while (i1 < s1->nval && v == s1->val[i1]) {
i1++;
nboth++;
}
continue;
}
if (s0->val[i0] < s1->val[i1])
i0++;
else
i1++;
}
if (s0->nval + s1->nval == 0)
return 0; /* ignore if both are empty files */
if (s0->nval + s1->nval == nboth)
return 100; /* perfect match if all hash codes match */
nsimilar = nboth / 2;
return 100 * nsimilar / (s0->nval + s1->nval - nsimilar);
}
|