orig/sherlock.c:2,13 c sherlock.c:2,6
< * sherlock.c - written by Loki from Rob Pike's sig and comp programs.
< *
< * This program takes filenames given on the command line,
< * and reads those files into memory, then compares them
< * all pairwise to find those which are most similar.
< *
< * It uses a digital signature generation scheme to randomly
< * discard information, thus allowing a better match.
< * Essentially it hashes up N adjacent 'words' of input,
< * and semi-randomly throws away many of the hashed values
< * so that it become hard to hide the plagiarised text.
< */
---
> * sherlock.c -
> * Originally written by Loki from Rob Pike's
> * sig and comp programs.
> * Ported to Plan 9 by Akshat Kumar.
> */
orig/sherlock.c:15,17 c sherlock.c:8,10
< #include <stdlib.h>
< #include <string.h>
< #include <stdio.h>
---
> #include <u.h>
> #include <libc.h>
> #include <bio.h>
orig/sherlock.c:19 d sherlock.c:11
< char * Progname = "sherlock";
orig/sherlock.c:22,25 c sherlock.c:14,15
< unsigned long zeromask;
< int ntoken = 0;
< char ** token;
< FILE * Outfile;
---
> ulong zeromask;
> char ** token;
orig/sherlock.c:39 c sherlock.c:29
< unsigned long *val;
---
> ulong *val;
orig/sherlock.c:42,43 c sherlock.c:32
< void init_token_array(void);
< Sig * signature(FILE *);
---
> Sig * signature(Biobuf *);
orig/sherlock.c:48,66 c sherlock.c:37,39
< fprintf(stderr, "%s: find similar files\n", Progname);
<
< fprintf(stderr, "usage: %s", Progname);
< fprintf(stderr, " [options] file1 file2 ...\n");
<
< fprintf(stderr, "options:");
< fprintf(stderr, " [-t threshold%%]");
< fprintf(stderr, " [-z zerobits]");
< fprintf(stderr, " [-n chainlength]");
< fprintf(stderr, " [-o outfile]");
< fprintf(stderr, "\n");
<
< fprintf(stderr, "defaults:");
< fprintf(stderr, " threshold=20%%");
< fprintf(stderr, " zerobits=3");
< fprintf(stderr, " chainlength=4");
< fprintf(stderr, " outfile=the screen");
< fprintf(stderr, "\n");
< exit(2);
---
> fprint(2, "usage: %s [-t thresh] [-z zbits] [-n ntoks]"
> " file1 file2 ...\n", argv0);
> exits("usage");
orig/sherlock.c:69 c sherlock.c:42
< int main(int argc, char *argv[])
---
> void main(int argc, char *argv[])
orig/sherlock.c:71,73 c sherlock.c:44,45
< FILE *f;
< int i, j, nfiles, start, percent;
< char *s, *outname;
---
> int f, i, j, percent;
> Biobuf bin;
orig/sherlock.c:74 a sherlock.c:47
> char *err;
orig/sherlock.c:76,83 c sherlock.c:49
< Outfile = stdout;
< outname = NULL;
<
< /* handle options */
< for (start=1; start < argc; start++) {
< if (argv[start][0] != '-')
< break;
< switch (argv[start][1]) {
---
> ARGBEGIN {
orig/sherlock.c:85,90 c sherlock.c:51
< s = argv[++start];
< if (s == NULL)
< usage();
< Thresh = atoi(s);
< if (Thresh < 0 || Thresh > 100)
< usage();
---
> Thresh = atoi(EARGF(usage()));
orig/sherlock.c:93,98 c sherlock.c:54
< s = argv[++start];
< if (s == NULL)
< usage();
< Zerobits = atoi(s);
< if (Zerobits < 0 || Zerobits > 31)
< usage();
---
> Zerobits = atoi(EARGF(usage()));
orig/sherlock.c:101,106 c sherlock.c:57
< s = argv[++start];
< if (s == NULL)
< usage();
< Ntoken = atoi(s);
< if (Ntoken <= 0)
< usage();
---
> Ntoken = atoi(EARGF(usage()));
orig/sherlock.c:108,113 d sherlock.c:58
< case 'o':
< s = argv[++start];
< if (s == NULL)
< usage();
< outname = s;
< break;
orig/sherlock.c:116 c sherlock.c:61,65
< }
---
> } ARGEND;
>
> if (Thresh < 0 || Thresh > 100) {
> fprint(2, "%s: threshold must be between 0 and 100\n", argv0);
> exits("threshold");
orig/sherlock.c:119,120 c sherlock.c:68,78
< nfiles = argc - start;
< if (nfiles < 2)
---
> if (Zerobits < 0 || Zerobits > 31) {
> fprint(2, "%s: zerobits must be between 0 and 31\n", argv0);
> exits("zerobits");
> }
>
> if (Ntoken <= 0) {
> fprint(2, "%s: Ntoken must be greater than 0\n", argv0);
> exits("ntoken");
> }
>
> if (argc < 2)
orig/sherlock.c:123,126 c sherlock.c:81
< /* initialise */
< if (outname != NULL)
< Outfile = fopen(outname, "w");
< init_token_array();
---
> token = mallocz(Ntoken * sizeof(*token), 1);
orig/sherlock.c:128 c sherlock.c:83
< sig = malloc(nfiles * sizeof(Sig *));
---
> sig = mallocz(argc * sizeof(*sig), 1);
orig/sherlock.c:130,138 c sherlock.c:85,94
< /* generate signatures for each file */
< for (i=0; i < nfiles; i++) {
< /* fprintf(stderr, "%s: Reading %s\n", Progname, argv[i+start]); */
< f = fopen(argv[i+start], "r");
< if (f == NULL) {
< fprintf(stderr, "%s: can't open %s:",
< Progname, argv[i+start]);
< perror(NULL);
< continue;
---
> err = nil;
> for (i=0; i < argc; i++) {
> f = open(argv[i], OREAD);
> if (f < 0) {
> fprint(2, "%s: can't open %s: %r\n", argv0, argv[i]);
> err = "open";
> } else {
> Binit(&bin, f, OREAD);
> sig[i] = signature(&bin);
> Bterm(&bin);
orig/sherlock.c:140,141 d sherlock.c:95
< sig[i] = signature(f);
< fclose(f);
orig/sherlock.c:145,146 c sherlock.c:99,102
< for (i=0; i < nfiles; i++)
< for (j=i+1; j < nfiles; j++) {
---
> for (i=0; i < argc; i++) {
> for (j=i+1; j < argc; j++) {
> if (sig[i] == nil || sig[j] == nil)
> continue;
orig/sherlock.c:149,150 c sherlock.c:105,106
< fprintf(Outfile, "%s and %s: %d%%\n",
< argv[i+start], argv[j+start], percent);
---
> print("%s and %s: %d%%\n",
> argv[i], argv[j], percent);
orig/sherlock.c:151 a sherlock.c:108
> }
orig/sherlock.c:153 c sherlock.c:110
< return 0;
---
> exits(err);
orig/sherlock.c:156,162 c sherlock.c:113
< /* read_word: read a 'word' from the input, ignoring leading characters
< which are inside the 'ignore' string, and stopping if one of
< the 'ignore' or 'punct' characters is found.
< Uses memory allocation to avoid buffer overflow problems.
< */
<
< char * read_word(FILE *f, int *length, char *ignore, char *punct)
---
> char * read_word(Biobuf *bin, int *length, char *ignore, char *punct)
orig/sherlock.c:170,176 d sherlock.c:120
< /* check for EOF first */
< if (feof(f)) {
< length = 0;
< return NULL;
< }
<
< /* allocate a buffer to hold the string */
orig/sherlock.c:179,180 c sherlock.c:123,124
< word = malloc(sizeof(char) * max);
< c = & word[pos];
---
> word = malloc(sizeof(*word) * max);
> c = &word[pos];
orig/sherlock.c:182,183 c sherlock.c:126
< /* initialise some defaults */
< if (ignore == NULL)
---
> if (!ignore)
orig/sherlock.c:185 c sherlock.c:128
< if (punct == NULL)
---
> if (!punct)
orig/sherlock.c:189,190 c sherlock.c:132,133
< while ((ch = getc(f)) != EOF) {
< is_ignore = (strchr(ignore, ch) != NULL);
---
> while ((ch = Bgetc(bin)) >= 0) {
> is_ignore = (strchr(ignore, ch) != nil);
orig/sherlock.c:193 d sherlock.c:135
< /* ignorable char found at start, skip it */
orig/sherlock.c:197 d sherlock.c:138
< /* ignorable char found after start, stop */
orig/sherlock.c:199 c sherlock.c:140
< is_punct = (strchr(punct, ch) != NULL);
---
> is_punct = (strchr(punct, ch) != nil);
orig/sherlock.c:201 c sherlock.c:142
< ungetc(ch, f);
---
> Bungetc(bin);
orig/sherlock.c:210 d sherlock.c:150
< /* realloc buffer twice the size */
orig/sherlock.c:221 c sherlock.c:161
< return NULL;
---
> return nil;
orig/sherlock.c:233 c sherlock.c:173
< unsigned long v1, v2;
---
> ulong v1, v2;
orig/sherlock.c:235,236 c sherlock.c:175,176
< v1 = *(unsigned long *) p1;
< v2 = *(unsigned long *) p2;
---
> v1 = *(ulong *) p1;
> v2 = *(ulong *) p2;
orig/sherlock.c:245,246 c sherlock.c:185
< /* hash: hash an array of char* into an unsigned long hash code */
< unsigned long hash(char *tok[])
---
> ulong hash(char *tok[])
orig/sherlock.c:248,249 c sherlock.c:187,188
< unsigned long h;
< unsigned char *s;
---
> ulong h;
> uchar *s;
orig/sherlock.c:254 c sherlock.c:193
< for (s=(unsigned char*)tok[i]; *s; s++)
---
> for (s=(uchar*)tok[i]; *s; s++)
orig/sherlock.c:259 c sherlock.c:198
< void init_token_array(void)
---
> Sig * signature(Biobuf *bin)
orig/sherlock.c:261,270 d sherlock.c:199
< int i;
<
< /* create global array of char* and initialise all to NULL */
< token = malloc(Ntoken * sizeof(char*));
< for (i=0; i < Ntoken; i++)
< token[i] = NULL;
< }
<
< Sig * signature(FILE *f)
< {
orig/sherlock.c:272 c sherlock.c:201
< unsigned long *v, h;
---
> ulong *v, h;
orig/sherlock.c:277,278 c sherlock.c:206
< /* start loading hash values, after we have Ntoken of them */
< v = NULL;
---
> v = nil;
orig/sherlock.c:282 c sherlock.c:210
< while ((str = read_word(f, &i, Ignore, Punct)) != NULL)
---
> while ((str = read_word(bin, &i, Ignore, Punct)) != nil)
orig/sherlock.c:284 d sherlock.c:211
< /* step words down by one */
orig/sherlock.c:288 d sherlock.c:214
< /* add new word into array */
orig/sherlock.c:291 d sherlock.c:216
< /* if we don't yet have enough words in the array continue */
orig/sherlock.c:296 d sherlock.c:220
< /* hash the array of words */
orig/sherlock.c:301 d sherlock.c:224
< /* discard zeros from end of hash value */
orig/sherlock.c:304 d sherlock.c:226
< /* add value into the signature array, resizing if needed */
orig/sherlock.c:307 c sherlock.c:229
< v = realloc(v, na*sizeof(unsigned long));
---
> v = realloc(v, na*sizeof(ulong));
orig/sherlock.c:315 d sherlock.c:236
< /* allocate and return the Sig structure for this file */
orig/sherlock.c:325 c sherlock.c:246
< unsigned long v;
---
> ulong v;
orig/sherlock.c:358,377 d sherlock.c:278
<
< /*
< * Let f1 == filesize(file1) == A+B
< * and f2 == filesize(file2) == A+C
< * where A is the similar section and B or C are dissimilar
< *
< * Similarity = 100 * A / (f1 + f2 - A)
< * = 100 * A / (A+B + A+C - A)
< * = 100 * A / (A+B+C)
< *
< * Thus if A==B==C==n the similarity will be 33% (one third)
< * This is desireable since we are finding the ratio of similarities
< * as a fraction of (similarities+dissimilarities).
< *
< * The other way of doing things would be to find the ratio of
< * the sum of similarities as a fraction of total file size:
< * Similarity = 100 * (A+A) / (A+B + A+C)
< * This produces higher percentages and more false matches.
< */
<
|