Plan 9 from Bell Labs: /usr/web/sources/contrib/akumar/cmd/sherlock/diff

Copyright © 2021 Plan 9 Foundation.
Distributed under the MIT License.
Download the Plan 9 distribution.


orig/sherlock.c:2,13 c sherlock.c:2,6
<  *  sherlock.c - written by Loki from Rob Pike's sig and comp programs.
<  *
<  *  This program takes filenames given on the command line,
<  *  and reads those files into memory, then compares them
<  *  all pairwise to find those which are most similar.
<  *
<  *  It uses a digital signature generation scheme to randomly
<  *  discard information, thus allowing a better match.
<  *  Essentially it hashes up N adjacent 'words' of input,
<  *  and semi-randomly throws away many of the hashed values
<  *  so that it become hard to hide the plagiarised text.
<  */
---
>   * sherlock.c - 
>   *	Originally written by Loki from Rob Pike's
>   *	sig and comp programs. 
>   *	Ported to Plan 9 by Akshat Kumar.
>   */
orig/sherlock.c:15,17 c sherlock.c:8,10
< #include <stdlib.h>
< #include <string.h>
< #include <stdio.h>
---
> #include <u.h>
> #include <libc.h>
> #include <bio.h>
orig/sherlock.c:19 d sherlock.c:11
< char *		Progname = "sherlock";
orig/sherlock.c:22,25 c sherlock.c:14,15
< unsigned long	zeromask;
< int		ntoken = 0;
< char **		token;
< FILE *		Outfile;
---
> ulong	zeromask;
> char **	token;
orig/sherlock.c:39 c sherlock.c:29
< 	unsigned long	*val;
---
> 	ulong	*val;
orig/sherlock.c:42,43 c sherlock.c:32
< void	init_token_array(void);
< Sig *	signature(FILE *);
---
> Sig *	signature(Biobuf *);
orig/sherlock.c:48,66 c sherlock.c:37,39
< 	fprintf(stderr, "%s: find similar files\n", Progname);
< 
< 	fprintf(stderr, "usage: %s", Progname);
< 	fprintf(stderr, " [options] file1 file2 ...\n");
< 
< 	fprintf(stderr, "options:");
< 	fprintf(stderr, " [-t threshold%%]");
< 	fprintf(stderr, " [-z zerobits]");
< 	fprintf(stderr, " [-n chainlength]");
< 	fprintf(stderr, " [-o outfile]");
< 	fprintf(stderr, "\n");
< 
< 	fprintf(stderr, "defaults:");
< 	fprintf(stderr, " threshold=20%%");
< 	fprintf(stderr, " zerobits=3");
< 	fprintf(stderr, " chainlength=4");
< 	fprintf(stderr, " outfile=the screen");
< 	fprintf(stderr, "\n");
< 	exit(2);
---
> 	fprint(2, "usage: %s [-t thresh] [-z zbits] [-n ntoks]"
> 		" file1 file2 ...\n", argv0);
> 	exits("usage");
orig/sherlock.c:69 c sherlock.c:42
< int main(int argc, char *argv[])
---
> void main(int argc, char *argv[])
orig/sherlock.c:71,73 c sherlock.c:44,45
< 	FILE *f;
< 	int i, j, nfiles, start, percent;
< 	char *s, *outname;
---
> 	int f, i, j, percent;
> 	Biobuf bin;
orig/sherlock.c:74 a sherlock.c:47
> 	char *err;
orig/sherlock.c:76,83 c sherlock.c:49
< 	Outfile = stdout;
< 	outname = NULL;
< 
< 	/* handle options */
< 	for (start=1; start < argc; start++) {
< 		if (argv[start][0] != '-')
< 			break;
< 		switch (argv[start][1]) {
---
> 	ARGBEGIN {
orig/sherlock.c:85,90 c sherlock.c:51
< 			s = argv[++start];
< 			if (s == NULL)
< 				usage();
< 			Thresh = atoi(s);
< 			if (Thresh < 0 || Thresh > 100)
< 				usage();
---
> 			Thresh = atoi(EARGF(usage()));
orig/sherlock.c:93,98 c sherlock.c:54
< 			s = argv[++start];
< 			if (s == NULL)
< 				usage();
< 			Zerobits = atoi(s);
< 			if (Zerobits < 0 || Zerobits > 31)
< 				usage();
---
> 			Zerobits = atoi(EARGF(usage()));
orig/sherlock.c:101,106 c sherlock.c:57
< 			s = argv[++start];
< 			if (s == NULL)
< 				usage();
< 			Ntoken = atoi(s);
< 			if (Ntoken <= 0)
< 				usage();
---
> 			Ntoken = atoi(EARGF(usage()));
orig/sherlock.c:108,113 d sherlock.c:58
< 		case 'o':
< 			s = argv[++start];
< 			if (s == NULL)
< 				usage();
< 			outname = s;
< 			break;
orig/sherlock.c:116 c sherlock.c:61,65
< 		}
---
> 	} ARGEND;
> 
> 	if (Thresh < 0 || Thresh > 100) {
> 		fprint(2, "%s: threshold must be between 0 and 100\n", argv0);
> 		exits("threshold");
orig/sherlock.c:119,120 c sherlock.c:68,78
< 	nfiles = argc - start;
< 	if (nfiles < 2)
---
> 	if (Zerobits < 0 || Zerobits > 31) {
> 		fprint(2, "%s: zerobits must be between 0 and 31\n", argv0);
> 		exits("zerobits");
> 	}
> 
> 	if (Ntoken <= 0) {
> 		fprint(2, "%s: Ntoken must be greater than 0\n", argv0);
> 		exits("ntoken");
> 	}
> 
> 	if (argc < 2)
orig/sherlock.c:123,126 c sherlock.c:81
< 	/* initialise */
< 	if (outname != NULL)
< 		Outfile = fopen(outname, "w");
< 	init_token_array();
---
> 	token = mallocz(Ntoken * sizeof(*token), 1);
orig/sherlock.c:128 c sherlock.c:83
< 	sig = malloc(nfiles * sizeof(Sig *));
---
> 	sig = mallocz(argc * sizeof(*sig), 1);
orig/sherlock.c:130,138 c sherlock.c:85,94
< 	/* generate signatures for each file */
< 	for (i=0; i < nfiles; i++) {
< 		/* fprintf(stderr, "%s: Reading %s\n", Progname, argv[i+start]); */
< 		f = fopen(argv[i+start], "r");
< 		if (f == NULL) {
< 			fprintf(stderr, "%s: can't open %s:",
< 				Progname, argv[i+start]);
< 			perror(NULL);
< 			continue;
---
> 	err = nil;
> 	for (i=0; i < argc; i++) {
> 		f = open(argv[i], OREAD);
> 		if (f < 0) {
> 			fprint(2, "%s: can't open %s: %r\n", argv0, argv[i]);
> 			err = "open";
> 		} else {
> 			Binit(&bin, f, OREAD);
> 			sig[i] = signature(&bin);
> 			Bterm(&bin);
orig/sherlock.c:140,141 d sherlock.c:95
< 		sig[i] = signature(f);
< 		fclose(f);
orig/sherlock.c:145,146 c sherlock.c:99,102
< 	for (i=0; i < nfiles; i++)
< 		for (j=i+1; j < nfiles; j++) {
---
> 	for (i=0; i < argc; i++) {
> 		for (j=i+1; j < argc; j++) {
> 			if (sig[i] == nil || sig[j] == nil)
> 				continue;
orig/sherlock.c:149,150 c sherlock.c:105,106
< 				fprintf(Outfile, "%s and %s: %d%%\n",
< 					argv[i+start], argv[j+start], percent);
---
> 				print("%s and %s: %d%%\n",
> 					argv[i], argv[j], percent);
orig/sherlock.c:151 a sherlock.c:108
> 	}
orig/sherlock.c:153 c sherlock.c:110
< 	return 0;
---
> 	exits(err);
orig/sherlock.c:156,162 c sherlock.c:113
< /* read_word: read a 'word' from the input, ignoring leading characters
<    which are inside the 'ignore' string, and stopping if one of
<    the 'ignore' or 'punct' characters is found.
<    Uses memory allocation to avoid buffer overflow problems.
< */
< 
< char * read_word(FILE *f, int *length, char *ignore, char *punct)
---
> char * read_word(Biobuf *bin, int *length, char *ignore, char *punct)
orig/sherlock.c:170,176 d sherlock.c:120
<         /* check for EOF first */
<         if (feof(f)) {
<                 length = 0;
<                 return NULL;
<         }
< 
<         /* allocate a buffer to hold the string */
orig/sherlock.c:179,180 c sherlock.c:123,124
<         word = malloc(sizeof(char) * max);
<         c = & word[pos];
---
>         word = malloc(sizeof(*word) * max);
>         c = &word[pos];
orig/sherlock.c:182,183 c sherlock.c:126
< 	/* initialise some defaults */
< 	if (ignore == NULL)
---
> 	if (!ignore)
orig/sherlock.c:185 c sherlock.c:128
< 	if (punct == NULL)
---
> 	if (!punct)
orig/sherlock.c:189,190 c sherlock.c:132,133
<         while ((ch = getc(f)) != EOF) {
< 		is_ignore = (strchr(ignore, ch) != NULL);
---
>         while ((ch = Bgetc(bin)) >= 0) {
> 		is_ignore = (strchr(ignore, ch) != nil);
orig/sherlock.c:193 d sherlock.c:135
< 				/* ignorable char found at start, skip it */
orig/sherlock.c:197 d sherlock.c:138
< 			/* ignorable char found after start, stop */
orig/sherlock.c:199 c sherlock.c:140
< 		is_punct = (strchr(punct, ch) != NULL);
---
> 		is_punct = (strchr(punct, ch) != nil);
orig/sherlock.c:201 c sherlock.c:142
< 			ungetc(ch, f);
---
> 			Bungetc(bin);
orig/sherlock.c:210 d sherlock.c:150
<                         /* realloc buffer twice the size */
orig/sherlock.c:221 c sherlock.c:161
<                 return NULL;
---
>                 return nil;
orig/sherlock.c:233 c sherlock.c:173
< 	unsigned long v1, v2;
---
> 	ulong v1, v2;
orig/sherlock.c:235,236 c sherlock.c:175,176
< 	v1 = *(unsigned long *) p1;
< 	v2 = *(unsigned long *) p2;
---
> 	v1 = *(ulong *) p1;
> 	v2 = *(ulong *) p2;
orig/sherlock.c:245,246 c sherlock.c:185
< /* hash:  hash an array of char* into an unsigned long hash code */
< unsigned long hash(char *tok[])
---
> ulong hash(char *tok[])
orig/sherlock.c:248,249 c sherlock.c:187,188
< 	unsigned long h;
< 	unsigned char *s;
---
> 	ulong h;
> 	uchar *s;
orig/sherlock.c:254 c sherlock.c:193
< 		for (s=(unsigned char*)tok[i]; *s; s++)
---
> 		for (s=(uchar*)tok[i]; *s; s++)
orig/sherlock.c:259 c sherlock.c:198
< void init_token_array(void)
---
> Sig * signature(Biobuf *bin)
orig/sherlock.c:261,270 d sherlock.c:199
< 	int i;
< 
< 	/* create global array of char* and initialise all to NULL */
< 	token = malloc(Ntoken * sizeof(char*));
< 	for (i=0; i < Ntoken; i++)
< 		token[i] = NULL;
< }
< 
< Sig * signature(FILE *f)
< {
orig/sherlock.c:272 c sherlock.c:201
< 	unsigned long *v, h;
---
> 	ulong *v, h;
orig/sherlock.c:277,278 c sherlock.c:206
< 	/* start loading hash values, after we have Ntoken of them */
< 	v = NULL;
---
> 	v = nil;
orig/sherlock.c:282 c sherlock.c:210
< 	while ((str = read_word(f, &i, Ignore, Punct)) != NULL)
---
> 	while ((str = read_word(bin, &i, Ignore, Punct)) != nil)
orig/sherlock.c:284 d sherlock.c:211
< 		/* step words down by one */
orig/sherlock.c:288 d sherlock.c:214
< 		/* add new word into array */
orig/sherlock.c:291 d sherlock.c:216
< 		/* if we don't yet have enough words in the array continue */
orig/sherlock.c:296 d sherlock.c:220
< 		/* hash the array of words */
orig/sherlock.c:301 d sherlock.c:224
< 		/* discard zeros from end of hash value */
orig/sherlock.c:304 d sherlock.c:226
< 		/* add value into the signature array, resizing if needed */
orig/sherlock.c:307 c sherlock.c:229
< 			v = realloc(v, na*sizeof(unsigned long));
---
> 			v = realloc(v, na*sizeof(ulong));
orig/sherlock.c:315 d sherlock.c:236
< 	/* allocate and return the Sig structure for this file */
orig/sherlock.c:325 c sherlock.c:246
< 	unsigned long v;
---
> 	ulong v;
orig/sherlock.c:358,377 d sherlock.c:278
< 
< /*
<  *  Let f1 == filesize(file1) == A+B
<  *  and f2 == filesize(file2) == A+C
<  *  where A is the similar section and B or C are dissimilar
<  *
<  *  Similarity = 100 * A / (f1 + f2 - A)
<  *             = 100 * A / (A+B + A+C - A)
<  *             = 100 * A / (A+B+C)
<  *
<  *  Thus if A==B==C==n the similarity will be 33% (one third)
<  *  This is desireable since we are finding the ratio of similarities
<  *  as a fraction of (similarities+dissimilarities).
<  *
<  *  The other way of doing things would be to find the ratio of
<  *  the sum of similarities as a fraction of total file size:
<  *  Similarity = 100 * (A+A) / (A+B + A+C)
<  *  This produces higher percentages and more false matches.
<  */
< 

Bell Labs · OSI certified · Powered by Plan 9

(Return to Plan 9 Home Page)

Copyright © 2021 Plan 9 Foundation. All Rights Reserved.
Comments to [email protected].