19b50d902SRodney W. Grimes /*
2389844c0SBaptiste Daroussin  * SPDX-License-Identifier: BSD-3-Clause
3df57947fSPedro F. Giffuni  *
4834a8fa1SWolfram Schneider  * Copyright (c) 1995-2022 Wolfram Schneider <[email protected]>
59b50d902SRodney W. Grimes  * Copyright (c) 1989, 1993
69b50d902SRodney W. Grimes  *	The Regents of the University of California.  All rights reserved.
79b50d902SRodney W. Grimes  *
89b50d902SRodney W. Grimes  * This code is derived from software contributed to Berkeley by
99b50d902SRodney W. Grimes  * James A. Woods.
109b50d902SRodney W. Grimes  *
119b50d902SRodney W. Grimes  * Redistribution and use in source and binary forms, with or without
129b50d902SRodney W. Grimes  * modification, are permitted provided that the following conditions
139b50d902SRodney W. Grimes  * are met:
149b50d902SRodney W. Grimes  * 1. Redistributions of source code must retain the above copyright
159b50d902SRodney W. Grimes  *    notice, this list of conditions and the following disclaimer.
169b50d902SRodney W. Grimes  * 2. Redistributions in binary form must reproduce the above copyright
179b50d902SRodney W. Grimes  *    notice, this list of conditions and the following disclaimer in the
189b50d902SRodney W. Grimes  *    documentation and/or other materials provided with the distribution.
19389844c0SBaptiste Daroussin  * 3. Neither the name of the University nor the names of its contributors
209b50d902SRodney W. Grimes  *    may be used to endorse or promote products derived from this software
219b50d902SRodney W. Grimes  *    without specific prior written permission.
229b50d902SRodney W. Grimes  *
239b50d902SRodney W. Grimes  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
249b50d902SRodney W. Grimes  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
259b50d902SRodney W. Grimes  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
269b50d902SRodney W. Grimes  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
279b50d902SRodney W. Grimes  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
289b50d902SRodney W. Grimes  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
299b50d902SRodney W. Grimes  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
309b50d902SRodney W. Grimes  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
319b50d902SRodney W. Grimes  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
329b50d902SRodney W. Grimes  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
339b50d902SRodney W. Grimes  * SUCH DAMAGE.
349b50d902SRodney W. Grimes  */
359b50d902SRodney W. Grimes 
3631fa4102SGavin Atkinson #if 0
379b50d902SRodney W. Grimes #ifndef lint
389b50d902SRodney W. Grimes static char copyright[] =
399b50d902SRodney W. Grimes "@(#) Copyright (c) 1989, 1993\n\
409b50d902SRodney W. Grimes 	The Regents of the University of California.  All rights reserved.\n";
419b50d902SRodney W. Grimes #endif /* not lint */
429b50d902SRodney W. Grimes 
439b50d902SRodney W. Grimes #ifndef lint
449b50d902SRodney W. Grimes static char sccsid[] = "@(#)locate.code.c	8.1 (Berkeley) 6/6/93";
459b50d902SRodney W. Grimes #endif /* not lint */
4631fa4102SGavin Atkinson #endif
479b50d902SRodney W. Grimes 
489b50d902SRodney W. Grimes /*
499b50d902SRodney W. Grimes  * PURPOSE:	sorted list compressor (works with a modified 'find'
509b50d902SRodney W. Grimes  *		to encode/decode a filename database)
519b50d902SRodney W. Grimes  *
529b50d902SRodney W. Grimes  * USAGE:	bigram < list > bigrams
539b50d902SRodney W. Grimes  *		process bigrams (see updatedb) > common_bigrams
549b50d902SRodney W. Grimes  *		code common_bigrams < list > squozen_list
559b50d902SRodney W. Grimes  *
569b50d902SRodney W. Grimes  * METHOD:	Uses 'front compression' (see ";login:", Volume 8, Number 1
579b50d902SRodney W. Grimes  *		February/March 1983, p. 8).  Output format is, per line, an
589b50d902SRodney W. Grimes  *		offset differential count byte followed by a partially bigram-
599b50d902SRodney W. Grimes  *		encoded ascii residue.  A bigram is a two-character sequence,
609b50d902SRodney W. Grimes  *		the first 128 most common of which are encoded in one byte.
619b50d902SRodney W. Grimes  *
629b50d902SRodney W. Grimes  * EXAMPLE:	For simple front compression with no bigram encoding,
639b50d902SRodney W. Grimes  *		if the input is...		then the output is...
649b50d902SRodney W. Grimes  *
659b50d902SRodney W. Grimes  *		/usr/src			 0 /usr/src
669b50d902SRodney W. Grimes  *		/usr/src/cmd/aardvark.c		 8 /cmd/aardvark.c
679b50d902SRodney W. Grimes  *		/usr/src/cmd/armadillo.c	14 armadillo.c
689b50d902SRodney W. Grimes  *		/usr/tmp/zoo			 5 tmp/zoo
699b50d902SRodney W. Grimes  *
709b50d902SRodney W. Grimes  *	The codes are:
719b50d902SRodney W. Grimes  *
729b50d902SRodney W. Grimes  *	0-28	likeliest differential counts + offset to make nonnegative
739b50d902SRodney W. Grimes  *	30	switch code for out-of-range count to follow in next word
74139764e8SWolfram Schneider  *      31      an 8 bit char followed
759b50d902SRodney W. Grimes  *	128-255 bigram codes (128 most common, as determined by 'updatedb')
769b50d902SRodney W. Grimes  *	32-127  single character (printable) ascii residue (ie, literal)
779b50d902SRodney W. Grimes  *
78139764e8SWolfram Schneider  * The locate database store any character except newline ('\n')
79139764e8SWolfram Schneider  * and NUL ('\0'). The 8-bit character support don't wast extra
80139764e8SWolfram Schneider  * space until you have characters in file names less than 32
81139764e8SWolfram Schneider  * or greather than 127.
82139764e8SWolfram Schneider  *
83139764e8SWolfram Schneider  *
84139764e8SWolfram Schneider  * SEE ALSO:	updatedb.sh, ../bigram/locate.bigram.c
859b50d902SRodney W. Grimes  *
869b50d902SRodney W. Grimes  * AUTHOR:	James A. Woods, Informatics General Corp.,
879b50d902SRodney W. Grimes  *		NASA Ames Research Center, 10/82
88139764e8SWolfram Schneider  *              8-bit file names characters:
89139764e8SWolfram Schneider  *              	Wolfram Schneider, Berlin September 1996
909b50d902SRodney W. Grimes  */
919b50d902SRodney W. Grimes 
929b50d902SRodney W. Grimes #include <sys/param.h>
939b50d902SRodney W. Grimes #include <err.h>
949b50d902SRodney W. Grimes #include <errno.h>
959b50d902SRodney W. Grimes #include <stdlib.h>
969b50d902SRodney W. Grimes #include <string.h>
979b50d902SRodney W. Grimes #include <stdio.h>
9862f882d6SWarner Losh #include <unistd.h>
999b50d902SRodney W. Grimes #include "locate.h"
1009b50d902SRodney W. Grimes 
1019b50d902SRodney W. Grimes #define	BGBUFSIZE	(NBG * 2)	/* size of bigram buffer */
1029b50d902SRodney W. Grimes 
103834a8fa1SWolfram Schneider u_char buf1[LOCATE_PATH_MAX] = " ";
104834a8fa1SWolfram Schneider u_char buf2[LOCATE_PATH_MAX];
105139764e8SWolfram Schneider u_char bigrams[BGBUFSIZE + 1] = { 0 };
1069b50d902SRodney W. Grimes 
107d43255b5SWolfram Schneider /* use a lookup array instead a function, 3x faster than linear search */
108*a6c20dddSWolfram Schneider int big [UCHAR_MAX + 1][UCHAR_MAX + 1];
109139764e8SWolfram Schneider #define BGINDEX(x) (big[(u_char)*x][(u_char)*(x + 1)])
1107ec1929dSWolfram Schneider 
111f1bb2cd2SWarner Losh void   usage(void);
1129b50d902SRodney W. Grimes 
1139b50d902SRodney W. Grimes int
main(int argc,char * argv[])11431fa4102SGavin Atkinson main(int argc, char *argv[])
1159b50d902SRodney W. Grimes {
11631fa4102SGavin Atkinson 	u_char *cp, *oldpath, *path;
1179b50d902SRodney W. Grimes 	int ch, code, count, diffcount, oldcount;
11831fa4102SGavin Atkinson 	u_int i, j;
1199b50d902SRodney W. Grimes 	FILE *fp;
1209b50d902SRodney W. Grimes 
1211c8af878SWarner Losh 	while ((ch = getopt(argc, argv, "")) != -1)
1229b50d902SRodney W. Grimes 		switch(ch) {
1239b50d902SRodney W. Grimes 		default:
1249b50d902SRodney W. Grimes 			usage();
1259b50d902SRodney W. Grimes 		}
1269b50d902SRodney W. Grimes 	argc -= optind;
1279b50d902SRodney W. Grimes 	argv += optind;
1289b50d902SRodney W. Grimes 
1299b50d902SRodney W. Grimes 	if (argc != 1)
1309b50d902SRodney W. Grimes 		usage();
1319b50d902SRodney W. Grimes 
1329b50d902SRodney W. Grimes 	if ((fp = fopen(argv[0], "r")) == NULL)
1339b50d902SRodney W. Grimes 		err(1, "%s", argv[0]);
1349b50d902SRodney W. Grimes 
1359b50d902SRodney W. Grimes 	/* First copy bigram array to stdout. */
1368e7c0a6dSWolfram Schneider 	if (fgets(bigrams, BGBUFSIZE + 1, fp) == NULL) {
1378e7c0a6dSWolfram Schneider 		if (!feof(fp) || ferror(fp))
13872a0982cSWolfram Schneider 			err(1, "get bigram array");
1398e7c0a6dSWolfram Schneider 	}
1407ec1929dSWolfram Schneider 
1419b50d902SRodney W. Grimes 	if (fwrite(bigrams, 1, BGBUFSIZE, stdout) != BGBUFSIZE)
1429b50d902SRodney W. Grimes 		err(1, "stdout");
1439b50d902SRodney W. Grimes 	(void)fclose(fp);
1449b50d902SRodney W. Grimes 
14537002181SWolfram Schneider 	/* init lookup table */
146139764e8SWolfram Schneider 	for (i = 0; i < UCHAR_MAX + 1; i++)
147139764e8SWolfram Schneider 	    	for (j = 0; j < UCHAR_MAX + 1; j++)
148*a6c20dddSWolfram Schneider 			big[i][j] = -1;
14937002181SWolfram Schneider 
1501a1ee31fSWolfram Schneider 	for (cp = bigrams, i = 0; *cp != '\0'; i += 2, cp += 2)
151*a6c20dddSWolfram Schneider 	        big[(u_char)*cp][(u_char)*(cp + 1)] = i;
152139764e8SWolfram Schneider 
1539b50d902SRodney W. Grimes 	oldpath = buf1;
1549b50d902SRodney W. Grimes 	path = buf2;
1559b50d902SRodney W. Grimes 	oldcount = 0;
1567ec1929dSWolfram Schneider 
15737002181SWolfram Schneider 	while (fgets(path, sizeof(buf2), stdin) != NULL) {
15837002181SWolfram Schneider 
15937002181SWolfram Schneider 		/* skip empty lines */
16037002181SWolfram Schneider 		if (*path == '\n')
16137002181SWolfram Schneider 			continue;
1629b50d902SRodney W. Grimes 
163139764e8SWolfram Schneider 		/* remove newline */
1641a1ee31fSWolfram Schneider 		for (cp = path; *cp != '\0'; cp++) {
16537002181SWolfram Schneider 			/* chop newline */
16637002181SWolfram Schneider 			if (*cp == '\n')
1671a1ee31fSWolfram Schneider 				*cp = '\0';
1689b50d902SRodney W. Grimes 		}
1699b50d902SRodney W. Grimes 
1709b50d902SRodney W. Grimes 		/* Skip longest common prefix. */
171139764e8SWolfram Schneider 		for (cp = path; *cp == *oldpath; cp++, oldpath++)
172139764e8SWolfram Schneider 			if (*cp == '\0')
173139764e8SWolfram Schneider 				break;
17437002181SWolfram Schneider 
1759b50d902SRodney W. Grimes 		count = cp - path;
1769b50d902SRodney W. Grimes 		diffcount = count - oldcount + OFFSET;
1779b50d902SRodney W. Grimes 		oldcount = count;
1789b50d902SRodney W. Grimes 		if (diffcount < 0 || diffcount > 2 * OFFSET) {
1799b50d902SRodney W. Grimes 			if (putchar(SWITCH) == EOF ||
1809b50d902SRodney W. Grimes 			    putw(diffcount, stdout) == EOF)
1819b50d902SRodney W. Grimes 				err(1, "stdout");
1829b50d902SRodney W. Grimes 		} else
1839b50d902SRodney W. Grimes 			if (putchar(diffcount) == EOF)
1849b50d902SRodney W. Grimes 				err(1, "stdout");
1859b50d902SRodney W. Grimes 
1861a1ee31fSWolfram Schneider 		while (*cp != '\0') {
187139764e8SWolfram Schneider 			/* print *two* characters */
188139764e8SWolfram Schneider 
189*a6c20dddSWolfram Schneider 			if ((code = BGINDEX(cp)) != -1) {
190139764e8SWolfram Schneider 				/*
191139764e8SWolfram Schneider 				 * print *one* as bigram
192139764e8SWolfram Schneider 				 * Found, so mark byte with
193139764e8SWolfram Schneider 				 *  parity bit.
194139764e8SWolfram Schneider 				 */
1959b50d902SRodney W. Grimes 				if (putchar((code / 2) | PARITY) == EOF)
1969b50d902SRodney W. Grimes 					err(1, "stdout");
1979b50d902SRodney W. Grimes 				cp += 2;
1989b50d902SRodney W. Grimes 			}
199139764e8SWolfram Schneider 
200139764e8SWolfram Schneider 			else {
201139764e8SWolfram Schneider 				for (i = 0; i < 2; i++) {
202139764e8SWolfram Schneider 					if (*cp == '\0')
203139764e8SWolfram Schneider 						break;
204139764e8SWolfram Schneider 
205139764e8SWolfram Schneider 					/* print umlauts in file names */
206139764e8SWolfram Schneider 					if (*cp < ASCII_MIN ||
207139764e8SWolfram Schneider 					    *cp > ASCII_MAX) {
208139764e8SWolfram Schneider 						if (putchar(UMLAUT) == EOF ||
209139764e8SWolfram Schneider 						    putchar(*cp++) == EOF)
210139764e8SWolfram Schneider 							err(1, "stdout");
2119b50d902SRodney W. Grimes 					}
212139764e8SWolfram Schneider 
213139764e8SWolfram Schneider 					else {
214139764e8SWolfram Schneider 						/* normal character */
215139764e8SWolfram Schneider 						if(putchar(*cp++) == EOF)
216139764e8SWolfram Schneider 							err(1, "stdout");
217139764e8SWolfram Schneider 					}
218139764e8SWolfram Schneider 				}
219139764e8SWolfram Schneider 
220139764e8SWolfram Schneider 			}
221139764e8SWolfram Schneider 		}
222139764e8SWolfram Schneider 
2239b50d902SRodney W. Grimes 		if (path == buf1) {		/* swap pointers */
2249b50d902SRodney W. Grimes 			path = buf2;
2259b50d902SRodney W. Grimes 			oldpath = buf1;
2269b50d902SRodney W. Grimes 		} else {
2279b50d902SRodney W. Grimes 			path = buf1;
2289b50d902SRodney W. Grimes 			oldpath = buf2;
2299b50d902SRodney W. Grimes 		}
2309b50d902SRodney W. Grimes 	}
23172a0982cSWolfram Schneider 
2329b50d902SRodney W. Grimes 	/* Non-zero status if there were errors */
2339b50d902SRodney W. Grimes 	if (fflush(stdout) != 0 || ferror(stdout))
23472a0982cSWolfram Schneider 		errx(1, "stdout");
23572a0982cSWolfram Schneider 
2369b50d902SRodney W. Grimes 	exit(0);
2379b50d902SRodney W. Grimes }
2389b50d902SRodney W. Grimes 
2399b50d902SRodney W. Grimes void
usage(void)24031fa4102SGavin Atkinson usage(void)
2419b50d902SRodney W. Grimes {
2429b50d902SRodney W. Grimes 	(void)fprintf(stderr,
2439b50d902SRodney W. Grimes 	    "usage: locate.code common_bigrams < list > squozen_list\n");
2449b50d902SRodney W. Grimes 	exit(1);
2459b50d902SRodney W. Grimes }
246