19b50d902SRodney W. Grimes /*
2389844c0SBaptiste Daroussin * SPDX-License-Identifier: BSD-3-Clause
3df57947fSPedro F. Giffuni *
4834a8fa1SWolfram Schneider * Copyright (c) 1995-2022 Wolfram Schneider <[email protected]>
59b50d902SRodney W. Grimes * Copyright (c) 1989, 1993
69b50d902SRodney W. Grimes * The Regents of the University of California. All rights reserved.
79b50d902SRodney W. Grimes *
89b50d902SRodney W. Grimes * This code is derived from software contributed to Berkeley by
99b50d902SRodney W. Grimes * James A. Woods.
109b50d902SRodney W. Grimes *
119b50d902SRodney W. Grimes * Redistribution and use in source and binary forms, with or without
129b50d902SRodney W. Grimes * modification, are permitted provided that the following conditions
139b50d902SRodney W. Grimes * are met:
149b50d902SRodney W. Grimes * 1. Redistributions of source code must retain the above copyright
159b50d902SRodney W. Grimes * notice, this list of conditions and the following disclaimer.
169b50d902SRodney W. Grimes * 2. Redistributions in binary form must reproduce the above copyright
179b50d902SRodney W. Grimes * notice, this list of conditions and the following disclaimer in the
189b50d902SRodney W. Grimes * documentation and/or other materials provided with the distribution.
19389844c0SBaptiste Daroussin * 3. Neither the name of the University nor the names of its contributors
209b50d902SRodney W. Grimes * may be used to endorse or promote products derived from this software
219b50d902SRodney W. Grimes * without specific prior written permission.
229b50d902SRodney W. Grimes *
239b50d902SRodney W. Grimes * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
249b50d902SRodney W. Grimes * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
259b50d902SRodney W. Grimes * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
269b50d902SRodney W. Grimes * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
279b50d902SRodney W. Grimes * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
289b50d902SRodney W. Grimes * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
299b50d902SRodney W. Grimes * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
309b50d902SRodney W. Grimes * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
319b50d902SRodney W. Grimes * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
329b50d902SRodney W. Grimes * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
339b50d902SRodney W. Grimes * SUCH DAMAGE.
349b50d902SRodney W. Grimes */
359b50d902SRodney W. Grimes
3631fa4102SGavin Atkinson #if 0
379b50d902SRodney W. Grimes #ifndef lint
389b50d902SRodney W. Grimes static char copyright[] =
399b50d902SRodney W. Grimes "@(#) Copyright (c) 1989, 1993\n\
409b50d902SRodney W. Grimes The Regents of the University of California. All rights reserved.\n";
419b50d902SRodney W. Grimes #endif /* not lint */
429b50d902SRodney W. Grimes
439b50d902SRodney W. Grimes #ifndef lint
449b50d902SRodney W. Grimes static char sccsid[] = "@(#)locate.code.c 8.1 (Berkeley) 6/6/93";
459b50d902SRodney W. Grimes #endif /* not lint */
4631fa4102SGavin Atkinson #endif
479b50d902SRodney W. Grimes
489b50d902SRodney W. Grimes /*
499b50d902SRodney W. Grimes * PURPOSE: sorted list compressor (works with a modified 'find'
509b50d902SRodney W. Grimes * to encode/decode a filename database)
519b50d902SRodney W. Grimes *
529b50d902SRodney W. Grimes * USAGE: bigram < list > bigrams
539b50d902SRodney W. Grimes * process bigrams (see updatedb) > common_bigrams
549b50d902SRodney W. Grimes * code common_bigrams < list > squozen_list
559b50d902SRodney W. Grimes *
569b50d902SRodney W. Grimes * METHOD: Uses 'front compression' (see ";login:", Volume 8, Number 1
579b50d902SRodney W. Grimes * February/March 1983, p. 8). Output format is, per line, an
589b50d902SRodney W. Grimes * offset differential count byte followed by a partially bigram-
599b50d902SRodney W. Grimes * encoded ascii residue. A bigram is a two-character sequence,
609b50d902SRodney W. Grimes * the first 128 most common of which are encoded in one byte.
619b50d902SRodney W. Grimes *
629b50d902SRodney W. Grimes * EXAMPLE: For simple front compression with no bigram encoding,
639b50d902SRodney W. Grimes * if the input is... then the output is...
649b50d902SRodney W. Grimes *
659b50d902SRodney W. Grimes * /usr/src 0 /usr/src
669b50d902SRodney W. Grimes * /usr/src/cmd/aardvark.c 8 /cmd/aardvark.c
679b50d902SRodney W. Grimes * /usr/src/cmd/armadillo.c 14 armadillo.c
689b50d902SRodney W. Grimes * /usr/tmp/zoo 5 tmp/zoo
699b50d902SRodney W. Grimes *
709b50d902SRodney W. Grimes * The codes are:
719b50d902SRodney W. Grimes *
729b50d902SRodney W. Grimes * 0-28 likeliest differential counts + offset to make nonnegative
739b50d902SRodney W. Grimes * 30 switch code for out-of-range count to follow in next word
74139764e8SWolfram Schneider * 31 an 8 bit char followed
759b50d902SRodney W. Grimes * 128-255 bigram codes (128 most common, as determined by 'updatedb')
769b50d902SRodney W. Grimes * 32-127 single character (printable) ascii residue (ie, literal)
779b50d902SRodney W. Grimes *
78139764e8SWolfram Schneider * The locate database store any character except newline ('\n')
79139764e8SWolfram Schneider * and NUL ('\0'). The 8-bit character support don't wast extra
80139764e8SWolfram Schneider * space until you have characters in file names less than 32
81139764e8SWolfram Schneider * or greather than 127.
82139764e8SWolfram Schneider *
83139764e8SWolfram Schneider *
84139764e8SWolfram Schneider * SEE ALSO: updatedb.sh, ../bigram/locate.bigram.c
859b50d902SRodney W. Grimes *
869b50d902SRodney W. Grimes * AUTHOR: James A. Woods, Informatics General Corp.,
879b50d902SRodney W. Grimes * NASA Ames Research Center, 10/82
88139764e8SWolfram Schneider * 8-bit file names characters:
89139764e8SWolfram Schneider * Wolfram Schneider, Berlin September 1996
909b50d902SRodney W. Grimes */
919b50d902SRodney W. Grimes
929b50d902SRodney W. Grimes #include <sys/param.h>
939b50d902SRodney W. Grimes #include <err.h>
949b50d902SRodney W. Grimes #include <errno.h>
959b50d902SRodney W. Grimes #include <stdlib.h>
969b50d902SRodney W. Grimes #include <string.h>
979b50d902SRodney W. Grimes #include <stdio.h>
9862f882d6SWarner Losh #include <unistd.h>
999b50d902SRodney W. Grimes #include "locate.h"
1009b50d902SRodney W. Grimes
1019b50d902SRodney W. Grimes #define BGBUFSIZE (NBG * 2) /* size of bigram buffer */
1029b50d902SRodney W. Grimes
103834a8fa1SWolfram Schneider u_char buf1[LOCATE_PATH_MAX] = " ";
104834a8fa1SWolfram Schneider u_char buf2[LOCATE_PATH_MAX];
105139764e8SWolfram Schneider u_char bigrams[BGBUFSIZE + 1] = { 0 };
1069b50d902SRodney W. Grimes
107d43255b5SWolfram Schneider /* use a lookup array instead a function, 3x faster than linear search */
108*a6c20dddSWolfram Schneider int big [UCHAR_MAX + 1][UCHAR_MAX + 1];
109139764e8SWolfram Schneider #define BGINDEX(x) (big[(u_char)*x][(u_char)*(x + 1)])
1107ec1929dSWolfram Schneider
111f1bb2cd2SWarner Losh void usage(void);
1129b50d902SRodney W. Grimes
1139b50d902SRodney W. Grimes int
main(int argc,char * argv[])11431fa4102SGavin Atkinson main(int argc, char *argv[])
1159b50d902SRodney W. Grimes {
11631fa4102SGavin Atkinson u_char *cp, *oldpath, *path;
1179b50d902SRodney W. Grimes int ch, code, count, diffcount, oldcount;
11831fa4102SGavin Atkinson u_int i, j;
1199b50d902SRodney W. Grimes FILE *fp;
1209b50d902SRodney W. Grimes
1211c8af878SWarner Losh while ((ch = getopt(argc, argv, "")) != -1)
1229b50d902SRodney W. Grimes switch(ch) {
1239b50d902SRodney W. Grimes default:
1249b50d902SRodney W. Grimes usage();
1259b50d902SRodney W. Grimes }
1269b50d902SRodney W. Grimes argc -= optind;
1279b50d902SRodney W. Grimes argv += optind;
1289b50d902SRodney W. Grimes
1299b50d902SRodney W. Grimes if (argc != 1)
1309b50d902SRodney W. Grimes usage();
1319b50d902SRodney W. Grimes
1329b50d902SRodney W. Grimes if ((fp = fopen(argv[0], "r")) == NULL)
1339b50d902SRodney W. Grimes err(1, "%s", argv[0]);
1349b50d902SRodney W. Grimes
1359b50d902SRodney W. Grimes /* First copy bigram array to stdout. */
1368e7c0a6dSWolfram Schneider if (fgets(bigrams, BGBUFSIZE + 1, fp) == NULL) {
1378e7c0a6dSWolfram Schneider if (!feof(fp) || ferror(fp))
13872a0982cSWolfram Schneider err(1, "get bigram array");
1398e7c0a6dSWolfram Schneider }
1407ec1929dSWolfram Schneider
1419b50d902SRodney W. Grimes if (fwrite(bigrams, 1, BGBUFSIZE, stdout) != BGBUFSIZE)
1429b50d902SRodney W. Grimes err(1, "stdout");
1439b50d902SRodney W. Grimes (void)fclose(fp);
1449b50d902SRodney W. Grimes
14537002181SWolfram Schneider /* init lookup table */
146139764e8SWolfram Schneider for (i = 0; i < UCHAR_MAX + 1; i++)
147139764e8SWolfram Schneider for (j = 0; j < UCHAR_MAX + 1; j++)
148*a6c20dddSWolfram Schneider big[i][j] = -1;
14937002181SWolfram Schneider
1501a1ee31fSWolfram Schneider for (cp = bigrams, i = 0; *cp != '\0'; i += 2, cp += 2)
151*a6c20dddSWolfram Schneider big[(u_char)*cp][(u_char)*(cp + 1)] = i;
152139764e8SWolfram Schneider
1539b50d902SRodney W. Grimes oldpath = buf1;
1549b50d902SRodney W. Grimes path = buf2;
1559b50d902SRodney W. Grimes oldcount = 0;
1567ec1929dSWolfram Schneider
15737002181SWolfram Schneider while (fgets(path, sizeof(buf2), stdin) != NULL) {
15837002181SWolfram Schneider
15937002181SWolfram Schneider /* skip empty lines */
16037002181SWolfram Schneider if (*path == '\n')
16137002181SWolfram Schneider continue;
1629b50d902SRodney W. Grimes
163139764e8SWolfram Schneider /* remove newline */
1641a1ee31fSWolfram Schneider for (cp = path; *cp != '\0'; cp++) {
16537002181SWolfram Schneider /* chop newline */
16637002181SWolfram Schneider if (*cp == '\n')
1671a1ee31fSWolfram Schneider *cp = '\0';
1689b50d902SRodney W. Grimes }
1699b50d902SRodney W. Grimes
1709b50d902SRodney W. Grimes /* Skip longest common prefix. */
171139764e8SWolfram Schneider for (cp = path; *cp == *oldpath; cp++, oldpath++)
172139764e8SWolfram Schneider if (*cp == '\0')
173139764e8SWolfram Schneider break;
17437002181SWolfram Schneider
1759b50d902SRodney W. Grimes count = cp - path;
1769b50d902SRodney W. Grimes diffcount = count - oldcount + OFFSET;
1779b50d902SRodney W. Grimes oldcount = count;
1789b50d902SRodney W. Grimes if (diffcount < 0 || diffcount > 2 * OFFSET) {
1799b50d902SRodney W. Grimes if (putchar(SWITCH) == EOF ||
1809b50d902SRodney W. Grimes putw(diffcount, stdout) == EOF)
1819b50d902SRodney W. Grimes err(1, "stdout");
1829b50d902SRodney W. Grimes } else
1839b50d902SRodney W. Grimes if (putchar(diffcount) == EOF)
1849b50d902SRodney W. Grimes err(1, "stdout");
1859b50d902SRodney W. Grimes
1861a1ee31fSWolfram Schneider while (*cp != '\0') {
187139764e8SWolfram Schneider /* print *two* characters */
188139764e8SWolfram Schneider
189*a6c20dddSWolfram Schneider if ((code = BGINDEX(cp)) != -1) {
190139764e8SWolfram Schneider /*
191139764e8SWolfram Schneider * print *one* as bigram
192139764e8SWolfram Schneider * Found, so mark byte with
193139764e8SWolfram Schneider * parity bit.
194139764e8SWolfram Schneider */
1959b50d902SRodney W. Grimes if (putchar((code / 2) | PARITY) == EOF)
1969b50d902SRodney W. Grimes err(1, "stdout");
1979b50d902SRodney W. Grimes cp += 2;
1989b50d902SRodney W. Grimes }
199139764e8SWolfram Schneider
200139764e8SWolfram Schneider else {
201139764e8SWolfram Schneider for (i = 0; i < 2; i++) {
202139764e8SWolfram Schneider if (*cp == '\0')
203139764e8SWolfram Schneider break;
204139764e8SWolfram Schneider
205139764e8SWolfram Schneider /* print umlauts in file names */
206139764e8SWolfram Schneider if (*cp < ASCII_MIN ||
207139764e8SWolfram Schneider *cp > ASCII_MAX) {
208139764e8SWolfram Schneider if (putchar(UMLAUT) == EOF ||
209139764e8SWolfram Schneider putchar(*cp++) == EOF)
210139764e8SWolfram Schneider err(1, "stdout");
2119b50d902SRodney W. Grimes }
212139764e8SWolfram Schneider
213139764e8SWolfram Schneider else {
214139764e8SWolfram Schneider /* normal character */
215139764e8SWolfram Schneider if(putchar(*cp++) == EOF)
216139764e8SWolfram Schneider err(1, "stdout");
217139764e8SWolfram Schneider }
218139764e8SWolfram Schneider }
219139764e8SWolfram Schneider
220139764e8SWolfram Schneider }
221139764e8SWolfram Schneider }
222139764e8SWolfram Schneider
2239b50d902SRodney W. Grimes if (path == buf1) { /* swap pointers */
2249b50d902SRodney W. Grimes path = buf2;
2259b50d902SRodney W. Grimes oldpath = buf1;
2269b50d902SRodney W. Grimes } else {
2279b50d902SRodney W. Grimes path = buf1;
2289b50d902SRodney W. Grimes oldpath = buf2;
2299b50d902SRodney W. Grimes }
2309b50d902SRodney W. Grimes }
23172a0982cSWolfram Schneider
2329b50d902SRodney W. Grimes /* Non-zero status if there were errors */
2339b50d902SRodney W. Grimes if (fflush(stdout) != 0 || ferror(stdout))
23472a0982cSWolfram Schneider errx(1, "stdout");
23572a0982cSWolfram Schneider
2369b50d902SRodney W. Grimes exit(0);
2379b50d902SRodney W. Grimes }
2389b50d902SRodney W. Grimes
2399b50d902SRodney W. Grimes void
usage(void)24031fa4102SGavin Atkinson usage(void)
2419b50d902SRodney W. Grimes {
2429b50d902SRodney W. Grimes (void)fprintf(stderr,
2439b50d902SRodney W. Grimes "usage: locate.code common_bigrams < list > squozen_list\n");
2449b50d902SRodney W. Grimes exit(1);
2459b50d902SRodney W. Grimes }
246