1 /*	$OpenBSD: util.c,v 1.30 2005/04/03 19:12:40 otto Exp $	*/
2 
3 /*-
4  * Copyright (c) 1999 James Howard and Dag-Erling Coïdan Smørgrav
5  * All rights reserved.
6  *
7  * Redistribution and use in source and binary forms, with or without
8  * modification, are permitted provided that the following conditions
9  * are met:
10  * 1. Redistributions of source code must retain the above copyright
11  *    notice, this list of conditions and the following disclaimer.
12  * 2. Redistributions in binary form must reproduce the above copyright
13  *    notice, this list of conditions and the following disclaimer in the
14  *    documentation and/or other materials provided with the distribution.
15  *
16  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
17  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
18  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
19  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
20  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
21  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
22  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
23  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
24  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
25  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
26  * SUCH DAMAGE.
27  */
28 
29 #include <sys/types.h>
30 #include <sys/stat.h>
31 
32 #include <ctype.h>
33 #include <err.h>
34 #include <errno.h>
35 #include <fts.h>
36 #include <regex.h>
37 #include <stdio.h>
38 #include <stdlib.h>
39 #include <string.h>
40 #include <unistd.h>
41 #include <zlib.h>
42 
43 #include "grep.h"
44 
45 __RCSID("$MirOS: src/usr.bin/grep/util.c,v 1.3 2013/08/06 16:59:33 tg Exp $");
46 
47 /*
48  * Process a file line by line...
49  */
50 
51 static int	linesqueued;
52 static int	procline(str_t *l, int);
53 static int	grep_search(fastgrep_t *, unsigned char *, size_t, regmatch_t *pmatch);
54 static int	grep_cmp(const unsigned char *, const unsigned char *, size_t);
55 static void	grep_revstr(unsigned char *, int);
56 
57 int
grep_tree(char ** argv)58 grep_tree(char **argv)
59 {
60 	FTS	*fts;
61 	FTSENT	*p;
62 	int	c, fts_flags;
63 
64 	c = fts_flags = 0;
65 
66 	if (Hflag)
67 		fts_flags = FTS_COMFOLLOW;
68 	if (Pflag)
69 		fts_flags = FTS_PHYSICAL;
70 	if (Sflag)
71 		fts_flags = FTS_LOGICAL;
72 
73 	fts_flags |= FTS_NOSTAT | FTS_NOCHDIR;
74 
75 	if (!(fts = fts_open(argv, fts_flags, NULL)))
76 		err(2, NULL);
77 	while ((p = fts_read(fts)) != NULL) {
78 		switch (p->fts_info) {
79 		case FTS_DNR:
80 			break;
81 		case FTS_ERR:
82 			errx(2, "%s: %s", p->fts_path, strerror(p->fts_errno));
83 			break;
84 		case FTS_DP:
85 			break;
86 		default:
87 			c += procfile(p->fts_path);
88 			break;
89 		}
90 	}
91 
92 	return c;
93 }
94 
95 int
procfile(const char * fn)96 procfile(const char *fn)
97 {
98 	str_t ln;
99 	file_t *f;
100 	int c, t, z, nottext;
101 
102 	if (fn == NULL) {
103 		fn = "(standard input)";
104 		f = grep_fdopen(STDIN_FILENO, "r");
105 	} else {
106 		f = grep_open(fn, "r");
107 	}
108 	if (f == NULL) {
109 		if (!sflag)
110 			warn("%s", fn);
111 		return 0;
112 	}
113 
114 	nottext = grep_bin_file(f);
115 	if (nottext && binbehave == BIN_FILE_SKIP) {
116 		grep_close(f);
117 		return 0;
118 	}
119 
120 	ln.file = fn;
121 	ln.line_no = 0;
122 	ln.len = 0;
123 	linesqueued = 0;
124 	ln.off = -1;
125 
126 	if (Bflag > 0)
127 		initqueue();
128 	for (c = 0;  c == 0 || !(lflag || qflag); ) {
129 		ln.off += ln.len + 1;
130 		if ((ln.dat = grep_fgetln(f, &ln.len)) == NULL)
131 			break;
132 		if (ln.len > 0 && ln.dat[ln.len - 1] == '\n')
133 			--ln.len;
134 		ln.line_no++;
135 
136 		z = tail;
137 
138 		if ((t = procline(&ln, nottext)) == 0 && Bflag > 0 && z == 0) {
139 			enqueue(&ln);
140 			linesqueued++;
141 		}
142 		c += t;
143 	}
144 	if (Bflag > 0)
145 		clearqueue();
146 	grep_close(f);
147 
148 	if (cflag) {
149 		if (!hflag)
150 			printf("%s:", ln.file);
151 		printf("%u\n", c);
152 	}
153 	if (lflag && c != 0)
154 		printf("%s\n", fn);
155 	if (Lflag && c == 0)
156 		printf("%s\n", fn);
157 	if (c && !cflag && !lflag && !Lflag &&
158 	    binbehave == BIN_FILE_BIN && nottext && !qflag)
159 		printf("Binary file %s matches\n", fn);
160 
161 	return c;
162 }
163 
164 
165 /*
166  * Process an individual line in a file. Return non-zero if it matches.
167  */
168 
169 #define isword(x) (isalnum(x) || (x) == '_')
170 
171 static int
procline(str_t * l,int nottext)172 procline(str_t *l, int nottext)
173 {
174 	regmatch_t	pmatch;
175 	int		c, i, r;
176 
177 	if (matchall) {
178 		c = !vflag;
179 		goto print;
180 	}
181 
182 	for (c = i = 0; i < patterns; i++) {
183 		if (fg_pattern[i].pattern) {
184 			r = grep_search(&fg_pattern[i], (unsigned char *)l->dat,
185 			    l->len, &pmatch);
186 		} else {
187 			pmatch.rm_so = 0;
188 			pmatch.rm_eo = l->len;
189 			r = regexec(&r_pattern[i], l->dat, 1, &pmatch, eflags);
190 		}
191 		if (r == 0 && xflag) {
192 			if (pmatch.rm_so != 0 || pmatch.rm_eo != l->len)
193 				r = REG_NOMATCH;
194 		}
195 		if (r == 0) {
196 			c++;
197 			break;
198 		}
199 	}
200 	if (vflag)
201 		c = !c;
202 
203 print:
204 	if (c && binbehave == BIN_FILE_BIN && nottext)
205 		return c; /* Binary file */
206 
207 	if ((tail > 0 || c) && !cflag && !qflag) {
208 		if (c) {
209 			if (first > 0 && tail == 0 && (Bflag < linesqueued) &&
210 			    (Aflag || Bflag))
211 				printf("--\n");
212 			first = 1;
213 			tail = Aflag;
214 			if (Bflag > 0)
215 				printqueue();
216 			linesqueued = 0;
217 			printline(l, ':');
218 		} else {
219 			printline(l, '-');
220 			tail--;
221 		}
222 	}
223 	return c;
224 }
225 
226 /*
227  * Returns: -1 on failure, 0 on success
228  */
229 int
fgrepcomp(fastgrep_t * fg,const char * patternx)230 fgrepcomp(fastgrep_t *fg, const char *patternx)
231 {
232 	int i;
233 
234 	/* Initialize. */
235 	fg->patternLen = strlen(patternx);
236 	fg->bol = 0;
237 	fg->eol = 0;
238 	fg->wmatch = wflag;
239 	fg->reversedSearch = 0;
240 
241 	/*
242 	 * Make a copy and upper case it for later if in -i mode,
243 	 * else just copy the pointer.
244 	 */
245 	if (iflag) {
246 		unsigned char *cp;
247 
248 		fg->pattern = cp = grep_malloc(fg->patternLen + 1);
249 		for (i = 0; i < fg->patternLen; i++)
250 			cp[i] = toupper(patternx[i]);
251 		cp[fg->patternLen] = '\0';
252 	} else
253 		fg->pattern = (const unsigned char *)patternx;
254 
255 	/* Preprocess pattern. */
256 	for (i = 0; (unsigned)i <= UCHAR_MAX; i++)
257 		fg->qsBc[i] = fg->patternLen;
258 	for (i = 1; i < fg->patternLen; i++) {
259 		fg->qsBc[fg->pattern[i]] = fg->patternLen - i;
260 		/*
261 		 * If case is ignored, make the jump apply to both upper and
262 		 * lower cased characters.  As the pattern is stored in upper
263 		 * case, apply the same to the lower case equivalents.
264 		 */
265 		if (iflag)
266 			fg->qsBc[tolower(fg->pattern[i])] = fg->patternLen - i;
267 	}
268 
269 	return (0);
270 }
271 
272 /*
273  * Returns: -1 on failure, 0 on success
274  */
275 int
fastcomp(fastgrep_t * fg,const char * patternx)276 fastcomp(fastgrep_t *fg, const char *patternx)
277 {
278 	int i;
279 	int bol = 0;
280 	int eol = 0;
281 	int shiftPatternLen;
282 	int hasDot = 0;
283 	int firstHalfDot = -1;
284 	int firstLastHalfDot = -1;
285 	int lastHalfDot = 0;
286 	char *cp;
287 
288 	/* Initialize. */
289 	fg->patternLen = strlen(patternx);
290 	fg->bol = 0;
291 	fg->eol = 0;
292 	fg->wmatch = 0;
293 	fg->reversedSearch = 0;
294 
295 	/* Remove end-of-line character ('$'). */
296 	if (patternx[fg->patternLen - 1] == '$') {
297 		eol++;
298 		fg->eol = 1;
299 		fg->patternLen--;
300 	}
301 
302 	/* Remove beginning-of-line character ('^'). */
303 	if (patternx[0] == '^') {
304 		bol++;
305 		fg->bol = 1;
306 		fg->patternLen--;
307 	}
308 
309 	/* Remove enclosing [[:<:]] and [[:>:]] (word match). */
310 	if (wflag) {
311 		/* basic re's use \( \), extended re's ( ) */
312 		int extra = Eflag ? 1 : 2;
313 		fg->patternLen -= 14 + 2 * extra;
314 		fg->wmatch = 7 + extra;
315 	} else if (fg->patternLen >= 14 &&
316 	    strncmp(patternx + fg->bol, "[[:<:]]", 7) == 0 &&
317 	    strncmp(patternx + fg->bol + fg->patternLen - 7, "[[:>:]]", 7) == 0) {
318 		fg->patternLen -= 14;
319 		fg->wmatch = 7;
320 	}
321 
322 	/*
323 	 * Copy pattern minus '^' and '$' characters as well as word
324 	 * match character classes at the beginning and ending of the
325 	 * string respectively.
326 	 */
327 	fg->pattern = cp = grep_malloc(fg->patternLen + 1);
328 	memcpy(cp, patternx + bol + fg->wmatch, fg->patternLen);
329 	cp[fg->patternLen] = '\0';
330 
331 	/* Look for ways to cheat...er...avoid the full regex engine. */
332 	for (i = 0; i < fg->patternLen; i++)
333 	{
334 		/* Can still cheat? */
335 		if ((isalnum(fg->pattern[i])) || isspace(fg->pattern[i]) ||
336 		    (fg->pattern[i] == '_') || (fg->pattern[i] == ',') ||
337 		    (fg->pattern[i] == '^') || (fg->pattern[i] == '$') ||
338 		    (fg->pattern[i] == '=') || (fg->pattern[i] == '-') ||
339 		    (fg->pattern[i] == ':') || (fg->pattern[i] == '/')) {
340 			/* As long as it is good, upper case it for later. */
341 			if (iflag)
342 				cp[i] = toupper(fg->pattern[i]);
343 		} else if (fg->pattern[i] == '.') {
344 			hasDot = i;
345 			if (i < fg->patternLen / 2) {
346 				if (firstHalfDot < 0)
347 					/* Closest dot to the beginning */
348 					firstHalfDot = i;
349 			} else {
350 				/* Closest dot to the end of the pattern. */
351 				lastHalfDot = i;
352 				if (firstLastHalfDot < 0)
353 					firstLastHalfDot = i;
354 			}
355 		} else {
356 			/* Free memory and let others know this is empty. */
357 			free(cp);
358 			fg->pattern = NULL;
359 			return (-1);
360 		}
361 	}
362 
363 	/*
364 	 * Determine if a reverse search would be faster based on the placement
365 	 * of the dots.
366 	 */
367 	if ((!(lflag || cflag)) && ((!(bol || eol)) &&
368 	    ((lastHalfDot) && ((firstHalfDot < 0) ||
369 	    ((fg->patternLen - (lastHalfDot + 1)) < firstHalfDot))))) {
370 		fg->reversedSearch = 1;
371 		hasDot = fg->patternLen - (firstHalfDot < 0 ?
372 		    firstLastHalfDot : firstHalfDot) - 1;
373 		grep_revstr(cp, fg->patternLen);
374 	}
375 
376 	/*
377 	 * Normal Quick Search would require a shift based on the position the
378 	 * next character after the comparison is within the pattern.  With
379 	 * wildcards, the position of the last dot effects the maximum shift
380 	 * distance.
381 	 * The closer to the end the wild card is the slower the search.  A
382 	 * reverse version of this algorithm would be useful for wildcards near
383 	 * the end of the string.
384 	 *
385 	 * Examples:
386 	 * Pattern	Max shift
387 	 * -------	---------
388 	 * this		5
389 	 * .his		4
390 	 * t.is		3
391 	 * th.s		2
392 	 * thi.		1
393 	 */
394 
395 	/* Adjust the shift based on location of the last dot ('.'). */
396 	shiftPatternLen = fg->patternLen - hasDot;
397 
398 	/* Preprocess pattern. */
399 	for (i = 0; (unsigned)i <= UCHAR_MAX; i++)
400 		fg->qsBc[i] = shiftPatternLen;
401 	for (i = hasDot + 1; i < fg->patternLen; i++) {
402 		fg->qsBc[fg->pattern[i]] = fg->patternLen - i;
403 		/*
404 		 * If case is ignored, make the jump apply to both upper and
405 		 * lower cased characters.  As the pattern is stored in upper
406 		 * case, apply the same to the lower case equivalents.
407 		 */
408 		if (iflag)
409 			fg->qsBc[tolower(fg->pattern[i])] = fg->patternLen - i;
410 	}
411 
412 	/*
413 	 * Put pattern back to normal after pre-processing to allow for easy
414 	 * comparisons later.
415 	 */
416 	if (fg->reversedSearch)
417 		grep_revstr(cp, fg->patternLen);
418 
419 	return (0);
420 }
421 
422 /*
423  * Word boundaries using regular expressions are defined as the point
424  * of transition from a non-word char to a word char, or vice versa.
425  * This means that grep -w +a and grep -w a+ never match anything,
426  * because they lack a starting or ending transition, but grep -w a+b
427  * does match a line containing a+b.
428  */
429 #define wmatch(d, l, s, e)	\
430 	((s == 0 || !isword(d[s-1])) && (e == l || !isword(d[e])) && \
431 	  e > s && isword(d[s]) && isword(d[e-1]))
432 
433 static int
grep_search(fastgrep_t * fg,unsigned char * data,size_t dataLen,regmatch_t * pmatch)434 grep_search(fastgrep_t *fg, unsigned char *data, size_t dataLen, regmatch_t *pmatch)
435 {
436 	size_t j;
437 	int rtrnVal = REG_NOMATCH;
438 
439 	pmatch->rm_so = -1;
440 	pmatch->rm_eo = -1;
441 
442 	/* No point in going farther if we do not have enough data. */
443 	if (dataLen < (size_t)fg->patternLen)
444 		return (rtrnVal);
445 
446 	/* Only try once at the beginning or ending of the line. */
447 	if (fg->bol || fg->eol) {
448 		/* Simple text comparison. */
449 		/* Verify data is >= pattern length before searching on it. */
450 		if (dataLen >= (size_t)fg->patternLen) {
451 			/* Determine where in data to start search at. */
452 			if (fg->eol)
453 				j = dataLen - fg->patternLen;
454 			else
455 				j = 0;
456 			if (!((fg->bol && fg->eol) &&
457 			    (dataLen != (size_t)fg->patternLen)))
458 				if (grep_cmp(fg->pattern, data + j,
459 				    fg->patternLen) == -1) {
460 					pmatch->rm_so = j;
461 					pmatch->rm_eo = j + fg->patternLen;
462 					if (!fg->wmatch || wmatch(data, dataLen,
463 					    pmatch->rm_so, pmatch->rm_eo))
464 						rtrnVal = 0;
465 				}
466 		}
467 	} else if (fg->reversedSearch) {
468 		/* Quick Search algorithm. */
469 		j = dataLen;
470 		do {
471 			if (grep_cmp(fg->pattern, data + j - fg->patternLen,
472 			    fg->patternLen) == -1) {
473 				pmatch->rm_so = j - fg->patternLen;
474 				pmatch->rm_eo = j;
475 				if (!fg->wmatch || wmatch(data, dataLen,
476 				    pmatch->rm_so, pmatch->rm_eo)) {
477 					rtrnVal = 0;
478 					break;
479 				}
480 			}
481 			/* Shift if within bounds, otherwise, we are done. */
482 			if (j == (size_t)fg->patternLen)
483 				break;
484 			j -= fg->qsBc[data[j - fg->patternLen - 1]];
485 		} while (j >= (size_t)fg->patternLen);
486 	} else {
487 		/* Quick Search algorithm. */
488 		j = 0;
489 		do {
490 			if (grep_cmp(fg->pattern, data + j, fg->patternLen) == -1) {
491 				pmatch->rm_so = j;
492 				pmatch->rm_eo = j + fg->patternLen;
493 				if (!fg->wmatch || wmatch(data, dataLen,
494 				    pmatch->rm_so, pmatch->rm_eo)) {
495 					rtrnVal = 0;
496 					break;
497 				}
498 			}
499 
500 			/* Shift if within bounds, otherwise, we are done. */
501 			if (j + fg->patternLen == dataLen)
502 				break;
503 			else
504 				j += fg->qsBc[data[j + fg->patternLen]];
505 		} while (j <= (dataLen - fg->patternLen));
506 	}
507 
508 	return (rtrnVal);
509 }
510 
511 
/*
 * malloc(3) wrapper that never returns NULL: when the allocation yields
 * NULL the program exits with status 2 via err().
 */
void *
grep_malloc(size_t size)
{
	void	*mem = malloc(size);

	if (mem == NULL)
		err(2, "malloc");
	return mem;
}
521 
/*
 * realloc(3) wrapper that never returns NULL: when the reallocation
 * yields NULL the program exits with status 2 via err().
 */
void *
grep_realloc(void *ptr, size_t size)
{
	void	*resized = realloc(ptr, size);

	if (resized == NULL)
		err(2, "realloc");
	return resized;
}
529 
530 /*
531  * Returns:	i >= 0 on failure (position that it failed)
532  *		-1 on success
533  */
534 static int
grep_cmp(const unsigned char * patternx,const unsigned char * data,size_t len)535 grep_cmp(const unsigned char *patternx, const unsigned char *data, size_t len)
536 {
537 	size_t i;
538 
539 	for (i = 0; i < len; i++) {
540 		if (((patternx[i] == data[i]) || (!Fflag && patternx[i] == '.'))
541 		    || (iflag && patternx[i] == toupper(data[i])))
542 			continue;
543 		return (i);
544 	}
545 
546 	return (-1);
547 }
548 
/*
 * Reverse the first len bytes of str in place.  Lengths below 2 leave the
 * buffer untouched.
 */
static void
grep_revstr(unsigned char *str, int len)
{
	unsigned char *lo, *hi;
	unsigned char tmp;	/* same type as the data: no sign conversion */

	if (len < 2)
		return;

	for (lo = str, hi = str + len - 1; lo < hi; lo++, hi--) {
		tmp = *lo;
		*lo = *hi;
		*hi = tmp;
	}
}
561 
562 void
printline(str_t * line,int sep)563 printline(str_t *line, int sep)
564 {
565 	int n;
566 
567 	n = 0;
568 	if (!hflag) {
569 		fputs(line->file, stdout);
570 		++n;
571 	}
572 	if (nflag) {
573 		if (n)
574 			putchar(sep);
575 		printf("%d", line->line_no);
576 		++n;
577 	}
578 	if (bflag) {
579 		if (n)
580 			putchar(sep);
581 		printf("%lld", (long long)line->off);
582 		++n;
583 	}
584 	if (n)
585 		putchar(sep);
586 	fwrite(line->dat, line->len, 1, stdout);
587 	putchar('\n');
588 }
589