1 /*	$OpenBSD: fsort.c,v 1.12 2004/09/14 22:57:58 deraadt Exp $	*/
2 
3 /*-
4  * Copyright (c) 1993
5  *	The Regents of the University of California.  All rights reserved.
6  *
7  * This code is derived from software contributed to Berkeley by
8  * Peter McIlroy.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  * 3. Neither the name of the University nor the names of its contributors
19  *    may be used to endorse or promote products derived from this software
20  *    without specific prior written permission.
21  *
22  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32  * SUCH DAMAGE.
33  */
34 
35 #ifndef lint
36 #if 0
37 static char sccsid[] = "@(#)fsort.c	8.1 (Berkeley) 6/6/93";
38 #else
39 static char rcsid[] = "$OpenBSD: fsort.c,v 1.12 2004/09/14 22:57:58 deraadt Exp $";
40 #endif
41 #endif /* not lint */
42 
43 /*
44  * Read in the next bin.  If it fits in one segment sort it;
45  * otherwise refine it by segment deeper by one character,
46  * and try again on smaller bins.  Sort the final bin at this level
47  * of recursion to keep the head of fstack at 0.
48  * After PANIC passes, abort to merge sort.
49  */
50 #include "sort.h"
51 #include "fsort.h"
52 
53 #include <stdlib.h>
54 #include <string.h>
55 
56 u_char **keylist = 0, *buffer = 0, *linebuf = 0;
57 size_t bufsize, linebuf_size;
58 struct tempfile fstack[MAXFCT];
59 extern char toutpath[];
60 #define FSORTMAX 4
61 int PANIC = FSORTMAX;
62 
63 void
fsort(int binno,int depth,union f_handle infiles,int nfiles,FILE * outfp,struct field * ftbl)64 fsort(int binno, int depth, union f_handle infiles, int nfiles, FILE *outfp,
65     struct field *ftbl)
66 {
67 	u_char *bufend, **keypos, *tmpbuf;
68 	u_char *weights;
69 	int ntfiles, mfct = 0, total, i, maxb, lastb, panic = 0;
70 	int c, nelem;
71 	long sizes[NBINS+1];
72 	union f_handle tfiles, mstart = {MAXFCT-16};
73 	int (*get)(int, union f_handle, int, RECHEADER *,
74 		u_char *, struct field *);
75 	RECHEADER *crec;
76 	struct field tfield[2];
77 	FILE *prevfp, *tailfp[FSORTMAX+1];
78 
79 	memset(tailfp, 0, sizeof(tailfp));
80 	prevfp = outfp;
81 	memset(tfield, 0, sizeof(tfield));
82 	if (ftbl[0].flags & R)
83 		tfield[0].weights = Rascii;
84 	else
85 		tfield[0].weights = ascii;
86 	tfield[0].icol.num = 1;
87 	weights = ftbl[0].weights;
88 	if (!buffer) {
89 		bufsize = BUFSIZE;
90 		if ((buffer = malloc(bufsize + 1)) == NULL ||
91 		    (keylist = calloc(MAXNUM, sizeof(u_char *))) == NULL)
92 			errx(2, "cannot allocate memory");
93 		if (!SINGL_FLD) {
94 			linebuf_size = MAXLLEN;
95 			if ((linebuf = malloc(linebuf_size)) == NULL)
96 				errx(2, "cannot allocate memory");
97 		}
98 	}
99 	bufend = buffer + bufsize;
100 	if (binno >= 0) {
101 		tfiles.top = infiles.top + nfiles;
102 		get = getnext;
103 	} else {
104 		tfiles.top = 0;
105 		if (SINGL_FLD)
106 			get = makeline;
107 		else
108 			get = makekey;
109 	}
110 	for (;;) {
111 		memset(sizes, 0, sizeof(sizes));
112 		c = ntfiles = 0;
113 		if (binno == weights[REC_D] &&
114 		    !(SINGL_FLD && ftbl[0].flags & F)) {	/* pop */
115 			rd_append(weights[REC_D],
116 			    infiles, nfiles, prevfp, buffer, bufend);
117 			break;
118 		} else if (binno == weights[REC_D]) {
119 			depth = 0;		/* start over on flat weights */
120 			ftbl = tfield;
121 			weights = ftbl[0].weights;
122 		}
123 		while (c != EOF) {
124 			keypos = keylist;
125 			nelem = 0;
126 			crec = (RECHEADER *) buffer;
127 			while((c = get(binno, infiles, nfiles, crec, bufend,
128 			    ftbl)) == 0) {
129 				*keypos++ = crec->data + depth;
130 				if (++nelem == MAXNUM) {
131 					c = BUFFEND;
132 					break;
133 				}
134 				crec =(RECHEADER *)	((char *) crec +
135 				SALIGN(crec->length) + sizeof(TRECHEADER));
136 			}
137 			/*
138 			 * buffer was too small for data, allocate
139 			 * a bigger buffer.
140 			 */
141 			if (c == BUFFEND && nelem == 0) {
142 				bufsize *= 2;
143 				buffer = realloc(buffer, bufsize);
144 				if (!buffer)
145 					err(2, "failed to realloc buffer");
146 				bufend = buffer + bufsize;
147 				continue;
148 			}
149 			if (c == BUFFEND || ntfiles || mfct) {	/* push */
150 				if (panic >= PANIC) {
151 					fstack[MAXFCT-16+mfct].fp = ftmp();
152 					if (radixsort((const u_char **)keylist,
153 					    nelem, weights, REC_D))
154 						err(2, NULL);
155 					append(keylist, nelem, depth, fstack[
156 					 MAXFCT-16+mfct].fp, putrec, ftbl);
157 					mfct++;
158 					/* reduce number of open files */
159 					if (mfct == 16 ||(c == EOF && ntfiles)) {
160 						/*
161 						 * Only copy extra incomplete
162 						 * crec data if there is any.
163 						 */
164 						int nodata = (bufend
165 						    >= (u_char *)crec
166 						    && bufend <= crec->data);
167 						size_t sz = 0;
168 
169 						if (!nodata) {
170 							sz = bufend
171 							    - crec->data;
172 							tmpbuf = malloc(sz);
173 							if (tmpbuf == NULL)
174 								errx(2, "cannot"
175 								    " allocate"
176 								    " memory");
177 							memmove(tmpbuf,
178 							    crec->data, sz);
179 						}
180 
181 						fstack[tfiles.top + ntfiles].fp
182 						    = ftmp();
183 						fmerge(0, mstart, mfct, geteasy,
184 						  fstack[tfiles.top+ntfiles].fp,
185 						  putrec, ftbl);
186 						ntfiles++;
187 						mfct = 0;
188 
189 						if (!nodata) {
190 							memmove(crec->data,
191 							    tmpbuf, sz);
192 							free(tmpbuf);
193 						}
194 					}
195 				} else {
196 					fstack[tfiles.top + ntfiles].fp= ftmp();
197 					onepass(keylist, depth, nelem, sizes,
198 					weights, fstack[tfiles.top+ntfiles].fp);
199 					ntfiles++;
200 				}
201 			}
202 		}
203 		get = getnext;
204 		if (!ntfiles && !mfct) {	/* everything in memory--pop */
205 			if (nelem > 1 && radixsort((const u_char **)keylist,
206 			    nelem, weights, REC_D))
207 				err(2, NULL);
208 			append(keylist, nelem, depth, outfp, putline, ftbl);
209 			break;					/* pop */
210 		}
211 		if (panic >= PANIC) {
212 			if (!ntfiles)
213 				fmerge(0, mstart, mfct, geteasy,
214 				    outfp, putline, ftbl);
215 			else
216 				fmerge(0, tfiles, ntfiles, geteasy,
217 				    outfp, putline, ftbl);
218 			break;
219 
220 		}
221 		total = maxb = lastb = 0;	/* find if one bin dominates */
222 		for (i = 0; i < NBINS; i++)
223 		  if (sizes[i]) {
224 			if (sizes[i] > sizes[maxb])
225 				maxb = i;
226 			lastb = i;
227 			total += sizes[i];
228 		}
229 		if (sizes[maxb] < max((total / 2) , BUFSIZE))
230 			maxb = lastb;	/* otherwise pop after last bin */
231 		fstack[tfiles.top].lastb = lastb;
232 		fstack[tfiles.top].maxb = maxb;
233 
234 			/* start refining next level. */
235 		get(-1, tfiles, ntfiles, crec, bufend, 0);	/* rewind */
236 		for (i = 0; i < maxb; i++) {
237 			if (!sizes[i])	/* bin empty; step ahead file offset */
238 				get(i, tfiles, ntfiles, crec, bufend, 0);
239 			else
240 				fsort(i, depth+1, tfiles, ntfiles, outfp, ftbl);
241 		}
242 		if (lastb != maxb) {
243 			if (prevfp != outfp)
244 				tailfp[panic] = prevfp;
245 			prevfp = ftmp();
246 			for (i = maxb+1; i <= lastb; i++)
247 				if (!sizes[i])
248 					get(i, tfiles, ntfiles, crec, bufend,0);
249 				else
250 					fsort(i, depth+1, tfiles, ntfiles,
251 					    prevfp, ftbl);
252 		}
253 
254 		/* sort biggest (or last) bin at this level */
255 		depth++;
256 		panic++;
257 		binno = maxb;
258 		infiles.top = tfiles.top;	/* getnext will free tfiles, */
259 		nfiles = ntfiles;		/* so overwrite them */
260 	}
261 	if (prevfp != outfp) {
262 		concat(outfp, prevfp);
263 		fclose(prevfp);
264 	}
265 	for (i = panic; i >= 0; --i)
266 		if (tailfp[i]) {
267 			concat(outfp, tailfp[i]);
268 			fclose(tailfp[i]);
269 		}
270 }
271 
272 /*
273  * This is one pass of radix exchange, dumping the bins to disk.
274  */
275 #define swap(a, b, t) t = a, a = b, b = t
276 void
onepass(u_char ** a,int depth,long n,long sizes[],u_char * tr,FILE * fp)277 onepass(u_char **a, int depth, long n, long sizes[], u_char *tr, FILE *fp)
278 {
279 	size_t tsizes[NBINS+1];
280 	u_char **bin[257], **top[256], ***bp, ***bpmax, ***tp;
281 	static int histo[256];
282 	int *hp;
283 	int c;
284 	u_char **an, *t, **aj;
285 	u_char **ak, *r;
286 
287 	memset(tsizes, 0, sizeof(tsizes));
288 	depth += sizeof(TRECHEADER);
289 	an = &a[n];
290 	for (ak = a; ak < an; ak++) {
291 		histo[c = tr[**ak]]++;
292 		tsizes[c] += ((RECHEADER *) (*ak -= depth))->length;
293 	}
294 
295 	bin[0] = a;
296 	bpmax = bin + 256;
297 	tp = top, hp = histo;
298 	for (bp = bin; bp < bpmax; bp++) {
299 		*tp++ = *(bp+1) = *bp + (c = *hp);
300 		*hp++ = 0;
301 		if (c <= 1)
302 			continue;
303 	}
304 	for (aj = a; aj < an; *aj = r, aj = bin[c+1])
305 		for(r = *aj; aj < (ak = --top[c = tr[r[depth]]]) ;)
306 			swap(*ak, r, t);
307 
308 	for (ak = a, c = 0; c < 256; c++) {
309 		an = bin[c+1];
310 		n = an - ak;
311 		tsizes[c] += n * sizeof(TRECHEADER);
312 		/* tell getnext how many elements in this bin, this segment. */
313 		EWRITE(&tsizes[c], sizeof(size_t), 1, fp);
314 		sizes[c] += tsizes[c];
315 		for (; ak < an; ++ak)
316 			putrec((RECHEADER *) *ak, fp);
317 	}
318 }
319