1 /* $OpenBSD: fsort.c,v 1.12 2004/09/14 22:57:58 deraadt Exp $ */
2
3 /*-
4 * Copyright (c) 1993
5 * The Regents of the University of California. All rights reserved.
6 *
7 * This code is derived from software contributed to Berkeley by
8 * Peter McIlroy.
9 *
10 * Redistribution and use in source and binary forms, with or without
11 * modification, are permitted provided that the following conditions
12 * are met:
13 * 1. Redistributions of source code must retain the above copyright
14 * notice, this list of conditions and the following disclaimer.
15 * 2. Redistributions in binary form must reproduce the above copyright
16 * notice, this list of conditions and the following disclaimer in the
17 * documentation and/or other materials provided with the distribution.
18 * 3. Neither the name of the University nor the names of its contributors
19 * may be used to endorse or promote products derived from this software
20 * without specific prior written permission.
21 *
22 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
23 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
24 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
25 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
26 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
27 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
28 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
29 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
30 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
31 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
32 * SUCH DAMAGE.
33 */
34
35 #ifndef lint
36 #if 0
37 static char sccsid[] = "@(#)fsort.c 8.1 (Berkeley) 6/6/93";
38 #else
39 static char rcsid[] = "$OpenBSD: fsort.c,v 1.12 2004/09/14 22:57:58 deraadt Exp $";
40 #endif
41 #endif /* not lint */
42
43 /*
44 * Read in the next bin. If it fits in one segment sort it;
45 * otherwise refine it by segment deeper by one character,
46 * and try again on smaller bins. Sort the final bin at this level
47 * of recursion to keep the head of fstack at 0.
48 * After PANIC passes, abort to merge sort.
49 */
50 #include "sort.h"
51 #include "fsort.h"
52
53 #include <stdlib.h>
54 #include <string.h>
55
56 u_char **keylist = 0, *buffer = 0, *linebuf = 0;
57 size_t bufsize, linebuf_size;
58 struct tempfile fstack[MAXFCT];
59 extern char toutpath[];
60 #define FSORTMAX 4
61 int PANIC = FSORTMAX;
62
63 void
fsort(int binno,int depth,union f_handle infiles,int nfiles,FILE * outfp,struct field * ftbl)64 fsort(int binno, int depth, union f_handle infiles, int nfiles, FILE *outfp,
65 struct field *ftbl)
66 {
67 u_char *bufend, **keypos, *tmpbuf;
68 u_char *weights;
69 int ntfiles, mfct = 0, total, i, maxb, lastb, panic = 0;
70 int c, nelem;
71 long sizes[NBINS+1];
72 union f_handle tfiles, mstart = {MAXFCT-16};
73 int (*get)(int, union f_handle, int, RECHEADER *,
74 u_char *, struct field *);
75 RECHEADER *crec;
76 struct field tfield[2];
77 FILE *prevfp, *tailfp[FSORTMAX+1];
78
79 memset(tailfp, 0, sizeof(tailfp));
80 prevfp = outfp;
81 memset(tfield, 0, sizeof(tfield));
82 if (ftbl[0].flags & R)
83 tfield[0].weights = Rascii;
84 else
85 tfield[0].weights = ascii;
86 tfield[0].icol.num = 1;
87 weights = ftbl[0].weights;
88 if (!buffer) {
89 bufsize = BUFSIZE;
90 if ((buffer = malloc(bufsize + 1)) == NULL ||
91 (keylist = calloc(MAXNUM, sizeof(u_char *))) == NULL)
92 errx(2, "cannot allocate memory");
93 if (!SINGL_FLD) {
94 linebuf_size = MAXLLEN;
95 if ((linebuf = malloc(linebuf_size)) == NULL)
96 errx(2, "cannot allocate memory");
97 }
98 }
99 bufend = buffer + bufsize;
100 if (binno >= 0) {
101 tfiles.top = infiles.top + nfiles;
102 get = getnext;
103 } else {
104 tfiles.top = 0;
105 if (SINGL_FLD)
106 get = makeline;
107 else
108 get = makekey;
109 }
110 for (;;) {
111 memset(sizes, 0, sizeof(sizes));
112 c = ntfiles = 0;
113 if (binno == weights[REC_D] &&
114 !(SINGL_FLD && ftbl[0].flags & F)) { /* pop */
115 rd_append(weights[REC_D],
116 infiles, nfiles, prevfp, buffer, bufend);
117 break;
118 } else if (binno == weights[REC_D]) {
119 depth = 0; /* start over on flat weights */
120 ftbl = tfield;
121 weights = ftbl[0].weights;
122 }
123 while (c != EOF) {
124 keypos = keylist;
125 nelem = 0;
126 crec = (RECHEADER *) buffer;
127 while((c = get(binno, infiles, nfiles, crec, bufend,
128 ftbl)) == 0) {
129 *keypos++ = crec->data + depth;
130 if (++nelem == MAXNUM) {
131 c = BUFFEND;
132 break;
133 }
134 crec =(RECHEADER *) ((char *) crec +
135 SALIGN(crec->length) + sizeof(TRECHEADER));
136 }
137 /*
138 * buffer was too small for data, allocate
139 * a bigger buffer.
140 */
141 if (c == BUFFEND && nelem == 0) {
142 bufsize *= 2;
143 buffer = realloc(buffer, bufsize);
144 if (!buffer)
145 err(2, "failed to realloc buffer");
146 bufend = buffer + bufsize;
147 continue;
148 }
149 if (c == BUFFEND || ntfiles || mfct) { /* push */
150 if (panic >= PANIC) {
151 fstack[MAXFCT-16+mfct].fp = ftmp();
152 if (radixsort((const u_char **)keylist,
153 nelem, weights, REC_D))
154 err(2, NULL);
155 append(keylist, nelem, depth, fstack[
156 MAXFCT-16+mfct].fp, putrec, ftbl);
157 mfct++;
158 /* reduce number of open files */
159 if (mfct == 16 ||(c == EOF && ntfiles)) {
160 /*
161 * Only copy extra incomplete
162 * crec data if there is any.
163 */
164 int nodata = (bufend
165 >= (u_char *)crec
166 && bufend <= crec->data);
167 size_t sz = 0;
168
169 if (!nodata) {
170 sz = bufend
171 - crec->data;
172 tmpbuf = malloc(sz);
173 if (tmpbuf == NULL)
174 errx(2, "cannot"
175 " allocate"
176 " memory");
177 memmove(tmpbuf,
178 crec->data, sz);
179 }
180
181 fstack[tfiles.top + ntfiles].fp
182 = ftmp();
183 fmerge(0, mstart, mfct, geteasy,
184 fstack[tfiles.top+ntfiles].fp,
185 putrec, ftbl);
186 ntfiles++;
187 mfct = 0;
188
189 if (!nodata) {
190 memmove(crec->data,
191 tmpbuf, sz);
192 free(tmpbuf);
193 }
194 }
195 } else {
196 fstack[tfiles.top + ntfiles].fp= ftmp();
197 onepass(keylist, depth, nelem, sizes,
198 weights, fstack[tfiles.top+ntfiles].fp);
199 ntfiles++;
200 }
201 }
202 }
203 get = getnext;
204 if (!ntfiles && !mfct) { /* everything in memory--pop */
205 if (nelem > 1 && radixsort((const u_char **)keylist,
206 nelem, weights, REC_D))
207 err(2, NULL);
208 append(keylist, nelem, depth, outfp, putline, ftbl);
209 break; /* pop */
210 }
211 if (panic >= PANIC) {
212 if (!ntfiles)
213 fmerge(0, mstart, mfct, geteasy,
214 outfp, putline, ftbl);
215 else
216 fmerge(0, tfiles, ntfiles, geteasy,
217 outfp, putline, ftbl);
218 break;
219
220 }
221 total = maxb = lastb = 0; /* find if one bin dominates */
222 for (i = 0; i < NBINS; i++)
223 if (sizes[i]) {
224 if (sizes[i] > sizes[maxb])
225 maxb = i;
226 lastb = i;
227 total += sizes[i];
228 }
229 if (sizes[maxb] < max((total / 2) , BUFSIZE))
230 maxb = lastb; /* otherwise pop after last bin */
231 fstack[tfiles.top].lastb = lastb;
232 fstack[tfiles.top].maxb = maxb;
233
234 /* start refining next level. */
235 get(-1, tfiles, ntfiles, crec, bufend, 0); /* rewind */
236 for (i = 0; i < maxb; i++) {
237 if (!sizes[i]) /* bin empty; step ahead file offset */
238 get(i, tfiles, ntfiles, crec, bufend, 0);
239 else
240 fsort(i, depth+1, tfiles, ntfiles, outfp, ftbl);
241 }
242 if (lastb != maxb) {
243 if (prevfp != outfp)
244 tailfp[panic] = prevfp;
245 prevfp = ftmp();
246 for (i = maxb+1; i <= lastb; i++)
247 if (!sizes[i])
248 get(i, tfiles, ntfiles, crec, bufend,0);
249 else
250 fsort(i, depth+1, tfiles, ntfiles,
251 prevfp, ftbl);
252 }
253
254 /* sort biggest (or last) bin at this level */
255 depth++;
256 panic++;
257 binno = maxb;
258 infiles.top = tfiles.top; /* getnext will free tfiles, */
259 nfiles = ntfiles; /* so overwrite them */
260 }
261 if (prevfp != outfp) {
262 concat(outfp, prevfp);
263 fclose(prevfp);
264 }
265 for (i = panic; i >= 0; --i)
266 if (tailfp[i]) {
267 concat(outfp, tailfp[i]);
268 fclose(tailfp[i]);
269 }
270 }
271
272 /*
273 * This is one pass of radix exchange, dumping the bins to disk.
274 */
275 #define swap(a, b, t) t = a, a = b, b = t
276 void
onepass(u_char ** a,int depth,long n,long sizes[],u_char * tr,FILE * fp)277 onepass(u_char **a, int depth, long n, long sizes[], u_char *tr, FILE *fp)
278 {
279 size_t tsizes[NBINS+1];
280 u_char **bin[257], **top[256], ***bp, ***bpmax, ***tp;
281 static int histo[256];
282 int *hp;
283 int c;
284 u_char **an, *t, **aj;
285 u_char **ak, *r;
286
287 memset(tsizes, 0, sizeof(tsizes));
288 depth += sizeof(TRECHEADER);
289 an = &a[n];
290 for (ak = a; ak < an; ak++) {
291 histo[c = tr[**ak]]++;
292 tsizes[c] += ((RECHEADER *) (*ak -= depth))->length;
293 }
294
295 bin[0] = a;
296 bpmax = bin + 256;
297 tp = top, hp = histo;
298 for (bp = bin; bp < bpmax; bp++) {
299 *tp++ = *(bp+1) = *bp + (c = *hp);
300 *hp++ = 0;
301 if (c <= 1)
302 continue;
303 }
304 for (aj = a; aj < an; *aj = r, aj = bin[c+1])
305 for(r = *aj; aj < (ak = --top[c = tr[r[depth]]]) ;)
306 swap(*ak, r, t);
307
308 for (ak = a, c = 0; c < 256; c++) {
309 an = bin[c+1];
310 n = an - ak;
311 tsizes[c] += n * sizeof(TRECHEADER);
312 /* tell getnext how many elements in this bin, this segment. */
313 EWRITE(&tsizes[c], sizeof(size_t), 1, fp);
314 sizes[c] += tsizes[c];
315 for (; ak < an; ++ak)
316 putrec((RECHEADER *) *ak, fp);
317 }
318 }
319