1 /* Search algorithm.
2 Copyright (C) 1989-1998, 2000, 2002 Free Software Foundation, Inc.
3 Written by Douglas C. Schmidt <schmidt@ics.uci.edu>
4 and Bruno Haible <bruno@clisp.org>.
5
6 This file is part of GNU GPERF.
7
8 GNU GPERF is free software; you can redistribute it and/or modify
9 it under the terms of the GNU General Public License as published by
10 the Free Software Foundation; either version 2, or (at your option)
11 any later version.
12
13 GNU GPERF is distributed in the hope that it will be useful,
14 but WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
16 GNU General Public License for more details.
17
18 You should have received a copy of the GNU General Public License
19 along with this program; see the file COPYING.
20 If not, write to the Free Software Foundation, Inc.,
21 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA. */
22
23 /* Specification. */
24 #include "search.h"
25
26 #include <stdio.h>
27 #include <stdlib.h> /* declares exit(), rand(), srand() */
28 #include <string.h> /* declares memset(), memcmp() */
29 #include <time.h> /* declares time() */
30 #include <math.h> /* declares exp() */
31 #include <limits.h> /* defines INT_MIN, INT_MAX, UINT_MAX */
32 #include "options.h"
33 #include "hash-table.h"
34 #include "config.h"
35
36 /* ============================== Portability ============================== */
37
38 /* Assume ISO C++ 'for' scoping rule. */
39 #define for if (0) ; else for
40
41 /* Dynamically allocated array with dynamic extent:
42
43 Example:
44 DYNAMIC_ARRAY (my_array, int, n);
45 ...
46 FREE_DYNAMIC_ARRAY (my_array);
47
48 Attention: depending on your implementation my_array is either the array
49 itself or a pointer to the array! Always use my_array only as expression!
50 */
51 #if HAVE_DYNAMIC_ARRAY
52 #define DYNAMIC_ARRAY(var,eltype,size) eltype var[size]
53 #define FREE_DYNAMIC_ARRAY(var)
54 #else
55 #define DYNAMIC_ARRAY(var,eltype,size) eltype *var = new eltype[size]
56 #define FREE_DYNAMIC_ARRAY(var) delete[] var
57 #endif
58
59 /* ================================ Theory ================================= */
60
61 /* The general form of the hash function is
62
63 hash (keyword) = sum (asso_values[keyword[i] + alpha_inc[i]] : i in Pos)
64 + len (keyword)
65
66 where Pos is a set of byte positions,
67 each alpha_inc[i] is a nonnegative integer,
68 each asso_values[c] is a nonnegative integer,
69 len (keyword) is the keyword's length if !option[NOLENGTH], or 0 otherwise.
70
71 Theorem 1: If all keywords are different, there is a set Pos such that
72 all tuples (keyword[i] : i in Pos) are different.
73
74 Theorem 2: If all tuples (keyword[i] : i in Pos) are different, there
75 are nonnegative integers alpha_inc[i] such that all multisets
76 {keyword[i] + alpha_inc[i] : i in Pos} are different.
77
78 Define selchars[keyword] := {keyword[i] + alpha_inc[i] : i in Pos}.
79
80 Theorem 3: If all multisets selchars[keyword] are different, there are
81 nonnegative integers asso_values[c] such that all hash values
82 sum (asso_values[c] : c in selchars[keyword]) are different.
83
84 Based on these three facts, we find the hash function in three steps:
85
86 Step 1 (Finding good byte positions):
87 Find a set Pos, as small as possible, such that all tuples
88 (keyword[i] : i in Pos) are different.
89
90 Step 2 (Finding good alpha increments):
91 Find nonnegative integers alpha_inc[i], as many of them as possible being
92 zero, and the others being as small as possible, such that all multisets
93 {keyword[i] + alpha_inc[i] : i in Pos} are different.
94
95 Step 3 (Finding good asso_values):
96 Find asso_values[c] such that all hash (keyword) are different.
97
98 In other words, each step finds a projection that is injective on the
99 given finite set:
100 proj1 : String --> Map (Pos --> N)
101 proj2 : Map (Pos --> N) --> Map (Pos --> N) / S(Pos)
102 proj3 : Map (Pos --> N) / S(Pos) --> N
103 where
104 N denotes the set of nonnegative integers,
105 Map (A --> B) := Hom_Set (A, B) is the set of maps from A to B, and
106 S(Pos) is the symmetric group over Pos.
107
108 This was the theory for option[NOLENGTH]; if !option[NOLENGTH], slight
109 modifications apply:
110 proj1 : String --> Map (Pos --> N) x N
111 proj2 : Map (Pos --> N) x N --> Map (Pos --> N) / S(Pos) x N
112 proj3 : Map (Pos --> N) / S(Pos) x N --> N
113
114 For a case-insensitive hash function, the general form is
115
116 hash (keyword) =
117 sum (asso_values[alpha_unify[keyword[i] + alpha_inc[i]]] : i in Pos)
118 + len (keyword)
119
120 where alpha_unify[c] is chosen so that an upper/lower case change in
121 keyword[i] doesn't change alpha_unify[keyword[i] + alpha_inc[i]].
122 */
123
124 /* ==================== Initialization and Preparation ===================== */
125
Search(KeywordExt_List * list)126 Search::Search (KeywordExt_List *list)
127 : _head (list)
128 {
129 }
130
131 void
prepare()132 Search::prepare ()
133 {
134 /* Compute the total number of keywords. */
135 _total_keys = 0;
136 for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
137 _total_keys++;
138
139 /* Compute the minimum and maximum keyword length. */
140 _max_key_len = INT_MIN;
141 _min_key_len = INT_MAX;
142 for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
143 {
144 KeywordExt *keyword = temp->first();
145
146 if (_max_key_len < keyword->_allchars_length)
147 _max_key_len = keyword->_allchars_length;
148 if (_min_key_len > keyword->_allchars_length)
149 _min_key_len = keyword->_allchars_length;
150 }
151
152 /* Exit program if an empty string is used as keyword, since the comparison
153 expressions don't work correctly for looking up an empty string. */
154 if (_min_key_len == 0)
155 {
156 fprintf (stderr, "Empty input keyword is not allowed.\n"
157 "To recognize an empty input keyword, your code should check for\n"
158 "len == 0 before calling the gperf generated lookup function.\n");
159 exit (1);
160 }
161
162 /* Exit program if the characters in the keywords are not in the required
163 range. */
164 if (option[SEVENBIT])
165 for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
166 {
167 KeywordExt *keyword = temp->first();
168
169 const char *k = keyword->_allchars;
170 for (int i = keyword->_allchars_length; i > 0; k++, i--)
171 if (!(static_cast<unsigned char>(*k) < 128))
172 {
173 fprintf (stderr, "Option --seven-bit has been specified,\n"
174 "but keyword \"%.*s\" contains non-ASCII characters.\n"
175 "Try removing option --seven-bit.\n",
176 keyword->_allchars_length, keyword->_allchars);
177 exit (1);
178 }
179 }
180 }
181
182 /* ====================== Finding good byte positions ====================== */
183
184 /* Computes the upper bound on the indices passed to asso_values[],
185 assuming no alpha_increments. */
186 unsigned int
compute_alpha_size() const187 Search::compute_alpha_size () const
188 {
189 return (option[SEVENBIT] ? 128 : 256);
190 }
191
192 /* Computes the unification rules between different asso_values[c],
193 assuming no alpha_increments. */
194 unsigned int *
compute_alpha_unify() const195 Search::compute_alpha_unify () const
196 {
197 if (option[UPPERLOWER])
198 {
199 /* Uppercase to lowercase mapping. */
200 unsigned int alpha_size = compute_alpha_size();
201 unsigned int *alpha_unify = new unsigned int[alpha_size];
202 for (unsigned int c = 0; c < alpha_size; c++)
203 alpha_unify[c] = c;
204 for (unsigned int c = 'A'; c <= 'Z'; c++)
205 alpha_unify[c] = c + ('a'-'A');
206 return alpha_unify;
207 }
208 else
209 /* Identity mapping. */
210 return NULL;
211 }
212
213 /* Initializes each keyword's _selchars array. */
214 void
init_selchars_tuple(const Positions & positions,const unsigned int * alpha_unify) const215 Search::init_selchars_tuple (const Positions& positions, const unsigned int *alpha_unify) const
216 {
217 for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
218 temp->first()->init_selchars_tuple(positions, alpha_unify);
219 }
220
221 /* Deletes each keyword's _selchars array. */
222 void
delete_selchars() const223 Search::delete_selchars () const
224 {
225 for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
226 temp->first()->delete_selchars();
227 }
228
229 /* Count the duplicate keywords that occur with a given set of positions.
230 In other words, it returns the difference
231 # K - # proj1 (K)
232 where K is the multiset of given keywords. */
233 unsigned int
count_duplicates_tuple(const Positions & positions,const unsigned int * alpha_unify) const234 Search::count_duplicates_tuple (const Positions& positions, const unsigned int *alpha_unify) const
235 {
236 /* Run through the keyword list and count the duplicates incrementally.
237 The result does not depend on the order of the keyword list, thanks to
238 the formula above. */
239 init_selchars_tuple (positions, alpha_unify);
240
241 unsigned int count = 0;
242 {
243 Hash_Table representatives (_total_keys, option[NOLENGTH]);
244 for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
245 {
246 KeywordExt *keyword = temp->first();
247 if (representatives.insert (keyword))
248 count++;
249 }
250 }
251
252 delete_selchars ();
253
254 return count;
255 }
256
257 /* Find good key positions. */
258
259 void
find_positions()260 Search::find_positions ()
261 {
262 /* If the user gave the key positions, we use them. */
263 if (option[POSITIONS])
264 {
265 _key_positions = option.get_key_positions();
266 return;
267 }
268
269 /* Compute preliminary alpha_unify table. */
270 unsigned int *alpha_unify = compute_alpha_unify ();
271
272 /* 1. Find positions that must occur in order to distinguish duplicates. */
273 Positions mandatory;
274
275 if (!option[DUP])
276 {
277 for (KeywordExt_List *l1 = _head; l1 && l1->rest(); l1 = l1->rest())
278 {
279 KeywordExt *keyword1 = l1->first();
280 for (KeywordExt_List *l2 = l1->rest(); l2; l2 = l2->rest())
281 {
282 KeywordExt *keyword2 = l2->first();
283
284 /* If keyword1 and keyword2 have the same length and differ
285 in just one position, and it is not the last character,
286 this position is mandatory. */
287 if (keyword1->_allchars_length == keyword2->_allchars_length)
288 {
289 int n = keyword1->_allchars_length;
290 int i;
291 for (i = 0; i < n - 1; i++)
292 {
293 unsigned char c1 = keyword1->_allchars[i];
294 unsigned char c2 = keyword2->_allchars[i];
295 if (option[UPPERLOWER])
296 {
297 if (c1 >= 'A' && c1 <= 'Z')
298 c1 += 'a' - 'A';
299 if (c2 >= 'A' && c2 <= 'Z')
300 c2 += 'a' - 'A';
301 }
302 if (c1 != c2)
303 break;
304 }
305 if (i < n - 1)
306 {
307 int j;
308 for (j = i + 1; j < n; j++)
309 {
310 unsigned char c1 = keyword1->_allchars[j];
311 unsigned char c2 = keyword2->_allchars[j];
312 if (option[UPPERLOWER])
313 {
314 if (c1 >= 'A' && c1 <= 'Z')
315 c1 += 'a' - 'A';
316 if (c2 >= 'A' && c2 <= 'Z')
317 c2 += 'a' - 'A';
318 }
319 if (c1 != c2)
320 break;
321 }
322 if (j >= n)
323 {
324 /* Position i is mandatory. */
325 if (!mandatory.contains (i))
326 mandatory.add (i);
327 }
328 }
329 }
330 }
331 }
332 }
333
334 /* 2. Add positions, as long as this decreases the duplicates count. */
335 int imax = (_max_key_len - 1 < Positions::MAX_KEY_POS - 1
336 ? _max_key_len - 1 : Positions::MAX_KEY_POS - 1);
337 Positions current = mandatory;
338 unsigned int current_duplicates_count =
339 count_duplicates_tuple (current, alpha_unify);
340 for (;;)
341 {
342 Positions best;
343 unsigned int best_duplicates_count = UINT_MAX;
344
345 for (int i = imax; i >= -1; i--)
346 if (!current.contains (i))
347 {
348 Positions tryal = current;
349 tryal.add (i);
350 unsigned int try_duplicates_count =
351 count_duplicates_tuple (tryal, alpha_unify);
352
353 /* We prefer 'try' to 'best' if it produces less duplicates,
354 or if it produces the same number of duplicates but with
355 a more efficient hash function. */
356 if (try_duplicates_count < best_duplicates_count
357 || (try_duplicates_count == best_duplicates_count && i >= 0))
358 {
359 best = tryal;
360 best_duplicates_count = try_duplicates_count;
361 }
362 }
363
364 /* Stop adding positions when it gives no improvement. */
365 if (best_duplicates_count >= current_duplicates_count)
366 break;
367
368 current = best;
369 current_duplicates_count = best_duplicates_count;
370 }
371
372 /* 3. Remove positions, as long as this doesn't increase the duplicates
373 count. */
374 for (;;)
375 {
376 Positions best;
377 unsigned int best_duplicates_count = UINT_MAX;
378
379 for (int i = imax; i >= -1; i--)
380 if (current.contains (i) && !mandatory.contains (i))
381 {
382 Positions tryal = current;
383 tryal.remove (i);
384 unsigned int try_duplicates_count =
385 count_duplicates_tuple (tryal, alpha_unify);
386
387 /* We prefer 'try' to 'best' if it produces less duplicates,
388 or if it produces the same number of duplicates but with
389 a more efficient hash function. */
390 if (try_duplicates_count < best_duplicates_count
391 || (try_duplicates_count == best_duplicates_count && i == -1))
392 {
393 best = tryal;
394 best_duplicates_count = try_duplicates_count;
395 }
396 }
397
398 /* Stop removing positions when it gives no improvement. */
399 if (best_duplicates_count > current_duplicates_count)
400 break;
401
402 current = best;
403 current_duplicates_count = best_duplicates_count;
404 }
405
406 /* 4. Replace two positions by one, as long as this doesn't increase the
407 duplicates count. */
408 for (;;)
409 {
410 Positions best;
411 unsigned int best_duplicates_count = UINT_MAX;
412
413 for (int i1 = imax; i1 >= -1; i1--)
414 if (current.contains (i1) && !mandatory.contains (i1))
415 for (int i2 = imax; i2 >= -1; i2--)
416 if (current.contains (i2) && !mandatory.contains (i2) && i2 != i1)
417 for (int i3 = imax; i3 >= 0; i3--)
418 if (!current.contains (i3))
419 {
420 Positions tryal = current;
421 tryal.remove (i1);
422 tryal.remove (i2);
423 tryal.add (i3);
424 unsigned int try_duplicates_count =
425 count_duplicates_tuple (tryal, alpha_unify);
426
427 /* We prefer 'try' to 'best' if it produces less duplicates,
428 or if it produces the same number of duplicates but with
429 a more efficient hash function. */
430 if (try_duplicates_count < best_duplicates_count
431 || (try_duplicates_count == best_duplicates_count
432 && (i1 == -1 || i2 == -1 || i3 >= 0)))
433 {
434 best = tryal;
435 best_duplicates_count = try_duplicates_count;
436 }
437 }
438
439 /* Stop removing positions when it gives no improvement. */
440 if (best_duplicates_count > current_duplicates_count)
441 break;
442
443 current = best;
444 current_duplicates_count = best_duplicates_count;
445 }
446
447 /* That's it. Hope it's good enough. */
448 _key_positions = current;
449
450 if (option[DEBUG])
451 {
452 /* Print the result. */
453 fprintf (stderr, "\nComputed positions: ");
454 PositionReverseIterator iter = _key_positions.reviterator();
455 bool seen_lastchar = false;
456 bool first = true;
457 for (int i; (i = iter.next ()) != PositionReverseIterator::EOS; )
458 {
459 if (!first)
460 fprintf (stderr, ", ");
461 if (i == Positions::LASTCHAR)
462 seen_lastchar = true;
463 else
464 {
465 fprintf (stderr, "%d", i + 1);
466 first = false;
467 }
468 }
469 if (seen_lastchar)
470 {
471 if (!first)
472 fprintf (stderr, ", ");
473 fprintf (stderr, "$");
474 }
475 fprintf (stderr, "\n");
476 }
477
478 /* Free preliminary alpha_unify table. */
479 delete[] alpha_unify;
480 }
481
482 /* Count the duplicate keywords that occur with the found set of positions.
483 In other words, it returns the difference
484 # K - # proj1 (K)
485 where K is the multiset of given keywords. */
486 unsigned int
count_duplicates_tuple() const487 Search::count_duplicates_tuple () const
488 {
489 unsigned int *alpha_unify = compute_alpha_unify ();
490 unsigned int count = count_duplicates_tuple (_key_positions, alpha_unify);
491 delete[] alpha_unify;
492 return count;
493 }
494
495 /* ===================== Finding good alpha increments ===================== */
496
497 /* Computes the upper bound on the indices passed to asso_values[]. */
498 unsigned int
compute_alpha_size(const unsigned int * alpha_inc) const499 Search::compute_alpha_size (const unsigned int *alpha_inc) const
500 {
501 unsigned int max_alpha_inc = 0;
502 for (int i = 0; i < _max_key_len; i++)
503 if (max_alpha_inc < alpha_inc[i])
504 max_alpha_inc = alpha_inc[i];
505 return (option[SEVENBIT] ? 128 : 256) + max_alpha_inc;
506 }
507
508 /* Computes the unification rules between different asso_values[c]. */
509 unsigned int *
compute_alpha_unify(const Positions & positions,const unsigned int * alpha_inc) const510 Search::compute_alpha_unify (const Positions& positions, const unsigned int *alpha_inc) const
511 {
512 if (option[UPPERLOWER])
513 {
514 /* Without alpha increments, we would simply unify
515 'A' -> 'a', ..., 'Z' -> 'z'.
516 But when a keyword contains at position i a character c,
517 we have the constraint
518 asso_values[tolower(c) + alpha_inc[i]] ==
519 asso_values[toupper(c) + alpha_inc[i]].
520 This introduces a unification
521 toupper(c) + alpha_inc[i] -> tolower(c) + alpha_inc[i].
522 Note that this unification can extend outside the range of
523 ASCII letters! But still every unified character pair is at
524 a distance of 'a'-'A' = 32, or (after chained unification)
525 at a multiple of 32. So in the end the alpha_unify vector has
526 the form c -> c + 32 * f(c) where f(c) is a nonnegative
527 integer. */
528 unsigned int alpha_size = compute_alpha_size (alpha_inc);
529
530 unsigned int *alpha_unify = new unsigned int[alpha_size];
531 for (unsigned int c = 0; c < alpha_size; c++)
532 alpha_unify[c] = c;
533
534 for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
535 {
536 KeywordExt *keyword = temp->first();
537
538 /* Iterate through the selected character positions. */
539 PositionIterator iter = positions.iterator(keyword->_allchars_length);
540
541 for (int i; (i = iter.next ()) != PositionIterator::EOS; )
542 {
543 unsigned int c;
544 if (i == Positions::LASTCHAR)
545 c = static_cast<unsigned char>(keyword->_allchars[keyword->_allchars_length - 1]);
546 else if (i < keyword->_allchars_length)
547 c = static_cast<unsigned char>(keyword->_allchars[i]);
548 else
549 abort ();
550 if (c >= 'A' && c <= 'Z')
551 c += 'a' - 'A';
552 if (c >= 'a' && c <= 'z')
553 {
554 if (i != Positions::LASTCHAR)
555 c += alpha_inc[i];
556 /* Unify c with c - ('a'-'A'). */
557 unsigned int d = alpha_unify[c];
558 unsigned int b = c - ('a'-'A');
559 for (int a = b; a >= 0 && alpha_unify[a] == b; a -= ('a'-'A'))
560 alpha_unify[a] = d;
561 }
562 }
563 }
564 return alpha_unify;
565 }
566 else
567 /* Identity mapping. */
568 return NULL;
569 }
570
571 /* Initializes each keyword's _selchars array. */
572 void
init_selchars_multiset(const Positions & positions,const unsigned int * alpha_unify,const unsigned int * alpha_inc) const573 Search::init_selchars_multiset (const Positions& positions, const unsigned int *alpha_unify, const unsigned int *alpha_inc) const
574 {
575 for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
576 temp->first()->init_selchars_multiset(positions, alpha_unify, alpha_inc);
577 }
578
579 /* Count the duplicate keywords that occur with the given set of positions
580 and a given alpha_inc[] array.
581 In other words, it returns the difference
582 # K - # proj2 (proj1 (K))
583 where K is the multiset of given keywords. */
584 unsigned int
count_duplicates_multiset(const unsigned int * alpha_inc) const585 Search::count_duplicates_multiset (const unsigned int *alpha_inc) const
586 {
587 /* Run through the keyword list and count the duplicates incrementally.
588 The result does not depend on the order of the keyword list, thanks to
589 the formula above. */
590 unsigned int *alpha_unify = compute_alpha_unify (_key_positions, alpha_inc);
591 init_selchars_multiset (_key_positions, alpha_unify, alpha_inc);
592
593 unsigned int count = 0;
594 {
595 Hash_Table representatives (_total_keys, option[NOLENGTH]);
596 for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
597 {
598 KeywordExt *keyword = temp->first();
599 if (representatives.insert (keyword))
600 count++;
601 }
602 }
603
604 delete_selchars ();
605 delete[] alpha_unify;
606
607 return count;
608 }
609
610 /* Find good _alpha_inc[]. */
611
612 void
find_alpha_inc()613 Search::find_alpha_inc ()
614 {
615 /* The goal is to choose _alpha_inc[] such that it doesn't introduce
616 artificial duplicates.
617 In other words, the goal is # proj2 (proj1 (K)) = # proj1 (K). */
618 unsigned int duplicates_goal = count_duplicates_tuple ();
619
620 /* Start with zero increments. This is sufficient in most cases. */
621 unsigned int *current = new unsigned int [_max_key_len];
622 for (int i = 0; i < _max_key_len; i++)
623 current[i] = 0;
624 unsigned int current_duplicates_count = count_duplicates_multiset (current);
625
626 if (current_duplicates_count > duplicates_goal)
627 {
628 /* Look which _alpha_inc[i] we are free to increment. */
629 unsigned int nindices;
630 {
631 nindices = 0;
632 PositionIterator iter = _key_positions.iterator(_max_key_len);
633 for (;;)
634 {
635 int key_pos = iter.next ();
636 if (key_pos == PositionIterator::EOS)
637 break;
638 if (key_pos != Positions::LASTCHAR)
639 nindices++;
640 }
641 }
642
643 DYNAMIC_ARRAY (indices, unsigned int, nindices);
644 {
645 unsigned int j = 0;
646 PositionIterator iter = _key_positions.iterator(_max_key_len);
647 for (;;)
648 {
649 int key_pos = iter.next ();
650 if (key_pos == PositionIterator::EOS)
651 break;
652 if (key_pos != Positions::LASTCHAR)
653 indices[j++] = key_pos;
654 }
655 if (!(j == nindices))
656 abort ();
657 }
658
659 /* Perform several rounds of searching for a good alpha increment.
660 Each round reduces the number of artificial collisions by adding
661 an increment in a single key position. */
662 DYNAMIC_ARRAY (best, unsigned int, _max_key_len);
663 DYNAMIC_ARRAY (tryal, unsigned int, _max_key_len);
664 do
665 {
666 /* An increment of 1 is not always enough. Try higher increments
667 also. */
668 for (unsigned int inc = 1; ; inc++)
669 {
670 unsigned int best_duplicates_count = UINT_MAX;
671
672 for (unsigned int j = 0; j < nindices; j++)
673 {
674 memcpy (tryal, current, _max_key_len * sizeof (unsigned int));
675 tryal[indices[j]] += inc;
676 unsigned int try_duplicates_count =
677 count_duplicates_multiset (tryal);
678
679 /* We prefer 'try' to 'best' if it produces less
680 duplicates. */
681 if (try_duplicates_count < best_duplicates_count)
682 {
683 memcpy (best, tryal, _max_key_len * sizeof (unsigned int));
684 best_duplicates_count = try_duplicates_count;
685 }
686 }
687
688 /* Stop this round when we got an improvement. */
689 if (best_duplicates_count < current_duplicates_count)
690 {
691 memcpy (current, best, _max_key_len * sizeof (unsigned int));
692 current_duplicates_count = best_duplicates_count;
693 break;
694 }
695 }
696 }
697 while (current_duplicates_count > duplicates_goal);
698 FREE_DYNAMIC_ARRAY (tryal);
699 FREE_DYNAMIC_ARRAY (best);
700
701 if (option[DEBUG])
702 {
703 /* Print the result. */
704 fprintf (stderr, "\nComputed alpha increments: ");
705 bool first = true;
706 for (unsigned int j = nindices; j-- > 0; )
707 if (current[indices[j]] != 0)
708 {
709 if (!first)
710 fprintf (stderr, ", ");
711 fprintf (stderr, "%u:+%u",
712 indices[j] + 1, current[indices[j]]);
713 first = false;
714 }
715 fprintf (stderr, "\n");
716 }
717 FREE_DYNAMIC_ARRAY (indices);
718 }
719
720 _alpha_inc = current;
721 _alpha_size = compute_alpha_size (_alpha_inc);
722 _alpha_unify = compute_alpha_unify (_key_positions, _alpha_inc);
723 }
724
725 /* ======================= Finding good asso_values ======================== */
726
727 /* Initializes the asso_values[] related parameters. */
728
729 void
prepare_asso_values()730 Search::prepare_asso_values ()
731 {
732 KeywordExt_List *temp;
733
734 /* Initialize each keyword's _selchars array. */
735 init_selchars_multiset(_key_positions, _alpha_unify, _alpha_inc);
736
737 /* Compute the maximum _selchars_length over all keywords. */
738 _max_selchars_length = _key_positions.iterator(_max_key_len).remaining();
739
740 /* Check for duplicates, i.e. keywords with the same _selchars array
741 (and - if !option[NOLENGTH] - also the same length).
742 We deal with these by building an equivalence class, so that only
743 1 keyword is representative of the entire collection. Only this
744 representative remains in the keyword list; the others are accessible
745 through the _duplicate_link chain, starting at the representative.
746 This *greatly* simplifies processing during later stages of the program.
747 Set _total_duplicates and _list_len = _total_keys - _total_duplicates. */
748 {
749 _list_len = _total_keys;
750 _total_duplicates = 0;
751 /* Make hash table for efficiency. */
752 Hash_Table representatives (_list_len, option[NOLENGTH]);
753
754 KeywordExt_List *prev = NULL; /* list node before temp */
755 for (temp = _head; temp; )
756 {
757 KeywordExt *keyword = temp->first();
758 KeywordExt *other_keyword = representatives.insert (keyword);
759 KeywordExt_List *garbage = NULL;
760
761 if (other_keyword)
762 {
763 _total_duplicates++;
764 _list_len--;
765 /* Remove keyword from the main list. */
766 prev->rest() = temp->rest();
767 garbage = temp;
768 /* And insert it on other_keyword's duplicate list. */
769 keyword->_duplicate_link = other_keyword->_duplicate_link;
770 other_keyword->_duplicate_link = keyword;
771
772 /* Complain if user hasn't enabled the duplicate option. */
773 if (!option[DUP] || option[DEBUG])
774 {
775 fprintf (stderr, "Key link: \"%.*s\" = \"%.*s\", with key set \"",
776 keyword->_allchars_length, keyword->_allchars,
777 other_keyword->_allchars_length, other_keyword->_allchars);
778 for (int j = 0; j < keyword->_selchars_length; j++)
779 putc (keyword->_selchars[j], stderr);
780 fprintf (stderr, "\".\n");
781 }
782 }
783 else
784 {
785 keyword->_duplicate_link = NULL;
786 prev = temp;
787 }
788 temp = temp->rest();
789 if (garbage)
790 delete garbage;
791 }
792 if (option[DEBUG])
793 representatives.dump();
794 }
795
796 /* Exit program if duplicates exists and option[DUP] not set, since we
797 don't want to continue in this case. (We don't want to turn on
798 option[DUP] implicitly, because the generated code is usually much
799 slower. */
800 if (_total_duplicates)
801 {
802 if (option[DUP])
803 fprintf (stderr, "%d input keys have identical hash values, examine output carefully...\n",
804 _total_duplicates);
805 else
806 {
807 fprintf (stderr, "%d input keys have identical hash values,\n",
808 _total_duplicates);
809 if (option[POSITIONS])
810 fprintf (stderr, "try different key positions or use option -D.\n");
811 else
812 fprintf (stderr, "use option -D.\n");
813 exit (1);
814 }
815 }
816
817 /* Compute the occurrences of each character in the alphabet. */
818 _occurrences = new int[_alpha_size];
819 memset (_occurrences, 0, _alpha_size * sizeof (_occurrences[0]));
820 for (temp = _head; temp; temp = temp->rest())
821 {
822 KeywordExt *keyword = temp->first();
823 const unsigned int *ptr = keyword->_selchars;
824 for (int count = keyword->_selchars_length; count > 0; ptr++, count--)
825 _occurrences[*ptr]++;
826 }
827
828 /* Memory allocation. */
829 _asso_values = new int[_alpha_size];
830
831 int non_linked_length = _list_len;
832 unsigned int asso_value_max;
833
834 asso_value_max =
835 static_cast<unsigned int>(non_linked_length * option.get_size_multiple());
836 /* Round up to the next power of two. This makes it easy to ensure
837 an _asso_value[c] is >= 0 and < asso_value_max. Also, the jump value
838 being odd, it guarantees that Search::try_asso_value() will iterate
839 through different values for _asso_value[c]. */
840 if (asso_value_max == 0)
841 asso_value_max = 1;
842 asso_value_max |= asso_value_max >> 1;
843 asso_value_max |= asso_value_max >> 2;
844 asso_value_max |= asso_value_max >> 4;
845 asso_value_max |= asso_value_max >> 8;
846 asso_value_max |= asso_value_max >> 16;
847 asso_value_max++;
848 _asso_value_max = asso_value_max;
849
850 /* Given the bound for _asso_values[c], we have a bound for the possible
851 hash values, as computed in compute_hash(). */
852 _max_hash_value = (option[NOLENGTH] ? 0 : _max_key_len)
853 + (_asso_value_max - 1) * _max_selchars_length;
854 /* Allocate a sparse bit vector for detection of collisions of hash
855 values. */
856 _collision_detector = new Bool_Array (_max_hash_value + 1);
857
858 if (option[DEBUG])
859 {
860 fprintf (stderr, "total non-linked keys = %d\nmaximum associated value is %d"
861 "\nmaximum size of generated hash table is %d\n",
862 non_linked_length, asso_value_max, _max_hash_value);
863
864 int field_width;
865
866 field_width = 0;
867 {
868 for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
869 {
870 KeywordExt *keyword = temp->first();
871 if (field_width < keyword->_selchars_length)
872 field_width = keyword->_selchars_length;
873 }
874 }
875
876 fprintf (stderr, "\ndumping the keyword list without duplicates\n");
877 fprintf (stderr, "keyword #, %*s, keyword\n", field_width, "keysig");
878 int i = 0;
879 for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
880 {
881 KeywordExt *keyword = temp->first();
882 fprintf (stderr, "%9d, ", ++i);
883 if (field_width > keyword->_selchars_length)
884 fprintf (stderr, "%*s", field_width - keyword->_selchars_length, "");
885 for (int j = 0; j < keyword->_selchars_length; j++)
886 putc (keyword->_selchars[j], stderr);
887 fprintf (stderr, ", %.*s\n",
888 keyword->_allchars_length, keyword->_allchars);
889 }
890 fprintf (stderr, "\nend of keyword list\n\n");
891 }
892
893 if (option[RANDOM] || option.get_jump () == 0)
894 /* We will use rand(), so initialize the random number generator. */
895 srand (static_cast<long>(time (0)));
896
897 _initial_asso_value = (option[RANDOM] ? -1 : option.get_initial_asso_value ());
898 _jump = option.get_jump ();
899 }
900
901 /* Finds some _asso_values[] that fit. */
902
903 /* The idea is to choose the _asso_values[] one by one, in a way that
904 a choice that has been made never needs to be undone later. This
905 means that we split the work into several steps. Each step chooses
906 one or more _asso_values[c]. The result of choosing one or more
907 _asso_values[c] is that the partitioning of the keyword set gets
908 broader.
909 Look at this partitioning: After every step, the _asso_values[] of a
910 certain set C of characters are undetermined. (At the beginning, C
911 is the set of characters c with _occurrences[c] > 0. At the end, C
912 is empty.) To each keyword K, we associate the multiset of _selchars
913 for which the _asso_values[] are undetermined:
914 K --> K->_selchars intersect C.
915 Consider two keywords equivalent if their value under this mapping is
916 the same. This introduces an equivalence relation on the set of
917 keywords. The equivalence classes partition the keyword set. (At the
918 beginning, the partition is the finest possible: each K is an equivalence
919 class by itself, because all K have a different _selchars. At the end,
920 all K have been merged into a single equivalence class.)
921 The partition before a step is always a refinement of the partition
922 after the step.
923 We choose the steps in such a way that the partition really becomes
924 broader at each step. (A step that only chooses an _asso_values[c]
925 without changing the partition is better merged with the previous step,
926 to avoid useless backtracking.) */
927
928 struct EquivalenceClass
929 {
930 /* The keywords in this equivalence class. */
931 KeywordExt_List * _keywords;
932 KeywordExt_List * _keywords_last;
933 /* The number of keywords in this equivalence class. */
934 unsigned int _cardinality;
935 /* The undetermined selected characters for the keywords in this
936 equivalence class, as a canonically reordered multiset. */
937 unsigned int * _undetermined_chars;
938 unsigned int _undetermined_chars_length;
939
940 EquivalenceClass * _next;
941 };
942
943 struct Step
944 {
945 /* The characters whose values are being determined in this step. */
946 unsigned int _changing_count;
947 unsigned int * _changing;
948 /* Exclusive upper bound for the _asso_values[c] of this step.
949 A power of 2. */
950 unsigned int _asso_value_max;
951 /* The characters whose values will be determined after this step. */
952 bool * _undetermined;
953 /* The keyword set partition after this step. */
954 EquivalenceClass * _partition;
955 /* The expected number of iterations in this step. */
956 double _expected_lower;
957 double _expected_upper;
958
959 Step * _next;
960 };
961
962 static inline bool
equals(const unsigned int * ptr1,const unsigned int * ptr2,unsigned int len)963 equals (const unsigned int *ptr1, const unsigned int *ptr2, unsigned int len)
964 {
965 while (len > 0)
966 {
967 if (*ptr1 != *ptr2)
968 return false;
969 ptr1++;
970 ptr2++;
971 len--;
972 }
973 return true;
974 }
975
976 EquivalenceClass *
compute_partition(bool * undetermined) const977 Search::compute_partition (bool *undetermined) const
978 {
979 EquivalenceClass *partition = NULL;
980 EquivalenceClass *partition_last = NULL;
981 for (KeywordExt_List *temp = _head; temp; temp = temp->rest())
982 {
983 KeywordExt *keyword = temp->first();
984
985 /* Compute the undetermined characters for this keyword. */
986 unsigned int *undetermined_chars =
987 new unsigned int[keyword->_selchars_length];
988 unsigned int undetermined_chars_length = 0;
989
990 for (int i = 0; i < keyword->_selchars_length; i++)
991 if (undetermined[keyword->_selchars[i]])
992 undetermined_chars[undetermined_chars_length++] = keyword->_selchars[i];
993
994 /* Look up the equivalence class to which this keyword belongs. */
995 EquivalenceClass *equclass;
996 for (equclass = partition; equclass; equclass = equclass->_next)
997 if (equclass->_undetermined_chars_length == undetermined_chars_length
998 && equals (equclass->_undetermined_chars, undetermined_chars,
999 undetermined_chars_length))
1000 break;
1001 if (equclass == NULL)
1002 {
1003 equclass = new EquivalenceClass();
1004 equclass->_keywords = NULL;
1005 equclass->_keywords_last = NULL;
1006 equclass->_cardinality = 0;
1007 equclass->_undetermined_chars = undetermined_chars;
1008 equclass->_undetermined_chars_length = undetermined_chars_length;
1009 equclass->_next = NULL;
1010 if (partition)
1011 partition_last->_next = equclass;
1012 else
1013 partition = equclass;
1014 partition_last = equclass;
1015 }
1016 else
1017 delete[] undetermined_chars;
1018
1019 /* Add the keyword to the equivalence class. */
1020 KeywordExt_List *cons = new KeywordExt_List(keyword);
1021 if (equclass->_keywords)
1022 equclass->_keywords_last->rest() = cons;
1023 else
1024 equclass->_keywords = cons;
1025 equclass->_keywords_last = cons;
1026 equclass->_cardinality++;
1027 }
1028
1029 /* Free some of the allocated memory. The caller doesn't need it. */
1030 for (EquivalenceClass *cls = partition; cls; cls = cls->_next)
1031 delete[] cls->_undetermined_chars;
1032
1033 return partition;
1034 }
1035
1036 static void
delete_partition(EquivalenceClass * partition)1037 delete_partition (EquivalenceClass *partition)
1038 {
1039 while (partition != NULL)
1040 {
1041 EquivalenceClass *equclass = partition;
1042 partition = equclass->_next;
1043 delete_list (equclass->_keywords);
1044 //delete[] equclass->_undetermined_chars; // already freed above
1045 delete equclass;
1046 }
1047 }
1048
1049 /* Compute the possible number of collisions when _asso_values[c] is
1050 chosen, leading to the given partition. */
1051 unsigned int
count_possible_collisions(EquivalenceClass * partition,unsigned int c) const1052 Search::count_possible_collisions (EquivalenceClass *partition, unsigned int c) const
1053 {
1054 /* Every equivalence class p is split according to the frequency of
1055 occurrence of c, leading to equivalence classes p1, p2, ...
1056 This leads to (|p|^2 - |p1|^2 - |p2|^2 - ...)/2 possible collisions.
1057 Return the sum of this expression over all equivalence classes. */
1058 unsigned int sum = 0;
1059 unsigned int m = _max_selchars_length;
1060 DYNAMIC_ARRAY (split_cardinalities, unsigned int, m + 1);
1061 for (EquivalenceClass *cls = partition; cls; cls = cls->_next)
1062 {
1063 for (unsigned int i = 0; i <= m; i++)
1064 split_cardinalities[i] = 0;
1065
1066 for (KeywordExt_List *temp = cls->_keywords; temp; temp = temp->rest())
1067 {
1068 KeywordExt *keyword = temp->first();
1069
1070 unsigned int count = 0;
1071 for (int i = 0; i < keyword->_selchars_length; i++)
1072 if (keyword->_selchars[i] == c)
1073 count++;
1074
1075 split_cardinalities[count]++;
1076 }
1077
1078 sum += cls->_cardinality * cls->_cardinality;
1079 for (unsigned int i = 0; i <= m; i++)
1080 sum -= split_cardinalities[i] * split_cardinalities[i];
1081 }
1082 FREE_DYNAMIC_ARRAY (split_cardinalities);
1083 return sum;
1084 }
1085
1086 /* Test whether adding c to the undetermined characters changes the given
1087 partition. */
1088 bool
unchanged_partition(EquivalenceClass * partition,unsigned int c) const1089 Search::unchanged_partition (EquivalenceClass *partition, unsigned int c) const
1090 {
1091 for (EquivalenceClass *cls = partition; cls; cls = cls->_next)
1092 {
1093 unsigned int first_count = UINT_MAX;
1094
1095 for (KeywordExt_List *temp = cls->_keywords; temp; temp = temp->rest())
1096 {
1097 KeywordExt *keyword = temp->first();
1098
1099 unsigned int count = 0;
1100 for (int i = 0; i < keyword->_selchars_length; i++)
1101 if (keyword->_selchars[i] == c)
1102 count++;
1103
1104 if (temp == cls->_keywords)
1105 first_count = count;
1106 else if (count != first_count)
1107 /* c would split this equivalence class. */
1108 return false;
1109 }
1110 }
1111 return true;
1112 }
1113
1114 void
find_asso_values()1115 Search::find_asso_values ()
1116 {
1117 Step *steps;
1118
1119 /* Determine the steps, starting with the last one. */
1120 {
1121 bool *undetermined;
1122 bool *determined;
1123
1124 steps = NULL;
1125
1126 undetermined = new bool[_alpha_size];
1127 for (unsigned int c = 0; c < _alpha_size; c++)
1128 undetermined[c] = false;
1129
1130 determined = new bool[_alpha_size];
1131 for (unsigned int c = 0; c < _alpha_size; c++)
1132 determined[c] = true;
1133
1134 for (;;)
1135 {
1136 /* Compute the partition that needs to be refined. */
1137 EquivalenceClass *partition = compute_partition (undetermined);
1138
1139 /* Determine the main character to be chosen in this step.
1140 Choosing such a character c has the effect of splitting every
1141 equivalence class (according the the frequency of occurrence of c).
1142 We choose the c with the minimum number of possible collisions,
1143 so that characters which lead to a large number of collisions get
1144 handled early during the search. */
1145 unsigned int chosen_c;
1146 unsigned int chosen_possible_collisions;
1147 {
1148 unsigned int best_c = 0;
1149 unsigned int best_possible_collisions = UINT_MAX;
1150 for (unsigned int c = 0; c < _alpha_size; c++)
1151 if (_occurrences[c] > 0 && determined[c])
1152 {
1153 unsigned int possible_collisions =
1154 count_possible_collisions (partition, c);
1155 if (possible_collisions < best_possible_collisions)
1156 {
1157 best_c = c;
1158 best_possible_collisions = possible_collisions;
1159 }
1160 }
1161 if (best_possible_collisions == UINT_MAX)
1162 {
1163 /* All c with _occurrences[c] > 0 are undetermined. We are
1164 are the starting situation and don't need any more step. */
1165 delete_partition (partition);
1166 break;
1167 }
1168 chosen_c = best_c;
1169 chosen_possible_collisions = best_possible_collisions;
1170 }
1171
1172 /* We need one more step. */
1173 Step *step = new Step();
1174
1175 step->_undetermined = new bool[_alpha_size];
1176 memcpy (step->_undetermined, undetermined, _alpha_size*sizeof(bool));
1177
1178 step->_partition = partition;
1179
1180 /* Now determine how the equivalence classes will be before this
1181 step. */
1182 undetermined[chosen_c] = true;
1183 partition = compute_partition (undetermined);
1184
1185 /* Now determine which other characters should be determined in this
1186 step, because they will not change the equivalence classes at
1187 this point. It is the set of all c which, for all equivalence
1188 classes, have the same frequency of occurrence in every keyword
1189 of the equivalence class. */
1190 for (unsigned int c = 0; c < _alpha_size; c++)
1191 if (_occurrences[c] > 0 && determined[c]
1192 && unchanged_partition (partition, c))
1193 {
1194 undetermined[c] = true;
1195 determined[c] = false;
1196 }
1197
1198 /* main_c must be one of these. */
1199 if (determined[chosen_c])
1200 abort ();
1201
1202 /* Now the set of changing characters of this step. */
1203 unsigned int changing_count;
1204
1205 changing_count = 0;
1206 for (unsigned int c = 0; c < _alpha_size; c++)
1207 if (undetermined[c] && !step->_undetermined[c])
1208 changing_count++;
1209
1210 unsigned int *changing = new unsigned int[changing_count];
1211 changing_count = 0;
1212 for (unsigned int c = 0; c < _alpha_size; c++)
1213 if (undetermined[c] && !step->_undetermined[c])
1214 changing[changing_count++] = c;
1215
1216 step->_changing = changing;
1217 step->_changing_count = changing_count;
1218
1219 step->_asso_value_max = _asso_value_max;
1220
1221 step->_expected_lower =
1222 exp (static_cast<double>(chosen_possible_collisions)
1223 / static_cast<double>(_max_hash_value));
1224 step->_expected_upper =
1225 exp (static_cast<double>(chosen_possible_collisions)
1226 / static_cast<double>(_asso_value_max));
1227
1228 delete_partition (partition);
1229
1230 step->_next = steps;
1231 steps = step;
1232 }
1233
1234 delete[] determined;
1235 delete[] undetermined;
1236 }
1237
1238 if (option[DEBUG])
1239 {
1240 unsigned int stepno = 0;
1241 for (Step *step = steps; step; step = step->_next)
1242 {
1243 stepno++;
1244 fprintf (stderr, "Step %u chooses _asso_values[", stepno);
1245 for (unsigned int i = 0; i < step->_changing_count; i++)
1246 {
1247 if (i > 0)
1248 fprintf (stderr, ",");
1249 fprintf (stderr, "'%c'", step->_changing[i]);
1250 }
1251 fprintf (stderr, "], expected number of iterations between %g and %g.\n",
1252 step->_expected_lower, step->_expected_upper);
1253 fprintf (stderr, "Keyword equivalence classes:\n");
1254 for (EquivalenceClass *cls = step->_partition; cls; cls = cls->_next)
1255 {
1256 fprintf (stderr, "\n");
1257 for (KeywordExt_List *temp = cls->_keywords; temp; temp = temp->rest())
1258 {
1259 KeywordExt *keyword = temp->first();
1260 fprintf (stderr, " %.*s\n",
1261 keyword->_allchars_length, keyword->_allchars);
1262 }
1263 }
1264 fprintf (stderr, "\n");
1265 }
1266 }
1267
1268 /* Initialize _asso_values[]. (The value given here matters only
1269 for those c which occur in all keywords with equal multiplicity.) */
1270 for (unsigned int c = 0; c < _alpha_size; c++)
1271 _asso_values[c] = 0;
1272
1273 unsigned int stepno = 0;
1274 for (Step *step = steps; step; step = step->_next)
1275 {
1276 stepno++;
1277
1278 /* Initialize the asso_values[]. */
1279 unsigned int k = step->_changing_count;
1280 for (unsigned int i = 0; i < k; i++)
1281 {
1282 unsigned int c = step->_changing[i];
1283 _asso_values[c] =
1284 (_initial_asso_value < 0 ? rand () : _initial_asso_value)
1285 & (step->_asso_value_max - 1);
1286 }
1287
1288 unsigned int iterations = 0;
1289 DYNAMIC_ARRAY (iter, unsigned int, k);
1290 for (unsigned int i = 0; i < k; i++)
1291 iter[i] = 0;
1292 unsigned int ii = (_jump != 0 ? k - 1 : 0);
1293
1294 for (;;)
1295 {
1296 /* Test whether these asso_values[] lead to collisions among
1297 the equivalence classes that should be collision-free. */
1298 bool has_collision = false;
1299 for (EquivalenceClass *cls = step->_partition; cls; cls = cls->_next)
1300 {
1301 /* Iteration Number array is a win, O(1) initialization time! */
1302 _collision_detector->clear ();
1303
1304 for (KeywordExt_List *ptr = cls->_keywords; ptr; ptr = ptr->rest())
1305 {
1306 KeywordExt *keyword = ptr->first();
1307
1308 /* Compute the new hash code for the keyword, leaving apart
1309 the yet undetermined asso_values[]. */
1310 int hashcode;
1311 {
1312 int sum = option[NOLENGTH] ? 0 : keyword->_allchars_length;
1313 const unsigned int *p = keyword->_selchars;
1314 int i = keyword->_selchars_length;
1315 for (; i > 0; p++, i--)
1316 if (!step->_undetermined[*p])
1317 sum += _asso_values[*p];
1318 hashcode = sum;
1319 }
1320
1321 /* See whether it collides with another keyword's hash code,
1322 from the same equivalence class. */
1323 if (_collision_detector->set_bit (hashcode))
1324 {
1325 has_collision = true;
1326 break;
1327 }
1328 }
1329
1330 /* Don't need to continue looking at the other equivalence
1331 classes if we already have found a collision. */
1332 if (has_collision)
1333 break;
1334 }
1335
1336 iterations++;
1337 if (!has_collision)
1338 break;
1339
1340 /* Try other asso_values[]. */
1341 if (_jump != 0)
1342 {
1343 /* The way we try various values for
1344 asso_values[step->_changing[0],...step->_changing[k-1]]
1345 is like this:
1346 for (bound = 0,1,...)
1347 for (ii = 0,...,k-1)
1348 iter[ii] := bound
1349 iter[0..ii-1] := values <= bound
1350 iter[ii+1..k-1] := values < bound
1351 and
1352 asso_values[step->_changing[i]] =
1353 _initial_asso_value + iter[i] * _jump.
1354 This makes it more likely to find small asso_values[].
1355 */
1356 unsigned int bound = iter[ii];
1357 unsigned int i = 0;
1358 while (i < ii)
1359 {
1360 unsigned int c = step->_changing[i];
1361 iter[i]++;
1362 _asso_values[c] =
1363 (_asso_values[c] + _jump) & (step->_asso_value_max - 1);
1364 if (iter[i] <= bound)
1365 goto found_next;
1366 _asso_values[c] =
1367 (_asso_values[c] - iter[i] * _jump)
1368 & (step->_asso_value_max - 1);
1369 iter[i] = 0;
1370 i++;
1371 }
1372 i = ii + 1;
1373 while (i < k)
1374 {
1375 unsigned int c = step->_changing[i];
1376 iter[i]++;
1377 _asso_values[c] =
1378 (_asso_values[c] + _jump) & (step->_asso_value_max - 1);
1379 if (iter[i] < bound)
1380 goto found_next;
1381 _asso_values[c] =
1382 (_asso_values[c] - iter[i] * _jump)
1383 & (step->_asso_value_max - 1);
1384 iter[i] = 0;
1385 i++;
1386 }
1387 /* Switch from one ii to the next. */
1388 {
1389 unsigned int c = step->_changing[ii];
1390 _asso_values[c] =
1391 (_asso_values[c] - bound * _jump)
1392 & (step->_asso_value_max - 1);
1393 iter[ii] = 0;
1394 }
1395 /* Here all iter[i] == 0. */
1396 ii++;
1397 if (ii == k)
1398 {
1399 ii = 0;
1400 bound++;
1401 if (bound == step->_asso_value_max)
1402 {
1403 /* Out of search space! We can either backtrack, or
1404 increase the available search space of this step.
1405 It seems simpler to choose the latter solution. */
1406 step->_asso_value_max = 2 * step->_asso_value_max;
1407 if (step->_asso_value_max > _asso_value_max)
1408 {
1409 _asso_value_max = step->_asso_value_max;
1410 /* Reinitialize _max_hash_value. */
1411 _max_hash_value =
1412 (option[NOLENGTH] ? 0 : _max_key_len)
1413 + (_asso_value_max - 1) * _max_selchars_length;
1414 /* Reinitialize _collision_detector. */
1415 delete _collision_detector;
1416 _collision_detector =
1417 new Bool_Array (_max_hash_value + 1);
1418 }
1419 }
1420 }
1421 {
1422 unsigned int c = step->_changing[ii];
1423 iter[ii] = bound;
1424 _asso_values[c] =
1425 (_asso_values[c] + bound * _jump)
1426 & (step->_asso_value_max - 1);
1427 }
1428 found_next: ;
1429 }
1430 else
1431 {
1432 /* Random. */
1433 unsigned int c = step->_changing[ii];
1434 _asso_values[c] =
1435 (_asso_values[c] + rand ()) & (step->_asso_value_max - 1);
1436 /* Next time, change the next c. */
1437 ii++;
1438 if (ii == k)
1439 ii = 0;
1440 }
1441 }
1442 FREE_DYNAMIC_ARRAY (iter);
1443
1444 if (option[DEBUG])
1445 {
1446 fprintf (stderr, "Step %u chose _asso_values[", stepno);
1447 for (unsigned int i = 0; i < step->_changing_count; i++)
1448 {
1449 if (i > 0)
1450 fprintf (stderr, ",");
1451 fprintf (stderr, "'%c'", step->_changing[i]);
1452 }
1453 fprintf (stderr, "] in %u iterations.\n", iterations);
1454 }
1455 }
1456
1457 /* Free allocated memory. */
1458 while (steps != NULL)
1459 {
1460 Step *step = steps;
1461 steps = step->_next;
1462 delete[] step->_changing;
1463 delete[] step->_undetermined;
1464 delete_partition (step->_partition);
1465 delete step;
1466 }
1467 }
1468
1469 /* Computes a keyword's hash value, relative to the current _asso_values[],
1470 and stores it in keyword->_hash_value. */
1471
1472 inline int
compute_hash(KeywordExt * keyword) const1473 Search::compute_hash (KeywordExt *keyword) const
1474 {
1475 int sum = option[NOLENGTH] ? 0 : keyword->_allchars_length;
1476
1477 const unsigned int *p = keyword->_selchars;
1478 int i = keyword->_selchars_length;
1479 for (; i > 0; p++, i--)
1480 sum += _asso_values[*p];
1481
1482 return keyword->_hash_value = sum;
1483 }
1484
1485 /* Finds good _asso_values[]. */
1486
1487 void
find_good_asso_values()1488 Search::find_good_asso_values ()
1489 {
1490 prepare_asso_values ();
1491
1492 /* Search for good _asso_values[]. */
1493 int asso_iteration;
1494 if ((asso_iteration = option.get_asso_iterations ()) == 0)
1495 /* Try only the given _initial_asso_value and _jump. */
1496 find_asso_values ();
1497 else
1498 {
1499 /* Try different pairs of _initial_asso_value and _jump, in the
1500 following order:
1501 (0, 1)
1502 (1, 1)
1503 (2, 1) (0, 3)
1504 (3, 1) (1, 3)
1505 (4, 1) (2, 3) (0, 5)
1506 (5, 1) (3, 3) (1, 5)
1507 ..... */
1508 KeywordExt_List *saved_head = _head;
1509 int best_initial_asso_value = 0;
1510 int best_jump = 1;
1511 int *best_asso_values = new int[_alpha_size];
1512 int best_collisions = INT_MAX;
1513 int best_max_hash_value = INT_MAX;
1514
1515 _initial_asso_value = 0; _jump = 1;
1516 for (;;)
1517 {
1518 /* Restore the keyword list in its original order. */
1519 _head = copy_list (saved_head);
1520 /* Find good _asso_values[]. */
1521 find_asso_values ();
1522 /* Test whether it is the best solution so far. */
1523 int collisions = 0;
1524 int max_hash_value = INT_MIN;
1525 _collision_detector->clear ();
1526 for (KeywordExt_List *ptr = _head; ptr; ptr = ptr->rest())
1527 {
1528 KeywordExt *keyword = ptr->first();
1529 int hashcode = compute_hash (keyword);
1530 if (max_hash_value < hashcode)
1531 max_hash_value = hashcode;
1532 if (_collision_detector->set_bit (hashcode))
1533 collisions++;
1534 }
1535 if (collisions < best_collisions
1536 || (collisions == best_collisions
1537 && max_hash_value < best_max_hash_value))
1538 {
1539 memcpy (best_asso_values, _asso_values,
1540 _alpha_size * sizeof (_asso_values[0]));
1541 best_collisions = collisions;
1542 best_max_hash_value = max_hash_value;
1543 }
1544 /* Delete the copied keyword list. */
1545 delete_list (_head);
1546
1547 if (--asso_iteration == 0)
1548 break;
1549 /* Prepare for next iteration. */
1550 if (_initial_asso_value >= 2)
1551 _initial_asso_value -= 2, _jump += 2;
1552 else
1553 _initial_asso_value += _jump, _jump = 1;
1554 }
1555 _head = saved_head;
1556 /* Install the best found asso_values. */
1557 _initial_asso_value = best_initial_asso_value;
1558 _jump = best_jump;
1559 memcpy (_asso_values, best_asso_values,
1560 _alpha_size * sizeof (_asso_values[0]));
1561 delete[] best_asso_values;
1562 /* The keywords' _hash_value fields are recomputed below. */
1563 }
1564 }
1565
1566 /* ========================================================================= */
1567
1568 /* Comparison function for sorting by increasing _hash_value. */
1569 static bool
less_by_hash_value(KeywordExt * keyword1,KeywordExt * keyword2)1570 less_by_hash_value (KeywordExt *keyword1, KeywordExt *keyword2)
1571 {
1572 return keyword1->_hash_value < keyword2->_hash_value;
1573 }
1574
1575 /* Sorts the keyword list by hash value. */
1576
1577 void
sort()1578 Search::sort ()
1579 {
1580 _head = mergesort_list (_head, less_by_hash_value);
1581 }
1582
1583 void
optimize()1584 Search::optimize ()
1585 {
1586 /* Preparations. */
1587 prepare ();
1588
1589 /* Step 1: Finding good byte positions. */
1590 find_positions ();
1591
1592 /* Step 2: Finding good alpha increments. */
1593 find_alpha_inc ();
1594
1595 /* Step 3: Finding good asso_values. */
1596 find_good_asso_values ();
1597
1598 /* Make one final check, just to make sure nothing weird happened.... */
1599 _collision_detector->clear ();
1600 for (KeywordExt_List *curr_ptr = _head; curr_ptr; curr_ptr = curr_ptr->rest())
1601 {
1602 KeywordExt *curr = curr_ptr->first();
1603 unsigned int hashcode = compute_hash (curr);
1604 if (_collision_detector->set_bit (hashcode))
1605 {
1606 /* This shouldn't happen. proj1, proj2, proj3 must have been
1607 computed to be injective on the given keyword set. */
1608 fprintf (stderr,
1609 "\nInternal error, unexpected duplicate hash code\n");
1610 if (option[POSITIONS])
1611 fprintf (stderr, "try options -m or -r, or use new key positions.\n\n");
1612 else
1613 fprintf (stderr, "try options -m or -r.\n\n");
1614 exit (1);
1615 }
1616 }
1617
1618 /* Sorts the keyword list by hash value. */
1619 sort ();
1620
1621 /* Set unused asso_values[c] to max_hash_value + 1. This is not absolutely
1622 necessary, but speeds up the lookup function in many cases of lookup
1623 failure: no string comparison is needed once the hash value of a string
1624 is larger than the hash value of any keyword. */
1625 int max_hash_value;
1626 {
1627 KeywordExt_List *temp;
1628 for (temp = _head; temp->rest(); temp = temp->rest())
1629 ;
1630 max_hash_value = temp->first()->_hash_value;
1631 }
1632 for (unsigned int c = 0; c < _alpha_size; c++)
1633 if (_occurrences[c] == 0)
1634 _asso_values[c] = max_hash_value + 1;
1635
1636 /* Propagate unified asso_values. */
1637 if (_alpha_unify)
1638 for (unsigned int c = 0; c < _alpha_size; c++)
1639 if (_alpha_unify[c] != c)
1640 _asso_values[c] = _asso_values[_alpha_unify[c]];
1641 }
1642
1643 /* Prints out some diagnostics upon completion. */
1644
~Search()1645 Search::~Search ()
1646 {
1647 delete _collision_detector;
1648 if (option[DEBUG])
1649 {
1650 fprintf (stderr, "\ndumping occurrence and associated values tables\n");
1651
1652 for (unsigned int i = 0; i < _alpha_size; i++)
1653 if (_occurrences[i])
1654 fprintf (stderr, "asso_values[%c] = %6d, occurrences[%c] = %6d\n",
1655 i, _asso_values[i], i, _occurrences[i]);
1656
1657 fprintf (stderr, "end table dumping\n");
1658
1659 fprintf (stderr, "\nDumping key list information:\ntotal non-static linked keywords = %d"
1660 "\ntotal keywords = %d\ntotal duplicates = %d\nmaximum key length = %d\n",
1661 _list_len, _total_keys, _total_duplicates, _max_key_len);
1662
1663 int field_width = _max_selchars_length;
1664 fprintf (stderr, "\nList contents are:\n(hash value, key length, index, %*s, keyword):\n",
1665 field_width, "selchars");
1666 for (KeywordExt_List *ptr = _head; ptr; ptr = ptr->rest())
1667 {
1668 fprintf (stderr, "%11d,%11d,%6d, ",
1669 ptr->first()->_hash_value, ptr->first()->_allchars_length, ptr->first()->_final_index);
1670 if (field_width > ptr->first()->_selchars_length)
1671 fprintf (stderr, "%*s", field_width - ptr->first()->_selchars_length, "");
1672 for (int j = 0; j < ptr->first()->_selchars_length; j++)
1673 putc (ptr->first()->_selchars[j], stderr);
1674 fprintf (stderr, ", %.*s\n",
1675 ptr->first()->_allchars_length, ptr->first()->_allchars);
1676 }
1677
1678 fprintf (stderr, "End dumping list.\n\n");
1679 }
1680 delete[] _asso_values;
1681 delete[] _occurrences;
1682 delete[] _alpha_unify;
1683 delete[] _alpha_inc;
1684 }
1685