xref: /NextBSD/contrib/groff/src/preproc/refer/label.y (revision eb1a5f8de9f7ea602c373a710f531abbf81141c4)
1 /* -*- C++ -*-
2    Copyright (C) 1989, 1990, 1991, 1992, 2000, 2004
3    Free Software Foundation, Inc.
4      Written by James Clark (jjc@jclark.com)
5 
6 This file is part of groff.
7 
8 groff is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 2, or (at your option) any later
11 version.
12 
13 groff is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
16 for more details.
17 
18 You should have received a copy of the GNU General Public License along
19 with groff; see the file COPYING.  If not, write to the Free Software
20 Foundation, 51 Franklin St - Fifth Floor, Boston, MA 02110-1301, USA. */
21 
22 %{
23 
24 #include "refer.h"
25 #include "refid.h"
26 #include "ref.h"
27 #include "token.h"
28 
29 int yylex();
30 void yyerror(const char *);
31 int yyparse();
32 
33 static const char *format_serial(char c, int n);
34 
35 struct label_info {
36   int start;
37   int length;
38   int count;
39   int total;
40   label_info(const string &);
41 };
42 
43 label_info *lookup_label(const string &label);
44 
45 struct expression {
46   enum {
47     // Does the tentative label depend on the reference?
48     CONTAINS_VARIABLE = 01,
49     CONTAINS_STAR = 02,
50     CONTAINS_FORMAT = 04,
51     CONTAINS_AT = 010
52   };
~expressionexpression53   virtual ~expression() { }
54   virtual void evaluate(int, const reference &, string &,
55 			substring_position &) = 0;
analyzeexpression56   virtual unsigned analyze() { return 0; }
57 };
58 
59 class at_expr : public expression {
60 public:
at_expr()61   at_expr() { }
62   void evaluate(int, const reference &, string &, substring_position &);
analyze()63   unsigned analyze() { return CONTAINS_VARIABLE|CONTAINS_AT; }
64 };
65 
66 class format_expr : public expression {
67   char type;
68   int width;
69   int first_number;
70 public:
71   format_expr(char c, int w = 0, int f = 1)
type(c)72     : type(c), width(w), first_number(f) { }
73   void evaluate(int, const reference &, string &, substring_position &);
analyze()74   unsigned analyze() { return CONTAINS_FORMAT; }
75 };
76 
77 class field_expr : public expression {
78   int number;
79   char name;
80 public:
field_expr(char nm,int num)81   field_expr(char nm, int num) : number(num), name(nm) { }
82   void evaluate(int, const reference &, string &, substring_position &);
analyze()83   unsigned analyze() { return CONTAINS_VARIABLE; }
84 };
85 
86 class literal_expr : public expression {
87   string s;
88 public:
literal_expr(const char * ptr,int len)89   literal_expr(const char *ptr, int len) : s(ptr, len) { }
90   void evaluate(int, const reference &, string &, substring_position &);
91 };
92 
93 class unary_expr : public expression {
94 protected:
95   expression *expr;
96 public:
unary_expr(expression * e)97   unary_expr(expression *e) : expr(e) { }
~unary_expr()98   ~unary_expr() { delete expr; }
99   void evaluate(int, const reference &, string &, substring_position &) = 0;
analyze()100   unsigned analyze() { return expr ? expr->analyze() : 0; }
101 };
102 
103 // This caches the analysis of an expression.
104 
105 class analyzed_expr : public unary_expr {
106   unsigned flags;
107 public:
108   analyzed_expr(expression *);
109   void evaluate(int, const reference &, string &, substring_position &);
analyze()110   unsigned analyze() { return flags; }
111 };
112 
113 class star_expr : public unary_expr {
114 public:
star_expr(expression * e)115   star_expr(expression *e) : unary_expr(e) { }
116   void evaluate(int, const reference &, string &, substring_position &);
analyze()117   unsigned analyze() {
118     return ((expr ? (expr->analyze() & ~CONTAINS_VARIABLE) : 0)
119 	    | CONTAINS_STAR);
120   }
121 };
122 
123 typedef void map_func(const char *, const char *, string &);
124 
125 class map_expr : public unary_expr {
126   map_func *func;
127 public:
map_expr(expression * e,map_func * f)128   map_expr(expression *e, map_func *f) : unary_expr(e), func(f) { }
129   void evaluate(int, const reference &, string &, substring_position &);
130 };
131 
132 typedef const char *extractor_func(const char *, const char *, const char **);
133 
134 class extractor_expr : public unary_expr {
135   int part;
136   extractor_func *func;
137 public:
138   enum { BEFORE = +1, MATCH = 0, AFTER = -1 };
extractor_expr(expression * e,extractor_func * f,int pt)139   extractor_expr(expression *e, extractor_func *f, int pt)
140     : unary_expr(e), part(pt), func(f) { }
141   void evaluate(int, const reference &, string &, substring_position &);
142 };
143 
144 class truncate_expr : public unary_expr {
145   int n;
146 public:
truncate_expr(expression * e,int i)147   truncate_expr(expression *e, int i) : unary_expr(e), n(i) { }
148   void evaluate(int, const reference &, string &, substring_position &);
149 };
150 
151 class separator_expr : public unary_expr {
152 public:
separator_expr(expression * e)153   separator_expr(expression *e) : unary_expr(e) { }
154   void evaluate(int, const reference &, string &, substring_position &);
155 };
156 
157 class binary_expr : public expression {
158 protected:
159   expression *expr1;
160   expression *expr2;
161 public:
binary_expr(expression * e1,expression * e2)162   binary_expr(expression *e1, expression *e2) : expr1(e1), expr2(e2) { }
~binary_expr()163   ~binary_expr() { delete expr1; delete expr2; }
164   void evaluate(int, const reference &, string &, substring_position &) = 0;
analyze()165   unsigned analyze() {
166     return (expr1 ? expr1->analyze() : 0) | (expr2 ? expr2->analyze() : 0);
167   }
168 };
169 
170 class alternative_expr : public binary_expr {
171 public:
alternative_expr(expression * e1,expression * e2)172   alternative_expr(expression *e1, expression *e2) : binary_expr(e1, e2) { }
173   void evaluate(int, const reference &, string &, substring_position &);
174 };
175 
176 class list_expr : public binary_expr {
177 public:
list_expr(expression * e1,expression * e2)178   list_expr(expression *e1, expression *e2) : binary_expr(e1, e2) { }
179   void evaluate(int, const reference &, string &, substring_position &);
180 };
181 
182 class substitute_expr : public binary_expr {
183 public:
substitute_expr(expression * e1,expression * e2)184   substitute_expr(expression *e1, expression *e2) : binary_expr(e1, e2) { }
185   void evaluate(int, const reference &, string &, substring_position &);
186 };
187 
188 class ternary_expr : public expression {
189 protected:
190   expression *expr1;
191   expression *expr2;
192   expression *expr3;
193 public:
ternary_expr(expression * e1,expression * e2,expression * e3)194   ternary_expr(expression *e1, expression *e2, expression *e3)
195     : expr1(e1), expr2(e2), expr3(e3) { }
~ternary_expr()196   ~ternary_expr() { delete expr1; delete expr2; delete expr3; }
197   void evaluate(int, const reference &, string &, substring_position &) = 0;
analyze()198   unsigned analyze() {
199     return ((expr1 ? expr1->analyze() : 0)
200 	    | (expr2 ? expr2->analyze() : 0)
201 	    | (expr3 ? expr3->analyze() : 0));
202   }
203 };
204 
205 class conditional_expr : public ternary_expr {
206 public:
conditional_expr(expression * e1,expression * e2,expression * e3)207   conditional_expr(expression *e1, expression *e2, expression *e3)
208     : ternary_expr(e1, e2, e3) { }
209   void evaluate(int, const reference &, string &, substring_position &);
210 };
211 
212 static expression *parsed_label = 0;
213 static expression *parsed_date_label = 0;
214 static expression *parsed_short_label = 0;
215 
216 static expression *parse_result;
217 
218 string literals;
219 
220 %}
221 
222 %union {
223   int num;
224   expression *expr;
225   struct { int ndigits; int val; } dig;
226   struct { int start; int len; } str;
227 }
228 
229 /* uppercase or lowercase letter */
230 %token <num> TOKEN_LETTER
231 /* literal characters */
232 %token <str> TOKEN_LITERAL
233 /* digit */
234 %token <num> TOKEN_DIGIT
235 
236 %type <expr> conditional
237 %type <expr> alternative
238 %type <expr> list
239 %type <expr> string
240 %type <expr> substitute
241 %type <expr> optional_conditional
242 %type <num> number
243 %type <dig> digits
244 %type <num> optional_number
245 %type <num> flag
246 
247 %%
248 
249 expr:
250 	optional_conditional
251 		{ parse_result = ($1 ? new analyzed_expr($1) : 0); }
252 	;
253 
254 conditional:
255 	alternative
256 		{ $$ = $1; }
257 	| alternative '?' optional_conditional ':' conditional
258 		{ $$ = new conditional_expr($1, $3, $5); }
259 	;
260 
261 optional_conditional:
262 	/* empty */
263 		{ $$ = 0; }
264 	| conditional
265 		{ $$ = $1; }
266 	;
267 
268 alternative:
269 	list
270 		{ $$ = $1; }
271 	| alternative '|' list
272 		{ $$ = new alternative_expr($1, $3); }
273 	| alternative '&' list
274 		{ $$ = new conditional_expr($1, $3, 0); }
275 	;
276 
277 list:
278 	substitute
279 		{ $$ = $1; }
280 	| list substitute
281 		{ $$ = new list_expr($1, $2); }
282 	;
283 
284 substitute:
285 	string
286 		{ $$ = $1; }
287 	| substitute '~' string
288 		{ $$ = new substitute_expr($1, $3); }
289 	;
290 
291 string:
292 	'@'
293 		{ $$ = new at_expr; }
294 	| TOKEN_LITERAL
295 		{
296 		  $$ = new literal_expr(literals.contents() + $1.start,
297 					$1.len);
298 		}
299 	| TOKEN_LETTER
300 		{ $$ = new field_expr($1, 0); }
301 	| TOKEN_LETTER number
302 		{ $$ = new field_expr($1, $2 - 1); }
303 	| '%' TOKEN_LETTER
304 		{
305 		  switch ($2) {
306 		  case 'I':
307 		  case 'i':
308 		  case 'A':
309 		  case 'a':
310 		    $$ = new format_expr($2);
311 		    break;
312 		  default:
313 		    command_error("unrecognized format `%1'", char($2));
314 		    $$ = new format_expr('a');
315 		    break;
316 		  }
317 		}
318 
319 	| '%' digits
320 		{
321 		  $$ = new format_expr('0', $2.ndigits, $2.val);
322 		}
323 	| string '.' flag TOKEN_LETTER optional_number
324 		{
325 		  switch ($4) {
326 		  case 'l':
327 		    $$ = new map_expr($1, lowercase);
328 		    break;
329 		  case 'u':
330 		    $$ = new map_expr($1, uppercase);
331 		    break;
332 		  case 'c':
333 		    $$ = new map_expr($1, capitalize);
334 		    break;
335 		  case 'r':
336 		    $$ = new map_expr($1, reverse_name);
337 		    break;
338 		  case 'a':
339 		    $$ = new map_expr($1, abbreviate_name);
340 		    break;
341 		  case 'y':
342 		    $$ = new extractor_expr($1, find_year, $3);
343 		    break;
344 		  case 'n':
345 		    $$ = new extractor_expr($1, find_last_name, $3);
346 		    break;
347 		  default:
348 		    $$ = $1;
349 		    command_error("unknown function `%1'", char($4));
350 		    break;
351 		  }
352 		}
353 
354 	| string '+' number
355 		{ $$ = new truncate_expr($1, $3); }
356 	| string '-' number
357 		{ $$ = new truncate_expr($1, -$3); }
358 	| string '*'
359 		{ $$ = new star_expr($1); }
360 	| '(' optional_conditional ')'
361 		{ $$ = $2; }
362 	| '<' optional_conditional '>'
363 		{ $$ = new separator_expr($2); }
364 	;
365 
366 optional_number:
367 	/* empty */
368 		{ $$ = -1; }
369 	| number
370 		{ $$ = $1; }
371 	;
372 
373 number:
374 	TOKEN_DIGIT
375 		{ $$ = $1; }
376 	| number TOKEN_DIGIT
377 		{ $$ = $1*10 + $2; }
378 	;
379 
380 digits:
381 	TOKEN_DIGIT
382 		{ $$.ndigits = 1; $$.val = $1; }
383 	| digits TOKEN_DIGIT
384 		{ $$.ndigits = $1.ndigits + 1; $$.val = $1.val*10 + $2; }
385 	;
386 
387 
388 flag:
389 	/* empty */
390 		{ $$ = 0; }
391 	| '+'
392 		{ $$ = 1; }
393 	| '-'
394 		{ $$ = -1; }
395 	;
396 
397 %%
398 
399 /* bison defines const to be empty unless __STDC__ is defined, which it
400 isn't under cfront */
401 
402 #ifdef const
403 #undef const
404 #endif
405 
406 const char *spec_ptr;
407 const char *spec_end;
408 const char *spec_cur;
409 
410 static char uppercase_array[] = {
411   'A', 'B', 'C', 'D', 'E', 'F', 'G', 'H',
412   'I', 'J', 'K', 'L', 'M', 'N', 'O', 'P',
413   'Q', 'R', 'S', 'T', 'U', 'V', 'W', 'X',
414   'Y', 'Z',
415 };
416 
417 static char lowercase_array[] = {
418   'a', 'b', 'c', 'd', 'e', 'f', 'g', 'h',
419   'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p',
420   'q', 'r', 's', 't', 'u', 'v', 'w', 'x',
421   'y', 'z',
422 };
423 
yylex()424 int yylex()
425 {
426   while (spec_ptr < spec_end && csspace(*spec_ptr))
427     spec_ptr++;
428   spec_cur = spec_ptr;
429   if (spec_ptr >= spec_end)
430     return 0;
431   unsigned char c = *spec_ptr++;
432   if (csalpha(c)) {
433     yylval.num = c;
434     return TOKEN_LETTER;
435   }
436   if (csdigit(c)) {
437     yylval.num = c - '0';
438     return TOKEN_DIGIT;
439   }
440   if (c == '\'') {
441     yylval.str.start = literals.length();
442     for (; spec_ptr < spec_end; spec_ptr++) {
443       if (*spec_ptr == '\'') {
444 	if (++spec_ptr < spec_end && *spec_ptr == '\'')
445 	  literals += '\'';
446 	else {
447 	  yylval.str.len = literals.length() - yylval.str.start;
448 	  return TOKEN_LITERAL;
449 	}
450       }
451       else
452 	literals += *spec_ptr;
453     }
454     yylval.str.len = literals.length() - yylval.str.start;
455     return TOKEN_LITERAL;
456   }
457   return c;
458 }
459 
set_label_spec(const char * label_spec)460 int set_label_spec(const char *label_spec)
461 {
462   spec_cur = spec_ptr = label_spec;
463   spec_end = strchr(label_spec, '\0');
464   literals.clear();
465   if (yyparse())
466     return 0;
467   delete parsed_label;
468   parsed_label = parse_result;
469   return 1;
470 }
471 
set_date_label_spec(const char * label_spec)472 int set_date_label_spec(const char *label_spec)
473 {
474   spec_cur = spec_ptr = label_spec;
475   spec_end = strchr(label_spec, '\0');
476   literals.clear();
477   if (yyparse())
478     return 0;
479   delete parsed_date_label;
480   parsed_date_label = parse_result;
481   return 1;
482 }
483 
set_short_label_spec(const char * label_spec)484 int set_short_label_spec(const char *label_spec)
485 {
486   spec_cur = spec_ptr = label_spec;
487   spec_end = strchr(label_spec, '\0');
488   literals.clear();
489   if (yyparse())
490     return 0;
491   delete parsed_short_label;
492   parsed_short_label = parse_result;
493   return 1;
494 }
495 
yyerror(const char * message)496 void yyerror(const char *message)
497 {
498   if (spec_cur < spec_end)
499     command_error("label specification %1 before `%2'", message, spec_cur);
500   else
501     command_error("label specification %1 at end of string",
502 		  message, spec_cur);
503 }
504 
evaluate(int tentative,const reference & ref,string & result,substring_position &)505 void at_expr::evaluate(int tentative, const reference &ref,
506 		       string &result, substring_position &)
507 {
508   if (tentative)
509     ref.canonicalize_authors(result);
510   else {
511     const char *end, *start = ref.get_authors(&end);
512     if (start)
513       result.append(start, end - start);
514   }
515 }
516 
evaluate(int tentative,const reference & ref,string & result,substring_position &)517 void format_expr::evaluate(int tentative, const reference &ref,
518 			   string &result, substring_position &)
519 {
520   if (tentative)
521     return;
522   const label_info *lp = ref.get_label_ptr();
523   int num = lp == 0 ? ref.get_number() : lp->count;
524   if (type != '0')
525     result += format_serial(type, num + 1);
526   else {
527     const char *ptr = i_to_a(num + first_number);
528     int pad = width - strlen(ptr);
529     while (--pad >= 0)
530       result += '0';
531     result += ptr;
532   }
533 }
534 
format_serial(char c,int n)535 static const char *format_serial(char c, int n)
536 {
537   assert(n > 0);
538   static char buf[128]; // more than enough.
539   switch (c) {
540   case 'i':
541   case 'I':
542     {
543       char *p = buf;
544       // troff uses z and w to represent 10000 and 5000 in Roman
545       // numerals; I can find no historical basis for this usage
546       const char *s = c == 'i' ? "zwmdclxvi" : "ZWMDCLXVI";
547       if (n >= 40000)
548 	return i_to_a(n);
549       while (n >= 10000) {
550 	*p++ = s[0];
551 	n -= 10000;
552       }
553       for (int i = 1000; i > 0; i /= 10, s += 2) {
554 	int m = n/i;
555 	n -= m*i;
556 	switch (m) {
557 	case 3:
558 	  *p++ = s[2];
559 	  /* falls through */
560 	case 2:
561 	  *p++ = s[2];
562 	  /* falls through */
563 	case 1:
564 	  *p++ = s[2];
565 	  break;
566 	case 4:
567 	  *p++ = s[2];
568 	  *p++ = s[1];
569 	  break;
570 	case 8:
571 	  *p++ = s[1];
572 	  *p++ = s[2];
573 	  *p++ = s[2];
574 	  *p++ = s[2];
575 	  break;
576 	case 7:
577 	  *p++ = s[1];
578 	  *p++ = s[2];
579 	  *p++ = s[2];
580 	  break;
581 	case 6:
582 	  *p++ = s[1];
583 	  *p++ = s[2];
584 	  break;
585 	case 5:
586 	  *p++ = s[1];
587 	  break;
588 	case 9:
589 	  *p++ = s[2];
590 	  *p++ = s[0];
591 	}
592       }
593       *p = 0;
594       break;
595     }
596   case 'a':
597   case 'A':
598     {
599       char *p = buf;
600       // this is derived from troff/reg.c
601       while (n > 0) {
602 	int d = n % 26;
603 	if (d == 0)
604 	  d = 26;
605 	n -= d;
606 	n /= 26;
607 	*p++ = c == 'a' ? lowercase_array[d - 1] :
608 			       uppercase_array[d - 1];
609       }
610       *p-- = 0;
611       // Reverse it.
612       char *q = buf;
613       while (q < p) {
614 	char temp = *q;
615 	*q = *p;
616 	*p = temp;
617 	--p;
618 	++q;
619       }
620       break;
621     }
622   default:
623     assert(0);
624   }
625   return buf;
626 }
627 
evaluate(int,const reference & ref,string & result,substring_position &)628 void field_expr::evaluate(int, const reference &ref,
629 			  string &result, substring_position &)
630 {
631   const char *end;
632   const char *start = ref.get_field(name, &end);
633   if (start) {
634     start = nth_field(number, start, &end);
635     if (start)
636       result.append(start, end - start);
637   }
638 }
639 
evaluate(int,const reference &,string & result,substring_position &)640 void literal_expr::evaluate(int, const reference &,
641 			    string &result, substring_position &)
642 {
643   result += s;
644 }
645 
analyzed_expr(expression * e)646 analyzed_expr::analyzed_expr(expression *e)
647 : unary_expr(e), flags(e ? e->analyze() : 0)
648 {
649 }
650 
evaluate(int tentative,const reference & ref,string & result,substring_position & pos)651 void analyzed_expr::evaluate(int tentative, const reference &ref,
652 			     string &result, substring_position &pos)
653 {
654   if (expr)
655     expr->evaluate(tentative, ref, result, pos);
656 }
657 
evaluate(int tentative,const reference & ref,string & result,substring_position & pos)658 void star_expr::evaluate(int tentative, const reference &ref,
659 			 string &result, substring_position &pos)
660 {
661   const label_info *lp = ref.get_label_ptr();
662   if (!tentative
663       && (lp == 0 || lp->total > 1)
664       && expr)
665     expr->evaluate(tentative, ref, result, pos);
666 }
667 
evaluate(int tentative,const reference & ref,string & result,substring_position & pos)668 void separator_expr::evaluate(int tentative, const reference &ref,
669 			      string &result, substring_position &pos)
670 {
671   int start_length = result.length();
672   int is_first = pos.start < 0;
673   if (expr)
674     expr->evaluate(tentative, ref, result, pos);
675   if (is_first) {
676     pos.start = start_length;
677     pos.length = result.length() - start_length;
678   }
679 }
680 
evaluate(int tentative,const reference & ref,string & result,substring_position &)681 void map_expr::evaluate(int tentative, const reference &ref,
682 			string &result, substring_position &)
683 {
684   if (expr) {
685     string temp;
686     substring_position temp_pos;
687     expr->evaluate(tentative, ref, temp, temp_pos);
688     (*func)(temp.contents(), temp.contents() + temp.length(), result);
689   }
690 }
691 
evaluate(int tentative,const reference & ref,string & result,substring_position &)692 void extractor_expr::evaluate(int tentative, const reference &ref,
693 			      string &result, substring_position &)
694 {
695   if (expr) {
696     string temp;
697     substring_position temp_pos;
698     expr->evaluate(tentative, ref, temp, temp_pos);
699     const char *end, *start = (*func)(temp.contents(),
700 				      temp.contents() + temp.length(),
701 				      &end);
702     switch (part) {
703     case BEFORE:
704       if (start)
705 	result.append(temp.contents(), start - temp.contents());
706       else
707 	result += temp;
708       break;
709     case MATCH:
710       if (start)
711 	result.append(start, end - start);
712       break;
713     case AFTER:
714       if (start)
715 	result.append(end, temp.contents() + temp.length() - end);
716       break;
717     default:
718       assert(0);
719     }
720   }
721 }
722 
first_part(int len,const char * ptr,const char * end,string & result)723 static void first_part(int len, const char *ptr, const char *end,
724 			  string &result)
725 {
726   for (;;) {
727     const char *token_start = ptr;
728     if (!get_token(&ptr, end))
729       break;
730     const token_info *ti = lookup_token(token_start, ptr);
731     int counts = ti->sortify_non_empty(token_start, ptr);
732     if (counts && --len < 0)
733       break;
734     if (counts || ti->is_accent())
735       result.append(token_start, ptr - token_start);
736   }
737 }
738 
last_part(int len,const char * ptr,const char * end,string & result)739 static void last_part(int len, const char *ptr, const char *end,
740 		      string &result)
741 {
742   const char *start = ptr;
743   int count = 0;
744   for (;;) {
745     const char *token_start = ptr;
746     if (!get_token(&ptr, end))
747       break;
748     const token_info *ti = lookup_token(token_start, ptr);
749     if (ti->sortify_non_empty(token_start, ptr))
750       count++;
751   }
752   ptr = start;
753   int skip = count - len;
754   if (skip > 0) {
755     for (;;) {
756       const char *token_start = ptr;
757       if (!get_token(&ptr, end))
758 	assert(0);
759       const token_info *ti = lookup_token(token_start, ptr);
760       if (ti->sortify_non_empty(token_start, ptr) && --skip < 0) {
761 	ptr = token_start;
762 	break;
763       }
764     }
765   }
766   first_part(len, ptr, end, result);
767 }
768 
evaluate(int tentative,const reference & ref,string & result,substring_position &)769 void truncate_expr::evaluate(int tentative, const reference &ref,
770 			     string &result, substring_position &)
771 {
772   if (expr) {
773     string temp;
774     substring_position temp_pos;
775     expr->evaluate(tentative, ref, temp, temp_pos);
776     const char *start = temp.contents();
777     const char *end = start + temp.length();
778     if (n > 0)
779       first_part(n, start, end, result);
780     else if (n < 0)
781       last_part(-n, start, end, result);
782   }
783 }
784 
evaluate(int tentative,const reference & ref,string & result,substring_position & pos)785 void alternative_expr::evaluate(int tentative, const reference &ref,
786 				string &result, substring_position &pos)
787 {
788   int start_length = result.length();
789   if (expr1)
790     expr1->evaluate(tentative, ref, result, pos);
791   if (result.length() == start_length && expr2)
792     expr2->evaluate(tentative, ref, result, pos);
793 }
794 
evaluate(int tentative,const reference & ref,string & result,substring_position & pos)795 void list_expr::evaluate(int tentative, const reference &ref,
796 			 string &result, substring_position &pos)
797 {
798   if (expr1)
799     expr1->evaluate(tentative, ref, result, pos);
800   if (expr2)
801     expr2->evaluate(tentative, ref, result, pos);
802 }
803 
evaluate(int tentative,const reference & ref,string & result,substring_position & pos)804 void substitute_expr::evaluate(int tentative, const reference &ref,
805 			       string &result, substring_position &pos)
806 {
807   int start_length = result.length();
808   if (expr1)
809     expr1->evaluate(tentative, ref, result, pos);
810   if (result.length() > start_length && result[result.length() - 1] == '-') {
811     // ought to see if pos covers the -
812     result.set_length(result.length() - 1);
813     if (expr2)
814       expr2->evaluate(tentative, ref, result, pos);
815   }
816 }
817 
evaluate(int tentative,const reference & ref,string & result,substring_position & pos)818 void conditional_expr::evaluate(int tentative, const reference &ref,
819 				string &result, substring_position &pos)
820 {
821   string temp;
822   substring_position temp_pos;
823   if (expr1)
824     expr1->evaluate(tentative, ref, temp, temp_pos);
825   if (temp.length() > 0) {
826     if (expr2)
827       expr2->evaluate(tentative, ref, result, pos);
828   }
829   else {
830     if (expr3)
831       expr3->evaluate(tentative, ref, result, pos);
832   }
833 }
834 
pre_compute_label()835 void reference::pre_compute_label()
836 {
837   if (parsed_label != 0
838       && (parsed_label->analyze() & expression::CONTAINS_VARIABLE)) {
839     label.clear();
840     substring_position temp_pos;
841     parsed_label->evaluate(1, *this, label, temp_pos);
842     label_ptr = lookup_label(label);
843   }
844 }
845 
compute_label()846 void reference::compute_label()
847 {
848   label.clear();
849   if (parsed_label)
850     parsed_label->evaluate(0, *this, label, separator_pos);
851   if (short_label_flag && parsed_short_label)
852     parsed_short_label->evaluate(0, *this, short_label, short_separator_pos);
853   if (date_as_label) {
854     string new_date;
855     if (parsed_date_label) {
856       substring_position temp_pos;
857       parsed_date_label->evaluate(0, *this, new_date, temp_pos);
858     }
859     set_date(new_date);
860   }
861   if (label_ptr)
862     label_ptr->count += 1;
863 }
864 
immediate_compute_label()865 void reference::immediate_compute_label()
866 {
867   if (label_ptr)
868     label_ptr->total = 2;	// force use of disambiguator
869   compute_label();
870 }
871 
merge_labels(reference ** v,int n,label_type type,string & result)872 int reference::merge_labels(reference **v, int n, label_type type,
873 			    string &result)
874 {
875   if (abbreviate_label_ranges)
876     return merge_labels_by_number(v, n, type, result);
877   else
878     return merge_labels_by_parts(v, n, type, result);
879 }
880 
merge_labels_by_number(reference ** v,int n,label_type type,string & result)881 int reference::merge_labels_by_number(reference **v, int n, label_type type,
882 				      string &result)
883 {
884   if (n <= 1)
885     return 0;
886   int num = get_number();
887   // Only merge three or more labels.
888   if (v[0]->get_number() != num + 1
889       || v[1]->get_number() != num + 2)
890     return 0;
891   int i;
892   for (i = 2; i < n; i++)
893     if (v[i]->get_number() != num + i + 1)
894       break;
895   result = get_label(type);
896   result += label_range_indicator;
897   result += v[i - 1]->get_label(type);
898   return i;
899 }
900 
get_separator_pos(label_type type)901 const substring_position &reference::get_separator_pos(label_type type) const
902 {
903   if (type == SHORT_LABEL && short_label_flag)
904     return short_separator_pos;
905   else
906     return separator_pos;
907 }
908 
get_label(label_type type)909 const string &reference::get_label(label_type type) const
910 {
911   if (type == SHORT_LABEL && short_label_flag)
912     return short_label;
913   else
914     return label;
915 }
916 
merge_labels_by_parts(reference ** v,int n,label_type type,string & result)917 int reference::merge_labels_by_parts(reference **v, int n, label_type type,
918 				     string &result)
919 {
920   if (n <= 0)
921     return 0;
922   const string &lb = get_label(type);
923   const substring_position &sp = get_separator_pos(type);
924   if (sp.start < 0
925       || sp.start != v[0]->get_separator_pos(type).start
926       || memcmp(lb.contents(), v[0]->get_label(type).contents(),
927 		sp.start) != 0)
928     return 0;
929   result = lb;
930   int i = 0;
931   do {
932     result += separate_label_second_parts;
933     const substring_position &s = v[i]->get_separator_pos(type);
934     int sep_end_pos = s.start + s.length;
935     result.append(v[i]->get_label(type).contents() + sep_end_pos,
936 		  v[i]->get_label(type).length() - sep_end_pos);
937   } while (++i < n
938 	   && sp.start == v[i]->get_separator_pos(type).start
939 	   && memcmp(lb.contents(), v[i]->get_label(type).contents(),
940 		     sp.start) == 0);
941   return i;
942 }
943 
944 string label_pool;
945 
label_info(const string & s)946 label_info::label_info(const string &s)
947 : start(label_pool.length()), length(s.length()), count(0), total(1)
948 {
949   label_pool += s;
950 }
951 
952 static label_info **label_table = 0;
953 static int label_table_size = 0;
954 static int label_table_used = 0;
955 
lookup_label(const string & label)956 label_info *lookup_label(const string &label)
957 {
958   if (label_table == 0) {
959     label_table = new label_info *[17];
960     label_table_size = 17;
961     for (int i = 0; i < 17; i++)
962       label_table[i] = 0;
963   }
964   unsigned h = hash_string(label.contents(), label.length()) % label_table_size;
965   label_info **ptr;
966   for (ptr = label_table + h;
967        *ptr != 0;
968        (ptr == label_table)
969        ? (ptr = label_table + label_table_size - 1)
970        : ptr--)
971     if ((*ptr)->length == label.length()
972 	&& memcmp(label_pool.contents() + (*ptr)->start, label.contents(),
973 		  label.length()) == 0) {
974       (*ptr)->total += 1;
975       return *ptr;
976     }
977   label_info *result = *ptr = new label_info(label);
978   if (++label_table_used * 2 > label_table_size) {
979     // Rehash the table.
980     label_info **old_table = label_table;
981     int old_size = label_table_size;
982     label_table_size = next_size(label_table_size);
983     label_table = new label_info *[label_table_size];
984     int i;
985     for (i = 0; i < label_table_size; i++)
986       label_table[i] = 0;
987     for (i = 0; i < old_size; i++)
988       if (old_table[i]) {
989 	h = hash_string(label_pool.contents() + old_table[i]->start,
990 			old_table[i]->length);
991 	label_info **p;
992 	for (p = label_table + (h % label_table_size);
993 	     *p != 0;
994 	     (p == label_table)
995 	     ? (p = label_table + label_table_size - 1)
996 	     : --p)
997 	    ;
998 	*p = old_table[i];
999 	}
1000     a_delete old_table;
1001   }
1002   return result;
1003 }
1004 
clear_labels()1005 void clear_labels()
1006 {
1007   for (int i = 0; i < label_table_size; i++) {
1008     delete label_table[i];
1009     label_table[i] = 0;
1010   }
1011   label_table_used = 0;
1012   label_pool.clear();
1013 }
1014 
1015 static void consider_authors(reference **start, reference **end, int i);
1016 
compute_labels(reference ** v,int n)1017 void compute_labels(reference **v, int n)
1018 {
1019   if (parsed_label
1020       && (parsed_label->analyze() & expression::CONTAINS_AT)
1021       && sort_fields.length() >= 2
1022       && sort_fields[0] == 'A'
1023       && sort_fields[1] == '+')
1024     consider_authors(v, v + n, 0);
1025   for (int i = 0; i < n; i++)
1026     v[i]->compute_label();
1027 }
1028 
1029 
1030 /* A reference with a list of authors <A0,A1,...,AN> _needs_ author i
1031 where 0 <= i <= N if there exists a reference with a list of authors
1032 <B0,B1,...,BM> such that <A0,A1,...,AN> != <B0,B1,...,BM> and M >= i
1033 and Aj = Bj for 0 <= j < i. In this case if we can't say ``A0,
1034 A1,...,A(i-1) et al'' because this would match both <A0,A1,...,AN> and
1035 <B0,B1,...,BM>.  If a reference needs author i we only have to call
1036 need_author(j) for some j >= i such that the reference also needs
1037 author j. */
1038 
1039 /* This function handles 2 tasks:
1040 determine which authors are needed (cannot be elided with et al.);
1041 determine which authors can have only last names in the labels.
1042 
1043 References >= start and < end have the same first i author names.
1044 Also they're sorted by A+. */
1045 
consider_authors(reference ** start,reference ** end,int i)1046 static void consider_authors(reference **start, reference **end, int i)
1047 {
1048   if (start >= end)
1049     return;
1050   reference **p = start;
1051   if (i >= (*p)->get_nauthors()) {
1052     for (++p; p < end && i >= (*p)->get_nauthors(); p++)
1053       ;
1054     if (p < end && i > 0) {
1055       // If we have an author list <A B C> and an author list <A B C D>,
1056       // then both lists need C.
1057       for (reference **q = start; q < end; q++)
1058 	(*q)->need_author(i - 1);
1059     }
1060     start = p;
1061   }
1062   while (p < end) {
1063     reference **last_name_start = p;
1064     reference **name_start = p;
1065     for (++p;
1066 	 p < end && i < (*p)->get_nauthors()
1067 	 && same_author_last_name(**last_name_start, **p, i);
1068 	 p++) {
1069       if (!same_author_name(**name_start, **p, i)) {
1070 	consider_authors(name_start, p, i + 1);
1071 	name_start = p;
1072       }
1073     }
1074     consider_authors(name_start, p, i + 1);
1075     if (last_name_start == name_start) {
1076       for (reference **q = last_name_start; q < p; q++)
1077 	(*q)->set_last_name_unambiguous(i);
1078     }
1079     // If we have an author list <A B C D> and <A B C E>, then the lists
1080     // need author D and E respectively.
1081     if (name_start > start || p < end) {
1082       for (reference **q = last_name_start; q < p; q++)
1083 	(*q)->need_author(i);
1084     }
1085   }
1086 }
1087 
same_author_last_name(const reference & r1,const reference & r2,int n)1088 int same_author_last_name(const reference &r1, const reference &r2, int n)
1089 {
1090   const char *ae1;
1091   const char *as1 = r1.get_sort_field(0, n, 0, &ae1);
1092   const char *ae2;
1093   const char *as2 = r2.get_sort_field(0, n, 0, &ae2);
1094   if (!as1 && !as2) return 1;	// they are the same
1095   if (!as1 || !as2) return 0;
1096   return ae1 - as1 == ae2 - as2 && memcmp(as1, as2, ae1 - as1) == 0;
1097 }
1098 
same_author_name(const reference & r1,const reference & r2,int n)1099 int same_author_name(const reference &r1, const reference &r2, int n)
1100 {
1101   const char *ae1;
1102   const char *as1 = r1.get_sort_field(0, n, -1, &ae1);
1103   const char *ae2;
1104   const char *as2 = r2.get_sort_field(0, n, -1, &ae2);
1105   if (!as1 && !as2) return 1;	// they are the same
1106   if (!as1 || !as2) return 0;
1107   return ae1 - as1 == ae2 - as2 && memcmp(as1, as2, ae1 - as1) == 0;
1108 }
1109 
1110 
set(int i)1111 void int_set::set(int i)
1112 {
1113   assert(i >= 0);
1114   int bytei = i >> 3;
1115   if (bytei >= v.length()) {
1116     int old_length = v.length();
1117     v.set_length(bytei + 1);
1118     for (int j = old_length; j <= bytei; j++)
1119       v[j] = 0;
1120   }
1121   v[bytei] |= 1 << (i & 7);
1122 }
1123 
get(int i)1124 int int_set::get(int i) const
1125 {
1126   assert(i >= 0);
1127   int bytei = i >> 3;
1128   return bytei >= v.length() ? 0 : (v[bytei] & (1 << (i & 7))) != 0;
1129 }
1130 
set_last_name_unambiguous(int i)1131 void reference::set_last_name_unambiguous(int i)
1132 {
1133   last_name_unambiguous.set(i);
1134 }
1135 
need_author(int n)1136 void reference::need_author(int n)
1137 {
1138   if (n > last_needed_author)
1139     last_needed_author = n;
1140 }
1141 
get_authors(const char ** end)1142 const char *reference::get_authors(const char **end) const
1143 {
1144   if (!computed_authors) {
1145     ((reference *)this)->computed_authors = 1;
1146     string &result = ((reference *)this)->authors;
1147     int na = get_nauthors();
1148     result.clear();
1149     for (int i = 0; i < na; i++) {
1150       if (last_name_unambiguous.get(i)) {
1151 	const char *e, *start = get_author_last_name(i, &e);
1152 	assert(start != 0);
1153 	result.append(start, e - start);
1154       }
1155       else {
1156 	const char *e, *start = get_author(i, &e);
1157 	assert(start != 0);
1158 	result.append(start, e - start);
1159       }
1160       if (i == last_needed_author
1161 	  && et_al.length() > 0
1162 	  && et_al_min_elide > 0
1163 	  && last_needed_author + et_al_min_elide < na
1164 	  && na >= et_al_min_total) {
1165 	result += et_al;
1166 	break;
1167       }
1168       if (i < na - 1) {
1169 	if (na == 2)
1170 	  result += join_authors_exactly_two;
1171 	else if (i < na - 2)
1172 	  result += join_authors_default;
1173 	else
1174 	  result += join_authors_last_two;
1175       }
1176     }
1177   }
1178   const char *start = authors.contents();
1179   *end = start + authors.length();
1180   return start;
1181 }
1182 
get_nauthors()1183 int reference::get_nauthors() const
1184 {
1185   if (nauthors < 0) {
1186     const char *dummy;
1187     int na;
1188     for (na = 0; get_author(na, &dummy) != 0; na++)
1189       ;
1190     ((reference *)this)->nauthors = na;
1191   }
1192   return nauthors;
1193 }
1194