xref: /trueos/contrib/groff/src/preproc/refer/ref.cpp (revision 513cdf04e173130783343fe42786eef6b8294c6e)
1 // -*- C++ -*-
2 /* Copyright (C) 1989, 1990, 1991, 1992, 2001, 2003
3    Free Software Foundation, Inc.
4 Written by James Clark (jjc@jclark.com)
5 
6 This file is part of groff.
7 
8 groff is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 2, or (at your option) any later
11 version.
12 
13 groff is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License
16 for more details.
17 
18 You should have received a copy of the GNU General Public License along
19 with groff; see the file COPYING.  If not, write to the Free Software
20 Foundation, 51 Franklin St - Fifth Floor, Boston, MA 02110-1301, USA. */
21 
22 #include "refer.h"
23 #include "refid.h"
24 #include "ref.h"
25 #include "token.h"
26 
// Forward declarations of file-local helpers that are defined further
// down in this file.
static const char *find_day(const char *, const char *, const char **);
static int find_month(const char *start, const char *end);
static void abbreviate_names(string &);

// Default article list: words separated by NUL characters.
#define DEFAULT_ARTICLES "the\000a\000an"

// Constructed with sizeof, i.e. including the literal's terminating NUL;
// sortify_title() steps through the list with strchr(a, '\0') + 1.
string articles(DEFAULT_ARTICLES, sizeof(DEFAULT_ARTICLES));

// Multiple occurrences of fields are separated by FIELD_SEPARATOR.
const char FIELD_SEPARATOR = '\0';

// Field letters whose repeated occurrences are accumulated (joined with
// FIELD_SEPARATOR) instead of replacing the earlier value.
const char MULTI_FIELD_NAMES[] = "AE";
// Field letters tried, in order, when an author is needed.
const char *AUTHOR_FIELDS = "AQ";

// Values returned by reference::classify(); each one indexes
// reference_types[] below.
enum { OTHER, JOURNAL_ARTICLE, BOOK, ARTICLE_IN_BOOK, TECH_REPORT, BELL_TM };

const char *reference_types[] = {
  "other",
  "journal-article",
  "book",
  "article-in-book",
  "tech-report",
  "bell-tm",
};

// Scratch strings indexed by field character; the reference constructor
// and reference::merge() collect field text here before moving it into
// a reference's own field array.
static string temp_fields[256];
53 
reference(const char * start,int len,reference_id * ridp)54 reference::reference(const char *start, int len, reference_id *ridp)
55 : h(0), merged(0), no(-1), field(0), nfields(0), label_ptr(0),
56   computed_authors(0), last_needed_author(-1), nauthors(-1)
57 {
58   int i;
59   for (i = 0; i < 256; i++)
60     field_index[i] = NULL_FIELD_INDEX;
61   if (ridp)
62     rid = *ridp;
63   if (start == 0)
64     return;
65   if (len <= 0)
66     return;
67   const char *end = start + len;
68   const char *ptr = start;
69   assert(*ptr == '%');
70   while (ptr < end) {
71     if (ptr + 1 < end && ptr[1] != '\0'
72 	&& ((ptr[1] != '%' && ptr[1] == annotation_field)
73 	    || (ptr + 2 < end && ptr[1] == '%' && ptr[2] != '\0'
74 		&& discard_fields.search(ptr[2]) < 0))) {
75       if (ptr[1] == '%')
76 	ptr++;
77       string &f = temp_fields[(unsigned char)ptr[1]];
78       ptr += 2;
79       while (ptr < end && csspace(*ptr))
80 	ptr++;
81       for (;;) {
82 	for (;;) {
83 	  if (ptr >= end) {
84 	    f += '\n';
85 	    break;
86 	  }
87 	  f += *ptr;
88 	  if (*ptr++ == '\n')
89 	    break;
90 	}
91 	if (ptr >= end || *ptr == '%')
92 	  break;
93       }
94     }
95     else if (ptr + 1 < end && ptr[1] != '\0' && ptr[1] != '%'
96 	     && discard_fields.search(ptr[1]) < 0) {
97       string &f = temp_fields[(unsigned char)ptr[1]];
98       if (f.length() > 0) {
99 	if (strchr(MULTI_FIELD_NAMES, ptr[1]) != 0)
100 	  f += FIELD_SEPARATOR;
101 	else
102 	  f.clear();
103       }
104       ptr += 2;
105       if (ptr < end) {
106 	if (*ptr == ' ')
107 	  ptr++;
108 	for (;;) {
109 	  const char *p = ptr;
110 	  while (ptr < end && *ptr != '\n')
111 	    ptr++;
112 	  // strip trailing white space
113 	  const char *q = ptr;
114 	  while (q > p && q[-1] != '\n' && csspace(q[-1]))
115 	    q--;
116 	  while (p < q)
117 	    f += *p++;
118 	  if (ptr >= end)
119 	    break;
120 	  ptr++;
121 	  if (ptr >= end)
122 	    break;
123 	  if (*ptr == '%')
124 	    break;
125 	  f += ' ';
126 	}
127       }
128     }
129     else {
130       // skip this field
131       for (;;) {
132 	while (ptr < end && *ptr++ != '\n')
133 	  ;
134 	if (ptr >= end || *ptr == '%')
135 	  break;
136       }
137     }
138   }
139   for (i = 0; i < 256; i++)
140     if (temp_fields[i].length() > 0)
141       nfields++;
142   field = new string[nfields];
143   int j = 0;
144   for (i = 0; i < 256; i++)
145     if (temp_fields[i].length() > 0) {
146       field[j].move(temp_fields[i]);
147       if (abbreviate_fields.search(i) >= 0)
148 	abbreviate_names(field[j]);
149       field_index[i] = j;
150       j++;
151     }
152 }
153 
~reference()154 reference::~reference()
155 {
156   if (nfields > 0)
157     ad_delete(nfields) field;
158 }
159 
160 // ref is the inline, this is the database ref
161 
merge(reference & ref)162 void reference::merge(reference &ref)
163 {
164   int i;
165   for (i = 0; i < 256; i++)
166     if (field_index[i] != NULL_FIELD_INDEX)
167       temp_fields[i].move(field[field_index[i]]);
168   for (i = 0; i < 256; i++)
169     if (ref.field_index[i] != NULL_FIELD_INDEX)
170       temp_fields[i].move(ref.field[ref.field_index[i]]);
171   for (i = 0; i < 256; i++)
172     field_index[i] = NULL_FIELD_INDEX;
173   int old_nfields = nfields;
174   nfields = 0;
175   for (i = 0; i < 256; i++)
176     if (temp_fields[i].length() > 0)
177       nfields++;
178   if (nfields != old_nfields) {
179     if (old_nfields > 0)
180       ad_delete(old_nfields) field;
181     field = new string[nfields];
182   }
183   int j = 0;
184   for (i = 0; i < 256; i++)
185     if (temp_fields[i].length() > 0) {
186       field[j].move(temp_fields[i]);
187       field_index[i] = j;
188       j++;
189     }
190   merged = 1;
191 }
192 
insert_field(unsigned char c,string & s)193 void reference::insert_field(unsigned char c, string &s)
194 {
195   assert(s.length() > 0);
196   if (field_index[c] != NULL_FIELD_INDEX) {
197     field[field_index[c]].move(s);
198     return;
199   }
200   assert(field_index[c] == NULL_FIELD_INDEX);
201   string *old_field = field;
202   field = new string[nfields + 1];
203   int pos = 0;
204   int i;
205   for (i = 0; i < int(c); i++)
206     if (field_index[i] != NULL_FIELD_INDEX)
207       pos++;
208   for (i = 0; i < pos; i++)
209     field[i].move(old_field[i]);
210   field[pos].move(s);
211   for (i = pos; i < nfields; i++)
212     field[i + 1].move(old_field[i]);
213   if (nfields > 0)
214     ad_delete(nfields) old_field;
215   nfields++;
216   field_index[c] = pos;
217   for (i = c + 1; i < 256; i++)
218     if (field_index[i] != NULL_FIELD_INDEX)
219       field_index[i] += 1;
220 }
221 
delete_field(unsigned char c)222 void reference::delete_field(unsigned char c)
223 {
224   if (field_index[c] == NULL_FIELD_INDEX)
225     return;
226   string *old_field = field;
227   field = new string[nfields - 1];
228   int i;
229   for (i = 0; i < int(field_index[c]); i++)
230     field[i].move(old_field[i]);
231   for (i = field_index[c]; i < nfields - 1; i++)
232     field[i].move(old_field[i + 1]);
233   if (nfields > 0)
234     ad_delete(nfields) old_field;
235   nfields--;
236   field_index[c] = NULL_FIELD_INDEX;
237   for (i = c + 1; i < 256; i++)
238     if (field_index[i] != NULL_FIELD_INDEX)
239       field_index[i] -= 1;
240 }
241 
compute_hash_code()242 void reference::compute_hash_code()
243 {
244   if (!rid.is_null())
245     h = rid.hash();
246   else {
247     h = 0;
248     for (int i = 0; i < nfields; i++)
249       if (field[i].length() > 0) {
250 	h <<= 4;
251 	h ^= hash_string(field[i].contents(), field[i].length());
252       }
253   }
254 }
255 
set_number(int n)256 void reference::set_number(int n)
257 {
258   no = n;
259 }
260 
// Separators used inside a sort key, from outermost to innermost:
// SORT_SEP starts each sort field, SORT_SUB_SEP separates repeated
// values of one field, SORT_SUB_SUB_SEP separates the parts of a name.
const char SORT_SEP = '\001';
const char SORT_SUB_SEP = '\002';
const char SORT_SUB_SUB_SEP = '\003';
264 
265 // sep specifies additional word separators
266 
sortify_words(const char * s,const char * end,const char * sep,string & result)267 void sortify_words(const char *s, const char *end, const char *sep,
268 		   string &result)
269 {
270   int non_empty = 0;
271   int need_separator = 0;
272   for (;;) {
273     const char *token_start = s;
274     if (!get_token(&s, end))
275       break;
276     if ((s - token_start == 1
277 	 && (*token_start == ' '
278 	     || *token_start == '\n'
279 	     || (sep && *token_start != '\0'
280 		 && strchr(sep, *token_start) != 0)))
281 	|| (s - token_start == 2
282 	    && token_start[0] == '\\' && token_start[1] == ' ')) {
283       if (non_empty)
284 	need_separator = 1;
285     }
286     else {
287       const token_info *ti = lookup_token(token_start, s);
288       if (ti->sortify_non_empty(token_start, s)) {
289 	if (need_separator) {
290 	  result += ' ';
291 	  need_separator = 0;
292 	}
293 	ti->sortify(token_start, s, result);
294 	non_empty = 1;
295       }
296     }
297   }
298 }
299 
sortify_word(const char * s,const char * end,string & result)300 void sortify_word(const char *s, const char *end, string &result)
301 {
302   for (;;) {
303     const char *token_start = s;
304     if (!get_token(&s, end))
305       break;
306     const token_info *ti = lookup_token(token_start, s);
307     ti->sortify(token_start, s, result);
308   }
309 }
310 
sortify_other(const char * s,int len,string & key)311 void sortify_other(const char *s, int len, string &key)
312 {
313   sortify_words(s, s + len, 0, key);
314 }
315 
sortify_title(const char * s,int len,string & key)316 void sortify_title(const char *s, int len, string &key)
317 {
318   const char *end = s + len;
319   for (; s < end && (*s == ' ' || *s == '\n'); s++)
320     ;
321   const char *ptr = s;
322   for (;;) {
323     const char *token_start = ptr;
324     if (!get_token(&ptr, end))
325       break;
326     if (ptr - token_start == 1
327 	&& (*token_start == ' ' || *token_start == '\n'))
328       break;
329   }
330   if (ptr < end) {
331     unsigned int first_word_len = ptr - s - 1;
332     const char *ae = articles.contents() + articles.length();
333     for (const char *a = articles.contents();
334 	 a < ae;
335 	 a = strchr(a, '\0') + 1)
336       if (first_word_len == strlen(a)) {
337 	unsigned int j;
338 	for (j = 0; j < first_word_len; j++)
339 	  if (a[j] != cmlower(s[j]))
340 	    break;
341 	if (j >= first_word_len) {
342 	  s = ptr;
343 	  for (; s < end && (*s == ' ' || *s == '\n'); s++)
344 	    ;
345 	  break;
346 	}
347       }
348   }
349   sortify_words(s, end, 0, key);
350 }
351 
sortify_name(const char * s,int len,string & key)352 void sortify_name(const char *s, int len, string &key)
353 {
354   const char *last_name_end;
355   const char *last_name = find_last_name(s, s + len, &last_name_end);
356   sortify_word(last_name, last_name_end, key);
357   key += SORT_SUB_SUB_SEP;
358   if (last_name > s)
359     sortify_words(s, last_name, ".", key);
360   key += SORT_SUB_SUB_SEP;
361   if (last_name_end < s + len)
362     sortify_words(last_name_end, s + len, ".,", key);
363 }
364 
sortify_date(const char * s,int len,string & key)365 void sortify_date(const char *s, int len, string &key)
366 {
367   const char *year_end;
368   const char *year_start = find_year(s, s + len, &year_end);
369   if (!year_start) {
370     // Things without years are often `forthcoming', so it makes sense
371     // that they sort after things with explicit years.
372     key += 'A';
373     sortify_words(s, s + len, 0, key);
374     return;
375   }
376   int n = year_end - year_start;
377   while (n < 4) {
378     key += '0';
379     n++;
380   }
381   while (year_start < year_end)
382     key += *year_start++;
383   int m = find_month(s, s + len);
384   if (m < 0)
385     return;
386   key += 'A' + m;
387   const char *day_end;
388   const char *day_start = find_day(s, s + len, &day_end);
389   if (!day_start)
390     return;
391   if (day_end - day_start == 1)
392     key += '0';
393   while (day_start < day_end)
394     key += *day_start++;
395 }
396 
397 // SORT_{SUB,SUB_SUB}_SEP can creep in from use of @ in label specification.
398 
sortify_label(const char * s,int len,string & key)399 void sortify_label(const char *s, int len, string &key)
400 {
401   const char *end = s + len;
402   for (;;) {
403     const char *ptr;
404     for (ptr = s;
405 	 ptr < end && *ptr != SORT_SUB_SEP && *ptr != SORT_SUB_SUB_SEP;
406 	 ptr++)
407       ;
408     if (ptr > s)
409       sortify_words(s, ptr, 0, key);
410     s = ptr;
411     if (s >= end)
412       break;
413     key += *s++;
414   }
415 }
416 
compute_sort_key()417 void reference::compute_sort_key()
418 {
419   if (sort_fields.length() == 0)
420     return;
421   sort_fields += '\0';
422   const char *sf = sort_fields.contents();
423   while (*sf != '\0') {
424     sort_key += SORT_SEP;
425     char f = *sf++;
426     int n = 1;
427     if (*sf == '+') {
428       n = INT_MAX;
429       sf++;
430     }
431     else if (csdigit(*sf)) {
432       char *ptr;
433       long l = strtol(sf, &ptr, 10);
434       if (l == 0 && ptr == sf)
435 	;
436       else {
437 	sf = ptr;
438 	if (l < 0) {
439 	  n = 1;
440 	}
441 	else {
442 	  n = int(l);
443 	}
444       }
445     }
446     if (f == '.')
447       sortify_label(label.contents(), label.length(), sort_key);
448     else if (f == AUTHOR_FIELDS[0])
449       sortify_authors(n, sort_key);
450     else
451       sortify_field(f, n, sort_key);
452   }
453   sort_fields.set_length(sort_fields.length() - 1);
454 }
455 
sortify_authors(int n,string & result) const456 void reference::sortify_authors(int n, string &result) const
457 {
458   for (const char *p = AUTHOR_FIELDS; *p != '\0'; p++)
459     if (contains_field(*p)) {
460       sortify_field(*p, n, result);
461       return;
462     }
463   sortify_field(AUTHOR_FIELDS[0], n, result);
464 }
465 
canonicalize_authors(string & result) const466 void reference::canonicalize_authors(string &result) const
467 {
468   int len = result.length();
469   sortify_authors(INT_MAX, result);
470   if (result.length() > len)
471     result += SORT_SUB_SEP;
472 }
473 
sortify_field(unsigned char f,int n,string & result) const474 void reference::sortify_field(unsigned char f, int n, string &result) const
475 {
476   typedef void (*sortify_t)(const char *, int, string &);
477   sortify_t sortifier = sortify_other;
478   switch (f) {
479   case 'A':
480   case 'E':
481     sortifier = sortify_name;
482     break;
483   case 'D':
484     sortifier = sortify_date;
485     break;
486   case 'B':
487   case 'J':
488   case 'T':
489     sortifier = sortify_title;
490     break;
491   }
492   int fi = field_index[(unsigned char)f];
493   if (fi != NULL_FIELD_INDEX) {
494     string &str = field[fi];
495     const char *start = str.contents();
496     const char *end = start + str.length();
497     for (int i = 0; i < n && start < end; i++) {
498       const char *p = start;
499       while (start < end && *start != FIELD_SEPARATOR)
500 	start++;
501       if (i > 0)
502 	result += SORT_SUB_SEP;
503       (*sortifier)(p, start - p, result);
504       if (start < end)
505 	start++;
506     }
507   }
508 }
509 
compare_reference(const reference & r1,const reference & r2)510 int compare_reference(const reference &r1, const reference &r2)
511 {
512   assert(r1.no >= 0);
513   assert(r2.no >= 0);
514   const char *s1 = r1.sort_key.contents();
515   int n1 = r1.sort_key.length();
516   const char *s2 = r2.sort_key.contents();
517   int n2 = r2.sort_key.length();
518   for (; n1 > 0 && n2 > 0; --n1, --n2, ++s1, ++s2)
519     if (*s1 != *s2)
520       return (int)(unsigned char)*s1 - (int)(unsigned char)*s2;
521   if (n2 > 0)
522     return -1;
523   if (n1 > 0)
524     return 1;
525   return r1.no - r2.no;
526 }
527 
same_reference(const reference & r1,const reference & r2)528 int same_reference(const reference &r1, const reference &r2)
529 {
530   if (!r1.rid.is_null() && r1.rid == r2.rid)
531     return 1;
532   if (r1.h != r2.h)
533     return 0;
534   if (r1.nfields != r2.nfields)
535     return 0;
536   int i = 0;
537   for (i = 0; i < 256; i++)
538     if (r1.field_index != r2.field_index)
539       return 0;
540   for (i = 0; i < r1.nfields; i++)
541     if (r1.field[i] != r2.field[i])
542       return 0;
543   return 1;
544 }
545 
find_last_name(const char * start,const char * end,const char ** endp)546 const char *find_last_name(const char *start, const char *end,
547 			   const char **endp)
548 {
549   const char *ptr = start;
550   const char *last_word = start;
551   for (;;) {
552     const char *token_start = ptr;
553     if (!get_token(&ptr, end))
554       break;
555     if (ptr - token_start == 1) {
556       if (*token_start == ',') {
557 	*endp = token_start;
558 	return last_word;
559       }
560       else if (*token_start == ' ' || *token_start == '\n') {
561 	if (ptr < end && *ptr != ' ' && *ptr != '\n')
562 	  last_word = ptr;
563       }
564     }
565   }
566   *endp = end;
567   return last_word;
568 }
569 
abbreviate_name(const char * ptr,const char * end,string & result)570 void abbreviate_name(const char *ptr, const char *end, string &result)
571 {
572   const char *last_name_end;
573   const char *last_name_start = find_last_name(ptr, end, &last_name_end);
574   int need_period = 0;
575   for (;;) {
576     const char *token_start = ptr;
577     if (!get_token(&ptr, last_name_start))
578       break;
579     const token_info *ti = lookup_token(token_start, ptr);
580     if (need_period) {
581       if ((ptr - token_start == 1 && *token_start == ' ')
582 	  || (ptr - token_start == 2 && token_start[0] == '\\'
583 	      && token_start[1] == ' '))
584 	continue;
585       if (ti->is_upper())
586 	result += period_before_initial;
587       else
588 	result += period_before_other;
589       need_period = 0;
590     }
591     result.append(token_start, ptr - token_start);
592     if (ti->is_upper()) {
593       const char *lower_ptr = ptr;
594       int first_token = 1;
595       for (;;) {
596 	token_start = ptr;
597 	if (!get_token(&ptr, last_name_start))
598 	  break;
599 	if ((ptr - token_start == 1 && *token_start == ' ')
600 	    || (ptr - token_start == 2 && token_start[0] == '\\'
601 		&& token_start[1] == ' '))
602 	  break;
603 	ti = lookup_token(token_start, ptr);
604 	if (ti->is_hyphen()) {
605 	  const char *ptr1 = ptr;
606 	  if (get_token(&ptr1, last_name_start)) {
607 	    ti = lookup_token(ptr, ptr1);
608 	    if (ti->is_upper()) {
609 	      result += period_before_hyphen;
610 	      result.append(token_start, ptr1 - token_start);
611 	      ptr = ptr1;
612 	    }
613 	  }
614 	}
615 	else if (ti->is_upper()) {
616 	  // MacDougal -> MacD.
617 	  result.append(lower_ptr, ptr - lower_ptr);
618 	  lower_ptr = ptr;
619 	  first_token = 1;
620 	}
621 	else if (first_token && ti->is_accent()) {
622 	  result.append(token_start, ptr - token_start);
623 	  lower_ptr = ptr;
624 	}
625 	first_token = 0;
626       }
627       need_period = 1;
628     }
629   }
630   if (need_period)
631     result += period_before_last_name;
632   result.append(last_name_start, end - last_name_start);
633 }
634 
abbreviate_names(string & result)635 static void abbreviate_names(string &result)
636 {
637   string str;
638   str.move(result);
639   const char *ptr = str.contents();
640   const char *end = ptr + str.length();
641   while (ptr < end) {
642     const char *name_end = (char *)memchr(ptr, FIELD_SEPARATOR, end - ptr);
643     if (name_end == 0)
644       name_end = end;
645     abbreviate_name(ptr, name_end, result);
646     if (name_end >= end)
647       break;
648     ptr = name_end + 1;
649     result += FIELD_SEPARATOR;
650   }
651 }
652 
reverse_name(const char * ptr,const char * name_end,string & result)653 void reverse_name(const char *ptr, const char *name_end, string &result)
654 {
655   const char *last_name_end;
656   const char *last_name_start = find_last_name(ptr, name_end, &last_name_end);
657   result.append(last_name_start, last_name_end - last_name_start);
658   while (last_name_start > ptr
659 	 && (last_name_start[-1] == ' ' || last_name_start[-1] == '\n'))
660     last_name_start--;
661   if (last_name_start > ptr) {
662     result += ", ";
663     result.append(ptr, last_name_start - ptr);
664   }
665   if (last_name_end < name_end)
666     result.append(last_name_end, name_end - last_name_end);
667 }
668 
reverse_names(string & result,int n)669 void reverse_names(string &result, int n)
670 {
671   if (n <= 0)
672     return;
673   string str;
674   str.move(result);
675   const char *ptr = str.contents();
676   const char *end = ptr + str.length();
677   while (ptr < end) {
678     if (--n < 0) {
679       result.append(ptr, end - ptr);
680       break;
681     }
682     const char *name_end = (char *)memchr(ptr, FIELD_SEPARATOR, end - ptr);
683     if (name_end == 0)
684       name_end = end;
685     reverse_name(ptr, name_end, result);
686     if (name_end >= end)
687       break;
688     ptr = name_end + 1;
689     result += FIELD_SEPARATOR;
690   }
691 }
692 
693 // Return number of field separators.
694 
join_fields(string & f)695 int join_fields(string &f)
696 {
697   const char *ptr = f.contents();
698   int len = f.length();
699   int nfield_seps = 0;
700   int j;
701   for (j = 0; j < len; j++)
702     if (ptr[j] == FIELD_SEPARATOR)
703       nfield_seps++;
704   if (nfield_seps == 0)
705     return 0;
706   string temp;
707   int field_seps_left = nfield_seps;
708   for (j = 0; j < len; j++) {
709     if (ptr[j] == FIELD_SEPARATOR) {
710       if (nfield_seps == 1)
711 	temp += join_authors_exactly_two;
712       else if (--field_seps_left == 0)
713 	temp += join_authors_last_two;
714       else
715 	temp += join_authors_default;
716     }
717     else
718       temp += ptr[j];
719   }
720   f = temp;
721   return nfield_seps;
722 }
723 
uppercase(const char * start,const char * end,string & result)724 void uppercase(const char *start, const char *end, string &result)
725 {
726   for (;;) {
727     const char *token_start = start;
728     if (!get_token(&start, end))
729       break;
730     const token_info *ti = lookup_token(token_start, start);
731     ti->upper_case(token_start, start, result);
732   }
733 }
734 
lowercase(const char * start,const char * end,string & result)735 void lowercase(const char *start, const char *end, string &result)
736 {
737   for (;;) {
738     const char *token_start = start;
739     if (!get_token(&start, end))
740       break;
741     const token_info *ti = lookup_token(token_start, start);
742     ti->lower_case(token_start, start, result);
743   }
744 }
745 
capitalize(const char * ptr,const char * end,string & result)746 void capitalize(const char *ptr, const char *end, string &result)
747 {
748   int in_small_point_size = 0;
749   for (;;) {
750     const char *start = ptr;
751     if (!get_token(&ptr, end))
752       break;
753     const token_info *ti = lookup_token(start, ptr);
754     const char *char_end = ptr;
755     int is_lower = ti->is_lower();
756     if ((is_lower || ti->is_upper()) && get_token(&ptr, end)) {
757       const token_info *ti2 = lookup_token(char_end, ptr);
758       if (!ti2->is_accent())
759 	ptr = char_end;
760     }
761     if (is_lower) {
762       if (!in_small_point_size) {
763 	result += "\\s-2";
764 	in_small_point_size = 1;
765       }
766       ti->upper_case(start, char_end, result);
767       result.append(char_end, ptr - char_end);
768     }
769     else {
770       if (in_small_point_size) {
771 	result += "\\s+2";
772 	in_small_point_size = 0;
773       }
774       result.append(start, ptr - start);
775     }
776   }
777   if (in_small_point_size)
778     result += "\\s+2";
779 }
780 
capitalize_field(string & str)781 void capitalize_field(string &str)
782 {
783   string temp;
784   capitalize(str.contents(), str.contents() + str.length(), temp);
785   str.move(temp);
786 }
787 
is_terminated(const char * ptr,const char * end)788 int is_terminated(const char *ptr, const char *end)
789 {
790   const char *last_token = end;
791   for (;;) {
792     const char *p = ptr;
793     if (!get_token(&ptr, end))
794       break;
795     last_token = p;
796   }
797   return end - last_token == 1
798     && (*last_token == '.' || *last_token == '!' || *last_token == '?');
799 }
800 
output(FILE * fp)801 void reference::output(FILE *fp)
802 {
803   fputs(".]-\n", fp);
804   for (int i = 0; i < 256; i++)
805     if (field_index[i] != NULL_FIELD_INDEX && i != annotation_field) {
806       string &f = field[field_index[i]];
807       if (!csdigit(i)) {
808 	int j = reverse_fields.search(i);
809 	if (j >= 0) {
810 	  int n;
811 	  int len = reverse_fields.length();
812 	  if (++j < len && csdigit(reverse_fields[j])) {
813 	    n = reverse_fields[j] - '0';
814 	    for (++j; j < len && csdigit(reverse_fields[j]); j++)
815 	      // should check for overflow
816 	      n = n*10 + reverse_fields[j] - '0';
817 	  }
818 	  else
819 	    n = INT_MAX;
820 	  reverse_names(f, n);
821 	}
822       }
823       int is_multiple = join_fields(f) > 0;
824       if (capitalize_fields.search(i) >= 0)
825 	capitalize_field(f);
826       if (memchr(f.contents(), '\n', f.length()) == 0) {
827 	fprintf(fp, ".ds [%c ", i);
828 	if (f[0] == ' ' || f[0] == '\\' || f[0] == '"')
829 	  putc('"', fp);
830 	put_string(f, fp);
831 	putc('\n', fp);
832       }
833       else {
834 	fprintf(fp, ".de [%c\n", i);
835 	put_string(f, fp);
836 	fputs("..\n", fp);
837       }
838       if (i == 'P') {
839 	int multiple_pages = 0;
840 	const char *s = f.contents();
841 	const char *end = f.contents() + f.length();
842 	for (;;) {
843 	  const char *token_start = s;
844 	  if (!get_token(&s, end))
845 	    break;
846 	  const token_info *ti = lookup_token(token_start, s);
847 	  if (ti->is_hyphen() || ti->is_range_sep()) {
848 	    multiple_pages = 1;
849 	    break;
850 	  }
851 	}
852 	fprintf(fp, ".nr [P %d\n", multiple_pages);
853       }
854       else if (i == 'E')
855 	fprintf(fp, ".nr [E %d\n", is_multiple);
856     }
857   for (const char *p = "TAO"; *p; p++) {
858     int fi = field_index[(unsigned char)*p];
859     if (fi != NULL_FIELD_INDEX) {
860       string &f = field[fi];
861       fprintf(fp, ".nr [%c %d\n", *p,
862 	      is_terminated(f.contents(), f.contents() + f.length()));
863     }
864   }
865   int t = classify();
866   fprintf(fp, ".][ %d %s\n", t, reference_types[t]);
867   if (annotation_macro.length() > 0 && annotation_field >= 0
868       && field_index[annotation_field] != NULL_FIELD_INDEX) {
869     putc('.', fp);
870     put_string(annotation_macro, fp);
871     putc('\n', fp);
872     put_string(field[field_index[annotation_field]], fp);
873   }
874 }
875 
print_sort_key_comment(FILE * fp)876 void reference::print_sort_key_comment(FILE *fp)
877 {
878   fputs(".\\\"", fp);
879   put_string(sort_key, fp);
880   putc('\n', fp);
881 }
882 
find_year(const char * start,const char * end,const char ** endp)883 const char *find_year(const char *start, const char *end, const char **endp)
884 {
885   for (;;) {
886     while (start < end && !csdigit(*start))
887       start++;
888     const char *ptr = start;
889     if (start == end)
890       break;
891     while (ptr < end && csdigit(*ptr))
892       ptr++;
893     if (ptr - start == 4 || ptr - start == 3
894 	|| (ptr - start == 2
895 	    && (start[0] >= '4' || (start[0] == '3' && start[1] >= '2')))) {
896       *endp = ptr;
897       return start;
898     }
899     start = ptr;
900   }
901   return 0;
902 }
903 
find_day(const char * start,const char * end,const char ** endp)904 static const char *find_day(const char *start, const char *end,
905 			    const char **endp)
906 {
907   for (;;) {
908     while (start < end && !csdigit(*start))
909       start++;
910     const char *ptr = start;
911     if (start == end)
912       break;
913     while (ptr < end && csdigit(*ptr))
914       ptr++;
915     if ((ptr - start == 1 && start[0] != '0')
916 	|| (ptr - start == 2 &&
917 	    (start[0] == '1'
918 	     || start[0] == '2'
919 	     || (start[0] == '3' && start[1] <= '1')
920 	     || (start[0] == '0' && start[1] != '0')))) {
921       *endp = ptr;
922       return start;
923     }
924     start = ptr;
925   }
926   return 0;
927 }
928 
find_month(const char * start,const char * end)929 static int find_month(const char *start, const char *end)
930 {
931   static const char *months[] = {
932     "january",
933     "february",
934     "march",
935     "april",
936     "may",
937     "june",
938     "july",
939     "august",
940     "september",
941     "october",
942     "november",
943     "december",
944   };
945   for (;;) {
946     while (start < end && !csalpha(*start))
947       start++;
948     const char *ptr = start;
949     if (start == end)
950       break;
951     while (ptr < end && csalpha(*ptr))
952       ptr++;
953     if (ptr - start >= 3) {
954       for (unsigned int i = 0; i < sizeof(months)/sizeof(months[0]); i++) {
955 	const char *q = months[i];
956 	const char *p = start;
957 	for (; p < ptr; p++, q++)
958 	  if (cmlower(*p) != *q)
959 	    break;
960 	if (p >= ptr)
961 	  return i;
962       }
963     }
964     start = ptr;
965   }
966   return -1;
967 }
968 
contains_field(char c) const969 int reference::contains_field(char c) const
970 {
971   return field_index[(unsigned char)c] != NULL_FIELD_INDEX;
972 }
973 
classify()974 int reference::classify()
975 {
976   if (contains_field('J'))
977     return JOURNAL_ARTICLE;
978   if (contains_field('B'))
979     return ARTICLE_IN_BOOK;
980   if (contains_field('G'))
981     return TECH_REPORT;
982   if (contains_field('R'))
983     return TECH_REPORT;
984   if (contains_field('I'))
985     return BOOK;
986   if (contains_field('M'))
987     return BELL_TM;
988   return OTHER;
989 }
990 
get_year(const char ** endp) const991 const char *reference::get_year(const char **endp) const
992 {
993   if (field_index['D'] != NULL_FIELD_INDEX) {
994     string &date = field[field_index['D']];
995     const char *start = date.contents();
996     const char *end = start + date.length();
997     return find_year(start, end, endp);
998   }
999   else
1000     return 0;
1001 }
1002 
get_field(unsigned char c,const char ** endp) const1003 const char *reference::get_field(unsigned char c, const char **endp) const
1004 {
1005   if (field_index[c] != NULL_FIELD_INDEX) {
1006     string &f = field[field_index[c]];
1007     const char *start = f.contents();
1008     *endp = start + f.length();
1009     return start;
1010   }
1011   else
1012     return 0;
1013 }
1014 
get_date(const char ** endp) const1015 const char *reference::get_date(const char **endp) const
1016 {
1017   return get_field('D', endp);
1018 }
1019 
nth_field(int i,const char * start,const char ** endp)1020 const char *nth_field(int i, const char *start, const char **endp)
1021 {
1022   while (--i >= 0) {
1023     start = (char *)memchr(start, FIELD_SEPARATOR, *endp - start);
1024     if (!start)
1025       return 0;
1026     start++;
1027   }
1028   const char *e = (char *)memchr(start, FIELD_SEPARATOR, *endp - start);
1029   if (e)
1030     *endp = e;
1031   return start;
1032 }
1033 
get_author(int i,const char ** endp) const1034 const char *reference::get_author(int i, const char **endp) const
1035 {
1036   for (const char *f = AUTHOR_FIELDS; *f != '\0'; f++) {
1037     const char *start = get_field(*f, endp);
1038     if (start) {
1039       if (strchr(MULTI_FIELD_NAMES, *f) != 0)
1040 	return nth_field(i, start, endp);
1041       else if (i == 0)
1042 	return start;
1043       else
1044 	return 0;
1045     }
1046   }
1047   return 0;
1048 }
1049 
get_author_last_name(int i,const char ** endp) const1050 const char *reference::get_author_last_name(int i, const char **endp) const
1051 {
1052   for (const char *f = AUTHOR_FIELDS; *f != '\0'; f++) {
1053     const char *start = get_field(*f, endp);
1054     if (start) {
1055       if (strchr(MULTI_FIELD_NAMES, *f) != 0) {
1056 	start = nth_field(i, start, endp);
1057 	if (!start)
1058 	  return 0;
1059       }
1060       if (*f == 'A')
1061 	return find_last_name(start, *endp, endp);
1062       else
1063 	return start;
1064     }
1065   }
1066   return 0;
1067 }
1068 
set_date(string & d)1069 void reference::set_date(string &d)
1070 {
1071   if (d.length() == 0)
1072     delete_field('D');
1073   else
1074     insert_field('D', d);
1075 }
1076 
same_year(const reference & r1,const reference & r2)1077 int same_year(const reference &r1, const reference &r2)
1078 {
1079   const char *ye1;
1080   const char *ys1 = r1.get_year(&ye1);
1081   const char *ye2;
1082   const char *ys2 = r2.get_year(&ye2);
1083   if (ys1 == 0) {
1084     if (ys2 == 0)
1085       return same_date(r1, r2);
1086     else
1087       return 0;
1088   }
1089   else if (ys2 == 0)
1090     return 0;
1091   else if (ye1 - ys1 != ye2 - ys2)
1092     return 0;
1093   else
1094     return memcmp(ys1, ys2, ye1 - ys1) == 0;
1095 }
1096 
same_date(const reference & r1,const reference & r2)1097 int same_date(const reference &r1, const reference &r2)
1098 {
1099   const char *e1;
1100   const char *s1 = r1.get_date(&e1);
1101   const char *e2;
1102   const char *s2 = r2.get_date(&e2);
1103   if (s1 == 0)
1104     return s2 == 0;
1105   else if (s2 == 0)
1106     return 0;
1107   else if (e1 - s1 != e2 - s2)
1108     return 0;
1109   else
1110     return memcmp(s1, s2, e1 - s1) == 0;
1111 }
1112 
get_sort_field(int i,int si,int ssi,const char ** endp) const1113 const char *reference::get_sort_field(int i, int si, int ssi,
1114 				      const char **endp) const
1115 {
1116   const char *start = sort_key.contents();
1117   const char *end = start + sort_key.length();
1118   if (i < 0) {
1119     *endp = end;
1120     return start;
1121   }
1122   while (--i >= 0) {
1123     start = (char *)memchr(start, SORT_SEP, end - start);
1124     if (!start)
1125       return 0;
1126     start++;
1127   }
1128   const char *e = (char *)memchr(start, SORT_SEP, end - start);
1129   if (e)
1130     end = e;
1131   if (si < 0) {
1132     *endp = end;
1133     return start;
1134   }
1135   while (--si >= 0) {
1136     start = (char *)memchr(start, SORT_SUB_SEP, end - start);
1137     if (!start)
1138       return 0;
1139     start++;
1140   }
1141   e = (char *)memchr(start, SORT_SUB_SEP, end - start);
1142   if (e)
1143     end = e;
1144   if (ssi < 0) {
1145     *endp = end;
1146     return start;
1147   }
1148   while (--ssi >= 0) {
1149     start = (char *)memchr(start, SORT_SUB_SUB_SEP, end - start);
1150     if (!start)
1151       return 0;
1152     start++;
1153   }
1154   e = (char *)memchr(start, SORT_SUB_SUB_SEP, end - start);
1155   if (e)
1156     end = e;
1157   *endp = end;
1158   return start;
1159 }
1160 
1161