1 // -*- C++ -*-
2 /* Copyright (C) 1989, 1990, 1991, 1992, 2001, 2003
3 Free Software Foundation, Inc.
4 Written by James Clark (jjc@jclark.com)
5
6 This file is part of groff.
7
8 groff is free software; you can redistribute it and/or modify it under
9 the terms of the GNU General Public License as published by the Free
10 Software Foundation; either version 2, or (at your option) any later
11 version.
12
13 groff is distributed in the hope that it will be useful, but WITHOUT ANY
14 WARRANTY; without even the implied warranty of MERCHANTABILITY or
15 FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License
16 for more details.
17
18 You should have received a copy of the GNU General Public License along
19 with groff; see the file COPYING. If not, write to the Free Software
20 Foundation, 51 Franklin St - Fifth Floor, Boston, MA 02110-1301, USA. */
21
22 #include "refer.h"
23 #include "refid.h"
24 #include "ref.h"
25 #include "token.h"
26
// Forward declarations of file-local helpers defined later in this file.
static const char *find_day(const char *, const char *, const char **);
static int find_month(const char *start, const char *end);
static void abbreviate_names(string &);

// NUL-separated list of words that sortify_title() drops when one of
// them is the first word of a title.
#define DEFAULT_ARTICLES "the\000a\000an"

// sizeof() counts the literal's terminating NUL, so every word in the
// list, including the last one, is NUL-terminated.
string articles(DEFAULT_ARTICLES, sizeof(DEFAULT_ARTICLES));

// Multiple occurrences of fields are separated by FIELD_SEPARATOR.
const char FIELD_SEPARATOR = '\0';

// Fields whose repeated occurrences are accumulated (joined with
// FIELD_SEPARATOR) instead of replacing the earlier value.
const char MULTI_FIELD_NAMES[] = "AE";
// Fields consulted, in this order, when an author is wanted.
const char *AUTHOR_FIELDS = "AQ";

// Values returned by reference::classify(); each one is also the index
// of the matching name in reference_types[].
enum { OTHER, JOURNAL_ARTICLE, BOOK, ARTICLE_IN_BOOK, TECH_REPORT, BELL_TM };

const char *reference_types[] = {
  "other",
  "journal-article",
  "book",
  "article-in-book",
  "tech-report",
  "bell-tm",
};

// Staging area indexed by field character.  The constructor and merge()
// collect field text here and then move it into a reference's field[]
// array.
static string temp_fields[256];
53
reference(const char * start,int len,reference_id * ridp)54 reference::reference(const char *start, int len, reference_id *ridp)
55 : h(0), merged(0), no(-1), field(0), nfields(0), label_ptr(0),
56 computed_authors(0), last_needed_author(-1), nauthors(-1)
57 {
58 int i;
59 for (i = 0; i < 256; i++)
60 field_index[i] = NULL_FIELD_INDEX;
61 if (ridp)
62 rid = *ridp;
63 if (start == 0)
64 return;
65 if (len <= 0)
66 return;
67 const char *end = start + len;
68 const char *ptr = start;
69 assert(*ptr == '%');
70 while (ptr < end) {
71 if (ptr + 1 < end && ptr[1] != '\0'
72 && ((ptr[1] != '%' && ptr[1] == annotation_field)
73 || (ptr + 2 < end && ptr[1] == '%' && ptr[2] != '\0'
74 && discard_fields.search(ptr[2]) < 0))) {
75 if (ptr[1] == '%')
76 ptr++;
77 string &f = temp_fields[(unsigned char)ptr[1]];
78 ptr += 2;
79 while (ptr < end && csspace(*ptr))
80 ptr++;
81 for (;;) {
82 for (;;) {
83 if (ptr >= end) {
84 f += '\n';
85 break;
86 }
87 f += *ptr;
88 if (*ptr++ == '\n')
89 break;
90 }
91 if (ptr >= end || *ptr == '%')
92 break;
93 }
94 }
95 else if (ptr + 1 < end && ptr[1] != '\0' && ptr[1] != '%'
96 && discard_fields.search(ptr[1]) < 0) {
97 string &f = temp_fields[(unsigned char)ptr[1]];
98 if (f.length() > 0) {
99 if (strchr(MULTI_FIELD_NAMES, ptr[1]) != 0)
100 f += FIELD_SEPARATOR;
101 else
102 f.clear();
103 }
104 ptr += 2;
105 if (ptr < end) {
106 if (*ptr == ' ')
107 ptr++;
108 for (;;) {
109 const char *p = ptr;
110 while (ptr < end && *ptr != '\n')
111 ptr++;
112 // strip trailing white space
113 const char *q = ptr;
114 while (q > p && q[-1] != '\n' && csspace(q[-1]))
115 q--;
116 while (p < q)
117 f += *p++;
118 if (ptr >= end)
119 break;
120 ptr++;
121 if (ptr >= end)
122 break;
123 if (*ptr == '%')
124 break;
125 f += ' ';
126 }
127 }
128 }
129 else {
130 // skip this field
131 for (;;) {
132 while (ptr < end && *ptr++ != '\n')
133 ;
134 if (ptr >= end || *ptr == '%')
135 break;
136 }
137 }
138 }
139 for (i = 0; i < 256; i++)
140 if (temp_fields[i].length() > 0)
141 nfields++;
142 field = new string[nfields];
143 int j = 0;
144 for (i = 0; i < 256; i++)
145 if (temp_fields[i].length() > 0) {
146 field[j].move(temp_fields[i]);
147 if (abbreviate_fields.search(i) >= 0)
148 abbreviate_names(field[j]);
149 field_index[i] = j;
150 j++;
151 }
152 }
153
~reference()154 reference::~reference()
155 {
156 if (nfields > 0)
157 ad_delete(nfields) field;
158 }
159
160 // ref is the inline, this is the database ref
161
merge(reference & ref)162 void reference::merge(reference &ref)
163 {
164 int i;
165 for (i = 0; i < 256; i++)
166 if (field_index[i] != NULL_FIELD_INDEX)
167 temp_fields[i].move(field[field_index[i]]);
168 for (i = 0; i < 256; i++)
169 if (ref.field_index[i] != NULL_FIELD_INDEX)
170 temp_fields[i].move(ref.field[ref.field_index[i]]);
171 for (i = 0; i < 256; i++)
172 field_index[i] = NULL_FIELD_INDEX;
173 int old_nfields = nfields;
174 nfields = 0;
175 for (i = 0; i < 256; i++)
176 if (temp_fields[i].length() > 0)
177 nfields++;
178 if (nfields != old_nfields) {
179 if (old_nfields > 0)
180 ad_delete(old_nfields) field;
181 field = new string[nfields];
182 }
183 int j = 0;
184 for (i = 0; i < 256; i++)
185 if (temp_fields[i].length() > 0) {
186 field[j].move(temp_fields[i]);
187 field_index[i] = j;
188 j++;
189 }
190 merged = 1;
191 }
192
insert_field(unsigned char c,string & s)193 void reference::insert_field(unsigned char c, string &s)
194 {
195 assert(s.length() > 0);
196 if (field_index[c] != NULL_FIELD_INDEX) {
197 field[field_index[c]].move(s);
198 return;
199 }
200 assert(field_index[c] == NULL_FIELD_INDEX);
201 string *old_field = field;
202 field = new string[nfields + 1];
203 int pos = 0;
204 int i;
205 for (i = 0; i < int(c); i++)
206 if (field_index[i] != NULL_FIELD_INDEX)
207 pos++;
208 for (i = 0; i < pos; i++)
209 field[i].move(old_field[i]);
210 field[pos].move(s);
211 for (i = pos; i < nfields; i++)
212 field[i + 1].move(old_field[i]);
213 if (nfields > 0)
214 ad_delete(nfields) old_field;
215 nfields++;
216 field_index[c] = pos;
217 for (i = c + 1; i < 256; i++)
218 if (field_index[i] != NULL_FIELD_INDEX)
219 field_index[i] += 1;
220 }
221
delete_field(unsigned char c)222 void reference::delete_field(unsigned char c)
223 {
224 if (field_index[c] == NULL_FIELD_INDEX)
225 return;
226 string *old_field = field;
227 field = new string[nfields - 1];
228 int i;
229 for (i = 0; i < int(field_index[c]); i++)
230 field[i].move(old_field[i]);
231 for (i = field_index[c]; i < nfields - 1; i++)
232 field[i].move(old_field[i + 1]);
233 if (nfields > 0)
234 ad_delete(nfields) old_field;
235 nfields--;
236 field_index[c] = NULL_FIELD_INDEX;
237 for (i = c + 1; i < 256; i++)
238 if (field_index[i] != NULL_FIELD_INDEX)
239 field_index[i] -= 1;
240 }
241
compute_hash_code()242 void reference::compute_hash_code()
243 {
244 if (!rid.is_null())
245 h = rid.hash();
246 else {
247 h = 0;
248 for (int i = 0; i < nfields; i++)
249 if (field[i].length() > 0) {
250 h <<= 4;
251 h ^= hash_string(field[i].contents(), field[i].length());
252 }
253 }
254 }
255
set_number(int n)256 void reference::set_number(int n)
257 {
258 no = n;
259 }
260
// Separators embedded in sort keys.  compute_sort_key() puts SORT_SEP
// in front of the key of each sort field, sortify_field() puts
// SORT_SUB_SEP between the occurrences of a field, and sortify_name()
// puts SORT_SUB_SUB_SEP between the parts of a name.
const char SORT_SEP = '\001';
const char SORT_SUB_SEP = '\002';
const char SORT_SUB_SUB_SEP = '\003';
264
265 // sep specifies additional word separators
266
sortify_words(const char * s,const char * end,const char * sep,string & result)267 void sortify_words(const char *s, const char *end, const char *sep,
268 string &result)
269 {
270 int non_empty = 0;
271 int need_separator = 0;
272 for (;;) {
273 const char *token_start = s;
274 if (!get_token(&s, end))
275 break;
276 if ((s - token_start == 1
277 && (*token_start == ' '
278 || *token_start == '\n'
279 || (sep && *token_start != '\0'
280 && strchr(sep, *token_start) != 0)))
281 || (s - token_start == 2
282 && token_start[0] == '\\' && token_start[1] == ' ')) {
283 if (non_empty)
284 need_separator = 1;
285 }
286 else {
287 const token_info *ti = lookup_token(token_start, s);
288 if (ti->sortify_non_empty(token_start, s)) {
289 if (need_separator) {
290 result += ' ';
291 need_separator = 0;
292 }
293 ti->sortify(token_start, s, result);
294 non_empty = 1;
295 }
296 }
297 }
298 }
299
sortify_word(const char * s,const char * end,string & result)300 void sortify_word(const char *s, const char *end, string &result)
301 {
302 for (;;) {
303 const char *token_start = s;
304 if (!get_token(&s, end))
305 break;
306 const token_info *ti = lookup_token(token_start, s);
307 ti->sortify(token_start, s, result);
308 }
309 }
310
sortify_other(const char * s,int len,string & key)311 void sortify_other(const char *s, int len, string &key)
312 {
313 sortify_words(s, s + len, 0, key);
314 }
315
sortify_title(const char * s,int len,string & key)316 void sortify_title(const char *s, int len, string &key)
317 {
318 const char *end = s + len;
319 for (; s < end && (*s == ' ' || *s == '\n'); s++)
320 ;
321 const char *ptr = s;
322 for (;;) {
323 const char *token_start = ptr;
324 if (!get_token(&ptr, end))
325 break;
326 if (ptr - token_start == 1
327 && (*token_start == ' ' || *token_start == '\n'))
328 break;
329 }
330 if (ptr < end) {
331 unsigned int first_word_len = ptr - s - 1;
332 const char *ae = articles.contents() + articles.length();
333 for (const char *a = articles.contents();
334 a < ae;
335 a = strchr(a, '\0') + 1)
336 if (first_word_len == strlen(a)) {
337 unsigned int j;
338 for (j = 0; j < first_word_len; j++)
339 if (a[j] != cmlower(s[j]))
340 break;
341 if (j >= first_word_len) {
342 s = ptr;
343 for (; s < end && (*s == ' ' || *s == '\n'); s++)
344 ;
345 break;
346 }
347 }
348 }
349 sortify_words(s, end, 0, key);
350 }
351
sortify_name(const char * s,int len,string & key)352 void sortify_name(const char *s, int len, string &key)
353 {
354 const char *last_name_end;
355 const char *last_name = find_last_name(s, s + len, &last_name_end);
356 sortify_word(last_name, last_name_end, key);
357 key += SORT_SUB_SUB_SEP;
358 if (last_name > s)
359 sortify_words(s, last_name, ".", key);
360 key += SORT_SUB_SUB_SEP;
361 if (last_name_end < s + len)
362 sortify_words(last_name_end, s + len, ".,", key);
363 }
364
sortify_date(const char * s,int len,string & key)365 void sortify_date(const char *s, int len, string &key)
366 {
367 const char *year_end;
368 const char *year_start = find_year(s, s + len, &year_end);
369 if (!year_start) {
370 // Things without years are often `forthcoming', so it makes sense
371 // that they sort after things with explicit years.
372 key += 'A';
373 sortify_words(s, s + len, 0, key);
374 return;
375 }
376 int n = year_end - year_start;
377 while (n < 4) {
378 key += '0';
379 n++;
380 }
381 while (year_start < year_end)
382 key += *year_start++;
383 int m = find_month(s, s + len);
384 if (m < 0)
385 return;
386 key += 'A' + m;
387 const char *day_end;
388 const char *day_start = find_day(s, s + len, &day_end);
389 if (!day_start)
390 return;
391 if (day_end - day_start == 1)
392 key += '0';
393 while (day_start < day_end)
394 key += *day_start++;
395 }
396
397 // SORT_{SUB,SUB_SUB}_SEP can creep in from use of @ in label specification.
398
sortify_label(const char * s,int len,string & key)399 void sortify_label(const char *s, int len, string &key)
400 {
401 const char *end = s + len;
402 for (;;) {
403 const char *ptr;
404 for (ptr = s;
405 ptr < end && *ptr != SORT_SUB_SEP && *ptr != SORT_SUB_SUB_SEP;
406 ptr++)
407 ;
408 if (ptr > s)
409 sortify_words(s, ptr, 0, key);
410 s = ptr;
411 if (s >= end)
412 break;
413 key += *s++;
414 }
415 }
416
compute_sort_key()417 void reference::compute_sort_key()
418 {
419 if (sort_fields.length() == 0)
420 return;
421 sort_fields += '\0';
422 const char *sf = sort_fields.contents();
423 while (*sf != '\0') {
424 sort_key += SORT_SEP;
425 char f = *sf++;
426 int n = 1;
427 if (*sf == '+') {
428 n = INT_MAX;
429 sf++;
430 }
431 else if (csdigit(*sf)) {
432 char *ptr;
433 long l = strtol(sf, &ptr, 10);
434 if (l == 0 && ptr == sf)
435 ;
436 else {
437 sf = ptr;
438 if (l < 0) {
439 n = 1;
440 }
441 else {
442 n = int(l);
443 }
444 }
445 }
446 if (f == '.')
447 sortify_label(label.contents(), label.length(), sort_key);
448 else if (f == AUTHOR_FIELDS[0])
449 sortify_authors(n, sort_key);
450 else
451 sortify_field(f, n, sort_key);
452 }
453 sort_fields.set_length(sort_fields.length() - 1);
454 }
455
sortify_authors(int n,string & result) const456 void reference::sortify_authors(int n, string &result) const
457 {
458 for (const char *p = AUTHOR_FIELDS; *p != '\0'; p++)
459 if (contains_field(*p)) {
460 sortify_field(*p, n, result);
461 return;
462 }
463 sortify_field(AUTHOR_FIELDS[0], n, result);
464 }
465
canonicalize_authors(string & result) const466 void reference::canonicalize_authors(string &result) const
467 {
468 int len = result.length();
469 sortify_authors(INT_MAX, result);
470 if (result.length() > len)
471 result += SORT_SUB_SEP;
472 }
473
sortify_field(unsigned char f,int n,string & result) const474 void reference::sortify_field(unsigned char f, int n, string &result) const
475 {
476 typedef void (*sortify_t)(const char *, int, string &);
477 sortify_t sortifier = sortify_other;
478 switch (f) {
479 case 'A':
480 case 'E':
481 sortifier = sortify_name;
482 break;
483 case 'D':
484 sortifier = sortify_date;
485 break;
486 case 'B':
487 case 'J':
488 case 'T':
489 sortifier = sortify_title;
490 break;
491 }
492 int fi = field_index[(unsigned char)f];
493 if (fi != NULL_FIELD_INDEX) {
494 string &str = field[fi];
495 const char *start = str.contents();
496 const char *end = start + str.length();
497 for (int i = 0; i < n && start < end; i++) {
498 const char *p = start;
499 while (start < end && *start != FIELD_SEPARATOR)
500 start++;
501 if (i > 0)
502 result += SORT_SUB_SEP;
503 (*sortifier)(p, start - p, result);
504 if (start < end)
505 start++;
506 }
507 }
508 }
509
compare_reference(const reference & r1,const reference & r2)510 int compare_reference(const reference &r1, const reference &r2)
511 {
512 assert(r1.no >= 0);
513 assert(r2.no >= 0);
514 const char *s1 = r1.sort_key.contents();
515 int n1 = r1.sort_key.length();
516 const char *s2 = r2.sort_key.contents();
517 int n2 = r2.sort_key.length();
518 for (; n1 > 0 && n2 > 0; --n1, --n2, ++s1, ++s2)
519 if (*s1 != *s2)
520 return (int)(unsigned char)*s1 - (int)(unsigned char)*s2;
521 if (n2 > 0)
522 return -1;
523 if (n1 > 0)
524 return 1;
525 return r1.no - r2.no;
526 }
527
same_reference(const reference & r1,const reference & r2)528 int same_reference(const reference &r1, const reference &r2)
529 {
530 if (!r1.rid.is_null() && r1.rid == r2.rid)
531 return 1;
532 if (r1.h != r2.h)
533 return 0;
534 if (r1.nfields != r2.nfields)
535 return 0;
536 int i = 0;
537 for (i = 0; i < 256; i++)
538 if (r1.field_index != r2.field_index)
539 return 0;
540 for (i = 0; i < r1.nfields; i++)
541 if (r1.field[i] != r2.field[i])
542 return 0;
543 return 1;
544 }
545
find_last_name(const char * start,const char * end,const char ** endp)546 const char *find_last_name(const char *start, const char *end,
547 const char **endp)
548 {
549 const char *ptr = start;
550 const char *last_word = start;
551 for (;;) {
552 const char *token_start = ptr;
553 if (!get_token(&ptr, end))
554 break;
555 if (ptr - token_start == 1) {
556 if (*token_start == ',') {
557 *endp = token_start;
558 return last_word;
559 }
560 else if (*token_start == ' ' || *token_start == '\n') {
561 if (ptr < end && *ptr != ' ' && *ptr != '\n')
562 last_word = ptr;
563 }
564 }
565 }
566 *endp = end;
567 return last_word;
568 }
569
abbreviate_name(const char * ptr,const char * end,string & result)570 void abbreviate_name(const char *ptr, const char *end, string &result)
571 {
572 const char *last_name_end;
573 const char *last_name_start = find_last_name(ptr, end, &last_name_end);
574 int need_period = 0;
575 for (;;) {
576 const char *token_start = ptr;
577 if (!get_token(&ptr, last_name_start))
578 break;
579 const token_info *ti = lookup_token(token_start, ptr);
580 if (need_period) {
581 if ((ptr - token_start == 1 && *token_start == ' ')
582 || (ptr - token_start == 2 && token_start[0] == '\\'
583 && token_start[1] == ' '))
584 continue;
585 if (ti->is_upper())
586 result += period_before_initial;
587 else
588 result += period_before_other;
589 need_period = 0;
590 }
591 result.append(token_start, ptr - token_start);
592 if (ti->is_upper()) {
593 const char *lower_ptr = ptr;
594 int first_token = 1;
595 for (;;) {
596 token_start = ptr;
597 if (!get_token(&ptr, last_name_start))
598 break;
599 if ((ptr - token_start == 1 && *token_start == ' ')
600 || (ptr - token_start == 2 && token_start[0] == '\\'
601 && token_start[1] == ' '))
602 break;
603 ti = lookup_token(token_start, ptr);
604 if (ti->is_hyphen()) {
605 const char *ptr1 = ptr;
606 if (get_token(&ptr1, last_name_start)) {
607 ti = lookup_token(ptr, ptr1);
608 if (ti->is_upper()) {
609 result += period_before_hyphen;
610 result.append(token_start, ptr1 - token_start);
611 ptr = ptr1;
612 }
613 }
614 }
615 else if (ti->is_upper()) {
616 // MacDougal -> MacD.
617 result.append(lower_ptr, ptr - lower_ptr);
618 lower_ptr = ptr;
619 first_token = 1;
620 }
621 else if (first_token && ti->is_accent()) {
622 result.append(token_start, ptr - token_start);
623 lower_ptr = ptr;
624 }
625 first_token = 0;
626 }
627 need_period = 1;
628 }
629 }
630 if (need_period)
631 result += period_before_last_name;
632 result.append(last_name_start, end - last_name_start);
633 }
634
abbreviate_names(string & result)635 static void abbreviate_names(string &result)
636 {
637 string str;
638 str.move(result);
639 const char *ptr = str.contents();
640 const char *end = ptr + str.length();
641 while (ptr < end) {
642 const char *name_end = (char *)memchr(ptr, FIELD_SEPARATOR, end - ptr);
643 if (name_end == 0)
644 name_end = end;
645 abbreviate_name(ptr, name_end, result);
646 if (name_end >= end)
647 break;
648 ptr = name_end + 1;
649 result += FIELD_SEPARATOR;
650 }
651 }
652
reverse_name(const char * ptr,const char * name_end,string & result)653 void reverse_name(const char *ptr, const char *name_end, string &result)
654 {
655 const char *last_name_end;
656 const char *last_name_start = find_last_name(ptr, name_end, &last_name_end);
657 result.append(last_name_start, last_name_end - last_name_start);
658 while (last_name_start > ptr
659 && (last_name_start[-1] == ' ' || last_name_start[-1] == '\n'))
660 last_name_start--;
661 if (last_name_start > ptr) {
662 result += ", ";
663 result.append(ptr, last_name_start - ptr);
664 }
665 if (last_name_end < name_end)
666 result.append(last_name_end, name_end - last_name_end);
667 }
668
reverse_names(string & result,int n)669 void reverse_names(string &result, int n)
670 {
671 if (n <= 0)
672 return;
673 string str;
674 str.move(result);
675 const char *ptr = str.contents();
676 const char *end = ptr + str.length();
677 while (ptr < end) {
678 if (--n < 0) {
679 result.append(ptr, end - ptr);
680 break;
681 }
682 const char *name_end = (char *)memchr(ptr, FIELD_SEPARATOR, end - ptr);
683 if (name_end == 0)
684 name_end = end;
685 reverse_name(ptr, name_end, result);
686 if (name_end >= end)
687 break;
688 ptr = name_end + 1;
689 result += FIELD_SEPARATOR;
690 }
691 }
692
693 // Return number of field separators.
694
join_fields(string & f)695 int join_fields(string &f)
696 {
697 const char *ptr = f.contents();
698 int len = f.length();
699 int nfield_seps = 0;
700 int j;
701 for (j = 0; j < len; j++)
702 if (ptr[j] == FIELD_SEPARATOR)
703 nfield_seps++;
704 if (nfield_seps == 0)
705 return 0;
706 string temp;
707 int field_seps_left = nfield_seps;
708 for (j = 0; j < len; j++) {
709 if (ptr[j] == FIELD_SEPARATOR) {
710 if (nfield_seps == 1)
711 temp += join_authors_exactly_two;
712 else if (--field_seps_left == 0)
713 temp += join_authors_last_two;
714 else
715 temp += join_authors_default;
716 }
717 else
718 temp += ptr[j];
719 }
720 f = temp;
721 return nfield_seps;
722 }
723
uppercase(const char * start,const char * end,string & result)724 void uppercase(const char *start, const char *end, string &result)
725 {
726 for (;;) {
727 const char *token_start = start;
728 if (!get_token(&start, end))
729 break;
730 const token_info *ti = lookup_token(token_start, start);
731 ti->upper_case(token_start, start, result);
732 }
733 }
734
lowercase(const char * start,const char * end,string & result)735 void lowercase(const char *start, const char *end, string &result)
736 {
737 for (;;) {
738 const char *token_start = start;
739 if (!get_token(&start, end))
740 break;
741 const token_info *ti = lookup_token(token_start, start);
742 ti->lower_case(token_start, start, result);
743 }
744 }
745
capitalize(const char * ptr,const char * end,string & result)746 void capitalize(const char *ptr, const char *end, string &result)
747 {
748 int in_small_point_size = 0;
749 for (;;) {
750 const char *start = ptr;
751 if (!get_token(&ptr, end))
752 break;
753 const token_info *ti = lookup_token(start, ptr);
754 const char *char_end = ptr;
755 int is_lower = ti->is_lower();
756 if ((is_lower || ti->is_upper()) && get_token(&ptr, end)) {
757 const token_info *ti2 = lookup_token(char_end, ptr);
758 if (!ti2->is_accent())
759 ptr = char_end;
760 }
761 if (is_lower) {
762 if (!in_small_point_size) {
763 result += "\\s-2";
764 in_small_point_size = 1;
765 }
766 ti->upper_case(start, char_end, result);
767 result.append(char_end, ptr - char_end);
768 }
769 else {
770 if (in_small_point_size) {
771 result += "\\s+2";
772 in_small_point_size = 0;
773 }
774 result.append(start, ptr - start);
775 }
776 }
777 if (in_small_point_size)
778 result += "\\s+2";
779 }
780
capitalize_field(string & str)781 void capitalize_field(string &str)
782 {
783 string temp;
784 capitalize(str.contents(), str.contents() + str.length(), temp);
785 str.move(temp);
786 }
787
is_terminated(const char * ptr,const char * end)788 int is_terminated(const char *ptr, const char *end)
789 {
790 const char *last_token = end;
791 for (;;) {
792 const char *p = ptr;
793 if (!get_token(&ptr, end))
794 break;
795 last_token = p;
796 }
797 return end - last_token == 1
798 && (*last_token == '.' || *last_token == '!' || *last_token == '?');
799 }
800
output(FILE * fp)801 void reference::output(FILE *fp)
802 {
803 fputs(".]-\n", fp);
804 for (int i = 0; i < 256; i++)
805 if (field_index[i] != NULL_FIELD_INDEX && i != annotation_field) {
806 string &f = field[field_index[i]];
807 if (!csdigit(i)) {
808 int j = reverse_fields.search(i);
809 if (j >= 0) {
810 int n;
811 int len = reverse_fields.length();
812 if (++j < len && csdigit(reverse_fields[j])) {
813 n = reverse_fields[j] - '0';
814 for (++j; j < len && csdigit(reverse_fields[j]); j++)
815 // should check for overflow
816 n = n*10 + reverse_fields[j] - '0';
817 }
818 else
819 n = INT_MAX;
820 reverse_names(f, n);
821 }
822 }
823 int is_multiple = join_fields(f) > 0;
824 if (capitalize_fields.search(i) >= 0)
825 capitalize_field(f);
826 if (memchr(f.contents(), '\n', f.length()) == 0) {
827 fprintf(fp, ".ds [%c ", i);
828 if (f[0] == ' ' || f[0] == '\\' || f[0] == '"')
829 putc('"', fp);
830 put_string(f, fp);
831 putc('\n', fp);
832 }
833 else {
834 fprintf(fp, ".de [%c\n", i);
835 put_string(f, fp);
836 fputs("..\n", fp);
837 }
838 if (i == 'P') {
839 int multiple_pages = 0;
840 const char *s = f.contents();
841 const char *end = f.contents() + f.length();
842 for (;;) {
843 const char *token_start = s;
844 if (!get_token(&s, end))
845 break;
846 const token_info *ti = lookup_token(token_start, s);
847 if (ti->is_hyphen() || ti->is_range_sep()) {
848 multiple_pages = 1;
849 break;
850 }
851 }
852 fprintf(fp, ".nr [P %d\n", multiple_pages);
853 }
854 else if (i == 'E')
855 fprintf(fp, ".nr [E %d\n", is_multiple);
856 }
857 for (const char *p = "TAO"; *p; p++) {
858 int fi = field_index[(unsigned char)*p];
859 if (fi != NULL_FIELD_INDEX) {
860 string &f = field[fi];
861 fprintf(fp, ".nr [%c %d\n", *p,
862 is_terminated(f.contents(), f.contents() + f.length()));
863 }
864 }
865 int t = classify();
866 fprintf(fp, ".][ %d %s\n", t, reference_types[t]);
867 if (annotation_macro.length() > 0 && annotation_field >= 0
868 && field_index[annotation_field] != NULL_FIELD_INDEX) {
869 putc('.', fp);
870 put_string(annotation_macro, fp);
871 putc('\n', fp);
872 put_string(field[field_index[annotation_field]], fp);
873 }
874 }
875
print_sort_key_comment(FILE * fp)876 void reference::print_sort_key_comment(FILE *fp)
877 {
878 fputs(".\\\"", fp);
879 put_string(sort_key, fp);
880 putc('\n', fp);
881 }
882
find_year(const char * start,const char * end,const char ** endp)883 const char *find_year(const char *start, const char *end, const char **endp)
884 {
885 for (;;) {
886 while (start < end && !csdigit(*start))
887 start++;
888 const char *ptr = start;
889 if (start == end)
890 break;
891 while (ptr < end && csdigit(*ptr))
892 ptr++;
893 if (ptr - start == 4 || ptr - start == 3
894 || (ptr - start == 2
895 && (start[0] >= '4' || (start[0] == '3' && start[1] >= '2')))) {
896 *endp = ptr;
897 return start;
898 }
899 start = ptr;
900 }
901 return 0;
902 }
903
find_day(const char * start,const char * end,const char ** endp)904 static const char *find_day(const char *start, const char *end,
905 const char **endp)
906 {
907 for (;;) {
908 while (start < end && !csdigit(*start))
909 start++;
910 const char *ptr = start;
911 if (start == end)
912 break;
913 while (ptr < end && csdigit(*ptr))
914 ptr++;
915 if ((ptr - start == 1 && start[0] != '0')
916 || (ptr - start == 2 &&
917 (start[0] == '1'
918 || start[0] == '2'
919 || (start[0] == '3' && start[1] <= '1')
920 || (start[0] == '0' && start[1] != '0')))) {
921 *endp = ptr;
922 return start;
923 }
924 start = ptr;
925 }
926 return 0;
927 }
928
find_month(const char * start,const char * end)929 static int find_month(const char *start, const char *end)
930 {
931 static const char *months[] = {
932 "january",
933 "february",
934 "march",
935 "april",
936 "may",
937 "june",
938 "july",
939 "august",
940 "september",
941 "october",
942 "november",
943 "december",
944 };
945 for (;;) {
946 while (start < end && !csalpha(*start))
947 start++;
948 const char *ptr = start;
949 if (start == end)
950 break;
951 while (ptr < end && csalpha(*ptr))
952 ptr++;
953 if (ptr - start >= 3) {
954 for (unsigned int i = 0; i < sizeof(months)/sizeof(months[0]); i++) {
955 const char *q = months[i];
956 const char *p = start;
957 for (; p < ptr; p++, q++)
958 if (cmlower(*p) != *q)
959 break;
960 if (p >= ptr)
961 return i;
962 }
963 }
964 start = ptr;
965 }
966 return -1;
967 }
968
contains_field(char c) const969 int reference::contains_field(char c) const
970 {
971 return field_index[(unsigned char)c] != NULL_FIELD_INDEX;
972 }
973
classify()974 int reference::classify()
975 {
976 if (contains_field('J'))
977 return JOURNAL_ARTICLE;
978 if (contains_field('B'))
979 return ARTICLE_IN_BOOK;
980 if (contains_field('G'))
981 return TECH_REPORT;
982 if (contains_field('R'))
983 return TECH_REPORT;
984 if (contains_field('I'))
985 return BOOK;
986 if (contains_field('M'))
987 return BELL_TM;
988 return OTHER;
989 }
990
get_year(const char ** endp) const991 const char *reference::get_year(const char **endp) const
992 {
993 if (field_index['D'] != NULL_FIELD_INDEX) {
994 string &date = field[field_index['D']];
995 const char *start = date.contents();
996 const char *end = start + date.length();
997 return find_year(start, end, endp);
998 }
999 else
1000 return 0;
1001 }
1002
get_field(unsigned char c,const char ** endp) const1003 const char *reference::get_field(unsigned char c, const char **endp) const
1004 {
1005 if (field_index[c] != NULL_FIELD_INDEX) {
1006 string &f = field[field_index[c]];
1007 const char *start = f.contents();
1008 *endp = start + f.length();
1009 return start;
1010 }
1011 else
1012 return 0;
1013 }
1014
get_date(const char ** endp) const1015 const char *reference::get_date(const char **endp) const
1016 {
1017 return get_field('D', endp);
1018 }
1019
nth_field(int i,const char * start,const char ** endp)1020 const char *nth_field(int i, const char *start, const char **endp)
1021 {
1022 while (--i >= 0) {
1023 start = (char *)memchr(start, FIELD_SEPARATOR, *endp - start);
1024 if (!start)
1025 return 0;
1026 start++;
1027 }
1028 const char *e = (char *)memchr(start, FIELD_SEPARATOR, *endp - start);
1029 if (e)
1030 *endp = e;
1031 return start;
1032 }
1033
get_author(int i,const char ** endp) const1034 const char *reference::get_author(int i, const char **endp) const
1035 {
1036 for (const char *f = AUTHOR_FIELDS; *f != '\0'; f++) {
1037 const char *start = get_field(*f, endp);
1038 if (start) {
1039 if (strchr(MULTI_FIELD_NAMES, *f) != 0)
1040 return nth_field(i, start, endp);
1041 else if (i == 0)
1042 return start;
1043 else
1044 return 0;
1045 }
1046 }
1047 return 0;
1048 }
1049
get_author_last_name(int i,const char ** endp) const1050 const char *reference::get_author_last_name(int i, const char **endp) const
1051 {
1052 for (const char *f = AUTHOR_FIELDS; *f != '\0'; f++) {
1053 const char *start = get_field(*f, endp);
1054 if (start) {
1055 if (strchr(MULTI_FIELD_NAMES, *f) != 0) {
1056 start = nth_field(i, start, endp);
1057 if (!start)
1058 return 0;
1059 }
1060 if (*f == 'A')
1061 return find_last_name(start, *endp, endp);
1062 else
1063 return start;
1064 }
1065 }
1066 return 0;
1067 }
1068
set_date(string & d)1069 void reference::set_date(string &d)
1070 {
1071 if (d.length() == 0)
1072 delete_field('D');
1073 else
1074 insert_field('D', d);
1075 }
1076
same_year(const reference & r1,const reference & r2)1077 int same_year(const reference &r1, const reference &r2)
1078 {
1079 const char *ye1;
1080 const char *ys1 = r1.get_year(&ye1);
1081 const char *ye2;
1082 const char *ys2 = r2.get_year(&ye2);
1083 if (ys1 == 0) {
1084 if (ys2 == 0)
1085 return same_date(r1, r2);
1086 else
1087 return 0;
1088 }
1089 else if (ys2 == 0)
1090 return 0;
1091 else if (ye1 - ys1 != ye2 - ys2)
1092 return 0;
1093 else
1094 return memcmp(ys1, ys2, ye1 - ys1) == 0;
1095 }
1096
same_date(const reference & r1,const reference & r2)1097 int same_date(const reference &r1, const reference &r2)
1098 {
1099 const char *e1;
1100 const char *s1 = r1.get_date(&e1);
1101 const char *e2;
1102 const char *s2 = r2.get_date(&e2);
1103 if (s1 == 0)
1104 return s2 == 0;
1105 else if (s2 == 0)
1106 return 0;
1107 else if (e1 - s1 != e2 - s2)
1108 return 0;
1109 else
1110 return memcmp(s1, s2, e1 - s1) == 0;
1111 }
1112
get_sort_field(int i,int si,int ssi,const char ** endp) const1113 const char *reference::get_sort_field(int i, int si, int ssi,
1114 const char **endp) const
1115 {
1116 const char *start = sort_key.contents();
1117 const char *end = start + sort_key.length();
1118 if (i < 0) {
1119 *endp = end;
1120 return start;
1121 }
1122 while (--i >= 0) {
1123 start = (char *)memchr(start, SORT_SEP, end - start);
1124 if (!start)
1125 return 0;
1126 start++;
1127 }
1128 const char *e = (char *)memchr(start, SORT_SEP, end - start);
1129 if (e)
1130 end = e;
1131 if (si < 0) {
1132 *endp = end;
1133 return start;
1134 }
1135 while (--si >= 0) {
1136 start = (char *)memchr(start, SORT_SUB_SEP, end - start);
1137 if (!start)
1138 return 0;
1139 start++;
1140 }
1141 e = (char *)memchr(start, SORT_SUB_SEP, end - start);
1142 if (e)
1143 end = e;
1144 if (ssi < 0) {
1145 *endp = end;
1146 return start;
1147 }
1148 while (--ssi >= 0) {
1149 start = (char *)memchr(start, SORT_SUB_SUB_SEP, end - start);
1150 if (!start)
1151 return 0;
1152 start++;
1153 }
1154 e = (char *)memchr(start, SORT_SUB_SUB_SEP, end - start);
1155 if (e)
1156 end = e;
1157 *endp = end;
1158 return start;
1159 }
1160
1161