xref: /dragonfly/contrib/mdocml/mandoc.c (revision 1e4d43f9c96723e4e55543d240f182e1aac9a4c2)
1 /*        $Id: mandoc.c,v 1.119 2021/08/10 12:55:03 schwarze Exp $ */
2 /*
3  * Copyright (c) 2008-2011, 2014 Kristaps Dzonsons <kristaps@bsd.lv>
4  * Copyright (c) 2011-2015, 2017-2021 Ingo Schwarze <schwarze@openbsd.org>
5  *
6  * Permission to use, copy, modify, and distribute this software for any
7  * purpose with or without fee is hereby granted, provided that the above
8  * copyright notice and this permission notice appear in all copies.
9  *
10  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
11  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
13  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17  */
18 #include "config.h"
19 
20 #include <sys/types.h>
21 
22 #include <assert.h>
23 #include <ctype.h>
24 #include <errno.h>
25 #include <limits.h>
26 #include <stdlib.h>
27 #include <stdio.h>
28 #include <string.h>
29 #include <time.h>
30 
31 #include "mandoc_aux.h"
32 #include "mandoc.h"
33 #include "roff.h"
34 #include "libmandoc.h"
35 #include "roff_int.h"
36 
/*
 * Forward declarations of the file-local date helpers;
 * both are defined further down in this file.
 */
37 static    int        a2time(time_t *, const char *, const char *);
38 static    char      *time2a(time_t);
39 
40 
41 enum mandoc_esc
mandoc_font(const char * cp,int sz)42 mandoc_font(const char *cp, int sz)
43 {
44           switch (sz) {
45           case 0:
46                     return ESCAPE_FONTPREV;
47           case 1:
48                     switch (cp[0]) {
49                     case 'B':
50                     case '3':
51                               return ESCAPE_FONTBOLD;
52                     case 'I':
53                     case '2':
54                               return ESCAPE_FONTITALIC;
55                     case 'P':
56                               return ESCAPE_FONTPREV;
57                     case 'R':
58                     case '1':
59                               return ESCAPE_FONTROMAN;
60                     case '4':
61                               return ESCAPE_FONTBI;
62                     default:
63                               return ESCAPE_ERROR;
64                     }
65           case 2:
66                     switch (cp[0]) {
67                     case 'B':
68                               switch (cp[1]) {
69                               case 'I':
70                                         return ESCAPE_FONTBI;
71                               default:
72                                         return ESCAPE_ERROR;
73                               }
74                     case 'C':
75                               switch (cp[1]) {
76                               case 'B':
77                                         return ESCAPE_FONTCB;
78                               case 'I':
79                                         return ESCAPE_FONTCI;
80                               case 'R':
81                               case 'W':
82                                         return ESCAPE_FONTCR;
83                               default:
84                                         return ESCAPE_ERROR;
85                               }
86                     default:
87                               return ESCAPE_ERROR;
88                     }
89           default:
90                     return ESCAPE_ERROR;
91           }
92 }
93 
94 enum mandoc_esc
mandoc_escape(const char ** end,const char ** start,int * sz)95 mandoc_escape(const char **end, const char **start, int *sz)
96 {
97           const char          *local_start;
98           int                  local_sz, c, i;
99           char                 term;
100           enum mandoc_esc      gly;
101 
102           /*
103            * When the caller doesn't provide return storage,
104            * use local storage.
105            */
106 
107           if (NULL == start)
108                     start = &local_start;
109           if (NULL == sz)
110                     sz = &local_sz;
111 
112           /*
113            * Treat "\E" just like "\";
114            * it only makes a difference in copy mode.
115            */
116 
117           if (**end == 'E')
118                     ++*end;
119 
120           /*
121            * Beyond the backslash, at least one input character
122            * is part of the escape sequence.  With one exception
123            * (see below), that character won't be returned.
124            */
125 
126           gly = ESCAPE_ERROR;
127           *start = ++*end;
128           *sz = 0;
129           term = '\0';
130 
131           switch ((*start)[-1]) {
132           /*
133            * First the glyphs.  There are several different forms of
134            * these, but each eventually returns a substring of the glyph
135            * name.
136            */
137           case '(':
138                     gly = ESCAPE_SPECIAL;
139                     *sz = 2;
140                     break;
141           case '[':
142                     if (**start == ' ') {
143                               ++*end;
144                               return ESCAPE_ERROR;
145                     }
146                     gly = ESCAPE_SPECIAL;
147                     term = ']';
148                     break;
149           case 'C':
150                     if ('\'' != **start)
151                               return ESCAPE_ERROR;
152                     *start = ++*end;
153                     gly = ESCAPE_SPECIAL;
154                     term = '\'';
155                     break;
156 
157           /*
158            * Escapes taking no arguments at all.
159            */
160           case '!':
161           case '?':
162                     return ESCAPE_UNSUPP;
163           case '%':
164           case '&':
165           case ')':
166           case ',':
167           case '/':
168           case '^':
169           case 'a':
170           case 'd':
171           case 'r':
172           case 't':
173           case 'u':
174           case '{':
175           case '|':
176           case '}':
177                     return ESCAPE_IGNORE;
178           case 'c':
179                     return ESCAPE_NOSPACE;
180           case 'p':
181                     return ESCAPE_BREAK;
182 
183           /*
184            * The \z escape is supposed to output the following
185            * character without advancing the cursor position.
186            * Since we are mostly dealing with terminal mode,
187            * let us just skip the next character.
188            */
189           case 'z':
190                     return ESCAPE_SKIPCHAR;
191 
192           /*
193            * Handle all triggers matching \X(xy, \Xx, and \X[xxxx], where
194            * 'X' is the trigger.  These have opaque sub-strings.
195            */
196           case 'F':
197           case 'f':
198           case 'g':
199           case 'k':
200           case 'M':
201           case 'm':
202           case 'n':
203           case 'O':
204           case 'V':
205           case 'Y':
206           case '*':
207                     switch ((*start)[-1]) {
208                     case 'f':
209                               gly = ESCAPE_FONT;
210                               break;
211                     case '*':
212                               gly = ESCAPE_DEVICE;
213                               break;
214                     default:
215                               gly = ESCAPE_IGNORE;
216                               break;
217                     }
218                     switch (**start) {
219                     case '(':
220                               if ((*start)[-1] == 'O')
221                                         gly = ESCAPE_ERROR;
222                               *start = ++*end;
223                               *sz = 2;
224                               break;
225                     case '[':
226                               if ((*start)[-1] == 'O')
227                                         gly = (*start)[1] == '5' ?
228                                             ESCAPE_UNSUPP : ESCAPE_ERROR;
229                               *start = ++*end;
230                               term = ']';
231                               break;
232                     default:
233                               if ((*start)[-1] == 'O') {
234                                         switch (**start) {
235                                         case '0':
236                                                   gly = ESCAPE_UNSUPP;
237                                                   break;
238                                         case '1':
239                                         case '2':
240                                         case '3':
241                                         case '4':
242                                                   break;
243                                         default:
244                                                   gly = ESCAPE_ERROR;
245                                                   break;
246                                         }
247                               }
248                               *sz = 1;
249                               break;
250                     }
251                     break;
252 
253           /*
254            * These escapes are of the form \X'Y', where 'X' is the trigger
255            * and 'Y' is any string.  These have opaque sub-strings.
256            * The \B and \w escapes are handled in roff.c, roff_res().
257            */
258           case 'A':
259           case 'b':
260           case 'D':
261           case 'R':
262           case 'X':
263           case 'Z':
264                     gly = ESCAPE_IGNORE;
265                     /* FALLTHROUGH */
266           case 'o':
267                     if (**start == '\0')
268                               return ESCAPE_ERROR;
269                     if (gly == ESCAPE_ERROR)
270                               gly = ESCAPE_OVERSTRIKE;
271                     term = **start;
272                     *start = ++*end;
273                     break;
274 
275           /*
276            * These escapes are of the form \X'N', where 'X' is the trigger
277            * and 'N' resolves to a numerical expression.
278            */
279           case 'h':
280           case 'H':
281           case 'L':
282           case 'l':
283           case 'S':
284           case 'v':
285           case 'x':
286                     if (strchr(" %&()*+-./0123456789:<=>", **start)) {
287                               if ('\0' != **start)
288                                         ++*end;
289                               return ESCAPE_ERROR;
290                     }
291                     switch ((*start)[-1]) {
292                     case 'h':
293                               gly = ESCAPE_HORIZ;
294                               break;
295                     case 'l':
296                               gly = ESCAPE_HLINE;
297                               break;
298                     default:
299                               gly = ESCAPE_IGNORE;
300                               break;
301                     }
302                     term = **start;
303                     *start = ++*end;
304                     break;
305 
306           /*
307            * Special handling for the numbered character escape.
308            * XXX Do any other escapes need similar handling?
309            */
310           case 'N':
311                     if ('\0' == **start)
312                               return ESCAPE_ERROR;
313                     (*end)++;
314                     if (isdigit((unsigned char)**start)) {
315                               *sz = 1;
316                               return ESCAPE_IGNORE;
317                     }
318                     (*start)++;
319                     while (isdigit((unsigned char)**end))
320                               (*end)++;
321                     *sz = *end - *start;
322                     if ('\0' != **end)
323                               (*end)++;
324                     return ESCAPE_NUMBERED;
325 
326           /*
327            * Sizes get a special category of their own.
328            */
329           case 's':
330                     gly = ESCAPE_IGNORE;
331 
332                     /* See +/- counts as a sign. */
333                     if ('+' == **end || '-' == **end || ASCII_HYPH == **end)
334                               *start = ++*end;
335 
336                     switch (**end) {
337                     case '(':
338                               *start = ++*end;
339                               *sz = 2;
340                               break;
341                     case '[':
342                               *start = ++*end;
343                               term = ']';
344                               break;
345                     case '\'':
346                               *start = ++*end;
347                               term = '\'';
348                               break;
349                     case '3':
350                     case '2':
351                     case '1':
352                               *sz = (*end)[-1] == 's' &&
353                                   isdigit((unsigned char)(*end)[1]) ? 2 : 1;
354                               break;
355                     default:
356                               *sz = 1;
357                               break;
358                     }
359 
360                     break;
361 
362           /*
363            * Several special characters can be encoded as
364            * one-byte escape sequences without using \[].
365            */
366           case ' ':
367           case '\'':
368           case '-':
369           case '.':
370           case '0':
371           case ':':
372           case '_':
373           case '`':
374           case 'e':
375           case '~':
376                     gly = ESCAPE_SPECIAL;
377                     /* FALLTHROUGH */
378           default:
379                     if (gly == ESCAPE_ERROR)
380                               gly = ESCAPE_UNDEF;
381                     *start = --*end;
382                     *sz = 1;
383                     break;
384           }
385 
386           /*
387            * Read up to the terminating character,
388            * paying attention to nested escapes.
389            */
390 
391           if ('\0' != term) {
392                     while (**end != term) {
393                               switch (**end) {
394                               case '\0':
395                                         return ESCAPE_ERROR;
396                               case '\\':
397                                         (*end)++;
398                                         if (ESCAPE_ERROR ==
399                                             mandoc_escape(end, NULL, NULL))
400                                                   return ESCAPE_ERROR;
401                                         break;
402                               default:
403                                         (*end)++;
404                                         break;
405                               }
406                     }
407                     *sz = (*end)++ - *start;
408 
409                     /*
410                      * The file chars.c only provides one common list
411                      * of character names, but \[-] == \- is the only
412                      * one of the characters with one-byte names that
413                      * allows enclosing the name in brackets.
414                      */
415                     if (gly == ESCAPE_SPECIAL && *sz == 1 && **start != '-')
416                               return ESCAPE_ERROR;
417           } else {
418                     assert(*sz > 0);
419                     if ((size_t)*sz > strlen(*start))
420                               return ESCAPE_ERROR;
421                     *end += *sz;
422           }
423 
424           /* Run post-processors. */
425 
426           switch (gly) {
427           case ESCAPE_FONT:
428                     gly = mandoc_font(*start, *sz);
429                     break;
430           case ESCAPE_SPECIAL:
431                     if (**start == 'c') {
432                               if (*sz < 6 || *sz > 7 ||
433                                   strncmp(*start, "char", 4) != 0 ||
434                                   (int)strspn(*start + 4, "0123456789") + 4 < *sz)
435                                         break;
436                               c = 0;
437                               for (i = 4; i < *sz; i++)
438                                         c = 10 * c + ((*start)[i] - '0');
439                               if (c < 0x21 || (c > 0x7e && c < 0xa0) || c > 0xff)
440                                         break;
441                               *start += 4;
442                               *sz -= 4;
443                               gly = ESCAPE_NUMBERED;
444                               break;
445                     }
446 
447                     /*
448                      * Unicode escapes are defined in groff as \[u0000]
449                      * to \[u10FFFF], where the contained value must be
450                      * a valid Unicode codepoint.  Here, however, only
451                      * check the length and range.
452                      */
453                     if (**start != 'u' || *sz < 5 || *sz > 7)
454                               break;
455                     if (*sz == 7 && ((*start)[1] != '1' || (*start)[2] != '0'))
456                               break;
457                     if (*sz == 6 && (*start)[1] == '0')
458                               break;
459                     if (*sz == 5 && (*start)[1] == 'D' &&
460                         strchr("89ABCDEF", (*start)[2]) != NULL)
461                               break;
462                     if ((int)strspn(*start + 1, "0123456789ABCDEFabcdef")
463                         + 1 == *sz)
464                               gly = ESCAPE_UNICODE;
465                     break;
466           case ESCAPE_DEVICE:
467                     assert(*sz == 2 && (*start)[0] == '.' && (*start)[1] == 'T');
468                     break;
469           default:
470                     break;
471           }
472 
473           return gly;
474 }
475 
/*
 * Parse the string p according to the strptime(3) format fmt.
 * Returns 1 and stores the mktime(3) result in *t if the format
 * matched the complete string.  Returns 0 and leaves *t alone
 * otherwise, which is always the case when HAVE_STRPTIME is false.
 */
static int
a2time(time_t *t, const char *fmt, const char *p)
{
	struct tm	 parsed;
	char		*rest;

	memset(&parsed, 0, sizeof(parsed));

	rest = NULL;
#if HAVE_STRPTIME
	rest = strptime(p, fmt, &parsed);
#endif

	/* Reject parse failures and trailing garbage alike. */

	if (rest == NULL || *rest != '\0')
		return 0;

	*t = mktime(&parsed);
	return 1;
}
495 
/*
 * Format the time t, converted with localtime(3), in the style
 * "Month D, YYYY", without padding the day of the month.
 * Returns a string obtained from mandoc_malloc(); if any step of
 * the conversion fails, an empty string from mandoc_strdup() is
 * returned instead.
 */
static char *
time2a(time_t t)
{
	struct tm	*lt;
	char		*out, *cur;
	size_t		 mlen;
	int		 dlen;

	out = NULL;
	if ((lt = localtime(&t)) == NULL)
		goto fail;

	/*
	 * Buffer layout:
	 * 10 bytes: month name of up to 9 characters (September), blank
	 *  4 bytes: day of up to 2 characters, comma, blank
	 *  4 bytes: year
	 *  1 byte:  terminating '\0'
	 */

	out = mandoc_malloc(10 + 4 + 4 + 1);
	cur = out;

	mlen = strftime(cur, 10 + 1, "%B ", lt);
	if (mlen == 0)
		goto fail;
	cur += mlen;

	/*
	 * The day is printed with plain "%d", not "%2d" or "%02d",
	 * which is why the whole date cannot be formatted with
	 * a single "%B %e, %Y" or "%B %d, %Y".  Doing it piecewise
	 * also bounds each part separately, which limits the damage
	 * should anybody ever introduce the bug of looking at LC_TIME.
	 */

	dlen = snprintf(cur, 4 + 1, "%d, ", lt->tm_mday);
	if (dlen < 0 || dlen > 4)
		goto fail;
	cur += dlen;

	if (strftime(cur, 4 + 1, "%Y", lt) != 0)
		return out;

fail:
	free(out);
	return mandoc_strdup("");
}
544 
545 char *
mandoc_normdate(struct roff_node * nch,struct roff_node * nbl)546 mandoc_normdate(struct roff_node *nch, struct roff_node *nbl)
547 {
548           char                *cp;
549           time_t               t;
550 
551           /* No date specified. */
552 
553           if (nch == NULL) {
554                     if (nbl == NULL)
555                               mandoc_msg(MANDOCERR_DATE_MISSING, 0, 0, NULL);
556                     else
557                               mandoc_msg(MANDOCERR_DATE_MISSING, nbl->line,
558                                   nbl->pos, "%s", roff_name[nbl->tok]);
559                     return mandoc_strdup("");
560           }
561           if (*nch->string == '\0') {
562                     mandoc_msg(MANDOCERR_DATE_MISSING, nch->line,
563                         nch->pos, "%s", roff_name[nbl->tok]);
564                     return mandoc_strdup("");
565           }
566           if (strcmp(nch->string, "$" "Mdocdate$") == 0)
567                     return time2a(time(NULL));
568 
569           /* Valid mdoc(7) date format. */
570 
571           if (a2time(&t, "$" "Mdocdate: %b %d %Y $", nch->string) ||
572               a2time(&t, "%b %d, %Y", nch->string)) {
573                     cp = time2a(t);
574                     if (t > time(NULL) + 86400)
575                               mandoc_msg(MANDOCERR_DATE_FUTURE, nch->line,
576                                   nch->pos, "%s %s", roff_name[nbl->tok], cp);
577                     else if (*nch->string != '$' &&
578                         strcmp(nch->string, cp) != 0)
579                               mandoc_msg(MANDOCERR_DATE_NORM, nch->line,
580                                   nch->pos, "%s %s", roff_name[nbl->tok], cp);
581                     return cp;
582           }
583 
584           /* In man(7), do not warn about the legacy format. */
585 
586           if (a2time(&t, "%Y-%m-%d", nch->string) == 0)
587                     mandoc_msg(MANDOCERR_DATE_BAD, nch->line, nch->pos,
588                         "%s %s", roff_name[nbl->tok], nch->string);
589           else if (t > time(NULL) + 86400)
590                     mandoc_msg(MANDOCERR_DATE_FUTURE, nch->line, nch->pos,
591                         "%s %s", roff_name[nbl->tok], nch->string);
592           else if (nbl->tok == MDOC_Dd)
593                     mandoc_msg(MANDOCERR_DATE_LEGACY, nch->line, nch->pos,
594                         "Dd %s", nch->string);
595 
596           /* Use any non-mdoc(7) date verbatim. */
597 
598           return mandoc_strdup(nch->string);
599 }
600 
/*
 * Decide whether the sz bytes starting at p end a sentence.
 *
 * The buffer is scanned backwards from its last byte.  Trailing
 * closing delimiters (double quote, single quote, ']' and ')') are
 * skipped, so that end-of-sentence punctuation ('.', '!' or '?')
 * can propagate outward through them, as in "(like this.)".
 *
 * Returns 1 if the buffer ends a sentence and 0 otherwise:
 *  - If the first other byte found is reached before any
 *    punctuation was seen, this is not the end of a sentence.
 *  - If closing delimiters follow the punctuation, the byte
 *    preceding the punctuation must be alphanumeric.
 *  - A buffer consisting of punctuation and delimiters only ends
 *    a sentence if no delimiter follows the last punctuation.
 *  - An empty buffer never ends a sentence.
 */
int
mandoc_eos(const char *p, size_t sz)
{
	size_t		 i;
	int		 enclosed, found;

	enclosed = found = 0;

	/*
	 * Iterate with an index counting down to zero instead of
	 * a pointer compared against p: decrementing a pointer to
	 * one before the start of the buffer is undefined behaviour,
	 * and the length need not be squeezed into an int.
	 */

	for (i = sz; i > 0; i--) {
		switch (p[i - 1]) {
		case '\"':
		case '\'':
		case ']':
		case ')':
			/* Only delimiters after the punctuation count. */
			if (found == 0)
				enclosed = 1;
			break;
		case '.':
		case '!':
		case '?':
			found = 1;
			break;
		default:
			return found && (!enclosed ||
			    isalnum((unsigned char)p[i - 1]));
		}
	}

	/* Nothing but punctuation and delimiters, or empty input. */

	return found && !enclosed;
}
639 
/*
 * Convert the first sz bytes of p to an int with strtol(3),
 * using the given base.
 * Returns -1 if sz exceeds 31 bytes, if the input is empty, or if
 * it is not completely consumed by the conversion.  Otherwise,
 * returns the converted value, clamped to the range INT_MIN to
 * INT_MAX.  Note that a valid negative number is returned as such,
 * so -1 is ambiguous unless the caller knows the input is unsigned.
 */
int
mandoc_strntoi(const char *p, size_t sz, int base)
{
	char		 copy[32];
	char		*stop;
	long		 val;

	/* The input, plus a terminating NUL, must fit into the copy. */

	if (sz >= sizeof(copy))
		return -1;

	/* strtol(3) needs a NUL-terminated string. */

	memcpy(copy, p, sz);
	copy[sz] = '\0';

	errno = 0;
	val = strtol(copy, &stop, base);

	if (copy[0] == '\0' || *stop != '\0')
		return -1;

	if (val > INT_MAX)
		return INT_MAX;
	if (val < INT_MIN)
		return INT_MIN;
	return (int)val;
}
670