Annotation of mandoc/mandoc.c, Revision 1.113
1.113 ! schwarze 1: /* $Id: mandoc.c,v 1.112 2018/12/16 00:17:02 schwarze Exp $ */
1.1 kristaps 2: /*
1.90 schwarze 3: * Copyright (c) 2008-2011, 2014 Kristaps Dzonsons <kristaps@bsd.lv>
1.104 schwarze 4: * Copyright (c) 2011-2015, 2017, 2018 Ingo Schwarze <schwarze@openbsd.org>
1.1 kristaps 5: *
6: * Permission to use, copy, modify, and distribute this software for any
7: * purpose with or without fee is hereby granted, provided that the above
8: * copyright notice and this permission notice appear in all copies.
9: *
1.36 schwarze 10: * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
1.1 kristaps 11: * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
1.36 schwarze 12: * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
1.1 kristaps 13: * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14: * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15: * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16: * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17: */
1.9 kristaps 18: #include "config.h"
1.7 kristaps 19:
1.2 kristaps 20: #include <sys/types.h>
21:
1.1 kristaps 22: #include <assert.h>
23: #include <ctype.h>
1.50 kristaps 24: #include <errno.h>
25: #include <limits.h>
1.1 kristaps 26: #include <stdlib.h>
1.4 kristaps 27: #include <stdio.h>
28: #include <string.h>
1.7 kristaps 29: #include <time.h>
1.1 kristaps 30:
1.101 schwarze 31: #include "mandoc_aux.h"
1.18 kristaps 32: #include "mandoc.h"
1.101 schwarze 33: #include "roff.h"
1.1 kristaps 34: #include "libmandoc.h"
1.37 schwarze 35:
/* Prototypes of the file-local date parsing and formatting helpers. */
1.18 kristaps 36: static int a2time(time_t *, const char *, const char *);
1.37 schwarze 37: static char *time2a(time_t);
1.7 kristaps 38:
1.45 kristaps 39:
40: enum mandoc_esc
1.112 schwarze 41: mandoc_font(const char *cp, int sz)
42: {
43: switch (sz) {
44: case 0:
45: return ESCAPE_FONTPREV;
46: case 1:
47: switch (cp[0]) {
48: case 'B':
49: case '3':
50: return ESCAPE_FONTBOLD;
51: case 'I':
52: case '2':
53: return ESCAPE_FONTITALIC;
54: case 'P':
55: return ESCAPE_FONTPREV;
56: case 'R':
57: case '1':
58: return ESCAPE_FONTROMAN;
59: case '4':
60: return ESCAPE_FONTBI;
61: default:
62: return ESCAPE_ERROR;
63: }
64: case 2:
65: switch (cp[0]) {
66: case 'B':
67: switch (cp[1]) {
68: case 'I':
69: return ESCAPE_FONTBI;
70: default:
71: return ESCAPE_ERROR;
72: }
73: case 'C':
74: switch (cp[1]) {
75: case 'B':
76: return ESCAPE_FONTBOLD;
77: case 'I':
78: return ESCAPE_FONTITALIC;
79: case 'R':
80: case 'W':
81: return ESCAPE_FONTCW;
82: default:
83: return ESCAPE_ERROR;
84: }
85: default:
86: return ESCAPE_ERROR;
87: }
88: default:
89: return ESCAPE_ERROR;
90: }
91: }
92:
/*
 * Parse a single roff escape sequence and classify it.
 *
 * On entry, *end points to the character following the backslash;
 * an 'E' found there is skipped first.  While parsing, *end is
 * advanced over the input consumed, *start is set to the beginning
 * of the argument substring, and *sz to its length in bytes.
 * Either of start and sz may be NULL, in which case local storage
 * is used and the respective result is discarded.
 *
 * The return value is the class of the escape sequence.
 * ESCAPE_ERROR is returned for malformed or truncated input.
 * Font escapes are passed through mandoc_font(); special character
 * names of the form charNNN and uXXXX that pass the range checks
 * at the end are reclassified as ESCAPE_NUMBERED and ESCAPE_UNICODE.
 * On early returns, *start and *sz hold whatever had been stored
 * up to that point.
 *
 * Nested escapes inside delimited arguments are handled by
 * calling this function recursively.
 */
93: enum mandoc_esc
1.74 schwarze 94: mandoc_escape(const char **end, const char **start, int *sz)
1.1 kristaps 95: {
1.65 schwarze 96: const char *local_start;
1.105 schwarze 97: int local_sz, c, i;
1.65 schwarze 98: char term;
1.79 schwarze 99: enum mandoc_esc gly;
1.45 kristaps 100:
1.65 schwarze 101: /*
102: * When the caller doesn't provide return storage,
103: * use local storage.
104: */
105:
106: if (NULL == start)
107: start = &local_start;
108: if (NULL == sz)
109: sz = &local_sz;
110:
111: /*
1.111 schwarze 112: * Treat "\E" just like "\";
113: * it only makes a difference in copy mode.
114: */
115:
116: if (**end == 'E')
117: ++*end;
118:
119: /*
1.65 schwarze 120: * Beyond the backslash, at least one input character
121: * is part of the escape sequence. With one exception
122: * (see below), that character won't be returned.
123: */
124:
1.45 kristaps 125: gly = ESCAPE_ERROR;
1.65 schwarze 126: *start = ++*end;
127: *sz = 0;
1.64 schwarze 128: term = '\0';
1.18 kristaps 129:
1.65 schwarze 130: switch ((*start)[-1]) {
1.45 kristaps 131: /*
132: * First the glyphs. There are several different forms of
133: * these, but each eventually returns a substring of the glyph
134: * name.
135: */
1.79 schwarze 136: case '(':
1.45 kristaps 137: gly = ESCAPE_SPECIAL;
1.65 schwarze 138: *sz = 2;
1.45 kristaps 139: break;
1.79 schwarze 140: case '[':
1.111 schwarze 141: if (**start == ' ') {
142: ++*end;
143: return ESCAPE_ERROR;
144: }
1.45 kristaps 145: gly = ESCAPE_SPECIAL;
146: term = ']';
147: break;
1.79 schwarze 148: case 'C':
1.65 schwarze 149: if ('\'' != **start)
1.94 schwarze 150: return ESCAPE_ERROR;
1.65 schwarze 151: *start = ++*end;
1.87 schwarze 152: gly = ESCAPE_SPECIAL;
1.45 kristaps 153: term = '\'';
154: break;
1.72 schwarze 155:
156: /*
157: * Escapes taking no arguments at all.
158: */
1.111 schwarze 159: case '!':
160: case '?':
161: return ESCAPE_UNSUPP;
162: case '%':
163: case '&':
164: case ')':
165: case ',':
166: case '/':
167: case '^':
168: case 'a':
1.79 schwarze 169: case 'd':
1.111 schwarze 170: case 'r':
171: case 't':
1.79 schwarze 172: case 'u':
1.111 schwarze 173: case '{':
174: case '|':
175: case '}':
1.94 schwarze 176: return ESCAPE_IGNORE;
1.111 schwarze 177: case 'c':
178: return ESCAPE_NOSPACE;
1.102 schwarze 179: case 'p':
180: return ESCAPE_BREAK;
1.63 schwarze 181:
182: /*
183: * The \z escape is supposed to output the following
1.79 schwarze 184: * character without advancing the cursor position.
1.63 schwarze 185: * Since we are mostly dealing with terminal mode,
186: * let us just skip the next character.
187: */
1.79 schwarze 188: case 'z':
1.94 schwarze 189: return ESCAPE_SKIPCHAR;
1.1 kristaps 190:
1.45 kristaps 191: /*
192: * Handle all triggers matching \X(xy, \Xx, and \X[xxxx], where
193: * 'X' is the trigger. These have opaque sub-strings.
194: */
1.79 schwarze 195: case 'F':
1.111 schwarze 196: case 'f':
1.79 schwarze 197: case 'g':
198: case 'k':
199: case 'M':
200: case 'm':
201: case 'n':
1.111 schwarze 202: case 'O':
1.79 schwarze 203: case 'V':
204: case 'Y':
1.111 schwarze 205: gly = (*start)[-1] == 'f' ? ESCAPE_FONT : ESCAPE_IGNORE;
1.65 schwarze 206: switch (**start) {
1.79 schwarze 207: case '(':
1.111 schwarze 208: if ((*start)[-1] == 'O')
209: gly = ESCAPE_ERROR;
1.65 schwarze 210: *start = ++*end;
211: *sz = 2;
1.45 kristaps 212: break;
1.79 schwarze 213: case '[':
1.111 schwarze 214: if ((*start)[-1] == 'O')
215: gly = (*start)[1] == '5' ?
216: ESCAPE_UNSUPP : ESCAPE_ERROR;
1.65 schwarze 217: *start = ++*end;
1.45 kristaps 218: term = ']';
219: break;
220: default:
1.111 schwarze 221: if ((*start)[-1] == 'O') {
222: switch (**start) {
223: case '0':
224: gly = ESCAPE_UNSUPP;
225: break;
226: case '1':
227: case '2':
228: case '3':
229: case '4':
230: break;
231: default:
232: gly = ESCAPE_ERROR;
233: break;
234: }
235: }
1.65 schwarze 236: *sz = 1;
1.45 kristaps 237: break;
238: }
1.106 schwarze 239: break;
240: case '*':
/*
 * Only the form \*(.T is accepted here; any other input after \*
 * terminates the program via abort().
 * NOTE(review): presumably other \* sequences are expanded before
 * this function is reached - confirm against the callers.
 */
241: if (strncmp(*start, "(.T", 3) != 0)
242: abort();
243: gly = ESCAPE_DEVICE;
244: *start = ++*end;
245: *sz = 2;
1.45 kristaps 246: break;
247:
248: /*
249: * These escapes are of the form \X'Y', where 'X' is the trigger
250: * and 'Y' is any string. These have opaque sub-strings.
1.78 schwarze 251: * The \B and \w escapes are handled in roff.c, roff_res().
1.45 kristaps 252: */
1.79 schwarze 253: case 'A':
254: case 'b':
255: case 'D':
256: case 'R':
257: case 'X':
258: case 'Z':
1.91 schwarze 259: gly = ESCAPE_IGNORE;
260: /* FALLTHROUGH */
261: case 'o':
262: if (**start == '\0')
1.94 schwarze 263: return ESCAPE_ERROR;
1.91 schwarze 264: if (gly == ESCAPE_ERROR)
265: gly = ESCAPE_OVERSTRIKE;
1.77 schwarze 266: term = **start;
1.65 schwarze 267: *start = ++*end;
1.24 kristaps 268: break;
1.45 kristaps 269:
270: /*
271: * These escapes are of the form \X'N', where 'X' is the trigger
272: * and 'N' resolves to a numerical expression.
273: */
1.79 schwarze 274: case 'h':
275: case 'H':
276: case 'L':
277: case 'l':
278: case 'S':
279: case 'v':
280: case 'x':
1.82 schwarze 281: if (strchr(" %&()*+-./0123456789:<=>", **start)) {
1.86 kristaps 282: if ('\0' != **start)
283: ++*end;
1.94 schwarze 284: return ESCAPE_ERROR;
1.82 schwarze 285: }
1.100 schwarze 286: switch ((*start)[-1]) {
287: case 'h':
288: gly = ESCAPE_HORIZ;
289: break;
290: case 'l':
291: gly = ESCAPE_HLINE;
292: break;
293: default:
294: gly = ESCAPE_IGNORE;
295: break;
296: }
1.77 schwarze 297: term = **start;
1.65 schwarze 298: *start = ++*end;
1.45 kristaps 299: break;
1.60 schwarze 300:
301: /*
302: * Special handling for the numbered character escape.
303: * XXX Do any other escapes need similar handling?
304: */
1.79 schwarze 305: case 'N':
1.65 schwarze 306: if ('\0' == **start)
1.94 schwarze 307: return ESCAPE_ERROR;
1.65 schwarze 308: (*end)++;
309: if (isdigit((unsigned char)**start)) {
310: *sz = 1;
1.94 schwarze 311: return ESCAPE_IGNORE;
1.65 schwarze 312: }
313: (*start)++;
1.60 schwarze 314: while (isdigit((unsigned char)**end))
315: (*end)++;
1.65 schwarze 316: *sz = *end - *start;
1.60 schwarze 317: if ('\0' != **end)
318: (*end)++;
1.94 schwarze 319: return ESCAPE_NUMBERED;
1.45 kristaps 320:
1.79 schwarze 321: /*
1.45 kristaps 322: * Sizes get a special category of their own.
323: */
1.79 schwarze 324: case 's':
1.45 kristaps 325: gly = ESCAPE_IGNORE;
1.28 kristaps 326:
1.45 kristaps 327: /* See +/- counts as a sign. */
1.65 schwarze 328: if ('+' == **end || '-' == **end || ASCII_HYPH == **end)
1.90 schwarze 329: *start = ++*end;
1.8 kristaps 330:
1.65 schwarze 331: switch (**end) {
1.79 schwarze 332: case '(':
1.65 schwarze 333: *start = ++*end;
334: *sz = 2;
1.22 kristaps 335: break;
1.79 schwarze 336: case '[':
1.65 schwarze 337: *start = ++*end;
1.64 schwarze 338: term = ']';
1.22 kristaps 339: break;
1.79 schwarze 340: case '\'':
1.65 schwarze 341: *start = ++*end;
1.64 schwarze 342: term = '\'';
1.92 schwarze 343: break;
344: case '3':
345: case '2':
346: case '1':
347: *sz = (*end)[-1] == 's' &&
348: isdigit((unsigned char)(*end)[1]) ? 2 : 1;
1.22 kristaps 349: break;
350: default:
1.65 schwarze 351: *sz = 1;
1.22 kristaps 352: break;
1.8 kristaps 353: }
354:
1.45 kristaps 355: break;
1.33 kristaps 356:
1.45 kristaps 357: /*
1.111 schwarze 358: * Several special characters can be encoded as
359: * one-byte escape sequences without using \[].
1.45 kristaps 360: */
1.111 schwarze 361: case ' ':
362: case '\'':
363: case '-':
364: case '.':
365: case '0':
366: case ':':
367: case '_':
368: case '`':
369: case 'e':
370: case '~':
371: gly = ESCAPE_SPECIAL;
372: /* FALLTHROUGH */
1.45 kristaps 373: default:
1.111 schwarze 374: if (gly == ESCAPE_ERROR)
375: gly = ESCAPE_UNDEF;
/*
 * Step back so that the escape character itself becomes the
 * one-byte substring reported through *start and *sz.
 */
1.65 schwarze 376: *start = --*end;
377: *sz = 1;
1.22 kristaps 378: break;
1.45 kristaps 379: }
380:
381: /*
1.64 schwarze 382: * Read up to the terminating character,
383: * paying attention to nested escapes.
1.45 kristaps 384: */
385:
386: if ('\0' != term) {
1.64 schwarze 387: while (**end != term) {
388: switch (**end) {
1.79 schwarze 389: case '\0':
1.94 schwarze 390: return ESCAPE_ERROR;
1.79 schwarze 391: case '\\':
1.64 schwarze 392: (*end)++;
393: if (ESCAPE_ERROR ==
394: mandoc_escape(end, NULL, NULL))
1.94 schwarze 395: return ESCAPE_ERROR;
1.64 schwarze 396: break;
397: default:
398: (*end)++;
399: break;
400: }
401: }
1.65 schwarze 402: *sz = (*end)++ - *start;
1.111 schwarze 403:
404: /*
405: * The file chars.c only provides one common list
406: * of character names, but \[-] == \- is the only
407: * one of the characters with one-byte names that
408: * allows enclosing the name in brackets.
409: */
410: if (gly == ESCAPE_SPECIAL && *sz == 1 && **start != '-')
411: return ESCAPE_ERROR;
1.64 schwarze 412: } else {
/*
 * Fixed-length argument: every case above that gets here without
 * a terminator has set *sz to 1 or 2.  Fail if the input ends
 * before that many bytes are available.
 */
1.65 schwarze 413: assert(*sz > 0);
414: if ((size_t)*sz > strlen(*start))
1.94 schwarze 415: return ESCAPE_ERROR;
1.65 schwarze 416: *end += *sz;
1.45 kristaps 417: }
418:
419: /* Run post-processors. */
420:
421: switch (gly) {
1.79 schwarze 422: case ESCAPE_FONT:
1.112 schwarze 423: gly = mandoc_font(*start, *sz);
1.46 kristaps 424: break;
1.79 schwarze 425: case ESCAPE_SPECIAL:
1.105 schwarze 426: if (**start == 'c') {
427: if (*sz < 6 || *sz > 7 ||
428: strncmp(*start, "char", 4) != 0 ||
429: (int)strspn(*start + 4, "0123456789") + 4 < *sz)
430: break;
431: c = 0;
432: for (i = 4; i < *sz; i++)
433: c = 10 * c + ((*start)[i] - '0');
434: if (c < 0x21 || (c > 0x7e && c < 0xa0) || c > 0xff)
435: break;
436: *start += 4;
437: *sz -= 4;
438: gly = ESCAPE_NUMBERED;
439: break;
440: }
441:
1.87 schwarze 442: /*
1.88 schwarze 443: * Unicode escapes are defined in groff as \[u0000]
1.87 schwarze 444: * to \[u10FFFF], where the contained value must be
445: * a valid Unicode codepoint. Here, however, only
1.88 schwarze 446: * check the length and range.
1.87 schwarze 447: */
1.88 schwarze 448: if (**start != 'u' || *sz < 5 || *sz > 7)
449: break;
450: if (*sz == 7 && ((*start)[1] != '1' || (*start)[2] != '0'))
451: break;
452: if (*sz == 6 && (*start)[1] == '0')
1.96 schwarze 453: break;
454: if (*sz == 5 && (*start)[1] == 'D' &&
455: strchr("89ABCDEF", (*start)[2]) != NULL)
1.88 schwarze 456: break;
457: if ((int)strspn(*start + 1, "0123456789ABCDEFabcdef")
1.87 schwarze 458: + 1 == *sz)
459: gly = ESCAPE_UNICODE;
1.22 kristaps 460: break;
1.1 kristaps 461: default:
1.22 kristaps 462: break;
1.1 kristaps 463: }
464:
1.94 schwarze 465: return gly;
1.4 kristaps 466: }
1.7 kristaps 467:
/*
 * Parse the date string p according to the strptime(3) format fmt.
 * If the whole string matches, store the result of mktime(3) in *t
 * and return 1.  Otherwise, leave *t untouched and return 0.
 * Without HAVE_STRPTIME, no parsing is attempted and the result
 * is always 0.
 */
static int
a2time(time_t *t, const char *fmt, const char *p)
{
	struct tm	 parsed;
	const char	*rest;

	memset(&parsed, 0, sizeof(parsed));

	rest = NULL;
#if HAVE_STRPTIME
	rest = strptime(p, fmt, &parsed);
#endif

	/* Reject both parse failures and trailing garbage. */

	if (rest == NULL || *rest != '\0')
		return 0;

	*t = mktime(&parsed);
	return 1;
}
487:
/*
 * Format the time t as a date of the form "Month day, year" in the
 * local time zone, for example "December 16, 2018".
 * Returns a string allocated with mandoc_malloc() that the caller
 * must free, or NULL if the time cannot be converted or formatted.
 */
static char *
time2a(time_t t)
{
	struct tm	*tm;
	char		*buf, *p;
	size_t		 ssz;
	int		 isz;

	tm = localtime(&t);
	if (tm == NULL)
		return NULL;

	/*
	 * Reserve space:
	 * up to 9 characters for the month (September) + blank
	 * up to 2 characters for the day + comma + blank
	 * 4 characters for the year and a terminating '\0'
	 */

	p = buf = mandoc_malloc(10 + 4 + 4 + 1);

	if ((ssz = strftime(p, 10 + 1, "%B ", tm)) == 0)
		goto fail;
	p += ssz;

	/*
	 * The output format is just "%d" here, not "%2d" or "%02d".
	 * That's also the reason why we can't just format the
	 * date as a whole with "%B %e, %Y" or "%B %d, %Y".
	 * Besides, the present approach is less prone to buffer
	 * overflows, in case anybody should ever introduce the bug
	 * of looking at LC_TIME.
	 *
	 * snprintf(3) returns the length the output would have had
	 * without truncation, so a result above 4 means the day did
	 * not fit.  Bail out in that case rather than advancing p
	 * past the bytes actually written, which would let the
	 * strftime(3) call below write beyond the end of buf.
	 */

	isz = snprintf(p, 4 + 1, "%d, ", tm->tm_mday);
	if (isz < 0 || isz > 4)
		goto fail;
	p += isz;

	if (strftime(p, 4 + 1, "%Y", tm) == 0)
		goto fail;
	return buf;

fail:
	free(buf);
	return NULL;
}
534:
535: char *
1.101 schwarze 536: mandoc_normdate(struct roff_man *man, char *in, int ln, int pos)
1.7 kristaps 537: {
1.103 schwarze 538: char *cp;
1.7 kristaps 539: time_t t;
540:
1.98 schwarze 541: /* No date specified: use today's date. */
542:
543: if (in == NULL || *in == '\0' || strcmp(in, "$" "Mdocdate$") == 0) {
1.109 schwarze 544: mandoc_msg(MANDOCERR_DATE_MISSING, ln, pos, NULL);
1.98 schwarze 545: return time2a(time(NULL));
1.37 schwarze 546: }
1.98 schwarze 547:
548: /* Valid mdoc(7) date format. */
549:
550: if (a2time(&t, "$" "Mdocdate: %b %d %Y $", in) ||
1.103 schwarze 551: a2time(&t, "%b %d, %Y", in)) {
552: cp = time2a(t);
553: if (t > time(NULL) + 86400)
1.109 schwarze 554: mandoc_msg(MANDOCERR_DATE_FUTURE, ln, pos, "%s", cp);
1.104 schwarze 555: else if (*in != '$' && strcmp(in, cp) != 0)
1.109 schwarze 556: mandoc_msg(MANDOCERR_DATE_NORM, ln, pos, "%s", cp);
1.103 schwarze 557: return cp;
558: }
1.98 schwarze 559:
1.101 schwarze 560: /* In man(7), do not warn about the legacy format. */
1.98 schwarze 561:
1.101 schwarze 562: if (a2time(&t, "%Y-%m-%d", in) == 0)
1.109 schwarze 563: mandoc_msg(MANDOCERR_DATE_BAD, ln, pos, "%s", in);
1.103 schwarze 564: else if (t > time(NULL) + 86400)
1.109 schwarze 565: mandoc_msg(MANDOCERR_DATE_FUTURE, ln, pos, "%s", in);
1.101 schwarze 566: else if (man->macroset == MACROSET_MDOC)
1.109 schwarze 567: mandoc_msg(MANDOCERR_DATE_LEGACY, ln, pos, "Dd %s", in);
1.98 schwarze 568:
569: /* Use any non-mdoc(7) date verbatim. */
570:
571: return mandoc_strdup(in);
1.7 kristaps 572: }
573:
1.12 kristaps 574: int
1.75 schwarze 575: mandoc_eos(const char *p, size_t sz)
1.12 kristaps 576: {
1.75 schwarze 577: const char *q;
578: int enclosed, found;
1.12 kristaps 579:
1.13 kristaps 580: if (0 == sz)
1.94 schwarze 581: return 0;
1.12 kristaps 582:
1.14 kristaps 583: /*
584: * End-of-sentence recognition must include situations where
585: * some symbols, such as `)', allow prior EOS punctuation to
1.49 kristaps 586: * propagate outward.
1.14 kristaps 587: */
588:
1.75 schwarze 589: enclosed = found = 0;
1.25 kristaps 590: for (q = p + (int)sz - 1; q >= p; q--) {
1.23 schwarze 591: switch (*q) {
1.79 schwarze 592: case '\"':
593: case '\'':
594: case ']':
595: case ')':
1.23 schwarze 596: if (0 == found)
597: enclosed = 1;
1.14 kristaps 598: break;
1.79 schwarze 599: case '.':
600: case '!':
601: case '?':
1.23 schwarze 602: found = 1;
603: break;
1.14 kristaps 604: default:
1.94 schwarze 605: return found &&
606: (!enclosed || isalnum((unsigned char)*q));
1.14 kristaps 607: }
1.12 kristaps 608: }
609:
1.94 schwarze 610: return found && !enclosed;
1.44 kristaps 611: }
1.50 kristaps 612:
/*
 * Convert the first sz bytes of p to an int using strtol(3)
 * with the given base.
 * Returns -1 if sz is zero or larger than 31, if the bytes start
 * with a NUL, or if they do not form a complete number.
 * Values outside the range of int are clamped to INT_MIN or INT_MAX.
 * Negative numbers are NOT rejected: they are returned as they are,
 * so a result of -1 is ambiguous between "invalid" and the value -1.
 */
int
mandoc_strntoi(const char *p, size_t sz, int base)
{
	char	 buf[32];
	char	*ep;
	long	 v;

	/*
	 * Reject empty input up front, such that memcpy(3) is never
	 * called with a possibly NULL source, and input that does
	 * not fit into the buffer together with the terminating NUL.
	 */

	if (sz == 0 || sz >= sizeof(buf))
		return -1;

	memcpy(buf, p, sz);
	buf[sz] = '\0';

	/*
	 * errno is reset, but not inspected afterwards:
	 * on overflow, strtol(3) saturates to LONG_MIN or LONG_MAX,
	 * which the clamping below maps to INT_MIN or INT_MAX.
	 */

	errno = 0;
	v = strtol(buf, &ep, base);

	if (buf[0] == '\0' || *ep != '\0')
		return -1;

	if (v > INT_MAX)
		v = INT_MAX;
	if (v < INT_MIN)
		v = INT_MIN;

	return (int)v;
}
CVSweb