Annotation of mandoc/mandoc.c, Revision 1.114
1.114 ! schwarze 1: /* $Id: mandoc.c,v 1.113 2018/12/18 22:00:02 schwarze Exp $ */
1.1 kristaps 2: /*
1.90 schwarze 3: * Copyright (c) 2008-2011, 2014 Kristaps Dzonsons <kristaps@bsd.lv>
1.104 schwarze 4: * Copyright (c) 2011-2015, 2017, 2018 Ingo Schwarze <schwarze@openbsd.org>
1.1 kristaps 5: *
6: * Permission to use, copy, modify, and distribute this software for any
7: * purpose with or without fee is hereby granted, provided that the above
8: * copyright notice and this permission notice appear in all copies.
9: *
1.36 schwarze 10: * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
1.1 kristaps 11: * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
1.36 schwarze 12: * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
1.1 kristaps 13: * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14: * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15: * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16: * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17: */
1.9 kristaps 18: #include "config.h"
1.7 kristaps 19:
1.2 kristaps 20: #include <sys/types.h>
21:
1.1 kristaps 22: #include <assert.h>
23: #include <ctype.h>
1.50 kristaps 24: #include <errno.h>
25: #include <limits.h>
1.1 kristaps 26: #include <stdlib.h>
1.4 kristaps 27: #include <stdio.h>
28: #include <string.h>
1.7 kristaps 29: #include <time.h>
1.1 kristaps 30:
1.101 schwarze 31: #include "mandoc_aux.h"
1.18 kristaps 32: #include "mandoc.h"
1.101 schwarze 33: #include "roff.h"
1.1 kristaps 34: #include "libmandoc.h"
1.114 ! schwarze 35: #include "roff_int.h"
1.37 schwarze 36:
1.18 kristaps 37: static int a2time(time_t *, const char *, const char *);
1.37 schwarze 38: static char *time2a(time_t);
1.7 kristaps 39:
1.45 kristaps 40:
41: enum mandoc_esc
1.112 schwarze 42: mandoc_font(const char *cp, int sz)
43: {
44: switch (sz) {
45: case 0:
46: return ESCAPE_FONTPREV;
47: case 1:
48: switch (cp[0]) {
49: case 'B':
50: case '3':
51: return ESCAPE_FONTBOLD;
52: case 'I':
53: case '2':
54: return ESCAPE_FONTITALIC;
55: case 'P':
56: return ESCAPE_FONTPREV;
57: case 'R':
58: case '1':
59: return ESCAPE_FONTROMAN;
60: case '4':
61: return ESCAPE_FONTBI;
62: default:
63: return ESCAPE_ERROR;
64: }
65: case 2:
66: switch (cp[0]) {
67: case 'B':
68: switch (cp[1]) {
69: case 'I':
70: return ESCAPE_FONTBI;
71: default:
72: return ESCAPE_ERROR;
73: }
74: case 'C':
75: switch (cp[1]) {
76: case 'B':
77: return ESCAPE_FONTBOLD;
78: case 'I':
79: return ESCAPE_FONTITALIC;
80: case 'R':
81: case 'W':
82: return ESCAPE_FONTCW;
83: default:
84: return ESCAPE_ERROR;
85: }
86: default:
87: return ESCAPE_ERROR;
88: }
89: default:
90: return ESCAPE_ERROR;
91: }
92: }
93:
94: enum mandoc_esc
1.74 schwarze 95: mandoc_escape(const char **end, const char **start, int *sz)
1.1 kristaps 96: {
1.65 schwarze 97: const char *local_start;
1.105 schwarze 98: int local_sz, c, i;
1.65 schwarze 99: char term;
1.79 schwarze 100: enum mandoc_esc gly;
1.45 kristaps 101:
1.65 schwarze 102: /*
103: * When the caller doesn't provide return storage,
104: * use local storage.
105: */
106:
107: if (NULL == start)
108: start = &local_start;
109: if (NULL == sz)
110: sz = &local_sz;
111:
112: /*
1.111 schwarze 113: * Treat "\E" just like "\";
114: * it only makes a difference in copy mode.
115: */
116:
117: if (**end == 'E')
118: ++*end;
119:
120: /*
1.65 schwarze 121: * Beyond the backslash, at least one input character
122: * is part of the escape sequence. With one exception
123: * (see below), that character won't be returned.
124: */
125:
1.45 kristaps 126: gly = ESCAPE_ERROR;
1.65 schwarze 127: *start = ++*end;
128: *sz = 0;
1.64 schwarze 129: term = '\0';
1.18 kristaps 130:
1.65 schwarze 131: switch ((*start)[-1]) {
1.45 kristaps 132: /*
133: * First the glyphs. There are several different forms of
134: * these, but each eventually returns a substring of the glyph
135: * name.
136: */
1.79 schwarze 137: case '(':
1.45 kristaps 138: gly = ESCAPE_SPECIAL;
1.65 schwarze 139: *sz = 2;
1.45 kristaps 140: break;
1.79 schwarze 141: case '[':
1.111 schwarze 142: if (**start == ' ') {
143: ++*end;
144: return ESCAPE_ERROR;
145: }
1.45 kristaps 146: gly = ESCAPE_SPECIAL;
147: term = ']';
148: break;
1.79 schwarze 149: case 'C':
1.65 schwarze 150: if ('\'' != **start)
1.94 schwarze 151: return ESCAPE_ERROR;
1.65 schwarze 152: *start = ++*end;
1.87 schwarze 153: gly = ESCAPE_SPECIAL;
1.45 kristaps 154: term = '\'';
155: break;
1.72 schwarze 156:
157: /*
158: * Escapes taking no arguments at all.
159: */
1.111 schwarze 160: case '!':
161: case '?':
162: return ESCAPE_UNSUPP;
163: case '%':
164: case '&':
165: case ')':
166: case ',':
167: case '/':
168: case '^':
169: case 'a':
1.79 schwarze 170: case 'd':
1.111 schwarze 171: case 'r':
172: case 't':
1.79 schwarze 173: case 'u':
1.111 schwarze 174: case '{':
175: case '|':
176: case '}':
1.94 schwarze 177: return ESCAPE_IGNORE;
1.111 schwarze 178: case 'c':
179: return ESCAPE_NOSPACE;
1.102 schwarze 180: case 'p':
181: return ESCAPE_BREAK;
1.63 schwarze 182:
183: /*
184: * The \z escape is supposed to output the following
1.79 schwarze 185: * character without advancing the cursor position.
1.63 schwarze 186: * Since we are mostly dealing with terminal mode,
187: * let us just skip the next character.
188: */
1.79 schwarze 189: case 'z':
1.94 schwarze 190: return ESCAPE_SKIPCHAR;
1.1 kristaps 191:
1.45 kristaps 192: /*
193: * Handle all triggers matching \X(xy, \Xx, and \X[xxxx], where
194: * 'X' is the trigger. These have opaque sub-strings.
195: */
1.79 schwarze 196: case 'F':
1.111 schwarze 197: case 'f':
1.79 schwarze 198: case 'g':
199: case 'k':
200: case 'M':
201: case 'm':
202: case 'n':
1.111 schwarze 203: case 'O':
1.79 schwarze 204: case 'V':
205: case 'Y':
1.111 schwarze 206: gly = (*start)[-1] == 'f' ? ESCAPE_FONT : ESCAPE_IGNORE;
1.65 schwarze 207: switch (**start) {
1.79 schwarze 208: case '(':
1.111 schwarze 209: if ((*start)[-1] == 'O')
210: gly = ESCAPE_ERROR;
1.65 schwarze 211: *start = ++*end;
212: *sz = 2;
1.45 kristaps 213: break;
1.79 schwarze 214: case '[':
1.111 schwarze 215: if ((*start)[-1] == 'O')
216: gly = (*start)[1] == '5' ?
217: ESCAPE_UNSUPP : ESCAPE_ERROR;
1.65 schwarze 218: *start = ++*end;
1.45 kristaps 219: term = ']';
220: break;
221: default:
1.111 schwarze 222: if ((*start)[-1] == 'O') {
223: switch (**start) {
224: case '0':
225: gly = ESCAPE_UNSUPP;
226: break;
227: case '1':
228: case '2':
229: case '3':
230: case '4':
231: break;
232: default:
233: gly = ESCAPE_ERROR;
234: break;
235: }
236: }
1.65 schwarze 237: *sz = 1;
1.45 kristaps 238: break;
239: }
1.106 schwarze 240: break;
241: case '*':
242: if (strncmp(*start, "(.T", 3) != 0)
243: abort();
244: gly = ESCAPE_DEVICE;
245: *start = ++*end;
246: *sz = 2;
1.45 kristaps 247: break;
248:
249: /*
250: * These escapes are of the form \X'Y', where 'X' is the trigger
251: * and 'Y' is any string. These have opaque sub-strings.
1.78 schwarze 252: * The \B and \w escapes are handled in roff.c, roff_res().
1.45 kristaps 253: */
1.79 schwarze 254: case 'A':
255: case 'b':
256: case 'D':
257: case 'R':
258: case 'X':
259: case 'Z':
1.91 schwarze 260: gly = ESCAPE_IGNORE;
261: /* FALLTHROUGH */
262: case 'o':
263: if (**start == '\0')
1.94 schwarze 264: return ESCAPE_ERROR;
1.91 schwarze 265: if (gly == ESCAPE_ERROR)
266: gly = ESCAPE_OVERSTRIKE;
1.77 schwarze 267: term = **start;
1.65 schwarze 268: *start = ++*end;
1.24 kristaps 269: break;
1.45 kristaps 270:
271: /*
272: * These escapes are of the form \X'N', where 'X' is the trigger
273: * and 'N' resolves to a numerical expression.
274: */
1.79 schwarze 275: case 'h':
276: case 'H':
277: case 'L':
278: case 'l':
279: case 'S':
280: case 'v':
281: case 'x':
1.82 schwarze 282: if (strchr(" %&()*+-./0123456789:<=>", **start)) {
1.86 kristaps 283: if ('\0' != **start)
284: ++*end;
1.94 schwarze 285: return ESCAPE_ERROR;
1.82 schwarze 286: }
1.100 schwarze 287: switch ((*start)[-1]) {
288: case 'h':
289: gly = ESCAPE_HORIZ;
290: break;
291: case 'l':
292: gly = ESCAPE_HLINE;
293: break;
294: default:
295: gly = ESCAPE_IGNORE;
296: break;
297: }
1.77 schwarze 298: term = **start;
1.65 schwarze 299: *start = ++*end;
1.45 kristaps 300: break;
1.60 schwarze 301:
302: /*
303: * Special handling for the numbered character escape.
304: * XXX Do any other escapes need similar handling?
305: */
1.79 schwarze 306: case 'N':
1.65 schwarze 307: if ('\0' == **start)
1.94 schwarze 308: return ESCAPE_ERROR;
1.65 schwarze 309: (*end)++;
310: if (isdigit((unsigned char)**start)) {
311: *sz = 1;
1.94 schwarze 312: return ESCAPE_IGNORE;
1.65 schwarze 313: }
314: (*start)++;
1.60 schwarze 315: while (isdigit((unsigned char)**end))
316: (*end)++;
1.65 schwarze 317: *sz = *end - *start;
1.60 schwarze 318: if ('\0' != **end)
319: (*end)++;
1.94 schwarze 320: return ESCAPE_NUMBERED;
1.45 kristaps 321:
1.79 schwarze 322: /*
1.45 kristaps 323: * Sizes get a special category of their own.
324: */
1.79 schwarze 325: case 's':
1.45 kristaps 326: gly = ESCAPE_IGNORE;
1.28 kristaps 327:
1.45 kristaps 328: /* See +/- counts as a sign. */
1.65 schwarze 329: if ('+' == **end || '-' == **end || ASCII_HYPH == **end)
1.90 schwarze 330: *start = ++*end;
1.8 kristaps 331:
1.65 schwarze 332: switch (**end) {
1.79 schwarze 333: case '(':
1.65 schwarze 334: *start = ++*end;
335: *sz = 2;
1.22 kristaps 336: break;
1.79 schwarze 337: case '[':
1.65 schwarze 338: *start = ++*end;
1.64 schwarze 339: term = ']';
1.22 kristaps 340: break;
1.79 schwarze 341: case '\'':
1.65 schwarze 342: *start = ++*end;
1.64 schwarze 343: term = '\'';
1.92 schwarze 344: break;
345: case '3':
346: case '2':
347: case '1':
348: *sz = (*end)[-1] == 's' &&
349: isdigit((unsigned char)(*end)[1]) ? 2 : 1;
1.22 kristaps 350: break;
351: default:
1.65 schwarze 352: *sz = 1;
1.22 kristaps 353: break;
1.8 kristaps 354: }
355:
1.45 kristaps 356: break;
1.33 kristaps 357:
1.45 kristaps 358: /*
1.111 schwarze 359: * Several special characters can be encoded as
360: * one-byte escape sequences without using \[].
1.45 kristaps 361: */
1.111 schwarze 362: case ' ':
363: case '\'':
364: case '-':
365: case '.':
366: case '0':
367: case ':':
368: case '_':
369: case '`':
370: case 'e':
371: case '~':
372: gly = ESCAPE_SPECIAL;
373: /* FALLTHROUGH */
1.45 kristaps 374: default:
1.111 schwarze 375: if (gly == ESCAPE_ERROR)
376: gly = ESCAPE_UNDEF;
1.65 schwarze 377: *start = --*end;
378: *sz = 1;
1.22 kristaps 379: break;
1.45 kristaps 380: }
381:
382: /*
1.64 schwarze 383: * Read up to the terminating character,
384: * paying attention to nested escapes.
1.45 kristaps 385: */
386:
387: if ('\0' != term) {
1.64 schwarze 388: while (**end != term) {
389: switch (**end) {
1.79 schwarze 390: case '\0':
1.94 schwarze 391: return ESCAPE_ERROR;
1.79 schwarze 392: case '\\':
1.64 schwarze 393: (*end)++;
394: if (ESCAPE_ERROR ==
395: mandoc_escape(end, NULL, NULL))
1.94 schwarze 396: return ESCAPE_ERROR;
1.64 schwarze 397: break;
398: default:
399: (*end)++;
400: break;
401: }
402: }
1.65 schwarze 403: *sz = (*end)++ - *start;
1.111 schwarze 404:
405: /*
406: * The file chars.c only provides one common list
407: * of character names, but \[-] == \- is the only
408: * one of the characters with one-byte names that
409: * allows enclosing the name in brackets.
410: */
411: if (gly == ESCAPE_SPECIAL && *sz == 1 && **start != '-')
412: return ESCAPE_ERROR;
1.64 schwarze 413: } else {
1.65 schwarze 414: assert(*sz > 0);
415: if ((size_t)*sz > strlen(*start))
1.94 schwarze 416: return ESCAPE_ERROR;
1.65 schwarze 417: *end += *sz;
1.45 kristaps 418: }
419:
420: /* Run post-processors. */
421:
422: switch (gly) {
1.79 schwarze 423: case ESCAPE_FONT:
1.112 schwarze 424: gly = mandoc_font(*start, *sz);
1.46 kristaps 425: break;
1.79 schwarze 426: case ESCAPE_SPECIAL:
1.105 schwarze 427: if (**start == 'c') {
428: if (*sz < 6 || *sz > 7 ||
429: strncmp(*start, "char", 4) != 0 ||
430: (int)strspn(*start + 4, "0123456789") + 4 < *sz)
431: break;
432: c = 0;
433: for (i = 4; i < *sz; i++)
434: c = 10 * c + ((*start)[i] - '0');
435: if (c < 0x21 || (c > 0x7e && c < 0xa0) || c > 0xff)
436: break;
437: *start += 4;
438: *sz -= 4;
439: gly = ESCAPE_NUMBERED;
440: break;
441: }
442:
1.87 schwarze 443: /*
1.88 schwarze 444: * Unicode escapes are defined in groff as \[u0000]
1.87 schwarze 445: * to \[u10FFFF], where the contained value must be
446: * a valid Unicode codepoint. Here, however, only
1.88 schwarze 447: * check the length and range.
1.87 schwarze 448: */
1.88 schwarze 449: if (**start != 'u' || *sz < 5 || *sz > 7)
450: break;
451: if (*sz == 7 && ((*start)[1] != '1' || (*start)[2] != '0'))
452: break;
453: if (*sz == 6 && (*start)[1] == '0')
1.96 schwarze 454: break;
455: if (*sz == 5 && (*start)[1] == 'D' &&
456: strchr("89ABCDEF", (*start)[2]) != NULL)
1.88 schwarze 457: break;
458: if ((int)strspn(*start + 1, "0123456789ABCDEFabcdef")
1.87 schwarze 459: + 1 == *sz)
460: gly = ESCAPE_UNICODE;
1.22 kristaps 461: break;
1.1 kristaps 462: default:
1.22 kristaps 463: break;
1.1 kristaps 464: }
465:
1.94 schwarze 466: return gly;
1.4 kristaps 467: }
1.7 kristaps 468:
/*
 * Parse the string p according to the strptime(3) format fmt.
 * If the whole string matches the format, store the result
 * of mktime(3) in *t and return 1.  Otherwise, return 0 and
 * leave *t untouched.  When HAVE_STRPTIME is not set,
 * nothing is parsed and the function always returns 0.
 */
static int
a2time(time_t *t, const char *fmt, const char *p)
{
	struct tm	 parsed;
	char		*rest;

	memset(&parsed, 0, sizeof(parsed));

	rest = NULL;
#if HAVE_STRPTIME
	rest = strptime(p, fmt, &parsed);
#endif

	/* Reject parse failures and trailing garbage alike. */

	if (rest == NULL || *rest != '\0')
		return 0;

	*t = mktime(&parsed);
	return 1;
}
488:
/*
 * Format the time t, interpreted as local time, in the form
 * "Month day, year", with the day printed without padding.
 * Returns a buffer obtained from mandoc_malloc(),
 * or NULL if the time cannot be converted or formatted.
 */
static char *
time2a(time_t t)
{
	struct tm	*tm;
	char		*buf, *cur;
	size_t		 len;
	int		 n;

	if ((tm = localtime(&t)) == NULL)
		return NULL;

	/*
	 * Space requirements:
	 * up to 9 characters for the month (September) + blank
	 * up to 2 characters for the day + comma + blank
	 * 4 characters for the year and a terminating '\0'
	 */

	buf = mandoc_malloc(10 + 4 + 4 + 1);
	cur = buf;

	/* The month name, followed by a blank. */

	len = strftime(cur, 10 + 1, "%B ", tm);
	if (len == 0)
		goto fail;
	cur += (int)len;

	/*
	 * The day is printed with plain "%d", neither "%2d" nor
	 * "%02d".  That is why the date cannot be formatted as
	 * a whole with "%B %e, %Y" or "%B %d, %Y".  Besides,
	 * going piece by piece is less prone to buffer overflows,
	 * in case anybody should ever introduce the bug
	 * of looking at LC_TIME.
	 */

	n = snprintf(cur, 4 + 1, "%d, ", tm->tm_mday);
	if (n == -1)
		goto fail;
	cur += n;

	/* Finally, the year. */

	if (strftime(cur, 4 + 1, "%Y", tm) != 0)
		return buf;

fail:
	free(buf);
	return NULL;
}
535:
536: char *
1.101 schwarze 537: mandoc_normdate(struct roff_man *man, char *in, int ln, int pos)
1.7 kristaps 538: {
1.103 schwarze 539: char *cp;
1.7 kristaps 540: time_t t;
541:
1.98 schwarze 542: /* No date specified: use today's date. */
543:
544: if (in == NULL || *in == '\0' || strcmp(in, "$" "Mdocdate$") == 0) {
1.109 schwarze 545: mandoc_msg(MANDOCERR_DATE_MISSING, ln, pos, NULL);
1.98 schwarze 546: return time2a(time(NULL));
1.37 schwarze 547: }
1.98 schwarze 548:
549: /* Valid mdoc(7) date format. */
550:
551: if (a2time(&t, "$" "Mdocdate: %b %d %Y $", in) ||
1.103 schwarze 552: a2time(&t, "%b %d, %Y", in)) {
553: cp = time2a(t);
554: if (t > time(NULL) + 86400)
1.109 schwarze 555: mandoc_msg(MANDOCERR_DATE_FUTURE, ln, pos, "%s", cp);
1.104 schwarze 556: else if (*in != '$' && strcmp(in, cp) != 0)
1.109 schwarze 557: mandoc_msg(MANDOCERR_DATE_NORM, ln, pos, "%s", cp);
1.103 schwarze 558: return cp;
559: }
1.98 schwarze 560:
1.101 schwarze 561: /* In man(7), do not warn about the legacy format. */
1.98 schwarze 562:
1.101 schwarze 563: if (a2time(&t, "%Y-%m-%d", in) == 0)
1.109 schwarze 564: mandoc_msg(MANDOCERR_DATE_BAD, ln, pos, "%s", in);
1.103 schwarze 565: else if (t > time(NULL) + 86400)
1.109 schwarze 566: mandoc_msg(MANDOCERR_DATE_FUTURE, ln, pos, "%s", in);
1.114 ! schwarze 567: else if (man->meta.macroset == MACROSET_MDOC)
1.109 schwarze 568: mandoc_msg(MANDOCERR_DATE_LEGACY, ln, pos, "Dd %s", in);
1.98 schwarze 569:
570: /* Use any non-mdoc(7) date verbatim. */
571:
572: return mandoc_strdup(in);
1.7 kristaps 573: }
574:
/*
 * Decide whether the sz bytes starting at p end a sentence.
 *
 * The buffer is scanned backwards.  Closing delimiters
 * (double quote, single quote, bracket, parenthesis) and
 * end-of-sentence punctuation (period, exclamation mark,
 * question mark) are skipped; the first other byte decides.
 * Punctuation followed by closing delimiters only counts
 * if the byte preceding the punctuation is alphanumeric.
 *
 * Returns 1 at the end of a sentence and 0 otherwise;
 * an empty buffer never ends a sentence.
 */
int
mandoc_eos(const char *p, size_t sz)
{
	size_t	 i;
	int	 enclosed, found;

	/*
	 * End-of-sentence recognition must include situations where
	 * some symbols, such as `)', allow prior EOS punctuation to
	 * propagate outward.
	 *
	 * Count down with an index rather than a pointer, such that
	 * no pointer before the start of the buffer is ever formed
	 * and sz is never squeezed through an int.
	 */

	enclosed = found = 0;
	for (i = sz; i > 0; i--) {
		switch (p[i - 1]) {
		case '\"':
		case '\'':
		case ']':
		case ')':
			/* Only delimiters after the punctuation matter. */
			if (found == 0)
				enclosed = 1;
			break;
		case '.':
		case '!':
		case '?':
			found = 1;
			break;
		default:
			return found && (!enclosed ||
			    isalnum((unsigned char)p[i - 1]));
		}
	}

	/* The buffer consists of punctuation and delimiters only. */

	return found && !enclosed;
}
1.50 kristaps 613:
/*
 * Convert the first sz bytes of p to an int using strtol(3)
 * with the given base.
 * Return -1 if more than 31 bytes are given, if the input is
 * empty, or if strtol(3) does not consume all of it.
 * Results outside the range of int are clamped to INT_MIN
 * or INT_MAX.  Negative numbers are returned as such, so
 * a return value of -1 does not necessarily indicate failure.
 */
int
mandoc_strntoi(const char *p, size_t sz, int base)
{
	char	 numbuf[32];
	char	*endp;
	long	 val;

	/* Leave room for the terminating NUL byte. */

	if (sz >= sizeof(numbuf))
		return -1;

	memcpy(numbuf, p, sz);
	numbuf[sz] = '\0';

	errno = 0;
	val = strtol(numbuf, &endp, base);

	if (numbuf[0] == '\0' || *endp != '\0')
		return -1;

	if (val > INT_MAX)
		return INT_MAX;
	if (val < INT_MIN)
		return INT_MIN;
	return (int)val;
}
CVSweb