Annotation of mandoc/mandoc.c, Revision 1.100
1.100 ! schwarze 1: /* $Id: mandoc.c,v 1.99 2017/06/01 19:05:37 schwarze Exp $ */
1.1 kristaps 2: /*
1.90 schwarze 3: * Copyright (c) 2008-2011, 2014 Kristaps Dzonsons <kristaps@bsd.lv>
1.100 ! schwarze 4: * Copyright (c) 2011-2015, 2017 Ingo Schwarze <schwarze@openbsd.org>
1.1 kristaps 5: *
6: * Permission to use, copy, modify, and distribute this software for any
7: * purpose with or without fee is hereby granted, provided that the above
8: * copyright notice and this permission notice appear in all copies.
9: *
1.36 schwarze 10: * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
1.1 kristaps 11: * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
1.36 schwarze 12: * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
1.1 kristaps 13: * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14: * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15: * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16: * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17: */
1.9 kristaps 18: #include "config.h"
1.7 kristaps 19:
1.2 kristaps 20: #include <sys/types.h>
21:
1.1 kristaps 22: #include <assert.h>
23: #include <ctype.h>
1.50 kristaps 24: #include <errno.h>
25: #include <limits.h>
1.1 kristaps 26: #include <stdlib.h>
1.4 kristaps 27: #include <stdio.h>
28: #include <string.h>
1.7 kristaps 29: #include <time.h>
1.1 kristaps 30:
1.18 kristaps 31: #include "mandoc.h"
1.76 schwarze 32: #include "mandoc_aux.h"
1.1 kristaps 33: #include "libmandoc.h"
1.37 schwarze 34:
/* Prototypes for the two file-local date helpers defined further down. */
1.18 kristaps 35: static int a2time(time_t *, const char *, const char *);
1.37 schwarze 36: static char *time2a(time_t);
1.7 kristaps 37:
1.45 kristaps 38:
/*
 * Parse one roff escape sequence.
 *
 * On entry, *end must point to the character right after the
 * backslash, i.e. to the escape name.
 * On a non-error return, *end has been moved past the characters
 * consumed, and *start and *sz describe the argument substring
 * inside the input (*sz stays 0 for the escapes that return early
 * without taking an argument).
 * Both start and sz may be NULL if the caller has no use for them.
 *
 * Returns the class of the escape, or ESCAPE_ERROR if the sequence
 * is incomplete or malformed; *end may already have been advanced
 * in that case.
 */
39: enum mandoc_esc
1.74 schwarze 40: mandoc_escape(const char **end, const char **start, int *sz)
1.1 kristaps 41: {
1.65 schwarze 42: const char *local_start;
43: int local_sz;
44: char term;
1.79 schwarze 45: enum mandoc_esc gly;
1.45 kristaps 46:
1.65 schwarze 47: /*
48: * When the caller doesn't provide return storage,
49: * use local storage.
50: */
51:
52: if (NULL == start)
53: start = &local_start;
54: if (NULL == sz)
55: sz = &local_sz;
56:
57: /*
58: * Beyond the backslash, at least one input character
59: * is part of the escape sequence. With one exception
60: * (see below), that character won't be returned.
61: */
62:
1.45 kristaps 63: gly = ESCAPE_ERROR;
1.65 schwarze 64: *start = ++*end;
65: *sz = 0;
1.64 schwarze 66: term = '\0';
1.18 kristaps 67:
/* Dispatch on the escape name, i.e. the character just stepped over. */
1.65 schwarze 68: switch ((*start)[-1]) {
1.45 kristaps 69: /*
70: * First the glyphs. There are several different forms of
71: * these, but each eventually returns a substring of the glyph
72: * name.
73: */
1.79 schwarze 74: case '(':
1.45 kristaps 75: gly = ESCAPE_SPECIAL;
1.65 schwarze 76: *sz = 2;
1.45 kristaps 77: break;
1.79 schwarze 78: case '[':
1.45 kristaps 79: gly = ESCAPE_SPECIAL;
80: term = ']';
81: break;
1.79 schwarze 82: case 'C':
1.65 schwarze 83: if ('\'' != **start)
1.94 schwarze 84: return ESCAPE_ERROR;
1.65 schwarze 85: *start = ++*end;
1.87 schwarze 86: gly = ESCAPE_SPECIAL;
1.45 kristaps 87: term = '\'';
88: break;
1.72 schwarze 89:
90: /*
91: * Escapes taking no arguments at all.
92: */
1.79 schwarze 93: case 'd':
94: case 'u':
1.93 schwarze 95: case ',':
96: case '/':
1.94 schwarze 97: return ESCAPE_IGNORE;
1.63 schwarze 98:
99: /*
100: * The \z escape is supposed to output the following
1.79 schwarze 101: * character without advancing the cursor position.
1.63 schwarze 102: * Since we are mostly dealing with terminal mode,
103: * let us just skip the next character.
104: */
1.79 schwarze 105: case 'z':
1.94 schwarze 106: return ESCAPE_SKIPCHAR;
1.1 kristaps 107:
1.45 kristaps 108: /*
109: * Handle all triggers matching \X(xy, \Xx, and \X[xxxx], where
110: * 'X' is the trigger. These have opaque sub-strings.
111: */
1.79 schwarze 112: case 'F':
113: case 'g':
114: case 'k':
115: case 'M':
116: case 'm':
117: case 'n':
118: case 'V':
119: case 'Y':
1.60 schwarze 120: gly = ESCAPE_IGNORE;
1.24 kristaps 121: /* FALLTHROUGH */
1.79 schwarze 122: case 'f':
/* gly is still ESCAPE_ERROR only when we entered directly at 'f'. */
1.45 kristaps 123: if (ESCAPE_ERROR == gly)
124: gly = ESCAPE_FONT;
1.65 schwarze 125: switch (**start) {
1.79 schwarze 126: case '(':
1.65 schwarze 127: *start = ++*end;
128: *sz = 2;
1.45 kristaps 129: break;
1.79 schwarze 130: case '[':
1.65 schwarze 131: *start = ++*end;
1.45 kristaps 132: term = ']';
133: break;
134: default:
1.65 schwarze 135: *sz = 1;
1.45 kristaps 136: break;
137: }
138: break;
139:
140: /*
141: * These escapes are of the form \X'Y', where 'X' is the trigger
142: * and 'Y' is any string. These have opaque sub-strings.
1.78 schwarze 143: * The \B and \w escapes are handled in roff.c, roff_res().
1.45 kristaps 144: */
1.79 schwarze 145: case 'A':
146: case 'b':
147: case 'D':
148: case 'R':
149: case 'X':
150: case 'Z':
1.91 schwarze 151: gly = ESCAPE_IGNORE;
152: /* FALLTHROUGH */
153: case 'o':
154: if (**start == '\0')
1.94 schwarze 155: return ESCAPE_ERROR;
1.91 schwarze 156: if (gly == ESCAPE_ERROR)
157: gly = ESCAPE_OVERSTRIKE;
/* Whatever character opens the argument must also close it. */
1.77 schwarze 158: term = **start;
1.65 schwarze 159: *start = ++*end;
1.24 kristaps 160: break;
1.45 kristaps 161:
162: /*
163: * These escapes are of the form \X'N', where 'X' is the trigger
164: * and 'N' resolves to a numerical expression.
165: */
1.79 schwarze 166: case 'h':
167: case 'H':
168: case 'L':
169: case 'l':
170: case 'S':
171: case 'v':
172: case 'x':
/*
 * strchr() also matches the terminating NUL of the set, so the
 * end of the input lands in this branch, too; only a real
 * character is stepped over before reporting the error.
 */
1.82 schwarze 173: if (strchr(" %&()*+-./0123456789:<=>", **start)) {
1.86 kristaps 174: if ('\0' != **start)
175: ++*end;
1.94 schwarze 176: return ESCAPE_ERROR;
1.82 schwarze 177: }
1.100 ! schwarze 178: switch ((*start)[-1]) {
! 179: case 'h':
! 180: gly = ESCAPE_HORIZ;
! 181: break;
! 182: case 'l':
! 183: gly = ESCAPE_HLINE;
! 184: break;
! 185: default:
! 186: gly = ESCAPE_IGNORE;
! 187: break;
! 188: }
1.77 schwarze 189: term = **start;
1.65 schwarze 190: *start = ++*end;
1.45 kristaps 191: break;
1.60 schwarze 192:
193: /*
194: * Special handling for the numbered character escape.
195: * XXX Do any other escapes need similar handling?
196: */
1.79 schwarze 197: case 'N':
1.65 schwarze 198: if ('\0' == **start)
1.94 schwarze 199: return ESCAPE_ERROR;
1.65 schwarze 200: (*end)++;
/* A digit in place of the opening delimiter: consume it and ignore. */
201: if (isdigit((unsigned char)**start)) {
202: *sz = 1;
1.94 schwarze 203: return ESCAPE_IGNORE;
1.65 schwarze 204: }
/* Step over the opening delimiter, then collect the digits. */
205: (*start)++;
1.60 schwarze 206: while (isdigit((unsigned char)**end))
207: (*end)++;
1.65 schwarze 208: *sz = *end - *start;
/* Step over the character following the digits, unless at the end. */
1.60 schwarze 209: if ('\0' != **end)
210: (*end)++;
1.94 schwarze 211: return ESCAPE_NUMBERED;
1.45 kristaps 212:
1.79 schwarze 213: /*
1.45 kristaps 214: * Sizes get a special category of their own.
215: */
1.79 schwarze 216: case 's':
1.45 kristaps 217: gly = ESCAPE_IGNORE;
1.28 kristaps 218:
1.45 kristaps 219: /* See +/- counts as a sign. */
1.65 schwarze 220: if ('+' == **end || '-' == **end || ASCII_HYPH == **end)
1.90 schwarze 221: *start = ++*end;
1.8 kristaps 222:
1.65 schwarze 223: switch (**end) {
1.79 schwarze 224: case '(':
1.65 schwarze 225: *start = ++*end;
226: *sz = 2;
1.22 kristaps 227: break;
1.79 schwarze 228: case '[':
1.65 schwarze 229: *start = ++*end;
1.64 schwarze 230: term = ']';
1.22 kristaps 231: break;
1.79 schwarze 232: case '\'':
1.65 schwarze 233: *start = ++*end;
1.64 schwarze 234: term = '\'';
1.92 schwarze 235: break;
236: case '3':
237: case '2':
238: case '1':
/*
 * Two digits are only taken when this digit directly follows
 * the 's' (no sign was consumed above) and another digit
 * comes right after it.
 */
239: *sz = (*end)[-1] == 's' &&
240: isdigit((unsigned char)(*end)[1]) ? 2 : 1;
1.22 kristaps 241: break;
242: default:
1.65 schwarze 243: *sz = 1;
1.22 kristaps 244: break;
1.8 kristaps 245: }
246:
1.45 kristaps 247: break;
1.33 kristaps 248:
1.45 kristaps 249: /*
250: * Anything else is assumed to be a glyph.
1.65 schwarze 251: * In this case, pass back the character after the backslash.
1.45 kristaps 252: */
253: default:
254: gly = ESCAPE_SPECIAL;
1.65 schwarze 255: *start = --*end;
256: *sz = 1;
1.22 kristaps 257: break;
1.45 kristaps 258: }
259:
260: assert(ESCAPE_ERROR != gly);
261:
262: /*
1.64 schwarze 263: * Read up to the terminating character,
264: * paying attention to nested escapes.
1.45 kristaps 265: */
266:
267: if ('\0' != term) {
1.64 schwarze 268: while (**end != term) {
269: switch (**end) {
1.79 schwarze 270: case '\0':
1.94 schwarze 271: return ESCAPE_ERROR;
1.79 schwarze 272: case '\\':
/* Skip the backslash and let a recursive call consume the nested escape. */
1.64 schwarze 273: (*end)++;
274: if (ESCAPE_ERROR ==
275: mandoc_escape(end, NULL, NULL))
1.94 schwarze 276: return ESCAPE_ERROR;
1.64 schwarze 277: break;
278: default:
279: (*end)++;
280: break;
281: }
282: }
/* The length excludes the terminator; *end is moved past it. */
1.65 schwarze 283: *sz = (*end)++ - *start;
1.64 schwarze 284: } else {
/* Fixed-length argument: fail if the input ends before *sz bytes. */
1.65 schwarze 285: assert(*sz > 0);
286: if ((size_t)*sz > strlen(*start))
1.94 schwarze 287: return ESCAPE_ERROR;
1.65 schwarze 288: *end += *sz;
1.45 kristaps 289: }
290:
291: /* Run post-processors. */
292:
293: switch (gly) {
1.79 schwarze 294: case ESCAPE_FONT:
1.68 schwarze 295: if (2 == *sz) {
296: if ('C' == **start) {
297: /*
298: * Treat constant-width font modes
299: * just like regular font modes.
300: */
301: (*start)++;
302: (*sz)--;
303: } else {
304: if ('B' == (*start)[0] && 'I' == (*start)[1])
305: gly = ESCAPE_FONTBI;
306: break;
307: }
1.65 schwarze 308: } else if (1 != *sz)
1.45 kristaps 309: break;
1.61 kristaps 310:
/* Exactly one font character is left; unknown ones stay ESCAPE_FONT. */
1.65 schwarze 311: switch (**start) {
1.79 schwarze 312: case '3':
313: case 'B':
1.45 kristaps 314: gly = ESCAPE_FONTBOLD;
315: break;
1.79 schwarze 316: case '2':
317: case 'I':
1.45 kristaps 318: gly = ESCAPE_FONTITALIC;
1.22 kristaps 319: break;
1.79 schwarze 320: case 'P':
1.45 kristaps 321: gly = ESCAPE_FONTPREV;
1.22 kristaps 322: break;
1.79 schwarze 323: case '1':
324: case 'R':
1.45 kristaps 325: gly = ESCAPE_FONTROMAN;
1.1 kristaps 326: break;
327: }
1.46 kristaps 328: break;
1.79 schwarze 329: case ESCAPE_SPECIAL:
/*
 * No break is needed after this assignment: a lone 'c' is
 * not 'u', so the first Unicode check below leaves the switch.
 */
1.65 schwarze 330: if (1 == *sz && 'c' == **start)
1.45 kristaps 331: gly = ESCAPE_NOSPACE;
1.87 schwarze 332: /*
1.88 schwarze 333: * Unicode escapes are defined in groff as \[u0000]
1.87 schwarze 334: * to \[u10FFFF], where the contained value must be
335: * a valid Unicode codepoint. Here, however, only
1.88 schwarze 336: * check the length and range.
1.87 schwarze 337: */
1.88 schwarze 338: if (**start != 'u' || *sz < 5 || *sz > 7)
339: break;
340: if (*sz == 7 && ((*start)[1] != '1' || (*start)[2] != '0'))
341: break;
342: if (*sz == 6 && (*start)[1] == '0')
1.96 schwarze 343: break;
/* Reject uD800 to uDFFF; note that only an uppercase 'D' is tested. */
344: if (*sz == 5 && (*start)[1] == 'D' &&
345: strchr("89ABCDEF", (*start)[2]) != NULL)
1.88 schwarze 346: break;
/* Everything after the 'u' has to be a hexadecimal digit. */
347: if ((int)strspn(*start + 1, "0123456789ABCDEFabcdef")
1.87 schwarze 348: + 1 == *sz)
349: gly = ESCAPE_UNICODE;
1.22 kristaps 350: break;
1.1 kristaps 351: default:
1.22 kristaps 352: break;
1.1 kristaps 353: }
354:
1.94 schwarze 355: return gly;
1.36 schwarze 356: }
357:
358: /*
359: * Parse a quoted or unquoted roff-style request or macro argument.
360: * Return a pointer to the parsed argument, which is either the original
361: * pointer or advanced by one byte in case the argument is quoted.
1.71 schwarze 362: * NUL-terminate the argument in place.
1.36 schwarze 363: * Collapse pairs of quotes inside quoted arguments.
364: * Advance the argument pointer to the next argument,
1.71 schwarze 365: * or to the NUL byte terminating the argument line.
 *
 * The buffer *cpp points into is modified in place.
 * parse and ln are only handed on to mandoc_msg() for diagnostics.
 * *pos is advanced by the number of input bytes consumed, counting
 * the opening quote of a quoted argument and any blanks skipped
 * after the argument.
1.36 schwarze 366: */
367: char *
1.48 kristaps 368: mandoc_getarg(struct mparse *parse, char **cpp, int ln, int *pos)
1.36 schwarze 369: {
370: char *start, *cp;
/*
 * quoted: 0 = unquoted argument, 1 = inside a quoted argument that
 *         has not been closed yet, 2 = closing quote was found.
 * pairs:  number of bytes dropped so far, i.e. how far the
 *         remaining text has to be shifted to the left.
 * white:  set when an unquoted argument was ended by a blank.
 */
371: int quoted, pairs, white;
372:
373: /* Quoting can only start with a new word. */
374: start = *cpp;
1.47 kristaps 375: quoted = 0;
1.36 schwarze 376: if ('"' == *start) {
377: quoted = 1;
378: start++;
1.79 schwarze 379: }
1.36 schwarze 380:
381: pairs = 0;
382: white = 0;
383: for (cp = start; '\0' != *cp; cp++) {
1.67 schwarze 384:
385: /*
386: * Move the following text left
387: * after quoted quotes and after "\\" and "\t".
388: */
1.36 schwarze 389: if (pairs)
390: cp[-pairs] = cp[0];
1.67 schwarze 391:
1.36 schwarze 392: if ('\\' == cp[0]) {
1.67 schwarze 393: /*
394: * In copy mode, translate double to single
395: * backslashes and backslash-t to literal tabs.
396: */
397: switch (cp[1]) {
1.79 schwarze 398: case 't':
/*
 * The shift at the top of the loop has already run for this
 * position, so the tab is stored at cp[0] without the offset.
 * NOTE(review): when pairs is already non-zero, the shifted
 * copy at cp[-pairs] appears to keep the backslash; confirm.
 */
1.67 schwarze 399: cp[0] = '\t';
400: /* FALLTHROUGH */
1.79 schwarze 401: case '\\':
/* Drop the second byte of the two-byte sequence. */
1.36 schwarze 402: pairs++;
403: cp++;
1.67 schwarze 404: break;
1.79 schwarze 405: case ' ':
1.36 schwarze 406: /* Skip escaped blanks. */
1.67 schwarze 407: if (0 == quoted)
408: cp++;
409: break;
410: default:
411: break;
412: }
1.36 schwarze 413: } else if (0 == quoted) {
414: if (' ' == cp[0]) {
415: /* Unescaped blanks end unquoted args. */
416: white = 1;
417: break;
418: }
419: } else if ('"' == cp[0]) {
420: if ('"' == cp[1]) {
421: /* Quoted quotes collapse. */
422: pairs++;
423: cp++;
424: } else {
425: /* Unquoted quotes end quoted args. */
426: quoted = 2;
427: break;
428: }
429: }
430: }
431:
432: /* Quoted argument without a closing quote. */
1.48 kristaps 433: if (1 == quoted)
1.83 schwarze 434: mandoc_msg(MANDOCERR_ARG_QUOTE, parse, ln, *pos, NULL);
1.36 schwarze 435:
1.71 schwarze 436: /* NUL-terminate this argument and move to the next one. */
/* If text was shifted left, the argument really ends pairs bytes earlier. */
1.36 schwarze 437: if (pairs)
438: cp[-pairs] = '\0';
/* cp sits on the delimiter (blank or closing quote) unless at the end. */
439: if ('\0' != *cp) {
440: *cp++ = '\0';
441: while (' ' == *cp)
442: cp++;
443: }
/* Any non-zero value of quoted means an opening quote was skipped. */
1.39 kristaps 444: *pos += (int)(cp - start) + (quoted ? 1 : 0);
1.36 schwarze 445: *cpp = cp;
446:
/* The line ended right after a blank. */
1.48 kristaps 447: if ('\0' == *cp && (white || ' ' == cp[-1]))
1.83 schwarze 448: mandoc_msg(MANDOCERR_SPACE_EOL, parse, ln, *pos, NULL);
1.36 schwarze 449:
1.94 schwarze 450: return start;
1.4 kristaps 451: }
1.7 kristaps 452:
/*
 * Try to parse the whole string p as a date in the strptime(3)
 * format fmt.  On success, store the result of mktime(3) in *t and
 * return 1.  Return 0, leaving *t untouched, if p does not match,
 * if trailing characters remain after the match, or if the build
 * has no strptime(3) at all (HAVE_STRPTIME is false).
 */
static int
a2time(time_t *t, const char *fmt, const char *p)
{
	struct tm	 parsed;
	char		*rest;

	/* Fields that the format does not mention stay zero. */
	memset(&parsed, 0, sizeof(parsed));

	rest = NULL;
#if HAVE_STRPTIME
	rest = strptime(p, fmt, &parsed);
#endif

	/* Reject both a failed parse and an incomplete match. */
	if (rest == NULL || *rest != '\0')
		return 0;

	*t = mktime(&parsed);
	return 1;
}
472:
/*
 * Format the time t, taken as local time, as "Month day, year",
 * with an unpadded day of the month.
 * Return a string allocated with mandoc_malloc() that the caller
 * has to free, or NULL if localtime(3) or one of the formatting
 * steps fails.
 */
static char *
time2a(time_t t)
{
	struct tm	*when;
	char		*out, *cur;
	size_t		 monthlen;
	int		 daylen;

	when = localtime(&t);
	if (when == NULL)
		return NULL;

	/*
	 * Space needed:
	 * 10 bytes for the longest month name (September) and a blank,
	 *  4 bytes for a two-digit day, a comma, and a blank,
	 *  4 bytes for the year, and 1 byte for the terminating '\0'.
	 */

	cur = out = mandoc_malloc(10 + 4 + 4 + 1);

	/*
	 * The three parts are formatted one by one because the day
	 * is wanted as plain "%d", which strftime(3) cannot produce;
	 * its "%d" and "%e" both pad to two characters.  Giving each
	 * part its own size limit also keeps an unexpectedly long
	 * month name from overflowing the buffer.
	 */

	monthlen = strftime(cur, 10 + 1, "%B ", when);
	if (monthlen != 0) {
		cur += (int)monthlen;
		daylen = snprintf(cur, 4 + 1, "%d, ", when->tm_mday);
		if (daylen != -1) {
			cur += daylen;
			if (strftime(cur, 4 + 1, "%Y", when) != 0)
				return out;
		}
	}

	/* One of the parts could not be formatted. */
	free(out);
	return NULL;
}
519:
520: char *
1.42 kristaps 521: mandoc_normdate(struct mparse *parse, char *in, int ln, int pos)
1.7 kristaps 522: {
523: time_t t;
524:
1.98 schwarze 525: /* No date specified: use today's date. */
526:
527: if (in == NULL || *in == '\0' || strcmp(in, "$" "Mdocdate$") == 0) {
1.80 schwarze 528: mandoc_msg(MANDOCERR_DATE_MISSING, parse, ln, pos, NULL);
1.98 schwarze 529: return time2a(time(NULL));
1.37 schwarze 530: }
1.98 schwarze 531:
532: /* Valid mdoc(7) date format. */
533:
534: if (a2time(&t, "$" "Mdocdate: %b %d %Y $", in) ||
535: a2time(&t, "%b %d, %Y", in))
536: return time2a(t);
537:
538: /* Do not warn about the legacy man(7) format. */
539:
540: if ( ! a2time(&t, "%Y-%m-%d", in))
1.81 schwarze 541: mandoc_msg(MANDOCERR_DATE_BAD, parse, ln, pos, in);
1.98 schwarze 542:
543: /* Use any non-mdoc(7) date verbatim. */
544:
545: return mandoc_strdup(in);
1.7 kristaps 546: }
547:
/*
 * Tell whether the sz bytes starting at p end a sentence.
 *
 * The bytes are scanned backwards.  Closing delimiters (double quote,
 * single quote, ']' and ')') and end-of-sentence punctuation ('.',
 * '!' and '?') are skipped until some other byte or the beginning
 * of the buffer is reached.  This lets punctuation propagate
 * outward through closing delimiters that follow it.
 *
 * Returns 1 if punctuation was found and either no closing delimiter
 * followed it or the byte in front of the punctuation is
 * alphanumeric; returns 0 otherwise, including for sz == 0.
 */
int
mandoc_eos(const char *p, size_t sz)
{
	size_t	 i;
	int	 enclosed, found;

	enclosed = found = 0;

	/*
	 * Count an index down from sz instead of walking a pointer
	 * down to p - 1: forming a pointer in front of the buffer is
	 * undefined behaviour, and the index also avoids squeezing
	 * sz through an int.  i is always one past the byte looked at.
	 */

	for (i = sz; i > 0; i--) {
		switch (p[i - 1]) {
		case '\"':
		case '\'':
		case ']':
		case ')':
			/* Only delimiters after the punctuation matter. */
			if (0 == found)
				enclosed = 1;
			break;
		case '.':
		case '!':
		case '?':
			found = 1;
			break;
		default:
			return found && (!enclosed ||
			    isalnum((unsigned char)p[i - 1]));
		}
	}

	/* Nothing but punctuation and closing delimiters. */

	return found && !enclosed;
}
1.50 kristaps 586:
/*
 * Convert the first sz bytes of p to an int with strtol(3),
 * using the given base.
 * Return -1 if sz is larger than 31, if no number can be parsed,
 * or if anything is left over after the number.
 * Values outside the range of int are clamped to INT_MIN or INT_MAX
 * rather than rejected.  Negative numbers are passed through, so a
 * valid input of "-1" cannot be told apart from the error return.
 */
int
mandoc_strntoi(const char *p, size_t sz, int base)
{
	char	 buf[32];
	char	*ep;
	long	 v;

	/* Leave room for the terminating NUL byte. */

	if (sz >= sizeof(buf))
		return -1;

	/* The input need not be terminated, so convert a private copy. */

	memcpy(buf, p, sz);
	buf[sz] = '\0';

	/*
	 * On overflow, strtol(3) saturates at LONG_MIN or LONG_MAX,
	 * which the clamping below narrows to the range of int,
	 * so errno is reset here but not inspected afterwards.
	 */

	errno = 0;
	v = strtol(buf, &ep, base);

	/* Nothing converted at all, or trailing garbage. */

	if (ep == buf || *ep != '\0')
		return -1;

	if (v > INT_MAX)
		v = INT_MAX;
	if (v < INT_MIN)
		v = INT_MIN;

	return (int)v;
}
CVSweb