Annotation of mandoc/mandoc.c, Revision 1.96
1.96 ! schwarze 1: /* $Id: mandoc.c,v 1.95 2015/10/12 00:08:15 schwarze Exp $ */
1.1 kristaps 2: /*
1.90 schwarze 3: * Copyright (c) 2008-2011, 2014 Kristaps Dzonsons <kristaps@bsd.lv>
4: * Copyright (c) 2011-2015 Ingo Schwarze <schwarze@openbsd.org>
1.1 kristaps 5: *
6: * Permission to use, copy, modify, and distribute this software for any
7: * purpose with or without fee is hereby granted, provided that the above
8: * copyright notice and this permission notice appear in all copies.
9: *
1.36 schwarze 10: * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
1.1 kristaps 11: * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
1.36 schwarze 12: * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
1.1 kristaps 13: * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14: * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15: * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16: * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17: */
1.9 kristaps 18: #include "config.h"
1.7 kristaps 19:
1.2 kristaps 20: #include <sys/types.h>
21:
1.1 kristaps 22: #include <assert.h>
23: #include <ctype.h>
1.50 kristaps 24: #include <errno.h>
25: #include <limits.h>
1.1 kristaps 26: #include <stdlib.h>
1.4 kristaps 27: #include <stdio.h>
28: #include <string.h>
1.7 kristaps 29: #include <time.h>
1.1 kristaps 30:
1.18 kristaps 31: #include "mandoc.h"
1.76 schwarze 32: #include "mandoc_aux.h"
1.1 kristaps 33: #include "libmandoc.h"
34:
/* NOTE(review): DATESIZE is not referenced in the visible part of this file - confirm before relying on it. */
#define DATESIZE 32

/* File-local helpers converting between date strings and time_t. */
static	int	 a2time(time_t *, const char *, const char *);
static	char	*time2a(time_t);
1.7 kristaps 39:
1.45 kristaps 40:
/*
 * Classify one roff escape sequence.
 *
 * On entry, *end must point to the character following the backslash.
 * On a successful return, *end has been advanced beyond the characters
 * that were consumed as part of the sequence.
 *
 * If start is not NULL, *start is set to the beginning of the substring
 * carrying the name or argument of the escape, and if sz is not NULL,
 * *sz is set to the length of that substring.  Either may be NULL
 * when the caller is not interested in the value.
 *
 * Returns the class of the escape sequence, or ESCAPE_ERROR if the
 * sequence is incomplete or malformed.
 */
enum mandoc_esc
mandoc_escape(const char **end, const char **start, int *sz)
{
	const char	*local_start;
	int		 local_sz;
	char		 term;	/* closing delimiter, or NUL for fixed length */
	enum mandoc_esc	 gly;

	/*
	 * When the caller doesn't provide return storage,
	 * use local storage.
	 */

	if (NULL == start)
		start = &local_start;
	if (NULL == sz)
		sz = &local_sz;

	/*
	 * Beyond the backslash, at least one input character
	 * is part of the escape sequence.  With one exception
	 * (see below), that character won't be returned.
	 */

	gly = ESCAPE_ERROR;
	*start = ++*end;
	*sz = 0;
	term = '\0';

	/* (*start)[-1] is the character right after the backslash. */
	switch ((*start)[-1]) {
	/*
	 * First the glyphs.  There are several different forms of
	 * these, but each eventually returns a substring of the glyph
	 * name.
	 */
	case '(':
		gly = ESCAPE_SPECIAL;
		*sz = 2;
		break;
	case '[':
		gly = ESCAPE_SPECIAL;
		term = ']';
		break;
	case 'C':
		if ('\'' != **start)
			return ESCAPE_ERROR;
		/* Step over the opening quote. */
		*start = ++*end;
		gly = ESCAPE_SPECIAL;
		term = '\'';
		break;

	/*
	 * Escapes taking no arguments at all.
	 */
	case 'd':
	case 'u':
	case ',':
	case '/':
		return ESCAPE_IGNORE;

	/*
	 * The \z escape is supposed to output the following
	 * character without advancing the cursor position.
	 * Since we are mostly dealing with terminal mode,
	 * let us just skip the next character.
	 */
	case 'z':
		return ESCAPE_SKIPCHAR;

	/*
	 * Handle all triggers matching \X(xy, \Xx, and \X[xxxx], where
	 * 'X' is the trigger.  These have opaque sub-strings.
	 */
	case 'F':
	case 'g':
	case 'k':
	case 'M':
	case 'm':
	case 'n':
	case 'V':
	case 'Y':
		gly = ESCAPE_IGNORE;
		/* FALLTHROUGH */
	case 'f':
		/* Still ESCAPE_ERROR here means we entered at 'f' itself. */
		if (ESCAPE_ERROR == gly)
			gly = ESCAPE_FONT;
		switch (**start) {
		case '(':
			*start = ++*end;
			*sz = 2;
			break;
		case '[':
			*start = ++*end;
			term = ']';
			break;
		default:
			*sz = 1;
			break;
		}
		break;

	/*
	 * These escapes are of the form \X'Y', where 'X' is the trigger
	 * and 'Y' is any string.  These have opaque sub-strings.
	 * The \B and \w escapes are handled in roff.c, roff_res().
	 */
	case 'A':
	case 'b':
	case 'D':
	case 'R':
	case 'X':
	case 'Z':
		gly = ESCAPE_IGNORE;
		/* FALLTHROUGH */
	case 'o':
		if (**start == '\0')
			return ESCAPE_ERROR;
		/* Still ESCAPE_ERROR here means we entered at 'o' itself. */
		if (gly == ESCAPE_ERROR)
			gly = ESCAPE_OVERSTRIKE;
		/* Whatever character opens the argument also closes it. */
		term = **start;
		*start = ++*end;
		break;

	/*
	 * These escapes are of the form \X'N', where 'X' is the trigger
	 * and 'N' resolves to a numerical expression.
	 */
	case 'h':
	case 'H':
	case 'L':
	case 'l':
	case 'S':
	case 'v':
	case 'x':
		/*
		 * Reject delimiters from the listed set.  Note that
		 * strchr() also matches the terminating NUL, so the
		 * end of the input lands here as well; in that case,
		 * do not advance beyond it.
		 */
		if (strchr(" %&()*+-./0123456789:<=>", **start)) {
			if ('\0' != **start)
				++*end;
			return ESCAPE_ERROR;
		}
		gly = ESCAPE_IGNORE;
		term = **start;
		*start = ++*end;
		break;

	/*
	 * Special handling for the numbered character escape.
	 * XXX Do any other escapes need similar handling?
	 */
	case 'N':
		if ('\0' == **start)
			return ESCAPE_ERROR;
		(*end)++;
		/* A digit right after \N: consume just that one character. */
		if (isdigit((unsigned char)**start)) {
			*sz = 1;
			return ESCAPE_IGNORE;
		}
		/*
		 * Otherwise, the first character is an opening delimiter;
		 * return the run of digits following it and step over
		 * one more character, if there is any.
		 */
		(*start)++;
		while (isdigit((unsigned char)**end))
			(*end)++;
		*sz = *end - *start;
		if ('\0' != **end)
			(*end)++;
		return ESCAPE_NUMBERED;

	/*
	 * Sizes get a special category of their own.
	 */
	case 's':
		gly = ESCAPE_IGNORE;

		/* See +/- counts as a sign. */
		if ('+' == **end || '-' == **end || ASCII_HYPH == **end)
			*start = ++*end;

		switch (**end) {
		case '(':
			*start = ++*end;
			*sz = 2;
			break;
		case '[':
			*start = ++*end;
			term = ']';
			break;
		case '\'':
			*start = ++*end;
			term = '\'';
			break;
		case '3':
		case '2':
		case '1':
			/*
			 * Take two digits only if no sign was consumed
			 * (the previous character is still the 's')
			 * and a second digit follows.
			 */
			*sz = (*end)[-1] == 's' &&
			    isdigit((unsigned char)(*end)[1]) ? 2 : 1;
			break;
		default:
			*sz = 1;
			break;
		}

		break;

	/*
	 * Anything else is assumed to be a glyph.
	 * In this case, pass back the character after the backslash.
	 */
	default:
		gly = ESCAPE_SPECIAL;
		*start = --*end;
		*sz = 1;
		break;
	}

	assert(ESCAPE_ERROR != gly);

	/*
	 * Read up to the terminating character,
	 * paying attention to nested escapes.
	 */

	if ('\0' != term) {
		while (**end != term) {
			switch (**end) {
			case '\0':
				return ESCAPE_ERROR;
			case '\\':
				/* Step over the backslash, then recurse. */
				(*end)++;
				if (ESCAPE_ERROR ==
				    mandoc_escape(end, NULL, NULL))
					return ESCAPE_ERROR;
				break;
			default:
				(*end)++;
				break;
			}
		}
		/* Length excludes the terminator; *end moves beyond it. */
		*sz = (*end)++ - *start;
	} else {
		/* Fixed-length argument: it must fit into the input. */
		assert(*sz > 0);
		if ((size_t)*sz > strlen(*start))
			return ESCAPE_ERROR;
		*end += *sz;
	}

	/* Run post-processors. */

	switch (gly) {
	case ESCAPE_FONT:
		if (2 == *sz) {
			if ('C' == **start) {
				/*
				 * Treat constant-width font modes
				 * just like regular font modes.
				 */
				(*start)++;
				(*sz)--;
			} else {
				if ('B' == (*start)[0] && 'I' == (*start)[1])
					gly = ESCAPE_FONTBI;
				break;
			}
		} else if (1 != *sz)
			break;

		/* Exactly one character of font name is left by now. */
		switch (**start) {
		case '3':
		case 'B':
			gly = ESCAPE_FONTBOLD;
			break;
		case '2':
		case 'I':
			gly = ESCAPE_FONTITALIC;
			break;
		case 'P':
			gly = ESCAPE_FONTPREV;
			break;
		case '1':
		case 'R':
			gly = ESCAPE_FONTROMAN;
			break;
		}
		break;
	case ESCAPE_SPECIAL:
		if (1 == *sz && 'c' == **start)
			gly = ESCAPE_NOSPACE;
		/*
		 * Unicode escapes are defined in groff as \[u0000]
		 * to \[u10FFFF], where the contained value must be
		 * a valid Unicode codepoint.  Here, however, only
		 * check the length and range.
		 */
		if (**start != 'u' || *sz < 5 || *sz > 7)
			break;
		/* Six hex digits must start with "10". */
		if (*sz == 7 && ((*start)[1] != '1' || (*start)[2] != '0'))
			break;
		/* Five hex digits must not start with a zero. */
		if (*sz == 6 && (*start)[1] == '0')
			break;
		/* Four hex digits must not be in the range D800 to DFFF. */
		if (*sz == 5 && (*start)[1] == 'D' &&
		    strchr("89ABCDEF", (*start)[2]) != NULL)
			break;
		/* Everything after the 'u' must be a hex digit. */
		if ((int)strspn(*start + 1, "0123456789ABCDEFabcdef")
		    + 1 == *sz)
			gly = ESCAPE_UNICODE;
		break;
	default:
		break;
	}

	return gly;
}
349:
350: /*
351: * Parse a quoted or unquoted roff-style request or macro argument.
352: * Return a pointer to the parsed argument, which is either the original
353: * pointer or advanced by one byte in case the argument is quoted.
1.71 schwarze 354: * NUL-terminate the argument in place.
1.36 schwarze 355: * Collapse pairs of quotes inside quoted arguments.
356: * Advance the argument pointer to the next argument,
1.71 schwarze 357: * or to the NUL byte terminating the argument line.
1.36 schwarze 358: */
359: char *
1.48 kristaps 360: mandoc_getarg(struct mparse *parse, char **cpp, int ln, int *pos)
1.36 schwarze 361: {
362: char *start, *cp;
363: int quoted, pairs, white;
364:
365: /* Quoting can only start with a new word. */
366: start = *cpp;
1.47 kristaps 367: quoted = 0;
1.36 schwarze 368: if ('"' == *start) {
369: quoted = 1;
370: start++;
1.79 schwarze 371: }
1.36 schwarze 372:
373: pairs = 0;
374: white = 0;
375: for (cp = start; '\0' != *cp; cp++) {
1.67 schwarze 376:
377: /*
378: * Move the following text left
379: * after quoted quotes and after "\\" and "\t".
380: */
1.36 schwarze 381: if (pairs)
382: cp[-pairs] = cp[0];
1.67 schwarze 383:
1.36 schwarze 384: if ('\\' == cp[0]) {
1.67 schwarze 385: /*
386: * In copy mode, translate double to single
387: * backslashes and backslash-t to literal tabs.
388: */
389: switch (cp[1]) {
1.79 schwarze 390: case 't':
1.67 schwarze 391: cp[0] = '\t';
392: /* FALLTHROUGH */
1.79 schwarze 393: case '\\':
1.36 schwarze 394: pairs++;
395: cp++;
1.67 schwarze 396: break;
1.79 schwarze 397: case ' ':
1.36 schwarze 398: /* Skip escaped blanks. */
1.67 schwarze 399: if (0 == quoted)
400: cp++;
401: break;
402: default:
403: break;
404: }
1.36 schwarze 405: } else if (0 == quoted) {
406: if (' ' == cp[0]) {
407: /* Unescaped blanks end unquoted args. */
408: white = 1;
409: break;
410: }
411: } else if ('"' == cp[0]) {
412: if ('"' == cp[1]) {
413: /* Quoted quotes collapse. */
414: pairs++;
415: cp++;
416: } else {
417: /* Unquoted quotes end quoted args. */
418: quoted = 2;
419: break;
420: }
421: }
422: }
423:
424: /* Quoted argument without a closing quote. */
1.48 kristaps 425: if (1 == quoted)
1.83 schwarze 426: mandoc_msg(MANDOCERR_ARG_QUOTE, parse, ln, *pos, NULL);
1.36 schwarze 427:
1.71 schwarze 428: /* NUL-terminate this argument and move to the next one. */
1.36 schwarze 429: if (pairs)
430: cp[-pairs] = '\0';
431: if ('\0' != *cp) {
432: *cp++ = '\0';
433: while (' ' == *cp)
434: cp++;
435: }
1.39 kristaps 436: *pos += (int)(cp - start) + (quoted ? 1 : 0);
1.36 schwarze 437: *cpp = cp;
438:
1.48 kristaps 439: if ('\0' == *cp && (white || ' ' == cp[-1]))
1.83 schwarze 440: mandoc_msg(MANDOCERR_SPACE_EOL, parse, ln, *pos, NULL);
1.36 schwarze 441:
1.94 schwarze 442: return start;
1.4 kristaps 443: }
1.7 kristaps 444:
/*
 * Try to parse the complete string p with the strptime(3) format fmt.
 * On success, store the mktime(3) result in *t and return 1.
 * Return 0 and leave *t alone if parsing fails, if trailing input
 * remains, or if the build has no strptime(3) (HAVE_STRPTIME unset).
 */
static int
a2time(time_t *t, const char *fmt, const char *p)
{
	struct tm	 parsed;
	char		*rest;

	memset(&parsed, 0, sizeof(parsed));

	rest = NULL;
#if HAVE_STRPTIME
	rest = strptime(p, fmt, &parsed);
#endif
	/* Only accept a match that consumed the whole input. */
	if (rest == NULL || *rest != '\0')
		return 0;

	*t = mktime(&parsed);
	return 1;
}
464:
/*
 * Format t in local time as "Month day, year".
 * Return a buffer obtained from mandoc_malloc() that the caller
 * must free, or NULL if the time cannot be converted or formatted.
 */
static char *
time2a(time_t t)
{
	struct tm	*lt;
	char		*out, *cur;
	size_t		 monthlen;
	int		 daylen;

	if ((lt = localtime(&t)) == NULL)
		return NULL;

	/*
	 * Reserve space:
	 * up to 9 characters for the month (September) + blank
	 * up to 2 characters for the day + comma + blank
	 * 4 characters for the year and a terminating '\0'
	 */
	cur = out = mandoc_malloc(10 + 4 + 4 + 1);

	/* Each piece is appended only if the previous one succeeded. */
	monthlen = strftime(cur, 10 + 1, "%B ", lt);
	if (monthlen != 0) {
		cur += (int)monthlen;
		daylen = snprintf(cur, 4 + 1, "%d, ", lt->tm_mday);
		if (daylen != -1) {
			cur += daylen;
			if (strftime(cur, 4 + 1, "%Y", lt) != 0)
				return out;
		}
	}

	free(out);
	return NULL;
}
501:
502: char *
1.42 kristaps 503: mandoc_normdate(struct mparse *parse, char *in, int ln, int pos)
1.7 kristaps 504: {
1.37 schwarze 505: char *out;
1.7 kristaps 506: time_t t;
507:
1.37 schwarze 508: if (NULL == in || '\0' == *in ||
509: 0 == strcmp(in, "$" "Mdocdate$")) {
1.80 schwarze 510: mandoc_msg(MANDOCERR_DATE_MISSING, parse, ln, pos, NULL);
1.37 schwarze 511: time(&t);
512: }
1.62 schwarze 513: else if (a2time(&t, "%Y-%m-%d", in))
514: t = 0;
1.37 schwarze 515: else if (!a2time(&t, "$" "Mdocdate: %b %d %Y $", in) &&
1.62 schwarze 516: !a2time(&t, "%b %d, %Y", in)) {
1.81 schwarze 517: mandoc_msg(MANDOCERR_DATE_BAD, parse, ln, pos, in);
1.37 schwarze 518: t = 0;
1.7 kristaps 519: }
1.37 schwarze 520: out = t ? time2a(t) : NULL;
1.94 schwarze 521: return out ? out : mandoc_strdup(in);
1.7 kristaps 522: }
523:
/*
 * Decide whether the sz bytes starting at p end a sentence.
 * Scanning backwards, closing delimiters (quotes, brackets and
 * parentheses) and sentence punctuation (period, exclamation and
 * question mark) are skipped; the first other byte decides.
 * Returns 1 for end of sentence, 0 otherwise.
 */
int
mandoc_eos(const char *p, size_t sz)
{
	size_t	 i;
	int	 enclosed, found;

	/*
	 * End-of-sentence recognition must include situations where
	 * some symbols, such as `)', allow prior EOS punctuation to
	 * propagate outward.
	 */

	enclosed = found = 0;

	/*
	 * Count the index down instead of a pointer, such that
	 * no pointer to before the start of the buffer is formed.
	 * An empty buffer skips the loop and yields 0 below.
	 */
	for (i = sz; i > 0; i--) {
		switch (p[i - 1]) {
		case '\"':
		case '\'':
		case ']':
		case ')':
			/* Only delimiters after the punctuation count. */
			if (0 == found)
				enclosed = 1;
			break;
		case '.':
		case '!':
		case '?':
			found = 1;
			break;
		default:
			/*
			 * With trailing delimiters, the punctuation
			 * must directly follow a letter or digit.
			 */
			return found && (!enclosed ||
			    isalnum((unsigned char)p[i - 1]));
		}
	}

	/* Nothing but punctuation and delimiters. */
	return found && !enclosed;
}
1.50 kristaps 562:
/*
 * Convert the first sz bytes of p to an int using strtol(3)
 * with the given base.
 * Return -1 if sz exceeds 31 bytes, if the input is empty,
 * or if it is not completely consumed by the conversion.
 * Values outside the range of int are clamped to INT_MIN or INT_MAX.
 * Note that a valid input of "-1" also yields -1.
 */
int
mandoc_strntoi(const char *p, size_t sz, int base)
{
	char	 digits[32];
	char	*endp;
	long	 val;

	/* Leave room for the terminating NUL byte. */
	if (sz >= sizeof(digits))
		return -1;

	memcpy(digits, p, sz);
	digits[sz] = '\0';

	errno = 0;
	val = strtol(digits, &endp, base);

	if (digits[0] == '\0' || *endp != '\0')
		return -1;

	if (val > INT_MAX)
		return INT_MAX;
	if (val < INT_MIN)
		return INT_MIN;
	return (int)val;
}
CVSweb