Annotation of mandoc/mandoc.c, Revision 1.64
1.64 ! schwarze 1: /* $Id: mandoc.c,v 1.63 2012/05/31 22:29:13 schwarze Exp $ */
1.1 kristaps 2: /*
1.59 schwarze 3: * Copyright (c) 2008, 2009, 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv>
1.63 schwarze 4: * Copyright (c) 2011, 2012 Ingo Schwarze <schwarze@openbsd.org>
1.1 kristaps 5: *
6: * Permission to use, copy, modify, and distribute this software for any
7: * purpose with or without fee is hereby granted, provided that the above
8: * copyright notice and this permission notice appear in all copies.
9: *
1.36 schwarze 10: * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
1.1 kristaps 11: * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
1.36 schwarze 12: * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
1.1 kristaps 13: * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14: * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15: * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16: * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17: */
1.9 kristaps 18: #ifdef HAVE_CONFIG_H
19: #include "config.h"
1.7 kristaps 20: #endif
21:
1.2 kristaps 22: #include <sys/types.h>
23:
1.1 kristaps 24: #include <assert.h>
25: #include <ctype.h>
1.50 kristaps 26: #include <errno.h>
27: #include <limits.h>
1.1 kristaps 28: #include <stdlib.h>
1.4 kristaps 29: #include <stdio.h>
30: #include <string.h>
1.7 kristaps 31: #include <time.h>
1.1 kristaps 32:
1.18 kristaps 33: #include "mandoc.h"
1.1 kristaps 34: #include "libmandoc.h"
35:
1.37 schwarze 36: #define DATESIZE 32
37:
1.18 kristaps 38: static int a2time(time_t *, const char *, const char *);
1.37 schwarze 39: static char *time2a(time_t);
1.7 kristaps 40:
1.45 kristaps 41:
42: enum mandoc_esc
43: mandoc_escape(const char **end, const char **start, int *sz)
1.1 kristaps 44: {
1.64 ! schwarze 45: char c, term;
! 46: int i, rlim;
1.45 kristaps 47: const char *cp, *rstart;
48: enum mandoc_esc gly;
49:
50: cp = *end;
51: rstart = cp;
52: if (start)
53: *start = rstart;
1.64 ! schwarze 54: i = rlim = 0;
1.45 kristaps 55: gly = ESCAPE_ERROR;
1.64 ! schwarze 56: term = '\0';
1.18 kristaps 57:
1.45 kristaps 58: switch ((c = cp[i++])) {
59: /*
60: * First the glyphs. There are several different forms of
61: * these, but each eventually returns a substring of the glyph
62: * name.
63: */
64: case ('('):
65: gly = ESCAPE_SPECIAL;
1.64 ! schwarze 66: rlim = 2;
1.45 kristaps 67: break;
68: case ('['):
69: gly = ESCAPE_SPECIAL;
1.52 kristaps 70: /*
71: * Unicode escapes are defined in groff as \[uXXXX] to
72: * \[u10FFFF], where the contained value must be a valid
73: * Unicode codepoint. Here, however, only check whether
74: * it's not a zero-width escape.
75: */
76: if ('u' == cp[i] && ']' != cp[i + 1])
77: gly = ESCAPE_UNICODE;
1.45 kristaps 78: term = ']';
79: break;
80: case ('C'):
81: if ('\'' != cp[i])
82: return(ESCAPE_ERROR);
83: gly = ESCAPE_SPECIAL;
84: term = '\'';
85: break;
1.63 schwarze 86:
87: /*
88: * The \z escape is supposed to output the following
89: * character without advancing the cursor position.
90: * Since we are mostly dealing with terminal mode,
91: * let us just skip the next character.
92: */
93: case ('z'):
94: (*end)++;
95: return(ESCAPE_SKIPCHAR);
1.1 kristaps 96:
1.45 kristaps 97: /*
98: * Handle all triggers matching \X(xy, \Xx, and \X[xxxx], where
99: * 'X' is the trigger. These have opaque sub-strings.
100: */
101: case ('F'):
102: /* FALLTHROUGH */
103: case ('g'):
1.24 kristaps 104: /* FALLTHROUGH */
1.45 kristaps 105: case ('k'):
1.24 kristaps 106: /* FALLTHROUGH */
1.45 kristaps 107: case ('M'):
1.24 kristaps 108: /* FALLTHROUGH */
1.45 kristaps 109: case ('m'):
1.24 kristaps 110: /* FALLTHROUGH */
1.45 kristaps 111: case ('n'):
1.24 kristaps 112: /* FALLTHROUGH */
1.45 kristaps 113: case ('V'):
1.24 kristaps 114: /* FALLTHROUGH */
1.45 kristaps 115: case ('Y'):
1.60 schwarze 116: gly = ESCAPE_IGNORE;
1.24 kristaps 117: /* FALLTHROUGH */
1.45 kristaps 118: case ('f'):
119: if (ESCAPE_ERROR == gly)
120: gly = ESCAPE_FONT;
121:
122: rstart= &cp[i];
123: if (start)
124: *start = rstart;
125:
126: switch (cp[i++]) {
127: case ('('):
1.64 ! schwarze 128: rlim = 2;
1.45 kristaps 129: break;
130: case ('['):
131: term = ']';
132: break;
133: default:
1.64 ! schwarze 134: rlim = 1;
1.45 kristaps 135: i--;
136: break;
137: }
138: break;
139:
140: /*
141: * These escapes are of the form \X'Y', where 'X' is the trigger
142: * and 'Y' is any string. These have opaque sub-strings.
143: */
144: case ('A'):
1.24 kristaps 145: /* FALLTHROUGH */
1.45 kristaps 146: case ('b'):
1.24 kristaps 147: /* FALLTHROUGH */
148: case ('D'):
149: /* FALLTHROUGH */
1.45 kristaps 150: case ('o'):
1.24 kristaps 151: /* FALLTHROUGH */
1.45 kristaps 152: case ('R'):
1.24 kristaps 153: /* FALLTHROUGH */
1.45 kristaps 154: case ('X'):
1.24 kristaps 155: /* FALLTHROUGH */
1.45 kristaps 156: case ('Z'):
157: if ('\'' != cp[i++])
158: return(ESCAPE_ERROR);
159: gly = ESCAPE_IGNORE;
1.24 kristaps 160: term = '\'';
161: break;
1.45 kristaps 162:
163: /*
164: * These escapes are of the form \X'N', where 'X' is the trigger
165: * and 'N' resolves to a numerical expression.
166: */
167: case ('B'):
168: /* FALLTHROUGH */
1.28 kristaps 169: case ('h'):
170: /* FALLTHROUGH */
1.45 kristaps 171: case ('H'):
172: /* FALLTHROUGH */
173: case ('L'):
174: /* FALLTHROUGH */
175: case ('l'):
1.60 schwarze 176: gly = ESCAPE_NUMBERED;
1.45 kristaps 177: /* FALLTHROUGH */
178: case ('S'):
179: /* FALLTHROUGH */
1.28 kristaps 180: case ('v'):
181: /* FALLTHROUGH */
1.45 kristaps 182: case ('w'):
183: /* FALLTHROUGH */
184: case ('x'):
185: if (ESCAPE_ERROR == gly)
186: gly = ESCAPE_IGNORE;
187: if ('\'' != cp[i++])
188: return(ESCAPE_ERROR);
1.64 ! schwarze 189: term = '\'';
1.45 kristaps 190: break;
1.60 schwarze 191:
192: /*
193: * Special handling for the numbered character escape.
194: * XXX Do any other escapes need similar handling?
195: */
196: case ('N'):
197: if ('\0' == cp[i])
198: return(ESCAPE_ERROR);
199: *end = &cp[++i];
200: if (isdigit((unsigned char)cp[i-1]))
201: return(ESCAPE_IGNORE);
202: while (isdigit((unsigned char)**end))
203: (*end)++;
204: if (start)
205: *start = &cp[i];
206: if (sz)
207: *sz = *end - &cp[i];
208: if ('\0' != **end)
209: (*end)++;
210: return(ESCAPE_NUMBERED);
1.45 kristaps 211:
212: /*
213: * Sizes get a special category of their own.
214: */
1.8 kristaps 215: case ('s'):
1.45 kristaps 216: gly = ESCAPE_IGNORE;
1.28 kristaps 217:
1.45 kristaps 218: rstart = &cp[i];
219: if (start)
220: *start = rstart;
221:
222: /* See +/- counts as a sign. */
223: c = cp[i];
224: if ('+' == c || '-' == c || ASCII_HYPH == c)
225: ++i;
1.8 kristaps 226:
1.45 kristaps 227: switch (cp[i++]) {
1.22 kristaps 228: case ('('):
1.64 ! schwarze 229: rlim = 2;
1.22 kristaps 230: break;
231: case ('['):
1.64 ! schwarze 232: term = ']';
1.22 kristaps 233: break;
234: case ('\''):
1.64 ! schwarze 235: term = '\'';
1.22 kristaps 236: break;
237: default:
1.64 ! schwarze 238: rlim = 1;
1.45 kristaps 239: i--;
1.22 kristaps 240: break;
1.8 kristaps 241: }
242:
1.45 kristaps 243: /* See +/- counts as a sign. */
244: c = cp[i];
245: if ('+' == c || '-' == c || ASCII_HYPH == c)
246: ++i;
247:
248: break;
1.33 kristaps 249:
1.45 kristaps 250: /*
251: * Anything else is assumed to be a glyph.
252: */
253: default:
254: gly = ESCAPE_SPECIAL;
1.64 ! schwarze 255: rlim = 1;
1.45 kristaps 256: i--;
1.22 kristaps 257: break;
1.45 kristaps 258: }
259:
260: assert(ESCAPE_ERROR != gly);
261:
1.64 ! schwarze 262: *end = rstart = &cp[i];
1.45 kristaps 263: if (start)
264: *start = rstart;
265:
266: /*
1.64 ! schwarze 267: * Read up to the terminating character,
! 268: * paying attention to nested escapes.
1.45 kristaps 269: */
270:
271: if ('\0' != term) {
1.64 ! schwarze 272: while (**end != term) {
! 273: switch (**end) {
! 274: case ('\0'):
! 275: return(ESCAPE_ERROR);
! 276: case ('\\'):
! 277: (*end)++;
! 278: if (ESCAPE_ERROR ==
! 279: mandoc_escape(end, NULL, NULL))
! 280: return(ESCAPE_ERROR);
! 281: break;
! 282: default:
! 283: (*end)++;
! 284: break;
! 285: }
! 286: }
! 287: rlim = (*end)++ - rstart;
! 288: } else {
! 289: assert(rlim > 0);
! 290: if ((size_t)rlim > strlen(rstart))
1.45 kristaps 291: return(ESCAPE_ERROR);
1.64 ! schwarze 292: *end += rlim;
1.45 kristaps 293: }
294: if (sz)
295: *sz = rlim;
296:
297: /* Run post-processors. */
298:
299: switch (gly) {
300: case (ESCAPE_FONT):
1.61 kristaps 301: /*
302: * Pretend that the constant-width font modes are the
303: * same as the regular font modes.
304: */
305: if (2 == rlim && 'C' == *rstart)
306: rstart++;
307: else if (1 != rlim)
1.45 kristaps 308: break;
1.61 kristaps 309:
1.45 kristaps 310: switch (*rstart) {
311: case ('3'):
312: /* FALLTHROUGH */
313: case ('B'):
314: gly = ESCAPE_FONTBOLD;
315: break;
316: case ('2'):
317: /* FALLTHROUGH */
318: case ('I'):
319: gly = ESCAPE_FONTITALIC;
1.22 kristaps 320: break;
1.45 kristaps 321: case ('P'):
322: gly = ESCAPE_FONTPREV;
1.22 kristaps 323: break;
1.45 kristaps 324: case ('1'):
325: /* FALLTHROUGH */
326: case ('R'):
327: gly = ESCAPE_FONTROMAN;
1.1 kristaps 328: break;
329: }
1.46 kristaps 330: break;
1.45 kristaps 331: case (ESCAPE_SPECIAL):
332: if (1 != rlim)
333: break;
334: if ('c' == *rstart)
335: gly = ESCAPE_NOSPACE;
1.22 kristaps 336: break;
1.1 kristaps 337: default:
1.22 kristaps 338: break;
1.1 kristaps 339: }
340:
1.45 kristaps 341: return(gly);
1.1 kristaps 342: }
1.4 kristaps 343:
344: void *
345: mandoc_calloc(size_t num, size_t size)
346: {
347: void *ptr;
348:
349: ptr = calloc(num, size);
350: if (NULL == ptr) {
1.6 kristaps 351: perror(NULL);
1.35 kristaps 352: exit((int)MANDOCLEVEL_SYSERR);
1.4 kristaps 353: }
354:
355: return(ptr);
356: }
357:
358:
359: void *
360: mandoc_malloc(size_t size)
361: {
362: void *ptr;
363:
364: ptr = malloc(size);
365: if (NULL == ptr) {
1.6 kristaps 366: perror(NULL);
1.35 kristaps 367: exit((int)MANDOCLEVEL_SYSERR);
1.4 kristaps 368: }
369:
370: return(ptr);
371: }
372:
373:
374: void *
375: mandoc_realloc(void *ptr, size_t size)
376: {
377:
378: ptr = realloc(ptr, size);
379: if (NULL == ptr) {
1.6 kristaps 380: perror(NULL);
1.35 kristaps 381: exit((int)MANDOCLEVEL_SYSERR);
1.4 kristaps 382: }
383:
384: return(ptr);
385: }
386:
/*
 * Return a newly allocated, NUL-terminated copy of the first "sz"
 * bytes at "ptr".  Exactly "sz" bytes are copied with memcpy(3), so
 * the caller must make sure that many bytes are readable; the copy
 * does not stop early at a NUL byte.  The buffer comes from
 * mandoc_malloc().
 */
char *
mandoc_strndup(const char *ptr, size_t sz)
{
	char	*p;

	p = mandoc_malloc(sz + 1);
	memcpy(p, ptr, sz);
	/* Index with the size_t itself: an int cast could truncate. */
	p[sz] = '\0';
	return(p);
}
1.4 kristaps 397:
398: char *
399: mandoc_strdup(const char *ptr)
400: {
401: char *p;
402:
403: p = strdup(ptr);
404: if (NULL == p) {
1.6 kristaps 405: perror(NULL);
1.35 kristaps 406: exit((int)MANDOCLEVEL_SYSERR);
1.4 kristaps 407: }
408:
409: return(p);
1.36 schwarze 410: }
411:
412: /*
413: * Parse a quoted or unquoted roff-style request or macro argument.
414: * Return a pointer to the parsed argument, which is either the original
415: * pointer or advanced by one byte in case the argument is quoted.
416: * Null-terminate the argument in place.
417: * Collapse pairs of quotes inside quoted arguments.
418: * Advance the argument pointer to the next argument,
419: * or to the null byte terminating the argument line.
420: */
421: char *
1.48 kristaps 422: mandoc_getarg(struct mparse *parse, char **cpp, int ln, int *pos)
1.36 schwarze 423: {
424: char *start, *cp;
425: int quoted, pairs, white;
426:
427: /* Quoting can only start with a new word. */
428: start = *cpp;
1.47 kristaps 429: quoted = 0;
1.36 schwarze 430: if ('"' == *start) {
431: quoted = 1;
432: start++;
1.47 kristaps 433: }
1.36 schwarze 434:
435: pairs = 0;
436: white = 0;
437: for (cp = start; '\0' != *cp; cp++) {
438: /* Move left after quoted quotes and escaped backslashes. */
439: if (pairs)
440: cp[-pairs] = cp[0];
441: if ('\\' == cp[0]) {
442: if ('\\' == cp[1]) {
443: /* Poor man's copy mode. */
444: pairs++;
445: cp++;
446: } else if (0 == quoted && ' ' == cp[1])
447: /* Skip escaped blanks. */
448: cp++;
449: } else if (0 == quoted) {
450: if (' ' == cp[0]) {
451: /* Unescaped blanks end unquoted args. */
452: white = 1;
453: break;
454: }
455: } else if ('"' == cp[0]) {
456: if ('"' == cp[1]) {
457: /* Quoted quotes collapse. */
458: pairs++;
459: cp++;
460: } else {
461: /* Unquoted quotes end quoted args. */
462: quoted = 2;
463: break;
464: }
465: }
466: }
467:
468: /* Quoted argument without a closing quote. */
1.48 kristaps 469: if (1 == quoted)
1.42 kristaps 470: mandoc_msg(MANDOCERR_BADQUOTE, parse, ln, *pos, NULL);
1.36 schwarze 471:
472: /* Null-terminate this argument and move to the next one. */
473: if (pairs)
474: cp[-pairs] = '\0';
475: if ('\0' != *cp) {
476: *cp++ = '\0';
477: while (' ' == *cp)
478: cp++;
479: }
1.39 kristaps 480: *pos += (int)(cp - start) + (quoted ? 1 : 0);
1.36 schwarze 481: *cpp = cp;
482:
1.48 kristaps 483: if ('\0' == *cp && (white || ' ' == cp[-1]))
1.42 kristaps 484: mandoc_msg(MANDOCERR_EOLNSPACE, parse, ln, *pos, NULL);
1.36 schwarze 485:
486: return(start);
1.4 kristaps 487: }
1.7 kristaps 488:
/*
 * Convert the string "p" to a time_t according to the strptime(3)
 * format "fmt".  Returns 1 and stores the mktime(3) result in *t if
 * the format matched the complete string; returns 0 and leaves *t
 * untouched otherwise.  Without HAVE_STRPTIME nothing is ever parsed
 * and the result is always 0.
 */
static int
a2time(time_t *t, const char *fmt, const char *p)
{
	struct tm	 parsed;
	char		*rest = NULL;

	memset(&parsed, 0, sizeof(parsed));

#ifdef HAVE_STRPTIME
	rest = strptime(p, fmt, &parsed);
#endif

	/* Reject both a failed parse and unparsed trailing input. */
	if (rest == NULL || *rest != '\0')
		return 0;

	*t = mktime(&parsed);
	return 1;
}
508:
/*
 * Format the time "t" in local time as "Month day, year".
 * Returns a buffer obtained from mandoc_malloc() that the caller has
 * to free(3), or NULL if the time cannot be broken down or does not
 * fit into the reserved space.
 */
static char *
time2a(time_t t)
{
	struct tm	*tm;
	char		*buf, *p;
	size_t		 ssz;
	int		 isz;

	/* localtime(3) may fail; never hand NULL to strftime(3). */
	tm = localtime(&t);
	if (NULL == tm)
		return(NULL);

	/*
	 * Reserve space:
	 * up to 9 characters for the month (September) + blank
	 * up to 2 characters for the day + comma + blank
	 * 4 characters for the year and a terminating '\0'
	 */
	p = buf = mandoc_malloc(10 + 4 + 4 + 1);

	if (0 == (ssz = strftime(p, 10 + 1, "%B ", tm)))
		goto fail;
	p += ssz;

	/*
	 * snprintf(3) returns the length it wanted to write; treat
	 * truncation as failure so that p never leaves the buffer.
	 */
	isz = snprintf(p, 4 + 1, "%d, ", tm->tm_mday);
	if (isz < 0 || isz > 4)
		goto fail;
	p += isz;

	if (0 == strftime(p, 4 + 1, "%Y", tm))
		goto fail;
	return(buf);

fail:
	free(buf);
	return(NULL);
}
543:
544: char *
1.42 kristaps 545: mandoc_normdate(struct mparse *parse, char *in, int ln, int pos)
1.7 kristaps 546: {
1.37 schwarze 547: char *out;
1.7 kristaps 548: time_t t;
549:
1.37 schwarze 550: if (NULL == in || '\0' == *in ||
551: 0 == strcmp(in, "$" "Mdocdate$")) {
1.42 kristaps 552: mandoc_msg(MANDOCERR_NODATE, parse, ln, pos, NULL);
1.37 schwarze 553: time(&t);
554: }
1.62 schwarze 555: else if (a2time(&t, "%Y-%m-%d", in))
556: t = 0;
1.37 schwarze 557: else if (!a2time(&t, "$" "Mdocdate: %b %d %Y $", in) &&
1.62 schwarze 558: !a2time(&t, "%b %d, %Y", in)) {
1.42 kristaps 559: mandoc_msg(MANDOCERR_BADDATE, parse, ln, pos, NULL);
1.37 schwarze 560: t = 0;
1.7 kristaps 561: }
1.37 schwarze 562: out = t ? time2a(t) : NULL;
1.38 schwarze 563: return(out ? out : mandoc_strdup(in));
1.7 kristaps 564: }
565:
/*
 * Decide whether the "sz" bytes at "p" end a sentence.
 *
 * The text is scanned backwards from its last byte.  Closing
 * delimiters (double quote, single quote, ']' and ')') seen before
 * any end-of-sentence punctuation mark the punctuation as enclosed;
 * '.', '!' and '?' count as end-of-sentence punctuation.  The first
 * other byte ends the scan.
 *
 * Returns 1 if punctuation was found and it is either not enclosed
 * or directly preceded (ignoring the bytes above) by an alphanumeric
 * character; returns 0 otherwise, including for empty input.  A
 * non-zero "enclosed" argument makes the text count as enclosed from
 * the start.
 */
int
mandoc_eos(const char *p, size_t sz, int enclosed)
{
	size_t		 i;
	int		 found;
	unsigned char	 ch;

	/*
	 * End-of-sentence recognition must include situations where
	 * some symbols, such as `)', allow prior EOS punctuation to
	 * propagate outward.
	 */

	found = 0;

	/*
	 * Walk backwards by index rather than by pointer, so that no
	 * pointer before the start of the buffer is ever formed and
	 * the size is never squeezed through an int.
	 */
	for (i = sz; i > 0; i--) {
		ch = (unsigned char)p[i - 1];
		switch (ch) {
		case ('\"'):
			/* FALLTHROUGH */
		case ('\''):
			/* FALLTHROUGH */
		case (']'):
			/* FALLTHROUGH */
		case (')'):
			if (0 == found)
				enclosed = 1;
			break;
		case ('.'):
			/* FALLTHROUGH */
		case ('!'):
			/* FALLTHROUGH */
		case ('?'):
			found = 1;
			break;
		default:
			return(found && (!enclosed || isalnum(ch)));
		}
	}

	return(found && !enclosed);
}
608:
/*
 * Check whether the text at cp[*ppos] starts a macro (control) line.
 * A control line begins with '.', with '\'' or with the two bytes
 * "\." .  If it does, move *ppos past the control character and any
 * blanks or tabs following it and return 1.  Otherwise return 0 and
 * leave *ppos alone.
 */
int
mandoc_getcontrol(const char *cp, int *ppos)
{
	const char	*scan = cp + *ppos;

	if (scan[0] == '\\' && scan[1] == '.')
		scan += 2;
	else if (scan[0] == '.' || scan[0] == '\'')
		scan += 1;
	else
		return 0;

	while (*scan == ' ' || *scan == '\t')
		scan++;

	*ppos = (int)(scan - cp);
	return 1;
}
1.50 kristaps 634:
/*
 * Convert the first "sz" bytes of "p" to an int with strtol(3) in the
 * given base.  Returns -1 if "sz" exceeds 31 bytes, if the input is
 * empty, or if it is not consumed completely by strtol(3).  Values
 * outside the range of int are clamped to INT_MIN or INT_MAX.  Note
 * that a valid input of "-1" cannot be told apart from the error
 * return.
 */
int
mandoc_strntoi(const char *p, size_t sz, int base)
{
	char	 digits[32];
	char	*stop;
	long	 val;

	/* Leave room for the terminating NUL byte. */
	if (sz >= sizeof(digits))
		return -1;

	memcpy(digits, p, sz);
	digits[sz] = '\0';

	errno = 0;
	val = strtol(digits, &stop, base);

	if (digits[0] == '\0' || *stop != '\0')
		return -1;

	if (val > INT_MAX)
		return INT_MAX;
	if (val < INT_MIN)
		return INT_MIN;

	return (int)val;
}
CVSweb