Annotation of mandoc/mandoc.c, Revision 1.54
1.54 ! kristaps 1: /* $Id: mandoc.c,v 1.53 2011/05/24 21:31:23 kristaps Exp $ */
1.1 kristaps 2: /*
1.22 kristaps 3: * Copyright (c) 2008, 2009, 2010 Kristaps Dzonsons <kristaps@bsd.lv>
1.36 schwarze 4: * Copyright (c) 2011 Ingo Schwarze <schwarze@openbsd.org>
1.1 kristaps 5: *
6: * Permission to use, copy, modify, and distribute this software for any
7: * purpose with or without fee is hereby granted, provided that the above
8: * copyright notice and this permission notice appear in all copies.
9: *
1.36 schwarze 10: * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
1.1 kristaps 11: * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
1.36 schwarze 12: * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
1.1 kristaps 13: * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14: * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15: * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16: * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17: */
1.9 kristaps 18: #ifdef HAVE_CONFIG_H
19: #include "config.h"
1.7 kristaps 20: #endif
21:
1.2 kristaps 22: #include <sys/types.h>
23:
1.1 kristaps 24: #include <assert.h>
25: #include <ctype.h>
1.50 kristaps 26: #include <errno.h>
27: #include <limits.h>
1.1 kristaps 28: #include <stdlib.h>
1.4 kristaps 29: #include <stdio.h>
30: #include <string.h>
1.7 kristaps 31: #include <time.h>
1.1 kristaps 32:
1.18 kristaps 33: #include "mandoc.h"
1.1 kristaps 34: #include "libmandoc.h"
35:
1.37 schwarze 36: #define DATESIZE 32
37:
1.18 kristaps 38: static int a2time(time_t *, const char *, const char *);
1.37 schwarze 39: static char *time2a(time_t);
1.45 kristaps 40: static int numescape(const char *);
1.7 kristaps 41:
1.45 kristaps 42: /*
43: * Pass over recursive numerical expressions. This context of this
44: * function is important: it's only called within character-terminating
45: * escapes (e.g., \s[xxxyyy]), so all we need to do is handle initial
46: * recursion: we don't care about what's in these blocks.
47: * This returns the number of characters skipped or -1 if an error
48: * occurs (the caller should bail).
49: */
50: static int
51: numescape(const char *start)
52: {
53: int i;
54: size_t sz;
55: const char *cp;
56:
57: i = 0;
58:
59: /* The expression consists of a subexpression. */
60:
61: if ('\\' == start[i]) {
62: cp = &start[++i];
63: /*
64: * Read past the end of the subexpression.
65: * Bail immediately on errors.
66: */
67: if (ESCAPE_ERROR == mandoc_escape(&cp, NULL, NULL))
68: return(-1);
69: return(i + cp - &start[i]);
70: }
71:
72: if ('(' != start[i++])
73: return(0);
74:
75: /*
76: * A parenthesised subexpression. Read until the closing
77: * parenthesis, making sure to handle any nested subexpressions
78: * that might ruin our parse.
79: */
80:
81: while (')' != start[i]) {
82: sz = strcspn(&start[i], ")\\");
83: i += (int)sz;
84:
85: if ('\0' == start[i])
86: return(-1);
87: else if ('\\' != start[i])
88: continue;
89:
90: cp = &start[++i];
91: if (ESCAPE_ERROR == mandoc_escape(&cp, NULL, NULL))
92: return(-1);
93: i += cp - &start[i];
94: }
95:
96: /* Read past the terminating ')'. */
97: return(++i);
98: }
99:
100: enum mandoc_esc
101: mandoc_escape(const char **end, const char **start, int *sz)
1.1 kristaps 102: {
1.45 kristaps 103: char c, term, numeric;
104: int i, lim, ssz, rlim;
105: const char *cp, *rstart;
106: enum mandoc_esc gly;
107:
108: cp = *end;
109: rstart = cp;
110: if (start)
111: *start = rstart;
1.46 kristaps 112: i = lim = 0;
1.45 kristaps 113: gly = ESCAPE_ERROR;
1.46 kristaps 114: term = numeric = '\0';
1.18 kristaps 115:
1.45 kristaps 116: switch ((c = cp[i++])) {
117: /*
118: * First the glyphs. There are several different forms of
119: * these, but each eventually returns a substring of the glyph
120: * name.
121: */
122: case ('('):
123: gly = ESCAPE_SPECIAL;
124: lim = 2;
125: break;
126: case ('['):
127: gly = ESCAPE_SPECIAL;
1.52 kristaps 128: /*
129: * Unicode escapes are defined in groff as \[uXXXX] to
130: * \[u10FFFF], where the contained value must be a valid
131: * Unicode codepoint. Here, however, only check whether
132: * it's not a zero-width escape.
133: */
134: if ('u' == cp[i] && ']' != cp[i + 1])
135: gly = ESCAPE_UNICODE;
1.45 kristaps 136: term = ']';
137: break;
138: case ('C'):
139: if ('\'' != cp[i])
140: return(ESCAPE_ERROR);
141: gly = ESCAPE_SPECIAL;
142: term = '\'';
143: break;
1.1 kristaps 144:
1.45 kristaps 145: /*
146: * Handle all triggers matching \X(xy, \Xx, and \X[xxxx], where
147: * 'X' is the trigger. These have opaque sub-strings.
148: */
149: case ('F'):
150: /* FALLTHROUGH */
151: case ('g'):
1.24 kristaps 152: /* FALLTHROUGH */
1.45 kristaps 153: case ('k'):
1.24 kristaps 154: /* FALLTHROUGH */
1.45 kristaps 155: case ('M'):
1.24 kristaps 156: /* FALLTHROUGH */
1.45 kristaps 157: case ('m'):
1.24 kristaps 158: /* FALLTHROUGH */
1.45 kristaps 159: case ('n'):
1.24 kristaps 160: /* FALLTHROUGH */
1.45 kristaps 161: case ('V'):
1.24 kristaps 162: /* FALLTHROUGH */
1.45 kristaps 163: case ('Y'):
164: if (ESCAPE_ERROR == gly)
165: gly = ESCAPE_IGNORE;
1.24 kristaps 166: /* FALLTHROUGH */
1.45 kristaps 167: case ('f'):
168: if (ESCAPE_ERROR == gly)
169: gly = ESCAPE_FONT;
170:
171: rstart= &cp[i];
172: if (start)
173: *start = rstart;
174:
175: switch (cp[i++]) {
176: case ('('):
177: lim = 2;
178: break;
179: case ('['):
180: term = ']';
181: break;
182: default:
183: lim = 1;
184: i--;
185: break;
186: }
187: break;
188:
189: /*
190: * These escapes are of the form \X'Y', where 'X' is the trigger
191: * and 'Y' is any string. These have opaque sub-strings.
192: */
193: case ('A'):
1.24 kristaps 194: /* FALLTHROUGH */
1.45 kristaps 195: case ('b'):
1.24 kristaps 196: /* FALLTHROUGH */
197: case ('D'):
198: /* FALLTHROUGH */
1.45 kristaps 199: case ('o'):
1.24 kristaps 200: /* FALLTHROUGH */
1.45 kristaps 201: case ('R'):
1.24 kristaps 202: /* FALLTHROUGH */
1.45 kristaps 203: case ('X'):
1.24 kristaps 204: /* FALLTHROUGH */
1.45 kristaps 205: case ('Z'):
206: if ('\'' != cp[i++])
207: return(ESCAPE_ERROR);
208: gly = ESCAPE_IGNORE;
1.24 kristaps 209: term = '\'';
210: break;
1.45 kristaps 211:
212: /*
213: * These escapes are of the form \X'N', where 'X' is the trigger
214: * and 'N' resolves to a numerical expression.
215: */
216: case ('B'):
217: /* FALLTHROUGH */
1.28 kristaps 218: case ('h'):
219: /* FALLTHROUGH */
1.45 kristaps 220: case ('H'):
221: /* FALLTHROUGH */
222: case ('L'):
223: /* FALLTHROUGH */
224: case ('l'):
225: /* FALLTHROUGH */
226: case ('N'):
227: if (ESCAPE_ERROR == gly)
228: gly = ESCAPE_NUMBERED;
229: /* FALLTHROUGH */
230: case ('S'):
231: /* FALLTHROUGH */
1.28 kristaps 232: case ('v'):
233: /* FALLTHROUGH */
1.45 kristaps 234: case ('w'):
235: /* FALLTHROUGH */
236: case ('x'):
237: if (ESCAPE_ERROR == gly)
238: gly = ESCAPE_IGNORE;
239: if ('\'' != cp[i++])
240: return(ESCAPE_ERROR);
241: term = numeric = '\'';
242: break;
243:
244: /*
245: * Sizes get a special category of their own.
246: */
1.8 kristaps 247: case ('s'):
1.45 kristaps 248: gly = ESCAPE_IGNORE;
1.28 kristaps 249:
1.45 kristaps 250: rstart = &cp[i];
251: if (start)
252: *start = rstart;
253:
254: /* See +/- counts as a sign. */
255: c = cp[i];
256: if ('+' == c || '-' == c || ASCII_HYPH == c)
257: ++i;
1.8 kristaps 258:
1.45 kristaps 259: switch (cp[i++]) {
1.22 kristaps 260: case ('('):
1.45 kristaps 261: lim = 2;
1.22 kristaps 262: break;
263: case ('['):
1.45 kristaps 264: term = numeric = ']';
1.22 kristaps 265: break;
266: case ('\''):
1.45 kristaps 267: term = numeric = '\'';
1.22 kristaps 268: break;
269: default:
1.45 kristaps 270: lim = 1;
271: i--;
1.22 kristaps 272: break;
1.8 kristaps 273: }
274:
1.45 kristaps 275: /* See +/- counts as a sign. */
276: c = cp[i];
277: if ('+' == c || '-' == c || ASCII_HYPH == c)
278: ++i;
279:
280: break;
1.33 kristaps 281:
1.45 kristaps 282: /*
283: * Anything else is assumed to be a glyph.
284: */
285: default:
286: gly = ESCAPE_SPECIAL;
287: lim = 1;
288: i--;
1.22 kristaps 289: break;
1.45 kristaps 290: }
291:
292: assert(ESCAPE_ERROR != gly);
293:
294: rstart = &cp[i];
295: if (start)
296: *start = rstart;
297:
298: /*
299: * If a terminating block has been specified, we need to
300: * handle the case of recursion, which could have their
301: * own terminating blocks that mess up our parse. This, by the
302: * way, means that the "start" and "size" values will be
303: * effectively meaningless.
304: */
305:
306: ssz = 0;
307: if (numeric && -1 == (ssz = numescape(&cp[i])))
308: return(ESCAPE_ERROR);
309:
310: i += ssz;
311: rlim = -1;
312:
313: /*
314: * We have a character terminator. Try to read up to that
315: * character. If we can't (i.e., we hit the nil), then return
316: * an error; if we can, calculate our length, read past the
317: * terminating character, and exit.
318: */
319:
320: if ('\0' != term) {
321: *end = strchr(&cp[i], term);
322: if ('\0' == *end)
323: return(ESCAPE_ERROR);
324:
325: rlim = *end - &cp[i];
326: if (sz)
327: *sz = rlim;
328: (*end)++;
329: goto out;
330: }
331:
332: assert(lim > 0);
333:
334: /*
335: * We have a numeric limit. If the string is shorter than that,
336: * stop and return an error. Else adjust our endpoint, length,
337: * and return the current glyph.
338: */
339:
340: if ((size_t)lim > strlen(&cp[i]))
341: return(ESCAPE_ERROR);
342:
343: rlim = lim;
344: if (sz)
345: *sz = rlim;
346:
347: *end = &cp[i] + lim;
348:
349: out:
350: assert(rlim >= 0 && rstart);
351:
352: /* Run post-processors. */
353:
354: switch (gly) {
355: case (ESCAPE_FONT):
356: if (1 != rlim)
357: break;
358: switch (*rstart) {
359: case ('3'):
360: /* FALLTHROUGH */
361: case ('B'):
362: gly = ESCAPE_FONTBOLD;
363: break;
364: case ('2'):
365: /* FALLTHROUGH */
366: case ('I'):
367: gly = ESCAPE_FONTITALIC;
1.22 kristaps 368: break;
1.45 kristaps 369: case ('P'):
370: gly = ESCAPE_FONTPREV;
1.22 kristaps 371: break;
1.45 kristaps 372: case ('1'):
373: /* FALLTHROUGH */
374: case ('R'):
375: gly = ESCAPE_FONTROMAN;
1.1 kristaps 376: break;
377: }
1.46 kristaps 378: break;
1.45 kristaps 379: case (ESCAPE_SPECIAL):
380: if (1 != rlim)
381: break;
382: if ('c' == *rstart)
383: gly = ESCAPE_NOSPACE;
1.22 kristaps 384: break;
1.1 kristaps 385: default:
1.22 kristaps 386: break;
1.1 kristaps 387: }
388:
1.45 kristaps 389: return(gly);
1.1 kristaps 390: }
1.4 kristaps 391:
392: void *
393: mandoc_calloc(size_t num, size_t size)
394: {
395: void *ptr;
396:
397: ptr = calloc(num, size);
398: if (NULL == ptr) {
1.6 kristaps 399: perror(NULL);
1.35 kristaps 400: exit((int)MANDOCLEVEL_SYSERR);
1.4 kristaps 401: }
402:
403: return(ptr);
404: }
405:
406:
407: void *
408: mandoc_malloc(size_t size)
409: {
410: void *ptr;
411:
412: ptr = malloc(size);
413: if (NULL == ptr) {
1.6 kristaps 414: perror(NULL);
1.35 kristaps 415: exit((int)MANDOCLEVEL_SYSERR);
1.4 kristaps 416: }
417:
418: return(ptr);
419: }
420:
421:
422: void *
423: mandoc_realloc(void *ptr, size_t size)
424: {
425:
426: ptr = realloc(ptr, size);
427: if (NULL == ptr) {
1.6 kristaps 428: perror(NULL);
1.35 kristaps 429: exit((int)MANDOCLEVEL_SYSERR);
1.4 kristaps 430: }
431:
432: return(ptr);
433: }
434:
435:
436: char *
437: mandoc_strdup(const char *ptr)
438: {
439: char *p;
440:
441: p = strdup(ptr);
442: if (NULL == p) {
1.6 kristaps 443: perror(NULL);
1.35 kristaps 444: exit((int)MANDOCLEVEL_SYSERR);
1.4 kristaps 445: }
446:
447: return(p);
1.36 schwarze 448: }
449:
450: /*
451: * Parse a quoted or unquoted roff-style request or macro argument.
452: * Return a pointer to the parsed argument, which is either the original
453: * pointer or advanced by one byte in case the argument is quoted.
454: * Null-terminate the argument in place.
455: * Collapse pairs of quotes inside quoted arguments.
456: * Advance the argument pointer to the next argument,
457: * or to the null byte terminating the argument line.
458: */
459: char *
1.48 kristaps 460: mandoc_getarg(struct mparse *parse, char **cpp, int ln, int *pos)
1.36 schwarze 461: {
462: char *start, *cp;
463: int quoted, pairs, white;
464:
465: /* Quoting can only start with a new word. */
466: start = *cpp;
1.47 kristaps 467: quoted = 0;
1.36 schwarze 468: if ('"' == *start) {
469: quoted = 1;
470: start++;
1.47 kristaps 471: }
1.36 schwarze 472:
473: pairs = 0;
474: white = 0;
475: for (cp = start; '\0' != *cp; cp++) {
476: /* Move left after quoted quotes and escaped backslashes. */
477: if (pairs)
478: cp[-pairs] = cp[0];
479: if ('\\' == cp[0]) {
480: if ('\\' == cp[1]) {
481: /* Poor man's copy mode. */
482: pairs++;
483: cp++;
484: } else if (0 == quoted && ' ' == cp[1])
485: /* Skip escaped blanks. */
486: cp++;
487: } else if (0 == quoted) {
488: if (' ' == cp[0]) {
489: /* Unescaped blanks end unquoted args. */
490: white = 1;
491: break;
492: }
493: } else if ('"' == cp[0]) {
494: if ('"' == cp[1]) {
495: /* Quoted quotes collapse. */
496: pairs++;
497: cp++;
498: } else {
499: /* Unquoted quotes end quoted args. */
500: quoted = 2;
501: break;
502: }
503: }
504: }
505:
506: /* Quoted argument without a closing quote. */
1.48 kristaps 507: if (1 == quoted)
1.42 kristaps 508: mandoc_msg(MANDOCERR_BADQUOTE, parse, ln, *pos, NULL);
1.36 schwarze 509:
510: /* Null-terminate this argument and move to the next one. */
511: if (pairs)
512: cp[-pairs] = '\0';
513: if ('\0' != *cp) {
514: *cp++ = '\0';
515: while (' ' == *cp)
516: cp++;
517: }
1.39 kristaps 518: *pos += (int)(cp - start) + (quoted ? 1 : 0);
1.36 schwarze 519: *cpp = cp;
520:
1.48 kristaps 521: if ('\0' == *cp && (white || ' ' == cp[-1]))
1.42 kristaps 522: mandoc_msg(MANDOCERR_EOLNSPACE, parse, ln, *pos, NULL);
1.36 schwarze 523:
524: return(start);
1.4 kristaps 525: }
1.7 kristaps 526:
/*
 * Parse the string "p" according to the strptime(3) format "fmt".
 * The whole string must be consumed by the format.
 * On success, store the mktime(3) result in *t and return 1;
 * otherwise leave *t untouched and return 0.
 */
static int
a2time(time_t *t, const char *fmt, const char *p)
{
	struct tm	 tm;
	char		*rest;

	memset(&tm, 0, sizeof(struct tm));

	rest = strptime(p, fmt, &tm);

	/* Reject parse failures and trailing garbage alike. */
	if (NULL == rest || '\0' != *rest)
		return(0);

	*t = mktime(&tm);
	return(1);
}
543:
/*
 * Format the time "t", in local time, as "Month D, YYYY" using the
 * full month name.
 * Returns a buffer obtained from mandoc_malloc(), which the caller
 * must free, or NULL if the time cannot be converted or formatted.
 */
static char *
time2a(time_t t)
{
	struct tm	 tm;
	char		*buf, *p;
	size_t		 ssz;
	int		 isz;

	/* Do not format from an unfilled struct tm. */
	if (NULL == localtime_r(&t, &tm))
		return(NULL);

	/*
	 * Reserve space:
	 * up to 9 characters for the month (September) + blank
	 * up to 2 characters for the day + comma + blank
	 * 4 characters for the year and a terminating '\0'
	 */
	p = buf = mandoc_malloc(10 + 4 + 4 + 1);

	if (0 == (ssz = strftime(p, 10 + 1, "%B ", &tm)))
		goto fail;
	p += (int)ssz;

	/*
	 * snprintf() returns the length it wanted to write; anything
	 * above 4 means truncation, and advancing by that amount would
	 * move past what was actually written.
	 */
	isz = snprintf(p, 4 + 1, "%d, ", tm.tm_mday);
	if (isz < 0 || isz > 4)
		goto fail;
	p += isz;

	if (0 == strftime(p, 4 + 1, "%Y", &tm))
		goto fail;
	return(buf);

fail:
	free(buf);
	return(NULL);
}
578:
579: char *
1.42 kristaps 580: mandoc_normdate(struct mparse *parse, char *in, int ln, int pos)
1.7 kristaps 581: {
1.37 schwarze 582: char *out;
1.7 kristaps 583: time_t t;
584:
1.37 schwarze 585: if (NULL == in || '\0' == *in ||
586: 0 == strcmp(in, "$" "Mdocdate$")) {
1.42 kristaps 587: mandoc_msg(MANDOCERR_NODATE, parse, ln, pos, NULL);
1.37 schwarze 588: time(&t);
589: }
590: else if (!a2time(&t, "$" "Mdocdate: %b %d %Y $", in) &&
591: !a2time(&t, "%b %d, %Y", in) &&
592: !a2time(&t, "%Y-%m-%d", in)) {
1.42 kristaps 593: mandoc_msg(MANDOCERR_BADDATE, parse, ln, pos, NULL);
1.37 schwarze 594: t = 0;
1.7 kristaps 595: }
1.37 schwarze 596: out = t ? time2a(t) : NULL;
1.38 schwarze 597: return(out ? out : mandoc_strdup(in));
1.7 kristaps 598: }
599:
/*
 * Decide whether the "sz" bytes at "p" end a sentence.
 * The buffer is scanned backwards: closing quotes, brackets and
 * parentheses seen before any sentence punctuation mark the
 * punctuation as enclosed; '.', '!' and '?' count as sentence
 * punctuation.  The scan stops at the first other character.
 * "enclosed" gives the initial enclosed state.
 * Returns 1 if sentence punctuation was found and it is either not
 * enclosed or directly preceded (ignoring the characters above) by an
 * alphanumeric character; returns 0 otherwise, including for sz == 0.
 */
int
mandoc_eos(const char *p, size_t sz, int enclosed)
{
	size_t		 i;
	int		 found;
	unsigned char	 ch;

	/*
	 * End-of-sentence recognition must include situations where
	 * some symbols, such as `)', allow prior EOS punctuation to
	 * propagate outward.
	 */

	found = 0;

	/*
	 * Count down with an index rather than a pointer so that no
	 * pointer before the start of the buffer is ever formed.
	 */
	for (i = sz; i > 0; i--) {
		ch = (unsigned char)p[i - 1];
		switch (ch) {
		case ('\"'):
			/* FALLTHROUGH */
		case ('\''):
			/* FALLTHROUGH */
		case (']'):
			/* FALLTHROUGH */
		case (')'):
			if (0 == found)
				enclosed = 1;
			break;
		case ('.'):
			/* FALLTHROUGH */
		case ('!'):
			/* FALLTHROUGH */
		case ('?'):
			found = 1;
			break;
		default:
			return(found && (!enclosed || isalnum(ch)));
		}
	}

	return(found && !enclosed);
}
642:
/*
 * Decide whether a line may be broken at the hyphen character "c"
 * inside the buffer beginning at "start".  A break is allowed only if
 * the hyphen is free-standing within a word.
 * Returns 1 if a break is allowed, 0 otherwise.
 */
int
mandoc_hyph(const char *start, const char *c)
{
	char	 before, after;

	/* Never at the first or last character of the buffer. */
	if (c == start)
		return(0);
	after = c[1];
	if ('\0' == after)
		return(0);
	before = c[-1];

	switch (after) {
	case ('\t'):
		/* FALLTHROUGH */
	case (' '):
		/* The hyphen is the last character of a word. */
		return(0);
	case ('-'):
		/* Part of a run of hyphens. */
		return(0);
	default:
		break;
	}

	switch (before) {
	case ('\t'):
		/* FALLTHROUGH */
	case (' '):
		/* The hyphen is the first character of a word. */
		return(0);
	case ('-'):
		/* Part of a run of hyphens. */
		return(0);
	case ('\\'):
		/* The hyphen belongs to an escape. */
		return(0);
	default:
		break;
	}

	return(1);
}
669:
/*
 * Test whether the line "cp" holds a macro line at offset *ppos, i.e.
 * whether it starts there with a control character ('.' or '\''), or
 * with the two characters "\.".
 * If so, advance *ppos past the control character and any blanks and
 * tabs following it and return 1; otherwise return 0 and leave *ppos
 * unchanged.
 */
int
mandoc_getcontrol(const char *cp, int *ppos)
{
	const char	*p;

	p = cp + *ppos;

	switch (*p) {
	case ('\\'):
		/* Only an escaped dot counts. */
		if ('.' != p[1])
			return(0);
		p += 2;
		break;
	case ('.'):
		/* FALLTHROUGH */
	case ('\''):
		p++;
		break;
	default:
		return(0);
	}

	/* Swallow whitespace between control character and name. */
	while (' ' == *p || '\t' == *p)
		p++;

	*ppos = (int)(p - cp);
	return(1);
}
1.50 kristaps 695:
/*
 * Convert the first "sz" bytes of "p" to an int with strtol(3) in the
 * given base.
 * Returns -1 if "sz" exceeds 31, if the input is empty, or if it is
 * not entirely consumed by the conversion.  Otherwise returns the
 * converted value, clamped to the range INT_MIN..INT_MAX; note that a
 * valid input of "-1" is thus indistinguishable from an error.
 */
int
mandoc_strntoi(const char *p, size_t sz, int base)
{
	char	 num[32];
	char	*stop;
	long	 val;

	/* Leave room for the terminating '\0'. */
	if (sz >= sizeof(num))
		return(-1);

	memcpy(num, p, sz);
	num[sz] = '\0';

	errno = 0;
	val = strtol(num, &stop, base);

	if ('\0' == num[0] || '\0' != *stop)
		return(-1);

	if (val > INT_MAX)
		return(INT_MAX);
	if (val < INT_MIN)
		return(INT_MIN);

	return((int)val);
}
CVSweb