Annotation of mandoc/mandoc.c, Revision 1.51
1.51 ! kristaps 1: /* $Id: mandoc.c,v 1.50 2011/05/14 16:06:09 kristaps Exp $ */
1.1 kristaps 2: /*
1.22 kristaps 3: * Copyright (c) 2008, 2009, 2010 Kristaps Dzonsons <kristaps@bsd.lv>
1.36 schwarze 4: * Copyright (c) 2011 Ingo Schwarze <schwarze@openbsd.org>
1.1 kristaps 5: *
6: * Permission to use, copy, modify, and distribute this software for any
7: * purpose with or without fee is hereby granted, provided that the above
8: * copyright notice and this permission notice appear in all copies.
9: *
1.36 schwarze 10: * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
1.1 kristaps 11: * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
1.36 schwarze 12: * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
1.1 kristaps 13: * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14: * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15: * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16: * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17: */
1.9 kristaps 18: #ifdef HAVE_CONFIG_H
19: #include "config.h"
1.7 kristaps 20: #endif
21:
1.2 kristaps 22: #include <sys/types.h>
23:
1.1 kristaps 24: #include <assert.h>
25: #include <ctype.h>
1.50 kristaps 26: #include <errno.h>
27: #include <limits.h>
1.1 kristaps 28: #include <stdlib.h>
1.4 kristaps 29: #include <stdio.h>
30: #include <string.h>
1.7 kristaps 31: #include <time.h>
1.1 kristaps 32:
1.18 kristaps 33: #include "mandoc.h"
1.1 kristaps 34: #include "libmandoc.h"
35:
1.37 schwarze 36: #define DATESIZE 32
37:
1.18 kristaps 38: static int a2time(time_t *, const char *, const char *);
1.37 schwarze 39: static char *time2a(time_t);
1.45 kristaps 40: static int numescape(const char *);
1.7 kristaps 41:
1.45 kristaps 42: /*
43: * Pass over recursive numerical expressions. This context of this
44: * function is important: it's only called within character-terminating
45: * escapes (e.g., \s[xxxyyy]), so all we need to do is handle initial
46: * recursion: we don't care about what's in these blocks.
47: * This returns the number of characters skipped or -1 if an error
48: * occurs (the caller should bail).
49: */
50: static int
51: numescape(const char *start)
52: {
53: int i;
54: size_t sz;
55: const char *cp;
56:
57: i = 0;
58:
59: /* The expression consists of a subexpression. */
60:
61: if ('\\' == start[i]) {
62: cp = &start[++i];
63: /*
64: * Read past the end of the subexpression.
65: * Bail immediately on errors.
66: */
67: if (ESCAPE_ERROR == mandoc_escape(&cp, NULL, NULL))
68: return(-1);
69: return(i + cp - &start[i]);
70: }
71:
72: if ('(' != start[i++])
73: return(0);
74:
75: /*
76: * A parenthesised subexpression. Read until the closing
77: * parenthesis, making sure to handle any nested subexpressions
78: * that might ruin our parse.
79: */
80:
81: while (')' != start[i]) {
82: sz = strcspn(&start[i], ")\\");
83: i += (int)sz;
84:
85: if ('\0' == start[i])
86: return(-1);
87: else if ('\\' != start[i])
88: continue;
89:
90: cp = &start[++i];
91: if (ESCAPE_ERROR == mandoc_escape(&cp, NULL, NULL))
92: return(-1);
93: i += cp - &start[i];
94: }
95:
96: /* Read past the terminating ')'. */
97: return(++i);
98: }
99:
100: enum mandoc_esc
101: mandoc_escape(const char **end, const char **start, int *sz)
1.1 kristaps 102: {
1.45 kristaps 103: char c, term, numeric;
104: int i, lim, ssz, rlim;
105: const char *cp, *rstart;
106: enum mandoc_esc gly;
107:
108: cp = *end;
109: rstart = cp;
110: if (start)
111: *start = rstart;
1.46 kristaps 112: i = lim = 0;
1.45 kristaps 113: gly = ESCAPE_ERROR;
1.46 kristaps 114: term = numeric = '\0';
1.18 kristaps 115:
1.45 kristaps 116: switch ((c = cp[i++])) {
117: /*
118: * First the glyphs. There are several different forms of
119: * these, but each eventually returns a substring of the glyph
120: * name.
121: */
122: case ('('):
123: gly = ESCAPE_SPECIAL;
124: lim = 2;
125: break;
126: case ('['):
127: gly = ESCAPE_SPECIAL;
128: term = ']';
129: break;
130: case ('C'):
131: if ('\'' != cp[i])
132: return(ESCAPE_ERROR);
133: gly = ESCAPE_SPECIAL;
134: term = '\'';
135: break;
1.1 kristaps 136:
1.45 kristaps 137: /*
138: * Handle all triggers matching \X(xy, \Xx, and \X[xxxx], where
139: * 'X' is the trigger. These have opaque sub-strings.
140: */
141: case ('F'):
142: /* FALLTHROUGH */
143: case ('g'):
1.24 kristaps 144: /* FALLTHROUGH */
1.45 kristaps 145: case ('k'):
1.24 kristaps 146: /* FALLTHROUGH */
1.45 kristaps 147: case ('M'):
1.24 kristaps 148: /* FALLTHROUGH */
1.45 kristaps 149: case ('m'):
1.24 kristaps 150: /* FALLTHROUGH */
1.45 kristaps 151: case ('n'):
1.24 kristaps 152: /* FALLTHROUGH */
1.45 kristaps 153: case ('V'):
1.24 kristaps 154: /* FALLTHROUGH */
1.45 kristaps 155: case ('Y'):
156: if (ESCAPE_ERROR == gly)
157: gly = ESCAPE_IGNORE;
1.24 kristaps 158: /* FALLTHROUGH */
1.45 kristaps 159: case ('*'):
160: if (ESCAPE_ERROR == gly)
161: gly = ESCAPE_PREDEF;
1.24 kristaps 162: /* FALLTHROUGH */
1.45 kristaps 163: case ('f'):
164: if (ESCAPE_ERROR == gly)
165: gly = ESCAPE_FONT;
166:
167: rstart= &cp[i];
168: if (start)
169: *start = rstart;
170:
171: switch (cp[i++]) {
172: case ('('):
173: lim = 2;
174: break;
175: case ('['):
176: term = ']';
177: break;
178: default:
179: lim = 1;
180: i--;
181: break;
182: }
183: break;
184:
185: /*
186: * These escapes are of the form \X'Y', where 'X' is the trigger
187: * and 'Y' is any string. These have opaque sub-strings.
188: */
189: case ('A'):
1.24 kristaps 190: /* FALLTHROUGH */
1.45 kristaps 191: case ('b'):
1.24 kristaps 192: /* FALLTHROUGH */
193: case ('D'):
194: /* FALLTHROUGH */
1.45 kristaps 195: case ('o'):
1.24 kristaps 196: /* FALLTHROUGH */
1.45 kristaps 197: case ('R'):
1.24 kristaps 198: /* FALLTHROUGH */
1.45 kristaps 199: case ('X'):
1.24 kristaps 200: /* FALLTHROUGH */
1.45 kristaps 201: case ('Z'):
202: if ('\'' != cp[i++])
203: return(ESCAPE_ERROR);
204: gly = ESCAPE_IGNORE;
1.24 kristaps 205: term = '\'';
206: break;
1.45 kristaps 207:
208: /*
209: * These escapes are of the form \X'N', where 'X' is the trigger
210: * and 'N' resolves to a numerical expression.
211: */
212: case ('B'):
213: /* FALLTHROUGH */
1.28 kristaps 214: case ('h'):
215: /* FALLTHROUGH */
1.45 kristaps 216: case ('H'):
217: /* FALLTHROUGH */
218: case ('L'):
219: /* FALLTHROUGH */
220: case ('l'):
221: /* FALLTHROUGH */
222: case ('N'):
223: if (ESCAPE_ERROR == gly)
224: gly = ESCAPE_NUMBERED;
225: /* FALLTHROUGH */
226: case ('S'):
227: /* FALLTHROUGH */
1.28 kristaps 228: case ('v'):
229: /* FALLTHROUGH */
1.45 kristaps 230: case ('w'):
231: /* FALLTHROUGH */
232: case ('x'):
233: if (ESCAPE_ERROR == gly)
234: gly = ESCAPE_IGNORE;
235: if ('\'' != cp[i++])
236: return(ESCAPE_ERROR);
237: term = numeric = '\'';
238: break;
239:
240: /*
241: * Sizes get a special category of their own.
242: */
1.8 kristaps 243: case ('s'):
1.45 kristaps 244: gly = ESCAPE_IGNORE;
1.28 kristaps 245:
1.45 kristaps 246: rstart = &cp[i];
247: if (start)
248: *start = rstart;
249:
250: /* See +/- counts as a sign. */
251: c = cp[i];
252: if ('+' == c || '-' == c || ASCII_HYPH == c)
253: ++i;
1.8 kristaps 254:
1.45 kristaps 255: switch (cp[i++]) {
1.22 kristaps 256: case ('('):
1.45 kristaps 257: lim = 2;
1.22 kristaps 258: break;
259: case ('['):
1.45 kristaps 260: term = numeric = ']';
1.22 kristaps 261: break;
262: case ('\''):
1.45 kristaps 263: term = numeric = '\'';
1.22 kristaps 264: break;
265: default:
1.45 kristaps 266: lim = 1;
267: i--;
1.22 kristaps 268: break;
1.8 kristaps 269: }
270:
1.45 kristaps 271: /* See +/- counts as a sign. */
272: c = cp[i];
273: if ('+' == c || '-' == c || ASCII_HYPH == c)
274: ++i;
275:
276: break;
1.33 kristaps 277:
1.45 kristaps 278: /*
279: * Anything else is assumed to be a glyph.
280: */
281: default:
282: gly = ESCAPE_SPECIAL;
283: lim = 1;
284: i--;
1.22 kristaps 285: break;
1.45 kristaps 286: }
287:
288: assert(ESCAPE_ERROR != gly);
289:
290: rstart = &cp[i];
291: if (start)
292: *start = rstart;
293:
294: /*
295: * If a terminating block has been specified, we need to
296: * handle the case of recursion, which could have their
297: * own terminating blocks that mess up our parse. This, by the
298: * way, means that the "start" and "size" values will be
299: * effectively meaningless.
300: */
301:
302: ssz = 0;
303: if (numeric && -1 == (ssz = numescape(&cp[i])))
304: return(ESCAPE_ERROR);
305:
306: i += ssz;
307: rlim = -1;
308:
309: /*
310: * We have a character terminator. Try to read up to that
311: * character. If we can't (i.e., we hit the nil), then return
312: * an error; if we can, calculate our length, read past the
313: * terminating character, and exit.
314: */
315:
316: if ('\0' != term) {
317: *end = strchr(&cp[i], term);
318: if ('\0' == *end)
319: return(ESCAPE_ERROR);
320:
321: rlim = *end - &cp[i];
322: if (sz)
323: *sz = rlim;
324: (*end)++;
325: goto out;
326: }
327:
328: assert(lim > 0);
329:
330: /*
331: * We have a numeric limit. If the string is shorter than that,
332: * stop and return an error. Else adjust our endpoint, length,
333: * and return the current glyph.
334: */
335:
336: if ((size_t)lim > strlen(&cp[i]))
337: return(ESCAPE_ERROR);
338:
339: rlim = lim;
340: if (sz)
341: *sz = rlim;
342:
343: *end = &cp[i] + lim;
344:
345: out:
346: assert(rlim >= 0 && rstart);
347:
348: /* Run post-processors. */
349:
350: switch (gly) {
351: case (ESCAPE_FONT):
352: if (1 != rlim)
353: break;
354: switch (*rstart) {
355: case ('3'):
356: /* FALLTHROUGH */
357: case ('B'):
358: gly = ESCAPE_FONTBOLD;
359: break;
360: case ('2'):
361: /* FALLTHROUGH */
362: case ('I'):
363: gly = ESCAPE_FONTITALIC;
1.22 kristaps 364: break;
1.45 kristaps 365: case ('P'):
366: gly = ESCAPE_FONTPREV;
1.22 kristaps 367: break;
1.45 kristaps 368: case ('1'):
369: /* FALLTHROUGH */
370: case ('R'):
371: gly = ESCAPE_FONTROMAN;
1.1 kristaps 372: break;
373: }
1.46 kristaps 374: break;
1.45 kristaps 375: case (ESCAPE_SPECIAL):
376: if (1 != rlim)
377: break;
378: if ('c' == *rstart)
379: gly = ESCAPE_NOSPACE;
1.22 kristaps 380: break;
1.1 kristaps 381: default:
1.22 kristaps 382: break;
1.1 kristaps 383: }
384:
1.45 kristaps 385: return(gly);
1.1 kristaps 386: }
1.4 kristaps 387:
388: void *
389: mandoc_calloc(size_t num, size_t size)
390: {
391: void *ptr;
392:
393: ptr = calloc(num, size);
394: if (NULL == ptr) {
1.6 kristaps 395: perror(NULL);
1.35 kristaps 396: exit((int)MANDOCLEVEL_SYSERR);
1.4 kristaps 397: }
398:
399: return(ptr);
400: }
401:
402:
403: void *
404: mandoc_malloc(size_t size)
405: {
406: void *ptr;
407:
408: ptr = malloc(size);
409: if (NULL == ptr) {
1.6 kristaps 410: perror(NULL);
1.35 kristaps 411: exit((int)MANDOCLEVEL_SYSERR);
1.4 kristaps 412: }
413:
414: return(ptr);
415: }
416:
417:
418: void *
419: mandoc_realloc(void *ptr, size_t size)
420: {
421:
422: ptr = realloc(ptr, size);
423: if (NULL == ptr) {
1.6 kristaps 424: perror(NULL);
1.35 kristaps 425: exit((int)MANDOCLEVEL_SYSERR);
1.4 kristaps 426: }
427:
428: return(ptr);
429: }
430:
431:
432: char *
433: mandoc_strdup(const char *ptr)
434: {
435: char *p;
436:
437: p = strdup(ptr);
438: if (NULL == p) {
1.6 kristaps 439: perror(NULL);
1.35 kristaps 440: exit((int)MANDOCLEVEL_SYSERR);
1.4 kristaps 441: }
442:
443: return(p);
1.36 schwarze 444: }
445:
446: /*
447: * Parse a quoted or unquoted roff-style request or macro argument.
448: * Return a pointer to the parsed argument, which is either the original
449: * pointer or advanced by one byte in case the argument is quoted.
450: * Null-terminate the argument in place.
451: * Collapse pairs of quotes inside quoted arguments.
452: * Advance the argument pointer to the next argument,
453: * or to the null byte terminating the argument line.
454: */
455: char *
1.48 kristaps 456: mandoc_getarg(struct mparse *parse, char **cpp, int ln, int *pos)
1.36 schwarze 457: {
458: char *start, *cp;
459: int quoted, pairs, white;
460:
461: /* Quoting can only start with a new word. */
462: start = *cpp;
1.47 kristaps 463: quoted = 0;
1.36 schwarze 464: if ('"' == *start) {
465: quoted = 1;
466: start++;
1.47 kristaps 467: }
1.36 schwarze 468:
469: pairs = 0;
470: white = 0;
471: for (cp = start; '\0' != *cp; cp++) {
472: /* Move left after quoted quotes and escaped backslashes. */
473: if (pairs)
474: cp[-pairs] = cp[0];
475: if ('\\' == cp[0]) {
476: if ('\\' == cp[1]) {
477: /* Poor man's copy mode. */
478: pairs++;
479: cp++;
480: } else if (0 == quoted && ' ' == cp[1])
481: /* Skip escaped blanks. */
482: cp++;
483: } else if (0 == quoted) {
484: if (' ' == cp[0]) {
485: /* Unescaped blanks end unquoted args. */
486: white = 1;
487: break;
488: }
489: } else if ('"' == cp[0]) {
490: if ('"' == cp[1]) {
491: /* Quoted quotes collapse. */
492: pairs++;
493: cp++;
494: } else {
495: /* Unquoted quotes end quoted args. */
496: quoted = 2;
497: break;
498: }
499: }
500: }
501:
502: /* Quoted argument without a closing quote. */
1.48 kristaps 503: if (1 == quoted)
1.42 kristaps 504: mandoc_msg(MANDOCERR_BADQUOTE, parse, ln, *pos, NULL);
1.36 schwarze 505:
506: /* Null-terminate this argument and move to the next one. */
507: if (pairs)
508: cp[-pairs] = '\0';
509: if ('\0' != *cp) {
510: *cp++ = '\0';
511: while (' ' == *cp)
512: cp++;
513: }
1.39 kristaps 514: *pos += (int)(cp - start) + (quoted ? 1 : 0);
1.36 schwarze 515: *cpp = cp;
516:
1.48 kristaps 517: if ('\0' == *cp && (white || ' ' == cp[-1]))
1.42 kristaps 518: mandoc_msg(MANDOCERR_EOLNSPACE, parse, ln, *pos, NULL);
1.36 schwarze 519:
520: return(start);
1.4 kristaps 521: }
1.7 kristaps 522:
/*
 * Parse the string "p" according to the strptime(3) format "fmt".
 * Only when the format consumes the entire string is the result
 * converted with mktime(3) and stored in *t.
 * Returns 1 on success, 0 (leaving *t untouched) otherwise.
 */
static int
a2time(time_t *t, const char *fmt, const char *p)
{
	struct tm	 parsed;
	const char	*rest;

	memset(&parsed, 0, sizeof(parsed));

	rest = strptime(p, fmt, &parsed);

	/* Reject parse failures and trailing, unparsed input alike. */
	if (NULL == rest || '\0' != *rest)
		return(0);

	*t = mktime(&parsed);
	return(1);
}
539:
/*
 * Format the time "t", broken down with localtime_r(3), as
 * "Month day, year" in a buffer obtained from mandoc_malloc().
 * Returns the buffer, or NULL (after freeing it) if any of the
 * pieces could not be formatted.
 */
static char *
time2a(time_t t)
{
	struct tm	 tm;
	char		*buf, *cur;
	size_t		 len;
	int		 n;

	localtime_r(&t, &tm);

	/*
	 * Reserve space:
	 * up to 9 characters for the month (September) + blank
	 * up to 2 characters for the day + comma + blank
	 * 4 characters for the year and a terminating '\0'
	 */
	buf = mandoc_malloc(10 + 4 + 4 + 1);
	cur = buf;

	/* Month name and blank. */
	if (0 != (len = strftime(cur, 10 + 1, "%B ", &tm))) {
		cur += (int)len;

		/* Day of the month, comma and blank. */
		if (-1 != (n = snprintf(cur, 4 + 1, "%d, ", tm.tm_mday))) {
			cur += n;

			/* Year; strftime(3) adds the terminator. */
			if (0 != strftime(cur, 4 + 1, "%Y", &tm))
				return(buf);
		}
	}

	free(buf);
	return(NULL);
}
574:
575: char *
1.42 kristaps 576: mandoc_normdate(struct mparse *parse, char *in, int ln, int pos)
1.7 kristaps 577: {
1.37 schwarze 578: char *out;
1.7 kristaps 579: time_t t;
580:
1.37 schwarze 581: if (NULL == in || '\0' == *in ||
582: 0 == strcmp(in, "$" "Mdocdate$")) {
1.42 kristaps 583: mandoc_msg(MANDOCERR_NODATE, parse, ln, pos, NULL);
1.37 schwarze 584: time(&t);
585: }
586: else if (!a2time(&t, "$" "Mdocdate: %b %d %Y $", in) &&
587: !a2time(&t, "%b %d, %Y", in) &&
588: !a2time(&t, "%Y-%m-%d", in)) {
1.42 kristaps 589: mandoc_msg(MANDOCERR_BADDATE, parse, ln, pos, NULL);
1.37 schwarze 590: t = 0;
1.7 kristaps 591: }
1.37 schwarze 592: out = t ? time2a(t) : NULL;
1.38 schwarze 593: return(out ? out : mandoc_strdup(in));
1.7 kristaps 594: }
595:
/*
 * Decide whether the "sz" bytes at "p" end a sentence.
 *
 * The buffer is scanned backwards.  Closing delimiters (quotes, `]',
 * `)') seen before any end-of-sentence punctuation mark the sentence
 * as enclosed; `.', `!' and `?' count as end-of-sentence punctuation.
 * The first other character decides: the result is non-zero if
 * punctuation was found and either nothing enclosed it or that
 * character is alphanumeric.  If the buffer holds only delimiters and
 * punctuation, the result is non-zero if punctuation was found and
 * nothing enclosed it.  An empty buffer never ends a sentence.
 *
 * A non-zero "enclosed" argument treats the text as enclosed from the
 * outset.
 */
int
mandoc_eos(const char *p, size_t sz, int enclosed)
{
	size_t	 i;
	int	 found;

	/*
	 * End-of-sentence recognition must include situations where
	 * some symbols, such as `)', allow prior EOS punctuation to
	 * propagate outward.
	 */

	found = 0;

	/*
	 * Count an index down rather than a pointer: stepping a pointer
	 * to one before the start of the buffer is undefined.
	 */
	for (i = sz; i > 0; i--) {
		switch (p[i - 1]) {
		case ('\"'):
			/* FALLTHROUGH */
		case ('\''):
			/* FALLTHROUGH */
		case (']'):
			/* FALLTHROUGH */
		case (')'):
			if (0 == found)
				enclosed = 1;
			break;
		case ('.'):
			/* FALLTHROUGH */
		case ('!'):
			/* FALLTHROUGH */
		case ('?'):
			found = 1;
			break;
		default:
			return(found && (!enclosed ||
			    isalnum((unsigned char)p[i - 1])));
		}
	}

	return(found && !enclosed);
}
638:
/*
 * Choose whether a line may be broken at the hyphen "c" within the
 * buffer beginning at "start".  That is only allowed when the hyphen
 * is free-standing within a word.  Returns 1 if so, 0 otherwise.
 */
int
mandoc_hyph(const char *start, const char *c)
{
	char	 before, after;

	/* Never at the first or last character of the buffer. */
	if (c == start)
		return(0);
	after = c[1];
	if ('\0' == after)
		return(0);
	before = c[-1];

	/* Not at a word boundary, and not next to another hyphen. */
	switch (after) {
	case ('\t'):
		/* FALLTHROUGH */
	case (' '):
		/* FALLTHROUGH */
	case ('-'):
		return(0);
	default:
		break;
	}

	/* Same on the left, where a backslash marks an escape. */
	switch (before) {
	case ('\t'):
		/* FALLTHROUGH */
	case (' '):
		/* FALLTHROUGH */
	case ('-'):
		/* FALLTHROUGH */
	case ('\\'):
		return(0);
	default:
		break;
	}

	return(1);
}
665:
/*
 * Find out whether the text at cp[*ppos] begins a macro line, i.e.
 * starts with `.', `'' or the two characters `\.'.  If it does, move
 * *ppos past the control character(s) and any blanks or tabs that
 * follow, and return one.  If it doesn't, return zero and leave *ppos
 * alone.
 */
int
mandoc_getcontrol(const char *cp, int *ppos)
{
	const char	*s;

	s = cp + *ppos;

	switch (*s) {
	case ('\\'):
		/* A backslash only counts when a dot follows it. */
		if ('.' != s[1])
			return(0);
		s += 2;
		break;
	case ('.'):
		/* FALLTHROUGH */
	case ('\''):
		s++;
		break;
	default:
		return(0);
	}

	while (' ' == *s || '\t' == *s)
		s++;

	*ppos = (int)(s - cp);
	return(1);
}
1.50 kristaps 691:
/*
 * Convert the first "sz" bytes of "p", read as a number in the given
 * base by strtol(3), to a non-negative int.
 * Returns -1 if those bytes are empty, longer than 31 bytes, not
 * entirely consumed by strtol(3), negative, or too large for an int.
 */
int
mandoc_strntou(const char *p, size_t sz, int base)
{
	char	 buf[32];
	char	*ep;
	long	 v;

	/* One byte of the buffer is needed for the terminator. */
	if (sz >= sizeof(buf))
		return(-1);

	/* strtol(3) wants a terminated string; "p" need not be one. */
	memcpy(buf, p, sz);
	buf[sz] = '\0';

	errno = 0;
	v = strtol(buf, &ep, base);

	/* Nothing to convert, or junk left after the number. */
	if ('\0' == buf[0] || '\0' != *ep)
		return(-1);

	/* Out of range for long. */
	if (ERANGE == errno && (LONG_MAX == v || LONG_MIN == v))
		return(-1);

	/* Out of range for a non-negative int. */
	if (v < 0 || v > INT_MAX)
		return(-1);

	return((int)v);
}
722:
CVSweb