Annotation of mandoc/mandoc.c, Revision 1.48
1.48 ! kristaps 1: /* $Id: mandoc.c,v 1.47 2011/04/17 09:08:19 kristaps Exp $ */
1.1 kristaps 2: /*
1.22 kristaps 3: * Copyright (c) 2008, 2009, 2010 Kristaps Dzonsons <kristaps@bsd.lv>
1.36 schwarze 4: * Copyright (c) 2011 Ingo Schwarze <schwarze@openbsd.org>
1.1 kristaps 5: *
6: * Permission to use, copy, modify, and distribute this software for any
7: * purpose with or without fee is hereby granted, provided that the above
8: * copyright notice and this permission notice appear in all copies.
9: *
1.36 schwarze 10: * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
1.1 kristaps 11: * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
1.36 schwarze 12: * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
1.1 kristaps 13: * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14: * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15: * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16: * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17: */
1.9 kristaps 18: #ifdef HAVE_CONFIG_H
19: #include "config.h"
1.7 kristaps 20: #endif
21:
1.2 kristaps 22: #include <sys/types.h>
23:
1.1 kristaps 24: #include <assert.h>
25: #include <ctype.h>
26: #include <stdlib.h>
1.4 kristaps 27: #include <stdio.h>
28: #include <string.h>
1.7 kristaps 29: #include <time.h>
1.1 kristaps 30:
1.18 kristaps 31: #include "mandoc.h"
1.1 kristaps 32: #include "libmandoc.h"
33:
1.37 schwarze 34: #define DATESIZE 32
35:
1.18 kristaps 36: static int a2time(time_t *, const char *, const char *);
1.37 schwarze 37: static char *time2a(time_t);
1.45 kristaps 38: static int numescape(const char *);
1.7 kristaps 39:
1.45 kristaps 40: /*
41: * Pass over recursive numerical expressions. This context of this
42: * function is important: it's only called within character-terminating
43: * escapes (e.g., \s[xxxyyy]), so all we need to do is handle initial
44: * recursion: we don't care about what's in these blocks.
45: * This returns the number of characters skipped or -1 if an error
46: * occurs (the caller should bail).
47: */
48: static int
49: numescape(const char *start)
50: {
51: int i;
52: size_t sz;
53: const char *cp;
54:
55: i = 0;
56:
57: /* The expression consists of a subexpression. */
58:
59: if ('\\' == start[i]) {
60: cp = &start[++i];
61: /*
62: * Read past the end of the subexpression.
63: * Bail immediately on errors.
64: */
65: if (ESCAPE_ERROR == mandoc_escape(&cp, NULL, NULL))
66: return(-1);
67: return(i + cp - &start[i]);
68: }
69:
70: if ('(' != start[i++])
71: return(0);
72:
73: /*
74: * A parenthesised subexpression. Read until the closing
75: * parenthesis, making sure to handle any nested subexpressions
76: * that might ruin our parse.
77: */
78:
79: while (')' != start[i]) {
80: sz = strcspn(&start[i], ")\\");
81: i += (int)sz;
82:
83: if ('\0' == start[i])
84: return(-1);
85: else if ('\\' != start[i])
86: continue;
87:
88: cp = &start[++i];
89: if (ESCAPE_ERROR == mandoc_escape(&cp, NULL, NULL))
90: return(-1);
91: i += cp - &start[i];
92: }
93:
94: /* Read past the terminating ')'. */
95: return(++i);
96: }
97:
98: enum mandoc_esc
99: mandoc_escape(const char **end, const char **start, int *sz)
1.1 kristaps 100: {
1.45 kristaps 101: char c, term, numeric;
102: int i, lim, ssz, rlim;
103: const char *cp, *rstart;
104: enum mandoc_esc gly;
105:
106: cp = *end;
107: rstart = cp;
108: if (start)
109: *start = rstart;
1.46 kristaps 110: i = lim = 0;
1.45 kristaps 111: gly = ESCAPE_ERROR;
1.46 kristaps 112: term = numeric = '\0';
1.18 kristaps 113:
1.45 kristaps 114: switch ((c = cp[i++])) {
115: /*
116: * First the glyphs. There are several different forms of
117: * these, but each eventually returns a substring of the glyph
118: * name.
119: */
120: case ('('):
121: gly = ESCAPE_SPECIAL;
122: lim = 2;
123: break;
124: case ('['):
125: gly = ESCAPE_SPECIAL;
126: term = ']';
127: break;
128: case ('C'):
129: if ('\'' != cp[i])
130: return(ESCAPE_ERROR);
131: gly = ESCAPE_SPECIAL;
132: term = '\'';
133: break;
1.1 kristaps 134:
1.45 kristaps 135: /*
136: * Handle all triggers matching \X(xy, \Xx, and \X[xxxx], where
137: * 'X' is the trigger. These have opaque sub-strings.
138: */
139: case ('F'):
140: /* FALLTHROUGH */
141: case ('g'):
1.24 kristaps 142: /* FALLTHROUGH */
1.45 kristaps 143: case ('k'):
1.24 kristaps 144: /* FALLTHROUGH */
1.45 kristaps 145: case ('M'):
1.24 kristaps 146: /* FALLTHROUGH */
1.45 kristaps 147: case ('m'):
1.24 kristaps 148: /* FALLTHROUGH */
1.45 kristaps 149: case ('n'):
1.24 kristaps 150: /* FALLTHROUGH */
1.45 kristaps 151: case ('V'):
1.24 kristaps 152: /* FALLTHROUGH */
1.45 kristaps 153: case ('Y'):
154: if (ESCAPE_ERROR == gly)
155: gly = ESCAPE_IGNORE;
1.24 kristaps 156: /* FALLTHROUGH */
1.45 kristaps 157: case ('*'):
158: if (ESCAPE_ERROR == gly)
159: gly = ESCAPE_PREDEF;
1.24 kristaps 160: /* FALLTHROUGH */
1.45 kristaps 161: case ('f'):
162: if (ESCAPE_ERROR == gly)
163: gly = ESCAPE_FONT;
164:
165: rstart= &cp[i];
166: if (start)
167: *start = rstart;
168:
169: switch (cp[i++]) {
170: case ('('):
171: lim = 2;
172: break;
173: case ('['):
174: term = ']';
175: break;
176: default:
177: lim = 1;
178: i--;
179: break;
180: }
181: break;
182:
183: /*
184: * These escapes are of the form \X'Y', where 'X' is the trigger
185: * and 'Y' is any string. These have opaque sub-strings.
186: */
187: case ('A'):
1.24 kristaps 188: /* FALLTHROUGH */
1.45 kristaps 189: case ('b'):
1.24 kristaps 190: /* FALLTHROUGH */
191: case ('D'):
192: /* FALLTHROUGH */
1.45 kristaps 193: case ('o'):
1.24 kristaps 194: /* FALLTHROUGH */
1.45 kristaps 195: case ('R'):
1.24 kristaps 196: /* FALLTHROUGH */
1.45 kristaps 197: case ('X'):
1.24 kristaps 198: /* FALLTHROUGH */
1.45 kristaps 199: case ('Z'):
200: if ('\'' != cp[i++])
201: return(ESCAPE_ERROR);
202: gly = ESCAPE_IGNORE;
1.24 kristaps 203: term = '\'';
204: break;
1.45 kristaps 205:
206: /*
207: * These escapes are of the form \X'N', where 'X' is the trigger
208: * and 'N' resolves to a numerical expression.
209: */
210: case ('B'):
211: /* FALLTHROUGH */
1.28 kristaps 212: case ('h'):
213: /* FALLTHROUGH */
1.45 kristaps 214: case ('H'):
215: /* FALLTHROUGH */
216: case ('L'):
217: /* FALLTHROUGH */
218: case ('l'):
219: /* FALLTHROUGH */
220: case ('N'):
221: if (ESCAPE_ERROR == gly)
222: gly = ESCAPE_NUMBERED;
223: /* FALLTHROUGH */
224: case ('S'):
225: /* FALLTHROUGH */
1.28 kristaps 226: case ('v'):
227: /* FALLTHROUGH */
1.45 kristaps 228: case ('w'):
229: /* FALLTHROUGH */
230: case ('x'):
231: if (ESCAPE_ERROR == gly)
232: gly = ESCAPE_IGNORE;
233: if ('\'' != cp[i++])
234: return(ESCAPE_ERROR);
235: term = numeric = '\'';
236: break;
237:
238: /*
239: * Sizes get a special category of their own.
240: */
1.8 kristaps 241: case ('s'):
1.45 kristaps 242: gly = ESCAPE_IGNORE;
1.28 kristaps 243:
1.45 kristaps 244: rstart = &cp[i];
245: if (start)
246: *start = rstart;
247:
248: /* See +/- counts as a sign. */
249: c = cp[i];
250: if ('+' == c || '-' == c || ASCII_HYPH == c)
251: ++i;
1.8 kristaps 252:
1.45 kristaps 253: switch (cp[i++]) {
1.22 kristaps 254: case ('('):
1.45 kristaps 255: lim = 2;
1.22 kristaps 256: break;
257: case ('['):
1.45 kristaps 258: term = numeric = ']';
1.22 kristaps 259: break;
260: case ('\''):
1.45 kristaps 261: term = numeric = '\'';
1.22 kristaps 262: break;
263: default:
1.45 kristaps 264: lim = 1;
265: i--;
1.22 kristaps 266: break;
1.8 kristaps 267: }
268:
1.45 kristaps 269: /* See +/- counts as a sign. */
270: c = cp[i];
271: if ('+' == c || '-' == c || ASCII_HYPH == c)
272: ++i;
273:
274: break;
1.33 kristaps 275:
1.45 kristaps 276: /*
277: * Anything else is assumed to be a glyph.
278: */
279: default:
280: gly = ESCAPE_SPECIAL;
281: lim = 1;
282: i--;
1.22 kristaps 283: break;
1.45 kristaps 284: }
285:
286: assert(ESCAPE_ERROR != gly);
287:
288: rstart = &cp[i];
289: if (start)
290: *start = rstart;
291:
292: /*
293: * If a terminating block has been specified, we need to
294: * handle the case of recursion, which could have their
295: * own terminating blocks that mess up our parse. This, by the
296: * way, means that the "start" and "size" values will be
297: * effectively meaningless.
298: */
299:
300: ssz = 0;
301: if (numeric && -1 == (ssz = numescape(&cp[i])))
302: return(ESCAPE_ERROR);
303:
304: i += ssz;
305: rlim = -1;
306:
307: /*
308: * We have a character terminator. Try to read up to that
309: * character. If we can't (i.e., we hit the nil), then return
310: * an error; if we can, calculate our length, read past the
311: * terminating character, and exit.
312: */
313:
314: if ('\0' != term) {
315: *end = strchr(&cp[i], term);
316: if ('\0' == *end)
317: return(ESCAPE_ERROR);
318:
319: rlim = *end - &cp[i];
320: if (sz)
321: *sz = rlim;
322: (*end)++;
323: goto out;
324: }
325:
326: assert(lim > 0);
327:
328: /*
329: * We have a numeric limit. If the string is shorter than that,
330: * stop and return an error. Else adjust our endpoint, length,
331: * and return the current glyph.
332: */
333:
334: if ((size_t)lim > strlen(&cp[i]))
335: return(ESCAPE_ERROR);
336:
337: rlim = lim;
338: if (sz)
339: *sz = rlim;
340:
341: *end = &cp[i] + lim;
342:
343: out:
344: assert(rlim >= 0 && rstart);
345:
346: /* Run post-processors. */
347:
348: switch (gly) {
349: case (ESCAPE_FONT):
350: if (1 != rlim)
351: break;
352: switch (*rstart) {
353: case ('3'):
354: /* FALLTHROUGH */
355: case ('B'):
356: gly = ESCAPE_FONTBOLD;
357: break;
358: case ('2'):
359: /* FALLTHROUGH */
360: case ('I'):
361: gly = ESCAPE_FONTITALIC;
1.22 kristaps 362: break;
1.45 kristaps 363: case ('P'):
364: gly = ESCAPE_FONTPREV;
1.22 kristaps 365: break;
1.45 kristaps 366: case ('1'):
367: /* FALLTHROUGH */
368: case ('R'):
369: gly = ESCAPE_FONTROMAN;
1.1 kristaps 370: break;
371: }
1.46 kristaps 372: break;
1.45 kristaps 373: case (ESCAPE_SPECIAL):
374: if (1 != rlim)
375: break;
376: if ('c' == *rstart)
377: gly = ESCAPE_NOSPACE;
1.22 kristaps 378: break;
1.1 kristaps 379: default:
1.22 kristaps 380: break;
1.1 kristaps 381: }
382:
1.45 kristaps 383: return(gly);
1.1 kristaps 384: }
1.4 kristaps 385:
386: void *
387: mandoc_calloc(size_t num, size_t size)
388: {
389: void *ptr;
390:
391: ptr = calloc(num, size);
392: if (NULL == ptr) {
1.6 kristaps 393: perror(NULL);
1.35 kristaps 394: exit((int)MANDOCLEVEL_SYSERR);
1.4 kristaps 395: }
396:
397: return(ptr);
398: }
399:
400:
401: void *
402: mandoc_malloc(size_t size)
403: {
404: void *ptr;
405:
406: ptr = malloc(size);
407: if (NULL == ptr) {
1.6 kristaps 408: perror(NULL);
1.35 kristaps 409: exit((int)MANDOCLEVEL_SYSERR);
1.4 kristaps 410: }
411:
412: return(ptr);
413: }
414:
415:
416: void *
417: mandoc_realloc(void *ptr, size_t size)
418: {
419:
420: ptr = realloc(ptr, size);
421: if (NULL == ptr) {
1.6 kristaps 422: perror(NULL);
1.35 kristaps 423: exit((int)MANDOCLEVEL_SYSERR);
1.4 kristaps 424: }
425:
426: return(ptr);
427: }
428:
429:
430: char *
431: mandoc_strdup(const char *ptr)
432: {
433: char *p;
434:
435: p = strdup(ptr);
436: if (NULL == p) {
1.6 kristaps 437: perror(NULL);
1.35 kristaps 438: exit((int)MANDOCLEVEL_SYSERR);
1.4 kristaps 439: }
440:
441: return(p);
1.36 schwarze 442: }
443:
444: /*
445: * Parse a quoted or unquoted roff-style request or macro argument.
446: * Return a pointer to the parsed argument, which is either the original
447: * pointer or advanced by one byte in case the argument is quoted.
448: * Null-terminate the argument in place.
449: * Collapse pairs of quotes inside quoted arguments.
450: * Advance the argument pointer to the next argument,
451: * or to the null byte terminating the argument line.
452: */
453: char *
1.48 ! kristaps 454: mandoc_getarg(struct mparse *parse, char **cpp, int ln, int *pos)
1.36 schwarze 455: {
456: char *start, *cp;
457: int quoted, pairs, white;
458:
459: /* Quoting can only start with a new word. */
460: start = *cpp;
1.47 kristaps 461: quoted = 0;
1.36 schwarze 462: if ('"' == *start) {
463: quoted = 1;
464: start++;
1.47 kristaps 465: }
1.36 schwarze 466:
467: pairs = 0;
468: white = 0;
469: for (cp = start; '\0' != *cp; cp++) {
470: /* Move left after quoted quotes and escaped backslashes. */
471: if (pairs)
472: cp[-pairs] = cp[0];
473: if ('\\' == cp[0]) {
474: if ('\\' == cp[1]) {
475: /* Poor man's copy mode. */
476: pairs++;
477: cp++;
478: } else if (0 == quoted && ' ' == cp[1])
479: /* Skip escaped blanks. */
480: cp++;
481: } else if (0 == quoted) {
482: if (' ' == cp[0]) {
483: /* Unescaped blanks end unquoted args. */
484: white = 1;
485: break;
486: }
487: } else if ('"' == cp[0]) {
488: if ('"' == cp[1]) {
489: /* Quoted quotes collapse. */
490: pairs++;
491: cp++;
492: } else {
493: /* Unquoted quotes end quoted args. */
494: quoted = 2;
495: break;
496: }
497: }
498: }
499:
500: /* Quoted argument without a closing quote. */
1.48 ! kristaps 501: if (1 == quoted)
1.42 kristaps 502: mandoc_msg(MANDOCERR_BADQUOTE, parse, ln, *pos, NULL);
1.36 schwarze 503:
504: /* Null-terminate this argument and move to the next one. */
505: if (pairs)
506: cp[-pairs] = '\0';
507: if ('\0' != *cp) {
508: *cp++ = '\0';
509: while (' ' == *cp)
510: cp++;
511: }
1.39 kristaps 512: *pos += (int)(cp - start) + (quoted ? 1 : 0);
1.36 schwarze 513: *cpp = cp;
514:
1.48 ! kristaps 515: if ('\0' == *cp && (white || ' ' == cp[-1]))
1.42 kristaps 516: mandoc_msg(MANDOCERR_EOLNSPACE, parse, ln, *pos, NULL);
1.36 schwarze 517:
518: return(start);
1.4 kristaps 519: }
1.7 kristaps 520:
/*
 * Convert the date string "p" to a time_t using the strptime(3)
 * format "fmt".  The format has to consume the whole string.
 * On success, store the mktime(3) result in *t and return 1;
 * otherwise leave *t alone and return 0.
 */
static int
a2time(time_t *t, const char *fmt, const char *p)
{
	struct tm	 parsed;
	const char	*rest;

	/* Fields the format does not set must start out as zero. */
	memset(&parsed, 0, sizeof(parsed));

	rest = strptime(p, fmt, &parsed);
	if (NULL == rest || '\0' != *rest)
		return(0);

	*t = mktime(&parsed);
	return(1);
}
537:
/*
 * Format the time "t", broken down with localtime_r(3), as
 * "Month day, year" in a freshly allocated string.
 * Returns the string, which the caller must free(3), or NULL if any
 * of the formatting steps fails.
 */
static char *
time2a(time_t t)
{
	/*
	 * Space reserved for each piece:
	 *   up to 9 characters for the month (September) + blank,
	 *   up to 2 characters for the day + comma + blank,
	 *   4 characters for the year;
	 * plus one terminating '\0' for the whole string.
	 */
	enum { MONTHSZ = 10, DAYSZ = 4, YEARSZ = 4 };
	struct tm	 when;
	char		*out;
	size_t		 monthlen;
	int		 daylen;

	localtime_r(&t, &when);

	out = mandoc_malloc(MONTHSZ + DAYSZ + YEARSZ + 1);

	monthlen = strftime(out, MONTHSZ + 1, "%B ", &when);
	if (0 != monthlen) {
		daylen = snprintf(out + (int)monthlen, DAYSZ + 1,
		    "%d, ", when.tm_mday);
		if (-1 != daylen && 0 != strftime(out +
		    (int)monthlen + daylen, YEARSZ + 1, "%Y", &when))
			return(out);
	}

	/* One of the pieces did not fit or could not be formatted. */
	free(out);
	return(NULL);
}
572:
573: char *
1.42 kristaps 574: mandoc_normdate(struct mparse *parse, char *in, int ln, int pos)
1.7 kristaps 575: {
1.37 schwarze 576: char *out;
1.7 kristaps 577: time_t t;
578:
1.37 schwarze 579: if (NULL == in || '\0' == *in ||
580: 0 == strcmp(in, "$" "Mdocdate$")) {
1.42 kristaps 581: mandoc_msg(MANDOCERR_NODATE, parse, ln, pos, NULL);
1.37 schwarze 582: time(&t);
583: }
584: else if (!a2time(&t, "$" "Mdocdate: %b %d %Y $", in) &&
585: !a2time(&t, "%b %d, %Y", in) &&
586: !a2time(&t, "%Y-%m-%d", in)) {
1.42 kristaps 587: mandoc_msg(MANDOCERR_BADDATE, parse, ln, pos, NULL);
1.37 schwarze 588: t = 0;
1.7 kristaps 589: }
1.37 schwarze 590: out = t ? time2a(t) : NULL;
1.38 schwarze 591: return(out ? out : mandoc_strdup(in));
1.7 kristaps 592: }
593:
/*
 * Decide whether the "sz" bytes at "p" end a sentence.
 *
 * The buffer is scanned backwards from its last byte.  Closing
 * delimiters (double quote, single quote, ']' and ')') seen before
 * any end-of-sentence punctuation mark the sentence as enclosed;
 * '.', '!' and '?' are the end-of-sentence punctuation, which is thus
 * allowed to propagate outward through such closing delimiters.  The
 * first byte of any other kind stops the scan.
 *
 * "enclosed" is the caller's initial value of the enclosed flag.
 * Returns 1 if punctuation was found and either the sentence is not
 * enclosed or the byte that stopped the scan is alphanumeric;
 * returns 0 otherwise, including for an empty buffer.
 */
int
mandoc_eos(const char *p, size_t sz, int enclosed)
{
	size_t		 left;
	int		 found;
	unsigned char	 ch;

	found = 0;
	left = sz;

	while (left > 0) {
		ch = (unsigned char)p[--left];

		switch (ch) {
		case '"':
		case '\'':
		case ']':
		case ')':
			/* Only delimiters after the punctuation count. */
			if ( ! found)
				enclosed = 1;
			continue;
		case '.':
		case '!':
		case '?':
			found = 1;
			continue;
		default:
			break;
		}

		/* An ordinary byte: the scan ends here. */
		return(found && ( ! enclosed || isalnum(ch)));
	}

	/* The whole buffer was punctuation and closing delimiters. */
	return(found && ! enclosed);
}
636:
/*
 * Choose whether a line may be broken at the hyphen character "c",
 * which lies inside the null-terminated buffer beginning at "start".
 * A break is only allowed when the hyphen is free-standing within a
 * word.  Returns 1 if breaking is allowed, 0 otherwise.
 */
int
mandoc_hyph(const char *start, const char *c)
{
	char	 before, after;

	/* Never break at the first or last character of the buffer. */
	if (c == start)
		return(0);
	after = c[1];
	if ('\0' == after)
		return(0);
	before = c[-1];

	/*
	 * Reject the first/last character of a word (tab or blank on
	 * either side) and runs of hyphens.
	 */
	switch (after) {
	case '\t':
	case ' ':
	case '-':
		return(0);
	default:
		break;
	}

	/* On the left, a backslash means the hyphen is escaped. */
	switch (before) {
	case '\t':
	case ' ':
	case '-':
	case '\\':
		return(0);
	default:
		break;
	}

	return(1);
}
663:
/*
 * Find out whether the line "cp" is a macro line at offset *ppos.
 * A macro line begins with a control character ('.' or a single
 * quote) or with the two-byte sequence backslash-dot.  If so, advance
 * *ppos past the control character and any blanks or tabs following
 * it and return one; if not, return zero and leave *ppos unchanged.
 */
int
mandoc_getcontrol(const char *cp, int *ppos)
{
	const char	*p;

	p = cp + *ppos;

	if ('\\' == p[0] && '.' == p[1])
		p += 2;
	else if ('.' == p[0] || '\'' == p[0])
		p += 1;
	else
		return(0);

	/* Blanks and tabs may separate the control character and name. */
	p += strspn(p, " \t");

	*ppos = (int)(p - cp);
	return(1);
}
CVSweb