Annotation of mandoc/mandoc.c, Revision 1.47
1.47 ! kristaps 1: /* $Id: mandoc.c,v 1.46 2011/04/09 15:35:30 kristaps Exp $ */
1.1 kristaps 2: /*
1.22 kristaps 3: * Copyright (c) 2008, 2009, 2010 Kristaps Dzonsons <kristaps@bsd.lv>
1.36 schwarze 4: * Copyright (c) 2011 Ingo Schwarze <schwarze@openbsd.org>
1.1 kristaps 5: *
6: * Permission to use, copy, modify, and distribute this software for any
7: * purpose with or without fee is hereby granted, provided that the above
8: * copyright notice and this permission notice appear in all copies.
9: *
1.36 schwarze 10: * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
1.1 kristaps 11: * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
1.36 schwarze 12: * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
1.1 kristaps 13: * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14: * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15: * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16: * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17: */
1.9 kristaps 18: #ifdef HAVE_CONFIG_H
19: #include "config.h"
1.7 kristaps 20: #endif
21:
1.2 kristaps 22: #include <sys/types.h>
23:
1.1 kristaps 24: #include <assert.h>
25: #include <ctype.h>
26: #include <stdlib.h>
1.4 kristaps 27: #include <stdio.h>
28: #include <string.h>
1.7 kristaps 29: #include <time.h>
1.1 kristaps 30:
1.18 kristaps 31: #include "mandoc.h"
1.1 kristaps 32: #include "libmandoc.h"
33:
1.37 schwarze 34: #define DATESIZE 32
35:
1.18 kristaps 36: static int a2time(time_t *, const char *, const char *);
1.37 schwarze 37: static char *time2a(time_t);
1.45 kristaps 38: static int numescape(const char *);
1.7 kristaps 39:
1.45 kristaps 40: /*
41: * Pass over recursive numerical expressions. This context of this
42: * function is important: it's only called within character-terminating
43: * escapes (e.g., \s[xxxyyy]), so all we need to do is handle initial
44: * recursion: we don't care about what's in these blocks.
45: * This returns the number of characters skipped or -1 if an error
46: * occurs (the caller should bail).
47: */
48: static int
49: numescape(const char *start)
50: {
51: int i;
52: size_t sz;
53: const char *cp;
54:
55: i = 0;
56:
57: /* The expression consists of a subexpression. */
58:
59: if ('\\' == start[i]) {
60: cp = &start[++i];
61: /*
62: * Read past the end of the subexpression.
63: * Bail immediately on errors.
64: */
65: if (ESCAPE_ERROR == mandoc_escape(&cp, NULL, NULL))
66: return(-1);
67: return(i + cp - &start[i]);
68: }
69:
70: if ('(' != start[i++])
71: return(0);
72:
73: /*
74: * A parenthesised subexpression. Read until the closing
75: * parenthesis, making sure to handle any nested subexpressions
76: * that might ruin our parse.
77: */
78:
79: while (')' != start[i]) {
80: sz = strcspn(&start[i], ")\\");
81: i += (int)sz;
82:
83: if ('\0' == start[i])
84: return(-1);
85: else if ('\\' != start[i])
86: continue;
87:
88: cp = &start[++i];
89: if (ESCAPE_ERROR == mandoc_escape(&cp, NULL, NULL))
90: return(-1);
91: i += cp - &start[i];
92: }
93:
94: /* Read past the terminating ')'. */
95: return(++i);
96: }
97:
98: /*
99: * Handle an escaped sequeence. This should be called with any
100: * string subsequent a `\'. Pass a pointer to this substring as "end";
101: * it will be set to the supremum of the parsed escape sequence. If
102: * this returns ESCAPE_ERROR, the string is bogus and should be thrown
103: * away. If not ESCAPE_ERROR or ESCAPE_IGNORE, "start" is set to the
104: * first relevant character of the substring (font, glyph, whatever) of
105: * length sz. Both "start" and "sz" may be NULL.
106: */
107: enum mandoc_esc
108: mandoc_escape(const char **end, const char **start, int *sz)
1.1 kristaps 109: {
1.45 kristaps 110: char c, term, numeric;
111: int i, lim, ssz, rlim;
112: const char *cp, *rstart;
113: enum mandoc_esc gly;
114:
115: cp = *end;
116: rstart = cp;
117: if (start)
118: *start = rstart;
1.46 kristaps 119: i = lim = 0;
1.45 kristaps 120: gly = ESCAPE_ERROR;
1.46 kristaps 121: term = numeric = '\0';
1.18 kristaps 122:
1.45 kristaps 123: switch ((c = cp[i++])) {
124: /*
125: * First the glyphs. There are several different forms of
126: * these, but each eventually returns a substring of the glyph
127: * name.
128: */
129: case ('('):
130: gly = ESCAPE_SPECIAL;
131: lim = 2;
132: break;
133: case ('['):
134: gly = ESCAPE_SPECIAL;
135: term = ']';
136: break;
137: case ('C'):
138: if ('\'' != cp[i])
139: return(ESCAPE_ERROR);
140: gly = ESCAPE_SPECIAL;
141: term = '\'';
142: break;
1.1 kristaps 143:
1.45 kristaps 144: /*
145: * Handle all triggers matching \X(xy, \Xx, and \X[xxxx], where
146: * 'X' is the trigger. These have opaque sub-strings.
147: */
148: case ('F'):
149: /* FALLTHROUGH */
150: case ('g'):
1.24 kristaps 151: /* FALLTHROUGH */
1.45 kristaps 152: case ('k'):
1.24 kristaps 153: /* FALLTHROUGH */
1.45 kristaps 154: case ('M'):
1.24 kristaps 155: /* FALLTHROUGH */
1.45 kristaps 156: case ('m'):
1.24 kristaps 157: /* FALLTHROUGH */
1.45 kristaps 158: case ('n'):
1.24 kristaps 159: /* FALLTHROUGH */
1.45 kristaps 160: case ('V'):
1.24 kristaps 161: /* FALLTHROUGH */
1.45 kristaps 162: case ('Y'):
163: if (ESCAPE_ERROR == gly)
164: gly = ESCAPE_IGNORE;
1.24 kristaps 165: /* FALLTHROUGH */
1.45 kristaps 166: case ('*'):
167: if (ESCAPE_ERROR == gly)
168: gly = ESCAPE_PREDEF;
1.24 kristaps 169: /* FALLTHROUGH */
1.45 kristaps 170: case ('f'):
171: if (ESCAPE_ERROR == gly)
172: gly = ESCAPE_FONT;
173:
174: rstart= &cp[i];
175: if (start)
176: *start = rstart;
177:
178: switch (cp[i++]) {
179: case ('('):
180: lim = 2;
181: break;
182: case ('['):
183: term = ']';
184: break;
185: default:
186: lim = 1;
187: i--;
188: break;
189: }
190: break;
191:
192: /*
193: * These escapes are of the form \X'Y', where 'X' is the trigger
194: * and 'Y' is any string. These have opaque sub-strings.
195: */
196: case ('A'):
1.24 kristaps 197: /* FALLTHROUGH */
1.45 kristaps 198: case ('b'):
1.24 kristaps 199: /* FALLTHROUGH */
200: case ('D'):
201: /* FALLTHROUGH */
1.45 kristaps 202: case ('o'):
1.24 kristaps 203: /* FALLTHROUGH */
1.45 kristaps 204: case ('R'):
1.24 kristaps 205: /* FALLTHROUGH */
1.45 kristaps 206: case ('X'):
1.24 kristaps 207: /* FALLTHROUGH */
1.45 kristaps 208: case ('Z'):
209: if ('\'' != cp[i++])
210: return(ESCAPE_ERROR);
211: gly = ESCAPE_IGNORE;
1.24 kristaps 212: term = '\'';
213: break;
1.45 kristaps 214:
215: /*
216: * These escapes are of the form \X'N', where 'X' is the trigger
217: * and 'N' resolves to a numerical expression.
218: */
219: case ('B'):
220: /* FALLTHROUGH */
1.28 kristaps 221: case ('h'):
222: /* FALLTHROUGH */
1.45 kristaps 223: case ('H'):
224: /* FALLTHROUGH */
225: case ('L'):
226: /* FALLTHROUGH */
227: case ('l'):
228: /* FALLTHROUGH */
229: case ('N'):
230: if (ESCAPE_ERROR == gly)
231: gly = ESCAPE_NUMBERED;
232: /* FALLTHROUGH */
233: case ('S'):
234: /* FALLTHROUGH */
1.28 kristaps 235: case ('v'):
236: /* FALLTHROUGH */
1.45 kristaps 237: case ('w'):
238: /* FALLTHROUGH */
239: case ('x'):
240: if (ESCAPE_ERROR == gly)
241: gly = ESCAPE_IGNORE;
242: if ('\'' != cp[i++])
243: return(ESCAPE_ERROR);
244: term = numeric = '\'';
245: break;
246:
247: /*
248: * Sizes get a special category of their own.
249: */
1.8 kristaps 250: case ('s'):
1.45 kristaps 251: gly = ESCAPE_IGNORE;
1.28 kristaps 252:
1.45 kristaps 253: rstart = &cp[i];
254: if (start)
255: *start = rstart;
256:
257: /* See +/- counts as a sign. */
258: c = cp[i];
259: if ('+' == c || '-' == c || ASCII_HYPH == c)
260: ++i;
1.8 kristaps 261:
1.45 kristaps 262: switch (cp[i++]) {
1.22 kristaps 263: case ('('):
1.45 kristaps 264: lim = 2;
1.22 kristaps 265: break;
266: case ('['):
1.45 kristaps 267: term = numeric = ']';
1.22 kristaps 268: break;
269: case ('\''):
1.45 kristaps 270: term = numeric = '\'';
1.22 kristaps 271: break;
272: default:
1.45 kristaps 273: lim = 1;
274: i--;
1.22 kristaps 275: break;
1.8 kristaps 276: }
277:
1.45 kristaps 278: /* See +/- counts as a sign. */
279: c = cp[i];
280: if ('+' == c || '-' == c || ASCII_HYPH == c)
281: ++i;
282:
283: break;
1.33 kristaps 284:
1.45 kristaps 285: /*
286: * Anything else is assumed to be a glyph.
287: */
288: default:
289: gly = ESCAPE_SPECIAL;
290: lim = 1;
291: i--;
1.22 kristaps 292: break;
1.45 kristaps 293: }
294:
295: assert(ESCAPE_ERROR != gly);
296:
297: rstart = &cp[i];
298: if (start)
299: *start = rstart;
300:
301: /*
302: * If a terminating block has been specified, we need to
303: * handle the case of recursion, which could have their
304: * own terminating blocks that mess up our parse. This, by the
305: * way, means that the "start" and "size" values will be
306: * effectively meaningless.
307: */
308:
309: ssz = 0;
310: if (numeric && -1 == (ssz = numescape(&cp[i])))
311: return(ESCAPE_ERROR);
312:
313: i += ssz;
314: rlim = -1;
315:
316: /*
317: * We have a character terminator. Try to read up to that
318: * character. If we can't (i.e., we hit the nil), then return
319: * an error; if we can, calculate our length, read past the
320: * terminating character, and exit.
321: */
322:
323: if ('\0' != term) {
324: *end = strchr(&cp[i], term);
325: if ('\0' == *end)
326: return(ESCAPE_ERROR);
327:
328: rlim = *end - &cp[i];
329: if (sz)
330: *sz = rlim;
331: (*end)++;
332: goto out;
333: }
334:
335: assert(lim > 0);
336:
337: /*
338: * We have a numeric limit. If the string is shorter than that,
339: * stop and return an error. Else adjust our endpoint, length,
340: * and return the current glyph.
341: */
342:
343: if ((size_t)lim > strlen(&cp[i]))
344: return(ESCAPE_ERROR);
345:
346: rlim = lim;
347: if (sz)
348: *sz = rlim;
349:
350: *end = &cp[i] + lim;
351:
352: out:
353: assert(rlim >= 0 && rstart);
354:
355: /* Run post-processors. */
356:
357: switch (gly) {
358: case (ESCAPE_FONT):
359: if (1 != rlim)
360: break;
361: switch (*rstart) {
362: case ('3'):
363: /* FALLTHROUGH */
364: case ('B'):
365: gly = ESCAPE_FONTBOLD;
366: break;
367: case ('2'):
368: /* FALLTHROUGH */
369: case ('I'):
370: gly = ESCAPE_FONTITALIC;
1.22 kristaps 371: break;
1.45 kristaps 372: case ('P'):
373: gly = ESCAPE_FONTPREV;
1.22 kristaps 374: break;
1.45 kristaps 375: case ('1'):
376: /* FALLTHROUGH */
377: case ('R'):
378: gly = ESCAPE_FONTROMAN;
1.1 kristaps 379: break;
380: }
1.46 kristaps 381: break;
1.45 kristaps 382: case (ESCAPE_SPECIAL):
383: if (1 != rlim)
384: break;
385: if ('c' == *rstart)
386: gly = ESCAPE_NOSPACE;
1.22 kristaps 387: break;
1.1 kristaps 388: default:
1.22 kristaps 389: break;
1.1 kristaps 390: }
391:
1.45 kristaps 392: return(gly);
1.1 kristaps 393: }
1.4 kristaps 394:
395: void *
396: mandoc_calloc(size_t num, size_t size)
397: {
398: void *ptr;
399:
400: ptr = calloc(num, size);
401: if (NULL == ptr) {
1.6 kristaps 402: perror(NULL);
1.35 kristaps 403: exit((int)MANDOCLEVEL_SYSERR);
1.4 kristaps 404: }
405:
406: return(ptr);
407: }
408:
409:
410: void *
411: mandoc_malloc(size_t size)
412: {
413: void *ptr;
414:
415: ptr = malloc(size);
416: if (NULL == ptr) {
1.6 kristaps 417: perror(NULL);
1.35 kristaps 418: exit((int)MANDOCLEVEL_SYSERR);
1.4 kristaps 419: }
420:
421: return(ptr);
422: }
423:
424:
425: void *
426: mandoc_realloc(void *ptr, size_t size)
427: {
428:
429: ptr = realloc(ptr, size);
430: if (NULL == ptr) {
1.6 kristaps 431: perror(NULL);
1.35 kristaps 432: exit((int)MANDOCLEVEL_SYSERR);
1.4 kristaps 433: }
434:
435: return(ptr);
436: }
437:
438:
439: char *
440: mandoc_strdup(const char *ptr)
441: {
442: char *p;
443:
444: p = strdup(ptr);
445: if (NULL == p) {
1.6 kristaps 446: perror(NULL);
1.35 kristaps 447: exit((int)MANDOCLEVEL_SYSERR);
1.4 kristaps 448: }
449:
450: return(p);
1.36 schwarze 451: }
452:
453: /*
454: * Parse a quoted or unquoted roff-style request or macro argument.
455: * Return a pointer to the parsed argument, which is either the original
456: * pointer or advanced by one byte in case the argument is quoted.
457: * Null-terminate the argument in place.
458: * Collapse pairs of quotes inside quoted arguments.
459: * Advance the argument pointer to the next argument,
460: * or to the null byte terminating the argument line.
461: */
462: char *
1.47 ! kristaps 463: mandoc_getarg(struct mparse *parse,
! 464: char **cpp, int ln, int dowarn, int *pos)
1.36 schwarze 465: {
466: char *start, *cp;
467: int quoted, pairs, white;
468:
469: /* Quoting can only start with a new word. */
470: start = *cpp;
1.47 ! kristaps 471: quoted = 0;
1.36 schwarze 472: if ('"' == *start) {
473: quoted = 1;
474: start++;
1.47 ! kristaps 475: }
1.36 schwarze 476:
477: pairs = 0;
478: white = 0;
479: for (cp = start; '\0' != *cp; cp++) {
480: /* Move left after quoted quotes and escaped backslashes. */
481: if (pairs)
482: cp[-pairs] = cp[0];
483: if ('\\' == cp[0]) {
484: if ('\\' == cp[1]) {
485: /* Poor man's copy mode. */
486: pairs++;
487: cp++;
488: } else if (0 == quoted && ' ' == cp[1])
489: /* Skip escaped blanks. */
490: cp++;
491: } else if (0 == quoted) {
492: if (' ' == cp[0]) {
493: /* Unescaped blanks end unquoted args. */
494: white = 1;
495: break;
496: }
497: } else if ('"' == cp[0]) {
498: if ('"' == cp[1]) {
499: /* Quoted quotes collapse. */
500: pairs++;
501: cp++;
502: } else {
503: /* Unquoted quotes end quoted args. */
504: quoted = 2;
505: break;
506: }
507: }
508: }
509:
510: /* Quoted argument without a closing quote. */
1.47 ! kristaps 511: if (dowarn && 1 == quoted)
1.42 kristaps 512: mandoc_msg(MANDOCERR_BADQUOTE, parse, ln, *pos, NULL);
1.36 schwarze 513:
514: /* Null-terminate this argument and move to the next one. */
515: if (pairs)
516: cp[-pairs] = '\0';
517: if ('\0' != *cp) {
518: *cp++ = '\0';
519: while (' ' == *cp)
520: cp++;
521: }
1.39 kristaps 522: *pos += (int)(cp - start) + (quoted ? 1 : 0);
1.36 schwarze 523: *cpp = cp;
524:
1.47 ! kristaps 525: if (dowarn && '\0' == *cp && (white || ' ' == cp[-1]))
1.42 kristaps 526: mandoc_msg(MANDOCERR_EOLNSPACE, parse, ln, *pos, NULL);
1.36 schwarze 527:
528: return(start);
1.4 kristaps 529: }
1.7 kristaps 530:
/*
 * Convert the string "p" to a time_t according to the strptime(3)
 * format "fmt".  The conversion only counts as successful if the
 * format consumes the entire string.
 * Returns 1 and stores the mktime(3) result in *t on success;
 * returns 0 and leaves *t untouched otherwise.
 */
static int
a2time(time_t *t, const char *fmt, const char *p)
{
	struct tm	 parsed;
	const char	*rest;

	/* Fields the format does not set must start out as zero. */
	memset(&parsed, 0, sizeof(parsed));

	rest = strptime(p, fmt, &parsed);
	if (NULL == rest || '\0' != *rest)
		return(0);

	*t = mktime(&parsed);
	return(1);
}
547:
/*
 * Format the time "t" (interpreted in local time) as
 * "Month day, year", e.g. "April 9, 2011".
 * Returns a freshly allocated string the caller must free(3), or NULL
 * if the time cannot be broken down or any of the three pieces does
 * not fit into its reserved space.
 */
static char *
time2a(time_t t)
{
	struct tm	 tm;
	char		*buf, *p;
	size_t		 ssz;
	int		 isz;

	/* Without a valid broken-down time there is nothing to format. */
	if (NULL == localtime_r(&t, &tm))
		return(NULL);

	/*
	 * Reserve space:
	 * up to 9 characters for the month (September) + blank
	 * up to 2 characters for the day + comma + blank
	 * 4 characters for the year and a terminating '\0'
	 */
	p = buf = mandoc_malloc(10 + 4 + 4 + 1);

	if (0 == (ssz = strftime(p, 10 + 1, "%B ", &tm)))
		goto fail;
	p += ssz;

	/*
	 * snprintf() returns the length it wanted to write; anything
	 * beyond the four reserved bytes means the output was cut off.
	 */
	isz = snprintf(p, 4 + 1, "%d, ", tm.tm_mday);
	if (isz < 0 || isz > 4)
		goto fail;
	p += isz;

	if (0 == strftime(p, 4 + 1, "%Y", &tm))
		goto fail;
	return(buf);

fail:
	free(buf);
	return(NULL);
}
582:
583: char *
1.42 kristaps 584: mandoc_normdate(struct mparse *parse, char *in, int ln, int pos)
1.7 kristaps 585: {
1.37 schwarze 586: char *out;
1.7 kristaps 587: time_t t;
588:
1.37 schwarze 589: if (NULL == in || '\0' == *in ||
590: 0 == strcmp(in, "$" "Mdocdate$")) {
1.42 kristaps 591: mandoc_msg(MANDOCERR_NODATE, parse, ln, pos, NULL);
1.37 schwarze 592: time(&t);
593: }
594: else if (!a2time(&t, "$" "Mdocdate: %b %d %Y $", in) &&
595: !a2time(&t, "%b %d, %Y", in) &&
596: !a2time(&t, "%Y-%m-%d", in)) {
1.42 kristaps 597: mandoc_msg(MANDOCERR_BADDATE, parse, ln, pos, NULL);
1.37 schwarze 598: t = 0;
1.7 kristaps 599: }
1.37 schwarze 600: out = t ? time2a(t) : NULL;
1.38 schwarze 601: return(out ? out : mandoc_strdup(in));
1.7 kristaps 602: }
603:
/*
 * Decide whether the "sz" bytes at "p" end a sentence.
 *
 * The buffer is scanned backwards.  Closing delimiters (quotes, `]',
 * `)') seen before any end-of-sentence punctuation mark the
 * punctuation as enclosed; `.', `!' and `?' count as end-of-sentence
 * punctuation.  The scan stops at the first other character.
 *
 * "enclosed" is the caller's initial enclosed state.  Enclosed
 * punctuation only counts when the character the scan stops at is
 * alphanumeric.
 *
 * Returns 1 if the buffer ends a sentence, 0 otherwise.
 */
int
mandoc_eos(const char *p, size_t sz, int enclosed)
{
	size_t		 i;
	int		 found;

	if (0 == sz)
		return(0);

	/*
	 * End-of-sentence recognition must include situations where
	 * some symbols, such as `)', allow prior EOS punctuation to
	 * propagate outward.
	 */

	found = 0;

	/*
	 * Walk backwards by index, so that no pointer before the start
	 * of the buffer is ever formed.
	 */
	for (i = sz; i > 0; i--) {
		switch (p[i - 1]) {
		case ('\"'):
			/* FALLTHROUGH */
		case ('\''):
			/* FALLTHROUGH */
		case (']'):
			/* FALLTHROUGH */
		case (')'):
			if (0 == found)
				enclosed = 1;
			break;
		case ('.'):
			/* FALLTHROUGH */
		case ('!'):
			/* FALLTHROUGH */
		case ('?'):
			found = 1;
			break;
		default:
			return(found && (!enclosed ||
			    isalnum((unsigned char)p[i - 1])));
		}
	}

	/* Only punctuation and closing delimiters in the buffer. */
	return(found && !enclosed);
}
646:
/*
 * Decide whether a line may be broken at the hyphen pointed to by "c"
 * within the buffer beginning at "start".  Breaking is only allowed
 * when the hyphen stands freely inside a word.
 * Returns 1 if a break is permitted, 0 otherwise.
 */
int
mandoc_hyph(const char *start, const char *c)
{
	char	 before, after;

	/* Never at the first or last character of the buffer. */
	if (start == c)
		return(0);
	after = c[1];
	if ('\0' == after)
		return(0);
	before = c[-1];

	/* Never at a word boundary or next to another hyphen. */
	switch (after) {
	case ('\t'):
		/* FALLTHROUGH */
	case (' '):
		/* FALLTHROUGH */
	case ('-'):
		return(0);
	default:
		break;
	}

	/* Likewise on the left, where a backslash marks an escape. */
	switch (before) {
	case ('\t'):
		/* FALLTHROUGH */
	case (' '):
		/* FALLTHROUGH */
	case ('-'):
		/* FALLTHROUGH */
	case ('\\'):
		return(0);
	default:
		break;
	}

	return(1);
}
673:
/*
 * Check whether the text at cp[*ppos] opens a macro (control) line:
 * either the two-byte sequence `\.', or a single `.' or `''.
 * If so, move *ppos past the control character(s) and any blanks or
 * tabs that follow, and return 1.  If not, return 0 and leave *ppos
 * alone.
 */
int
mandoc_getcontrol(const char *cp, int *ppos)
{
	const char	*p;

	p = cp + *ppos;

	switch (*p) {
	case ('\\'):
		/* Only an escaped dot counts as a control character. */
		if ('.' != p[1])
			return(0);
		p += 2;
		break;
	case ('.'):
		/* FALLTHROUGH */
	case ('\''):
		p++;
		break;
	default:
		return(0);
	}

	/* Swallow the whitespace before the macro name. */
	while (' ' == *p || '\t' == *p)
		p++;

	*ppos = (int)(p - cp);
	return(1);
}
CVSweb