Annotation of mandoc/roff_escape.c, Revision 1.8
1.1 schwarze 1: /* $OpenBSD$ */
2: /*
3: * Copyright (c) 2011, 2012, 2013, 2014, 2015, 2017, 2018, 2020, 2022
4: * Ingo Schwarze <schwarze@openbsd.org>
5: * Copyright (c) 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv>
6: *
7: * Permission to use, copy, modify, and distribute this software for any
8: * purpose with or without fee is hereby granted, provided that the above
9: * copyright notice and this permission notice appear in all copies.
10: *
11: * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
12: * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
13: * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
14: * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
15: * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
16: * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
17: * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
18: *
19: * Parser for roff(7) escape sequences.
20: * To be used by all mandoc(1) parsers and formatters.
21: */
22: #include <assert.h>
23: #include <ctype.h>
24: #include <limits.h>
25: #include <stdio.h>
26: #include <string.h>
27:
28: #include "mandoc.h"
29: #include "roff.h"
30: #include "roff_int.h"
31:
32: /*
33: * Traditional escape sequence interpreter for general use
34: * including in high-level formatters. This function does not issue
35: * diagnostics and is not usable for expansion in the roff(7) parser.
36: * It is documented in the mandoc_escape(3) manual page.
37: */
38: enum mandoc_esc
39: mandoc_escape(const char **rendarg, const char **rarg, int *rargl)
40: {
41: int iarg, iendarg, iend;
42: enum mandoc_esc rval;
43:
1.7 schwarze 44: rval = roff_escape(--*rendarg, 0, 0,
45: NULL, NULL, &iarg, &iendarg, &iend);
1.1 schwarze 46: assert(rval != ESCAPE_EXPAND);
47: if (rarg != NULL)
48: *rarg = *rendarg + iarg;
49: if (rargl != NULL)
50: *rargl = iendarg - iarg;
51: *rendarg += iend;
52: return rval;
53: }
54:
55: /*
56: * Full-featured escape sequence parser.
57: * If it encounters a nested escape sequence that requires expansion
58: * by the parser and re-parsing, the positions of that inner escape
59: * sequence are returned in *resc ... *rend.
60: * Otherwise, *resc is set to aesc and the positions of the escape
61: * sequence starting at aesc are returned.
62: * Diagnostic messages are generated if and only if resc != NULL,
63: * that is, if and only if called by roff_expand().
64: */
65: enum mandoc_esc
66: roff_escape(const char *buf, const int ln, const int aesc,
1.7 schwarze 67: int *resc, int *rnam, int *rarg, int *rendarg, int *rend)
1.1 schwarze 68: {
69: int iesc; /* index of leading escape char */
1.7 schwarze 70: int inam; /* index of escape name */
1.1 schwarze 71: int iarg; /* index beginning the argument */
72: int iendarg; /* index right after the argument */
73: int iend; /* index right after the sequence */
1.7 schwarze 74: int sesc, snam, sarg, sendarg, send; /* for sub-escape */
1.1 schwarze 75: int maxl; /* expected length of the argument */
76: int argl; /* actual length of the argument */
77: int c, i; /* for \[char...] parsing */
1.5 schwarze 78: int valid_A; /* for \A parsing */
1.1 schwarze 79: enum mandoc_esc rval; /* return value */
80: enum mandocerr err; /* diagnostic code */
81: char term; /* byte terminating the argument */
82:
83: /*
84: * Treat "\E" just like "\";
85: * it only makes a difference in copy mode.
86: */
87:
1.7 schwarze 88: iesc = inam = aesc;
1.1 schwarze 89: do {
1.7 schwarze 90: inam++;
91: } while (buf[inam] == 'E');
1.1 schwarze 92:
93: /*
94: * Sort the following cases first by syntax category,
95: * then by escape sequence type, and finally by ASCII code.
96: */
97:
1.7 schwarze 98: iarg = iendarg = iend = inam + 1;
1.1 schwarze 99: maxl = INT_MAX;
100: term = '\0';
1.7 schwarze 101: switch (buf[inam]) {
1.1 schwarze 102:
103: /* Escape sequences taking no arguments at all. */
104:
105: case '!':
106: case '?':
1.2 schwarze 107: case 'r':
1.1 schwarze 108: rval = ESCAPE_UNSUPP;
109: goto out;
110:
111: case '%':
112: case '&':
113: case ')':
114: case ',':
115: case '/':
116: case '^':
117: case 'a':
118: case 'd':
119: case 't':
120: case 'u':
121: case '{':
122: case '|':
123: case '}':
124: rval = ESCAPE_IGNORE;
125: goto out;
126:
1.6 schwarze 127: case '\0':
128: iendarg = --iend;
129: /* FALLTHROUGH */
1.8 ! schwarze 130: case '.':
1.1 schwarze 131: case '\\':
132: default:
133: iarg--;
134: rval = ESCAPE_UNDEF;
135: goto out;
136:
137: case ' ':
138: case '\'':
139: case '-':
140: case '0':
141: case ':':
142: case '_':
143: case '`':
144: case 'e':
145: case '~':
146: iarg--;
147: argl = 1;
148: rval = ESCAPE_SPECIAL;
149: goto out;
150: case 'p':
151: rval = ESCAPE_BREAK;
152: goto out;
153: case 'c':
154: rval = ESCAPE_NOSPACE;
155: goto out;
156: case 'z':
157: rval = ESCAPE_SKIPCHAR;
158: goto out;
159:
160: /* Standard argument format. */
161:
162: case '$':
163: case '*':
1.3 schwarze 164: case 'V':
1.4 schwarze 165: case 'g':
1.1 schwarze 166: case 'n':
167: rval = ESCAPE_EXPAND;
168: break;
169: case 'F':
170: case 'M':
171: case 'O':
172: case 'Y':
173: case 'k':
174: case 'm':
175: rval = ESCAPE_IGNORE;
176: break;
177: case '(':
178: case '[':
179: rval = ESCAPE_SPECIAL;
180: iendarg = iend = --iarg;
181: break;
182: case 'f':
183: rval = ESCAPE_FONT;
184: break;
185:
186: /* Quoted arguments */
187:
1.5 schwarze 188: case 'A':
1.1 schwarze 189: case 'B':
190: case 'w':
191: rval = ESCAPE_EXPAND;
192: term = '\b';
193: break;
194: case 'D':
195: case 'H':
196: case 'L':
197: case 'R':
198: case 'S':
199: case 'X':
200: case 'Z':
201: case 'b':
202: case 'v':
203: case 'x':
204: rval = ESCAPE_IGNORE;
205: term = '\b';
206: break;
207: case 'C':
208: if (buf[iarg] != '\'') {
209: rval = ESCAPE_ERROR;
210: goto out;
211: }
212: rval = ESCAPE_SPECIAL;
213: term = '\b';
214: break;
215: case 'N':
216: rval = ESCAPE_NUMBERED;
217: term = '\b';
218: break;
219: case 'h':
220: rval = ESCAPE_HORIZ;
221: term = '\b';
222: break;
223: case 'l':
224: rval = ESCAPE_HLINE;
225: term = '\b';
226: break;
227: case 'o':
228: rval = ESCAPE_OVERSTRIKE;
229: term = '\b';
230: break;
231:
232: /* Sizes support both forms, with additional peculiarities. */
233:
234: case 's':
235: rval = ESCAPE_IGNORE;
236: if (buf[iarg] == '+' || buf[iarg] == '-'||
237: buf[iarg] == ASCII_HYPH)
238: iarg++;
239: switch (buf[iarg]) {
240: case '(':
241: maxl = 2;
242: iarg++;
243: break;
244: case '[':
245: term = ']';
246: iarg++;
247: break;
248: case '\'':
249: term = '\'';
250: iarg++;
251: break;
252: case '1':
253: case '2':
254: case '3':
255: if (buf[iarg - 1] == 's' &&
256: isdigit((unsigned char)buf[iarg + 1])) {
257: maxl = 2;
258: break;
259: }
260: /* FALLTHROUGH */
261: default:
262: maxl = 1;
263: break;
264: }
265: iendarg = iend = iarg;
266: }
267:
268: /* Decide how to end the argument. */
269:
270: if ((term == '\b' || (term == '\0' && maxl == INT_MAX)) &&
271: buf[iarg] == buf[iesc] && roff_escape(buf, ln, iendarg,
1.7 schwarze 272: &sesc, &snam, &sarg, &sendarg, &send) == ESCAPE_EXPAND)
1.1 schwarze 273: goto out_sub;
274:
275: if (term == '\b') {
1.7 schwarze 276: if ((buf[inam] == 'N' && isdigit((unsigned char)buf[iarg])) ||
277: (buf[inam] == 'h' && strchr(" %&()*+-./0123456789:<=>",
1.1 schwarze 278: buf[iarg]) != NULL)) {
279: iendarg = iend = iarg + 1;
280: rval = ESCAPE_ERROR;
281: goto out;
282: }
283: term = buf[iarg++];
284: } else if (term == '\0' && maxl == INT_MAX) {
1.7 schwarze 285: if (buf[inam] == 'n' && (buf[iarg] == '+' || buf[iarg] == '-'))
1.1 schwarze 286: iarg++;
287: switch (buf[iarg]) {
288: case '(':
289: maxl = 2;
290: iarg++;
291: break;
292: case '[':
293: if (buf[++iarg] == ' ') {
294: iendarg = iend = iarg + 1;
295: rval = ESCAPE_ERROR;
296: goto out;
297: }
298: term = ']';
299: break;
300: default:
301: maxl = 1;
302: break;
303: }
304: }
305:
306: /* Advance to the end of the argument. */
307:
1.5 schwarze 308: valid_A = 1;
1.1 schwarze 309: iendarg = iarg;
310: while (maxl > 0) {
311: if (buf[iendarg] == '\0') {
312: /* Ignore an incomplete argument except for \w. */
1.7 schwarze 313: if (buf[inam] != 'w')
1.1 schwarze 314: iendarg = iarg;
315: break;
316: }
317: if (buf[iendarg] == term) {
318: iend = iendarg + 1;
319: break;
320: }
1.7 schwarze 321: if (buf[inam] == 'N' &&
1.1 schwarze 322: isdigit((unsigned char)buf[iendarg]) == 0) {
323: iend = iendarg + 1;
324: break;
325: }
326: if (buf[iendarg] == buf[iesc]) {
1.5 schwarze 327: switch (roff_escape(buf, ln, iendarg,
1.7 schwarze 328: &sesc, &snam, &sarg, &sendarg, &send)) {
1.5 schwarze 329: case ESCAPE_EXPAND:
1.1 schwarze 330: goto out_sub;
1.5 schwarze 331: case ESCAPE_UNDEF:
332: break;
333: default:
334: valid_A = 0;
335: break;
336: }
1.1 schwarze 337: iendarg = iend = send;
338: } else {
1.5 schwarze 339: if (buf[iendarg] == ' ' || buf[iendarg] == '\t')
340: valid_A = 0;
1.1 schwarze 341: if (maxl != INT_MAX)
342: maxl--;
343: iend = ++iendarg;
344: }
345: }
346: if (resc != NULL && ((maxl != INT_MAX && maxl != 0) ||
347: (term != '\0' && buf[iendarg] != term)))
348: mandoc_msg(MANDOCERR_ESC_BAD, ln, iesc, "%s", buf + iesc);
349:
350: /* Post-process depending on the content of the argument. */
351:
352: argl = iendarg - iarg;
1.7 schwarze 353: switch (buf[inam]) {
1.1 schwarze 354: case '*':
355: if (resc == NULL && argl == 2 &&
356: buf[iarg] == '.' && buf[iarg + 1] == 'T')
357: rval = ESCAPE_DEVICE;
1.5 schwarze 358: break;
359: case 'A':
360: if (valid_A == 0)
361: iendarg = iarg;
1.1 schwarze 362: break;
363: case 'O':
364: switch (buf[iarg]) {
365: case '0':
366: rval = ESCAPE_UNSUPP;
367: break;
368: case '1':
369: case '2':
370: case '3':
371: case '4':
372: rval = argl == 1 ? ESCAPE_IGNORE : ESCAPE_ERROR;
373: break;
374: case '5':
375: rval = buf[iarg - 1] == '[' ? ESCAPE_UNSUPP :
376: ESCAPE_ERROR;
377: break;
378: default:
379: rval = ESCAPE_ERROR;
380: break;
381: }
382: break;
383: default:
384: break;
385: }
386:
387: switch (rval) {
388: case ESCAPE_FONT:
389: rval = mandoc_font(buf + iarg, argl);
390: break;
391:
392: case ESCAPE_SPECIAL:
393:
394: /*
395: * The file chars.c only provides one common list of
396: * character names, but \[-] == \- is the only one of
397: * the characters with one-byte names that allows
398: * enclosing the name in brackets.
399: */
400:
401: if (term != '\0' && argl == 1 && buf[iarg] != '-') {
402: rval = ESCAPE_ERROR;
403: break;
404: }
405:
406: /* Treat \[char...] as an alias for \N'...'. */
407:
408: if (buf[iarg] == 'c') {
409: if (argl < 6 || argl > 7 ||
410: strncmp(buf + iarg, "char", 4) != 0 ||
411: (int)strspn(buf + iarg + 4, "0123456789")
412: + 4 < argl)
413: break;
414: c = 0;
415: for (i = iarg; i < iendarg; i++)
416: c = 10 * c + (buf[i] - '0');
417: if (c < 0x21 || (c > 0x7e && c < 0xa0) || c > 0xff)
418: break;
419: iarg += 4;
420: rval = ESCAPE_NUMBERED;
421: break;
422: }
423:
424: /*
425: * Unicode escapes are defined in groff as \[u0000]
426: * to \[u10FFFF], where the contained value must be
427: * a valid Unicode codepoint. Here, however, only
428: * check the length and range.
429: */
430:
431: if (buf[iarg] != 'u' || argl < 5 || argl > 7)
432: break;
433: if (argl == 7 &&
434: (buf[iarg + 1] != '1' || buf[iarg + 2] != '0'))
435: break;
436: if (argl == 6 && buf[iarg + 1] == '0')
437: break;
438: if (argl == 5 && buf[iarg + 1] == 'D' &&
439: strchr("89ABCDEF", buf[iarg + 2]) != NULL)
440: break;
441: if ((int)strspn(buf + iarg + 1, "0123456789ABCDEFabcdef")
442: + 1 == argl)
443: rval = ESCAPE_UNICODE;
444: break;
445: default:
446: break;
447: }
448: goto out;
449:
450: out_sub:
451: iesc = sesc;
1.7 schwarze 452: inam = snam;
1.1 schwarze 453: iarg = sarg;
454: iendarg = sendarg;
455: iend = send;
456: rval = ESCAPE_EXPAND;
457:
458: out:
1.7 schwarze 459: if (rnam != NULL)
460: *rnam = inam;
1.1 schwarze 461: if (rarg != NULL)
462: *rarg = iarg;
463: if (rendarg != NULL)
464: *rendarg = iendarg;
465: if (rend != NULL)
466: *rend = iend;
467: if (resc == NULL)
468: return rval;
469:
470: /*
471: * Diagnostic messages are only issued when called
472: * from the parser, not when called from the formatters.
473: */
474:
475: *resc = iesc;
476: switch (rval) {
477: case ESCAPE_ERROR:
478: err = MANDOCERR_ESC_BAD;
479: break;
480: case ESCAPE_UNSUPP:
481: err = MANDOCERR_ESC_UNSUPP;
482: break;
483: case ESCAPE_UNDEF:
1.8 ! schwarze 484: if (buf[inam] == '\\' || buf[inam] == '.')
1.1 schwarze 485: return rval;
486: err = MANDOCERR_ESC_UNDEF;
487: break;
488: case ESCAPE_SPECIAL:
489: if (mchars_spec2cp(buf + iarg, argl) >= 0)
490: return rval;
491: err = MANDOCERR_ESC_BAD;
492: break;
493: default:
494: return rval;
495: }
496: mandoc_msg(err, ln, iesc, "%.*s", iend - iesc, buf + iesc);
497: return rval;
498: }
CVSweb