Annotation of mandoc/roff_escape.c, Revision 1.5
1.1 schwarze 1: /* $OpenBSD$ */
2: /*
3: * Copyright (c) 2011, 2012, 2013, 2014, 2015, 2017, 2018, 2020, 2022
4: * Ingo Schwarze <schwarze@openbsd.org>
5: * Copyright (c) 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv>
6: *
7: * Permission to use, copy, modify, and distribute this software for any
8: * purpose with or without fee is hereby granted, provided that the above
9: * copyright notice and this permission notice appear in all copies.
10: *
11: * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
12: * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
13: * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
14: * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
15: * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
16: * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
17: * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
18: *
19: * Parser for roff(7) escape sequences.
20: * To be used by all mandoc(1) parsers and formatters.
21: */
22: #include <assert.h>
23: #include <ctype.h>
24: #include <limits.h>
25: #include <stdio.h>
26: #include <string.h>
27:
28: #include "mandoc.h"
29: #include "roff.h"
30: #include "roff_int.h"
31:
32: /*
33: * Traditional escape sequence interpreter for general use
34: * including in high-level formatters. This function does not issue
35: * diagnostics and is not usable for expansion in the roff(7) parser.
36: * It is documented in the mandoc_escape(3) manual page.
37: */
38: enum mandoc_esc
39: mandoc_escape(const char **rendarg, const char **rarg, int *rargl)
40: {
41: int iarg, iendarg, iend;
42: enum mandoc_esc rval;
43:
44: rval = roff_escape(--*rendarg, 0, 0, NULL, &iarg, &iendarg, &iend);
45: assert(rval != ESCAPE_EXPAND);
46: if (rarg != NULL)
47: *rarg = *rendarg + iarg;
48: if (rargl != NULL)
49: *rargl = iendarg - iarg;
50: *rendarg += iend;
51: return rval;
52: }
53:
54: /*
55: * Full-featured escape sequence parser.
56: * If it encounters a nested escape sequence that requires expansion
57: * by the parser and re-parsing, the positions of that inner escape
58: * sequence are returned in *resc ... *rend.
59: * Otherwise, *resc is set to aesc and the positions of the escape
60: * sequence starting at aesc are returned.
61: * Diagnostic messages are generated if and only if resc != NULL,
62: * that is, if and only if called by roff_expand().
63: */
64: enum mandoc_esc
65: roff_escape(const char *buf, const int ln, const int aesc,
66: int *resc, int *rarg, int *rendarg, int *rend)
67: {
68: int iesc; /* index of leading escape char */
69: int iarg; /* index beginning the argument */
70: int iendarg; /* index right after the argument */
71: int iend; /* index right after the sequence */
72: int sesc, sarg, sendarg, send; /* for sub-escape */
73: int maxl; /* expected length of the argument */
74: int argl; /* actual length of the argument */
75: int c, i; /* for \[char...] parsing */
1.5 ! schwarze 76: int valid_A; /* for \A parsing */
1.1 schwarze 77: enum mandoc_esc rval; /* return value */
78: enum mandocerr err; /* diagnostic code */
79: char esc_name;
80: char term; /* byte terminating the argument */
81:
82: /*
83: * Treat "\E" just like "\";
84: * it only makes a difference in copy mode.
85: */
86:
87: iesc = iarg = aesc;
88: do {
89: iarg++;
90: } while (buf[iarg] == 'E');
91:
92: /*
93: * Sort the following cases first by syntax category,
94: * then by escape sequence type, and finally by ASCII code.
95: */
96:
97: esc_name = buf[iarg];
98: iendarg = iend = ++iarg;
99: maxl = INT_MAX;
100: term = '\0';
101: switch (esc_name) {
102:
103: /* Escape sequences taking no arguments at all. */
104:
105: case '!':
106: case '?':
1.2 schwarze 107: case 'r':
1.1 schwarze 108: rval = ESCAPE_UNSUPP;
109: goto out;
110:
111: case '%':
112: case '&':
113: case ')':
114: case ',':
115: case '/':
116: case '^':
117: case 'a':
118: case 'd':
119: case 't':
120: case 'u':
121: case '{':
122: case '|':
123: case '}':
124: rval = ESCAPE_IGNORE;
125: goto out;
126:
127: case '\\':
128: default:
129: iarg--;
130: rval = ESCAPE_UNDEF;
131: goto out;
132:
133: case ' ':
134: case '\'':
135: case '-':
136: case '.':
137: case '0':
138: case ':':
139: case '_':
140: case '`':
141: case 'e':
142: case '~':
143: iarg--;
144: argl = 1;
145: rval = ESCAPE_SPECIAL;
146: goto out;
147: case 'p':
148: rval = ESCAPE_BREAK;
149: goto out;
150: case 'c':
151: rval = ESCAPE_NOSPACE;
152: goto out;
153: case 'z':
154: rval = ESCAPE_SKIPCHAR;
155: goto out;
156:
157: /* Standard argument format. */
158:
159: case '$':
160: case '*':
1.3 schwarze 161: case 'V':
1.4 schwarze 162: case 'g':
1.1 schwarze 163: case 'n':
164: rval = ESCAPE_EXPAND;
165: break;
166: case 'F':
167: case 'M':
168: case 'O':
169: case 'Y':
170: case 'k':
171: case 'm':
172: rval = ESCAPE_IGNORE;
173: break;
174: case '(':
175: case '[':
176: rval = ESCAPE_SPECIAL;
177: iendarg = iend = --iarg;
178: break;
179: case 'f':
180: rval = ESCAPE_FONT;
181: break;
182:
183: /* Quoted arguments */
184:
1.5 ! schwarze 185: case 'A':
1.1 schwarze 186: case 'B':
187: case 'w':
188: rval = ESCAPE_EXPAND;
189: term = '\b';
190: break;
191: case 'D':
192: case 'H':
193: case 'L':
194: case 'R':
195: case 'S':
196: case 'X':
197: case 'Z':
198: case 'b':
199: case 'v':
200: case 'x':
201: rval = ESCAPE_IGNORE;
202: term = '\b';
203: break;
204: case 'C':
205: if (buf[iarg] != '\'') {
206: rval = ESCAPE_ERROR;
207: goto out;
208: }
209: rval = ESCAPE_SPECIAL;
210: term = '\b';
211: break;
212: case 'N':
213: rval = ESCAPE_NUMBERED;
214: term = '\b';
215: break;
216: case 'h':
217: rval = ESCAPE_HORIZ;
218: term = '\b';
219: break;
220: case 'l':
221: rval = ESCAPE_HLINE;
222: term = '\b';
223: break;
224: case 'o':
225: rval = ESCAPE_OVERSTRIKE;
226: term = '\b';
227: break;
228:
229: /* Sizes support both forms, with additional peculiarities. */
230:
231: case 's':
232: rval = ESCAPE_IGNORE;
233: if (buf[iarg] == '+' || buf[iarg] == '-'||
234: buf[iarg] == ASCII_HYPH)
235: iarg++;
236: switch (buf[iarg]) {
237: case '(':
238: maxl = 2;
239: iarg++;
240: break;
241: case '[':
242: term = ']';
243: iarg++;
244: break;
245: case '\'':
246: term = '\'';
247: iarg++;
248: break;
249: case '1':
250: case '2':
251: case '3':
252: if (buf[iarg - 1] == 's' &&
253: isdigit((unsigned char)buf[iarg + 1])) {
254: maxl = 2;
255: break;
256: }
257: /* FALLTHROUGH */
258: default:
259: maxl = 1;
260: break;
261: }
262: iendarg = iend = iarg;
263: }
264:
265: /* Decide how to end the argument. */
266:
267: if ((term == '\b' || (term == '\0' && maxl == INT_MAX)) &&
268: buf[iarg] == buf[iesc] && roff_escape(buf, ln, iendarg,
269: &sesc, &sarg, &sendarg, &send) == ESCAPE_EXPAND)
270: goto out_sub;
271:
272: if (term == '\b') {
273: if ((esc_name == 'N' && isdigit((unsigned char)buf[iarg])) ||
274: (esc_name == 'h' && strchr(" %&()*+-./0123456789:<=>",
275: buf[iarg]) != NULL)) {
276: iendarg = iend = iarg + 1;
277: rval = ESCAPE_ERROR;
278: goto out;
279: }
280: term = buf[iarg++];
281: } else if (term == '\0' && maxl == INT_MAX) {
282: if (esc_name == 'n' && (buf[iarg] == '+' || buf[iarg] == '-'))
283: iarg++;
284: switch (buf[iarg]) {
285: case '(':
286: maxl = 2;
287: iarg++;
288: break;
289: case '[':
290: if (buf[++iarg] == ' ') {
291: iendarg = iend = iarg + 1;
292: rval = ESCAPE_ERROR;
293: goto out;
294: }
295: term = ']';
296: break;
297: default:
298: maxl = 1;
299: break;
300: }
301: }
302:
303: /* Advance to the end of the argument. */
304:
1.5 ! schwarze 305: valid_A = 1;
1.1 schwarze 306: iendarg = iarg;
307: while (maxl > 0) {
308: if (buf[iendarg] == '\0') {
309: /* Ignore an incomplete argument except for \w. */
310: if (esc_name != 'w')
311: iendarg = iarg;
312: break;
313: }
314: if (buf[iendarg] == term) {
315: iend = iendarg + 1;
316: break;
317: }
318: if (esc_name == 'N' &&
319: isdigit((unsigned char)buf[iendarg]) == 0) {
320: iend = iendarg + 1;
321: break;
322: }
323: if (buf[iendarg] == buf[iesc]) {
1.5 ! schwarze 324: switch (roff_escape(buf, ln, iendarg,
! 325: &sesc, &sarg, &sendarg, &send)) {
! 326: case ESCAPE_EXPAND:
1.1 schwarze 327: goto out_sub;
1.5 ! schwarze 328: case ESCAPE_UNDEF:
! 329: break;
! 330: default:
! 331: valid_A = 0;
! 332: break;
! 333: }
1.1 schwarze 334: iendarg = iend = send;
335: } else {
1.5 ! schwarze 336: if (buf[iendarg] == ' ' || buf[iendarg] == '\t')
! 337: valid_A = 0;
1.1 schwarze 338: if (maxl != INT_MAX)
339: maxl--;
340: iend = ++iendarg;
341: }
342: }
343: if (resc != NULL && ((maxl != INT_MAX && maxl != 0) ||
344: (term != '\0' && buf[iendarg] != term)))
345: mandoc_msg(MANDOCERR_ESC_BAD, ln, iesc, "%s", buf + iesc);
346:
347: /* Post-process depending on the content of the argument. */
348:
349: argl = iendarg - iarg;
350: switch (esc_name) {
351: case '*':
352: if (resc == NULL && argl == 2 &&
353: buf[iarg] == '.' && buf[iarg + 1] == 'T')
354: rval = ESCAPE_DEVICE;
1.5 ! schwarze 355: break;
! 356: case 'A':
! 357: if (valid_A == 0)
! 358: iendarg = iarg;
1.1 schwarze 359: break;
360: case 'O':
361: switch (buf[iarg]) {
362: case '0':
363: rval = ESCAPE_UNSUPP;
364: break;
365: case '1':
366: case '2':
367: case '3':
368: case '4':
369: rval = argl == 1 ? ESCAPE_IGNORE : ESCAPE_ERROR;
370: break;
371: case '5':
372: rval = buf[iarg - 1] == '[' ? ESCAPE_UNSUPP :
373: ESCAPE_ERROR;
374: break;
375: default:
376: rval = ESCAPE_ERROR;
377: break;
378: }
379: break;
380: default:
381: break;
382: }
383:
384: switch (rval) {
385: case ESCAPE_FONT:
386: rval = mandoc_font(buf + iarg, argl);
387: break;
388:
389: case ESCAPE_SPECIAL:
390:
391: /*
392: * The file chars.c only provides one common list of
393: * character names, but \[-] == \- is the only one of
394: * the characters with one-byte names that allows
395: * enclosing the name in brackets.
396: */
397:
398: if (term != '\0' && argl == 1 && buf[iarg] != '-') {
399: rval = ESCAPE_ERROR;
400: break;
401: }
402:
403: /* Treat \[char...] as an alias for \N'...'. */
404:
405: if (buf[iarg] == 'c') {
406: if (argl < 6 || argl > 7 ||
407: strncmp(buf + iarg, "char", 4) != 0 ||
408: (int)strspn(buf + iarg + 4, "0123456789")
409: + 4 < argl)
410: break;
411: c = 0;
412: for (i = iarg; i < iendarg; i++)
413: c = 10 * c + (buf[i] - '0');
414: if (c < 0x21 || (c > 0x7e && c < 0xa0) || c > 0xff)
415: break;
416: iarg += 4;
417: rval = ESCAPE_NUMBERED;
418: break;
419: }
420:
421: /*
422: * Unicode escapes are defined in groff as \[u0000]
423: * to \[u10FFFF], where the contained value must be
424: * a valid Unicode codepoint. Here, however, only
425: * check the length and range.
426: */
427:
428: if (buf[iarg] != 'u' || argl < 5 || argl > 7)
429: break;
430: if (argl == 7 &&
431: (buf[iarg + 1] != '1' || buf[iarg + 2] != '0'))
432: break;
433: if (argl == 6 && buf[iarg + 1] == '0')
434: break;
435: if (argl == 5 && buf[iarg + 1] == 'D' &&
436: strchr("89ABCDEF", buf[iarg + 2]) != NULL)
437: break;
438: if ((int)strspn(buf + iarg + 1, "0123456789ABCDEFabcdef")
439: + 1 == argl)
440: rval = ESCAPE_UNICODE;
441: break;
442: default:
443: break;
444: }
445: goto out;
446:
447: out_sub:
448: iesc = sesc;
449: iarg = sarg;
450: iendarg = sendarg;
451: iend = send;
452: rval = ESCAPE_EXPAND;
453:
454: out:
455: if (rarg != NULL)
456: *rarg = iarg;
457: if (rendarg != NULL)
458: *rendarg = iendarg;
459: if (rend != NULL)
460: *rend = iend;
461: if (resc == NULL)
462: return rval;
463:
464: /*
465: * Diagnostic messages are only issued when called
466: * from the parser, not when called from the formatters.
467: */
468:
469: *resc = iesc;
470: switch (rval) {
471: case ESCAPE_ERROR:
472: err = MANDOCERR_ESC_BAD;
473: break;
474: case ESCAPE_UNSUPP:
475: err = MANDOCERR_ESC_UNSUPP;
476: break;
477: case ESCAPE_UNDEF:
478: if (esc_name == '\\')
479: return rval;
480: err = MANDOCERR_ESC_UNDEF;
481: break;
482: case ESCAPE_SPECIAL:
483: if (mchars_spec2cp(buf + iarg, argl) >= 0)
484: return rval;
485: err = MANDOCERR_ESC_BAD;
486: break;
487: default:
488: return rval;
489: }
490: mandoc_msg(err, ln, iesc, "%.*s", iend - iesc, buf + iesc);
491: return rval;
492: }
CVSweb