Annotation of mandoc/roff_escape.c, Revision 1.2
1.1 schwarze 1: /* $OpenBSD$ */
2: /*
3: * Copyright (c) 2011, 2012, 2013, 2014, 2015, 2017, 2018, 2020, 2022
4: * Ingo Schwarze <schwarze@openbsd.org>
5: * Copyright (c) 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv>
6: *
7: * Permission to use, copy, modify, and distribute this software for any
8: * purpose with or without fee is hereby granted, provided that the above
9: * copyright notice and this permission notice appear in all copies.
10: *
11: * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
12: * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
13: * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
14: * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
15: * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
16: * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
17: * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
18: *
19: * Parser for roff(7) escape sequences.
20: * To be used by all mandoc(1) parsers and formatters.
21: */
22: #include <assert.h>
23: #include <ctype.h>
24: #include <limits.h>
25: #include <stdio.h>
26: #include <string.h>
27:
28: #include "mandoc.h"
29: #include "roff.h"
30: #include "roff_int.h"
31:
32: /*
33: * Traditional escape sequence interpreter for general use
34: * including in high-level formatters. This function does not issue
35: * diagnostics and is not usable for expansion in the roff(7) parser.
36: * It is documented in the mandoc_escape(3) manual page.
37: */
38: enum mandoc_esc
39: mandoc_escape(const char **rendarg, const char **rarg, int *rargl)
40: {
41: int iarg, iendarg, iend;
42: enum mandoc_esc rval;
43:
44: rval = roff_escape(--*rendarg, 0, 0, NULL, &iarg, &iendarg, &iend);
45: assert(rval != ESCAPE_EXPAND);
46: if (rarg != NULL)
47: *rarg = *rendarg + iarg;
48: if (rargl != NULL)
49: *rargl = iendarg - iarg;
50: *rendarg += iend;
51: return rval;
52: }
53:
54: /*
55: * Full-featured escape sequence parser.
56: * If it encounters a nested escape sequence that requires expansion
57: * by the parser and re-parsing, the positions of that inner escape
58: * sequence are returned in *resc ... *rend.
59: * Otherwise, *resc is set to aesc and the positions of the escape
60: * sequence starting at aesc are returned.
61: * Diagnostic messages are generated if and only if resc != NULL,
62: * that is, if and only if called by roff_expand().
63: */
64: enum mandoc_esc
65: roff_escape(const char *buf, const int ln, const int aesc,
66: int *resc, int *rarg, int *rendarg, int *rend)
67: {
68: int iesc; /* index of leading escape char */
69: int iarg; /* index beginning the argument */
70: int iendarg; /* index right after the argument */
71: int iend; /* index right after the sequence */
72: int sesc, sarg, sendarg, send; /* for sub-escape */
73: int maxl; /* expected length of the argument */
74: int argl; /* actual length of the argument */
75: int c, i; /* for \[char...] parsing */
76: enum mandoc_esc rval; /* return value */
77: enum mandocerr err; /* diagnostic code */
78: char esc_name;
79: char term; /* byte terminating the argument */
80:
81: /*
82: * Treat "\E" just like "\";
83: * it only makes a difference in copy mode.
84: */
85:
86: iesc = iarg = aesc;
87: do {
88: iarg++;
89: } while (buf[iarg] == 'E');
90:
91: /*
92: * Sort the following cases first by syntax category,
93: * then by escape sequence type, and finally by ASCII code.
94: */
95:
96: esc_name = buf[iarg];
97: iendarg = iend = ++iarg;
98: maxl = INT_MAX;
99: term = '\0';
100: switch (esc_name) {
101:
102: /* Escape sequences taking no arguments at all. */
103:
104: case '!':
105: case '?':
1.2 ! schwarze 106: case 'r':
1.1 schwarze 107: rval = ESCAPE_UNSUPP;
108: goto out;
109:
110: case '%':
111: case '&':
112: case ')':
113: case ',':
114: case '/':
115: case '^':
116: case 'a':
117: case 'd':
118: case 't':
119: case 'u':
120: case '{':
121: case '|':
122: case '}':
123: rval = ESCAPE_IGNORE;
124: goto out;
125:
126: case '\\':
127: default:
128: iarg--;
129: rval = ESCAPE_UNDEF;
130: goto out;
131:
132: case ' ':
133: case '\'':
134: case '-':
135: case '.':
136: case '0':
137: case ':':
138: case '_':
139: case '`':
140: case 'e':
141: case '~':
142: iarg--;
143: argl = 1;
144: rval = ESCAPE_SPECIAL;
145: goto out;
146: case 'p':
147: rval = ESCAPE_BREAK;
148: goto out;
149: case 'c':
150: rval = ESCAPE_NOSPACE;
151: goto out;
152: case 'z':
153: rval = ESCAPE_SKIPCHAR;
154: goto out;
155:
156: /* Standard argument format. */
157:
158: case '$':
159: case '*':
160: case 'n':
161: rval = ESCAPE_EXPAND;
162: break;
163: case 'F':
164: case 'M':
165: case 'O':
166: case 'V':
167: case 'Y':
168: case 'g':
169: case 'k':
170: case 'm':
171: rval = ESCAPE_IGNORE;
172: break;
173: case '(':
174: case '[':
175: rval = ESCAPE_SPECIAL;
176: iendarg = iend = --iarg;
177: break;
178: case 'f':
179: rval = ESCAPE_FONT;
180: break;
181:
182: /* Quoted arguments */
183:
184: case 'B':
185: case 'w':
186: rval = ESCAPE_EXPAND;
187: term = '\b';
188: break;
189: case 'A':
190: case 'D':
191: case 'H':
192: case 'L':
193: case 'R':
194: case 'S':
195: case 'X':
196: case 'Z':
197: case 'b':
198: case 'v':
199: case 'x':
200: rval = ESCAPE_IGNORE;
201: term = '\b';
202: break;
203: case 'C':
204: if (buf[iarg] != '\'') {
205: rval = ESCAPE_ERROR;
206: goto out;
207: }
208: rval = ESCAPE_SPECIAL;
209: term = '\b';
210: break;
211: case 'N':
212: rval = ESCAPE_NUMBERED;
213: term = '\b';
214: break;
215: case 'h':
216: rval = ESCAPE_HORIZ;
217: term = '\b';
218: break;
219: case 'l':
220: rval = ESCAPE_HLINE;
221: term = '\b';
222: break;
223: case 'o':
224: rval = ESCAPE_OVERSTRIKE;
225: term = '\b';
226: break;
227:
228: /* Sizes support both forms, with additional peculiarities. */
229:
230: case 's':
231: rval = ESCAPE_IGNORE;
232: if (buf[iarg] == '+' || buf[iarg] == '-'||
233: buf[iarg] == ASCII_HYPH)
234: iarg++;
235: switch (buf[iarg]) {
236: case '(':
237: maxl = 2;
238: iarg++;
239: break;
240: case '[':
241: term = ']';
242: iarg++;
243: break;
244: case '\'':
245: term = '\'';
246: iarg++;
247: break;
248: case '1':
249: case '2':
250: case '3':
251: if (buf[iarg - 1] == 's' &&
252: isdigit((unsigned char)buf[iarg + 1])) {
253: maxl = 2;
254: break;
255: }
256: /* FALLTHROUGH */
257: default:
258: maxl = 1;
259: break;
260: }
261: iendarg = iend = iarg;
262: }
263:
264: /* Decide how to end the argument. */
265:
266: if ((term == '\b' || (term == '\0' && maxl == INT_MAX)) &&
267: buf[iarg] == buf[iesc] && roff_escape(buf, ln, iendarg,
268: &sesc, &sarg, &sendarg, &send) == ESCAPE_EXPAND)
269: goto out_sub;
270:
271: if (term == '\b') {
272: if ((esc_name == 'N' && isdigit((unsigned char)buf[iarg])) ||
273: (esc_name == 'h' && strchr(" %&()*+-./0123456789:<=>",
274: buf[iarg]) != NULL)) {
275: iendarg = iend = iarg + 1;
276: rval = ESCAPE_ERROR;
277: goto out;
278: }
279: term = buf[iarg++];
280: } else if (term == '\0' && maxl == INT_MAX) {
281: if (esc_name == 'n' && (buf[iarg] == '+' || buf[iarg] == '-'))
282: iarg++;
283: switch (buf[iarg]) {
284: case '(':
285: maxl = 2;
286: iarg++;
287: break;
288: case '[':
289: if (buf[++iarg] == ' ') {
290: iendarg = iend = iarg + 1;
291: rval = ESCAPE_ERROR;
292: goto out;
293: }
294: term = ']';
295: break;
296: default:
297: maxl = 1;
298: break;
299: }
300: }
301:
302: /* Advance to the end of the argument. */
303:
304: iendarg = iarg;
305: while (maxl > 0) {
306: if (buf[iendarg] == '\0') {
307: /* Ignore an incomplete argument except for \w. */
308: if (esc_name != 'w')
309: iendarg = iarg;
310: break;
311: }
312: if (buf[iendarg] == term) {
313: iend = iendarg + 1;
314: break;
315: }
316: if (esc_name == 'N' &&
317: isdigit((unsigned char)buf[iendarg]) == 0) {
318: iend = iendarg + 1;
319: break;
320: }
321: if (buf[iendarg] == buf[iesc]) {
322: if (roff_escape(buf, ln, iendarg,
323: &sesc, &sarg, &sendarg, &send) == ESCAPE_EXPAND)
324: goto out_sub;
325: iendarg = iend = send;
326: } else {
327: if (maxl != INT_MAX)
328: maxl--;
329: iend = ++iendarg;
330: }
331: }
332: if (resc != NULL && ((maxl != INT_MAX && maxl != 0) ||
333: (term != '\0' && buf[iendarg] != term)))
334: mandoc_msg(MANDOCERR_ESC_BAD, ln, iesc, "%s", buf + iesc);
335:
336: /* Post-process depending on the content of the argument. */
337:
338: argl = iendarg - iarg;
339: switch (esc_name) {
340: case '*':
341: if (resc == NULL && argl == 2 &&
342: buf[iarg] == '.' && buf[iarg + 1] == 'T')
343: rval = ESCAPE_DEVICE;
344: break;
345: case 'O':
346: switch (buf[iarg]) {
347: case '0':
348: rval = ESCAPE_UNSUPP;
349: break;
350: case '1':
351: case '2':
352: case '3':
353: case '4':
354: rval = argl == 1 ? ESCAPE_IGNORE : ESCAPE_ERROR;
355: break;
356: case '5':
357: rval = buf[iarg - 1] == '[' ? ESCAPE_UNSUPP :
358: ESCAPE_ERROR;
359: break;
360: default:
361: rval = ESCAPE_ERROR;
362: break;
363: }
364: break;
365: default:
366: break;
367: }
368:
369: switch (rval) {
370: case ESCAPE_FONT:
371: rval = mandoc_font(buf + iarg, argl);
372: break;
373:
374: case ESCAPE_SPECIAL:
375:
376: /*
377: * The file chars.c only provides one common list of
378: * character names, but \[-] == \- is the only one of
379: * the characters with one-byte names that allows
380: * enclosing the name in brackets.
381: */
382:
383: if (term != '\0' && argl == 1 && buf[iarg] != '-') {
384: rval = ESCAPE_ERROR;
385: break;
386: }
387:
388: /* Treat \[char...] as an alias for \N'...'. */
389:
390: if (buf[iarg] == 'c') {
391: if (argl < 6 || argl > 7 ||
392: strncmp(buf + iarg, "char", 4) != 0 ||
393: (int)strspn(buf + iarg + 4, "0123456789")
394: + 4 < argl)
395: break;
396: c = 0;
397: for (i = iarg; i < iendarg; i++)
398: c = 10 * c + (buf[i] - '0');
399: if (c < 0x21 || (c > 0x7e && c < 0xa0) || c > 0xff)
400: break;
401: iarg += 4;
402: rval = ESCAPE_NUMBERED;
403: break;
404: }
405:
406: /*
407: * Unicode escapes are defined in groff as \[u0000]
408: * to \[u10FFFF], where the contained value must be
409: * a valid Unicode codepoint. Here, however, only
410: * check the length and range.
411: */
412:
413: if (buf[iarg] != 'u' || argl < 5 || argl > 7)
414: break;
415: if (argl == 7 &&
416: (buf[iarg + 1] != '1' || buf[iarg + 2] != '0'))
417: break;
418: if (argl == 6 && buf[iarg + 1] == '0')
419: break;
420: if (argl == 5 && buf[iarg + 1] == 'D' &&
421: strchr("89ABCDEF", buf[iarg + 2]) != NULL)
422: break;
423: if ((int)strspn(buf + iarg + 1, "0123456789ABCDEFabcdef")
424: + 1 == argl)
425: rval = ESCAPE_UNICODE;
426: break;
427: default:
428: break;
429: }
430: goto out;
431:
432: out_sub:
433: iesc = sesc;
434: iarg = sarg;
435: iendarg = sendarg;
436: iend = send;
437: rval = ESCAPE_EXPAND;
438:
439: out:
440: if (rarg != NULL)
441: *rarg = iarg;
442: if (rendarg != NULL)
443: *rendarg = iendarg;
444: if (rend != NULL)
445: *rend = iend;
446: if (resc == NULL)
447: return rval;
448:
449: /*
450: * Diagnostic messages are only issued when called
451: * from the parser, not when called from the formatters.
452: */
453:
454: *resc = iesc;
455: switch (rval) {
456: case ESCAPE_ERROR:
457: err = MANDOCERR_ESC_BAD;
458: break;
459: case ESCAPE_UNSUPP:
460: err = MANDOCERR_ESC_UNSUPP;
461: break;
462: case ESCAPE_UNDEF:
463: if (esc_name == '\\')
464: return rval;
465: err = MANDOCERR_ESC_UNDEF;
466: break;
467: case ESCAPE_SPECIAL:
468: if (mchars_spec2cp(buf + iarg, argl) >= 0)
469: return rval;
470: err = MANDOCERR_ESC_BAD;
471: break;
472: default:
473: return rval;
474: }
475: mandoc_msg(err, ln, iesc, "%.*s", iend - iesc, buf + iesc);
476: return rval;
477: }
CVSweb