Annotation of mandoc/roff_escape.c, Revision 1.13
1.13 ! schwarze 1: /* $Id: roff_escape.c,v 1.12 2022/06/06 19:23:13 schwarze Exp $ */
1.1 schwarze 2: /*
3: * Copyright (c) 2011, 2012, 2013, 2014, 2015, 2017, 2018, 2020, 2022
4: * Ingo Schwarze <schwarze@openbsd.org>
5: * Copyright (c) 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv>
6: *
7: * Permission to use, copy, modify, and distribute this software for any
8: * purpose with or without fee is hereby granted, provided that the above
9: * copyright notice and this permission notice appear in all copies.
10: *
11: * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
12: * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
13: * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
14: * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
15: * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
16: * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
17: * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
18: *
19: * Parser for roff(7) escape sequences.
20: * To be used by all mandoc(1) parsers and formatters.
21: */
22: #include <assert.h>
23: #include <ctype.h>
24: #include <limits.h>
25: #include <stdio.h>
26: #include <string.h>
27:
28: #include "mandoc.h"
29: #include "roff.h"
30: #include "roff_int.h"
31:
32: /*
33: * Traditional escape sequence interpreter for general use
34: * including in high-level formatters. This function does not issue
35: * diagnostics and is not usable for expansion in the roff(7) parser.
36: * It is documented in the mandoc_escape(3) manual page.
37: */
38: enum mandoc_esc
39: mandoc_escape(const char **rendarg, const char **rarg, int *rargl)
40: {
41: int iarg, iendarg, iend;
42: enum mandoc_esc rval;
43:
1.7 schwarze 44: rval = roff_escape(--*rendarg, 0, 0,
45: NULL, NULL, &iarg, &iendarg, &iend);
1.1 schwarze 46: assert(rval != ESCAPE_EXPAND);
47: if (rarg != NULL)
48: *rarg = *rendarg + iarg;
49: if (rargl != NULL)
50: *rargl = iendarg - iarg;
51: *rendarg += iend;
52: return rval;
53: }
54:
55: /*
56: * Full-featured escape sequence parser.
57: * If it encounters a nested escape sequence that requires expansion
58: * by the parser and re-parsing, the positions of that inner escape
59: * sequence are returned in *resc ... *rend.
60: * Otherwise, *resc is set to aesc and the positions of the escape
61: * sequence starting at aesc are returned.
62: * Diagnostic messages are generated if and only if resc != NULL,
63: * that is, if and only if called by roff_expand().
64: */
65: enum mandoc_esc
66: roff_escape(const char *buf, const int ln, const int aesc,
1.7 schwarze 67: int *resc, int *rnam, int *rarg, int *rendarg, int *rend)
1.1 schwarze 68: {
69: int iesc; /* index of leading escape char */
1.7 schwarze 70: int inam; /* index of escape name */
1.1 schwarze 71: int iarg; /* index beginning the argument */
72: int iendarg; /* index right after the argument */
73: int iend; /* index right after the sequence */
1.7 schwarze 74: int sesc, snam, sarg, sendarg, send; /* for sub-escape */
1.1 schwarze 75: int maxl; /* expected length of the argument */
76: int argl; /* actual length of the argument */
77: int c, i; /* for \[char...] parsing */
1.5 schwarze 78: int valid_A; /* for \A parsing */
1.1 schwarze 79: enum mandoc_esc rval; /* return value */
80: enum mandocerr err; /* diagnostic code */
81: char term; /* byte terminating the argument */
82:
83: /*
84: * Treat "\E" just like "\";
85: * it only makes a difference in copy mode.
86: */
87:
1.7 schwarze 88: iesc = inam = aesc;
1.1 schwarze 89: do {
1.7 schwarze 90: inam++;
91: } while (buf[inam] == 'E');
1.1 schwarze 92:
93: /*
94: * Sort the following cases first by syntax category,
95: * then by escape sequence type, and finally by ASCII code.
96: */
97:
1.7 schwarze 98: iarg = iendarg = iend = inam + 1;
1.1 schwarze 99: maxl = INT_MAX;
100: term = '\0';
1.9 schwarze 101: err = MANDOCERR_OK;
1.7 schwarze 102: switch (buf[inam]) {
1.1 schwarze 103:
104: /* Escape sequences taking no arguments at all. */
105:
106: case '!':
107: case '?':
1.2 schwarze 108: case 'r':
1.1 schwarze 109: rval = ESCAPE_UNSUPP;
110: goto out;
111:
112: case '%':
113: case '&':
114: case ')':
115: case ',':
116: case '/':
117: case '^':
118: case 'a':
119: case 'd':
120: case 't':
121: case 'u':
122: case '{':
123: case '|':
124: case '}':
125: rval = ESCAPE_IGNORE;
126: goto out;
127:
1.6 schwarze 128: case '\0':
129: iendarg = --iend;
130: /* FALLTHROUGH */
1.8 schwarze 131: case '.':
1.1 schwarze 132: case '\\':
133: default:
134: iarg--;
135: rval = ESCAPE_UNDEF;
136: goto out;
137:
138: case ' ':
139: case '\'':
140: case '-':
141: case '0':
142: case ':':
143: case '_':
144: case '`':
145: case 'e':
146: case '~':
147: iarg--;
148: argl = 1;
149: rval = ESCAPE_SPECIAL;
150: goto out;
151: case 'p':
152: rval = ESCAPE_BREAK;
153: goto out;
154: case 'c':
155: rval = ESCAPE_NOSPACE;
156: goto out;
157: case 'z':
158: rval = ESCAPE_SKIPCHAR;
159: goto out;
160:
161: /* Standard argument format. */
162:
163: case '$':
164: case '*':
1.3 schwarze 165: case 'V':
1.4 schwarze 166: case 'g':
1.1 schwarze 167: case 'n':
168: rval = ESCAPE_EXPAND;
169: break;
170: case 'F':
171: case 'M':
172: case 'O':
173: case 'Y':
174: case 'k':
175: case 'm':
176: rval = ESCAPE_IGNORE;
177: break;
178: case '(':
179: case '[':
180: rval = ESCAPE_SPECIAL;
181: iendarg = iend = --iarg;
182: break;
183: case 'f':
184: rval = ESCAPE_FONT;
185: break;
186:
187: /* Quoted arguments */
188:
1.5 schwarze 189: case 'A':
1.1 schwarze 190: case 'B':
191: case 'w':
192: rval = ESCAPE_EXPAND;
193: term = '\b';
194: break;
195: case 'D':
196: case 'H':
197: case 'L':
198: case 'R':
199: case 'S':
200: case 'X':
201: case 'Z':
202: case 'b':
203: case 'v':
204: case 'x':
205: rval = ESCAPE_IGNORE;
206: term = '\b';
207: break;
208: case 'C':
209: rval = ESCAPE_SPECIAL;
210: term = '\b';
211: break;
212: case 'N':
213: rval = ESCAPE_NUMBERED;
214: term = '\b';
215: break;
216: case 'h':
217: rval = ESCAPE_HORIZ;
218: term = '\b';
219: break;
220: case 'l':
221: rval = ESCAPE_HLINE;
222: term = '\b';
223: break;
224: case 'o':
225: rval = ESCAPE_OVERSTRIKE;
226: term = '\b';
227: break;
228:
229: /* Sizes support both forms, with additional peculiarities. */
230:
231: case 's':
232: rval = ESCAPE_IGNORE;
233: if (buf[iarg] == '+' || buf[iarg] == '-'||
234: buf[iarg] == ASCII_HYPH)
235: iarg++;
236: switch (buf[iarg]) {
237: case '(':
238: maxl = 2;
239: iarg++;
240: break;
241: case '[':
242: term = ']';
243: iarg++;
244: break;
245: case '\'':
246: term = '\'';
247: iarg++;
248: break;
249: case '1':
250: case '2':
251: case '3':
252: if (buf[iarg - 1] == 's' &&
253: isdigit((unsigned char)buf[iarg + 1])) {
254: maxl = 2;
255: break;
256: }
257: /* FALLTHROUGH */
258: default:
259: maxl = 1;
260: break;
261: }
262: iendarg = iend = iarg;
263: }
264:
265: /* Decide how to end the argument. */
266:
267: if ((term == '\b' || (term == '\0' && maxl == INT_MAX)) &&
268: buf[iarg] == buf[iesc] && roff_escape(buf, ln, iendarg,
1.7 schwarze 269: &sesc, &snam, &sarg, &sendarg, &send) == ESCAPE_EXPAND)
1.1 schwarze 270: goto out_sub;
271:
272: if (term == '\b') {
1.12 schwarze 273: if (strchr("BDHLRSvxNhl", buf[inam]) != NULL &&
274: strchr(" %&()*+-./0123456789:<=>", buf[iarg]) != NULL) {
1.13 ! schwarze 275: err = MANDOCERR_ESC_DELIM;
1.12 schwarze 276: if (rval != ESCAPE_EXPAND)
277: rval = ESCAPE_ERROR;
278: if (buf[inam] != 'D') {
279: iendarg = iend = iarg + 1;
280: goto out;
281: }
1.1 schwarze 282: }
283: term = buf[iarg++];
284: } else if (term == '\0' && maxl == INT_MAX) {
1.7 schwarze 285: if (buf[inam] == 'n' && (buf[iarg] == '+' || buf[iarg] == '-'))
1.1 schwarze 286: iarg++;
287: switch (buf[iarg]) {
288: case '(':
289: maxl = 2;
290: iarg++;
291: break;
292: case '[':
293: if (buf[++iarg] == ' ') {
294: iendarg = iend = iarg + 1;
1.13 ! schwarze 295: err = MANDOCERR_ESC_ARG;
1.1 schwarze 296: rval = ESCAPE_ERROR;
297: goto out;
298: }
299: term = ']';
300: break;
301: default:
302: maxl = 1;
303: break;
304: }
305: }
306:
307: /* Advance to the end of the argument. */
308:
1.5 schwarze 309: valid_A = 1;
1.1 schwarze 310: iendarg = iarg;
311: while (maxl > 0) {
312: if (buf[iendarg] == '\0') {
1.10 schwarze 313: err = MANDOCERR_ESC_INCOMPLETE;
314: if (rval != ESCAPE_EXPAND)
315: rval = ESCAPE_ERROR;
1.1 schwarze 316: /* Ignore an incomplete argument except for \w. */
1.7 schwarze 317: if (buf[inam] != 'w')
1.1 schwarze 318: iendarg = iarg;
319: break;
320: }
321: if (buf[iendarg] == term) {
322: iend = iendarg + 1;
323: break;
324: }
1.7 schwarze 325: if (buf[inam] == 'N' &&
1.1 schwarze 326: isdigit((unsigned char)buf[iendarg]) == 0) {
327: iend = iendarg + 1;
328: break;
329: }
330: if (buf[iendarg] == buf[iesc]) {
1.5 schwarze 331: switch (roff_escape(buf, ln, iendarg,
1.7 schwarze 332: &sesc, &snam, &sarg, &sendarg, &send)) {
1.5 schwarze 333: case ESCAPE_EXPAND:
1.1 schwarze 334: goto out_sub;
1.5 schwarze 335: case ESCAPE_UNDEF:
336: break;
337: default:
338: valid_A = 0;
339: break;
340: }
1.1 schwarze 341: iendarg = iend = send;
342: } else {
1.5 schwarze 343: if (buf[iendarg] == ' ' || buf[iendarg] == '\t')
344: valid_A = 0;
1.1 schwarze 345: if (maxl != INT_MAX)
346: maxl--;
347: iend = ++iendarg;
348: }
349: }
350:
351: /* Post-process depending on the content of the argument. */
352:
353: argl = iendarg - iarg;
1.7 schwarze 354: switch (buf[inam]) {
1.1 schwarze 355: case '*':
356: if (resc == NULL && argl == 2 &&
357: buf[iarg] == '.' && buf[iarg + 1] == 'T')
358: rval = ESCAPE_DEVICE;
1.5 schwarze 359: break;
360: case 'A':
361: if (valid_A == 0)
362: iendarg = iarg;
1.1 schwarze 363: break;
364: case 'O':
365: switch (buf[iarg]) {
366: case '0':
367: rval = ESCAPE_UNSUPP;
368: break;
369: case '1':
370: case '2':
371: case '3':
372: case '4':
1.13 ! schwarze 373: if (argl == 1)
! 374: rval = ESCAPE_IGNORE;
! 375: else {
! 376: err = MANDOCERR_ESC_ARG;
! 377: rval = ESCAPE_ERROR;
! 378: }
1.1 schwarze 379: break;
380: case '5':
1.13 ! schwarze 381: if (buf[iarg - 1] == '[')
! 382: rval = ESCAPE_UNSUPP;
! 383: else {
! 384: err = MANDOCERR_ESC_ARG;
! 385: rval = ESCAPE_ERROR;
! 386: }
1.1 schwarze 387: break;
388: default:
1.13 ! schwarze 389: err = MANDOCERR_ESC_ARG;
1.1 schwarze 390: rval = ESCAPE_ERROR;
391: break;
392: }
393: break;
394: default:
395: break;
396: }
397:
398: switch (rval) {
399: case ESCAPE_FONT:
400: rval = mandoc_font(buf + iarg, argl);
1.13 ! schwarze 401: if (rval == ESCAPE_ERROR)
! 402: err = MANDOCERR_ESC_ARG;
1.1 schwarze 403: break;
404:
405: case ESCAPE_SPECIAL:
1.11 schwarze 406: if (argl == 0) {
407: err = MANDOCERR_ESC_BADCHAR;
408: rval = ESCAPE_ERROR;
409: break;
410: }
1.1 schwarze 411:
412: /*
413: * The file chars.c only provides one common list of
414: * character names, but \[-] == \- is the only one of
415: * the characters with one-byte names that allows
416: * enclosing the name in brackets.
417: */
418:
419: if (term != '\0' && argl == 1 && buf[iarg] != '-') {
1.10 schwarze 420: err = MANDOCERR_ESC_BADCHAR;
1.1 schwarze 421: rval = ESCAPE_ERROR;
422: break;
423: }
424:
425: /* Treat \[char...] as an alias for \N'...'. */
426:
427: if (buf[iarg] == 'c') {
428: if (argl < 6 || argl > 7 ||
429: strncmp(buf + iarg, "char", 4) != 0 ||
430: (int)strspn(buf + iarg + 4, "0123456789")
431: + 4 < argl)
432: break;
433: c = 0;
434: for (i = iarg; i < iendarg; i++)
435: c = 10 * c + (buf[i] - '0');
1.10 schwarze 436: if (c < 0x21 || (c > 0x7e && c < 0xa0) || c > 0xff) {
437: err = MANDOCERR_ESC_BADCHAR;
1.1 schwarze 438: break;
1.10 schwarze 439: }
1.1 schwarze 440: iarg += 4;
441: rval = ESCAPE_NUMBERED;
442: break;
443: }
444:
445: /*
446: * Unicode escapes are defined in groff as \[u0000]
447: * to \[u10FFFF], where the contained value must be
448: * a valid Unicode codepoint. Here, however, only
449: * check the length and range.
450: */
451:
452: if (buf[iarg] != 'u' || argl < 5 || argl > 7)
453: break;
454: if (argl == 7 &&
1.10 schwarze 455: (buf[iarg + 1] != '1' || buf[iarg + 2] != '0')) {
456: err = MANDOCERR_ESC_BADCHAR;
1.1 schwarze 457: break;
1.10 schwarze 458: }
459: if (argl == 6 && buf[iarg + 1] == '0') {
460: err = MANDOCERR_ESC_BADCHAR;
1.1 schwarze 461: break;
1.10 schwarze 462: }
1.1 schwarze 463: if (argl == 5 && buf[iarg + 1] == 'D' &&
1.10 schwarze 464: strchr("89ABCDEF", buf[iarg + 2]) != NULL) {
465: err = MANDOCERR_ESC_BADCHAR;
1.1 schwarze 466: break;
1.10 schwarze 467: }
1.1 schwarze 468: if ((int)strspn(buf + iarg + 1, "0123456789ABCDEFabcdef")
469: + 1 == argl)
470: rval = ESCAPE_UNICODE;
471: break;
472: default:
473: break;
474: }
475: goto out;
476:
477: out_sub:
478: iesc = sesc;
1.7 schwarze 479: inam = snam;
1.1 schwarze 480: iarg = sarg;
481: iendarg = sendarg;
482: iend = send;
483: rval = ESCAPE_EXPAND;
484:
485: out:
1.7 schwarze 486: if (rnam != NULL)
487: *rnam = inam;
1.1 schwarze 488: if (rarg != NULL)
489: *rarg = iarg;
490: if (rendarg != NULL)
491: *rendarg = iendarg;
492: if (rend != NULL)
493: *rend = iend;
494: if (resc == NULL)
495: return rval;
496:
497: /*
498: * Diagnostic messages are only issued when called
499: * from the parser, not when called from the formatters.
500: */
501:
502: *resc = iesc;
503: switch (rval) {
504: case ESCAPE_UNSUPP:
505: err = MANDOCERR_ESC_UNSUPP;
506: break;
507: case ESCAPE_UNDEF:
1.9 schwarze 508: if (buf[inam] != '\\' && buf[inam] != '.')
509: err = MANDOCERR_ESC_UNDEF;
1.1 schwarze 510: break;
511: case ESCAPE_SPECIAL:
1.10 schwarze 512: if (mchars_spec2cp(buf + iarg, argl) >= 0)
513: err = MANDOCERR_OK;
514: else if (err == MANDOCERR_OK)
515: err = MANDOCERR_ESC_UNKCHAR;
1.1 schwarze 516: break;
517: default:
1.9 schwarze 518: break;
1.1 schwarze 519: }
1.9 schwarze 520: if (err != MANDOCERR_OK)
521: mandoc_msg(err, ln, iesc, "%.*s", iend - iesc, buf + iesc);
1.1 schwarze 522: return rval;
523: }
CVSweb