Annotation of mandoc/roff_escape.c, Revision 1.14
1.14 ! schwarze 1: /* $Id: roff_escape.c,v 1.13 2022/06/07 09:54:40 schwarze Exp $ */
1.1 schwarze 2: /*
3: * Copyright (c) 2011, 2012, 2013, 2014, 2015, 2017, 2018, 2020, 2022
4: * Ingo Schwarze <schwarze@openbsd.org>
5: * Copyright (c) 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv>
6: *
7: * Permission to use, copy, modify, and distribute this software for any
8: * purpose with or without fee is hereby granted, provided that the above
9: * copyright notice and this permission notice appear in all copies.
10: *
11: * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
12: * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
13: * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
14: * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
15: * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
16: * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
17: * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
18: *
19: * Parser for roff(7) escape sequences.
20: * To be used by all mandoc(1) parsers and formatters.
21: */
22: #include <assert.h>
23: #include <ctype.h>
24: #include <limits.h>
25: #include <stdio.h>
26: #include <string.h>
27:
28: #include "mandoc.h"
29: #include "roff.h"
30: #include "roff_int.h"
31:
32: /*
33: * Traditional escape sequence interpreter for general use
34: * including in high-level formatters. This function does not issue
35: * diagnostics and is not usable for expansion in the roff(7) parser.
36: * It is documented in the mandoc_escape(3) manual page.
37: */
38: enum mandoc_esc
39: mandoc_escape(const char **rendarg, const char **rarg, int *rargl)
40: {
41: int iarg, iendarg, iend;
42: enum mandoc_esc rval;
43:
1.7 schwarze 44: rval = roff_escape(--*rendarg, 0, 0,
45: NULL, NULL, &iarg, &iendarg, &iend);
1.1 schwarze 46: assert(rval != ESCAPE_EXPAND);
47: if (rarg != NULL)
48: *rarg = *rendarg + iarg;
49: if (rargl != NULL)
50: *rargl = iendarg - iarg;
51: *rendarg += iend;
52: return rval;
53: }
54:
55: /*
56: * Full-featured escape sequence parser.
57: * If it encounters a nested escape sequence that requires expansion
58: * by the parser and re-parsing, the positions of that inner escape
59: * sequence are returned in *resc ... *rend.
60: * Otherwise, *resc is set to aesc and the positions of the escape
61: * sequence starting at aesc are returned.
1.14 ! schwarze 62: * Diagnostic messages are generated if and only if ln != 0,
1.1 schwarze 63: * that is, if and only if called by roff_expand().
64: */
65: enum mandoc_esc
66: roff_escape(const char *buf, const int ln, const int aesc,
1.7 schwarze 67: int *resc, int *rnam, int *rarg, int *rendarg, int *rend)
1.1 schwarze 68: {
69: int iesc; /* index of leading escape char */
1.7 schwarze 70: int inam; /* index of escape name */
1.1 schwarze 71: int iarg; /* index beginning the argument */
72: int iendarg; /* index right after the argument */
73: int iend; /* index right after the sequence */
1.7 schwarze 74: int sesc, snam, sarg, sendarg, send; /* for sub-escape */
1.14 ! schwarze 75: int escterm; /* whether term is escaped */
1.1 schwarze 76: int maxl; /* expected length of the argument */
77: int argl; /* actual length of the argument */
78: int c, i; /* for \[char...] parsing */
1.5 schwarze 79: int valid_A; /* for \A parsing */
1.1 schwarze 80: enum mandoc_esc rval; /* return value */
1.14 ! schwarze 81: enum mandoc_esc stype; /* for sub-escape */
1.1 schwarze 82: enum mandocerr err; /* diagnostic code */
83: char term; /* byte terminating the argument */
84:
85: /*
86: * Treat "\E" just like "\";
87: * it only makes a difference in copy mode.
88: */
89:
1.7 schwarze 90: iesc = inam = aesc;
1.1 schwarze 91: do {
1.7 schwarze 92: inam++;
93: } while (buf[inam] == 'E');
1.1 schwarze 94:
95: /*
96: * Sort the following cases first by syntax category,
97: * then by escape sequence type, and finally by ASCII code.
98: */
99:
1.7 schwarze 100: iarg = iendarg = iend = inam + 1;
1.1 schwarze 101: maxl = INT_MAX;
102: term = '\0';
1.9 schwarze 103: err = MANDOCERR_OK;
1.7 schwarze 104: switch (buf[inam]) {
1.1 schwarze 105:
106: /* Escape sequences taking no arguments at all. */
107:
108: case '!':
109: case '?':
1.2 schwarze 110: case 'r':
1.1 schwarze 111: rval = ESCAPE_UNSUPP;
112: goto out;
113:
114: case '%':
115: case '&':
116: case ')':
117: case ',':
118: case '/':
119: case '^':
120: case 'a':
121: case 'd':
122: case 't':
123: case 'u':
124: case '{':
125: case '|':
126: case '}':
127: rval = ESCAPE_IGNORE;
128: goto out;
129:
1.6 schwarze 130: case '\0':
131: iendarg = --iend;
132: /* FALLTHROUGH */
1.8 schwarze 133: case '.':
1.1 schwarze 134: case '\\':
135: default:
136: iarg--;
137: rval = ESCAPE_UNDEF;
138: goto out;
139:
140: case ' ':
141: case '\'':
142: case '-':
143: case '0':
144: case ':':
145: case '_':
146: case '`':
147: case 'e':
148: case '~':
149: iarg--;
150: argl = 1;
151: rval = ESCAPE_SPECIAL;
152: goto out;
153: case 'p':
154: rval = ESCAPE_BREAK;
155: goto out;
156: case 'c':
157: rval = ESCAPE_NOSPACE;
158: goto out;
159: case 'z':
160: rval = ESCAPE_SKIPCHAR;
161: goto out;
162:
163: /* Standard argument format. */
164:
165: case '$':
166: case '*':
1.3 schwarze 167: case 'V':
1.4 schwarze 168: case 'g':
1.1 schwarze 169: case 'n':
170: rval = ESCAPE_EXPAND;
171: break;
172: case 'F':
173: case 'M':
174: case 'O':
175: case 'Y':
176: case 'k':
177: case 'm':
178: rval = ESCAPE_IGNORE;
179: break;
180: case '(':
181: case '[':
182: rval = ESCAPE_SPECIAL;
183: iendarg = iend = --iarg;
184: break;
185: case 'f':
186: rval = ESCAPE_FONT;
187: break;
188:
189: /* Quoted arguments */
190:
1.5 schwarze 191: case 'A':
1.1 schwarze 192: case 'B':
193: case 'w':
194: rval = ESCAPE_EXPAND;
195: term = '\b';
196: break;
197: case 'D':
198: case 'H':
199: case 'L':
200: case 'R':
201: case 'S':
202: case 'X':
203: case 'Z':
204: case 'b':
205: case 'v':
206: case 'x':
207: rval = ESCAPE_IGNORE;
208: term = '\b';
209: break;
210: case 'C':
211: rval = ESCAPE_SPECIAL;
212: term = '\b';
213: break;
214: case 'N':
215: rval = ESCAPE_NUMBERED;
216: term = '\b';
217: break;
218: case 'h':
219: rval = ESCAPE_HORIZ;
220: term = '\b';
221: break;
222: case 'l':
223: rval = ESCAPE_HLINE;
224: term = '\b';
225: break;
226: case 'o':
227: rval = ESCAPE_OVERSTRIKE;
228: term = '\b';
229: break;
230:
231: /* Sizes support both forms, with additional peculiarities. */
232:
233: case 's':
234: rval = ESCAPE_IGNORE;
235: if (buf[iarg] == '+' || buf[iarg] == '-'||
236: buf[iarg] == ASCII_HYPH)
237: iarg++;
238: switch (buf[iarg]) {
239: case '(':
240: maxl = 2;
241: iarg++;
242: break;
243: case '[':
244: term = ']';
245: iarg++;
246: break;
247: case '\'':
248: term = '\'';
249: iarg++;
250: break;
251: case '1':
252: case '2':
253: case '3':
254: if (buf[iarg - 1] == 's' &&
255: isdigit((unsigned char)buf[iarg + 1])) {
256: maxl = 2;
257: break;
258: }
259: /* FALLTHROUGH */
260: default:
261: maxl = 1;
262: break;
263: }
264: iendarg = iend = iarg;
265: }
266:
267: /* Decide how to end the argument. */
268:
1.14 ! schwarze 269: escterm = 0;
! 270: stype = ESCAPE_EXPAND;
1.1 schwarze 271: if ((term == '\b' || (term == '\0' && maxl == INT_MAX)) &&
1.14 ! schwarze 272: buf[iarg] == buf[iesc]) {
! 273: stype = roff_escape(buf, ln, iendarg,
! 274: &sesc, &snam, &sarg, &sendarg, &send);
! 275: if (stype == ESCAPE_EXPAND)
! 276: goto out_sub;
! 277: }
1.1 schwarze 278:
279: if (term == '\b') {
1.14 ! schwarze 280: if (stype == ESCAPE_UNDEF)
! 281: iarg++;
! 282: if (stype != ESCAPE_EXPAND && stype != ESCAPE_UNDEF) {
! 283: if (strchr("BHLRSNhlvx", buf[inam]) != NULL &&
! 284: strchr(" ,.0DLOXYZ^abdhlortuvx|~",
! 285: buf[snam]) != NULL) {
! 286: err = MANDOCERR_ESC_DELIM;
! 287: iend = send;
! 288: iarg = iendarg = sesc;
! 289: goto out;
! 290: }
! 291: escterm = 1;
! 292: iarg = send;
! 293: term = buf[snam];
! 294: } else if (strchr("BDHLRSvxNhl", buf[inam]) != NULL &&
1.12 schwarze 295: strchr(" %&()*+-./0123456789:<=>", buf[iarg]) != NULL) {
1.13 schwarze 296: err = MANDOCERR_ESC_DELIM;
1.12 schwarze 297: if (rval != ESCAPE_EXPAND)
298: rval = ESCAPE_ERROR;
299: if (buf[inam] != 'D') {
300: iendarg = iend = iarg + 1;
301: goto out;
302: }
1.1 schwarze 303: }
1.14 ! schwarze 304: if (term == '\b')
! 305: term = buf[iarg++];
1.1 schwarze 306: } else if (term == '\0' && maxl == INT_MAX) {
1.7 schwarze 307: if (buf[inam] == 'n' && (buf[iarg] == '+' || buf[iarg] == '-'))
1.1 schwarze 308: iarg++;
309: switch (buf[iarg]) {
310: case '(':
311: maxl = 2;
312: iarg++;
313: break;
314: case '[':
315: if (buf[++iarg] == ' ') {
316: iendarg = iend = iarg + 1;
1.13 schwarze 317: err = MANDOCERR_ESC_ARG;
1.1 schwarze 318: rval = ESCAPE_ERROR;
319: goto out;
320: }
321: term = ']';
322: break;
323: default:
324: maxl = 1;
325: break;
326: }
327: }
328:
329: /* Advance to the end of the argument. */
330:
1.5 schwarze 331: valid_A = 1;
1.1 schwarze 332: iendarg = iarg;
333: while (maxl > 0) {
334: if (buf[iendarg] == '\0') {
1.10 schwarze 335: err = MANDOCERR_ESC_INCOMPLETE;
1.14 ! schwarze 336: if (rval != ESCAPE_EXPAND &&
! 337: rval != ESCAPE_OVERSTRIKE)
1.10 schwarze 338: rval = ESCAPE_ERROR;
1.14 ! schwarze 339: /* Usually, ignore an incomplete argument. */
! 340: if (strchr("Aow", buf[inam]) == NULL)
1.1 schwarze 341: iendarg = iarg;
342: break;
343: }
1.14 ! schwarze 344: if (escterm == 0 && buf[iendarg] == term) {
1.1 schwarze 345: iend = iendarg + 1;
346: break;
347: }
348: if (buf[iendarg] == buf[iesc]) {
1.14 ! schwarze 349: stype = roff_escape(buf, ln, iendarg,
! 350: &sesc, &snam, &sarg, &sendarg, &send);
! 351: if (stype == ESCAPE_EXPAND)
1.1 schwarze 352: goto out_sub;
1.14 ! schwarze 353: iend = send;
! 354: if (escterm == 1 &&
! 355: (buf[snam] == term || buf[inam] == 'N'))
1.5 schwarze 356: break;
1.14 ! schwarze 357: if (stype != ESCAPE_UNDEF)
1.5 schwarze 358: valid_A = 0;
1.14 ! schwarze 359: iendarg = send;
! 360: } else if (buf[inam] == 'N' &&
! 361: isdigit((unsigned char)buf[iendarg]) == 0) {
! 362: iend = iendarg + 1;
! 363: break;
1.1 schwarze 364: } else {
1.5 schwarze 365: if (buf[iendarg] == ' ' || buf[iendarg] == '\t')
366: valid_A = 0;
1.1 schwarze 367: if (maxl != INT_MAX)
368: maxl--;
369: iend = ++iendarg;
370: }
371: }
372:
373: /* Post-process depending on the content of the argument. */
374:
375: argl = iendarg - iarg;
1.7 schwarze 376: switch (buf[inam]) {
1.1 schwarze 377: case '*':
378: if (resc == NULL && argl == 2 &&
379: buf[iarg] == '.' && buf[iarg + 1] == 'T')
380: rval = ESCAPE_DEVICE;
1.5 schwarze 381: break;
382: case 'A':
383: if (valid_A == 0)
384: iendarg = iarg;
1.1 schwarze 385: break;
386: case 'O':
387: switch (buf[iarg]) {
388: case '0':
389: rval = ESCAPE_UNSUPP;
390: break;
391: case '1':
392: case '2':
393: case '3':
394: case '4':
1.13 schwarze 395: if (argl == 1)
396: rval = ESCAPE_IGNORE;
397: else {
398: err = MANDOCERR_ESC_ARG;
399: rval = ESCAPE_ERROR;
400: }
1.1 schwarze 401: break;
402: case '5':
1.13 schwarze 403: if (buf[iarg - 1] == '[')
404: rval = ESCAPE_UNSUPP;
405: else {
406: err = MANDOCERR_ESC_ARG;
407: rval = ESCAPE_ERROR;
408: }
1.1 schwarze 409: break;
410: default:
1.13 schwarze 411: err = MANDOCERR_ESC_ARG;
1.1 schwarze 412: rval = ESCAPE_ERROR;
413: break;
414: }
415: break;
416: default:
417: break;
418: }
419:
420: switch (rval) {
421: case ESCAPE_FONT:
422: rval = mandoc_font(buf + iarg, argl);
1.13 schwarze 423: if (rval == ESCAPE_ERROR)
424: err = MANDOCERR_ESC_ARG;
1.1 schwarze 425: break;
426:
427: case ESCAPE_SPECIAL:
1.11 schwarze 428: if (argl == 0) {
429: err = MANDOCERR_ESC_BADCHAR;
430: rval = ESCAPE_ERROR;
431: break;
432: }
1.1 schwarze 433:
434: /*
435: * The file chars.c only provides one common list of
436: * character names, but \[-] == \- is the only one of
437: * the characters with one-byte names that allows
438: * enclosing the name in brackets.
439: */
440:
441: if (term != '\0' && argl == 1 && buf[iarg] != '-') {
1.10 schwarze 442: err = MANDOCERR_ESC_BADCHAR;
1.1 schwarze 443: rval = ESCAPE_ERROR;
444: break;
445: }
446:
447: /* Treat \[char...] as an alias for \N'...'. */
448:
449: if (buf[iarg] == 'c') {
450: if (argl < 6 || argl > 7 ||
451: strncmp(buf + iarg, "char", 4) != 0 ||
452: (int)strspn(buf + iarg + 4, "0123456789")
453: + 4 < argl)
454: break;
455: c = 0;
456: for (i = iarg; i < iendarg; i++)
457: c = 10 * c + (buf[i] - '0');
1.10 schwarze 458: if (c < 0x21 || (c > 0x7e && c < 0xa0) || c > 0xff) {
459: err = MANDOCERR_ESC_BADCHAR;
1.1 schwarze 460: break;
1.10 schwarze 461: }
1.1 schwarze 462: iarg += 4;
463: rval = ESCAPE_NUMBERED;
464: break;
465: }
466:
467: /*
468: * Unicode escapes are defined in groff as \[u0000]
469: * to \[u10FFFF], where the contained value must be
470: * a valid Unicode codepoint. Here, however, only
471: * check the length and range.
472: */
473:
474: if (buf[iarg] != 'u' || argl < 5 || argl > 7)
475: break;
476: if (argl == 7 &&
1.10 schwarze 477: (buf[iarg + 1] != '1' || buf[iarg + 2] != '0')) {
478: err = MANDOCERR_ESC_BADCHAR;
1.1 schwarze 479: break;
1.10 schwarze 480: }
481: if (argl == 6 && buf[iarg + 1] == '0') {
482: err = MANDOCERR_ESC_BADCHAR;
1.1 schwarze 483: break;
1.10 schwarze 484: }
1.1 schwarze 485: if (argl == 5 && buf[iarg + 1] == 'D' &&
1.10 schwarze 486: strchr("89ABCDEF", buf[iarg + 2]) != NULL) {
487: err = MANDOCERR_ESC_BADCHAR;
1.1 schwarze 488: break;
1.10 schwarze 489: }
1.1 schwarze 490: if ((int)strspn(buf + iarg + 1, "0123456789ABCDEFabcdef")
491: + 1 == argl)
492: rval = ESCAPE_UNICODE;
493: break;
494: default:
495: break;
496: }
497: goto out;
498:
499: out_sub:
500: iesc = sesc;
1.7 schwarze 501: inam = snam;
1.1 schwarze 502: iarg = sarg;
503: iendarg = sendarg;
504: iend = send;
505: rval = ESCAPE_EXPAND;
506:
507: out:
1.14 ! schwarze 508: if (resc != NULL)
! 509: *resc = iesc;
1.7 schwarze 510: if (rnam != NULL)
511: *rnam = inam;
1.1 schwarze 512: if (rarg != NULL)
513: *rarg = iarg;
514: if (rendarg != NULL)
515: *rendarg = iendarg;
516: if (rend != NULL)
517: *rend = iend;
1.14 ! schwarze 518: if (ln == 0)
1.1 schwarze 519: return rval;
520:
521: /*
522: * Diagnostic messages are only issued when called
523: * from the parser, not when called from the formatters.
524: */
525:
526: switch (rval) {
527: case ESCAPE_UNSUPP:
528: err = MANDOCERR_ESC_UNSUPP;
529: break;
530: case ESCAPE_UNDEF:
1.9 schwarze 531: if (buf[inam] != '\\' && buf[inam] != '.')
532: err = MANDOCERR_ESC_UNDEF;
1.1 schwarze 533: break;
534: case ESCAPE_SPECIAL:
1.10 schwarze 535: if (mchars_spec2cp(buf + iarg, argl) >= 0)
536: err = MANDOCERR_OK;
537: else if (err == MANDOCERR_OK)
538: err = MANDOCERR_ESC_UNKCHAR;
1.1 schwarze 539: break;
540: default:
1.9 schwarze 541: break;
1.1 schwarze 542: }
1.9 schwarze 543: if (err != MANDOCERR_OK)
544: mandoc_msg(err, ln, iesc, "%.*s", iend - iesc, buf + iesc);
1.1 schwarze 545: return rval;
546: }
CVSweb