Annotation of mandoc/roff_escape.c, Revision 1.1
1.1 ! schwarze 1: /* $OpenBSD$ */
! 2: /*
! 3: * Copyright (c) 2011, 2012, 2013, 2014, 2015, 2017, 2018, 2020, 2022
! 4: * Ingo Schwarze <schwarze@openbsd.org>
! 5: * Copyright (c) 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv>
! 6: *
! 7: * Permission to use, copy, modify, and distribute this software for any
! 8: * purpose with or without fee is hereby granted, provided that the above
! 9: * copyright notice and this permission notice appear in all copies.
! 10: *
! 11: * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
! 12: * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
! 13: * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
! 14: * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
! 15: * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
! 16: * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
! 17: * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
! 18: *
! 19: * Parser for roff(7) escape sequences.
! 20: * To be used by all mandoc(1) parsers and formatters.
! 21: */
! 22: #include <assert.h>
! 23: #include <ctype.h>
! 24: #include <limits.h>
! 25: #include <stdio.h>
! 26: #include <string.h>
! 27:
! 28: #include "mandoc.h"
! 29: #include "roff.h"
! 30: #include "roff_int.h"
! 31:
! 32: /*
! 33: * Traditional escape sequence interpreter for general use
! 34: * including in high-level formatters. This function does not issue
! 35: * diagnostics and is not usable for expansion in the roff(7) parser.
! 36: * It is documented in the mandoc_escape(3) manual page.
! 37: */
! 38: enum mandoc_esc
! 39: mandoc_escape(const char **rendarg, const char **rarg, int *rargl)
! 40: {
! 41: int iarg, iendarg, iend;
! 42: enum mandoc_esc rval;
! 43:
! 44: rval = roff_escape(--*rendarg, 0, 0, NULL, &iarg, &iendarg, &iend);
! 45: assert(rval != ESCAPE_EXPAND);
! 46: if (rarg != NULL)
! 47: *rarg = *rendarg + iarg;
! 48: if (rargl != NULL)
! 49: *rargl = iendarg - iarg;
! 50: *rendarg += iend;
! 51: return rval;
! 52: }
! 53:
! 54: /*
! 55: * Full-featured escape sequence parser.
! 56: * If it encounters a nested escape sequence that requires expansion
! 57: * by the parser and re-parsing, the positions of that inner escape
! 58: * sequence are returned in *resc ... *rend.
! 59: * Otherwise, *resc is set to aesc and the positions of the escape
! 60: * sequence starting at aesc are returned.
! 61: * Diagnostic messages are generated if and only if resc != NULL,
! 62: * that is, if and only if called by roff_expand().
! 63: */
! 64: enum mandoc_esc
! 65: roff_escape(const char *buf, const int ln, const int aesc,
! 66: int *resc, int *rarg, int *rendarg, int *rend)
! 67: {
! 68: int iesc; /* index of leading escape char */
! 69: int iarg; /* index beginning the argument */
! 70: int iendarg; /* index right after the argument */
! 71: int iend; /* index right after the sequence */
! 72: int sesc, sarg, sendarg, send; /* for sub-escape */
! 73: int maxl; /* expected length of the argument */
! 74: int argl; /* actual length of the argument */
! 75: int c, i; /* for \[char...] parsing */
! 76: enum mandoc_esc rval; /* return value */
! 77: enum mandocerr err; /* diagnostic code */
! 78: char esc_name;
! 79: char term; /* byte terminating the argument */
! 80:
! 81: /*
! 82: * Treat "\E" just like "\";
! 83: * it only makes a difference in copy mode.
! 84: */
! 85:
! 86: iesc = iarg = aesc;
! 87: do {
! 88: iarg++;
! 89: } while (buf[iarg] == 'E');
! 90:
! 91: /*
! 92: * Sort the following cases first by syntax category,
! 93: * then by escape sequence type, and finally by ASCII code.
! 94: */
! 95:
! 96: esc_name = buf[iarg];
! 97: iendarg = iend = ++iarg;
! 98: maxl = INT_MAX;
! 99: term = '\0';
! 100: switch (esc_name) {
! 101:
! 102: /* Escape sequences taking no arguments at all. */
! 103:
! 104: case '!':
! 105: case '?':
! 106: rval = ESCAPE_UNSUPP;
! 107: goto out;
! 108:
! 109: case '%':
! 110: case '&':
! 111: case ')':
! 112: case ',':
! 113: case '/':
! 114: case '^':
! 115: case 'a':
! 116: case 'd':
! 117: case 'r':
! 118: case 't':
! 119: case 'u':
! 120: case '{':
! 121: case '|':
! 122: case '}':
! 123: rval = ESCAPE_IGNORE;
! 124: goto out;
! 125:
! 126: case '\\':
! 127: default:
! 128: iarg--;
! 129: rval = ESCAPE_UNDEF;
! 130: goto out;
! 131:
! 132: case ' ':
! 133: case '\'':
! 134: case '-':
! 135: case '.':
! 136: case '0':
! 137: case ':':
! 138: case '_':
! 139: case '`':
! 140: case 'e':
! 141: case '~':
! 142: iarg--;
! 143: argl = 1;
! 144: rval = ESCAPE_SPECIAL;
! 145: goto out;
! 146: case 'p':
! 147: rval = ESCAPE_BREAK;
! 148: goto out;
! 149: case 'c':
! 150: rval = ESCAPE_NOSPACE;
! 151: goto out;
! 152: case 'z':
! 153: rval = ESCAPE_SKIPCHAR;
! 154: goto out;
! 155:
! 156: /* Standard argument format. */
! 157:
! 158: case '$':
! 159: case '*':
! 160: case 'n':
! 161: rval = ESCAPE_EXPAND;
! 162: break;
! 163: case 'F':
! 164: case 'M':
! 165: case 'O':
! 166: case 'V':
! 167: case 'Y':
! 168: case 'g':
! 169: case 'k':
! 170: case 'm':
! 171: rval = ESCAPE_IGNORE;
! 172: break;
! 173: case '(':
! 174: case '[':
! 175: rval = ESCAPE_SPECIAL;
! 176: iendarg = iend = --iarg;
! 177: break;
! 178: case 'f':
! 179: rval = ESCAPE_FONT;
! 180: break;
! 181:
! 182: /* Quoted arguments */
! 183:
! 184: case 'B':
! 185: case 'w':
! 186: rval = ESCAPE_EXPAND;
! 187: term = '\b';
! 188: break;
! 189: case 'A':
! 190: case 'D':
! 191: case 'H':
! 192: case 'L':
! 193: case 'R':
! 194: case 'S':
! 195: case 'X':
! 196: case 'Z':
! 197: case 'b':
! 198: case 'v':
! 199: case 'x':
! 200: rval = ESCAPE_IGNORE;
! 201: term = '\b';
! 202: break;
! 203: case 'C':
! 204: if (buf[iarg] != '\'') {
! 205: rval = ESCAPE_ERROR;
! 206: goto out;
! 207: }
! 208: rval = ESCAPE_SPECIAL;
! 209: term = '\b';
! 210: break;
! 211: case 'N':
! 212: rval = ESCAPE_NUMBERED;
! 213: term = '\b';
! 214: break;
! 215: case 'h':
! 216: rval = ESCAPE_HORIZ;
! 217: term = '\b';
! 218: break;
! 219: case 'l':
! 220: rval = ESCAPE_HLINE;
! 221: term = '\b';
! 222: break;
! 223: case 'o':
! 224: rval = ESCAPE_OVERSTRIKE;
! 225: term = '\b';
! 226: break;
! 227:
! 228: /* Sizes support both forms, with additional peculiarities. */
! 229:
! 230: case 's':
! 231: rval = ESCAPE_IGNORE;
! 232: if (buf[iarg] == '+' || buf[iarg] == '-'||
! 233: buf[iarg] == ASCII_HYPH)
! 234: iarg++;
! 235: switch (buf[iarg]) {
! 236: case '(':
! 237: maxl = 2;
! 238: iarg++;
! 239: break;
! 240: case '[':
! 241: term = ']';
! 242: iarg++;
! 243: break;
! 244: case '\'':
! 245: term = '\'';
! 246: iarg++;
! 247: break;
! 248: case '1':
! 249: case '2':
! 250: case '3':
! 251: if (buf[iarg - 1] == 's' &&
! 252: isdigit((unsigned char)buf[iarg + 1])) {
! 253: maxl = 2;
! 254: break;
! 255: }
! 256: /* FALLTHROUGH */
! 257: default:
! 258: maxl = 1;
! 259: break;
! 260: }
! 261: iendarg = iend = iarg;
! 262: }
! 263:
! 264: /* Decide how to end the argument. */
! 265:
! 266: if ((term == '\b' || (term == '\0' && maxl == INT_MAX)) &&
! 267: buf[iarg] == buf[iesc] && roff_escape(buf, ln, iendarg,
! 268: &sesc, &sarg, &sendarg, &send) == ESCAPE_EXPAND)
! 269: goto out_sub;
! 270:
! 271: if (term == '\b') {
! 272: if ((esc_name == 'N' && isdigit((unsigned char)buf[iarg])) ||
! 273: (esc_name == 'h' && strchr(" %&()*+-./0123456789:<=>",
! 274: buf[iarg]) != NULL)) {
! 275: iendarg = iend = iarg + 1;
! 276: rval = ESCAPE_ERROR;
! 277: goto out;
! 278: }
! 279: term = buf[iarg++];
! 280: } else if (term == '\0' && maxl == INT_MAX) {
! 281: if (esc_name == 'n' && (buf[iarg] == '+' || buf[iarg] == '-'))
! 282: iarg++;
! 283: switch (buf[iarg]) {
! 284: case '(':
! 285: maxl = 2;
! 286: iarg++;
! 287: break;
! 288: case '[':
! 289: if (buf[++iarg] == ' ') {
! 290: iendarg = iend = iarg + 1;
! 291: rval = ESCAPE_ERROR;
! 292: goto out;
! 293: }
! 294: term = ']';
! 295: break;
! 296: default:
! 297: maxl = 1;
! 298: break;
! 299: }
! 300: }
! 301:
! 302: /* Advance to the end of the argument. */
! 303:
! 304: iendarg = iarg;
! 305: while (maxl > 0) {
! 306: if (buf[iendarg] == '\0') {
! 307: /* Ignore an incomplete argument except for \w. */
! 308: if (esc_name != 'w')
! 309: iendarg = iarg;
! 310: break;
! 311: }
! 312: if (buf[iendarg] == term) {
! 313: iend = iendarg + 1;
! 314: break;
! 315: }
! 316: if (esc_name == 'N' &&
! 317: isdigit((unsigned char)buf[iendarg]) == 0) {
! 318: iend = iendarg + 1;
! 319: break;
! 320: }
! 321: if (buf[iendarg] == buf[iesc]) {
! 322: if (roff_escape(buf, ln, iendarg,
! 323: &sesc, &sarg, &sendarg, &send) == ESCAPE_EXPAND)
! 324: goto out_sub;
! 325: iendarg = iend = send;
! 326: } else {
! 327: if (maxl != INT_MAX)
! 328: maxl--;
! 329: iend = ++iendarg;
! 330: }
! 331: }
! 332: if (resc != NULL && ((maxl != INT_MAX && maxl != 0) ||
! 333: (term != '\0' && buf[iendarg] != term)))
! 334: mandoc_msg(MANDOCERR_ESC_BAD, ln, iesc, "%s", buf + iesc);
! 335:
! 336: /* Post-process depending on the content of the argument. */
! 337:
! 338: argl = iendarg - iarg;
! 339: switch (esc_name) {
! 340: case '*':
! 341: if (resc == NULL && argl == 2 &&
! 342: buf[iarg] == '.' && buf[iarg + 1] == 'T')
! 343: rval = ESCAPE_DEVICE;
! 344: break;
! 345: case 'O':
! 346: switch (buf[iarg]) {
! 347: case '0':
! 348: rval = ESCAPE_UNSUPP;
! 349: break;
! 350: case '1':
! 351: case '2':
! 352: case '3':
! 353: case '4':
! 354: rval = argl == 1 ? ESCAPE_IGNORE : ESCAPE_ERROR;
! 355: break;
! 356: case '5':
! 357: rval = buf[iarg - 1] == '[' ? ESCAPE_UNSUPP :
! 358: ESCAPE_ERROR;
! 359: break;
! 360: default:
! 361: rval = ESCAPE_ERROR;
! 362: break;
! 363: }
! 364: break;
! 365: default:
! 366: break;
! 367: }
! 368:
! 369: switch (rval) {
! 370: case ESCAPE_FONT:
! 371: rval = mandoc_font(buf + iarg, argl);
! 372: break;
! 373:
! 374: case ESCAPE_SPECIAL:
! 375:
! 376: /*
! 377: * The file chars.c only provides one common list of
! 378: * character names, but \[-] == \- is the only one of
! 379: * the characters with one-byte names that allows
! 380: * enclosing the name in brackets.
! 381: */
! 382:
! 383: if (term != '\0' && argl == 1 && buf[iarg] != '-') {
! 384: rval = ESCAPE_ERROR;
! 385: break;
! 386: }
! 387:
! 388: /* Treat \[char...] as an alias for \N'...'. */
! 389:
! 390: if (buf[iarg] == 'c') {
! 391: if (argl < 6 || argl > 7 ||
! 392: strncmp(buf + iarg, "char", 4) != 0 ||
! 393: (int)strspn(buf + iarg + 4, "0123456789")
! 394: + 4 < argl)
! 395: break;
! 396: c = 0;
! 397: for (i = iarg; i < iendarg; i++)
! 398: c = 10 * c + (buf[i] - '0');
! 399: if (c < 0x21 || (c > 0x7e && c < 0xa0) || c > 0xff)
! 400: break;
! 401: iarg += 4;
! 402: rval = ESCAPE_NUMBERED;
! 403: break;
! 404: }
! 405:
! 406: /*
! 407: * Unicode escapes are defined in groff as \[u0000]
! 408: * to \[u10FFFF], where the contained value must be
! 409: * a valid Unicode codepoint. Here, however, only
! 410: * check the length and range.
! 411: */
! 412:
! 413: if (buf[iarg] != 'u' || argl < 5 || argl > 7)
! 414: break;
! 415: if (argl == 7 &&
! 416: (buf[iarg + 1] != '1' || buf[iarg + 2] != '0'))
! 417: break;
! 418: if (argl == 6 && buf[iarg + 1] == '0')
! 419: break;
! 420: if (argl == 5 && buf[iarg + 1] == 'D' &&
! 421: strchr("89ABCDEF", buf[iarg + 2]) != NULL)
! 422: break;
! 423: if ((int)strspn(buf + iarg + 1, "0123456789ABCDEFabcdef")
! 424: + 1 == argl)
! 425: rval = ESCAPE_UNICODE;
! 426: break;
! 427: default:
! 428: break;
! 429: }
! 430: goto out;
! 431:
! 432: out_sub:
! 433: iesc = sesc;
! 434: iarg = sarg;
! 435: iendarg = sendarg;
! 436: iend = send;
! 437: rval = ESCAPE_EXPAND;
! 438:
! 439: out:
! 440: if (rarg != NULL)
! 441: *rarg = iarg;
! 442: if (rendarg != NULL)
! 443: *rendarg = iendarg;
! 444: if (rend != NULL)
! 445: *rend = iend;
! 446: if (resc == NULL)
! 447: return rval;
! 448:
! 449: /*
! 450: * Diagnostic messages are only issued when called
! 451: * from the parser, not when called from the formatters.
! 452: */
! 453:
! 454: *resc = iesc;
! 455: switch (rval) {
! 456: case ESCAPE_ERROR:
! 457: err = MANDOCERR_ESC_BAD;
! 458: break;
! 459: case ESCAPE_UNSUPP:
! 460: err = MANDOCERR_ESC_UNSUPP;
! 461: break;
! 462: case ESCAPE_UNDEF:
! 463: if (esc_name == '\\')
! 464: return rval;
! 465: err = MANDOCERR_ESC_UNDEF;
! 466: break;
! 467: case ESCAPE_SPECIAL:
! 468: if (mchars_spec2cp(buf + iarg, argl) >= 0)
! 469: return rval;
! 470: err = MANDOCERR_ESC_BAD;
! 471: break;
! 472: default:
! 473: return rval;
! 474: }
! 475: mandoc_msg(err, ln, iesc, "%.*s", iend - iesc, buf + iesc);
! 476: return rval;
! 477: }
CVSweb