[BACK]Return to roff_escape.c CVS log [TXT][DIR] Up to [cvsweb.bsd.lv] / mandoc

File: [cvsweb.bsd.lv] / mandoc / roff_escape.c (download)

Revision 1.14, Wed Jun 8 13:23:57 2022 UTC (21 months, 2 weeks ago) by schwarze
Branch: MAIN
CVS Tags: HEAD
Changes since 1.13: +49 -26 lines

Surprisingly, every escape sequence can also be used as an argument
delimiter for an outer escape sequence, in which case the delimiting
escape sequence retains its syntax but usually ignores its argument
and loses its inherent effect.  Add rudimentary support for this
syntax quirk in order to improve parsing compatibility with groff.

/* $Id: roff_escape.c,v 1.14 2022/06/08 13:23:57 schwarze Exp $ */
/*
 * Copyright (c) 2011, 2012, 2013, 2014, 2015, 2017, 2018, 2020, 2022
 *               Ingo Schwarze <schwarze@openbsd.org>
 * Copyright (c) 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 *
 * Parser for roff(7) escape sequences.
 * To be used by all mandoc(1) parsers and formatters.
 */
#include <assert.h>
#include <ctype.h>
#include <limits.h>
#include <stdio.h>
#include <string.h>

#include "mandoc.h"
#include "roff.h"
#include "roff_int.h"

/*
 * Traditional escape sequence interpreter for general use
 * including in high-level formatters.  This function does not issue
 * diagnostics and is not usable for expansion in the roff(7) parser.
 * It is documented in the mandoc_escape(3) manual page.
 */
enum mandoc_esc
mandoc_escape(const char **rendarg, const char **rarg, int *rargl)
{
        int		 iarg, iendarg, iend;
        enum mandoc_esc  rval;

        rval = roff_escape(--*rendarg, 0, 0,
	    NULL, NULL, &iarg, &iendarg, &iend);
        assert(rval != ESCAPE_EXPAND);
        if (rarg != NULL)
	       *rarg = *rendarg + iarg;
        if (rargl != NULL)
	       *rargl = iendarg - iarg;
        *rendarg += iend;
        return rval;
}

/*
 * Full-featured escape sequence parser.
 * If it encounters a nested escape sequence that requires expansion
 * by the parser and re-parsing, the positions of that inner escape
 * sequence are returned in *resc ... *rend.
 * Otherwise, *resc is set to aesc and the positions of the escape
 * sequence starting at aesc are returned.
 * Diagnostic messages are generated if and only if ln != 0,
 * that is, if and only if called by roff_expand().
 *
 * Arguments:
 *   buf      buffer containing the escape sequence
 *   ln       passed on to mandoc_msg() for diagnostics; 0 disables them
 *   aesc     index in buf of the leading escape character
 *   resc     if not NULL, receives the index of the escape character
 *   rnam     if not NULL, receives the index of the escape name
 *   rarg     if not NULL, receives the index beginning the argument
 *   rendarg  if not NULL, receives the index right after the argument
 *   rend     if not NULL, receives the index right after the sequence
 *
 * The return value classifies the escape sequence.
 */
enum mandoc_esc
roff_escape(const char *buf, const int ln, const int aesc,
    int *resc, int *rnam, int *rarg, int *rendarg, int *rend)
{
	int		 iesc;		/* index of leading escape char */
	int		 inam;		/* index of escape name */
	int		 iarg;		/* index beginning the argument */
	int		 iendarg;	/* index right after the argument */
	int		 iend;		/* index right after the sequence */
	int		 sesc, snam, sarg, sendarg, send; /* for sub-escape */
	int		 escterm;	/* whether term is escaped */
	int		 maxl;		/* expected length of the argument */
	int		 argl;		/* actual length of the argument */
	int		 c, i;		/* for \[char...] parsing */
	int 		 valid_A;	/* for \A parsing */
	enum mandoc_esc	 rval;		/* return value */
	enum mandoc_esc	 stype;		/* for sub-escape */
	enum mandocerr	 err;		/* diagnostic code */
	char		 term;		/* byte terminating the argument */

	/*
	 * Treat "\E" just like "\";
	 * it only makes a difference in copy mode.
	 */

	iesc = inam = aesc;
	do {
		inam++;
	} while (buf[inam] == 'E');

	/*
	 * Sort the following cases first by syntax category,
	 * then by escape sequence type, and finally by ASCII code.
	 */

	iarg = iendarg = iend = inam + 1;
	maxl = INT_MAX;
	term = '\0';
	err = MANDOCERR_OK;
	switch (buf[inam]) {

	/* Escape sequences taking no arguments at all. */

	case '!':
	case '?':
	case 'r':
		rval = ESCAPE_UNSUPP;
		goto out;

	case '%':
	case '&':
	case ')':
	case ',':
	case '/':
	case '^':
	case 'a':
	case 'd':
	case 't':
	case 'u':
	case '{':
	case '|':
	case '}':
		rval = ESCAPE_IGNORE;
		goto out;

	case '\0':
		/* Do not step past the terminating NUL byte. */
		iendarg = --iend;
		/* FALLTHROUGH */
	case '.':
	case '\\':
	default:
		iarg--;
		rval = ESCAPE_UNDEF;
		goto out;

	case ' ':
	case '\'':
	case '-':
	case '0':
	case ':':
	case '_':
	case '`':
	case 'e':
	case '~':
		/* The name byte itself is the one-byte argument. */
		iarg--;
		argl = 1;
		rval = ESCAPE_SPECIAL;
		goto out;
	case 'p':
		rval = ESCAPE_BREAK;
		goto out;
	case 'c':
		rval = ESCAPE_NOSPACE;
		goto out;
	case 'z':
		rval = ESCAPE_SKIPCHAR;
		goto out;

	/* Standard argument format. */

	case '$':
	case '*':
	case 'V':
	case 'g':
	case 'n':
		rval = ESCAPE_EXPAND;
		break;
	case 'F':
	case 'M':
	case 'O':
	case 'Y':
	case 'k':
	case 'm':
		rval = ESCAPE_IGNORE;
		break;
	case '(':
	case '[':
		/*
		 * There is no separate name byte here: back up such
		 * that the argument parser below sees the opening
		 * parenthesis or bracket.
		 */
		rval = ESCAPE_SPECIAL;
		iendarg = iend = --iarg;
		break;
	case 'f':
		rval = ESCAPE_FONT;
		break;

	/*
	 * Quoted arguments.
	 * The placeholder '\b' means that the delimiter
	 * still has to be determined below.
	 */

	case 'A':
	case 'B':
	case 'w':
		rval = ESCAPE_EXPAND;
		term = '\b';
		break;
	case 'D':
	case 'H':
	case 'L':
	case 'R':
	case 'S':
	case 'X':
	case 'Z':
	case 'b':
	case 'v':
	case 'x':
		rval = ESCAPE_IGNORE;
		term = '\b';
		break;
	case 'C':
		rval = ESCAPE_SPECIAL;
		term = '\b';
		break;
	case 'N':
		rval = ESCAPE_NUMBERED;
		term = '\b';
		break;
	case 'h':
		rval = ESCAPE_HORIZ;
		term = '\b';
		break;
	case 'l':
		rval = ESCAPE_HLINE;
		term = '\b';
		break;
	case 'o':
		rval = ESCAPE_OVERSTRIKE;
		term = '\b';
		break;

	/* Sizes support both forms, with additional peculiarities. */

	case 's':
		rval = ESCAPE_IGNORE;
		if (buf[iarg] == '+' || buf[iarg] == '-' ||
		    buf[iarg] == ASCII_HYPH)
			iarg++;
		switch (buf[iarg]) {
		case '(':
			maxl = 2;
			iarg++;
			break;
		case '[':
			term = ']';
			iarg++;
			break;
		case '\'':
			term = '\'';
			iarg++;
			break;
		case '1':
		case '2':
		case '3':
			/*
			 * Without a sign, a leading digit 1, 2, or 3
			 * followed by another digit forms a two-digit size.
			 */
			if (buf[iarg - 1] == 's' &&
			    isdigit((unsigned char)buf[iarg + 1])) {
				maxl = 2;
				break;
			}
			/* FALLTHROUGH */
		default:
			maxl = 1;
			break;
		}
		iendarg = iend = iarg;
	}

	/* Decide how to end the argument. */

	escterm = 0;
	stype = ESCAPE_EXPAND;
	if ((term == '\b' || (term == '\0' && maxl == INT_MAX)) &&
	    buf[iarg] == buf[iesc]) {
		/* The argument starts with a nested escape sequence. */
		stype = roff_escape(buf, ln, iendarg,
		    &sesc, &snam, &sarg, &sendarg, &send);
		if (stype == ESCAPE_EXPAND)
			goto out_sub;
	}

	if (term == '\b') {
		if (stype == ESCAPE_UNDEF)
			iarg++;
		if (stype != ESCAPE_EXPAND && stype != ESCAPE_UNDEF) {
			/* An escape sequence is used as the delimiter. */
			if (strchr("BHLRSNhlvx", buf[inam]) != NULL &&
			    strchr(" ,.0DLOXYZ^abdhlortuvx|~",
			    buf[snam]) != NULL) {
				err = MANDOCERR_ESC_DELIM;
				iend = send;
				iarg = iendarg = sesc;
				goto out;
			}
			escterm = 1;
			iarg = send;
			term = buf[snam];
		} else if (strchr("BDHLRSvxNhl", buf[inam]) != NULL &&
		    strchr(" %&()*+-./0123456789:<=>", buf[iarg]) != NULL) {
			err = MANDOCERR_ESC_DELIM;
			if (rval != ESCAPE_EXPAND)
				rval = ESCAPE_ERROR;
			if (buf[inam] != 'D') {
				iendarg = iend = iarg + 1;
				goto out;
			}
		}
		/* Otherwise, a plain byte serves as the delimiter. */
		if (term == '\b')
			term = buf[iarg++];
	} else if (term == '\0' && maxl == INT_MAX) {
		if (buf[inam] == 'n' && (buf[iarg] == '+' || buf[iarg] == '-'))
			iarg++;
		switch (buf[iarg]) {
		case '(':
			maxl = 2;
			iarg++;
			break;
		case '[':
			if (buf[++iarg] == ' ') {
				iendarg = iend = iarg + 1;
				err = MANDOCERR_ESC_ARG;
				rval = ESCAPE_ERROR;
				goto out;
			}
			term = ']';
			break;
		default:
			maxl = 1;
			break;
		}
	}

	/* Advance to the end of the argument. */

	valid_A = 1;
	iendarg = iarg;
	while (maxl > 0) {
		if (buf[iendarg] == '\0') {
			err = MANDOCERR_ESC_INCOMPLETE;
			if (rval != ESCAPE_EXPAND &&
			    rval != ESCAPE_OVERSTRIKE)
				rval = ESCAPE_ERROR;
			/* Usually, ignore an incomplete argument. */
			if (strchr("Aow", buf[inam]) == NULL)
				iendarg = iarg;
			break;
		}
		if (escterm == 0 && buf[iendarg] == term) {
			iend = iendarg + 1;
			break;
		}
		if (buf[iendarg] == buf[iesc]) {
			/* Nested escape sequence inside the argument. */
			stype = roff_escape(buf, ln, iendarg,
			    &sesc, &snam, &sarg, &sendarg, &send);
			if (stype == ESCAPE_EXPAND)
				goto out_sub;
			iend = send;
			if (escterm == 1 &&
			    (buf[snam] == term || buf[inam] == 'N'))
				break;
			if (stype != ESCAPE_UNDEF)
				valid_A = 0;
			iendarg = send;
		} else if (buf[inam] == 'N' &&
		    isdigit((unsigned char)buf[iendarg]) == 0) {
			/* For \N, any non-digit ends the argument. */
			iend = iendarg + 1;
			break;
		} else {
			if (buf[iendarg] == ' ' || buf[iendarg] == '\t')
				valid_A = 0;
			if (maxl != INT_MAX)
				maxl--;
			iend = ++iendarg;
		}
	}

	/* Post-process depending on the content of the argument. */

	argl = iendarg - iarg;
	switch (buf[inam]) {
	case '*':
		if (resc == NULL && argl == 2 &&
		    buf[iarg] == '.' && buf[iarg + 1] == 'T')
			rval = ESCAPE_DEVICE;
		break;
	case 'A':
		if (valid_A == 0)
			iendarg = iarg;
		break;
	case 'O':
		switch (buf[iarg]) {
		case '0':
			rval = ESCAPE_UNSUPP;
			break;
		case '1':
		case '2':
		case '3':
		case '4':
			if (argl == 1)
				rval = ESCAPE_IGNORE;
			else {
				err = MANDOCERR_ESC_ARG;
				rval = ESCAPE_ERROR;
			}
			break;
		case '5':
			if (buf[iarg - 1] == '[')
				rval = ESCAPE_UNSUPP;
			else {
				err = MANDOCERR_ESC_ARG;
				rval = ESCAPE_ERROR;
			}
			break;
		default:
			err = MANDOCERR_ESC_ARG;
			rval = ESCAPE_ERROR;
			break;
		}
		break;
	default:
		break;
	}

	switch (rval) {
	case ESCAPE_FONT:
		rval = mandoc_font(buf + iarg, argl);
		if (rval == ESCAPE_ERROR)
			err = MANDOCERR_ESC_ARG;
		break;

	case ESCAPE_SPECIAL:
		if (argl == 0) {
			err = MANDOCERR_ESC_BADCHAR;
			rval = ESCAPE_ERROR;
			break;
		}

		/*
		 * The file chars.c only provides one common list of
		 * character names, but \[-] == \- is the only one of
		 * the characters with one-byte names that allows
		 * enclosing the name in brackets.
		 */

		if (term != '\0' && argl == 1 && buf[iarg] != '-') {
			err = MANDOCERR_ESC_BADCHAR;
			rval = ESCAPE_ERROR;
			break;
		}

		/* Treat \[char...] as an alias for \N'...'. */

		if (buf[iarg] == 'c') {
			if (argl < 6 || argl > 7 ||
			    strncmp(buf + iarg, "char", 4) != 0 ||
			    (int)strspn(buf + iarg + 4, "0123456789")
			     + 4 < argl)
				break;

			/*
			 * Skip the four bytes of the "char" prefix;
			 * only the decimal digits following it
			 * form the character number.
			 */

			c = 0;
			for (i = iarg + 4; i < iendarg; i++)
				c = 10 * c + (buf[i] - '0');

			/* Accept only 0x21-0x7e and 0xa0-0xff. */

			if (c < 0x21 || (c > 0x7e && c < 0xa0) || c > 0xff) {
				err = MANDOCERR_ESC_BADCHAR;
				break;
			}
			iarg += 4;
			rval = ESCAPE_NUMBERED;
			break;
		}

		/*
		 * Unicode escapes are defined in groff as \[u0000]
		 * to \[u10FFFF], where the contained value must be
		 * a valid Unicode codepoint.  Here, however, only
		 * check the length and range.
		 */

		if (buf[iarg] != 'u' || argl < 5 || argl > 7)
			break;
		if (argl == 7 &&
		    (buf[iarg + 1] != '1' || buf[iarg + 2] != '0')) {
			err = MANDOCERR_ESC_BADCHAR;
			break;
		}
		if (argl == 6 && buf[iarg + 1] == '0') {
			err = MANDOCERR_ESC_BADCHAR;
			break;
		}
		if (argl == 5 && buf[iarg + 1] == 'D' &&
		    strchr("89ABCDEF", buf[iarg + 2]) != NULL) {
			err = MANDOCERR_ESC_BADCHAR;
			break;
		}
		if ((int)strspn(buf + iarg + 1, "0123456789ABCDEFabcdef")
		    + 1 == argl)
			rval = ESCAPE_UNICODE;
		break;
	default:
		break;
	}
	goto out;

out_sub:
	/* Report the nested sequence requiring expansion instead. */
	iesc = sesc;
	inam = snam;
	iarg = sarg;
	iendarg = sendarg;
	iend = send;
	rval = ESCAPE_EXPAND;

out:
	if (resc != NULL)
		*resc = iesc;
	if (rnam != NULL)
		*rnam = inam;
	if (rarg != NULL)
		*rarg = iarg;
	if (rendarg != NULL)
		*rendarg = iendarg;
	if (rend != NULL)
		*rend = iend;
	if (ln == 0)
		return rval;

	/*
	 * Diagnostic messages are only issued when called
	 * from the parser, not when called from the formatters.
	 */

	switch (rval) {
	case ESCAPE_UNSUPP:
		err = MANDOCERR_ESC_UNSUPP;
		break;
	case ESCAPE_UNDEF:
		if (buf[inam] != '\\' && buf[inam] != '.')
			err = MANDOCERR_ESC_UNDEF;
		break;
	case ESCAPE_SPECIAL:
		if (mchars_spec2cp(buf + iarg, argl) >= 0)
			err = MANDOCERR_OK;
		else if (err == MANDOCERR_OK)
			err = MANDOCERR_ESC_UNKCHAR;
		break;
	default:
		break;
	}
	if (err != MANDOCERR_OK)
		mandoc_msg(err, ln, iesc, "%.*s", iend - iesc, buf + iesc);
	return rval;
}