mandoc/roff_escape.c - view

Return to roff_escape.c CVS log

Up to [cvsweb.bsd.lv] / mandoc

File: [cvsweb.bsd.lv] / mandoc / roff_escape.c (download)

Revision 1.1, Thu May 19 15:37:47 2022 UTC (3 years, 1 month ago) by schwarze
Branch: MAIN

Make roff_expand() parse left-to-right rather than right-to-left.
Some escape sequences have side effects on global state, implying
that the order of evaluation matters.  For example, this fixes the
long-standing bug that "\n+x\n+x\n+x" after ".nr x 0 1" used to
print "321"; now it correctly prints "123".

Right-to-left parsing was convenient because it implicitly handled
nested escape sequences.  With correct left-to-right parsing, nesting
now requires an explicit implementation, here solved as follows:
1. Handle nested expanding escape sequences iteratively.
When finding one, expand it, then retry parsing the enclosing escape
sequence from the beginning, which will ultimately succeed as soon
as it no longer contains any nested expanding escape sequences.
2. Handle nested non-expanding escape sequences recursively.
When finding one, the escape sequence parser calls itself to find
the end of the inner sequence, then continues parsing the outer
sequence after that point.

This requires the mandoc_escape() function to operate in two different
modes.  The roff(7) parser uses it in a mode where it generates
diagnostics and may return an expansion request instead of a parse
result.  All other callers, in particular the formatters, use it
in a simpler mode that never generates diagnostics and always returns
a definite parsing result, but that requires all expanding escape
sequences to already have been expanded earlier.  The bulk of the
code is the same for both modes.
Since this required a major rewrite of the function anyway, move
it into its own new file roff_escape.c and out of the file mandoc.c,
which was misnamed in the first place and lacks a clear focus.

As a side benefit, this also fixes a number of assertion failures
that tb@ found with afl(1), for example "\n\\\\*0", "\v\-\\*0",
and "\w\-\\\\\$0*0".

As another side benefit, it also resolves some code duplication
between mandoc_escape() and roff_expand() and centralizes all
handling of escape sequences (except for expansion) in roff_escape.c,
hopefully easing maintenance and feature improvements in the future.

While here, also move end-of-input handling out of the complicated
function roff_expand() and into the simpler function roff_parse_comment(),
making the logic easier to understand.

Since this is a major reorganization of a central component of
mandoc(1), stability of the program might slightly suffer for a few
weeks, but I believe that's not a problem at this point of the
release cycle.  The new code already satisfies the regression suite,
but more tweaking and regression testing to further improve the
handling of various escape sequences will likely follow in the near
future.

/* $OpenBSD$ */
/*
 * Copyright (c) 2011, 2012, 2013, 2014, 2015, 2017, 2018, 2020, 2022
 *               Ingo Schwarze <schwarze@openbsd.org>
 * Copyright (c) 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 *
 * Parser for roff(7) escape sequences.
 * To be used by all mandoc(1) parsers and formatters.
 */
#include <assert.h>
#include <ctype.h>
#include <limits.h>
#include <stdio.h>
#include <string.h>

#include "mandoc.h"
#include "roff.h"
#include "roff_int.h"

/*
 * Traditional escape sequence interpreter for general use
 * including in high-level formatters.  This function does not issue
 * diagnostics and is not usable for expansion in the roff(7) parser.
 * It is documented in the mandoc_escape(3) manual page.
 */
enum mandoc_esc
mandoc_escape(const char **rendarg, const char **rarg, int *rargl)
{
        int		 iarg, iendarg, iend;
        enum mandoc_esc  rval;

        rval = roff_escape(--*rendarg, 0, 0, NULL, &iarg, &iendarg, &iend);
        assert(rval != ESCAPE_EXPAND);
        if (rarg != NULL)
	       *rarg = *rendarg + iarg;
        if (rargl != NULL)
	       *rargl = iendarg - iarg;
        *rendarg += iend;
        return rval;
}

/*
 * Full-featured escape sequence parser.
 *
 * Parse the escape sequence starting at buf[aesc]; ln is the input
 * line number and is only used when reporting diagnostics.
 *
 * If it encounters a nested escape sequence that requires expansion
 * by the parser and re-parsing, the positions of that inner escape
 * sequence are returned in *resc ... *rend and the return value
 * is ESCAPE_EXPAND.
 * Otherwise, *resc is set to aesc and the positions of the escape
 * sequence starting at aesc are returned:
 *   *rarg     index of the first byte of the argument
 *   *rendarg  index right after the argument
 *   *rend     index right after the complete sequence
 * Each of rarg, rendarg, and rend may be NULL if the caller is
 * not interested in that position.
 *
 * Diagnostic messages are generated if and only if resc != NULL,
 * that is, if and only if called by roff_expand().
 *
 * None of the returned indices ever points beyond the NUL byte
 * terminating buf, even if the input ends in the middle of
 * an escape sequence.
 */
enum mandoc_esc
roff_escape(const char *buf, const int ln, const int aesc,
    int *resc, int *rarg, int *rendarg, int *rend)
{
	int		 iesc;		/* index of leading escape char */
	int		 iarg;		/* index beginning the argument */
	int		 iendarg;	/* index right after the argument */
	int		 iend;		/* index right after the sequence */
	int		 sesc, sarg, sendarg, send; /* for sub-escape */
	int		 maxl;		/* expected length of the argument */
	int		 argl;		/* actual length of the argument */
	int		 c, i;		/* for \[char...] parsing */
	enum mandoc_esc	 rval;		/* return value */
	enum mandocerr	 err;		/* diagnostic code */
	char		 esc_name;
	char		 term;		/* byte terminating the argument */

	/*
	 * Treat "\E" just like "\";
	 * it only makes a difference in copy mode.
	 */

	iesc = iarg = aesc;
	do {
		iarg++;
	} while (buf[iarg] == 'E');

	/*
	 * Sort the following cases first by syntax category,
	 * then by escape sequence type, and finally by ASCII code.
	 *
	 * While dispatching, term == '\b' is used as a marker meaning
	 * "quoted argument, delimiter not yet known", term == '\0'
	 * together with maxl == INT_MAX means "standard argument
	 * format, not yet decided between x, (xx, and [xxx]".
	 */

	esc_name = buf[iarg];
	iendarg = iend = ++iarg;
	maxl = INT_MAX;
	term = '\0';
	switch (esc_name) {

	/* Escape sequences taking no arguments at all. */

	case '!':
	case '?':
		rval = ESCAPE_UNSUPP;
		goto out;

	case '%':
	case '&':
	case ')':
	case ',':
	case '/':
	case '^':
	case 'a':
	case 'd':
	case 'r':
	case 't':
	case 'u':
	case '{':
	case '|':
	case '}':
		rval = ESCAPE_IGNORE;
		goto out;

	case '\0':
		/*
		 * The escape character is the last byte of the input.
		 * Let the sequence end at the terminating NUL byte
		 * rather than after it, such that callers advancing
		 * to the end of the sequence stay inside the buffer.
		 */
		iendarg = --iend;
		/* FALLTHROUGH */
	case '\\':
	default:
		iarg--;
		rval = ESCAPE_UNDEF;
		goto out;

	case ' ':
	case '\'':
	case '-':
	case '.':
	case '0':
	case ':':
	case '_':
	case '`':
	case 'e':
	case '~':
		/* The name byte itself serves as the argument. */
		iarg--;
		argl = 1;
		rval = ESCAPE_SPECIAL;
		goto out;
	case 'p':
		rval = ESCAPE_BREAK;
		goto out;
	case 'c':
		rval = ESCAPE_NOSPACE;
		goto out;
	case 'z':
		rval = ESCAPE_SKIPCHAR;
		goto out;

	/* Standard argument format. */

	case '$':
	case '*':
	case 'n':
		rval = ESCAPE_EXPAND;
		break;
	case 'F':
	case 'M':
	case 'O':
	case 'V':
	case 'Y':
	case 'g':
	case 'k':
	case 'm':
		rval = ESCAPE_IGNORE;
		break;
	case '(':
	case '[':
		/* Back up such that the bracket is parsed as usual. */
		rval = ESCAPE_SPECIAL;
		iendarg = iend = --iarg;
		break;
	case 'f':
		rval = ESCAPE_FONT;
		break;

	/* Quoted arguments */

	case 'B':
	case 'w':
		rval = ESCAPE_EXPAND;
		term = '\b';
		break;
	case 'A':
	case 'D':
	case 'H':
	case 'L':
	case 'R':
	case 'S':
	case 'X':
	case 'Z':
	case 'b':
	case 'v':
	case 'x':
		rval = ESCAPE_IGNORE;
		term = '\b';
		break;
	case 'C':
		if (buf[iarg] != '\'') {
			rval = ESCAPE_ERROR;
			goto out;
		}
		rval = ESCAPE_SPECIAL;
		term = '\b';
		break;
	case 'N':
		rval = ESCAPE_NUMBERED;
		term = '\b';
		break;
	case 'h':
		rval = ESCAPE_HORIZ;
		term = '\b';
		break;
	case 'l':
		rval = ESCAPE_HLINE;
		term = '\b';
		break;
	case 'o':
		rval = ESCAPE_OVERSTRIKE;
		term = '\b';
		break;

	/* Sizes support both forms, with additional peculiarities. */

	case 's':
		rval = ESCAPE_IGNORE;
		if (buf[iarg] == '+' || buf[iarg] == '-'||
		    buf[iarg] == ASCII_HYPH)
			iarg++;
		switch (buf[iarg]) {
		case '(':
			maxl = 2;
			iarg++;
			break;
		case '[':
			term = ']';
			iarg++;
			break;
		case '\'':
			term = '\'';
			iarg++;
			break;
		case '1':
		case '2':
		case '3':
			/* Two-digit sizes only without a sign. */
			if (buf[iarg - 1] == 's' &&
			    isdigit((unsigned char)buf[iarg + 1])) {
				maxl = 2;
				break;
			}
			/* FALLTHROUGH */
		default:
			maxl = 1;
			break;
		}
		iendarg = iend = iarg;
	}

	/*
	 * Decide how to end the argument.
	 * If the argument starts with another escape sequence
	 * that requires expansion, report that inner sequence.
	 */

	if ((term == '\b' || (term == '\0' && maxl == INT_MAX)) &&
	    buf[iarg] == buf[iesc] && roff_escape(buf, ln, iendarg,
	    &sesc, &sarg, &sendarg, &send) == ESCAPE_EXPAND)
		goto out_sub;

	if (term == '\b') {
		if ((esc_name == 'N' && isdigit((unsigned char)buf[iarg])) ||
		    (esc_name == 'h' && strchr(" %&()*+-./0123456789:<=>",
		     buf[iarg]) != NULL)) {
			/*
			 * strchr(3) also matches the terminating NUL byte,
			 * so \h at the very end of the input ends up here;
			 * do not advance beyond the terminator in that case.
			 */
			iendarg = iend = buf[iarg] == '\0' ? iarg : iarg + 1;
			rval = ESCAPE_ERROR;
			goto out;
		}
		if (buf[iarg] == '\0') {
			/*
			 * The input ends before the opening delimiter.
			 * Report an empty argument instead of using the
			 * NUL byte as the delimiter, which would make
			 * the loop below scan beyond the end of buf.
			 */
			iendarg = iend = iarg;
			goto out;
		}
		term = buf[iarg++];
	} else if (term == '\0' && maxl == INT_MAX) {
		if (esc_name == 'n' && (buf[iarg] == '+' || buf[iarg] == '-'))
			iarg++;
		switch (buf[iarg]) {
		case '(':
			maxl = 2;
			iarg++;
			break;
		case '[':
			if (buf[++iarg] == ' ') {
				iendarg = iend = iarg + 1;
				rval = ESCAPE_ERROR;
				goto out;
			}
			term = ']';
			break;
		default:
			maxl = 1;
			break;
		}
	}

	/* Advance to the end of the argument. */

	iendarg = iarg;
	while (maxl > 0) {
		if (buf[iendarg] == '\0') {
			/* Ignore an incomplete argument except for \w. */
			if (esc_name != 'w')
				iendarg = iarg;
			break;
		}
		if (buf[iendarg] == term) {
			iend = iendarg + 1;
			break;
		}
		if (esc_name == 'N' &&
		    isdigit((unsigned char)buf[iendarg]) == 0) {
			iend = iendarg + 1;
			break;
		}
		if (buf[iendarg] == buf[iesc]) {
			/*
			 * Nested escape sequence: either request its
			 * expansion or skip it as a whole.  It does
			 * not count against the expected length.
			 */
			if (roff_escape(buf, ln, iendarg,
			    &sesc, &sarg, &sendarg, &send) == ESCAPE_EXPAND)
				goto out_sub;
			iendarg = iend = send;
		} else {
			if (maxl != INT_MAX)
				maxl--;
			iend = ++iendarg;
		}
	}

	/* Complain about arguments that are too short or unterminated. */

	if (resc != NULL && ((maxl != INT_MAX && maxl != 0) ||
	    (term != '\0' && buf[iendarg] != term)))
		mandoc_msg(MANDOCERR_ESC_BAD, ln, iesc, "%s", buf + iesc);

	/* Post-process depending on the content of the argument. */

	argl = iendarg - iarg;
	switch (esc_name) {
	case '*':
		if (resc == NULL && argl == 2 &&
		    buf[iarg] == '.' && buf[iarg + 1] == 'T')
			rval = ESCAPE_DEVICE;
		break;
	case 'O':
		switch (buf[iarg]) {
		case '0':
			rval = ESCAPE_UNSUPP;
			break;
		case '1':
		case '2':
		case '3':
		case '4':
			rval = argl == 1 ? ESCAPE_IGNORE : ESCAPE_ERROR;
			break;
		case '5':
			rval = buf[iarg - 1] == '[' ? ESCAPE_UNSUPP :
			    ESCAPE_ERROR;
			break;
		default:
			rval = ESCAPE_ERROR;
			break;
		}
		break;
	default:
		break;
	}

	switch (rval) {
	case ESCAPE_FONT:
		rval = mandoc_font(buf + iarg, argl);
		break;

	case ESCAPE_SPECIAL:

		/*
		 * The file chars.c only provides one common list of
		 * character names, but \[-] == \- is the only one of
		 * the characters with one-byte names that allows
		 * enclosing the name in brackets.
		 */

		if (term != '\0' && argl == 1 && buf[iarg] != '-') {
			rval = ESCAPE_ERROR;
			break;
		}

		/* Treat \[char...] as an alias for \N'...'. */

		if (buf[iarg] == 'c') {
			if (argl < 6 || argl > 7 ||
			    strncmp(buf + iarg, "char", 4) != 0 ||
			    (int)strspn(buf + iarg + 4, "0123456789")
			     + 4 < argl)
				break;

			/*
			 * Only the digits following "char" form
			 * the character number; skip the prefix.
			 */

			c = 0;
			for (i = iarg + 4; i < iendarg; i++)
				c = 10 * c + (buf[i] - '0');
			if (c < 0x21 || (c > 0x7e && c < 0xa0) || c > 0xff)
				break;
			iarg += 4;
			rval = ESCAPE_NUMBERED;
			break;
		}

		/*
		 * Unicode escapes are defined in groff as \[u0000]
		 * to \[u10FFFF], where the contained value must be
		 * a valid Unicode codepoint.  Here, however, only
		 * check the length and range.
		 */

		if (buf[iarg] != 'u' || argl < 5 || argl > 7)
			break;
		if (argl == 7 &&
		    (buf[iarg + 1] != '1' || buf[iarg + 2] != '0'))
			break;
		if (argl == 6 && buf[iarg + 1] == '0')
			break;
		if (argl == 5 && buf[iarg + 1] == 'D' &&
		    strchr("89ABCDEF", buf[iarg + 2]) != NULL)
			break;
		if ((int)strspn(buf + iarg + 1, "0123456789ABCDEFabcdef")
		    + 1 == argl)
			rval = ESCAPE_UNICODE;
		break;
	default:
		break;
	}
	goto out;

out_sub:
	/* Report the nested sequence that needs expansion first. */
	iesc = sesc;
	iarg = sarg;
	iendarg = sendarg;
	iend = send;
	rval = ESCAPE_EXPAND;

out:
	if (rarg != NULL)
		*rarg = iarg;
	if (rendarg != NULL)
		*rendarg = iendarg;
	if (rend != NULL)
		*rend = iend;
	if (resc == NULL)
		return rval;

	/*
	 * Diagnostic messages are only issued when called
	 * from the parser, not when called from the formatters.
	 */

	*resc = iesc;
	switch (rval) {
	case ESCAPE_ERROR:
		err = MANDOCERR_ESC_BAD;
		break;
	case ESCAPE_UNSUPP:
		err = MANDOCERR_ESC_UNSUPP;
		break;
	case ESCAPE_UNDEF:
		/* An escaped escape character is not reported. */
		if (esc_name == '\\')
			return rval;
		err = MANDOCERR_ESC_UNDEF;
		break;
	case ESCAPE_SPECIAL:
		/* Only complain about unknown character names. */
		if (mchars_spec2cp(buf + iarg, argl) >= 0)
			return rval;
		err = MANDOCERR_ESC_BAD;
		break;
	default:
		return rval;
	}
	mandoc_msg(err, ln, iesc, "%.*s", iend - iesc, buf + iesc);
	return rval;
}