[BACK]Return to parse.c CVS log [TXT][DIR] Up to [cvsweb.bsd.lv] / docbook2mdoc

File: [cvsweb.bsd.lv] / docbook2mdoc / parse.c (download)

Revision 1.5, Thu Mar 28 12:21:10 2019 UTC (5 years ago) by schwarze
Branch: MAIN
Changes since 1.4: +322 -100 lines

The expat library aborts parsing as soon as it encounters invalid
input, and the basic design of the library practically precludes
fixing it.  However, whether the input is well-formed XML or not
is totally irrelevant, and in fact, i have seen real-world documents
from X.org that expat rejects as not well-formed.  Kristaps reports
the same from OpenGL.

We really want to parse *ANYTHING* whatsoever without ever throwing
a fatal error - after all, the point is to convert legacy documents
to a better format, and nitpicking about the syntax merely alienates
users (including myself).

Consequently, ditch expat and write a parser from scratch, optimized
for robustness on invalid input.

Oh, and by the way, it only requires 200 lines of code,
compared to 15,000 lines in expat - an economy of 98.5%
at the sime time as being much more useful in practice.

/* $Id: parse.c,v 1.5 2019/03/28 12:21:10 schwarze Exp $ */
/*
 * Copyright (c) 2014 Kristaps Dzonsons <kristaps@bsd.lv>
 * Copyright (c) 2019 Ingo Schwarze <schwarze@openbsd.org>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */
#include <assert.h>
#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include "node.h"
#include "parse.h"

/*
 * The implementation of the DocBook parser.
 */

/*
 * Global parse state.
 * Keep this as simple and small as possible.
 */
struct	parse {
	const char	*fname;  /* Name of the input file. */
	struct ptree	*tree;   /* Complete parse result. */
	struct pnode	*cur;	 /* Current node in the tree. */
	enum nodeid	 ncur;   /* Type of the current node. */
	int		 line;   /* Line number in the input file. */
	int		 col;	 /* Column number in the input file. */
	int		 nline;  /* Line number of next token. */
	int		 ncol;   /* Column number of next token. */
	int		 del;    /* Levels of nested nodes being deleted. */
	int		 attr;   /* The most recent attribute is valid. */
	int		 warn;
};

struct	element {
	const char	*name;   /* DocBook element name. */
	enum nodeid	 node;   /* Node type to generate. */
};

static	const struct element elements[] = {
	{ "acronym",		NODE_IGNORE },
	{ "affiliation",	NODE_AFFILIATION },
	{ "anchor",		NODE_DELETE },
	{ "application",	NODE_APPLICATION },
	{ "arg",		NODE_ARG },
	{ "author",		NODE_AUTHOR },
	{ "authorgroup",	NODE_AUTHORGROUP },
	{ "blockquote",		NODE_BLOCKQUOTE },
	{ "book",		NODE_BOOK },
	{ "bookinfo",		NODE_BOOKINFO },
	{ "caution",		NODE_CAUTION },
	{ "chapter",		NODE_SECTION },
	{ "citerefentry",	NODE_CITEREFENTRY },
	{ "citetitle",		NODE_CITETITLE },
	{ "cmdsynopsis",	NODE_CMDSYNOPSIS },
	{ "code",		NODE_CODE },
	{ "colspec",		NODE_COLSPEC },
	{ "command",		NODE_COMMAND },
	{ "constant",		NODE_CONSTANT },
	{ "copyright",		NODE_COPYRIGHT },
	{ "date",		NODE_DATE },
	{ "editor",		NODE_EDITOR },
	{ "email",		NODE_EMAIL },
	{ "emphasis",		NODE_EMPHASIS },
	{ "entry",		NODE_ENTRY },
	{ "envar",		NODE_ENVAR },
	{ "fieldsynopsis",	NODE_FIELDSYNOPSIS },
	{ "filename",		NODE_FILENAME },
	{ "firstname",		NODE_IGNORE },
	{ "firstterm",		NODE_FIRSTTERM },
	{ "footnote",		NODE_FOOTNOTE },
	{ "funcdef",		NODE_FUNCDEF },
	{ "funcprototype",	NODE_FUNCPROTOTYPE },
	{ "funcsynopsis",	NODE_FUNCSYNOPSIS },
	{ "funcsynopsisinfo",	NODE_FUNCSYNOPSISINFO },
	{ "function",		NODE_FUNCTION },
	{ "glossterm",		NODE_GLOSSTERM },
	{ "group",		NODE_GROUP },
	{ "holder",		NODE_HOLDER },
	{ "index",		NODE_INDEX },
	{ "indexterm",		NODE_DELETE },
	{ "info",		NODE_INFO },
	{ "informalequation",	NODE_INFORMALEQUATION },
	{ "informaltable",	NODE_INFORMALTABLE },
	{ "inlineequation",	NODE_INLINEEQUATION },
	{ "itemizedlist",	NODE_ITEMIZEDLIST },
	{ "keysym",		NODE_KEYSYM },
	{ "legalnotice",	NODE_LEGALNOTICE },
	{ "link",		NODE_LINK },
	{ "listitem",		NODE_LISTITEM },
	{ "literal",		NODE_LITERAL },
	{ "literallayout",	NODE_LITERALLAYOUT },
	{ "manvolnum",		NODE_MANVOLNUM },
	{ "member",		NODE_MEMBER },
	{ "mml:math",		NODE_MML_MATH },
	{ "mml:mfenced",	NODE_MML_MFENCED },
	{ "mml:mfrac",		NODE_MML_MFRAC },
	{ "mml:mi",		NODE_MML_MI },
	{ "mml:mn",		NODE_MML_MN },
	{ "mml:mo",		NODE_MML_MO },
	{ "mml:mrow",		NODE_MML_MROW },
	{ "mml:msub",		NODE_MML_MSUB },
	{ "mml:msup",		NODE_MML_MSUP },
	{ "modifier",		NODE_MODIFIER },
	{ "note",		NODE_NOTE },
	{ "option",		NODE_OPTION },
	{ "orderedlist",	NODE_ORDEREDLIST },
	{ "orgname",		NODE_ORGNAME },
	{ "othername",		NODE_IGNORE },
	{ "para",		NODE_PARA },
	{ "paramdef",		NODE_PARAMDEF },
	{ "parameter",		NODE_PARAMETER },
	{ "part",		NODE_SECTION },
	{ "personname",		NODE_PERSONNAME },
	{ "phrase",		NODE_IGNORE },
	{ "preface",		NODE_PREFACE },
	{ "primary",		NODE_DELETE },
	{ "programlisting",	NODE_PROGRAMLISTING },
	{ "prompt",		NODE_PROMPT },
	{ "quote",		NODE_QUOTE },
	{ "refclass",		NODE_REFCLASS },
	{ "refdescriptor",	NODE_REFDESCRIPTOR },
	{ "refentry",		NODE_REFENTRY },
	{ "refentryinfo",	NODE_REFENTRYINFO },
	{ "refentrytitle",	NODE_REFENTRYTITLE },
	{ "refmeta",		NODE_REFMETA },
	{ "refmetainfo",	NODE_REFMETAINFO },
	{ "refmiscinfo",	NODE_REFMISCINFO },
	{ "refname",		NODE_REFNAME },
	{ "refnamediv",		NODE_REFNAMEDIV },
	{ "refpurpose",		NODE_REFPURPOSE },
	{ "refsect1",		NODE_SECTION },
	{ "refsect2",		NODE_SECTION },
	{ "refsect3",		NODE_SECTION },
	{ "refsection",		NODE_SECTION },
	{ "refsynopsisdiv",	NODE_REFSYNOPSISDIV },
	{ "releaseinfo",	NODE_RELEASEINFO },
	{ "replaceable",	NODE_REPLACEABLE },
	{ "row",		NODE_ROW },
	{ "sbr",		NODE_SBR },
	{ "screen",		NODE_SCREEN },
	{ "secondary",		NODE_DELETE },
	{ "sect1",		NODE_SECTION },
	{ "sect2",		NODE_SECTION },
	{ "section",		NODE_SECTION },
	{ "sgmltag",		NODE_SGMLTAG },
	{ "simplelist",		NODE_SIMPLELIST },
	{ "spanspec",		NODE_SPANSPEC },
	{ "structname",		NODE_STRUCTNAME },
	{ "subtitle",		NODE_SUBTITLE },
	{ "surname",		NODE_IGNORE },
	{ "synopsis",		NODE_SYNOPSIS },
	{ "table",		NODE_TABLE },
	{ "tbody",		NODE_TBODY },
	{ "term",		NODE_TERM },
	{ "tfoot",		NODE_TFOOT },
	{ "tgroup",		NODE_TGROUP },
	{ "thead",		NODE_THEAD },
	{ "tip",		NODE_TIP },
	{ "title",		NODE_TITLE },
	{ "trademark",		NODE_IGNORE },
	{ "type",		NODE_TYPE },
	{ "ulink",		NODE_ULINK },
	{ "userinput",		NODE_USERINPUT },
	{ "variablelist",	NODE_VARIABLELIST },
	{ "varlistentry",	NODE_VARLISTENTRY },
	{ "varname",		NODE_VARNAME },
	{ "warning",		NODE_WARNING },
	{ "wordasword",		NODE_WORDASWORD },
	{ "xi:include",		NODE_DELETE_WARN },
	{ "year",		NODE_YEAR },
	{ NULL,			NODE_IGNORE }
};

/*
 * Process a string of characters.
 * If a text node is already open, append to it.
 * Otherwise, create a new one as a child of the current node.
 */
static void
xml_char(struct parse *ps, const char *p, int sz)
{
	struct pnode	*dat;

	if (ps->del > 0)
		return;

	if (ps->cur == NULL) {
		fprintf(stderr, "%s:%d:%d: discarding text before docum"
		    "ent: %.*s\n", ps->fname, ps->line, ps->col, sz, p);
		ps->tree->flags |= TREE_FAIL;
		return;
	}

	if (ps->cur->node != NODE_TEXT) {
		if ((dat = calloc(1, sizeof(*dat))) == NULL) {
			perror(NULL);
			exit(1);
		}
		dat->node = NODE_TEXT;
		dat->parent = ps->cur;
		TAILQ_INIT(&dat->childq);
		TAILQ_INIT(&dat->attrq);
		TAILQ_INSERT_TAIL(&ps->cur->childq, dat, child);
		ps->cur = dat;
	}

	if (ps->tree->flags & TREE_CLOSED &&
	    ps->cur->parent == ps->tree->root && ps->warn)
		fprintf(stderr, "%s:%d:%d: warning: "
		    "text after end of document: %.*s\n",
		    ps->fname, ps->line, ps->col, sz, p);

	/* Append to the current text node. */

	assert(sz >= 0);
	ps->cur->b = realloc(ps->cur->b, ps->cur->bsz + sz + 1);
	if (ps->cur->b == NULL) {
		perror(NULL);
		exit(1);
	}
	memcpy(ps->cur->b + ps->cur->bsz, p, sz);
	ps->cur->bsz += sz;
	ps->cur->b[ps->cur->bsz] = '\0';
	ps->cur->real = ps->cur->b;
}

static void
pnode_trim(struct pnode *pn)
{
	assert(pn->node == NODE_TEXT);
	for (; pn->bsz > 0; pn->b[--pn->bsz] = '\0')
		if (isspace((unsigned char)pn->b[pn->bsz - 1]) == 0)
			break;
}

/*
 * Begin an element.
 */
static void
xml_elem_start(struct parse *ps, const char *name)
{
	const struct element	*elem;
	struct pnode		*dat;

	if (*name == '!' || *name == '?')
		return;

	/*
	 * An ancestor is excluded from the tree;
	 * keep track of the number of levels excluded.
	 */
	if (ps->del > 0) {
		ps->del++;
		return;
	}

	/* Close out the text node, if there is one. */
	if (ps->cur != NULL && ps->cur->node == NODE_TEXT) {
		pnode_trim(ps->cur);
		ps->cur = ps->cur->parent;
	}

	for (elem = elements; elem->name != NULL; elem++)
		if (strcmp(elem->name, name) == 0)
			break;

	if (elem->name == NULL) {
		fprintf(stderr, "%s:%d:%d: unknown element <%s>\n",
			ps->fname, ps->line, ps->col, name);
		ps->tree->flags |= TREE_FAIL;
	}
	ps->ncur = elem->node;

	switch (ps->ncur) {
	case NODE_DELETE_WARN:
		if (ps->warn)
			fprintf(stderr, "%s:%d:%d: warning: "
			    "skipping element <%s>\n",
			    ps->fname, ps->line, ps->col, name);
		/* FALLTHROUGH */
	case NODE_DELETE:
		ps->del = 1;
		/* FALLTHROUGH */
	case NODE_IGNORE:
		return;
	case NODE_INLINEEQUATION:
		ps->tree->flags |= TREE_EQN;
		break;
	default:
		break;
	}

	if (ps->tree->flags & TREE_CLOSED &&
	    ps->cur->parent == NULL && ps->warn)
		fprintf(stderr, "%s:%d:%d: warning: "
		    "element after end of document: %s\n",
		    ps->fname, ps->line, ps->col, name);

	if ((dat = calloc(1, sizeof(*dat))) == NULL) {
		perror(NULL);
		exit(1);
	}
	dat->node = elem->node;
	dat->parent = ps->cur;
	TAILQ_INIT(&dat->childq);
	TAILQ_INIT(&dat->attrq);

	if (ps->cur != NULL)
		TAILQ_INSERT_TAIL(&ps->cur->childq, dat, child);

	ps->cur = dat;
	if (ps->tree->root == NULL)
		ps->tree->root = dat;
}

static void
xml_attrkey(struct parse *ps, const char *name)
{
	struct pattr	*attr;
	enum attrkey	 key;

	if (ps->del > 0 || *name == '\0')
		return;
	if ((key = attrkey_parse(name)) == ATTRKEY__MAX) {
		if (ps->warn)
			fprintf(stderr, "%s:%d:%d: warning: "
			    "unknown attribute \"%s\"\n",
			    ps->fname, ps->line, ps->col, name);
		ps->attr = 0;
		return;
	}
	if ((attr = calloc(1, sizeof(*attr))) == NULL) {
		perror(NULL);
		exit(1);
	}
	attr->key = key;
	attr->val = ATTRVAL__MAX;
	attr->rawval = NULL;
	TAILQ_INSERT_TAIL(&ps->cur->attrq, attr, child);
	ps->attr = 1;
}

static void
xml_attrval(struct parse *ps, const char *name)
{
	struct pattr	*attr;

	if (ps->del > 0 || ps->attr == 0)
		return;
	if ((attr = TAILQ_LAST(&ps->cur->attrq, pattrq)) == NULL)
		return;
	if ((attr->val = attrval_parse(name)) == ATTRVAL__MAX &&
	    (attr->rawval = strdup(name)) == NULL) {
		perror(NULL);
		exit(1);
	}
}

/*
 * Roll up the parse tree.
 * If we're at a text node, roll that one up first.
 */
static void
xml_elem_end(struct parse *ps, const char *name)
{
	const struct element	*elem;
	enum nodeid		 node;

	/*
	 * An ancestor is excluded from the tree;
	 * keep track of the number of levels excluded.
	 */
	if (ps->del > 1) {
		ps->del--;
		return;
	}

	/* Close out the text node, if there is one. */
	if (ps->del == 0 && ps->cur != NULL && ps->cur->node == NODE_TEXT) {
		pnode_trim(ps->cur);
		ps->cur = ps->cur->parent;
	}

	if (name != NULL) {
		for (elem = elements; elem->name != NULL; elem++)
			if (strcmp(elem->name, name) == 0)
				break;
		node = elem->node;
	} else
		node = ps->ncur;

	switch (node) {
	case NODE_DELETE_WARN:
	case NODE_DELETE:
		if (ps->del > 0)
			ps->del--;
		break;
	case NODE_IGNORE:
		break;
	default:
		if (ps->cur == NULL || node != ps->cur->node) {
			if (ps->warn)
				fprintf(stderr, "%s:%d:%d: warning: "
				    "element not open: </%s>\n",
				    ps->fname, ps->line, ps->col, name);
			break;
		}

		/*
		 * Refrain from actually closing the document element.
		 * If no more content follows, no harm is done, but if
		 * some content still follows, simply processing it is
		 * obviously better than discarding it or crashing.
		 */

		if (ps->cur->parent == NULL)
			ps->tree->flags |= TREE_CLOSED;
		else
			ps->cur = ps->cur->parent;
		break;
	}
	assert(ps->del == 0);
}

struct parse *
parse_alloc(int warn)
{
	struct parse	*p;

	if ((p = calloc(1, sizeof(*p))) == NULL)
		return NULL;

	if ((p->tree = calloc(1, sizeof(*p->tree))) == NULL) {
		free(p);
		return NULL;
	}
	p->warn = warn;
	return p;
}

void
parse_free(struct parse *p)
{
	if (p == NULL)
		return;
	if (p->tree != NULL) {
		pnode_unlink(p->tree->root);
		free(p->tree);
	}
	free(p);
}

/*
 * Advance the pend pointer to the next character in the charset.
 * If the charset starts with a space, it stands for any whitespace.
 * Update the new input file position, used for messages.
 * Do not overrun the buffer b of length rlen.
 * When reaching the end, NUL-terminate the buffer and return 1;
 * otherwise, return 0.
 */
static int
advance(struct parse *p, char *b, size_t rlen, size_t *pend,
    const char *charset)
{
	int		 space;

	if (*charset == ' ') {
		space = 1;
		charset++;
	} else
		space = 0;

	p->nline = p->line;
	p->ncol = p->col;
	while (*pend < rlen) {
		if (b[*pend] == '\n') {
			p->nline++;
			p->ncol = 1;
		} else
			p->ncol++;
		if (space && isspace((unsigned char)b[*pend]))
			break;
		if (strchr(charset, b[*pend]) != NULL)
			break;
		++*pend;
	}
	if (*pend == rlen) {
		b[rlen] = '\0';
		return 1;
	} else
		return 0;
}

struct ptree *
parse_file(struct parse *p, int fd, const char *fname)
{
	char		 b[4096];
	ssize_t		 rsz;	/* Return value from read(2). */
	size_t		 rlen;  /* Number of bytes in b[]. */
	size_t		 poff;  /* Parse offset in b[]. */
	size_t		 pend;  /* Offset of the end of the current word. */
	int		 in_tag, in_arg, in_quotes, elem_end;

	p->fname = fname;
	p->nline = 1;
	p->ncol = 1;
	rlen = 0;
	in_tag = in_arg = in_quotes = 0;

	/*
	 * Read loop.
	 *
	 * We have to enter the read loop once more even on EOF
	 * because the previous token may have been incomplete,
	 * such that it asked for more input.
	 * Once rsz is 0, incomplete tokens will no longer ask
	 * for more input but instead use whatever there is,
	 * and then exit the read loop.
	 * The minus one on the size limit for read(2) is needed
	 * such that advance() can set b[rlen] to NUL when needed.
	 */

	while ((rsz = read(fd, b + rlen, sizeof(b) - rlen - 1)) >= 0) {
		if ((rlen += rsz) == 0)
			break;

		/* Token loop. */

		pend = 0;
		for (;;) {

			/* Proceed to the next token, skipping whitespace. */

			p->line = p->nline;
			p->col = p->ncol;
			if ((poff = pend) == rlen)
				break;
			if (isspace((unsigned char)b[pend])) {
				if (b[pend++] == '\n') {
					p->nline++;
					p->ncol = 1;
				} else
					p->ncol++;
				continue;
			}

			/*
			 * The following three cases (in_arg, in_tag,
			 * and starting a tag) all parse a word or
			 * quoted string.  If that extends beyond the
			 * read buffer and the last read(2) still got
			 * data, they all break out of the token loop
			 * to request more data from the read loop.
			 *
			 * Also, they all detect self-closing tags,
			 * those ending with "/>", setting the flag
			 * elem_end and calling xml_elem_end() at the
			 * very end, after handling the attribute value,
			 * attribute name, or tag name, respectively.
			 */

			/* Parse an attribute value. */

			if (in_arg) {
				if (in_quotes == 0 && b[pend] == '"') {
					in_quotes = 1;
					p->ncol++;
					pend++;
					continue;
				}
				if (advance(p, b, rlen, &pend,
				    in_quotes ? "\"" : " >") && rsz > 0)
					break;
				in_arg = in_quotes = elem_end = 0;
				if (b[pend] == '>') {
					in_tag = 0;
					if (pend > 0 && b[pend - 1] == '/') {
						b[pend - 1] = '\0';
						elem_end = 1;
					}
				}
				b[pend] = '\0';
				if (pend < rlen)
					pend++;
				xml_attrval(p, b + poff);
				if (elem_end)
					xml_elem_end(p, NULL);

			/* Look for an attribute name. */

			} else if (in_tag) {
				if (advance(p, b, rlen, &pend, " =>") &&
				    rsz > 0)
					break;
				elem_end = 0;
				switch (b[pend]) {
				case '>':
					in_tag = 0;
					if (pend > 0 && b[pend - 1] == '/') {
						b[pend - 1] = '\0';
						elem_end = 1;
					}
					break;
				case '=':
					in_arg = 1;
					break;
				default:
					break;
				}
				b[pend] = '\0';
				if (pend < rlen)
					pend++;
				xml_attrkey(p, b + poff);
				if (elem_end)
					xml_elem_end(p, NULL);

			/* Begin an opening or closing tag. */

			} else if (b[poff] == '<') {
				if (advance(p, b, rlen, &pend, " >") &&
				    rsz > 0)
					break;
				elem_end = 0;
				if (b[pend] != '>')
					in_tag = 1;
				else if (pend > 0 && b[pend - 1] == '/') {
					b[pend - 1] = '\0';
					elem_end = 1;
				}
				b[pend] = '\0';
				if (pend < rlen)
					pend++;
				if (b[++poff] == '/') {
					elem_end = 1;
					poff++;
				} else
					xml_elem_start(p, b + poff);
				if (elem_end)
					xml_elem_end(p, b + poff);

			/* Process text up to the next tag. */

			} else {
				if (advance(p, b, rlen, &pend, "<") == 0)
					p->ncol--;
				xml_char(p, b + poff, pend - poff);
			}
		}

		/* Buffer exhausted; shift left and re-fill. */

		assert(poff > 0);
		memmove(b, b + poff, rlen - poff);
		rlen -= poff;
	}
	if (rsz < 0) {
		perror(fname);
		p->tree->flags |= TREE_FAIL;
	}
	if (p->cur != NULL && p->cur->node == NODE_TEXT) {
		pnode_trim(p->cur);
		p->cur = p->cur->parent;
	}
	if ((p->tree->flags & TREE_CLOSED) == 0 && p->warn)
		fprintf(stderr, "%s:%d:%d: warning: document not closed\n",
		    p->fname, p->line, p->col);
	return p->tree;
}