/* $Id: parse.c,v 1.23 2019/04/08 14:37:31 schwarze Exp $ */
/*
 * Copyright (c) 2014 Kristaps Dzonsons <kristaps@bsd.lv>
 * Copyright (c) 2019 Ingo Schwarze <schwarze@openbsd.org>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */
#include <assert.h>
#include <ctype.h>
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include "node.h"
#include "parse.h"

/*
 * The implementation of the DocBook parser.
 */

/*
 * States of the tokenizer in parse_string().
 * The order of the constants matters: parse_string() tests
 * "*pstate >= PARSE_ARG" to recognize the three attribute value
 * states with a single comparison.
 */
enum	pstate {
	PARSE_ELEM,	/* Outside of tags: text, entities, tag starts. */
	PARSE_TAG,	/* Inside a tag, looking for an attribute name. */
	PARSE_ARG,	/* Expecting an attribute value. */
	PARSE_SQ,	/* Inside a single-quoted attribute value. */
	PARSE_DQ	/* Inside a double-quoted attribute value. */
};

/*
 * Global parse state.
 * Keep this as simple and small as possible.
 * The structure is allocated zero-filled by parse_alloc().
 */
struct	parse {
	const char	*fname;  /* Name of the input file. */
	struct ptree	*tree;   /* Complete parse result. */
	struct pnode	*doctype; /* First DOCTYPE node seen, or NULL. */
	struct pnode	*cur;	 /* Current node in the tree. */
	enum nodeid	 ncur;   /* Type of the current node. */
	int		 line;   /* Line number in the input file. */
	int		 col;	 /* Column number in the input file. */
	int		 nline;  /* Line number of next token. */
	int		 ncol;   /* Column number of next token. */
	int		 del;    /* Levels of nested nodes being deleted. */
	int		 flags;  /* Bitwise OR of the PFLAG_* bits below. */
#define	PFLAG_WARN	 (1 << 0)  /* Print warning messages. */
#define	PFLAG_SPC	 (1 << 1)  /* Whitespace before the next element. */
#define	PFLAG_ATTR	 (1 << 2)  /* The most recent attribute is valid. */
#define	PFLAG_EEND	 (1 << 3)  /* This element is self-closing. */
};

/*
 * One entry of the elements[] table:
 * the node type to use for a given DocBook element name.
 */
struct	element {
	const char	*name;   /* DocBook element name. */
	enum nodeid	 node;   /* Node type to generate. */
};

/*
 * Table of the recognized element names.
 * It is searched linearly with strcmp(3) by xml_elem_start()
 * and xml_elem_end(), so names are matched case-sensitively.
 * Several names may map to the same node type.
 * The table is terminated by an entry with a NULL name; a failed
 * lookup ends on that entry and thus yields NODE_IGNORE.
 */
static	const struct element elements[] = {
	{ "acronym",		NODE_IGNORE },
	{ "affiliation",	NODE_AFFILIATION },
	{ "anchor",		NODE_DELETE },
	{ "appendix",		NODE_APPENDIX },
	{ "application",	NODE_APPLICATION },
	{ "arg",		NODE_ARG },
	{ "article",		NODE_SECTION },
	{ "author",		NODE_AUTHOR },
	{ "authorgroup",	NODE_AUTHORGROUP },
	{ "blockquote",		NODE_BLOCKQUOTE },
	{ "book",		NODE_SECTION },
	{ "bookinfo",		NODE_BOOKINFO },
	{ "caution",		NODE_CAUTION },
	{ "chapter",		NODE_SECTION },
	{ "citerefentry",	NODE_CITEREFENTRY },
	{ "citetitle",		NODE_CITETITLE },
	{ "cmdsynopsis",	NODE_CMDSYNOPSIS },
	{ "code",		NODE_LITERAL },
	{ "colspec",		NODE_COLSPEC },
	{ "command",		NODE_COMMAND },
	{ "constant",		NODE_CONSTANT },
	{ "contrib",		NODE_CONTRIB },
	{ "copyright",		NODE_COPYRIGHT },
	{ "date",		NODE_DATE },
	{ "!doctype",		NODE_DOCTYPE },
	{ "!DOCTYPE",		NODE_DOCTYPE },
	{ "editor",		NODE_EDITOR },
	{ "email",		NODE_EMAIL },
	{ "emphasis",		NODE_EMPHASIS },
	{ "!ENTITY",		NODE_ENTITY },
	{ "entry",		NODE_ENTRY },
	{ "envar",		NODE_ENVAR },
	{ "errorname",		NODE_ERRORNAME },
	{ "fieldsynopsis",	NODE_FIELDSYNOPSIS },
	{ "filename",		NODE_FILENAME },
	{ "firstname",		NODE_PERSONNAME },
	{ "firstterm",		NODE_FIRSTTERM },
	{ "footnote",		NODE_FOOTNOTE },
	{ "funcdef",		NODE_FUNCDEF },
	{ "funcprototype",	NODE_FUNCPROTOTYPE },
	{ "funcsynopsis",	NODE_FUNCSYNOPSIS },
	{ "funcsynopsisinfo",	NODE_FUNCSYNOPSISINFO },
	{ "function",		NODE_FUNCTION },
	{ "glossary",		NODE_VARIABLELIST },
	{ "glossdef",		NODE_IGNORE },
	{ "glossdiv",		NODE_IGNORE },
	{ "glossentry",		NODE_VARLISTENTRY },
	{ "glosslist",		NODE_VARIABLELIST },
	{ "glossterm",		NODE_GLOSSTERM },
	{ "group",		NODE_GROUP },
	{ "holder",		NODE_HOLDER },
	{ "index",		NODE_INDEX },
	{ "indexterm",		NODE_DELETE },
	{ "info",		NODE_INFO },
	{ "informalequation",	NODE_INFORMALEQUATION },
	{ "informaltable",	NODE_TABLE },
	{ "inlineequation",	NODE_INLINEEQUATION },
	{ "itemizedlist",	NODE_ITEMIZEDLIST },
	{ "keysym",		NODE_KEYSYM },
	{ "legalnotice",	NODE_LEGALNOTICE },
	{ "link",		NODE_LINK },
	{ "listitem",		NODE_LISTITEM },
	{ "literal",		NODE_LITERAL },
	{ "literallayout",	NODE_LITERALLAYOUT },
	{ "manvolnum",		NODE_MANVOLNUM },
	{ "member",		NODE_MEMBER },
	{ "mml:math",		NODE_MML_MATH },
	{ "mml:mfenced",	NODE_MML_MFENCED },
	{ "mml:mfrac",		NODE_MML_MFRAC },
	{ "mml:mi",		NODE_MML_MI },
	{ "mml:mn",		NODE_MML_MN },
	{ "mml:mo",		NODE_MML_MO },
	{ "mml:mrow",		NODE_MML_MROW },
	{ "mml:msub",		NODE_MML_MSUB },
	{ "mml:msup",		NODE_MML_MSUP },
	{ "modifier",		NODE_MODIFIER },
	{ "note",		NODE_NOTE },
	{ "option",		NODE_OPTION },
	{ "orderedlist",	NODE_ORDEREDLIST },
	{ "orgname",		NODE_ORGNAME },
	{ "othername",		NODE_PERSONNAME },
	{ "para",		NODE_PARA },
	{ "paramdef",		NODE_PARAMDEF },
	{ "parameter",		NODE_PARAMETER },
	{ "part",		NODE_SECTION },
	{ "personname",		NODE_PERSONNAME },
	{ "phrase",		NODE_IGNORE },
	{ "preface",		NODE_PREFACE },
	{ "primary",		NODE_DELETE },
	{ "programlisting",	NODE_PROGRAMLISTING },
	{ "prompt",		NODE_PROMPT },
	{ "quote",		NODE_QUOTE },
	{ "refclass",		NODE_REFCLASS },
	{ "refdescriptor",	NODE_REFDESCRIPTOR },
	{ "refentry",		NODE_REFENTRY },
	{ "refentryinfo",	NODE_REFENTRYINFO },
	{ "refentrytitle",	NODE_REFENTRYTITLE },
	{ "refmeta",		NODE_REFMETA },
	{ "refmetainfo",	NODE_REFMETAINFO },
	{ "refmiscinfo",	NODE_REFMISCINFO },
	{ "refname",		NODE_REFNAME },
	{ "refnamediv",		NODE_REFNAMEDIV },
	{ "refpurpose",		NODE_REFPURPOSE },
	{ "refsect1",		NODE_SECTION },
	{ "refsect2",		NODE_SECTION },
	{ "refsect3",		NODE_SECTION },
	{ "refsection",		NODE_SECTION },
	{ "refsynopsisdiv",	NODE_REFSYNOPSISDIV },
	{ "releaseinfo",	NODE_RELEASEINFO },
	{ "replaceable",	NODE_REPLACEABLE },
	{ "row",		NODE_ROW },
	{ "sbr",		NODE_SBR },
	{ "screen",		NODE_SCREEN },
	{ "secondary",		NODE_DELETE },
	{ "sect1",		NODE_SECTION },
	{ "sect2",		NODE_SECTION },
	{ "section",		NODE_SECTION },
	{ "sgmltag",		NODE_SGMLTAG },
	{ "simpara",		NODE_PARA },
	{ "simplelist",		NODE_SIMPLELIST },
	{ "spanspec",		NODE_SPANSPEC },
	{ "structfield",	NODE_PARAMETER },
	{ "structname",		NODE_TYPE },
	{ "subtitle",		NODE_SUBTITLE },
	{ "surname",		NODE_PERSONNAME },
	{ "symbol",		NODE_CONSTANT },
	{ "synopsis",		NODE_SYNOPSIS },
	{ "table",		NODE_TABLE },
	{ "tbody",		NODE_TBODY },
	{ "term",		NODE_TERM },
	{ "tfoot",		NODE_TFOOT },
	{ "tgroup",		NODE_TGROUP },
	{ "thead",		NODE_THEAD },
	{ "tip",		NODE_TIP },
	{ "title",		NODE_TITLE },
	{ "trademark",		NODE_IGNORE },
	{ "type",		NODE_TYPE },
	{ "ulink",		NODE_LINK },
	{ "userinput",		NODE_LITERAL },
	{ "variablelist",	NODE_VARIABLELIST },
	{ "varlistentry",	NODE_VARLISTENTRY },
	{ "varname",		NODE_VARNAME },
	{ "warning",		NODE_WARNING },
	{ "wordasword",		NODE_WORDASWORD },
	{ "xi:include",		NODE_DELETE_WARN },
	{ "year",		NODE_YEAR },
	{ NULL,			NODE_IGNORE }
};

struct	entity {
	const char	*name;
	const char	*roff;
};

/*
 * XML character entity references found in the wild.
 * Those that don't have an exact mandoc_char(7) representation
 * are approximated, and the desired codepoint is given as a comment.
 * Encoding them as \\[u...] would leave -Tascii out in the cold.
 * The table is searched linearly and case-sensitively by xml_entity()
 * and is terminated by an entry with a NULL name and NULL roff.
 */
static	const struct entity entities[] = {
	{ "alpha",	"\\(*a" },
	{ "amp",	"&" },
	{ "apos",	"'" },
	{ "auml",	"\\(:a" },
	{ "beta",	"\\(*b" },
	{ "circ",	"^" },      /* U+02C6 */
	{ "copy",	"\\(co" },
	{ "dagger",	"\\(dg" },
	{ "Delta",	"\\(*D" },
	{ "eacute",	"\\('e" },
	{ "emsp",	"\\ " },    /* U+2003 */
	{ "gt",		">" },
	{ "hairsp",	"\\^" },
	{ "kappa",	"\\(*k" },
	{ "larr",	"\\(<-" },
	{ "ldquo",	"\\(lq" },
	{ "le",		"\\(<=" },
	{ "lowbar",	"_" },
	{ "lsqb",	"[" },
	{ "lt",		"<" },
	{ "mdash",	"\\(em" },
	{ "minus",	"\\-" },
	{ "ndash",	"\\(en" },
	{ "nbsp",	"\\ " },
	{ "num",	"#" },
	{ "oslash",	"\\(/o" },
	{ "ouml",	"\\(:o" },
	{ "percnt",	"%" },
	{ "quot",	"\\(dq" },
	{ "rarr",	"\\(->" },
	{ "rArr",	"\\(rA" },
	{ "rdquo",	"\\(rq" },
	{ "reg",	"\\(rg" },
	{ "rho",	"\\(*r" },
	{ "rsqb",	"]" },
	{ "sigma",	"\\(*s" },
	{ "shy",	"\\&" },     /* U+00AD */
	{ "tau",	"\\(*t" },
	{ "tilde",	"\\[u02DC]" },
	{ "times",	"\\[tmu]" },
	{ "uuml",	"\\(:u" },
	{ NULL,		NULL }
};

/*
 * Forward declaration: xml_entity() calls parse_string() to expand
 * entity definitions before parse_string() itself is defined.
 */
static size_t	 parse_string(struct parse *, char *, size_t,
			 enum pstate *, int);


/*
 * Report an error on standard error output, prefixed with the file
 * name and the current line and column, and flag the tree as failed.
 * The arguments after p are a printf(3)-style format string
 * and the values it refers to.
 */
static void
error_msg(struct parse *p, const char *fmt, ...)
{
	va_list		 args;

	p->tree->flags |= TREE_FAIL;

	fprintf(stderr, "%s:%d:%d: ", p->fname, p->line, p->col);
	va_start(args, fmt);
	vfprintf(stderr, fmt, args);
	va_end(args);
	fputs("\n", stderr);
}

/*
 * Report a warning on standard error output, prefixed with the file
 * name and the current line and column.
 * Nothing is printed unless PFLAG_WARN is set in the parse state.
 * The arguments after p are a printf(3)-style format string
 * and the values it refers to.
 */
static void
warn_msg(struct parse *p, const char *fmt, ...)
{
	va_list		 args;

	if (p->flags & PFLAG_WARN) {
		fprintf(stderr, "%s:%d:%d: warning: ",
		    p->fname, p->line, p->col);
		va_start(args, fmt);
		vfprintf(stderr, fmt, args);
		va_end(args);
		fputs("\n", stderr);
	}
}

/*
 * Handle sz bytes of character data starting at p.
 * The data is dropped while inside a deleted element, and it is
 * rejected with an error message if no node is open yet.
 * Otherwise, it is appended to the open text node, or to a new text
 * node created as the last child of the current node.
 * If whitespace is pending (PFLAG_SPC) and the text node already
 * contains data, one blank is inserted before the new data.
 */
static void
xml_char(struct parse *ps, const char *p, int sz)
{
	struct pnode	*txt;
	size_t		 totsz;
	int		 addspc;

	if (ps->del > 0)
		return;

	if (ps->cur == NULL) {
		error_msg(ps, "discarding text before document: %.*s", sz, p);
		return;
	}

	/* Open a new text node unless one is already open. */

	if (ps->cur->node != NODE_TEXT) {
		txt = calloc(1, sizeof(*txt));
		if (txt == NULL) {
			perror(NULL);
			exit(1);
		}
		txt->node = NODE_TEXT;
		txt->spc = (ps->flags & PFLAG_SPC) != 0;
		txt->parent = ps->cur;
		TAILQ_INIT(&txt->childq);
		TAILQ_INIT(&txt->attrq);
		TAILQ_INSERT_TAIL(&ps->cur->childq, txt, child);
		ps->cur = txt;
	}
	txt = ps->cur;

	if ((ps->tree->flags & TREE_CLOSED) &&
	    txt->parent == ps->tree->root)
		warn_msg(ps, "text after end of document: %.*s", sz, p);

	/* Grow the buffer and copy the new data to its end. */

	assert(sz >= 0);
	addspc = txt->bsz && (ps->flags & PFLAG_SPC);
	totsz = txt->bsz + addspc + sz;
	txt->b = realloc(txt->b, totsz + 1);
	if (txt->b == NULL) {
		perror(NULL);
		exit(1);
	}
	if (addspc)
		txt->b[txt->bsz++] = ' ';
	memcpy(txt->b + txt->bsz, p, sz);
	txt->bsz = totsz;
	txt->b[totsz] = '\0';
	txt->real = txt->b;

	/* The pending whitespace has been dealt with. */
	ps->flags &= ~PFLAG_SPC;
}

/*
 * If the current node is a text node, make its parent the current
 * node and remove trailing whitespace from the text.
 * If any whitespace was removed, remember that by setting PFLAG_SPC.
 * Do nothing if no text node is open.
 */
static void
pnode_closetext(struct parse *p)
{
	struct pnode	*txt;

	txt = p->cur;
	if (txt == NULL || txt->node != NODE_TEXT)
		return;

	p->cur = txt->parent;

	for (; txt->bsz > 0 &&
	    isspace((unsigned char)txt->b[txt->bsz - 1]); txt->bsz--) {
		txt->b[txt->bsz - 1] = '\0';
		p->flags |= PFLAG_SPC;
	}
}

/*
 * Process a reference to the entity called name.
 * The reference is dropped while inside a deleted element, and it is
 * rejected with an error message if no node is open yet.
 * Any open text node is closed first.
 *
 * If the name is found in the entities[] table, a NODE_ESCAPE node
 * containing the replacement text is appended to the current node.
 * Otherwise, the children of the DOCTYPE node are searched for one
 * having a matching NAME attribute and a DEFINITION attribute, and
 * a copy of that definition is parsed in place of the reference.
 * References that cannot be resolved are reported as errors.
 *
 * Because a definition may itself contain entity references, this
 * function can be re-entered through parse_string().  The names of
 * the definitions being expanded are recorded such that an entity
 * referring to itself, directly or indirectly, is reported as an
 * error instead of recursing until the stack overflows.
 */
static void
xml_entity(struct parse *p, const char *name)
{
	enum { ENTITY_DEPTH_MAX = 32 };

	/* Names of the user-defined entities currently being expanded. */
	static const char	*active[ENTITY_DEPTH_MAX];
	static size_t		 nactive;

	const struct entity	*entity;
	struct pnode		*dat;
	const char		*ccp;
	char			*cp;
	size_t			 i;
	enum pstate		 pstate;

	if (p->del > 0)
		return;

	if (p->cur == NULL) {
		error_msg(p, "discarding entity before document: &%s;", name);
		return;
	}

	pnode_closetext(p);

	if (p->tree->flags & TREE_CLOSED && p->cur == p->tree->root)
		warn_msg(p, "entity after end of document: &%s;", name);

	for (entity = entities; entity->name != NULL; entity++)
		if (strcmp(name, entity->name) == 0)
			break;

	/* Not predefined: look for a definition in the DOCTYPE. */

	if (entity->roff == NULL) {
		if (p->doctype != NULL) {
			TAILQ_FOREACH(dat, &p->doctype->childq, child) {
				if ((ccp = pnode_getattr_raw(dat,
				     ATTRKEY_NAME, NULL)) == NULL ||
				    strcmp(ccp, name) != 0 ||
				    (ccp = pnode_getattr_raw(dat,
				     ATTRKEY_DEFINITION, NULL)) == NULL)
					continue;

				/* Refuse to expand an entity within itself. */

				for (i = 0; i < nactive; i++) {
					if (strcmp(active[i], name) != 0)
						continue;
					error_msg(p,
					    "recursive entity &%s;", name);
					return;
				}
				if (nactive == ENTITY_DEPTH_MAX) {
					error_msg(p, "entity nesting "
					    "too deep: &%s;", name);
					return;
				}

				/*
				 * Parse a private copy because
				 * parse_string() modifies its buffer.
				 */

				if ((cp = strdup(ccp)) == NULL) {
					perror(NULL);
					exit(1);
				}
				pstate = PARSE_ELEM;
				active[nactive++] = name;
				parse_string(p, cp, strlen(cp), &pstate, 0);
				nactive--;
				p->flags &= ~PFLAG_SPC;
				free(cp);
				return;
			}
		}
		error_msg(p, "unknown entity &%s;", name);
		return;
	}

	/* Create, append, and close out an entity node. */
	if ((dat = calloc(1, sizeof(*dat))) == NULL ||
	    (dat->b = dat->real = strdup(entity->roff)) == NULL) {
		perror(NULL);
		exit(1);
	}
	dat->node = NODE_ESCAPE;
	dat->bsz = strlen(dat->b);
	dat->spc = (p->flags & PFLAG_SPC) != 0;
	dat->parent = p->cur;
	TAILQ_INIT(&dat->childq);
	TAILQ_INIT(&dat->attrq);
	TAILQ_INSERT_TAIL(&p->cur->childq, dat, child);
	p->flags &= ~PFLAG_SPC;
}

/*
 * Begin an element.
 */
static void
xml_elem_start(struct parse *ps, const char *name)
{
	const struct element	*elem;
	struct pnode		*dat;

	/*
	 * An ancestor is excluded from the tree;
	 * keep track of the number of levels excluded.
	 */
	if (ps->del > 0) {
		if (*name != '!' && *name != '?')
			ps->del++;
		return;
	}

	pnode_closetext(ps);

	for (elem = elements; elem->name != NULL; elem++)
		if (strcmp(elem->name, name) == 0)
			break;

	if (elem->name == NULL) {
		if (*name == '!' || *name == '?')
			return;
		error_msg(ps, "unknown element <%s>", name);
	}

	ps->ncur = elem->node;

	switch (ps->ncur) {
	case NODE_DELETE_WARN:
		warn_msg(ps, "skipping element <%s>", name);
		/* FALLTHROUGH */
	case NODE_DELETE:
		ps->del = 1;
		/* FALLTHROUGH */
	case NODE_IGNORE:
		return;
	case NODE_INLINEEQUATION:
		ps->tree->flags |= TREE_EQN;
		break;
	default:
		break;
	}

	if (ps->tree->flags & TREE_CLOSED && ps->cur->parent == NULL)
		warn_msg(ps, "element after end of document: <%s>", name);

	if ((dat = calloc(1, sizeof(*dat))) == NULL) {
		perror(NULL);
		exit(1);
	}

	/*
	 * Nodes that begin a new macro or request line or start by
	 * printing text always want whitespace before themselves.
	 */

	switch (dat->node = elem->node) {
	case NODE_DOCTYPE:
	case NODE_ENTITY:
	case NODE_SBR:
		ps->flags |= PFLAG_EEND;
		/* FALLTHROUGH */
	case NODE_APPENDIX:
	case NODE_AUTHORGROUP:
	case NODE_BLOCKQUOTE:
	case NODE_BOOKINFO:
	case NODE_CAUTION:
	case NODE_EDITOR:
	case NODE_ENTRY:
	case NODE_FUNCDEF:
	case NODE_FUNCPROTOTYPE:
	case NODE_INFORMALEQUATION:
	case NODE_INLINEEQUATION:
	case NODE_ITEMIZEDLIST:
	case NODE_LEGALNOTICE:
	case NODE_LITERALLAYOUT:
	case NODE_NOTE:
	case NODE_ORDEREDLIST:
	case NODE_PARA:
	case NODE_PREFACE:
	case NODE_PROGRAMLISTING:
	case NODE_REFMETA:
	case NODE_REFNAMEDIV:
	case NODE_REFSYNOPSISDIV:
	case NODE_ROW:
	case NODE_SCREEN:
	case NODE_SECTION:
	case NODE_SYNOPSIS:
	case NODE_TGROUP:
	case NODE_TIP:
	case NODE_TITLE:
	case NODE_VARIABLELIST:
	case NODE_VARLISTENTRY:
	case NODE_WARNING:
		dat->spc = 1;
		break;
	default:
		dat->spc = (ps->flags & PFLAG_SPC) != 0;
		break;
	}
	dat->parent = ps->cur;
	TAILQ_INIT(&dat->childq);
	TAILQ_INIT(&dat->attrq);

	if (ps->cur != NULL)
		TAILQ_INSERT_TAIL(&ps->cur->childq, dat, child);

	ps->cur = dat;
	if (dat->node == NODE_DOCTYPE) {
		if (ps->doctype == NULL)
			ps->doctype = dat;
		else
			error_msg(ps, "duplicate doctype");
	} else if (dat->parent == NULL && ps->tree->root == NULL)
		ps->tree->root = dat;
}

/*
 * Process the attribute name found in a tag and append a new
 * attribute without a value to the current node.
 * Nothing is done inside deleted or ignored elements or for an
 * empty name.  If attrkey_parse() does not recognize the name,
 * PFLAG_ATTR is cleared such that the following value is skipped.
 *
 * DOCTYPE and ENTITY declarations are special: the first word
 * is not an attribute name but becomes the value of a "NAME"
 * attribute, and for an ENTITY, a "DEFINITION" attribute is
 * appended right away to receive the value that follows.
 */
static void
xml_attrkey(struct parse *ps, const char *name)
{
	struct pattr	*attr;
	const char	*value;
	enum attrkey	 key;

	if (ps->del > 0 || ps->ncur == NODE_IGNORE || *name == '\0')
		return;

	/*
	 * There may be no node to attach the attribute to: when a
	 * top-level DOCTYPE is closed, ps->cur becomes NULL while
	 * ps->ncur keeps its old value, and unknown "<?...>" and
	 * "<!...>" tags do not open a node.  Skip the attribute
	 * and its value instead of dereferencing a NULL pointer.
	 */
	if (ps->cur == NULL) {
		ps->flags &= ~PFLAG_ATTR;
		return;
	}

	if ((ps->ncur == NODE_DOCTYPE || ps->ncur == NODE_ENTITY) &&
	    TAILQ_FIRST(&ps->cur->attrq) == NULL) {
		value = name;
		name = "NAME";
	} else
		value = NULL;

	if ((key = attrkey_parse(name)) == ATTRKEY__MAX) {
		ps->flags &= ~PFLAG_ATTR;
		return;
	}
	if ((attr = calloc(1, sizeof(*attr))) == NULL) {
		perror(NULL);
		exit(1);
	}
	attr->key = key;
	attr->val = ATTRVAL__MAX;
	if (value == NULL) {
		/* The value is expected to follow. */
		attr->rawval = NULL;
		ps->flags |= PFLAG_ATTR;
	} else {
		/* The attribute is already complete. */
		if ((attr->rawval = strdup(value)) == NULL) {
			perror(NULL);
			exit(1);
		}
		ps->flags &= ~PFLAG_ATTR;
	}
	TAILQ_INSERT_TAIL(&ps->cur->attrq, attr, child);
	if (ps->ncur == NODE_ENTITY && key == ATTRKEY_NAME)
		xml_attrkey(ps, "DEFINITION");
}

/*
 * Process an attribute value (passed in the parameter called name)
 * and store it in the last attribute of the current node.
 * Nothing is done inside deleted or ignored elements, or if the
 * preceding attribute name was not accepted (PFLAG_ATTR is clear).
 * If attrval_parse() does not recognize the value, a copy of the
 * string is saved as the raw value of the attribute.
 */
static void
xml_attrval(struct parse *ps, const char *name)
{
	struct pattr	*attr;
	char		*rawval;

	if (ps->del > 0 || ps->ncur == NODE_IGNORE ||
	    (ps->flags & PFLAG_ATTR) == 0)
		return;

	/* Without a current node, there is no attribute to fill in. */
	if (ps->cur == NULL)
		return;
	if ((attr = TAILQ_LAST(&ps->cur->attrq, pattrq)) == NULL)
		return;
	if ((attr->val = attrval_parse(name)) != ATTRVAL__MAX)
		return;
	if ((rawval = strdup(name)) == NULL) {
		perror(NULL);
		exit(1);
	}

	/*
	 * PFLAG_ATTR is not cleared here, so another value may arrive
	 * for the same attribute, for example a second quoted string
	 * in a DOCTYPE declaration.  The last value wins; release the
	 * one it replaces rather than leaking it.
	 */
	free(attr->rawval);
	attr->rawval = rawval;
}

/*
 * Roll up the parse tree.
 * If we're at a text node, roll that one up first.
 */
static void
xml_elem_end(struct parse *ps, const char *name)
{
	const struct element	*elem;
	enum nodeid		 node;

	/*
	 * An ancestor is excluded from the tree;
	 * keep track of the number of levels excluded.
	 */
	if (ps->del > 1) {
		ps->del--;
		return;
	}

	if (ps->del == 0)
		pnode_closetext(ps);

	if (name != NULL) {
		for (elem = elements; elem->name != NULL; elem++)
			if (strcmp(elem->name, name) == 0)
				break;
		node = elem->node;
	} else
		node = ps->ncur;

	switch (node) {
	case NODE_DELETE_WARN:
	case NODE_DELETE:
		if (ps->del > 0)
			ps->del--;
		break;
	case NODE_IGNORE:
		break;
	case NODE_DOCTYPE:
		ps->flags &= ~PFLAG_EEND;
		/* FALLTHROUGH */
	default:
		if (ps->cur == NULL || node != ps->cur->node) {
			warn_msg(ps, "element not open: </%s>", name);
			break;
		}

		/*
		 * Refrain from actually closing the document element.
		 * If no more content follows, no harm is done, but if
		 * some content still follows, simply processing it is
		 * obviously better than discarding it or crashing.
		 */

		if (ps->cur->parent != NULL || node == NODE_DOCTYPE) {
			ps->cur = ps->cur->parent;
			if (ps->cur != NULL)
				ps->ncur = ps->cur->node;
		} else
			ps->tree->flags |= TREE_CLOSED;
		ps->flags &= ~PFLAG_SPC;
		break;
	}
	assert(ps->del == 0);
}

/*
 * Allocate a zero-filled parse state including an empty parse tree.
 * Warning messages are enabled if warn is non-zero.
 * Return NULL if memory allocation fails.
 * The result is to be released with parse_free().
 */
struct parse *
parse_alloc(int warn)
{
	struct parse	*ps;

	ps = calloc(1, sizeof(*ps));
	if (ps == NULL)
		return NULL;

	ps->tree = calloc(1, sizeof(*ps->tree));
	if (ps->tree == NULL) {
		free(ps);
		return NULL;
	}

	ps->flags = warn ?
	    (ps->flags | PFLAG_WARN) : (ps->flags & ~PFLAG_WARN);
	return ps;
}

void
parse_free(struct parse *p)
{
	if (p == NULL)
		return;
	if (p->tree != NULL) {
		pnode_unlink(p->tree->root);
		free(p->tree);
	}
	free(p);
}

/*
 * Step over the byte at offset *pend in the buffer b.
 * If refill is non-zero, also update the position of the next
 * token: a newline character starts a new line at column 1,
 * any other byte advances the column by one.
 */
static void
increment(struct parse *p, char *b, size_t *pend, int refill)
{
	size_t		 pos;

	pos = (*pend)++;
	if (refill == 0)
		return;

	if (b[pos] != '\n') {
		p->ncol++;
		return;
	}
	p->nline++;
	p->ncol = 1;
}

/*
 * Advance the pend pointer to the next character in the charset.
 * If the charset starts with a space, it stands for any whitespace.
 * If refill is non-zero, update the position of the next token,
 * which is used for messages, starting from the current position.
 * Do not overrun the buffer b of length rlen.
 * When reaching the end, NUL-terminate the buffer, which requires
 * b[rlen] to be writable, and return the value of refill:
 * a non-zero result tells the caller that the token is incomplete
 * and that more input may follow.
 * Return 0 if a character from the charset was found.
 */
static int
advance(struct parse *p, char *b, size_t rlen, size_t *pend,
    const char *charset, int refill)
{
	int		 space;

	if (*charset == ' ') {
		space = 1;
		charset++;
	} else
		space = 0;

	if (refill) {
		p->nline = p->line;
		p->ncol = p->col;
	}
	while (*pend < rlen) {
		if (space && isspace((unsigned char)b[*pend]))
			break;

		/*
		 * strchr(3) also matches the NUL terminating the
		 * charset.  A NUL byte in the input must not count
		 * as a delimiter, or callers scanning for text would
		 * stop on it forever without making any progress.
		 */
		if (b[*pend] != '\0' && strchr(charset, b[*pend]) != NULL)
			break;
		increment(p, b, pend, refill);
	}
	if (*pend == rlen) {
		b[rlen] = '\0';
		return refill;
	} else
		return 0;
}

/*
 * Split rlen bytes of input in the buffer b into tokens and hand
 * them to the xml_*() handler functions.
 * The buffer is modified: tokens are NUL-terminated in place,
 * and b[rlen] must be writable.
 * The tokenizer state is read from and saved to *pstate, such that
 * parsing can resume with the next chunk of input.
 * If refill is non-zero, more input may follow: a token extending
 * to the end of the buffer is left unparsed, and line and column
 * numbers are tracked.  If refill is zero, all input is consumed.
 * Return the offset of the first byte that was not consumed.
 */
size_t
parse_string(struct parse *p, char *b, size_t rlen,
    enum pstate *pstate, int refill)
{
	char		*cp;
	size_t		 poff;  /* Parse offset in b[]. */
	size_t		 pend;  /* Offset of the end of the current word. */
	int		 elem_end;

	pend = 0;
	for (;;) {

		/* Proceed to the next token, skipping whitespace. */

		if (refill) {
			p->line = p->nline;
			p->col = p->ncol;
		}
		if ((poff = pend) == rlen)
			break;
		if (isspace((unsigned char)b[pend])) {
			p->flags |= PFLAG_SPC;
			increment(p, b, &pend, refill);
			continue;
		}

		/*
		 * The following four cases (ARG, TAG, and starting an
		 * entity or a tag) all parse a word or quoted string.
		 * If that extends beyond the read buffer and the last
		 * read(2) still got data, they all break out of the
		 * token loop to request more data from the read loop.
		 *
		 * Also, three of them detect self-closing tags, those
		 * ending with "/>", setting the flag elem_end and
		 * calling xml_elem_end() at the very end, after
		 * handling the attribute value, attribute name, or
		 * tag name, respectively.
		 */

		/* Parse an attribute value. */

		if (*pstate >= PARSE_ARG) {
			if (*pstate == PARSE_ARG &&
			    (b[pend] == '\'' || b[pend] == '"')) {
				*pstate = b[pend] == '"' ?
				    PARSE_DQ : PARSE_SQ;
				increment(p, b, &pend, refill);
				continue;
			}
			if (advance(p, b, rlen, &pend,
			    *pstate == PARSE_DQ ? "\"" :
			    *pstate == PARSE_SQ ? "'" : " >", refill))
				break;
			*pstate = PARSE_TAG;
			elem_end = 0;
			if (b[pend] == '>') {
				*pstate = PARSE_ELEM;
				if (pend > 0 && b[pend - 1] == '/') {
					b[pend - 1] = '\0';
					elem_end = 1;
				}
				if (p->flags & PFLAG_EEND)
					elem_end = 1;
			}
			b[pend] = '\0';
			if (pend < rlen)
				increment(p, b, &pend, refill);
			xml_attrval(p, b + poff);
			if (elem_end)
				xml_elem_end(p, NULL);

		/* Look for an attribute name. */

		} else if (*pstate == PARSE_TAG) {
			switch (p->ncur) {
			case NODE_DOCTYPE:
				if (b[pend] == '[') {
					*pstate = PARSE_ELEM;
					increment(p, b, &pend, refill);
					continue;
				}
				/* FALLTHROUGH */
			case NODE_ENTITY:
				if (b[pend] == '"' || b[pend] == '\'') {
					*pstate = PARSE_ARG;
					continue;
				}
				break;
			default:
				break;
			}
			if (advance(p, b, rlen, &pend, " =>", refill))
				break;
			elem_end = 0;
			switch (b[pend]) {
			case '>':
				*pstate = PARSE_ELEM;
				if (pend > 0 && b[pend - 1] == '/') {
					b[pend - 1] = '\0';
					elem_end = 1;
				}
				if (p->flags & PFLAG_EEND)
					elem_end = 1;
				break;
			case '=':
				*pstate = PARSE_ARG;
				break;
			default:
				break;
			}
			b[pend] = '\0';
			if (pend < rlen)
				increment(p, b, &pend, refill);
			xml_attrkey(p, b + poff);
			if (elem_end)
				xml_elem_end(p, NULL);

		/* Begin an opening or closing tag. */

		} else if (b[poff] == '<') {
			if (advance(p, b, rlen, &pend, " >", refill))
				break;
			if (pend > poff + 3 &&
			    strncmp(b + poff, "<!--", 4) == 0) {

				/*
				 * Skip a comment.
				 * Terminate the valid data first, or
				 * strstr(3) might run past the end of
				 * the buffer or find the end of a
				 * comment in stale bytes beyond rlen.
				 */

				b[rlen] = '\0';
				cp = strstr(b + pend - 2, "-->");
				if (cp == NULL) {
					if (refill)
						break;
					cp = b + rlen;
				} else
					cp += 3;
				while (b + pend < cp)
					increment(p, b, &pend, refill);
				continue;
			}
			elem_end = 0;
			if (b[pend] != '>')
				*pstate = PARSE_TAG;
			else if (pend > 0 && b[pend - 1] == '/') {
				b[pend - 1] = '\0';
				elem_end = 1;
			}
			b[pend] = '\0';
			if (pend < rlen)
				increment(p, b, &pend, refill);
			if (b[++poff] == '/') {
				elem_end = 1;
				poff++;
			} else {
				xml_elem_start(p, b + poff);
				if (*pstate == PARSE_ELEM &&
				    p->flags & PFLAG_EEND)
					elem_end = 1;
			}
			if (elem_end)
				xml_elem_end(p, b + poff);

		/* Close a doctype. */

		} else if (p->ncur == NODE_DOCTYPE && b[poff] == ']') {
			*pstate = PARSE_TAG;
			increment(p, b, &pend, refill);

		/* Process an entity. */

		} else if (b[poff] == '&') {
			if (advance(p, b, rlen, &pend, ";", refill))
				break;
			b[pend] = '\0';
			if (pend < rlen)
				increment(p, b, &pend, refill);
			xml_entity(p, b + poff + 1);

		/* Process text up to the next tag, entity, or EOL. */

		} else {
			advance(p, b, rlen, &pend, "<&", refill);
			xml_char(p, b + poff, pend - poff);
		}
	}

	/*
	 * When giving up on an incomplete token, rewind the position
	 * of the next token to the start of that token.  It will be
	 * scanned again from its beginning after the refill, and
	 * without rewinding, it would be counted twice.
	 * After consuming everything, this changes nothing.
	 */

	if (refill) {
		p->nline = p->line;
		p->ncol = p->col;
	}
	return poff;
}

/*
 * Parse the input read from the file descriptor fd into the tree
 * of the parse state p.
 * The name fname is only used in messages.
 * A read error is reported with perror(3) and flags the tree
 * as failed.
 * The DOCTYPE node, if any, is discarded at the end.
 * Return the parse tree, which remains owned by the parse state.
 */
struct ptree *
parse_file(struct parse *p, int fd, const char *fname)
{
	char		 b[4096];
	ssize_t		 rsz;	/* Return value from read(2). */
	size_t		 rlen;	/* Number of bytes in b[]. */
	size_t		 poff;  /* Parse offset in b[]. */
	enum pstate	 pstate;

	p->fname = fname;
	p->nline = 1;
	p->ncol = 1;
	pstate = PARSE_ELEM;
	rlen = 0;

	/*
	 * Read loop.
	 *
	 * If the previous token was incomplete and asked for more
	 * input, we have to enter the read loop once more even on EOF.
	 * Once rsz is 0, incomplete tokens will no longer ask
	 * for more input but instead use whatever there is,
	 * and then exit the read loop.
	 * The minus one on the size limit for read(2) is needed
	 * such that advance() can set b[rlen] to NUL when needed.
	 */

	while ((rsz = read(fd, b + rlen, sizeof(b) - rlen - 1)) >= 0 &&
	    (rlen += rsz) > 0) {
		poff = parse_string(p, b, rlen, &pstate, rsz > 0);

		/*
		 * Nothing was consumed if the buffer starts with an
		 * incomplete token.  After a short read, that is fine
		 * and simply requires reading more.  But if the buffer
		 * is full, the token will never fit, so parse what
		 * there is as if no more input would follow.
		 */

		if (poff == 0 && rlen == sizeof(b) - 1) {
			warn_msg(p, "splitting excessively long token");
			poff = parse_string(p, b, rlen, &pstate, 0);
		}

		/* Buffer exhausted; shift left and re-fill. */
		rlen -= poff;
		memmove(b, b + poff, rlen);
	}
	if (rsz < 0) {
		perror(fname);
		p->tree->flags |= TREE_FAIL;
	}
	pnode_closetext(p);
	if ((p->tree->flags & TREE_CLOSED) == 0)
		warn_msg(p, "document not closed");

	/* Do not leave a pointer to the discarded node behind. */
	pnode_unlink(p->doctype);
	p->doctype = NULL;
	return p->tree;
}