===================================================================
RCS file: /cvs/docbook2mdoc/parse.c,v
retrieving revision 1.2
retrieving revision 1.13
diff -u -p -r1.2 -r1.13
--- docbook2mdoc/parse.c	2019/03/26 20:54:43	1.2
+++ docbook2mdoc/parse.c	2019/04/03 17:53:02	1.13
@@ -1,4 +1,4 @@
-/* $Id: parse.c,v 1.2 2019/03/26 20:54:43 schwarze Exp $ */
+/* $Id: parse.c,v 1.13 2019/04/03 17:53:02 schwarze Exp $ */
 /*
  * Copyright (c) 2014 Kristaps Dzonsons <kristaps@bsd.lv>
  * Copyright (c) 2019 Ingo Schwarze <schwarze@openbsd.org>
@@ -17,8 +17,9 @@
  */
 #include <assert.h>
 #include <ctype.h>
-#include <expat.h>
+#include <stdarg.h>
 #include <stdio.h>
+#include <stdlib.h>
 #include <string.h>
 #include <unistd.h>
 
@@ -34,10 +35,16 @@
  * Keep this as simple and small as possible.
  */
 struct	parse {
-	XML_Parser	 xml;
 	const char	*fname;  /* Name of the input file. */
 	struct ptree	*tree;   /* Complete parse result. */
 	struct pnode	*cur;	 /* Current node in the tree. */
+	enum nodeid	 ncur;   /* Type of the element last started. */
+	int		 line;   /* Line number of the current token. */
+	int		 col;	 /* Column number of the current token. */
+	int		 nline;  /* Line number of next token. */
+	int		 ncol;   /* Column number of next token. */
+	int		 del;    /* Levels of nested nodes being deleted. */
+	int		 attr;   /* Nonzero if the last attribute key was known. */
 	int		 warn;
 };
 
@@ -47,9 +54,9 @@ struct	element {
 };
 
 static	const struct element elements[] = {
-	{ "acronym",		NODE_ACRONYM },
+	{ "acronym",		NODE_IGNORE },
 	{ "affiliation",	NODE_AFFILIATION },
-	{ "anchor",		NODE_ANCHOR },
+	{ "anchor",		NODE_DELETE },
 	{ "application",	NODE_APPLICATION },
 	{ "arg",		NODE_ARG },
 	{ "author",		NODE_AUTHOR },
@@ -62,10 +69,11 @@ static	const struct element elements[] = {
 	{ "citerefentry",	NODE_CITEREFENTRY },
 	{ "citetitle",		NODE_CITETITLE },
 	{ "cmdsynopsis",	NODE_CMDSYNOPSIS },
-	{ "code",		NODE_CODE },
+	{ "code",		NODE_LITERAL },
 	{ "colspec",		NODE_COLSPEC },
 	{ "command",		NODE_COMMAND },
 	{ "constant",		NODE_CONSTANT },
+	{ "contrib",		NODE_CONTRIB },
 	{ "copyright",		NODE_COPYRIGHT },
 	{ "date",		NODE_DATE },
 	{ "editor",		NODE_EDITOR },
@@ -73,9 +81,10 @@ static	const struct element elements[] = {
 	{ "emphasis",		NODE_EMPHASIS },
 	{ "entry",		NODE_ENTRY },
 	{ "envar",		NODE_ENVAR },
+	{ "errorname",		NODE_ERRORNAME },
 	{ "fieldsynopsis",	NODE_FIELDSYNOPSIS },
 	{ "filename",		NODE_FILENAME },
-	{ "firstname",		NODE_FIRSTNAME },
+	{ "firstname",		NODE_PERSONNAME },
 	{ "firstterm",		NODE_FIRSTTERM },
 	{ "footnote",		NODE_FOOTNOTE },
 	{ "funcdef",		NODE_FUNCDEF },
@@ -87,10 +96,10 @@ static	const struct element elements[] = {
 	{ "group",		NODE_GROUP },
 	{ "holder",		NODE_HOLDER },
 	{ "index",		NODE_INDEX },
-	{ "indexterm",		NODE_INDEXTERM },
+	{ "indexterm",		NODE_DELETE },
 	{ "info",		NODE_INFO },
 	{ "informalequation",	NODE_INFORMALEQUATION },
-	{ "informaltable",	NODE_INFORMALTABLE },
+	{ "informaltable",	NODE_TABLE },
 	{ "inlineequation",	NODE_INLINEEQUATION },
 	{ "itemizedlist",	NODE_ITEMIZEDLIST },
 	{ "keysym",		NODE_KEYSYM },
@@ -115,15 +124,15 @@ static	const struct element elements[] = {
 	{ "option",		NODE_OPTION },
 	{ "orderedlist",	NODE_ORDEREDLIST },
 	{ "orgname",		NODE_ORGNAME },
-	{ "othername",		NODE_OTHERNAME },
+	{ "othername",		NODE_PERSONNAME },
 	{ "para",		NODE_PARA },
 	{ "paramdef",		NODE_PARAMDEF },
 	{ "parameter",		NODE_PARAMETER },
 	{ "part",		NODE_SECTION },
 	{ "personname",		NODE_PERSONNAME },
-	{ "phrase",		NODE_PHRASE },
+	{ "phrase",		NODE_IGNORE },
 	{ "preface",		NODE_PREFACE },
-	{ "primary",		NODE_PRIMARY },
+	{ "primary",		NODE_DELETE },
 	{ "programlisting",	NODE_PROGRAMLISTING },
 	{ "prompt",		NODE_PROMPT },
 	{ "quote",		NODE_QUOTE },
@@ -148,16 +157,18 @@ static	const struct element elements[] = {
 	{ "row",		NODE_ROW },
 	{ "sbr",		NODE_SBR },
 	{ "screen",		NODE_SCREEN },
-	{ "secondary",		NODE_SECONDARY },
+	{ "secondary",		NODE_DELETE },
 	{ "sect1",		NODE_SECTION },
 	{ "sect2",		NODE_SECTION },
 	{ "section",		NODE_SECTION },
 	{ "sgmltag",		NODE_SGMLTAG },
 	{ "simplelist",		NODE_SIMPLELIST },
 	{ "spanspec",		NODE_SPANSPEC },
-	{ "structname",		NODE_STRUCTNAME },
+	{ "structfield",	NODE_PARAMETER },
+	{ "structname",		NODE_TYPE },
 	{ "subtitle",		NODE_SUBTITLE },
-	{ "surname",		NODE_SURNAME },
+	{ "surname",		NODE_PERSONNAME },
+	{ "symbol",		NODE_CONSTANT },
 	{ "synopsis",		NODE_SYNOPSIS },
 	{ "table",		NODE_TABLE },
 	{ "tbody",		NODE_TBODY },
@@ -167,49 +178,123 @@ static	const struct element elements[] = {
 	{ "thead",		NODE_THEAD },
 	{ "tip",		NODE_TIP },
 	{ "title",		NODE_TITLE },
-	{ "trademark",		NODE_TRADEMARK },
+	{ "trademark",		NODE_IGNORE },
 	{ "type",		NODE_TYPE },
 	{ "ulink",		NODE_ULINK },
-	{ "userinput",		NODE_USERINPUT },
+	{ "userinput",		NODE_LITERAL },
 	{ "variablelist",	NODE_VARIABLELIST },
 	{ "varlistentry",	NODE_VARLISTENTRY },
 	{ "varname",		NODE_VARNAME },
 	{ "warning",		NODE_WARNING },
 	{ "wordasword",		NODE_WORDASWORD },
-	{ "xi:include",		NODE_WARN },
+	{ "xi:include",		NODE_DELETE_WARN },
 	{ "year",		NODE_YEAR },
-	{ NULL,			NODE__MAX }
+	{ NULL,			NODE_IGNORE }
 };
 
+struct	entity {
+	const char	*name;	/* Entity name, without "&" and ";". */
+	const char	*roff;	/* Replacement text in roff syntax. */
+};
+
 /*
+ * XML character entity references found in the wild.
+ * Those that lack an exact mandoc_char(7) representation are
+ * approximated, and the intended codepoint is given in a comment.
+ * Encoding them as \[uXXXX] would leave -Tascii out in the cold.
+ */
+static	const struct entity entities[] = {
+	{ "alpha",	"\\(*a" },
+	{ "amp",	"&" },
+	{ "apos",	"'" },
+	{ "auml",	"\\(:a" },
+	{ "beta",	"\\(*b" },
+	{ "circ",	"^" },      /* U+02C6 */
+	{ "copy",	"\\(co" },
+	{ "dagger",	"\\(dg" },
+	{ "Delta",	"\\(*D" },
+	{ "eacute",	"\\('e" },
+	{ "emsp",	"\\ " },    /* U+2003 */
+	{ "gt",		">" },
+	{ "hairsp",	"\\^" },
+	{ "kappa",	"\\(*k" },
+	{ "larr",	"\\(<-" },
+	{ "ldquo",	"\\(lq" },
+	{ "le",		"\\(<=" },
+	{ "lowbar",	"_" },
+	{ "lsqb",	"[" },
+	{ "lt",		"<" },
+	{ "mdash",	"\\(em" },
+	{ "minus",	"\\-" },
+	{ "ndash",	"\\(en" },
+	{ "nbsp",	"\\ " },
+	{ "num",	"#" },
+	{ "oslash",	"\\(/o" },
+	{ "ouml",	"\\(:o" },
+	{ "percnt",	"%" },
+	{ "quot",	"\\(dq" },
+	{ "rarr",	"\\(->" },
+	{ "rArr",	"\\(rA" },
+	{ "rdquo",	"\\(rq" },
+	{ "reg",	"\\(rg" },
+	{ "rho",	"\\(*r" },
+	{ "rsqb",	"]" },
+	{ "sigma",	"\\(*s" },
+	{ "shy",	"\\&" },     /* U+00AD */
+	{ "tau",	"\\(*t" },
+	{ "tilde",	"\\[u02DC]" },	/* U+02DC, emitted as a Unicode escape */
+	{ "times",	"\\[tmu]" },
+	{ "uuml",	"\\(:u" },
+	{ NULL,		NULL }	/* End marker for the lookup loop. */
+};
+
+static void
+error_msg(struct parse *p, const char *fmt, ...)
+{
+	va_list		 ap;
+	/* Print "fname:line:col: message" to stderr and set TREE_FAIL. */
+	fprintf(stderr, "%s:%d:%d: ", p->fname, p->line, p->col);
+	va_start(ap, fmt);
+	vfprintf(stderr, fmt, ap);	/* fmt is a printf(3) format string. */
+	va_end(ap);
+	fputc('\n', stderr);
+	p->tree->flags |= TREE_FAIL;	/* Returns normally afterwards. */
+}
+
+static void
+warn_msg(struct parse *p, const char *fmt, ...)
+{
+	va_list		 ap;
+	/* Print "fname:line:col: warning: message"; the tree flags stay as is. */
+	if (p->warn == 0)	/* Warnings were not requested. */
+		return;
+
+	fprintf(stderr, "%s:%d:%d: warning: ", p->fname, p->line, p->col);
+	va_start(ap, fmt);
+	vfprintf(stderr, fmt, ap);	/* fmt is a printf(3) format string. */
+	va_end(ap);
+	fputc('\n', stderr);
+}
+
+/*
  * Process a string of characters.
  * If a text node is already open, append to it.
  * Otherwise, create a new one as a child of the current node.
  */
 static void
-xml_char(void *arg, const XML_Char *p, int sz)
+xml_char(struct parse *ps, const char *p, int sz)
 {
-	struct parse	*ps;
 	struct pnode	*dat;
-	int		 i;
 
-	ps = arg;
-	if (ps->tree->flags && TREE_FAIL)
+	if (ps->del > 0)
 		return;
 
-	/*
-	 * Only create a new node if there is non-whitespace text.
-	 * Strip all leading whitespace.
-	 */
-	if (ps->cur->node != NODE_TEXT) {
-		for (i = 0; i < sz; i++)
-			if (isspace((unsigned char)p[i]) == 0)
-				break;
-		if (i == sz)
-			return;
-		p += i;
-		sz -= i;
+	if (ps->cur == NULL) {
+		error_msg(ps, "discarding text before document: %.*s", sz, p);
+		return;
+	}
 
+	if (ps->cur->node != NODE_TEXT) {
 		if ((dat = calloc(1, sizeof(*dat))) == NULL) {
 			perror(NULL);
 			exit(1);
@@ -222,6 +307,10 @@ xml_char(void *arg, const XML_Char *p, int sz)
 		ps->cur = dat;
 	}
 
+	if (ps->tree->flags & TREE_CLOSED &&
+	    ps->cur->parent == ps->tree->root)
+		warn_msg(ps, "text after end of document: %.*s", sz, p);
+
 	/* Append to the current text node. */
 
 	assert(sz >= 0);
@@ -245,24 +334,73 @@ pnode_trim(struct pnode *pn)
 			break;
 }
 
+static void
+xml_entity(struct parse *p, const char *name)
+{
+	const struct entity	*entity;
+	struct pnode		*dat;
+	/* Append the roff form of &name; as a NODE_ESCAPE child of p->cur. */
+	if (p->del > 0)	/* Inside a deleted element: discard. */
+		return;
+
+	if (p->cur == NULL) {
+		error_msg(p, "discarding entity before document: &%s;", name);
+		return;
+	}
+
+	/* Close out the text node, if there is one. */
+	if (p->cur->node == NODE_TEXT) {
+		pnode_trim(p->cur);
+		p->cur = p->cur->parent;
+	}
+
+	if (p->tree->flags & TREE_CLOSED && p->cur == p->tree->root)
+		warn_msg(p, "entity after end of document: &%s;", name);
+
+	for (entity = entities; entity->name != NULL; entity++)
+		if (strcmp(name, entity->name) == 0)
+			break;
+	/* Only the end marker of entities[] has roff == NULL. */
+	if (entity->roff == NULL) {
+		error_msg(p, "unknown entity &%s;", name);
+		return;
+	}
+
+	/* Create, append, and close out an entity node. */
+	if ((dat = calloc(1, sizeof(*dat))) == NULL ||
+	    (dat->b = dat->real = strdup(entity->roff)) == NULL) {
+		perror(NULL);
+		exit(1);
+	}
+	dat->node = NODE_ESCAPE;
+	dat->bsz = strlen(dat->b);
+	dat->parent = p->cur;
+	TAILQ_INIT(&dat->childq);
+	TAILQ_INIT(&dat->attrq);
+	TAILQ_INSERT_TAIL(&p->cur->childq, dat, child);
+}
+
 /*
  * Begin an element.
- * If the name is unknown, abort parsing.
  */
 static void
-xml_elem_start(void *arg, const XML_Char *name, const XML_Char **atts)
+xml_elem_start(struct parse *ps, const char *name)
 {
-	struct parse	 *ps;
-	const struct element *elem;
-	enum attrkey	  key;
-	struct pnode	 *dat;
-	struct pattr	 *pattr;
-	const XML_Char	**att;
+	const struct element	*elem;
+	struct pnode		*dat;
 
-	ps = arg;
-	if (ps->tree->flags && TREE_FAIL)
+	if (*name == '!' || *name == '?')
 		return;
 
+	/*
+	 * An ancestor is excluded from the tree;
+	 * keep track of the number of levels excluded.
+	 */
+	if (ps->del > 0) {
+		ps->del++;
+		return;
+	}
+
 	/* Close out the text node, if there is one. */
 	if (ps->cur != NULL && ps->cur->node == NODE_TEXT) {
 		pnode_trim(ps->cur);
@@ -273,22 +411,18 @@ xml_elem_start(void *arg, const XML_Char *name, const 
 		if (strcmp(elem->name, name) == 0)
 			break;
 
-	if (elem->name == NULL) {
-		fprintf(stderr, "%s:%zu:%zu: unknown element \"%s\"\n",
-			ps->fname, XML_GetCurrentLineNumber(ps->xml),
-			XML_GetCurrentColumnNumber(ps->xml), name);
-		ps->tree->flags |= TREE_FAIL;
-		return;
-	}
+	if (elem->name == NULL)
+		error_msg(ps, "unknown element <%s>", name);
 
-	switch (elem->node) {
-	case NODE_WARN:
-		if (ps->warn)
-			fprintf(stderr, "%s:%zu:%zu: warning: "
-			    "ignoring element <%s>\n", ps->fname,
-			    XML_GetCurrentLineNumber(ps->xml),
-			    XML_GetCurrentColumnNumber(ps->xml), name);
+	ps->ncur = elem->node;
+
+	switch (ps->ncur) {
+	case NODE_DELETE_WARN:
+		warn_msg(ps, "skipping element <%s>", name);
 		/* FALLTHROUGH */
+	case NODE_DELETE:
+		ps->del = 1;
+		/* FALLTHROUGH */
 	case NODE_IGNORE:
 		return;
 	case NODE_INLINEEQUATION:
@@ -298,6 +432,9 @@ xml_elem_start(void *arg, const XML_Char *name, const 
 		break;
 	}
 
+	if (ps->tree->flags & TREE_CLOSED && ps->cur->parent == NULL)
+		warn_msg(ps, "element after end of document: <%s>", name);
+
 	if ((dat = calloc(1, sizeof(*dat))) == NULL) {
 		perror(NULL);
 		exit(1);
@@ -313,61 +450,108 @@ xml_elem_start(void *arg, const XML_Char *name, const 
 	ps->cur = dat;
 	if (ps->tree->root == NULL)
 		ps->tree->root = dat;
+}
 
-	/*
-	 * Process attributes.
-	 */
-	for (att = atts; *att != NULL; att += 2) {
-		if ((key = attrkey_parse(*att)) == ATTRKEY__MAX) {
-			if (ps->warn)
-				fprintf(stderr, "%s:%zu:%zu: warning: "
-				    "unknown attribute \"%s\"\n",
-				    ps->fname,
-				    XML_GetCurrentLineNumber(ps->xml),
-				    XML_GetCurrentColumnNumber(ps->xml),
-				    *att);
-			continue;
-		}
-		pattr = calloc(1, sizeof(*pattr));
-		pattr->key = key;
-		if ((pattr->val = attrval_parse(att[1])) == ATTRVAL__MAX)
-			pattr->rawval = strdup(att[1]);
-		TAILQ_INSERT_TAIL(&dat->attrq, pattr, child);
+static void
+xml_attrkey(struct parse *ps, const char *name)
+{
+	struct pattr	*attr;	/* Appended to the attrq of the open node. */
+	enum attrkey	 key;	/* ATTRKEY__MAX if name is not a known key. */
+	if (ps->del > 0 || *name == '\0')	/* Deleted, or no name here. */
+		return;
+	if (ps->cur == NULL || ps->ncur == NODE_IGNORE ||
+	    (key = attrkey_parse(name)) == ATTRKEY__MAX) {
+		ps->attr = 0;	/* Make xml_attrval() drop the value. */
+		return;	/* No open node, ignored element, or unknown key. */
+	}
+	if ((attr = calloc(1, sizeof(*attr))) == NULL) {
+		perror(NULL);
+		exit(1);
+	}
+	attr->key = key;
+	attr->val = ATTRVAL__MAX;	/* Until xml_attrval() sets it. */
+	attr->rawval = NULL;
+	TAILQ_INSERT_TAIL(&ps->cur->attrq, attr, child);
+	ps->attr = 1;
+}
 
+static void
+xml_attrval(struct parse *ps, const char *name)
+{
+	struct pattr	*attr;
+	/* Set the value, passed in "name", of the attribute added last. */
+	if (ps->del > 0 || ps->attr == 0)
+		return;	/* Deleted subtree, or the key was not accepted. */
+	if ((attr = TAILQ_LAST(&ps->cur->attrq, pattrq)) == NULL)
+		return;	/* The current node has no attributes. */
+	if ((attr->val = attrval_parse(name)) == ATTRVAL__MAX &&
+	    (attr->rawval = strdup(name)) == NULL) {
+		perror(NULL);	/* strdup(3) of an unmapped value failed. */
+		exit(1);
+	}
+}
+
 /*
  * Roll up the parse tree.
  * If we're at a text node, roll that one up first.
  */
 static void
-xml_elem_end(void *arg, const XML_Char *name)
+xml_elem_end(struct parse *ps, const char *name)
 {
-	struct parse	*ps;
-	const struct element *elem;
+	const struct element	*elem;
+	enum nodeid		 node;	/* Type of the element being closed. */
 
-	ps = arg;
-	if (ps->tree->flags && TREE_FAIL)
+	/*
+	 * An ancestor is excluded from the tree;
+	 * keep track of the number of levels excluded.
+	 */
+	if (ps->del > 1) {
+		ps->del--;
 		return;
+	}
 
 	/* Close out the text node, if there is one. */
-	if (ps->cur->node == NODE_TEXT) {
+	if (ps->del == 0 && ps->cur != NULL && ps->cur->node == NODE_TEXT) {
 		pnode_trim(ps->cur);
 		ps->cur = ps->cur->parent;
 	}
 
-	for (elem = elements; elem->name != NULL; elem++)
-		if (strcmp(elem->name, name) == 0)
-			break;
+	if (name != NULL) {
+		for (elem = elements; elem->name != NULL; elem++)
+			if (strcmp(elem->name, name) == 0)
+				break;
+		node = elem->node;	/* NODE_IGNORE if name is unknown. */
+	} else
+		node = ps->ncur;	/* Self-closing tag: use the last start. */
 
-	switch (elem->node) {
+	switch (node) {
+	case NODE_DELETE_WARN:
+	case NODE_DELETE:
+		if (ps->del > 0)
+			ps->del--;
+		break;
 	case NODE_IGNORE:
-	case NODE_WARN:
 		break;
 	default:
-		assert(elem->node == ps->cur->node);
-		ps->cur = ps->cur->parent;
+		if (ps->cur == NULL || node != ps->cur->node) {
+			warn_msg(ps, "element not open: </%s>",
+			    name != NULL ? name : "");	/* %s must not get NULL. */
+			break;
+		}
+		/*
+		 * Never pop the document element: content that still
+		 * follows is better processed than discarded.
+		 */
+		if (ps->cur->parent == NULL)
+			ps->tree->flags |= TREE_CLOSED;
+		else
+			ps->cur = ps->cur->parent;
+		break;
 	}
+	if (ps->del > 0) {	/* End tag of a kept element while deleting. */
+		warn_msg(ps, "skipped element not closed");
+		ps->del = 0;	/* Recover rather than abort on bad input. */
+	}
 }
 
 struct parse *
@@ -382,16 +566,7 @@ parse_alloc(int warn)
 		free(p);
 		return NULL;
 	}
-
-	if ((p->xml = XML_ParserCreate(NULL)) == NULL) {
-		free(p->tree);
-		free(p);
-		return NULL;
-	}
 	p->warn = warn;
-	XML_SetCharacterDataHandler(p->xml, xml_char);
-	XML_SetElementHandler(p->xml, xml_elem_start, xml_elem_end);
-	XML_SetUserData(p->xml, p);
 	return p;
 }
 
@@ -400,7 +575,6 @@ parse_free(struct parse *p)
 {
 	if (p == NULL)
 		return;
-	XML_ParserFree(p->xml);
 	if (p->tree != NULL) {
 		pnode_unlink(p->tree->root);
 		free(p->tree);
@@ -408,28 +582,255 @@ parse_free(struct parse *p)
 	free(p);
 }
 
+/*
+ * Advance the pend pointer to the next character in the charset.
+ * If the charset starts with a space, it stands for any whitespace.
+ * NUL bytes in the input never match.  Update the position of the
+ * next token, used for messages.  Do not overrun the buffer b of
+ * length rlen.  When reaching the end, NUL-terminate the buffer
+ * and return 1; otherwise, return 0.
+ */
+static int
+advance(struct parse *p, char *b, size_t rlen, size_t *pend,
+    const char *charset)
+{
+	int		 space;	/* Nonzero if any whitespace ends the word. */
+
+	if (*charset == ' ') {
+		space = 1;
+		charset++;
+	} else
+		space = 0;
+
+	p->nline = p->line;
+	p->ncol = p->col;
+	while (*pend < rlen) {
+		if (b[*pend] == '\n') {
+			p->nline++;
+			p->ncol = 1;
+		} else
+			p->ncol++;	/* The delimiter is counted, too. */
+		if (space && isspace((unsigned char)b[*pend]))
+			break;
+		if (b[*pend] != '\0' && strchr(charset, b[*pend]) != NULL)
+			break;	/* strchr(3) alone would also match NUL. */
+		++*pend;
+	}
+	if (*pend == rlen) {
+		b[rlen] = '\0';
+		return 1;
+	} else
+		return 0;
+}
+
 struct ptree *
 parse_file(struct parse *p, int fd, const char *fname)
 {
 	char		 b[4096];
-	ssize_t		 ssz;
+	char		*cp;	/* End of the comment being skipped. */
+	ssize_t		 rsz;	/* Return value from read(2). */
+	size_t		 rlen;  /* Number of bytes in b[]. */
+	size_t		 poff;  /* Parse offset in b[]. */
+	size_t		 pend;  /* Offset of the end of the current word. */
+	int		 in_tag, in_arg, in_quotes, elem_end;
 
 	p->fname = fname;
-	do {
-		if ((ssz = read(fd, b, sizeof(b))) < 0) {
-			perror(fname);
-			pnode_unlink(p->tree->root);
-			p->tree->root = p->cur = NULL;
-			p->tree->flags |= TREE_FAIL;
-			return NULL;
+	p->nline = 1;
+	p->ncol = 1;
+	rlen = 0;
+	in_tag = in_arg = in_quotes = 0;
+
+	/*
+	 * Read loop.
+	 *
+	 * We have to enter the read loop once more even on EOF
+	 * because the previous token may have been incomplete,
+	 * such that it asked for more input.
+	 * Once rsz is 0, incomplete tokens will no longer ask
+	 * for more input but instead use whatever there is,
+	 * and then exit the read loop.
+	 * The minus one on the size limit for read(2) is needed
+	 * such that b[rlen] can always be set to NUL.
+	 */
+
+	while ((rsz = read(fd, b + rlen, sizeof(b) - rlen - 1)) >= 0) {
+		if ((rlen += rsz) == 0)
+			break;
+		b[rlen] = '\0';	/* strstr(3) below needs a terminated buffer. */
+		/* Token loop. */
+
+		pend = 0;
+		for (;;) {
+
+			/* Proceed to the next token, skipping whitespace. */
+
+			p->line = p->nline;
+			p->col = p->ncol;
+			if ((poff = pend) == rlen)
+				break;
+			if (isspace((unsigned char)b[pend])) {
+				if (b[pend++] == '\n') {
+					p->nline++;
+					p->ncol = 1;
+				} else
+					p->ncol++;
+				continue;
+			}
+
+			/*
+			 * The following four cases (in_arg, in_tag, and
+			 * starting an entity or a tag) all parse a word
+			 * or quoted string.  If that extends beyond the
+			 * read buffer and the last read(2) still got
+			 * data, they all break out of the token loop
+			 * to request more data from the read loop.
+			 *
+			 * Also, three of them detect self-closing tags,
+			 * those ending with "/>", setting the flag
+			 * elem_end and calling xml_elem_end() at the
+			 * very end, after handling the attribute value,
+			 * attribute name, or tag name, respectively.
+			 */
+
+			/* Parse an attribute value. */
+
+			if (in_arg) {
+				if (in_quotes == 0 &&
+				    (b[pend] == '\'' || b[pend] == '"')) {
+					in_quotes = b[pend] == '"' ? 2 : 1;
+					p->ncol++;
+					pend++;
+					continue;
+				}
+				if (advance(p, b, rlen, &pend,
+				    in_quotes == 2 ? "\"" :
+				    in_quotes == 1 ? "'" : " >") && rsz > 0)
+					break;
+				in_arg = in_quotes = elem_end = 0;
+				if (b[pend] == '>') {
+					in_tag = 0;
+					if (pend > 0 && b[pend - 1] == '/') {
+						b[pend - 1] = '\0';
+						elem_end = 1;
+					}
+				}
+				b[pend] = '\0';
+				if (pend < rlen)
+					pend++;
+				xml_attrval(p, b + poff);
+				if (elem_end)
+					xml_elem_end(p, NULL);
+
+			/* Look for an attribute name. */
+
+			} else if (in_tag) {
+				if (advance(p, b, rlen, &pend, " =>") &&
+				    rsz > 0)
+					break;
+				elem_end = 0;
+				switch (b[pend]) {
+				case '>':
+					in_tag = 0;
+					if (pend > 0 && b[pend - 1] == '/') {
+						b[pend - 1] = '\0';
+						elem_end = 1;
+					}
+					break;
+				case '=':
+					in_arg = 1;
+					break;
+				default:
+					break;
+				}
+				b[pend] = '\0';
+				if (pend < rlen)
+					pend++;
+				xml_attrkey(p, b + poff);
+				if (elem_end)
+					xml_elem_end(p, NULL);
+
+			/* Begin an opening or closing tag. */
+
+			} else if (b[poff] == '<') {
+				if (advance(p, b, rlen, &pend, " >") &&
+				    rsz > 0)
+					break;
+				if (pend > poff + 3 &&
+				    strncmp(b + poff, "<!--", 4) == 0) {
+
+					/* Skip a comment. */
+
+					cp = strstr(b + pend - 2, "-->");
+					if (cp == NULL) {
+						if (rsz > 0) {
+							pend = rlen;
+							break;
+						}
+						cp = b + rlen;
+					} else
+						cp += 3;
+					while (b + pend < cp) {
+						if (b[++pend] == '\n') {
+							p->nline++;
+							p->ncol = 1;
+						} else
+							p->ncol++;
+					}
+					continue;
+				}
+				elem_end = 0;
+				if (b[pend] != '>')
+					in_tag = 1;
+				else if (pend > 0 && b[pend - 1] == '/') {
+					b[pend - 1] = '\0';
+					elem_end = 1;
+				}
+				b[pend] = '\0';
+				if (pend < rlen)
+					pend++;
+				if (b[++poff] == '/') {
+					elem_end = 1;
+					poff++;
+				} else
+					xml_elem_start(p, b + poff);
+				if (elem_end)
+					xml_elem_end(p, b + poff);
+
+			/* Process an entity. */
+
+			} else if (b[poff] == '&') {
+				if (advance(p, b, rlen, &pend, ";") &&
+				    rsz > 0)
+					break;
+				b[pend] = '\0';
+				if (pend < rlen)
+					pend++;
+				xml_entity(p, b + poff + 1);
+
+			/* Process text up to the next tag or entity. */
+
+			} else {
+				if (advance(p, b, rlen, &pend, "<&") == 0)
+					p->ncol--;
+				xml_char(p, b + poff, pend - poff);
+			}
 		}
-		if (XML_Parse(p->xml, b, ssz, ssz == 0) == 0) {
-			fprintf(stderr, "%s:%zu:%zu: %s\n", fname,
-			    XML_GetCurrentLineNumber(p->xml),
-			    XML_GetCurrentColumnNumber(p->xml),
-			    XML_ErrorString(XML_GetErrorCode(p->xml)));
-			p->tree->flags |= TREE_FAIL;
-		}
-	} while (ssz > 0 && (p->tree->flags & TREE_FAIL) == 0);
+		/* Buffer exhausted; rewind to the token at poff and re-fill. */
+		p->nline = p->line;	/* Scanning resumes at poff, so the */
+		p->ncol = p->col;	/* position must be that of poff, too. */
+		assert(poff > 0 || rlen < sizeof(b) - 1); /* Token fills b[]. */
+		memmove(b, b + poff, rlen - poff);
+		rlen -= poff;
+	}
+	if (rsz < 0) {
+		perror(fname);
+		p->tree->flags |= TREE_FAIL;	/* The tree is returned anyway. */
+	}
+	if (p->cur != NULL && p->cur->node == NODE_TEXT) {
+		pnode_trim(p->cur);
+		p->cur = p->cur->parent;
+	}
+	if ((p->tree->flags & TREE_CLOSED) == 0)
+		warn_msg(p, "document not closed");
 	return p->tree;
 }