===================================================================
RCS file: /cvs/docbook2mdoc/parse.c,v
retrieving revision 1.10
retrieving revision 1.22
diff -u -p -r1.10 -r1.22
--- docbook2mdoc/parse.c	2019/04/03 11:23:48	1.10
+++ docbook2mdoc/parse.c	2019/04/07 19:33:27	1.22
@@ -1,4 +1,4 @@
-/* $Id: parse.c,v 1.10 2019/04/03 11:23:48 schwarze Exp $ */
+/* $Id: parse.c,v 1.22 2019/04/07 19:33:27 schwarze Exp $ */
 /*
  * Copyright (c) 2014 Kristaps Dzonsons <kristaps@bsd.lv>
  * Copyright (c) 2019 Ingo Schwarze <schwarze@openbsd.org>
@@ -30,6 +30,14 @@
  * The implementation of the DocBook parser.
  */
 
+enum	pstate {
+	PARSE_ELEM,
+	PARSE_TAG,
+	PARSE_ARG,
+	PARSE_SQ,
+	PARSE_DQ
+};
+
 /*
  * Global parse state.
  * Keep this as simple and small as possible.
@@ -44,6 +52,7 @@ struct	parse {
 	int		 nline;  /* Line number of next token. */
 	int		 ncol;   /* Column number of next token. */
 	int		 del;    /* Levels of nested nodes being deleted. */
+	int		 spc;	 /* Whitespace before the next element. */
 	int		 attr;   /* The most recent attribute is valid. */
 	int		 warn;
 };
@@ -57,19 +66,21 @@ static	const struct element elements[] = {
 	{ "acronym",		NODE_IGNORE },
 	{ "affiliation",	NODE_AFFILIATION },
 	{ "anchor",		NODE_DELETE },
+	{ "appendix",		NODE_APPENDIX },
 	{ "application",	NODE_APPLICATION },
 	{ "arg",		NODE_ARG },
+	{ "article",		NODE_SECTION },
 	{ "author",		NODE_AUTHOR },
 	{ "authorgroup",	NODE_AUTHORGROUP },
 	{ "blockquote",		NODE_BLOCKQUOTE },
-	{ "book",		NODE_BOOK },
+	{ "book",		NODE_SECTION },
 	{ "bookinfo",		NODE_BOOKINFO },
 	{ "caution",		NODE_CAUTION },
 	{ "chapter",		NODE_SECTION },
 	{ "citerefentry",	NODE_CITEREFENTRY },
 	{ "citetitle",		NODE_CITETITLE },
 	{ "cmdsynopsis",	NODE_CMDSYNOPSIS },
-	{ "code",		NODE_CODE },
+	{ "code",		NODE_LITERAL },
 	{ "colspec",		NODE_COLSPEC },
 	{ "command",		NODE_COMMAND },
 	{ "constant",		NODE_CONSTANT },
@@ -81,6 +92,7 @@ static	const struct element elements[] = {
 	{ "emphasis",		NODE_EMPHASIS },
 	{ "entry",		NODE_ENTRY },
 	{ "envar",		NODE_ENVAR },
+	{ "errorname",		NODE_ERRORNAME },
 	{ "fieldsynopsis",	NODE_FIELDSYNOPSIS },
 	{ "filename",		NODE_FILENAME },
 	{ "firstname",		NODE_PERSONNAME },
@@ -91,6 +103,11 @@ static	const struct element elements[] = {
 	{ "funcsynopsis",	NODE_FUNCSYNOPSIS },
 	{ "funcsynopsisinfo",	NODE_FUNCSYNOPSISINFO },
 	{ "function",		NODE_FUNCTION },
+	{ "glossary",		NODE_VARIABLELIST },
+	{ "glossdef",		NODE_IGNORE },
+	{ "glossdiv",		NODE_IGNORE },
+	{ "glossentry",		NODE_VARLISTENTRY },
+	{ "glosslist",		NODE_VARIABLELIST },
 	{ "glossterm",		NODE_GLOSSTERM },
 	{ "group",		NODE_GROUP },
 	{ "holder",		NODE_HOLDER },
@@ -98,7 +115,7 @@ static	const struct element elements[] = {
 	{ "indexterm",		NODE_DELETE },
 	{ "info",		NODE_INFO },
 	{ "informalequation",	NODE_INFORMALEQUATION },
-	{ "informaltable",	NODE_INFORMALTABLE },
+	{ "informaltable",	NODE_TABLE },
 	{ "inlineequation",	NODE_INLINEEQUATION },
 	{ "itemizedlist",	NODE_ITEMIZEDLIST },
 	{ "keysym",		NODE_KEYSYM },
@@ -161,11 +178,14 @@ static	const struct element elements[] = {
 	{ "sect2",		NODE_SECTION },
 	{ "section",		NODE_SECTION },
 	{ "sgmltag",		NODE_SGMLTAG },
+	{ "simpara",		NODE_PARA },
 	{ "simplelist",		NODE_SIMPLELIST },
 	{ "spanspec",		NODE_SPANSPEC },
-	{ "structname",		NODE_STRUCTNAME },
+	{ "structfield",	NODE_PARAMETER },
+	{ "structname",		NODE_TYPE },
 	{ "subtitle",		NODE_SUBTITLE },
 	{ "surname",		NODE_PERSONNAME },
+	{ "symbol",		NODE_CONSTANT },
 	{ "synopsis",		NODE_SYNOPSIS },
 	{ "table",		NODE_TABLE },
 	{ "tbody",		NODE_TBODY },
@@ -177,8 +197,8 @@ static	const struct element elements[] = {
 	{ "title",		NODE_TITLE },
 	{ "trademark",		NODE_IGNORE },
 	{ "type",		NODE_TYPE },
-	{ "ulink",		NODE_ULINK },
-	{ "userinput",		NODE_USERINPUT },
+	{ "ulink",		NODE_LINK },
+	{ "userinput",		NODE_LITERAL },
 	{ "variablelist",	NODE_VARIABLELIST },
 	{ "varlistentry",	NODE_VARLISTENTRY },
 	{ "varname",		NODE_VARNAME },
@@ -282,6 +302,7 @@ static void
 xml_char(struct parse *ps, const char *p, int sz)
 {
 	struct pnode	*dat;
+	size_t		 newsz;
 
 	if (ps->del > 0)
 		return;
@@ -297,6 +318,7 @@ xml_char(struct parse *ps, const char *p, int sz)
 			exit(1);
 		}
 		dat->node = NODE_TEXT;
+		dat->spc = ps->spc;
 		dat->parent = ps->cur;
 		TAILQ_INIT(&dat->childq);
 		TAILQ_INIT(&dat->attrq);
@@ -311,24 +333,35 @@ xml_char(struct parse *ps, const char *p, int sz)
 	/* Append to the current text node. */
 
 	assert(sz >= 0);
-	ps->cur->b = realloc(ps->cur->b, ps->cur->bsz + sz + 1);
+	newsz = ps->cur->bsz + (ps->cur->bsz && ps->spc) + sz;
+	ps->cur->b = realloc(ps->cur->b, newsz + 1);
 	if (ps->cur->b == NULL) {
 		perror(NULL);
 		exit(1);
 	}
+	if (ps->cur->bsz && ps->spc)
+		ps->cur->b[ps->cur->bsz++] = ' ';
 	memcpy(ps->cur->b + ps->cur->bsz, p, sz);
-	ps->cur->bsz += sz;
-	ps->cur->b[ps->cur->bsz] = '\0';
+	ps->cur->b[ps->cur->bsz = newsz] = '\0';
 	ps->cur->real = ps->cur->b;
+	ps->spc = 0;
 }
 
+/*
+ * Close out the text node, if one is open, and strip its trailing whitespace.
+ */
 static void
-pnode_trim(struct pnode *pn)
+pnode_closetext(struct parse *p)
 {
-	assert(pn->node == NODE_TEXT);
-	for (; pn->bsz > 0; pn->b[--pn->bsz] = '\0')
-		if (isspace((unsigned char)pn->b[pn->bsz - 1]) == 0)
-			break;
+	struct pnode	*n;
+
+	if ((n = p->cur) == NULL || n->node != NODE_TEXT)
+		return;
+	p->cur = n->parent;
+	while (n->bsz > 0 && isspace((unsigned char)n->b[n->bsz - 1])) {
+		n->b[--n->bsz] = '\0';
+		p->spc = 1;
+	}
 }
 
 static void
@@ -345,11 +378,7 @@ xml_entity(struct parse *p, const char *name)
 		return;
 	}
 
-	/* Close out the text node, if there is one. */
-	if (p->cur->node == NODE_TEXT) {
-		pnode_trim(p->cur);
-		p->cur = p->cur->parent;
-	}
+	pnode_closetext(p);
 
 	if (p->tree->flags & TREE_CLOSED && p->cur == p->tree->root)
 		warn_msg(p, "entity after end of document: &%s;", name);
@@ -371,10 +400,12 @@ xml_entity(struct parse *p, const char *name)
 	}
 	dat->node = NODE_ESCAPE;
 	dat->bsz = strlen(dat->b);
+	dat->spc = p->spc;
 	dat->parent = p->cur;
 	TAILQ_INIT(&dat->childq);
 	TAILQ_INIT(&dat->attrq);
 	TAILQ_INSERT_TAIL(&p->cur->childq, dat, child);
+	p->spc = 0;
 }
 
 /*
@@ -398,11 +429,7 @@ xml_elem_start(struct parse *ps, const char *name)
 		return;
 	}
 
-	/* Close out the text node, if there is one. */
-	if (ps->cur != NULL && ps->cur->node == NODE_TEXT) {
-		pnode_trim(ps->cur);
-		ps->cur = ps->cur->parent;
-	}
+	pnode_closetext(ps);
 
 	for (elem = elements; elem->name != NULL; elem++)
 		if (strcmp(elem->name, name) == 0)
@@ -436,7 +463,52 @@ xml_elem_start(struct parse *ps, const char *name)
 		perror(NULL);
 		exit(1);
 	}
-	dat->node = elem->node;
+
+	/*
+	 * Nodes that begin a new macro or request line or start by
+	 * printing text always want whitespace before themselves.
+	 */
+
+	switch (dat->node = elem->node) {
+	case NODE_APPENDIX:
+	case NODE_AUTHORGROUP:
+	case NODE_BLOCKQUOTE:
+	case NODE_BOOKINFO:
+	case NODE_CAUTION:
+	case NODE_EDITOR:
+	case NODE_ENTRY:
+	case NODE_FUNCDEF:
+	case NODE_FUNCPROTOTYPE:
+	case NODE_INFORMALEQUATION:
+	case NODE_INLINEEQUATION:
+	case NODE_ITEMIZEDLIST:
+	case NODE_LEGALNOTICE:
+	case NODE_LITERALLAYOUT:
+	case NODE_NOTE:
+	case NODE_ORDEREDLIST:
+	case NODE_PARA:
+	case NODE_PREFACE:
+	case NODE_PROGRAMLISTING:
+	case NODE_REFMETA:
+	case NODE_REFNAMEDIV:
+	case NODE_REFSYNOPSISDIV:
+	case NODE_ROW:
+	case NODE_SBR:
+	case NODE_SCREEN:
+	case NODE_SECTION:
+	case NODE_SYNOPSIS:
+	case NODE_TGROUP:
+	case NODE_TIP:
+	case NODE_TITLE:
+	case NODE_VARIABLELIST:
+	case NODE_VARLISTENTRY:
+	case NODE_WARNING:
+		dat->spc = 1;
+		break;
+	default:
+		dat->spc = ps->spc;
+		break;
+	}
 	dat->parent = ps->cur;
 	TAILQ_INIT(&dat->childq);
 	TAILQ_INIT(&dat->attrq);
@@ -455,7 +527,7 @@ xml_attrkey(struct parse *ps, const char *name)
 	struct pattr	*attr;
 	enum attrkey	 key;
 
-	if (ps->del > 0 || *name == '\0')
+	if (ps->del > 0 || ps->ncur == NODE_IGNORE || *name == '\0')
 		return;
 	if ((key = attrkey_parse(name)) == ATTRKEY__MAX) {
 		ps->attr = 0;
@@ -477,7 +549,7 @@ xml_attrval(struct parse *ps, const char *name)
 {
 	struct pattr	*attr;
 
-	if (ps->del > 0 || ps->attr == 0)
+	if (ps->del > 0 || ps->ncur == NODE_IGNORE || ps->attr == 0)
 		return;
 	if ((attr = TAILQ_LAST(&ps->cur->attrq, pattrq)) == NULL)
 		return;
@@ -507,11 +579,8 @@ xml_elem_end(struct parse *ps, const char *name)
 		return;
 	}
 
-	/* Close out the text node, if there is one. */
-	if (ps->del == 0 && ps->cur != NULL && ps->cur->node == NODE_TEXT) {
-		pnode_trim(ps->cur);
-		ps->cur = ps->cur->parent;
-	}
+	if (ps->del == 0)
+		pnode_closetext(ps);
 
 	if (name != NULL) {
 		for (elem = elements; elem->name != NULL; elem++)
@@ -546,6 +615,7 @@ xml_elem_end(struct parse *ps, const char *name)
 			ps->tree->flags |= TREE_CLOSED;
 		else
 			ps->cur = ps->cur->parent;
+		ps->spc = 0;
 		break;
 	}
 	assert(ps->del == 0);
@@ -579,6 +649,19 @@ parse_free(struct parse *p)
 	free(p);
 }
 
+static void
+increment(struct parse *p, char *b, size_t *pend, int refill)
+{
+	if (refill) {
+		if (b[*pend] == '\n') {
+			p->nline++;
+			p->ncol = 1;
+		} else
+			p->ncol++;
+	}
+	++*pend;
+}
+
 /*
  * Advance the pend pointer to the next character in the charset.
  * If the charset starts with a space, it stands for any whitespace.
@@ -589,7 +672,7 @@ parse_free(struct parse *p)
  */
 static int
 advance(struct parse *p, char *b, size_t rlen, size_t *pend,
-    const char *charset)
+    const char *charset, int refill)
 {
 	int		 space;
 
@@ -599,234 +682,221 @@ advance(struct parse *p, char *b, size_t rlen, size_t 
 	} else
 		space = 0;
 
-	p->nline = p->line;
-	p->ncol = p->col;
+	if (refill) {
+		p->nline = p->line;
+		p->ncol = p->col;
+	}
 	while (*pend < rlen) {
-		if (b[*pend] == '\n') {
-			p->nline++;
-			p->ncol = 1;
-		} else
-			p->ncol++;
 		if (space && isspace((unsigned char)b[*pend]))
 			break;
 		if (strchr(charset, b[*pend]) != NULL)
 			break;
-		++*pend;
+		increment(p, b, pend, refill);
 	}
 	if (*pend == rlen) {
 		b[rlen] = '\0';
-		return 1;
+		return refill;
 	} else
 		return 0;
 }
 
-struct ptree *
-parse_file(struct parse *p, int fd, const char *fname)
+size_t
+parse_string(struct parse *p, char *b, size_t rlen,
+    enum pstate *pstate, int refill)
 {
-	char		 b[4096];
 	char		*cp;
-	ssize_t		 rsz;	/* Return value from read(2). */
-	size_t		 rlen;  /* Number of bytes in b[]. */
 	size_t		 poff;  /* Parse offset in b[]. */
 	size_t		 pend;  /* Offset of the end of the current word. */
-	int		 in_tag, in_arg, in_quotes, elem_end;
+	int		 elem_end;
 
-	p->fname = fname;
-	p->nline = 1;
-	p->ncol = 1;
-	rlen = 0;
-	in_tag = in_arg = in_quotes = 0;
+	p->spc = 0;
+	pend = 0;
+	for (;;) {
 
-	/*
-	 * Read loop.
-	 *
-	 * We have to enter the read loop once more even on EOF
-	 * because the previous token may have been incomplete,
-	 * such that it asked for more input.
-	 * Once rsz is 0, incomplete tokens will no longer ask
-	 * for more input but instead use whatever there is,
-	 * and then exit the read loop.
-	 * The minus one on the size limit for read(2) is needed
-	 * such that advance() can set b[rlen] to NUL when needed.
-	 */
+		/* Proceed to the next token, skipping whitespace. */
 
-	while ((rsz = read(fd, b + rlen, sizeof(b) - rlen - 1)) >= 0) {
-		if ((rlen += rsz) == 0)
+		if (refill) {
+			p->line = p->nline;
+			p->col = p->ncol;
+		}
+		if ((poff = pend) == rlen)
 			break;
+		if (isspace((unsigned char)b[pend])) {
+			p->spc = 1;
+			increment(p, b, &pend, refill);
+			continue;
+		}
 
-		/* Token loop. */
+		/*
+		 * The following four cases (ARG, TAG, and starting an
+		 * entity or a tag) all parse a word or quoted string.
+		 * If that extends beyond the read buffer and the last
+		 * read(2) still got data, they all break out of the
+		 * token loop to request more data from the read loop.
+		 *
+		 * Also, three of them detect self-closing tags, those
+		 * ending with "/>", setting the flag elem_end and
+		 * calling xml_elem_end() at the very end, after
+		 * handling the attribute value, attribute name, or
+		 * tag name, respectively.
+		 */
 
-		pend = 0;
-		for (;;) {
+		/* Parse an attribute value. */
 
-			/* Proceed to the next token, skipping whitespace. */
-
-			p->line = p->nline;
-			p->col = p->ncol;
-			if ((poff = pend) == rlen)
-				break;
-			if (isspace((unsigned char)b[pend])) {
-				if (b[pend++] == '\n') {
-					p->nline++;
-					p->ncol = 1;
-				} else
-					p->ncol++;
+		if (*pstate >= PARSE_ARG) {
+			if (*pstate == PARSE_ARG &&
+			    (b[pend] == '\'' || b[pend] == '"')) {
+				*pstate = b[pend] == '"' ?
+				    PARSE_DQ : PARSE_SQ;
+				increment(p, b, &pend, refill);
 				continue;
 			}
-
-			/*
-			 * The following four cases (in_arg, in_tag, and
-			 * starting an entity or a tag) all parse a word
-			 * or quoted string.  If that extends beyond the
-			 * read buffer and the last read(2) still got
-			 * data, they all break out of the token loop
-			 * to request more data from the read loop.
-			 *
-			 * Also, three of them detect self-closing tags,
-			 * those ending with "/>", setting the flag
-			 * elem_end and calling xml_elem_end() at the
-			 * very end, after handling the attribute value,
-			 * attribute name, or tag name, respectively.
-			 */
-
-			/* Parse an attribute value. */
-
-			if (in_arg) {
-				if (in_quotes == 0 &&
-				    (b[pend] == '\'' || b[pend] == '"')) {
-					in_quotes = b[pend] == '"' ? 2 : 1;
-					p->ncol++;
-					pend++;
-					continue;
+			if (advance(p, b, rlen, &pend,
+			    *pstate == PARSE_DQ ? "\"" :
+			    *pstate == PARSE_SQ ? "'" : " >", refill))
+				break;
+			*pstate = PARSE_TAG;
+			elem_end = 0;
+			if (b[pend] == '>') {
+				*pstate = PARSE_ELEM;
+				if (pend > 0 && b[pend - 1] == '/') {
+					b[pend - 1] = '\0';
+					elem_end = 1;
 				}
-				if (advance(p, b, rlen, &pend,
-				    in_quotes == 2 ? "\"" :
-				    in_quotes == 1 ? "'" : " >") && rsz > 0)
-					break;
-				in_arg = in_quotes = elem_end = 0;
-				if (b[pend] == '>') {
-					in_tag = 0;
-					if (pend > 0 && b[pend - 1] == '/') {
-						b[pend - 1] = '\0';
-						elem_end = 1;
-					}
-				}
-				b[pend] = '\0';
-				if (pend < rlen)
-					pend++;
-				xml_attrval(p, b + poff);
-				if (elem_end)
-					xml_elem_end(p, NULL);
+			}
+			b[pend] = '\0';
+			if (pend < rlen)
+				increment(p, b, &pend, refill);
+			xml_attrval(p, b + poff);
+			if (elem_end)
+				xml_elem_end(p, NULL);
 
-			/* Look for an attribute name. */
+		/* Look for an attribute name. */
 
-			} else if (in_tag) {
-				if (advance(p, b, rlen, &pend, " =>") &&
-				    rsz > 0)
-					break;
-				elem_end = 0;
-				switch (b[pend]) {
-				case '>':
-					in_tag = 0;
-					if (pend > 0 && b[pend - 1] == '/') {
-						b[pend - 1] = '\0';
-						elem_end = 1;
-					}
-					break;
-				case '=':
-					in_arg = 1;
-					break;
-				default:
-					break;
+		} else if (*pstate == PARSE_TAG) {
+			if (advance(p, b, rlen, &pend, " =>", refill))
+				break;
+			elem_end = 0;
+			switch (b[pend]) {
+			case '>':
+				*pstate = PARSE_ELEM;
+				if (pend > 0 && b[pend - 1] == '/') {
+					b[pend - 1] = '\0';
+					elem_end = 1;
 				}
-				b[pend] = '\0';
-				if (pend < rlen)
-					pend++;
-				xml_attrkey(p, b + poff);
-				if (elem_end)
-					xml_elem_end(p, NULL);
+				break;
+			case '=':
+				*pstate = PARSE_ARG;
+				break;
+			default:
+				break;
+			}
+			b[pend] = '\0';
+			if (pend < rlen)
+				increment(p, b, &pend, refill);
+			xml_attrkey(p, b + poff);
+			if (elem_end)
+				xml_elem_end(p, NULL);
 
-			/* Begin an opening or closing tag. */
+		/* Begin an opening or closing tag. */
 
-			} else if (b[poff] == '<') {
-				if (advance(p, b, rlen, &pend, " >") &&
-				    rsz > 0)
-					break;
-				if (pend > poff + 3 &&
-				    strncmp(b + poff, "<!--", 4) == 0) {
+		} else if (b[poff] == '<') {
+			if (advance(p, b, rlen, &pend, " >", refill))
+				break;
+			if (pend > poff + 3 &&
+			    strncmp(b + poff, "<!--", 4) == 0) {
 
-					/* Skip a comment. */
+				/* Skip a comment. */
 
-					cp = strstr(b + pend - 2, "-->");
-					if (cp == NULL) {
-						if (rsz > 0) {
-							pend = rlen;
-							break;
-						}
-						cp = b + rlen;
-					} else
-						cp += 3;
-					while (b + pend < cp) {
-						if (b[++pend] == '\n') {
-							p->nline++;
-							p->ncol = 1;
-						} else
-							p->ncol++;
-					}
-					continue;
-				}
-				elem_end = 0;
-				if (b[pend] != '>')
-					in_tag = 1;
-				else if (pend > 0 && b[pend - 1] == '/') {
-					b[pend - 1] = '\0';
-					elem_end = 1;
-				}
-				b[pend] = '\0';
-				if (pend < rlen)
-					pend++;
-				if (b[++poff] == '/') {
-					elem_end = 1;
-					poff++;
+				cp = strstr(b + pend - 2, "-->");
+				if (cp == NULL) {
+					if (refill)
+						break;
+					cp = b + rlen;
 				} else
-					xml_elem_start(p, b + poff);
-				if (elem_end)
-					xml_elem_end(p, b + poff);
+					cp += 3;
+				while (b + pend < cp)
+					increment(p, b, &pend, refill);
+				continue;
+			}
+			elem_end = 0;
+			if (b[pend] != '>')
+				*pstate = PARSE_TAG;
+			else if (pend > 0 && b[pend - 1] == '/') {
+				b[pend - 1] = '\0';
+				elem_end = 1;
+			}
+			b[pend] = '\0';
+			if (pend < rlen)
+				increment(p, b, &pend, refill);
+			if (b[++poff] == '/') {
+				elem_end = 1;
+				poff++;
+			} else
+				xml_elem_start(p, b + poff);
+			if (elem_end)
+				xml_elem_end(p, b + poff);
 
-			/* Process an entity. */
+		/* Process an entity. */
 
-			} else if (b[poff] == '&') {
-				if (advance(p, b, rlen, &pend, ";") &&
-				    rsz > 0)
-					break;
-				b[pend] = '\0';
-				if (pend < rlen)
-					pend++;
-				xml_entity(p, b + poff + 1);
+		} else if (b[poff] == '&') {
+			if (advance(p, b, rlen, &pend, ";", refill))
+				break;
+			b[pend] = '\0';
+			if (pend < rlen)
+				increment(p, b, &pend, refill);
+			xml_entity(p, b + poff + 1);
 
-			/* Process text up to the next tag or entity. */
+		/* Process text up to the next tag or entity. */
 
-			} else {
-				if (advance(p, b, rlen, &pend, "<&") == 0)
-					p->ncol--;
-				xml_char(p, b + poff, pend - poff);
-			}
+		} else {
+			advance(p, b, rlen, &pend, "<&", refill);
+			xml_char(p, b + poff, pend - poff);
 		}
+	}
+	return poff;
+}
 
-		/* Buffer exhausted; shift left and re-fill. */
+struct ptree *
+parse_file(struct parse *p, int fd, const char *fname)
+{
+	char		 b[4096];
+	ssize_t		 rsz;	/* Return value from read(2). */
+	size_t		 rlen;	/* Number of bytes in b[]. */
+	size_t		 poff;  /* Parse offset in b[]. */
+	enum pstate	 pstate;
 
+	p->fname = fname;
+	p->nline = 1;
+	p->ncol = 1;
+	pstate = PARSE_ELEM;
+	rlen = 0;
+
+	/*
+	 * Read loop.
+	 *
+	 * If the previous token was incomplete and asked for more
+	 * input, we have to enter the read loop once more even on EOF.
+	 * Once rsz is 0, incomplete tokens will no longer ask
+	 * for more input but instead use whatever there is,
+	 * and then exit the read loop.
+	 * The minus one on the size limit for read(2) is needed
+	 * such that advance() can set b[rlen] to NUL when needed.
+	 */
+
+	while ((rsz = read(fd, b + rlen, sizeof(b) - rlen - 1)) >= 0 &&
+	    (rlen += rsz) > 0) {
+		poff = parse_string(p, b, rlen, &pstate, rsz > 0);
+		/* Buffer exhausted; shift left and re-fill. */
 		assert(poff > 0);
-		memmove(b, b + poff, rlen - poff);
 		rlen -= poff;
+		memmove(b, b + poff, rlen);
 	}
 	if (rsz < 0) {
 		perror(fname);
 		p->tree->flags |= TREE_FAIL;
 	}
-	if (p->cur != NULL && p->cur->node == NODE_TEXT) {
-		pnode_trim(p->cur);
-		p->cur = p->cur->parent;
-	}
+	pnode_closetext(p);
 	if ((p->tree->flags & TREE_CLOSED) == 0)
 		warn_msg(p, "document not closed");
 	return p->tree;