/*
 * File: docbook2mdoc/parse.c (retrieved from cvsweb.bsd.lv)
 * Revision 1.22, Sun Apr 7 19:33:27 2019 UTC, by schwarze
 * Branch: MAIN
 * Changes since 1.21: +5 -2 lines
 * Log: handle <appendix>, <article>, <book>, and <legalnotice>
 * similar to <section>
 */
/* $Id: parse.c,v 1.22 2019/04/07 19:33:27 schwarze Exp $ */
/*
* Copyright (c) 2014 Kristaps Dzonsons <kristaps@bsd.lv>
* Copyright (c) 2019 Ingo Schwarze <schwarze@openbsd.org>
*
* Permission to use, copy, modify, and distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
* copyright notice and this permission notice appear in all copies.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/
#include <assert.h>
#include <ctype.h>
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include "node.h"
#include "parse.h"
/*
* The implementation of the DocBook parser.
*/
/*
 * Lexer state that parse_string() carries from one call to the next.
 * The numeric order matters: every state >= PARSE_ARG means that
 * an attribute value is being parsed.
 */
enum pstate {
	PARSE_ELEM = 0,	/* Outside of any tag. */
	PARSE_TAG = 1,	/* Inside a tag, looking for an attribute name. */
	PARSE_ARG = 2,	/* After '=', looking for an attribute value. */
	PARSE_SQ = 3,	/* Inside a single-quoted attribute value. */
	PARSE_DQ = 4	/* Inside a double-quoted attribute value. */
};
/*
* Global parse state.
* Keep this as simple and small as possible.
*/
struct parse {
const char *fname; /* Name of the input file. */
struct ptree *tree; /* Complete parse result. */
struct pnode *cur; /* Current node in the tree. */
enum nodeid ncur; /* Type of the current node. */
int line; /* Line number in the input file. */
int col; /* Column number in the input file. */
int nline; /* Line number of next token. */
int ncol; /* Column number of next token. */
int del; /* Levels of nested nodes being deleted. */
int spc; /* Whitespace before the next element. */
int attr; /* The most recent attribute is valid. */
int warn; /* Non-zero: warn_msg() prints; zero: warnings are dropped. */
};
/*
 * One row of the elements[] table: the node type that the parser
 * generates for a DocBook element of the given name.
 */
struct element {
const char *name; /* DocBook element name. */
enum nodeid node; /* Node type to generate. */
};
/*
 * Translation table from DocBook element names to node types.
 * xml_elem_start() and xml_elem_end() search it linearly with strcmp(3).
 * The final entry has a NULL name and ends the search; its node type,
 * NODE_IGNORE, is what unknown element names end up with.
 */
static const struct element elements[] = {
{ "acronym", NODE_IGNORE },
{ "affiliation", NODE_AFFILIATION },
{ "anchor", NODE_DELETE },
{ "appendix", NODE_APPENDIX },
{ "application", NODE_APPLICATION },
{ "arg", NODE_ARG },
{ "article", NODE_SECTION },
{ "author", NODE_AUTHOR },
{ "authorgroup", NODE_AUTHORGROUP },
{ "blockquote", NODE_BLOCKQUOTE },
{ "book", NODE_SECTION },
{ "bookinfo", NODE_BOOKINFO },
{ "caution", NODE_CAUTION },
{ "chapter", NODE_SECTION },
{ "citerefentry", NODE_CITEREFENTRY },
{ "citetitle", NODE_CITETITLE },
{ "cmdsynopsis", NODE_CMDSYNOPSIS },
{ "code", NODE_LITERAL },
{ "colspec", NODE_COLSPEC },
{ "command", NODE_COMMAND },
{ "constant", NODE_CONSTANT },
{ "contrib", NODE_CONTRIB },
{ "copyright", NODE_COPYRIGHT },
{ "date", NODE_DATE },
{ "editor", NODE_EDITOR },
{ "email", NODE_EMAIL },
{ "emphasis", NODE_EMPHASIS },
{ "entry", NODE_ENTRY },
{ "envar", NODE_ENVAR },
{ "errorname", NODE_ERRORNAME },
{ "fieldsynopsis", NODE_FIELDSYNOPSIS },
{ "filename", NODE_FILENAME },
{ "firstname", NODE_PERSONNAME },
{ "firstterm", NODE_FIRSTTERM },
{ "footnote", NODE_FOOTNOTE },
{ "funcdef", NODE_FUNCDEF },
{ "funcprototype", NODE_FUNCPROTOTYPE },
{ "funcsynopsis", NODE_FUNCSYNOPSIS },
{ "funcsynopsisinfo", NODE_FUNCSYNOPSISINFO },
{ "function", NODE_FUNCTION },
{ "glossary", NODE_VARIABLELIST },
{ "glossdef", NODE_IGNORE },
{ "glossdiv", NODE_IGNORE },
{ "glossentry", NODE_VARLISTENTRY },
{ "glosslist", NODE_VARIABLELIST },
{ "glossterm", NODE_GLOSSTERM },
{ "group", NODE_GROUP },
{ "holder", NODE_HOLDER },
{ "index", NODE_INDEX },
{ "indexterm", NODE_DELETE },
{ "info", NODE_INFO },
{ "informalequation", NODE_INFORMALEQUATION },
{ "informaltable", NODE_TABLE },
{ "inlineequation", NODE_INLINEEQUATION },
{ "itemizedlist", NODE_ITEMIZEDLIST },
{ "keysym", NODE_KEYSYM },
{ "legalnotice", NODE_LEGALNOTICE },
{ "link", NODE_LINK },
{ "listitem", NODE_LISTITEM },
{ "literal", NODE_LITERAL },
{ "literallayout", NODE_LITERALLAYOUT },
{ "manvolnum", NODE_MANVOLNUM },
{ "member", NODE_MEMBER },
{ "mml:math", NODE_MML_MATH },
{ "mml:mfenced", NODE_MML_MFENCED },
{ "mml:mfrac", NODE_MML_MFRAC },
{ "mml:mi", NODE_MML_MI },
{ "mml:mn", NODE_MML_MN },
{ "mml:mo", NODE_MML_MO },
{ "mml:mrow", NODE_MML_MROW },
{ "mml:msub", NODE_MML_MSUB },
{ "mml:msup", NODE_MML_MSUP },
{ "modifier", NODE_MODIFIER },
{ "note", NODE_NOTE },
{ "option", NODE_OPTION },
{ "orderedlist", NODE_ORDEREDLIST },
{ "orgname", NODE_ORGNAME },
{ "othername", NODE_PERSONNAME },
{ "para", NODE_PARA },
{ "paramdef", NODE_PARAMDEF },
{ "parameter", NODE_PARAMETER },
{ "part", NODE_SECTION },
{ "personname", NODE_PERSONNAME },
{ "phrase", NODE_IGNORE },
{ "preface", NODE_PREFACE },
{ "primary", NODE_DELETE },
{ "programlisting", NODE_PROGRAMLISTING },
{ "prompt", NODE_PROMPT },
{ "quote", NODE_QUOTE },
{ "refclass", NODE_REFCLASS },
{ "refdescriptor", NODE_REFDESCRIPTOR },
{ "refentry", NODE_REFENTRY },
{ "refentryinfo", NODE_REFENTRYINFO },
{ "refentrytitle", NODE_REFENTRYTITLE },
{ "refmeta", NODE_REFMETA },
{ "refmetainfo", NODE_REFMETAINFO },
{ "refmiscinfo", NODE_REFMISCINFO },
{ "refname", NODE_REFNAME },
{ "refnamediv", NODE_REFNAMEDIV },
{ "refpurpose", NODE_REFPURPOSE },
{ "refsect1", NODE_SECTION },
{ "refsect2", NODE_SECTION },
{ "refsect3", NODE_SECTION },
{ "refsection", NODE_SECTION },
{ "refsynopsisdiv", NODE_REFSYNOPSISDIV },
{ "releaseinfo", NODE_RELEASEINFO },
{ "replaceable", NODE_REPLACEABLE },
{ "row", NODE_ROW },
{ "sbr", NODE_SBR },
{ "screen", NODE_SCREEN },
{ "secondary", NODE_DELETE },
{ "sect1", NODE_SECTION },
{ "sect2", NODE_SECTION },
{ "section", NODE_SECTION },
{ "sgmltag", NODE_SGMLTAG },
{ "simpara", NODE_PARA },
{ "simplelist", NODE_SIMPLELIST },
{ "spanspec", NODE_SPANSPEC },
{ "structfield", NODE_PARAMETER },
{ "structname", NODE_TYPE },
{ "subtitle", NODE_SUBTITLE },
{ "surname", NODE_PERSONNAME },
{ "symbol", NODE_CONSTANT },
{ "synopsis", NODE_SYNOPSIS },
{ "table", NODE_TABLE },
{ "tbody", NODE_TBODY },
{ "term", NODE_TERM },
{ "tfoot", NODE_TFOOT },
{ "tgroup", NODE_TGROUP },
{ "thead", NODE_THEAD },
{ "tip", NODE_TIP },
{ "title", NODE_TITLE },
{ "trademark", NODE_IGNORE },
{ "type", NODE_TYPE },
{ "ulink", NODE_LINK },
{ "userinput", NODE_LITERAL },
{ "variablelist", NODE_VARIABLELIST },
{ "varlistentry", NODE_VARLISTENTRY },
{ "varname", NODE_VARNAME },
{ "warning", NODE_WARNING },
{ "wordasword", NODE_WORDASWORD },
{ "xi:include", NODE_DELETE_WARN },
{ "year", NODE_YEAR },
{ NULL, NODE_IGNORE } /* End marker; also the result for unknown names. */
};
/*
 * One row of the entities[] table.
 */
struct entity {
const char *name; /* Entity name, without the leading '&' and trailing ';'. */
const char *roff; /* Replacement text stored in the escape node. */
};
/*
 * XML character entity references found in the wild.
 * Those that don't have an exact mandoc_char(7) representation
 * are approximated, and the desired codepoint is given as a comment.
 * Encoding them as \\[u...] would leave -Tascii out in the cold.
 *
 * xml_entity() searches this table linearly, comparing names with
 * strcmp(3), so the comparison is case-sensitive ("Delta", "rArr").
 * The final entry has NULL members; reaching it means that the
 * entity is unknown.
 */
static const struct entity entities[] = {
{ "alpha", "\\(*a" },
{ "amp", "&" },
{ "apos", "'" },
{ "auml", "\\(:a" },
{ "beta", "\\(*b" },
{ "circ", "^" }, /* U+02C6 */
{ "copy", "\\(co" },
{ "dagger", "\\(dg" },
{ "Delta", "\\(*D" },
{ "eacute", "\\('e" },
{ "emsp", "\\ " }, /* U+2003 */
{ "gt", ">" },
{ "hairsp", "\\^" },
{ "kappa", "\\(*k" },
{ "larr", "\\(<-" },
{ "ldquo", "\\(lq" },
{ "le", "\\(<=" },
{ "lowbar", "_" },
{ "lsqb", "[" },
{ "lt", "<" },
{ "mdash", "\\(em" },
{ "minus", "\\-" },
{ "ndash", "\\(en" },
{ "nbsp", "\\ " },
{ "num", "#" },
{ "oslash", "\\(/o" },
{ "ouml", "\\(:o" },
{ "percnt", "%" },
{ "quot", "\\(dq" },
{ "rarr", "\\(->" },
{ "rArr", "\\(rA" },
{ "rdquo", "\\(rq" },
{ "reg", "\\(rg" },
{ "rho", "\\(*r" },
{ "rsqb", "]" },
{ "sigma", "\\(*s" },
{ "shy", "\\&" }, /* U+00AD */
{ "tau", "\\(*t" },
{ "tilde", "\\[u02DC]" }, /* NOTE(review): a \[u...] escape despite the remark above; confirm this is intended. */
{ "times", "\\[tmu]" },
{ "uuml", "\\(:u" },
{ NULL, NULL } /* End marker. */
};
/*
 * Print an error message to standard error, prefixed with the file
 * name and the line and column of the current token and followed by
 * a newline, and flag the parse tree as failed.
 * The arguments after p are a printf(3)-style format and its operands.
 */
static void
error_msg(struct parse *p, const char *fmt, ...)
{
	va_list	 args;

	fprintf(stderr, "%s:%d:%d: ", p->fname, p->line, p->col);

	va_start(args, fmt);
	vfprintf(stderr, fmt, args);
	va_end(args);

	fputs("\n", stderr);

	p->tree->flags = p->tree->flags | TREE_FAIL;
}
/*
 * Print a warning to standard error, prefixed with the file name,
 * the position of the current token, and the word "warning".
 * Nothing is printed unless warnings were enabled in the parser.
 * Unlike error_msg(), this does not touch the tree flags.
 */
static void
warn_msg(struct parse *p, const char *fmt, ...)
{
	va_list	 args;

	if (p->warn != 0) {
		fprintf(stderr, "%s:%d:%d: warning: ",
		    p->fname, p->line, p->col);

		va_start(args, fmt);
		vfprintf(stderr, fmt, args);
		va_end(args);

		fputs("\n", stderr);
	}
}
/*
 * Handle a run of sz bytes of character data starting at p.
 * While a text node is open, the data is appended to it, separated
 * by one blank if whitespace was skipped since the previous chunk.
 * Otherwise, a new text node is appended to the children of the
 * current node and left open.
 * Text inside a deleted subtree is ignored, and text preceding the
 * document element is reported as an error and discarded.
 */
static void
xml_char(struct parse *ps, const char *p, int sz)
{
	struct pnode	*txt;
	size_t		 oldsz;		/* Bytes already in the node. */
	size_t		 total;		/* Bytes after appending. */
	int		 addsp;		/* Insert a separating blank. */

	if (ps->del > 0)
		return;

	if (ps->cur == NULL) {
		error_msg(ps, "discarding text before document: %.*s", sz, p);
		return;
	}

	/* Open a new text node unless one is open already. */

	if (ps->cur->node != NODE_TEXT) {
		txt = calloc(1, sizeof(*txt));
		if (txt == NULL) {
			perror(NULL);
			exit(1);
		}
		txt->node = NODE_TEXT;
		txt->spc = ps->spc;
		txt->parent = ps->cur;
		TAILQ_INIT(&txt->childq);
		TAILQ_INIT(&txt->attrq);
		TAILQ_INSERT_TAIL(&ps->cur->childq, txt, child);
		ps->cur = txt;
	}
	txt = ps->cur;

	if ((ps->tree->flags & TREE_CLOSED) &&
	    txt->parent == ps->tree->root)
		warn_msg(ps, "text after end of document: %.*s", sz, p);

	/* Grow the buffer, then copy the optional blank and the data. */

	assert(sz >= 0);
	oldsz = txt->bsz;
	addsp = oldsz > 0 && ps->spc;
	total = oldsz + addsp + sz;

	txt->b = realloc(txt->b, total + 1);
	if (txt->b == NULL) {
		perror(NULL);
		exit(1);
	}
	if (addsp)
		txt->b[oldsz++] = ' ';
	memcpy(txt->b + oldsz, p, sz);
	txt->b[total] = '\0';
	txt->bsz = total;
	txt->real = txt->b;
	ps->spc = 0;
}
/*
 * If the current node is a text node, close it: make its parent the
 * current node again and cut off trailing whitespace.
 * When anything was cut off, remember that whitespace precedes
 * whatever comes next.
 */
static void
pnode_closetext(struct parse *p)
{
	struct pnode	*txt;

	txt = p->cur;
	if (txt == NULL || txt->node != NODE_TEXT)
		return;

	p->cur = txt->parent;

	while (txt->bsz > 0) {
		if (!isspace((unsigned char)txt->b[txt->bsz - 1]))
			break;
		txt->bsz--;
		txt->b[txt->bsz] = '\0';
		p->spc = 1;
	}
}
/*
 * Handle the entity reference &name;.
 * An open text node is closed first.  A known entity is then
 * appended to the current node as a NODE_ESCAPE child holding the
 * replacement string from the entities[] table; the current node
 * itself does not change.
 * Entities inside deleted subtrees are ignored; entities before the
 * document element and unknown entities are reported as errors.
 */
static void
xml_entity(struct parse *p, const char *name)
{
	const struct entity	*ent;
	struct pnode		*esc;

	if (p->del > 0)
		return;

	if (p->cur == NULL) {
		error_msg(p, "discarding entity before document: &%s;", name);
		return;
	}

	pnode_closetext(p);

	if ((p->tree->flags & TREE_CLOSED) && p->cur == p->tree->root)
		warn_msg(p, "entity after end of document: &%s;", name);

	/* Linear search; the end marker has a NULL name and NULL roff. */

	ent = entities;
	while (ent->name != NULL && strcmp(name, ent->name) != 0)
		ent++;

	if (ent->roff == NULL) {
		error_msg(p, "unknown entity &%s;", name);
		return;
	}

	/* Create, append, and close out an entity node. */

	esc = calloc(1, sizeof(*esc));
	if (esc != NULL)
		esc->b = esc->real = strdup(ent->roff);
	if (esc == NULL || esc->b == NULL) {
		perror(NULL);
		exit(1);
	}
	esc->node = NODE_ESCAPE;
	esc->bsz = strlen(esc->b);
	esc->spc = p->spc;
	esc->parent = p->cur;
	TAILQ_INIT(&esc->childq);
	TAILQ_INIT(&esc->attrq);
	TAILQ_INSERT_TAIL(&p->cur->childq, esc, child);
	p->spc = 0;
}
/*
 * Begin an element.
 * The name is looked up in the elements[] table and the resulting
 * node type is remembered in ps->ncur for the attribute handlers
 * and for self-closing tags.
 * Depending on the type, the element is ignored (no node, children
 * are attached to the enclosing node), deleted together with all
 * its content, or appended to the tree as a new child of the
 * current node, which then becomes the current node.
 * The first node created becomes the root of the tree.
 */
static void
xml_elem_start(struct parse *ps, const char *name)
{
	const struct element	*elem;
	struct pnode		*dat;

	/*
	 * Declarations and processing instructions never create nodes.
	 * Mark the tag as ignored such that the words following
	 * its name are not mistaken for attributes of whatever
	 * element happened to be started last, and such that a
	 * trailing "/>" does not close that element.
	 * Inside a deleted subtree, attributes are skipped anyway
	 * and ncur is left alone.
	 */

	if (*name == '!' || *name == '?') {
		if (ps->del == 0)
			ps->ncur = NODE_IGNORE;
		return;
	}

	/*
	 * An ancestor is excluded from the tree;
	 * keep track of the number of levels excluded.
	 */

	if (ps->del > 0) {
		ps->del++;
		return;
	}

	pnode_closetext(ps);

	/* Unknown names end at the end marker, which says NODE_IGNORE. */

	for (elem = elements; elem->name != NULL; elem++)
		if (strcmp(elem->name, name) == 0)
			break;

	if (elem->name == NULL)
		error_msg(ps, "unknown element <%s>", name);

	ps->ncur = elem->node;

	switch (ps->ncur) {
	case NODE_DELETE_WARN:
		warn_msg(ps, "skipping element <%s>", name);
		/* FALLTHROUGH */
	case NODE_DELETE:
		ps->del = 1;
		/* FALLTHROUGH */
	case NODE_IGNORE:
		return;
	case NODE_INLINEEQUATION:
		ps->tree->flags |= TREE_EQN;
		break;
	default:
		break;
	}

	if (ps->tree->flags & TREE_CLOSED && ps->cur->parent == NULL)
		warn_msg(ps, "element after end of document: <%s>", name);

	if ((dat = calloc(1, sizeof(*dat))) == NULL) {
		perror(NULL);
		exit(1);
	}

	/*
	 * Nodes that begin a new macro or request line or start by
	 * printing text always want whitespace before themselves.
	 */

	switch (dat->node = elem->node) {
	case NODE_APPENDIX:
	case NODE_AUTHORGROUP:
	case NODE_BLOCKQUOTE:
	case NODE_BOOKINFO:
	case NODE_CAUTION:
	case NODE_EDITOR:
	case NODE_ENTRY:
	case NODE_FUNCDEF:
	case NODE_FUNCPROTOTYPE:
	case NODE_INFORMALEQUATION:
	case NODE_INLINEEQUATION:
	case NODE_ITEMIZEDLIST:
	case NODE_LEGALNOTICE:
	case NODE_LITERALLAYOUT:
	case NODE_NOTE:
	case NODE_ORDEREDLIST:
	case NODE_PARA:
	case NODE_PREFACE:
	case NODE_PROGRAMLISTING:
	case NODE_REFMETA:
	case NODE_REFNAMEDIV:
	case NODE_REFSYNOPSISDIV:
	case NODE_ROW:
	case NODE_SBR:
	case NODE_SCREEN:
	case NODE_SECTION:
	case NODE_SYNOPSIS:
	case NODE_TGROUP:
	case NODE_TIP:
	case NODE_TITLE:
	case NODE_VARIABLELIST:
	case NODE_VARLISTENTRY:
	case NODE_WARNING:
		dat->spc = 1;
		break;
	default:
		dat->spc = ps->spc;
		break;
	}

	/* Link the new node into the tree and descend into it. */

	dat->parent = ps->cur;
	TAILQ_INIT(&dat->childq);
	TAILQ_INIT(&dat->attrq);
	if (ps->cur != NULL)
		TAILQ_INSERT_TAIL(&ps->cur->childq, dat, child);
	ps->cur = dat;
	if (ps->tree->root == NULL)
		ps->tree->root = dat;
}
/*
 * Handle an attribute name found inside a tag.
 * A recognized name appends a new attribute without a value to the
 * current node and sets ps->attr such that xml_attrval() stores the
 * value that follows.  An unrecognized name clears ps->attr such
 * that the value is dropped.
 * Nothing is done inside deleted subtrees, for ignored elements,
 * and for empty names.
 */
static void
xml_attrkey(struct parse *ps, const char *name)
{
	struct pattr	*attr;
	enum attrkey	 key;

	if (ps->del > 0 || ps->ncur == NODE_IGNORE || *name == '\0')
		return;

	/*
	 * Without a current node, there is nothing to attach the
	 * attribute to.  That happens when a word matching an
	 * attribute name occurs in a declaration or processing
	 * instruction preceding the document element.
	 */

	if (ps->cur == NULL || (key = attrkey_parse(name)) == ATTRKEY__MAX) {
		ps->attr = 0;
		return;
	}

	if ((attr = calloc(1, sizeof(*attr))) == NULL) {
		perror(NULL);
		exit(1);
	}
	attr->key = key;
	attr->val = ATTRVAL__MAX;
	attr->rawval = NULL;
	TAILQ_INSERT_TAIL(&ps->cur->attrq, attr, child);
	ps->attr = 1;
}
/*
 * Handle an attribute value found inside a tag.
 * The value belongs to the attribute most recently appended to the
 * current node.  If attrval_parse() recognizes it, only the parsed
 * value is stored; otherwise, a copy of the string is kept as well.
 * Nothing is done inside deleted subtrees, for ignored elements,
 * and when the preceding attribute name was not recognized.
 */
static void
xml_attrval(struct parse *ps, const char *name)
{
	struct pattr	*last;

	if (ps->del > 0 || ps->ncur == NODE_IGNORE || ps->attr == 0)
		return;

	last = TAILQ_LAST(&ps->cur->attrq, pattrq);
	if (last == NULL)
		return;

	last->val = attrval_parse(name);
	if (last->val != ATTRVAL__MAX)
		return;

	/* Not a known keyword: preserve the raw string. */

	last->rawval = strdup(name);
	if (last->rawval == NULL) {
		perror(NULL);
		exit(1);
	}
}
/*
 * Roll up the parse tree.
 * If we're at a text node, roll that one up first.
 *
 * The name is the element name from a closing tag, or NULL for
 * a self-closing tag, in which case the node type remembered by
 * xml_elem_start() in ps->ncur is used instead.
 * Closing a deleted element ends the deletion, closing an ignored
 * element does nothing, and closing any other element makes its
 * parent the current node, provided that the element being closed
 * is the current node; otherwise, a warning is issued.
 */
static void
xml_elem_end(struct parse *ps, const char *name)
{
	const struct element	*elem;
	enum nodeid		 node;

	/*
	 * An ancestor is excluded from the tree;
	 * keep track of the number of levels excluded.
	 */

	if (ps->del > 1) {
		ps->del--;
		return;
	}

	if (ps->del == 0)
		pnode_closetext(ps);

	/* Unknown names end at the end marker, which says NODE_IGNORE. */

	if (name != NULL) {
		for (elem = elements; elem->name != NULL; elem++)
			if (strcmp(elem->name, name) == 0)
				break;
		node = elem->node;
	} else
		node = ps->ncur;

	switch (node) {
	case NODE_DELETE_WARN:
	case NODE_DELETE:
		if (ps->del > 0)
			ps->del--;
		break;
	case NODE_IGNORE:
		break;
	default:
		if (ps->cur == NULL || node != ps->cur->node) {
			/* Never hand a NULL pointer to %s. */
			if (name != NULL)
				warn_msg(ps, "element not open: </%s>",
				    name);
			else
				warn_msg(ps, "element not open: "
				    "self-closing tag");
			break;
		}

		/*
		 * Refrain from actually closing the document element.
		 * If no more content follows, no harm is done, but if
		 * some content still follows, simply processing it is
		 * obviously better than discarding it or crashing.
		 */

		if (ps->cur->parent == NULL)
			ps->tree->flags |= TREE_CLOSED;
		else
			ps->cur = ps->cur->parent;
		ps->spc = 0;
		break;
	}
	assert(ps->del == 0);
}
/*
 * Allocate a zero-initialized parser together with an empty tree.
 * A non-zero warn argument enables warning messages.
 * Returns NULL if memory allocation fails.
 * The result is released with parse_free().
 */
struct parse *
parse_alloc(int warn)
{
	struct parse	*ps;

	ps = calloc(1, sizeof(*ps));
	if (ps == NULL)
		return NULL;

	ps->tree = calloc(1, sizeof(*ps->tree));
	if (ps->tree != NULL) {
		ps->warn = warn;
		return ps;
	}

	/* No tree: give the parser back, too. */

	free(ps);
	return NULL;
}
void
parse_free(struct parse *p)
{
if (p == NULL)
return;
if (p->tree != NULL) {
pnode_unlink(p->tree->root);
free(p->tree);
}
free(p);
}
/*
 * Step over the byte at b[*pend].
 * If refill is set, also account for that byte in the position of
 * the next token: a newline starts a new line at column 1, any
 * other byte advances the column by one.
 */
static void
increment(struct parse *p, char *b, size_t *pend, int refill)
{
	const size_t	 pos = *pend;

	*pend = pos + 1;

	if (!refill)
		return;

	if (b[pos] != '\n') {
		p->ncol++;
		return;
	}
	p->nline++;
	p->ncol = 1;
}
/*
 * Advance the pend pointer to the next character in the charset.
 * If the charset starts with a space, it stands for any whitespace.
 * Update the new input file position, used for messages.
 * Do not overrun the buffer b of length rlen.
 * When reaching the end, NUL-terminate the buffer at b[rlen], which
 * the caller must make writable, and return the refill argument:
 * non-zero tells the caller to ask for more input and re-parse the
 * incomplete token, zero tells it to use what there is.
 * If a character from the charset is found before the end, return 0.
 *
 * A NUL byte in the input is never treated as a delimiter.
 */
static int
advance(struct parse *p, char *b, size_t rlen, size_t *pend,
    const char *charset, int refill)
{
	int	 space;

	if (*charset == ' ') {
		space = 1;
		charset++;
	} else
		space = 0;

	/* Restart position counting at the beginning of the token. */

	if (refill) {
		p->nline = p->line;
		p->ncol = p->col;
	}

	while (*pend < rlen) {
		if (space && isspace((unsigned char)b[*pend]))
			break;

		/*
		 * strchr(3) considers the terminating NUL to be part
		 * of the charset.  Stopping at a NUL byte would let
		 * callers that parse text make no progress at all,
		 * so skip over NUL bytes like over any other byte.
		 */

		if (b[*pend] != '\0' && strchr(charset, b[*pend]) != NULL)
			break;
		increment(p, b, pend, refill);
	}

	if (*pend == rlen) {
		b[rlen] = '\0';
		return refill;
	}
	return 0;
}
/*
 * Tokenize the rlen bytes in the buffer b and feed the tokens to
 * the xml_*() handlers.
 * The buffer is modified in place: delimiters ending a word are
 * overwritten with NUL bytes, and b[rlen] must be writable.
 * The lexer state *pstate is carried over from one call to the next.
 * If refill is non-zero, more input may follow: input positions are
 * tracked for messages, and a word or comment extending to the end
 * of the buffer is left unparsed.
 * Returns the number of bytes consumed, that is, the offset of the
 * first token that was not processed, or rlen if there is none.
 */
size_t
parse_string(struct parse *p, char *b, size_t rlen,
    enum pstate *pstate, int refill)
{
	char	*cp;
	size_t	 poff;	/* Parse offset in b[]. */
	size_t	 pend;	/* Offset of the end of the current word. */
	int	 elem_end;

	p->spc = 0;
	pend = 0;
	for (;;) {

		/* Proceed to the next token, skipping whitespace. */

		if (refill) {
			p->line = p->nline;
			p->col = p->ncol;
		}
		if ((poff = pend) == rlen)
			break;
		if (isspace((unsigned char)b[pend])) {
			p->spc = 1;
			increment(p, b, &pend, refill);
			continue;
		}

		/*
		 * The following four cases (ARG, TAG, and starting an
		 * entity or a tag) all parse a word or quoted string.
		 * If that extends beyond the read buffer and the last
		 * read(2) still got data, they all break out of the
		 * token loop to request more data from the read loop.
		 *
		 * Also, three of them detect self-closing tags, those
		 * ending with "/>", setting the flag elem_end and
		 * calling xml_elem_end() at the very end, after
		 * handling the attribute value, attribute name, or
		 * tag name, respectively.
		 */

		/* Parse an attribute value. */

		if (*pstate >= PARSE_ARG) {
			if (*pstate == PARSE_ARG &&
			    (b[pend] == '\'' || b[pend] == '"')) {
				*pstate = b[pend] == '"' ?
				    PARSE_DQ : PARSE_SQ;
				increment(p, b, &pend, refill);
				continue;
			}
			if (advance(p, b, rlen, &pend,
			    *pstate == PARSE_DQ ? "\"" :
			    *pstate == PARSE_SQ ? "'" : " >", refill))
				break;
			*pstate = PARSE_TAG;
			elem_end = 0;
			if (b[pend] == '>') {
				*pstate = PARSE_ELEM;
				if (pend > 0 && b[pend - 1] == '/') {
					b[pend - 1] = '\0';
					elem_end = 1;
				}
			}
			b[pend] = '\0';
			if (pend < rlen)
				increment(p, b, &pend, refill);
			xml_attrval(p, b + poff);
			if (elem_end)
				xml_elem_end(p, NULL);

		/* Look for an attribute name. */

		} else if (*pstate == PARSE_TAG) {
			if (advance(p, b, rlen, &pend, " =>", refill))
				break;
			elem_end = 0;
			switch (b[pend]) {
			case '>':
				*pstate = PARSE_ELEM;
				if (pend > 0 && b[pend - 1] == '/') {
					b[pend - 1] = '\0';
					elem_end = 1;
				}
				break;
			case '=':
				*pstate = PARSE_ARG;
				break;
			default:
				break;
			}
			b[pend] = '\0';
			if (pend < rlen)
				increment(p, b, &pend, refill);
			xml_attrkey(p, b + poff);
			if (elem_end)
				xml_elem_end(p, NULL);

		/* Begin an opening or closing tag. */

		} else if (b[poff] == '<') {
			if (advance(p, b, rlen, &pend, " >", refill))
				break;
			if (pend > poff + 3 &&
			    strncmp(b + poff, "<!--", 4) == 0) {

				/* Skip a comment. */

				/*
				 * Nothing has terminated the buffer
				 * yet when advance() stopped before
				 * the end of the data.  Do it now, or
				 * strstr(3) would search stale bytes
				 * beyond the data and might find
				 * a bogus end of comment there.
				 */

				b[rlen] = '\0';
				cp = strstr(b + pend - 2, "-->");
				if (cp == NULL) {
					if (refill)
						break;
					cp = b + rlen;
				} else
					cp += 3;
				while (b + pend < cp)
					increment(p, b, &pend, refill);
				continue;
			}
			elem_end = 0;
			if (b[pend] != '>')
				*pstate = PARSE_TAG;
			else if (pend > 0 && b[pend - 1] == '/') {
				b[pend - 1] = '\0';
				elem_end = 1;
			}
			b[pend] = '\0';
			if (pend < rlen)
				increment(p, b, &pend, refill);
			if (b[++poff] == '/') {
				elem_end = 1;
				poff++;
			} else
				xml_elem_start(p, b + poff);
			if (elem_end)
				xml_elem_end(p, b + poff);

		/* Process an entity. */

		} else if (b[poff] == '&') {
			if (advance(p, b, rlen, &pend, ";", refill))
				break;
			b[pend] = '\0';
			if (pend < rlen)
				increment(p, b, &pend, refill);
			xml_entity(p, b + poff + 1);

		/* Process text up to the next tag, entity, or EOL. */

		} else {
			advance(p, b, rlen, &pend, "<&", refill);
			xml_char(p, b + poff, pend - poff);
		}
	}
	return poff;
}
/*
 * Parse the complete input available from the file descriptor fd.
 * The file name is only used in messages.
 * Returns the parse tree owned by the parser; a read error is
 * reported and recorded in the tree with the TREE_FAIL flag.
 */
struct ptree *
parse_file(struct parse *p, int fd, const char *fname)
{
	char		 buf[4096];
	ssize_t		 nread;		/* Return value from read(2). */
	size_t		 have;		/* Number of bytes in buf[]. */
	size_t		 consumed;	/* Bytes parsed from buf[]. */
	enum pstate	 pstate;

	p->fname = fname;
	p->nline = 1;
	p->ncol = 1;
	pstate = PARSE_ELEM;
	have = 0;

	/*
	 * Read loop.
	 *
	 * Bytes of an incomplete token stay at the beginning of the
	 * buffer, and new input is appended to them.  For that reason,
	 * the loop is entered once more on EOF as long as such bytes
	 * remain.  In that last round, the parser is told that nothing
	 * more will come, so it uses up whatever there is, which
	 * empties the buffer and ends the loop.
	 * One byte of the buffer is never filled by read(2) such that
	 * advance() can set buf[have] to NUL when needed.
	 */

	for (;;) {
		nread = read(fd, buf + have, sizeof(buf) - have - 1);
		if (nread < 0)
			break;
		have += nread;
		if (have == 0)
			break;

		consumed = parse_string(p, buf, have, &pstate, nread > 0);

		/* Buffer exhausted; shift left and re-fill. */

		assert(consumed > 0);
		have -= consumed;
		memmove(buf, buf + consumed, have);
	}

	if (nread < 0) {
		perror(fname);
		p->tree->flags |= TREE_FAIL;
	}

	pnode_closetext(p);
	if ((p->tree->flags & TREE_CLOSED) == 0)
		warn_msg(p, "document not closed");
	return p->tree;
}