File: [cvsweb.bsd.lv] / docbook2mdoc / parse.c (download)
Revision 1.46, Tue Apr 16 21:19:54 2019 UTC (4 years, 5 months ago) by schwarze
Branch: MAIN
CVS Tags: VERSION_1_0_0 Changes since 1.45: +3 -1 lines
treat <sect3> and <sect4> like <section>
|
/* $Id: parse.c,v 1.46 2019/04/16 21:19:54 schwarze Exp $ */
/*
* Copyright (c) 2014 Kristaps Dzonsons <kristaps@bsd.lv>
* Copyright (c) 2019 Ingo Schwarze <schwarze@openbsd.org>
*
* Permission to use, copy, modify, and distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
* copyright notice and this permission notice appear in all copies.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/
#include <assert.h>
#include <ctype.h>
#include <errno.h>
#include <fcntl.h>
#include <libgen.h>
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include "node.h"
#include "parse.h"
/*
* The implementation of the DocBook parser.
*/
/*
* Lexer states of parse_string(), telling what kind of token
* is expected next.
* The order matters: parse_string() tests for ">= PARSE_ARG"
* to recognize the three attribute value states.
*/
enum pstate {
PARSE_ELEM, /* Outside tags: text, an entity, or a tag may follow. */
PARSE_TAG, /* Inside a tag: looking for an attribute name. */
PARSE_ARG, /* Looking for an attribute value, possibly quoted. */
PARSE_SQ, /* Inside a single-quoted attribute value. */
PARSE_DQ /* Inside a double-quoted attribute value. */
};
/*
* Global parse state.
* Keep this as simple and small as possible.
* An object of this type is created by parse_alloc()
* and handed to the other functions in this file.
*/
struct parse {
const char *fname; /* Name of the input file. */
struct ptree *tree; /* Complete parse result. */
struct pnode *doctype; /* First doctype node seen, or NULL. */
struct pnode *cur; /* Current node in the tree. */
enum nodeid ncur; /* Type of the current node. */
int line; /* Line number in the input file. */
int col; /* Column number in the input file. */
int nline; /* Line number of next token. */
int ncol; /* Column number of next token. */
int del; /* Levels of nested nodes being deleted. */
int nofill; /* Levels of open no-fill displays. */
int flags; /* Bitwise OR of the PFLAG_* values below. */
#define PFLAG_WARN (1 << 0) /* Print warning messages. */
#define PFLAG_SPC (1 << 1) /* Whitespace before the next element. */
#define PFLAG_ATTR (1 << 2) /* The most recent attribute is valid. */
#define PFLAG_EEND (1 << 3) /* This element is self-closing. */
};
/*
* Maps an additional DocBook element name to a node type.
* xml_name2node() consults these entries only when pnode_parse()
* does not return a node type below NODE_UNKNOWN for the name.
*/
struct alias {
const char *name; /* DocBook element name. */
enum nodeid node; /* Node type to generate. */
};
/*
* Table of element name aliases.
* It is searched linearly with strcmp(3) by xml_name2node()
* and terminated by an entry with a NULL name.
*/
static const struct alias aliases[] = {
{ "acronym", NODE_IGNORE },
{ "affiliation", NODE_IGNORE },
{ "anchor", NODE_DELETE },
{ "application", NODE_COMMAND },
{ "article", NODE_SECTION },
{ "articleinfo", NODE_BOOKINFO },
{ "book", NODE_SECTION },
{ "chapter", NODE_SECTION },
{ "caption", NODE_IGNORE },
{ "code", NODE_LITERAL },
{ "computeroutput", NODE_LITERAL },
{ "!doctype", NODE_DOCTYPE },
{ "figure", NODE_IGNORE },
{ "firstname", NODE_PERSONNAME },
{ "glossary", NODE_VARIABLELIST },
{ "glossdef", NODE_IGNORE },
{ "glossdiv", NODE_IGNORE },
{ "glossentry", NODE_VARLISTENTRY },
{ "glosslist", NODE_VARIABLELIST },
{ "holder", NODE_IGNORE },
{ "imageobject", NODE_IGNORE },
{ "indexterm", NODE_DELETE },
{ "informaltable", NODE_TABLE },
{ "keycap", NODE_KEYSYM },
{ "keycode", NODE_IGNORE },
{ "mediaobject", NODE_BLOCKQUOTE },
{ "orgname", NODE_IGNORE },
{ "othercredit", NODE_AUTHOR },
{ "othername", NODE_PERSONNAME },
{ "part", NODE_SECTION },
{ "phrase", NODE_IGNORE },
{ "primary", NODE_DELETE },
{ "property", NODE_PARAMETER },
{ "refsect1", NODE_SECTION },
{ "refsect2", NODE_SECTION },
{ "refsect3", NODE_SECTION },
{ "refsection", NODE_SECTION },
{ "releaseinfo", NODE_IGNORE },
{ "returnvalue", NODE_IGNORE },
{ "secondary", NODE_DELETE },
{ "sect1", NODE_SECTION },
{ "sect2", NODE_SECTION },
{ "sect3", NODE_SECTION },
{ "sect4", NODE_SECTION },
{ "sgmltag", NODE_MARKUP },
{ "simpara", NODE_PARA },
{ "structfield", NODE_PARAMETER },
{ "structname", NODE_TYPE },
{ "surname", NODE_PERSONNAME },
{ "symbol", NODE_CONSTANT },
{ "trademark", NODE_IGNORE },
{ "ulink", NODE_LINK },
{ "userinput", NODE_LITERAL },
{ "year", NODE_IGNORE },
{ NULL, NODE_IGNORE }
};
/*
* One predefined character entity and its replacement.
*/
struct entity {
const char *name; /* Entity name, without the '&' and ';'. */
const char *roff; /* Text stored in the NODE_ESCAPE node. */
};
/*
* XML character entity references found in the wild.
* Those that don't have an exact mandoc_char(7) representation
* are approximated, and the desired codepoint is given as a comment.
* Encoding them as \\[u...] would leave -Tascii out in the cold.
* The table is searched linearly by xml_entity() and terminated
* by an entry whose members are both NULL.
*/
static const struct entity entities[] = {
{ "alpha", "\\(*a" },
{ "amp", "&" },
{ "apos", "'" },
{ "auml", "\\(:a" },
{ "beta", "\\(*b" },
{ "circ", "^" }, /* U+02C6 */
{ "copy", "\\(co" },
{ "dagger", "\\(dg" },
{ "Delta", "\\(*D" },
{ "eacute", "\\('e" },
{ "emsp", "\\ " }, /* U+2003 */
{ "gt", ">" },
{ "hairsp", "\\^" },
{ "kappa", "\\(*k" },
{ "larr", "\\(<-" },
{ "ldquo", "\\(lq" },
{ "le", "\\(<=" },
{ "lowbar", "_" },
{ "lsqb", "[" },
{ "lt", "<" },
{ "mdash", "\\(em" },
{ "minus", "\\-" },
{ "ndash", "\\(en" },
{ "nbsp", "\\ " },
{ "num", "#" },
{ "oslash", "\\(/o" },
{ "ouml", "\\(:o" },
{ "percnt", "%" },
{ "quot", "\\(dq" },
{ "rarr", "\\(->" },
{ "rArr", "\\(rA" },
{ "rdquo", "\\(rq" },
{ "reg", "\\(rg" },
{ "rho", "\\(*r" },
{ "rsqb", "]" },
{ "sigma", "\\(*s" },
{ "shy", "\\&" }, /* U+00AD */
{ "tau", "\\(*t" },
{ "tilde", "\\[u02DC]" },
{ "times", "\\[tmu]" },
{ "uuml", "\\(:u" },
{ NULL, NULL }
};
/*
* Forward declarations of the tokenizer and of the read loop,
* both defined further below; xml_entity() needs parse_string()
* before its definition.
*/
static size_t parse_string(struct parse *, char *, size_t,
enum pstate *, int);
static void parse_fd(struct parse *, int);
/*
 * Report an unrecoverable system error at the current input position
 * and terminate the program with exit status 6.
 * The text printed after "FATAL: " describes errno as it was
 * when this function was entered.
 */
static void
fatal(struct parse *p)
{
	int	 saved_errno;

	/*
	 * fprintf(3) is allowed to change errno even when it succeeds,
	 * so remember the value that describes the actual failure
	 * and put it back before perror(3) looks at it.
	 */
	saved_errno = errno;
	fprintf(stderr, "%s:%d:%d: FATAL: ", p->fname, p->line, p->col);
	errno = saved_errno;
	perror(NULL);
	exit(6);
}
/*
 * Print a printf(3)-style error message to standard error output,
 * prefixed with the file name and the current input position,
 * and record in the tree that an error occurred.
 */
static void
error_msg(struct parse *p, const char *fmt, ...)
{
	va_list	 args;

	va_start(args, fmt);
	fprintf(stderr, "%s:%d:%d: ERROR: ", p->fname, p->line, p->col);
	vfprintf(stderr, fmt, args);
	fputs("\n", stderr);
	va_end(args);

	p->tree->flags |= TREE_ERROR;
}
/*
 * Print a printf(3)-style warning message to standard error output,
 * prefixed with the file name and the current input position,
 * and record in the tree that a warning occurred.
 * Nothing at all happens unless warnings were requested (PFLAG_WARN).
 */
static void
warn_msg(struct parse *p, const char *fmt, ...)
{
	va_list	 args;

	if (p->flags & PFLAG_WARN) {
		va_start(args, fmt);
		fprintf(stderr, "%s:%d:%d: WARNING: ",
		    p->fname, p->line, p->col);
		vfprintf(stderr, fmt, args);
		fputs("\n", stderr);
		va_end(args);

		p->tree->flags |= TREE_WARN;
	}
}
/*
* Process a string of characters.
* If a text node is already open, append to it.
* Otherwise, create a new one as a child of the current node.
*
* word points to the start of the text; it need not be
* NUL-terminated because at most sz bytes of it are used.
* sz is the number of bytes in word and must be positive.
*/
static void
xml_text(struct parse *p, const char *word, int sz)
{
struct pnode *n, *np;
size_t oldsz, newsz;
int i;
assert(sz > 0);
/* Text below an element that is being deleted is dropped. */
if (p->del > 0)
return;
if ((n = p->cur) == NULL) {
error_msg(p, "discarding text before document: %.*s",
sz, word);
return;
}
/* Append to the current text node, if one is open. */
if (n->node == NODE_TEXT) {
oldsz = strlen(n->b);
newsz = oldsz + sz;
/*
* If whitespace was skipped since the existing text,
* represent it by one blank, unless the node is still empty.
*/
if (oldsz && (p->flags & PFLAG_SPC))
newsz++;
if ((n->b = realloc(n->b, newsz + 1)) == NULL)
fatal(p);
if (oldsz && (p->flags & PFLAG_SPC))
n->b[oldsz++] = ' ';
memcpy(n->b + oldsz, word, sz);
n->b[newsz] = '\0';
p->flags &= ~PFLAG_SPC;
return;
}
if (p->tree->flags & TREE_CLOSED && n == p->tree->root)
warn_msg(p, "text after end of document: %.*s", sz, word);
/* Create a new text node. */
if ((n = pnode_alloc(p->cur)) == NULL)
fatal(p);
n->node = NODE_TEXT;
n->spc = (p->flags & PFLAG_SPC) != 0;
p->flags &= ~PFLAG_SPC;
/*
* If this node follows an in-line macro without intervening
* whitespace, keep the text in it as short as possible,
* and do not keep it open.
*/
np = n->spc ? NULL : TAILQ_PREV(n, pnodeq, child);
/*
* Classify the preceding sibling, descending into the last
* child of CLASS_TRANS nodes; np ends up non-NULL only
* for a CLASS_LINE or CLASS_ENCL node.
*/
while (np != NULL) {
switch (pnode_class(np->node)) {
case CLASS_VOID:
case CLASS_TEXT:
case CLASS_BLOCK:
case CLASS_NOFILL:
np = NULL;
break;
case CLASS_TRANS:
np = TAILQ_LAST(&np->childq, pnodeq);
continue;
case CLASS_LINE:
case CLASS_ENCL:
break;
}
break;
}
if (np != NULL) {
/* The first node only gets the bytes before any whitespace. */
i = 0;
while (i < sz && !isspace((unsigned char)word[i]))
i++;
if ((n->b = strndup(word, i)) == NULL)
fatal(p);
if (i == sz)
return;
/* Skip the whitespace that follows. */
while (i < sz && isspace((unsigned char)word[i]))
i++;
if (i == sz) {
/* Nothing but whitespace remains: remember it. */
p->flags |= PFLAG_SPC;
return;
}
/* Put any remaining text into a second node. */
if ((n = pnode_alloc(p->cur)) == NULL)
fatal(p);
n->node = NODE_TEXT;
n->spc = 1;
word += i;
sz -= i;
}
if ((n->b = strndup(word, sz)) == NULL)
fatal(p);
/* The new node remains open for later pnode_closetext(). */
p->cur = n;
}
/*
 * If the current node is a text node, close it: make its parent the
 * current node again and cut off trailing whitespace, remembering
 * in PFLAG_SPC that there was some.
 * If check_last_word is non-zero and no whitespace is pending, the
 * last word of a multi-word text is split off into a new text node
 * that follows the original one.
 */
static void
pnode_closetext(struct parse *p, int check_last_word)
{
	struct pnode	*text, *tail;
	char		*end, *word;

	text = p->cur;
	if (text == NULL || text->node != NODE_TEXT)
		return;
	p->cur = text->parent;

	/* Strip trailing whitespace and record that it was there. */
	end = text->b + strlen(text->b);
	while (end > text->b && isspace((unsigned char)end[-1])) {
		p->flags |= PFLAG_SPC;
		*--end = '\0';
	}
	if ((p->flags & PFLAG_SPC) != 0 || check_last_word == 0)
		return;

	/* Walk back to the first byte of the last word. */
	word = end;
	while (word > text->b && !isspace((unsigned char)word[-1]))
		word--;

	/* A single word stays where it is. */
	if (word == text->b)
		return;

	/* End the original text before the whitespace preceding the word. */
	for (end = word;
	    end > text->b && isspace((unsigned char)end[-1]); )
		*--end = '\0';

	/* Move the last word into its own node, for use with .Pf. */
	tail = pnode_alloc(p->cur);
	if (tail == NULL)
		fatal(p);
	tail->node = NODE_TEXT;
	tail->spc = 1;
	tail->b = strdup(word);
	if (tail->b == NULL)
		fatal(p);
}
static void
xml_entity(struct parse *p, const char *name)
{
const struct entity *entity;
struct pnode *n;
const char *ccp;
char *cp;
enum pstate pstate;
if (p->del > 0)
return;
if (p->cur == NULL) {
error_msg(p, "discarding entity before document: &%s;", name);
return;
}
pnode_closetext(p, 0);
if (p->tree->flags & TREE_CLOSED && p->cur == p->tree->root)
warn_msg(p, "entity after end of document: &%s;", name);
for (entity = entities; entity->name != NULL; entity++)
if (strcmp(name, entity->name) == 0)
break;
if (entity->roff == NULL) {
if (p->doctype != NULL) {
TAILQ_FOREACH(n, &p->doctype->childq, child) {
if ((ccp = pnode_getattr_raw(n,
ATTRKEY_NAME, NULL)) == NULL ||
strcmp(ccp, name) != 0)
continue;
if ((ccp = pnode_getattr_raw(n,
ATTRKEY_SYSTEM, NULL)) != NULL) {
parse_file(p, -1, ccp);
p->flags &= ~PFLAG_SPC;
return;
}
if ((ccp = pnode_getattr_raw(n,
ATTRKEY_DEFINITION, NULL)) == NULL)
continue;
if ((cp = strdup(ccp)) == NULL)
fatal(p);
pstate = PARSE_ELEM;
parse_string(p, cp, strlen(cp), &pstate, 0);
p->flags &= ~PFLAG_SPC;
free(cp);
return;
}
}
error_msg(p, "unknown entity &%s;", name);
return;
}
/* Create, append, and close out an entity node. */
if ((n = pnode_alloc(p->cur)) == NULL ||
(n->b = strdup(entity->roff)) == NULL)
fatal(p);
n->node = NODE_ESCAPE;
n->spc = (p->flags & PFLAG_SPC) != 0;
p->flags &= ~PFLAG_SPC;
}
/*
* Parse an element name.
*/
static enum nodeid
xml_name2node(struct parse *p, const char *name)
{
const struct alias *alias;
enum nodeid node;
if ((node = pnode_parse(name)) < NODE_UNKNOWN)
return node;
for (alias = aliases; alias->name != NULL; alias++)
if (strcmp(alias->name, name) == 0)
return alias->node;
return NODE_UNKNOWN;
}
/*
* Begin an element.
* Translate the element name to a node type and, unless the element
* is deleted, ignored, or unknown, allocate a node of that type
* as a child of the current node and make it the current node.
*/
static void
xml_elem_start(struct parse *p, const char *name)
{
struct pnode *n;
/*
* An ancestor is excluded from the tree;
* keep track of the number of levels excluded.
* Names starting with '!' or '?' do not count as a level.
*/
if (p->del > 0) {
if (*name != '!' && *name != '?')
p->del++;
return;
}
/* For all the following cases, no node is created. */
switch (p->ncur = xml_name2node(p, name)) {
case NODE_DELETE_WARN:
warn_msg(p, "skipping element <%s>", name);
/* FALLTHROUGH */
case NODE_DELETE:
/* Start excluding this element and everything inside it. */
p->del = 1;
/* FALLTHROUGH */
case NODE_IGNORE:
return;
case NODE_UNKNOWN:
if (*name != '!' && *name != '?')
error_msg(p, "unknown element <%s>", name);
return;
default:
break;
}
if (p->tree->flags & TREE_CLOSED && p->cur->parent == NULL)
warn_msg(p, "element after end of document: <%s>", name);
/*
* Close an open text node, if any; before CLASS_LINE and
* CLASS_ENCL elements, ask for the last word to be split off.
*/
switch (pnode_class(p->ncur)) {
case CLASS_LINE:
case CLASS_ENCL:
pnode_closetext(p, 1);
break;
default:
pnode_closetext(p, 0);
break;
}
if ((n = pnode_alloc(p->cur)) == NULL)
fatal(p);
/*
* Some elements are self-closing.
* Nodes that begin a new macro or request line or start by
* printing text always want whitespace before themselves.
*/
switch (n->node = p->ncur) {
case NODE_DOCTYPE:
case NODE_ENTITY:
case NODE_SBR:
p->flags |= PFLAG_EEND;
break;
default:
break;
}
/*
* CLASS_LINE and CLASS_ENCL nodes record whether whitespace
* preceded them; all other nodes get n->spc set unconditionally.
*/
switch (pnode_class(p->ncur)) {
case CLASS_LINE:
case CLASS_ENCL:
n->spc = (p->flags & PFLAG_SPC) != 0;
break;
case CLASS_NOFILL:
p->nofill++;
/* FALLTHROUGH */
default:
n->spc = 1;
break;
}
p->cur = n;
/* Remember the first doctype and the first parentless node. */
if (n->node == NODE_DOCTYPE) {
if (p->doctype == NULL)
p->doctype = n;
else
error_msg(p, "duplicate doctype");
} else if (n->parent == NULL && p->tree->root == NULL)
p->tree->root = n;
}
/*
 * Handle an attribute name inside a tag.
 * Unless the name is empty, unknown to attrkey_parse(), or belongs
 * to an element that is deleted or ignored, append a new attribute
 * to the current node and set PFLAG_ATTR such that a subsequent
 * xml_attrval() stores the value.
 * The first word in a doctype or entity declaration is not a key
 * but is stored as the value of an implied NAME attribute instead;
 * for entities, an empty DEFINITION attribute is then appended, too.
 */
static void
xml_attrkey(struct parse *p, const char *name)
{
	struct pattr	*attr;
	const char	*implied;
	enum attrkey	 key;

	if (p->del > 0 || p->ncur == NODE_IGNORE || *name == '\0')
		return;

	/* In declarations, the first word is the name being declared. */
	implied = NULL;
	if ((p->ncur == NODE_DOCTYPE || p->ncur == NODE_ENTITY) &&
	    TAILQ_FIRST(&p->cur->attrq) == NULL) {
		implied = name;
		name = "NAME";
	}

	key = attrkey_parse(name);
	if (key == ATTRKEY__MAX) {
		p->flags &= ~PFLAG_ATTR;
		return;
	}

	attr = calloc(1, sizeof(*attr));
	if (attr == NULL)
		fatal(p);
	attr->key = key;
	attr->val = ATTRVAL__MAX;
	if (implied != NULL) {
		/* The value is already known; do not wait for one. */
		attr->rawval = strdup(implied);
		if (attr->rawval == NULL)
			fatal(p);
		p->flags &= ~PFLAG_ATTR;
	} else {
		attr->rawval = NULL;
		p->flags |= PFLAG_ATTR;
	}
	TAILQ_INSERT_TAIL(&p->cur->attrq, attr, child);

	if (p->ncur == NODE_ENTITY && key == ATTRKEY_NAME)
		xml_attrkey(p, "DEFINITION");
}
/*
 * Handle an attribute value inside a tag.
 * If the most recent attribute name was accepted (PFLAG_ATTR),
 * store the value in the last attribute of the current node:
 * as an enumerated value if attrval_parse() recognizes it,
 * or else as a copy of the string.
 */
static void
xml_attrval(struct parse *p, const char *name)
{
	struct pattr	*last;

	if (p->del > 0 || p->ncur == NODE_IGNORE)
		return;
	if ((p->flags & PFLAG_ATTR) == 0)
		return;

	last = TAILQ_LAST(&p->cur->attrq, pattrq);
	if (last == NULL)
		return;

	last->val = attrval_parse(name);
	if (last->val == ATTRVAL__MAX) {
		last->rawval = strdup(name);
		if (last->rawval == NULL)
			fatal(p);
	}
	p->flags &= ~PFLAG_ATTR;
}
/*
* Roll up the parse tree.
* If we're at a text node, roll that one up first.
* The name is the element name from the closing tag; if it is NULL,
* as passed by parse_string() for self-closing tags found while
* handling attributes, the type of the current node is used instead.
*/
static void
xml_elem_end(struct parse *p, const char *name)
{
struct pnode *n;
const char *cp;
enum nodeid node;
/*
* An ancestor is excluded from the tree;
* keep track of the number of levels excluded.
*/
if (p->del > 1) {
p->del--;
return;
}
if (p->del == 0)
pnode_closetext(p, 0);
node = name == NULL ? p->ncur : xml_name2node(p, name);
switch (node) {
case NODE_DELETE_WARN:
case NODE_DELETE:
/* This ends the outermost deleted element. */
if (p->del > 0)
p->del--;
break;
case NODE_IGNORE:
case NODE_UNKNOWN:
/* No node was created for these, so there is nothing to close. */
break;
case NODE_INCLUDE:
/*
* Replace the include node by the content of the file
* named in its href attribute, parsed below the parent.
* NOTE(review): p->cur is used without checking that it is
* non-NULL and actually is the include node - confirm that
* a stray closing tag cannot get here.
*/
n = p->cur;
p->cur = p->cur->parent;
cp = pnode_getattr_raw(n, ATTRKEY_HREF, NULL);
if (cp == NULL)
error_msg(p, "<xi:include> element "
"without href attribute");
else
parse_file(p, -1, cp);
pnode_unlink(n);
p->flags &= ~PFLAG_SPC;
break;
case NODE_DOCTYPE:
case NODE_SBR:
p->flags &= ~PFLAG_EEND;
/* FALLTHROUGH */
default:
/* Only the innermost open element can be closed. */
if (p->cur == NULL || node != p->cur->node) {
warn_msg(p, "element not open: </%s>", name);
break;
}
if (pnode_class(node) == CLASS_NOFILL)
p->nofill--;
/*
* Refrain from actually closing the document element.
* If no more content follows, no harm is done, but if
* some content still follows, simply processing it is
* obviously better than discarding it or crashing.
*/
if (p->cur->parent != NULL || node == NODE_DOCTYPE) {
p->cur = p->cur->parent;
if (p->cur != NULL)
p->ncur = p->cur->node;
} else
p->tree->flags |= TREE_CLOSED;
p->flags &= ~PFLAG_SPC;
break;
}
assert(p->del == 0);
}
/*
 * Allocate a zero-initialized parser object
 * together with an empty parse tree.
 * If warn is non-zero, warning messages are enabled.
 * Returns NULL if memory allocation fails.
 * The result is released with parse_free().
 */
struct parse *
parse_alloc(int warn)
{
	struct parse	*p;

	p = calloc(1, sizeof(*p));
	if (p == NULL)
		return NULL;

	p->tree = calloc(1, sizeof(*p->tree));
	if (p->tree == NULL) {
		free(p);
		return NULL;
	}

	p->flags = warn ? (p->flags | PFLAG_WARN) : (p->flags & ~PFLAG_WARN);
	return p;
}
void
parse_free(struct parse *p)
{
if (p == NULL)
return;
if (p->tree != NULL) {
pnode_unlink(p->tree->root);
free(p->tree);
}
free(p);
}
/*
 * Step the offset *pend over one byte of the buffer b.
 * If refill is non-zero, also update the position of the next token:
 * a newline starts a new line at column 1,
 * any other byte advances the column by one.
 */
static void
increment(struct parse *p, char *b, size_t *pend, int refill)
{
	size_t	 pos;

	pos = (*pend)++;
	if (!refill)
		return;

	if (b[pos] != '\n') {
		p->ncol++;
		return;
	}
	p->nline++;
	p->ncol = 1;
}
/*
 * Advance the pend pointer to the next character in the charset.
 * If the charset starts with a space, it stands for any whitespace.
 * Update the new input file position, used for messages.
 * Do not overrun the buffer b of length rlen.
 * When reaching the end, NUL-terminate the buffer and return refill,
 * i.e. non-zero if the caller should ask for more input;
 * otherwise, return 0.
 *
 * A NUL byte in the input is never treated as a member of the
 * charset but skipped like any other byte.  Stopping at it would
 * leave the text branch of parse_string() with an empty token
 * and hence without any progress.
 */
static int
advance(struct parse *p, char *b, size_t rlen, size_t *pend,
    const char *charset, int refill)
{
	int	 space;

	if (*charset == ' ') {
		space = 1;
		charset++;
	} else
		space = 0;

	if (refill) {
		p->nline = p->line;
		p->ncol = p->col;
	}
	while (*pend < rlen) {
		if (space && isspace((unsigned char)b[*pend]))
			break;
		/*
		 * strchr(3) regards the terminating NUL as part of
		 * the string, so NUL has to be excluded explicitly.
		 */
		if (b[*pend] != '\0' && strchr(charset, b[*pend]) != NULL)
			break;
		increment(p, b, pend, refill);
	}
	if (*pend == rlen) {
		b[rlen] = '\0';
		return refill;
	}
	return 0;
}
/*
* The tokenizer.
* Split the buffer b of length rlen into tags, attribute names,
* attribute values, entities, and text, and pass each token to the
* matching xml_*() handler.
* Tokens are NUL-terminated in place, so the buffer is modified.
* The lexer state is kept in *pstate across calls.
* If refill is non-zero, the input position in p is kept up to date,
* and an incomplete token at the end of the buffer ends the parse
* such that the caller can provide more data; if refill is zero,
* whatever is in the buffer is used.
* Returns the offset in b of the first byte not yet consumed.
* This definition has internal linkage because the function
* was declared static earlier in the file.
*/
size_t
parse_string(struct parse *p, char *b, size_t rlen,
enum pstate *pstate, int refill)
{
char *cp;
size_t pws; /* Parse offset including whitespace. */
size_t poff; /* Parse offset in b[]. */
size_t pend; /* Offset of the end of the current word. */
int elem_end;
pend = pws = 0;
for (;;) {
/* Proceed to the next token, skipping whitespace. */
if (refill) {
p->line = p->nline;
p->col = p->ncol;
}
if ((poff = pend) == rlen)
break;
if (isspace((unsigned char)b[pend])) {
p->flags |= PFLAG_SPC;
/* After a newline, pws restarts at the beginning of the line. */
if (b[pend] == '\n')
pws = pend + 1;
increment(p, b, &pend, refill);
continue;
}
/*
* The following four cases (ARG, TAG, and starting an
* entity or a tag) all parse a word or quoted string.
* If that extends beyond the read buffer and the last
* read(2) still got data, they all break out of the
* token loop to request more data from the read loop.
*
* Also, three of them detect self-closing tags, those
* ending with "/>", setting the flag elem_end and
* calling xml_elem_end() at the very end, after
* handling the attribute value, attribute name, or
* tag name, respectively.
*/
/* Parse an attribute value. */
if (*pstate >= PARSE_ARG) {
/* An opening quote selects the quoted state and is skipped. */
if (*pstate == PARSE_ARG &&
(b[pend] == '\'' || b[pend] == '"')) {
*pstate = b[pend] == '"' ?
PARSE_DQ : PARSE_SQ;
increment(p, b, &pend, refill);
continue;
}
/* Quoted values end at the quote, others at whitespace or '>'. */
if (advance(p, b, rlen, &pend,
*pstate == PARSE_DQ ? "\"" :
*pstate == PARSE_SQ ? "'" : " >", refill))
break;
*pstate = PARSE_TAG;
elem_end = 0;
if (b[pend] == '>') {
*pstate = PARSE_ELEM;
if (pend > 0 && b[pend - 1] == '/') {
b[pend - 1] = '\0';
elem_end = 1;
}
if (p->flags & PFLAG_EEND)
elem_end = 1;
}
b[pend] = '\0';
if (pend < rlen)
increment(p, b, &pend, refill);
xml_attrval(p, b + poff);
if (elem_end)
xml_elem_end(p, NULL);
/* Look for an attribute name. */
} else if (*pstate == PARSE_TAG) {
switch (p->ncur) {
case NODE_DOCTYPE:
/* An internal subset starts: parse its declarations as elements. */
if (b[pend] == '[') {
*pstate = PARSE_ELEM;
increment(p, b, &pend, refill);
continue;
}
/* FALLTHROUGH */
case NODE_ENTITY:
/* A quoted string without a preceding key is parsed as a value. */
if (b[pend] == '"' || b[pend] == '\'') {
*pstate = PARSE_ARG;
continue;
}
break;
default:
break;
}
if (advance(p, b, rlen, &pend, " =>", refill))
break;
elem_end = 0;
switch (b[pend]) {
case '>':
*pstate = PARSE_ELEM;
if (pend > 0 && b[pend - 1] == '/') {
b[pend - 1] = '\0';
elem_end = 1;
}
if (p->flags & PFLAG_EEND)
elem_end = 1;
break;
case '=':
*pstate = PARSE_ARG;
break;
default:
break;
}
b[pend] = '\0';
if (pend < rlen)
increment(p, b, &pend, refill);
xml_attrkey(p, b + poff);
if (elem_end)
xml_elem_end(p, NULL);
/* Begin an opening or closing tag. */
} else if (b[poff] == '<') {
if (advance(p, b, rlen, &pend, " >", refill))
break;
if (pend > poff + 3 &&
strncmp(b + poff, "<!--", 4) == 0) {
/* Skip a comment. */
cp = strstr(b + pend - 2, "-->");
if (cp == NULL) {
/* The end is not in the buffer yet: ask for more data. */
if (refill)
break;
/* No more data will come: the rest is comment. */
cp = b + rlen;
} else
cp += 3;
while (b + pend < cp)
increment(p, b, &pend, refill);
continue;
}
elem_end = 0;
if (b[pend] != '>')
*pstate = PARSE_TAG;
else if (pend > 0 && b[pend - 1] == '/') {
b[pend - 1] = '\0';
elem_end = 1;
}
b[pend] = '\0';
if (pend < rlen)
increment(p, b, &pend, refill);
/* Skip the '<'; a '/' after it marks a closing tag. */
if (b[++poff] == '/') {
elem_end = 1;
poff++;
} else {
xml_elem_start(p, b + poff);
if (*pstate == PARSE_ELEM &&
p->flags & PFLAG_EEND)
elem_end = 1;
}
if (elem_end)
xml_elem_end(p, b + poff);
/* Close a doctype. */
} else if (p->ncur == NODE_DOCTYPE && b[poff] == ']') {
*pstate = PARSE_TAG;
increment(p, b, &pend, refill);
/* Process an entity. */
} else if (b[poff] == '&') {
if (advance(p, b, rlen, &pend, ";", refill))
break;
b[pend] = '\0';
if (pend < rlen)
increment(p, b, &pend, refill);
xml_entity(p, b + poff + 1);
/* Process text up to the next tag, entity, or EOL. */
} else {
advance(p, b, rlen, &pend,
p->ncur == NODE_DOCTYPE ? "<&]\n" : "<&\n",
refill);
/* In no-fill mode, keep the whitespace preceding the text. */
if (p->nofill)
poff = pws;
xml_text(p, b + poff, pend - poff);
/* A text node does not extend beyond the end of the line. */
if (b[pend] == '\n')
pnode_closetext(p, 0);
}
pws = pend;
}
return poff;
}
/*
 * The read loop.
 * Read from the file descriptor fd into a fixed-size buffer and feed
 * the buffer to parse_string(), moving unconsumed bytes to the front
 * before reading more.
 * If the previous token was incomplete and asked for more input,
 * we have to enter the read loop once more even on EOF.
 * Once rsz is 0, incomplete tokens will no longer ask for more input
 * but instead use whatever there is, and then exit the read loop.
 * The minus one on the size limit for read(2) is needed such that
 * b[rlen] can always be set to NUL.
 * A failing read(2) is reported with error_msg().
 */
static void
parse_fd(struct parse *p, int fd)
{
	char		 b[4096];
	ssize_t		 rsz;	/* Return value from read(2). */
	size_t		 rlen;	/* Number of bytes in b[]. */
	size_t		 poff;	/* Parse offset in b[]. */
	enum pstate	 pstate;

	rlen = 0;
	pstate = PARSE_ELEM;
	while ((rsz = read(fd, b + rlen, sizeof(b) - rlen - 1)) >= 0 &&
	    (rlen += rsz) > 0) {
		/*
		 * parse_string() uses strstr(3) to find the end of
		 * comments, so the buffer has to be a valid string;
		 * otherwise, the search could run into stale or
		 * uninitialized bytes beyond the data just read.
		 */
		b[rlen] = '\0';
		poff = parse_string(p, b, rlen, &pstate, rsz > 0);

		/* Buffer exhausted; shift left and re-fill. */
		assert(poff > 0);
		rlen -= poff;
		memmove(b, b + poff, rlen);
	}
	if (rsz < 0)
		error_msg(p, "read: %s", strerror(errno));
}
/*
 * Open and parse a file.
 * If fd is -1, the file fname is opened for reading; otherwise,
 * fd is read from and fname is only used for messages.
 * A call made while no other file is being parsed is the top-level
 * call; it changes into the directory of the file and finalizes
 * the tree after parsing.  Nested calls, made for included files
 * and external entities, merely add to the tree.
 * Always returns the parse tree; problems are reported with
 * error_msg() and recorded in the tree flags.
 */
struct ptree *
parse_file(struct parse *p, int fd, const char *fname)
{
	const char	*save_fname;
	char		*pathcopy, *dir;
	int		 save_line, save_col;

	/* Save and initialize reporting data. */

	save_fname = p->fname;
	save_line = p->nline;
	save_col = p->ncol;
	p->fname = fname;
	p->line = 0;
	p->col = 0;

	/* Open the file, unless it is already open. */

	if (fd == -1 && (fd = open(fname, O_RDONLY, 0)) == -1) {
		error_msg(p, "open: %s", strerror(errno));
		p->fname = save_fname;
		return p->tree;
	}

	/*
	 * After opening the starting file, change to the directory it
	 * is located in, in case it wants to include any further files,
	 * which are typically given with relative paths in DocBook.
	 * Do this on a best-effort basis; don't complain about failure.
	 * Since dirname(3) is allowed to modify its argument, and
	 * p->fname still points to the caller's string, let it work
	 * on a private copy.
	 */

	if (save_fname == NULL && (pathcopy = strdup(fname)) != NULL) {
		dir = dirname(pathcopy);
		if (dir != NULL && strcmp(dir, ".") != 0)
			(void)chdir(dir);
		free(pathcopy);
	}

	/* Run the read loop. */

	p->nline = 1;
	p->ncol = 1;
	parse_fd(p, fd);

	/* On the top level, finalize the parse tree. */

	if (save_fname == NULL) {
		pnode_closetext(p, 0);
		if (p->tree->root == NULL)
			error_msg(p, "empty document");
		else if ((p->tree->flags & TREE_CLOSED) == 0)
			warn_msg(p, "document not closed");
		pnode_unlink(p->doctype);
	}

	/* Clean up. */

	if (fd != STDIN_FILENO)
		close(fd);
	p->fname = save_fname;
	p->nline = save_line;
	p->ncol = save_col;
	return p->tree;
}