version 1.12, 2019/04/03 16:08:57 |
version 1.18, 2019/04/07 17:00:56 |
|
|
* The implementation of the DocBook parser. |
* The implementation of the DocBook parser. |
*/ |
*/ |
|
|
|
/*
 * Tokenizer state carried across calls to parse_string().
 * The declaration order matters: the parser tests
 * "state >= PARSE_ARG" to recognize all attribute-value states,
 * so PARSE_ARG, PARSE_SQ, and PARSE_DQ must stay last.
 */
enum pstate {
	PARSE_ELEM = 0,	/* Not inside a tag; initial state. */
	PARSE_TAG,	/* Inside a tag, looking for an attribute name. */
	PARSE_ARG,	/* After '=', at the start of an attribute value. */
	PARSE_SQ,	/* Inside a single-quoted attribute value. */
	PARSE_DQ	/* Inside a double-quoted attribute value. */
};
|
|
/* |
/* |
* Global parse state. |
* Global parse state. |
* Keep this as simple and small as possible. |
* Keep this as simple and small as possible. |
|
|
int nline; /* Line number of next token. */ |
int nline; /* Line number of next token. */ |
int ncol; /* Column number of next token. */ |
int ncol; /* Column number of next token. */ |
int del; /* Levels of nested nodes being deleted. */ |
int del; /* Levels of nested nodes being deleted. */ |
|
int spc; /* Whitespace before the next element. */ |
int attr; /* The most recent attribute is valid. */ |
int attr; /* The most recent attribute is valid. */ |
int warn; |
int warn; |
}; |
}; |
Line 69 static const struct element elements[] = { |
|
Line 78 static const struct element elements[] = { |
|
{ "citerefentry", NODE_CITEREFENTRY }, |
{ "citerefentry", NODE_CITEREFENTRY }, |
{ "citetitle", NODE_CITETITLE }, |
{ "citetitle", NODE_CITETITLE }, |
{ "cmdsynopsis", NODE_CMDSYNOPSIS }, |
{ "cmdsynopsis", NODE_CMDSYNOPSIS }, |
{ "code", NODE_CODE }, |
{ "code", NODE_LITERAL }, |
{ "colspec", NODE_COLSPEC }, |
{ "colspec", NODE_COLSPEC }, |
{ "command", NODE_COMMAND }, |
{ "command", NODE_COMMAND }, |
{ "constant", NODE_CONSTANT }, |
{ "constant", NODE_CONSTANT }, |
Line 81 static const struct element elements[] = { |
|
Line 90 static const struct element elements[] = { |
|
{ "emphasis", NODE_EMPHASIS }, |
{ "emphasis", NODE_EMPHASIS }, |
{ "entry", NODE_ENTRY }, |
{ "entry", NODE_ENTRY }, |
{ "envar", NODE_ENVAR }, |
{ "envar", NODE_ENVAR }, |
|
{ "errorname", NODE_ERRORNAME }, |
{ "fieldsynopsis", NODE_FIELDSYNOPSIS }, |
{ "fieldsynopsis", NODE_FIELDSYNOPSIS }, |
{ "filename", NODE_FILENAME }, |
{ "filename", NODE_FILENAME }, |
{ "firstname", NODE_PERSONNAME }, |
{ "firstname", NODE_PERSONNAME }, |
Line 161 static const struct element elements[] = { |
|
Line 171 static const struct element elements[] = { |
|
{ "sect2", NODE_SECTION }, |
{ "sect2", NODE_SECTION }, |
{ "section", NODE_SECTION }, |
{ "section", NODE_SECTION }, |
{ "sgmltag", NODE_SGMLTAG }, |
{ "sgmltag", NODE_SGMLTAG }, |
|
{ "simpara", NODE_PARA }, |
{ "simplelist", NODE_SIMPLELIST }, |
{ "simplelist", NODE_SIMPLELIST }, |
{ "spanspec", NODE_SPANSPEC }, |
{ "spanspec", NODE_SPANSPEC }, |
{ "structname", NODE_STRUCTNAME }, |
{ "structfield", NODE_PARAMETER }, |
|
{ "structname", NODE_TYPE }, |
{ "subtitle", NODE_SUBTITLE }, |
{ "subtitle", NODE_SUBTITLE }, |
{ "surname", NODE_PERSONNAME }, |
{ "surname", NODE_PERSONNAME }, |
{ "symbol", NODE_CONSTANT }, |
{ "symbol", NODE_CONSTANT }, |
Line 178 static const struct element elements[] = { |
|
Line 190 static const struct element elements[] = { |
|
{ "title", NODE_TITLE }, |
{ "title", NODE_TITLE }, |
{ "trademark", NODE_IGNORE }, |
{ "trademark", NODE_IGNORE }, |
{ "type", NODE_TYPE }, |
{ "type", NODE_TYPE }, |
{ "ulink", NODE_ULINK }, |
{ "ulink", NODE_LINK }, |
{ "userinput", NODE_USERINPUT }, |
{ "userinput", NODE_LITERAL }, |
{ "variablelist", NODE_VARIABLELIST }, |
{ "variablelist", NODE_VARIABLELIST }, |
{ "varlistentry", NODE_VARLISTENTRY }, |
{ "varlistentry", NODE_VARLISTENTRY }, |
{ "varname", NODE_VARNAME }, |
{ "varname", NODE_VARNAME }, |
|
|
xml_char(struct parse *ps, const char *p, int sz) |
xml_char(struct parse *ps, const char *p, int sz) |
{ |
{ |
struct pnode *dat; |
struct pnode *dat; |
|
size_t newsz; |
|
|
if (ps->del > 0) |
if (ps->del > 0) |
return; |
return; |
Line 298 xml_char(struct parse *ps, const char *p, int sz) |
|
Line 311 xml_char(struct parse *ps, const char *p, int sz) |
|
exit(1); |
exit(1); |
} |
} |
dat->node = NODE_TEXT; |
dat->node = NODE_TEXT; |
|
dat->spc = ps->spc; |
dat->parent = ps->cur; |
dat->parent = ps->cur; |
TAILQ_INIT(&dat->childq); |
TAILQ_INIT(&dat->childq); |
TAILQ_INIT(&dat->attrq); |
TAILQ_INIT(&dat->attrq); |
Line 312 xml_char(struct parse *ps, const char *p, int sz) |
|
Line 326 xml_char(struct parse *ps, const char *p, int sz) |
|
/* Append to the current text node. */ |
/* Append to the current text node. */ |
|
|
assert(sz >= 0); |
assert(sz >= 0); |
ps->cur->b = realloc(ps->cur->b, ps->cur->bsz + sz + 1); |
newsz = ps->cur->bsz + (ps->cur->bsz && ps->spc) + sz; |
|
ps->cur->b = realloc(ps->cur->b, newsz + 1); |
if (ps->cur->b == NULL) { |
if (ps->cur->b == NULL) { |
perror(NULL); |
perror(NULL); |
exit(1); |
exit(1); |
} |
} |
|
if (ps->cur->bsz && ps->spc) |
|
ps->cur->b[ps->cur->bsz++] = ' '; |
memcpy(ps->cur->b + ps->cur->bsz, p, sz); |
memcpy(ps->cur->b + ps->cur->bsz, p, sz); |
ps->cur->bsz += sz; |
ps->cur->b[ps->cur->bsz = newsz] = '\0'; |
ps->cur->b[ps->cur->bsz] = '\0'; |
|
ps->cur->real = ps->cur->b; |
ps->cur->real = ps->cur->b; |
|
ps->spc = 0; |
} |
} |
|
|
|
/* |
|
* Close out the text node and strip trailing whitespace, if one is open. |
|
*/ |
static void |
static void |
pnode_trim(struct pnode *pn) |
pnode_closetext(struct parse *p) |
{ |
{ |
assert(pn->node == NODE_TEXT); |
struct pnode *n; |
for (; pn->bsz > 0; pn->b[--pn->bsz] = '\0') |
|
if (isspace((unsigned char)pn->b[pn->bsz - 1]) == 0) |
if ((n = p->cur) == NULL || n->node != NODE_TEXT) |
break; |
return; |
|
p->cur = n->parent; |
|
while (n->bsz > 0 && isspace((unsigned char)n->b[n->bsz - 1])) { |
|
n->b[--n->bsz] = '\0'; |
|
p->spc = 1; |
|
} |
} |
} |
|
|
static void |
static void |
Line 346 xml_entity(struct parse *p, const char *name) |
|
Line 371 xml_entity(struct parse *p, const char *name) |
|
return; |
return; |
} |
} |
|
|
/* Close out the text node, if there is one. */ |
pnode_closetext(p); |
if (p->cur->node == NODE_TEXT) { |
|
pnode_trim(p->cur); |
|
p->cur = p->cur->parent; |
|
} |
|
|
|
if (p->tree->flags & TREE_CLOSED && p->cur == p->tree->root) |
if (p->tree->flags & TREE_CLOSED && p->cur == p->tree->root) |
warn_msg(p, "entity after end of document: &%s;", name); |
warn_msg(p, "entity after end of document: &%s;", name); |
Line 372 xml_entity(struct parse *p, const char *name) |
|
Line 393 xml_entity(struct parse *p, const char *name) |
|
} |
} |
dat->node = NODE_ESCAPE; |
dat->node = NODE_ESCAPE; |
dat->bsz = strlen(dat->b); |
dat->bsz = strlen(dat->b); |
|
dat->spc = p->spc; |
dat->parent = p->cur; |
dat->parent = p->cur; |
TAILQ_INIT(&dat->childq); |
TAILQ_INIT(&dat->childq); |
TAILQ_INIT(&dat->attrq); |
TAILQ_INIT(&dat->attrq); |
TAILQ_INSERT_TAIL(&p->cur->childq, dat, child); |
TAILQ_INSERT_TAIL(&p->cur->childq, dat, child); |
|
p->spc = 0; |
} |
} |
|
|
/* |
/* |
Line 399 xml_elem_start(struct parse *ps, const char *name) |
|
Line 422 xml_elem_start(struct parse *ps, const char *name) |
|
return; |
return; |
} |
} |
|
|
/* Close out the text node, if there is one. */ |
pnode_closetext(ps); |
if (ps->cur != NULL && ps->cur->node == NODE_TEXT) { |
|
pnode_trim(ps->cur); |
|
ps->cur = ps->cur->parent; |
|
} |
|
|
|
for (elem = elements; elem->name != NULL; elem++) |
for (elem = elements; elem->name != NULL; elem++) |
if (strcmp(elem->name, name) == 0) |
if (strcmp(elem->name, name) == 0) |
Line 437 xml_elem_start(struct parse *ps, const char *name) |
|
Line 456 xml_elem_start(struct parse *ps, const char *name) |
|
perror(NULL); |
perror(NULL); |
exit(1); |
exit(1); |
} |
} |
dat->node = elem->node; |
|
|
/* |
|
* Nodes that begin a new macro or request line or start by |
|
* printing text always want whitespace before themselves. |
|
*/ |
|
|
|
switch (dat->node = elem->node) { |
|
case NODE_AUTHORGROUP: |
|
case NODE_BOOKINFO: |
|
case NODE_CAUTION: |
|
case NODE_EDITOR: |
|
case NODE_ENTRY: |
|
case NODE_FUNCDEF: |
|
case NODE_FUNCPROTOTYPE: |
|
case NODE_INFORMALEQUATION: |
|
case NODE_INLINEEQUATION: |
|
case NODE_ITEMIZEDLIST: |
|
case NODE_LEGALNOTICE: |
|
case NODE_LITERALLAYOUT: |
|
case NODE_NOTE: |
|
case NODE_ORDEREDLIST: |
|
case NODE_PARA: |
|
case NODE_PREFACE: |
|
case NODE_PROGRAMLISTING: |
|
case NODE_REFMETA: |
|
case NODE_REFNAMEDIV: |
|
case NODE_REFSYNOPSISDIV: |
|
case NODE_ROW: |
|
case NODE_SBR: |
|
case NODE_SCREEN: |
|
case NODE_SECTION: |
|
case NODE_SYNOPSIS: |
|
case NODE_TGROUP: |
|
case NODE_TIP: |
|
case NODE_TITLE: |
|
case NODE_VARIABLELIST: |
|
case NODE_VARLISTENTRY: |
|
case NODE_WARNING: |
|
dat->spc = 1; |
|
break; |
|
default: |
|
dat->spc = ps->spc; |
|
break; |
|
} |
dat->parent = ps->cur; |
dat->parent = ps->cur; |
TAILQ_INIT(&dat->childq); |
TAILQ_INIT(&dat->childq); |
TAILQ_INIT(&dat->attrq); |
TAILQ_INIT(&dat->attrq); |
Line 508 xml_elem_end(struct parse *ps, const char *name) |
|
Line 570 xml_elem_end(struct parse *ps, const char *name) |
|
return; |
return; |
} |
} |
|
|
/* Close out the text node, if there is one. */ |
if (ps->del == 0) |
if (ps->del == 0 && ps->cur != NULL && ps->cur->node == NODE_TEXT) { |
pnode_closetext(ps); |
pnode_trim(ps->cur); |
|
ps->cur = ps->cur->parent; |
|
} |
|
|
|
if (name != NULL) { |
if (name != NULL) { |
for (elem = elements; elem->name != NULL; elem++) |
for (elem = elements; elem->name != NULL; elem++) |
Line 547 xml_elem_end(struct parse *ps, const char *name) |
|
Line 606 xml_elem_end(struct parse *ps, const char *name) |
|
ps->tree->flags |= TREE_CLOSED; |
ps->tree->flags |= TREE_CLOSED; |
else |
else |
ps->cur = ps->cur->parent; |
ps->cur = ps->cur->parent; |
|
ps->spc = 0; |
break; |
break; |
} |
} |
assert(ps->del == 0); |
assert(ps->del == 0); |
Line 580 parse_free(struct parse *p) |
|
Line 640 parse_free(struct parse *p) |
|
free(p); |
free(p); |
} |
} |
|
|
|
static void |
|
increment(struct parse *p, char *b, size_t *pend, int refill) |
|
{ |
|
if (refill) { |
|
if (b[*pend] == '\n') { |
|
p->nline++; |
|
p->ncol = 1; |
|
} else |
|
p->ncol++; |
|
} |
|
++*pend; |
|
} |
|
|
/* |
/* |
* Advance the pend pointer to the next character in the charset. |
* Advance the pend pointer to the next character in the charset. |
* If the charset starts with a space, it stands for any whitespace. |
* If the charset starts with a space, it stands for any whitespace. |
Line 590 parse_free(struct parse *p) |
|
Line 663 parse_free(struct parse *p) |
|
*/ |
*/ |
static int |
static int |
advance(struct parse *p, char *b, size_t rlen, size_t *pend, |
advance(struct parse *p, char *b, size_t rlen, size_t *pend, |
const char *charset) |
const char *charset, int refill) |
{ |
{ |
int space; |
int space; |
|
|
Line 600 advance(struct parse *p, char *b, size_t rlen, size_t |
|
Line 673 advance(struct parse *p, char *b, size_t rlen, size_t |
|
} else |
} else |
space = 0; |
space = 0; |
|
|
p->nline = p->line; |
if (refill) { |
p->ncol = p->col; |
p->nline = p->line; |
|
p->ncol = p->col; |
|
} |
while (*pend < rlen) { |
while (*pend < rlen) { |
if (b[*pend] == '\n') { |
|
p->nline++; |
|
p->ncol = 1; |
|
} else |
|
p->ncol++; |
|
if (space && isspace((unsigned char)b[*pend])) |
if (space && isspace((unsigned char)b[*pend])) |
break; |
break; |
if (strchr(charset, b[*pend]) != NULL) |
if (strchr(charset, b[*pend]) != NULL) |
break; |
break; |
++*pend; |
increment(p, b, pend, refill); |
} |
} |
if (*pend == rlen) { |
if (*pend == rlen) { |
b[rlen] = '\0'; |
b[rlen] = '\0'; |
return 1; |
return refill; |
} else |
} else |
return 0; |
return 0; |
} |
} |
|
|
struct ptree * |
size_t |
parse_file(struct parse *p, int fd, const char *fname) |
parse_string(struct parse *p, char *b, size_t rlen, |
|
enum pstate *pstate, int refill) |
{ |
{ |
char b[4096]; |
|
char *cp; |
char *cp; |
ssize_t rsz; /* Return value from read(2). */ |
|
size_t rlen; /* Number of bytes in b[]. */ |
|
size_t poff; /* Parse offset in b[]. */ |
size_t poff; /* Parse offset in b[]. */ |
size_t pend; /* Offset of the end of the current word. */ |
size_t pend; /* Offset of the end of the current word. */ |
int in_tag, in_arg, in_quotes, elem_end; |
int elem_end; |
|
|
p->fname = fname; |
p->spc = 0; |
p->nline = 1; |
pend = 0; |
p->ncol = 1; |
for (;;) { |
rlen = 0; |
|
in_tag = in_arg = in_quotes = 0; |
|
|
|
/* |
/* Proceed to the next token, skipping whitespace. */ |
* Read loop. |
|
* |
|
* We have to enter the read loop once more even on EOF |
|
* because the previous token may have been incomplete, |
|
* such that it asked for more input. |
|
* Once rsz is 0, incomplete tokens will no longer ask |
|
* for more input but instead use whatever there is, |
|
* and then exit the read loop. |
|
* The minus one on the size limit for read(2) is needed |
|
* such that advance() can set b[rlen] to NUL when needed. |
|
*/ |
|
|
|
while ((rsz = read(fd, b + rlen, sizeof(b) - rlen - 1)) >= 0) { |
if (refill) { |
if ((rlen += rsz) == 0) |
p->line = p->nline; |
|
p->col = p->ncol; |
|
} |
|
if ((poff = pend) == rlen) |
break; |
break; |
|
if (isspace((unsigned char)b[pend])) { |
|
p->spc = 1; |
|
increment(p, b, &pend, refill); |
|
continue; |
|
} |
|
|
/* Token loop. */ |
/* |
|
* The following four cases (ARG, TAG, and starting an |
|
* entity or a tag) all parse a word or quoted string. |
|
* If that extends beyond the read buffer and the last |
|
* read(2) still got data, they all break out of the |
|
* token loop to request more data from the read loop. |
|
* |
|
* Also, three of them detect self-closing tags, those |
|
* ending with "/>", setting the flag elem_end and |
|
* calling xml_elem_end() at the very end, after |
|
* handling the attribute value, attribute name, or |
|
* tag name, respectively. |
|
*/ |
|
|
pend = 0; |
/* Parse an attribute value. */ |
for (;;) { |
|
|
|
/* Proceed to the next token, skipping whitespace. */ |
if (*pstate >= PARSE_ARG) { |
|
if (*pstate == PARSE_ARG && |
p->line = p->nline; |
(b[pend] == '\'' || b[pend] == '"')) { |
p->col = p->ncol; |
*pstate = b[pend] == '"' ? |
if ((poff = pend) == rlen) |
PARSE_DQ : PARSE_SQ; |
break; |
increment(p, b, &pend, refill); |
if (isspace((unsigned char)b[pend])) { |
|
if (b[pend++] == '\n') { |
|
p->nline++; |
|
p->ncol = 1; |
|
} else |
|
p->ncol++; |
|
continue; |
continue; |
} |
} |
|
if (advance(p, b, rlen, &pend, |
/* |
*pstate == PARSE_DQ ? "\"" : |
* The following four cases (in_arg, in_tag, and |
*pstate == PARSE_SQ ? "'" : " >", refill)) |
* starting an entity or a tag) all parse a word |
break; |
* or quoted string. If that extends beyond the |
*pstate = PARSE_TAG; |
* read buffer and the last read(2) still got |
elem_end = 0; |
* data, they all break out of the token loop |
if (b[pend] == '>') { |
* to request more data from the read loop. |
*pstate = PARSE_ELEM; |
* |
if (pend > 0 && b[pend - 1] == '/') { |
* Also, three of them detect self-closing tags, |
b[pend - 1] = '\0'; |
* those ending with "/>", setting the flag |
elem_end = 1; |
* elem_end and calling xml_elem_end() at the |
|
* very end, after handling the attribute value, |
|
* attribute name, or tag name, respectively. |
|
*/ |
|
|
|
/* Parse an attribute value. */ |
|
|
|
if (in_arg) { |
|
if (in_quotes == 0 && |
|
(b[pend] == '\'' || b[pend] == '"')) { |
|
in_quotes = b[pend] == '"' ? 2 : 1; |
|
p->ncol++; |
|
pend++; |
|
continue; |
|
} |
} |
if (advance(p, b, rlen, &pend, |
} |
in_quotes == 2 ? "\"" : |
b[pend] = '\0'; |
in_quotes == 1 ? "'" : " >") && rsz > 0) |
if (pend < rlen) |
break; |
increment(p, b, &pend, refill); |
in_arg = in_quotes = elem_end = 0; |
xml_attrval(p, b + poff); |
if (b[pend] == '>') { |
if (elem_end) |
in_tag = 0; |
xml_elem_end(p, NULL); |
if (pend > 0 && b[pend - 1] == '/') { |
|
b[pend - 1] = '\0'; |
|
elem_end = 1; |
|
} |
|
} |
|
b[pend] = '\0'; |
|
if (pend < rlen) |
|
pend++; |
|
xml_attrval(p, b + poff); |
|
if (elem_end) |
|
xml_elem_end(p, NULL); |
|
|
|
/* Look for an attribute name. */ |
/* Look for an attribute name. */ |
|
|
} else if (in_tag) { |
} else if (*pstate == PARSE_TAG) { |
if (advance(p, b, rlen, &pend, " =>") && |
if (advance(p, b, rlen, &pend, " =>", refill)) |
rsz > 0) |
break; |
break; |
elem_end = 0; |
elem_end = 0; |
switch (b[pend]) { |
switch (b[pend]) { |
case '>': |
case '>': |
*pstate = PARSE_ELEM; |
in_tag = 0; |
if (pend > 0 && b[pend - 1] == '/') { |
if (pend > 0 && b[pend - 1] == '/') { |
b[pend - 1] = '\0'; |
b[pend - 1] = '\0'; |
elem_end = 1; |
elem_end = 1; |
|
} |
|
break; |
|
case '=': |
|
in_arg = 1; |
|
break; |
|
default: |
|
break; |
|
} |
} |
b[pend] = '\0'; |
break; |
if (pend < rlen) |
case '=': |
pend++; |
*pstate = PARSE_ARG; |
xml_attrkey(p, b + poff); |
break; |
if (elem_end) |
default: |
xml_elem_end(p, NULL); |
break; |
|
} |
|
b[pend] = '\0'; |
|
if (pend < rlen) |
|
increment(p, b, &pend, refill); |
|
xml_attrkey(p, b + poff); |
|
if (elem_end) |
|
xml_elem_end(p, NULL); |
|
|
/* Begin an opening or closing tag. */ |
/* Begin an opening or closing tag. */ |
|
|
} else if (b[poff] == '<') { |
} else if (b[poff] == '<') { |
if (advance(p, b, rlen, &pend, " >") && |
if (advance(p, b, rlen, &pend, " >", refill)) |
rsz > 0) |
break; |
break; |
if (pend > poff + 3 && |
if (pend > poff + 3 && |
strncmp(b + poff, "<!--", 4) == 0) { |
strncmp(b + poff, "<!--", 4) == 0) { |
|
|
|
/* Skip a comment. */ |
/* Skip a comment. */ |
|
|
cp = strstr(b + pend - 2, "-->"); |
cp = strstr(b + pend - 2, "-->"); |
if (cp == NULL) { |
if (cp == NULL) { |
if (rsz > 0) { |
if (refill) |
pend = rlen; |
break; |
break; |
cp = b + rlen; |
} |
|
cp = b + rlen; |
|
} else |
|
cp += 3; |
|
while (b + pend < cp) { |
|
if (b[++pend] == '\n') { |
|
p->nline++; |
|
p->ncol = 1; |
|
} else |
|
p->ncol++; |
|
} |
|
continue; |
|
} |
|
elem_end = 0; |
|
if (b[pend] != '>') |
|
in_tag = 1; |
|
else if (pend > 0 && b[pend - 1] == '/') { |
|
b[pend - 1] = '\0'; |
|
elem_end = 1; |
|
} |
|
b[pend] = '\0'; |
|
if (pend < rlen) |
|
pend++; |
|
if (b[++poff] == '/') { |
|
elem_end = 1; |
|
poff++; |
|
} else |
} else |
xml_elem_start(p, b + poff); |
cp += 3; |
if (elem_end) |
while (b + pend < cp) |
xml_elem_end(p, b + poff); |
increment(p, b, &pend, refill); |
|
continue; |
|
} |
|
elem_end = 0; |
|
if (b[pend] != '>') |
|
*pstate = PARSE_TAG; |
|
else if (pend > 0 && b[pend - 1] == '/') { |
|
b[pend - 1] = '\0'; |
|
elem_end = 1; |
|
} |
|
b[pend] = '\0'; |
|
if (pend < rlen) |
|
increment(p, b, &pend, refill); |
|
if (b[++poff] == '/') { |
|
elem_end = 1; |
|
poff++; |
|
} else |
|
xml_elem_start(p, b + poff); |
|
if (elem_end) |
|
xml_elem_end(p, b + poff); |
|
|
/* Process an entity. */ |
/* Process an entity. */ |
|
|
} else if (b[poff] == '&') { |
} else if (b[poff] == '&') { |
if (advance(p, b, rlen, &pend, ";") && |
if (advance(p, b, rlen, &pend, ";", refill)) |
rsz > 0) |
break; |
break; |
b[pend] = '\0'; |
b[pend] = '\0'; |
if (pend < rlen) |
if (pend < rlen) |
increment(p, b, &pend, refill); |
pend++; |
xml_entity(p, b + poff + 1); |
xml_entity(p, b + poff + 1); |
|
|
|
/* Process text up to the next tag or entity. */ |
/* Process text up to the next tag, entity, or EOL. */ |
|
|
} else { |
} else { |
if (advance(p, b, rlen, &pend, "<&") == 0) |
advance(p, b, rlen, &pend, "<&", refill); |
p->ncol--; |
xml_char(p, b + poff, pend - poff); |
xml_char(p, b + poff, pend - poff); |
|
} |
|
} |
} |
|
} |
|
return poff; |
|
} |
|
|
/* Buffer exhausted; shift left and re-fill. */ |
struct ptree * |
|
parse_file(struct parse *p, int fd, const char *fname) |
|
{ |
|
char b[4096]; |
|
ssize_t rsz; /* Return value from read(2). */ |
|
size_t rlen; /* Number of bytes in b[]. */ |
|
size_t poff; /* Parse offset in b[]. */ |
|
enum pstate pstate; |
|
|
|
p->fname = fname; |
|
p->nline = 1; |
|
p->ncol = 1; |
|
pstate = PARSE_ELEM; |
|
rlen = 0; |
|
|
|
/* |
|
* Read loop. |
|
* |
|
* If the previous token was incomplete and asked for more |
|
* input, we have to enter the read loop once more even on EOF. |
|
* Once rsz is 0, incomplete tokens will no longer ask |
|
* for more input but instead use whatever there is, |
|
* and then exit the read loop. |
|
* The minus one on the size limit for read(2) is needed |
|
* such that advance() can set b[rlen] to NUL when needed. |
|
*/ |
|
|
|
while ((rsz = read(fd, b + rlen, sizeof(b) - rlen - 1)) >= 0 && |
|
(rlen += rsz) > 0) { |
|
poff = parse_string(p, b, rlen, &pstate, rsz > 0); |
|
/* Buffer exhausted; shift left and re-fill. */ |
assert(poff > 0); |
assert(poff > 0); |
memmove(b, b + poff, rlen - poff); |
|
rlen -= poff; |
rlen -= poff; |
|
memmove(b, b + poff, rlen); |
} |
} |
if (rsz < 0) { |
if (rsz < 0) { |
perror(fname); |
perror(fname); |
p->tree->flags |= TREE_FAIL; |
p->tree->flags |= TREE_FAIL; |
} |
} |
if (p->cur != NULL && p->cur->node == NODE_TEXT) { |
pnode_closetext(p); |
pnode_trim(p->cur); |
|
p->cur = p->cur->parent; |
|
} |
|
if ((p->tree->flags & TREE_CLOSED) == 0) |
if ((p->tree->flags & TREE_CLOSED) == 0) |
warn_msg(p, "document not closed"); |
warn_msg(p, "document not closed"); |
return p->tree; |
return p->tree; |