version 1.2, 2019/03/26 20:54:43 |
version 1.25, 2019/04/08 23:40:17 |
|
|
*/ |
*/ |
#include <assert.h> |
#include <assert.h> |
#include <ctype.h> |
#include <ctype.h> |
#include <expat.h> |
#include <errno.h> |
|
#include <fcntl.h> |
|
#include <libgen.h> |
|
#include <stdarg.h> |
#include <stdio.h> |
#include <stdio.h> |
|
#include <stdlib.h> |
#include <string.h> |
#include <string.h> |
#include <unistd.h> |
#include <unistd.h> |
|
|
|
|
* The implementation of the DocBook parser. |
* The implementation of the DocBook parser. |
*/ |
*/ |
|
|
|
/*
 * States of the tokenizer in parse_string().
 * The numeric order matters: every state from PARSE_ARG upwards
 * means that an attribute value is being parsed.
 */
enum pstate {
	PARSE_ELEM = 0,	/* Outside of tags: text, entities, tag starts. */
	PARSE_TAG = 1,	/* Inside a tag, looking for an attribute name. */
	PARSE_ARG = 2,	/* Expecting an attribute value. */
	PARSE_SQ = 3,	/* Inside a single-quoted attribute value. */
	PARSE_DQ = 4	/* Inside a double-quoted attribute value. */
};
|
|
/* |
/* |
* Global parse state. |
* Global parse state. |
* Keep this as simple and small as possible. |
* Keep this as simple and small as possible. |
*/ |
*/ |
struct parse { |
struct parse { |
XML_Parser xml; |
|
const char *fname; /* Name of the input file. */ |
const char *fname; /* Name of the input file. */ |
struct ptree *tree; /* Complete parse result. */ |
struct ptree *tree; /* Complete parse result. */ |
|
struct pnode *doctype; |
struct pnode *cur; /* Current node in the tree. */ |
struct pnode *cur; /* Current node in the tree. */ |
int warn; |
enum nodeid ncur; /* Type of the current node. */ |
|
int line; /* Line number in the input file. */ |
|
int col; /* Column number in the input file. */ |
|
int nline; /* Line number of next token. */ |
|
int ncol; /* Column number of next token. */ |
|
int del; /* Levels of nested nodes being deleted. */ |
|
int flags; |
|
#define PFLAG_WARN (1 << 0) /* Print warning messages. */ |
|
#define PFLAG_SPC (1 << 1) /* Whitespace before the next element. */ |
|
#define PFLAG_ATTR (1 << 2) /* The most recent attribute is valid. */ |
|
#define PFLAG_EEND (1 << 3) /* This element is self-closing. */ |
}; |
}; |
|
|
struct element { |
struct element { |
|
|
}; |
}; |
|
|
static const struct element elements[] = { |
static const struct element elements[] = { |
{ "acronym", NODE_ACRONYM }, |
{ "acronym", NODE_IGNORE }, |
{ "affiliation", NODE_AFFILIATION }, |
{ "affiliation", NODE_AFFILIATION }, |
{ "anchor", NODE_ANCHOR }, |
{ "anchor", NODE_DELETE }, |
|
{ "appendix", NODE_APPENDIX }, |
{ "application", NODE_APPLICATION }, |
{ "application", NODE_APPLICATION }, |
{ "arg", NODE_ARG }, |
{ "arg", NODE_ARG }, |
|
{ "article", NODE_SECTION }, |
{ "author", NODE_AUTHOR }, |
{ "author", NODE_AUTHOR }, |
{ "authorgroup", NODE_AUTHORGROUP }, |
{ "authorgroup", NODE_AUTHORGROUP }, |
{ "blockquote", NODE_BLOCKQUOTE }, |
{ "blockquote", NODE_BLOCKQUOTE }, |
{ "book", NODE_BOOK }, |
{ "book", NODE_SECTION }, |
{ "bookinfo", NODE_BOOKINFO }, |
{ "bookinfo", NODE_BOOKINFO }, |
{ "caution", NODE_CAUTION }, |
{ "caution", NODE_CAUTION }, |
{ "chapter", NODE_SECTION }, |
{ "chapter", NODE_SECTION }, |
{ "citerefentry", NODE_CITEREFENTRY }, |
{ "citerefentry", NODE_CITEREFENTRY }, |
{ "citetitle", NODE_CITETITLE }, |
{ "citetitle", NODE_CITETITLE }, |
{ "cmdsynopsis", NODE_CMDSYNOPSIS }, |
{ "cmdsynopsis", NODE_CMDSYNOPSIS }, |
{ "code", NODE_CODE }, |
{ "code", NODE_LITERAL }, |
{ "colspec", NODE_COLSPEC }, |
{ "colspec", NODE_COLSPEC }, |
{ "command", NODE_COMMAND }, |
{ "command", NODE_COMMAND }, |
{ "constant", NODE_CONSTANT }, |
{ "constant", NODE_CONSTANT }, |
|
{ "contrib", NODE_CONTRIB }, |
{ "copyright", NODE_COPYRIGHT }, |
{ "copyright", NODE_COPYRIGHT }, |
{ "date", NODE_DATE }, |
{ "date", NODE_DATE }, |
|
{ "!doctype", NODE_DOCTYPE }, |
|
{ "!DOCTYPE", NODE_DOCTYPE }, |
{ "editor", NODE_EDITOR }, |
{ "editor", NODE_EDITOR }, |
{ "email", NODE_EMAIL }, |
{ "email", NODE_EMAIL }, |
{ "emphasis", NODE_EMPHASIS }, |
{ "emphasis", NODE_EMPHASIS }, |
|
{ "!ENTITY", NODE_ENTITY }, |
{ "entry", NODE_ENTRY }, |
{ "entry", NODE_ENTRY }, |
{ "envar", NODE_ENVAR }, |
{ "envar", NODE_ENVAR }, |
|
{ "errorname", NODE_ERRORNAME }, |
{ "fieldsynopsis", NODE_FIELDSYNOPSIS }, |
{ "fieldsynopsis", NODE_FIELDSYNOPSIS }, |
{ "filename", NODE_FILENAME }, |
{ "filename", NODE_FILENAME }, |
{ "firstname", NODE_FIRSTNAME }, |
{ "firstname", NODE_PERSONNAME }, |
{ "firstterm", NODE_FIRSTTERM }, |
{ "firstterm", NODE_FIRSTTERM }, |
{ "footnote", NODE_FOOTNOTE }, |
{ "footnote", NODE_FOOTNOTE }, |
{ "funcdef", NODE_FUNCDEF }, |
{ "funcdef", NODE_FUNCDEF }, |
Line 83 static const struct element elements[] = { |
|
Line 112 static const struct element elements[] = { |
|
{ "funcsynopsis", NODE_FUNCSYNOPSIS }, |
{ "funcsynopsis", NODE_FUNCSYNOPSIS }, |
{ "funcsynopsisinfo", NODE_FUNCSYNOPSISINFO }, |
{ "funcsynopsisinfo", NODE_FUNCSYNOPSISINFO }, |
{ "function", NODE_FUNCTION }, |
{ "function", NODE_FUNCTION }, |
|
{ "glossary", NODE_VARIABLELIST }, |
|
{ "glossdef", NODE_IGNORE }, |
|
{ "glossdiv", NODE_IGNORE }, |
|
{ "glossentry", NODE_VARLISTENTRY }, |
|
{ "glosslist", NODE_VARIABLELIST }, |
{ "glossterm", NODE_GLOSSTERM }, |
{ "glossterm", NODE_GLOSSTERM }, |
{ "group", NODE_GROUP }, |
{ "group", NODE_GROUP }, |
{ "holder", NODE_HOLDER }, |
{ "holder", NODE_HOLDER }, |
{ "index", NODE_INDEX }, |
{ "index", NODE_INDEX }, |
{ "indexterm", NODE_INDEXTERM }, |
{ "indexterm", NODE_DELETE }, |
{ "info", NODE_INFO }, |
{ "info", NODE_INFO }, |
{ "informalequation", NODE_INFORMALEQUATION }, |
{ "informalequation", NODE_INFORMALEQUATION }, |
{ "informaltable", NODE_INFORMALTABLE }, |
{ "informaltable", NODE_TABLE }, |
{ "inlineequation", NODE_INLINEEQUATION }, |
{ "inlineequation", NODE_INLINEEQUATION }, |
{ "itemizedlist", NODE_ITEMIZEDLIST }, |
{ "itemizedlist", NODE_ITEMIZEDLIST }, |
{ "keysym", NODE_KEYSYM }, |
{ "keysym", NODE_KEYSYM }, |
Line 115 static const struct element elements[] = { |
|
Line 149 static const struct element elements[] = { |
|
{ "option", NODE_OPTION }, |
{ "option", NODE_OPTION }, |
{ "orderedlist", NODE_ORDEREDLIST }, |
{ "orderedlist", NODE_ORDEREDLIST }, |
{ "orgname", NODE_ORGNAME }, |
{ "orgname", NODE_ORGNAME }, |
{ "othername", NODE_OTHERNAME }, |
{ "othername", NODE_PERSONNAME }, |
{ "para", NODE_PARA }, |
{ "para", NODE_PARA }, |
{ "paramdef", NODE_PARAMDEF }, |
{ "paramdef", NODE_PARAMDEF }, |
{ "parameter", NODE_PARAMETER }, |
{ "parameter", NODE_PARAMETER }, |
{ "part", NODE_SECTION }, |
{ "part", NODE_SECTION }, |
{ "personname", NODE_PERSONNAME }, |
{ "personname", NODE_PERSONNAME }, |
{ "phrase", NODE_PHRASE }, |
{ "phrase", NODE_IGNORE }, |
{ "preface", NODE_PREFACE }, |
{ "preface", NODE_PREFACE }, |
{ "primary", NODE_PRIMARY }, |
{ "primary", NODE_DELETE }, |
{ "programlisting", NODE_PROGRAMLISTING }, |
{ "programlisting", NODE_PROGRAMLISTING }, |
{ "prompt", NODE_PROMPT }, |
{ "prompt", NODE_PROMPT }, |
{ "quote", NODE_QUOTE }, |
{ "quote", NODE_QUOTE }, |
Line 148 static const struct element elements[] = { |
|
Line 182 static const struct element elements[] = { |
|
{ "row", NODE_ROW }, |
{ "row", NODE_ROW }, |
{ "sbr", NODE_SBR }, |
{ "sbr", NODE_SBR }, |
{ "screen", NODE_SCREEN }, |
{ "screen", NODE_SCREEN }, |
{ "secondary", NODE_SECONDARY }, |
{ "secondary", NODE_DELETE }, |
{ "sect1", NODE_SECTION }, |
{ "sect1", NODE_SECTION }, |
{ "sect2", NODE_SECTION }, |
{ "sect2", NODE_SECTION }, |
{ "section", NODE_SECTION }, |
{ "section", NODE_SECTION }, |
{ "sgmltag", NODE_SGMLTAG }, |
{ "sgmltag", NODE_SGMLTAG }, |
|
{ "simpara", NODE_PARA }, |
{ "simplelist", NODE_SIMPLELIST }, |
{ "simplelist", NODE_SIMPLELIST }, |
{ "spanspec", NODE_SPANSPEC }, |
{ "spanspec", NODE_SPANSPEC }, |
{ "structname", NODE_STRUCTNAME }, |
{ "structfield", NODE_PARAMETER }, |
|
{ "structname", NODE_TYPE }, |
{ "subtitle", NODE_SUBTITLE }, |
{ "subtitle", NODE_SUBTITLE }, |
{ "surname", NODE_SURNAME }, |
{ "surname", NODE_PERSONNAME }, |
|
{ "symbol", NODE_CONSTANT }, |
{ "synopsis", NODE_SYNOPSIS }, |
{ "synopsis", NODE_SYNOPSIS }, |
{ "table", NODE_TABLE }, |
{ "table", NODE_TABLE }, |
{ "tbody", NODE_TBODY }, |
{ "tbody", NODE_TBODY }, |
Line 167 static const struct element elements[] = { |
|
Line 204 static const struct element elements[] = { |
|
{ "thead", NODE_THEAD }, |
{ "thead", NODE_THEAD }, |
{ "tip", NODE_TIP }, |
{ "tip", NODE_TIP }, |
{ "title", NODE_TITLE }, |
{ "title", NODE_TITLE }, |
{ "trademark", NODE_TRADEMARK }, |
{ "trademark", NODE_IGNORE }, |
{ "type", NODE_TYPE }, |
{ "type", NODE_TYPE }, |
{ "ulink", NODE_ULINK }, |
{ "ulink", NODE_LINK }, |
{ "userinput", NODE_USERINPUT }, |
{ "userinput", NODE_LITERAL }, |
{ "variablelist", NODE_VARIABLELIST }, |
{ "variablelist", NODE_VARIABLELIST }, |
{ "varlistentry", NODE_VARLISTENTRY }, |
{ "varlistentry", NODE_VARLISTENTRY }, |
{ "varname", NODE_VARNAME }, |
{ "varname", NODE_VARNAME }, |
{ "warning", NODE_WARNING }, |
{ "warning", NODE_WARNING }, |
{ "wordasword", NODE_WORDASWORD }, |
{ "wordasword", NODE_WORDASWORD }, |
{ "xi:include", NODE_WARN }, |
{ "xi:include", NODE_DELETE_WARN }, |
{ "year", NODE_YEAR }, |
{ "year", NODE_YEAR }, |
{ NULL, NODE__MAX } |
{ NULL, NODE_IGNORE } |
}; |
}; |
|
|
|
/*
 * One row of the entity translation table: the name of an XML
 * character entity and the roff text emitted in its place.
 * Member order must match the positional initializers of the table.
 */
struct entity {
	const char	*name;	/* Entity name, without '&' and ';'. */
	const char	*roff;	/* Replacement text in roff syntax. */
};
|
|
/* |
/* |
|
* XML character entity references found in the wild. |
|
* Those that don't have an exact mandoc_char(7) representation |
|
* are approximated, and the desired codepoint is given as a comment. |
|
* Encoding them as \\[u...] would leave -Tascii out in the cold. |
|
*/ |
|
/* Terminated by an entry with NULL members; searched linearly by name. */
static const struct entity entities[] = {
	{ "alpha",	"\\(*a" },
	{ "amp",	"&" },
	{ "apos",	"'" },
	{ "auml",	"\\(:a" },
	{ "beta",	"\\(*b" },
	{ "circ",	"^" },		/* U+02C6 */
	{ "copy",	"\\(co" },
	{ "dagger",	"\\(dg" },
	{ "Delta",	"\\(*D" },
	{ "eacute",	"\\('e" },
	{ "emsp",	"\\ " },	/* U+2003 */
	{ "gt",		">" },
	{ "hairsp",	"\\^" },
	{ "kappa",	"\\(*k" },
	{ "larr",	"\\(<-" },
	{ "ldquo",	"\\(lq" },
	{ "le",		"\\(<=" },
	{ "lowbar",	"_" },
	{ "lsqb",	"[" },
	{ "lt",		"<" },
	{ "mdash",	"\\(em" },
	{ "minus",	"\\-" },
	{ "ndash",	"\\(en" },
	{ "nbsp",	"\\ " },
	{ "num",	"#" },
	{ "oslash",	"\\(/o" },
	{ "ouml",	"\\(:o" },
	{ "percnt",	"%" },
	{ "quot",	"\\(dq" },
	{ "rarr",	"\\(->" },
	{ "rArr",	"\\(rA" },
	{ "rdquo",	"\\(rq" },
	{ "reg",	"\\(rg" },
	{ "rho",	"\\(*r" },
	{ "rsqb",	"]" },
	{ "sigma",	"\\(*s" },
	{ "shy",	"\\&" },	/* U+00AD */
	{ "tau",	"\\(*t" },
	/*
	 * NOTE(review): this is the only entry using a \[u...] escape,
	 * which the comment above this table says to avoid -- confirm
	 * that this is intended.
	 */
	{ "tilde",	"\\[u02DC]" },
	{ "times",	"\\[tmu]" },
	{ "uuml",	"\\(:u" },
	{ NULL,		NULL }
};
|
|
|
static size_t parse_string(struct parse *, char *, size_t, |
|
enum pstate *, int); |
|
static void parse_fd(struct parse *, int); |
|
|
|
|
|
static void |
|
error_msg(struct parse *p, const char *fmt, ...) |
|
{ |
|
va_list ap; |
|
|
|
fprintf(stderr, "%s:%d:%d: ", p->fname, p->line, p->col); |
|
va_start(ap, fmt); |
|
vfprintf(stderr, fmt, ap); |
|
va_end(ap); |
|
fputc('\n', stderr); |
|
p->tree->flags |= TREE_FAIL; |
|
} |
|
|
|
static void |
|
warn_msg(struct parse *p, const char *fmt, ...) |
|
{ |
|
va_list ap; |
|
|
|
if ((p->flags & PFLAG_WARN) == 0) |
|
return; |
|
|
|
fprintf(stderr, "%s:%d:%d: warning: ", p->fname, p->line, p->col); |
|
va_start(ap, fmt); |
|
vfprintf(stderr, fmt, ap); |
|
va_end(ap); |
|
fputc('\n', stderr); |
|
} |
|
|
|
/* |
* Process a string of characters. |
* Process a string of characters. |
* If a text node is already open, append to it. |
* If a text node is already open, append to it. |
* Otherwise, create a new one as a child of the current node. |
* Otherwise, create a new one as a child of the current node. |
*/ |
*/ |
static void |
static void |
xml_char(void *arg, const XML_Char *p, int sz) |
xml_char(struct parse *ps, const char *p, int sz) |
{ |
{ |
struct parse *ps; |
|
struct pnode *dat; |
struct pnode *dat; |
int i; |
size_t newsz; |
|
|
ps = arg; |
if (ps->del > 0) |
if (ps->tree->flags && TREE_FAIL) |
|
return; |
return; |
|
|
/* |
if (ps->cur == NULL) { |
* Only create a new node if there is non-whitespace text. |
error_msg(ps, "discarding text before document: %.*s", sz, p); |
* Strip all leading whitespace. |
return; |
*/ |
} |
if (ps->cur->node != NODE_TEXT) { |
|
for (i = 0; i < sz; i++) |
|
if (isspace((unsigned char)p[i]) == 0) |
|
break; |
|
if (i == sz) |
|
return; |
|
p += i; |
|
sz -= i; |
|
|
|
|
if (ps->cur->node != NODE_TEXT) { |
if ((dat = calloc(1, sizeof(*dat))) == NULL) { |
if ((dat = calloc(1, sizeof(*dat))) == NULL) { |
perror(NULL); |
perror(NULL); |
exit(1); |
exit(1); |
} |
} |
dat->node = NODE_TEXT; |
dat->node = NODE_TEXT; |
|
dat->spc = (ps->flags & PFLAG_SPC) != 0; |
dat->parent = ps->cur; |
dat->parent = ps->cur; |
TAILQ_INIT(&dat->childq); |
TAILQ_INIT(&dat->childq); |
TAILQ_INIT(&dat->attrq); |
TAILQ_INIT(&dat->attrq); |
Line 222 xml_char(void *arg, const XML_Char *p, int sz) |
|
Line 340 xml_char(void *arg, const XML_Char *p, int sz) |
|
ps->cur = dat; |
ps->cur = dat; |
} |
} |
|
|
|
if (ps->tree->flags & TREE_CLOSED && |
|
ps->cur->parent == ps->tree->root) |
|
warn_msg(ps, "text after end of document: %.*s", sz, p); |
|
|
/* Append to the current text node. */ |
/* Append to the current text node. */ |
|
|
assert(sz >= 0); |
assert(sz >= 0); |
ps->cur->b = realloc(ps->cur->b, ps->cur->bsz + sz + 1); |
newsz = ps->cur->bsz + (ps->cur->bsz && (ps->flags & PFLAG_SPC)) + sz; |
|
ps->cur->b = realloc(ps->cur->b, newsz + 1); |
if (ps->cur->b == NULL) { |
if (ps->cur->b == NULL) { |
perror(NULL); |
perror(NULL); |
exit(1); |
exit(1); |
} |
} |
|
if (ps->cur->bsz && (ps->flags & PFLAG_SPC)) |
|
ps->cur->b[ps->cur->bsz++] = ' '; |
memcpy(ps->cur->b + ps->cur->bsz, p, sz); |
memcpy(ps->cur->b + ps->cur->bsz, p, sz); |
ps->cur->bsz += sz; |
ps->cur->b[ps->cur->bsz = newsz] = '\0'; |
ps->cur->b[ps->cur->bsz] = '\0'; |
|
ps->cur->real = ps->cur->b; |
ps->cur->real = ps->cur->b; |
|
ps->flags &= ~PFLAG_SPC; |
} |
} |
|
|
|
/* |
|
* Close out the text node and strip trailing whitespace, if one is open. |
|
*/ |
static void |
static void |
pnode_trim(struct pnode *pn) |
pnode_closetext(struct parse *p) |
{ |
{ |
assert(pn->node == NODE_TEXT); |
struct pnode *n; |
for (; pn->bsz > 0; pn->b[--pn->bsz] = '\0') |
|
if (isspace((unsigned char)pn->b[pn->bsz - 1]) == 0) |
if ((n = p->cur) == NULL || n->node != NODE_TEXT) |
|
return; |
|
p->cur = n->parent; |
|
while (n->bsz > 0 && isspace((unsigned char)n->b[n->bsz - 1])) { |
|
n->b[--n->bsz] = '\0'; |
|
p->flags |= PFLAG_SPC; |
|
} |
|
} |
|
|
|
static void |
|
xml_entity(struct parse *p, const char *name) |
|
{ |
|
const struct entity *entity; |
|
struct pnode *dat; |
|
const char *ccp; |
|
char *cp; |
|
enum pstate pstate; |
|
|
|
if (p->del > 0) |
|
return; |
|
|
|
if (p->cur == NULL) { |
|
error_msg(p, "discarding entity before document: &%s;", name); |
|
return; |
|
} |
|
|
|
pnode_closetext(p); |
|
|
|
if (p->tree->flags & TREE_CLOSED && p->cur == p->tree->root) |
|
warn_msg(p, "entity after end of document: &%s;", name); |
|
|
|
for (entity = entities; entity->name != NULL; entity++) |
|
if (strcmp(name, entity->name) == 0) |
break; |
break; |
|
|
|
if (entity->roff == NULL) { |
|
if (p->doctype != NULL) { |
|
TAILQ_FOREACH(dat, &p->doctype->childq, child) { |
|
if ((ccp = pnode_getattr_raw(dat, |
|
ATTRKEY_NAME, NULL)) == NULL || |
|
strcmp(ccp, name) != 0) |
|
continue; |
|
if ((ccp = pnode_getattr_raw(dat, |
|
ATTRKEY_SYSTEM, NULL)) != NULL) { |
|
parse_file(p, -1, ccp); |
|
p->flags &= ~PFLAG_SPC; |
|
return; |
|
} |
|
if ((ccp = pnode_getattr_raw(dat, |
|
ATTRKEY_DEFINITION, NULL)) == NULL) |
|
continue; |
|
if ((cp = strdup(ccp)) == NULL) { |
|
perror(NULL); |
|
exit(1); |
|
} |
|
pstate = PARSE_ELEM; |
|
parse_string(p, cp, strlen(cp), &pstate, 0); |
|
p->flags &= ~PFLAG_SPC; |
|
free(cp); |
|
return; |
|
} |
|
} |
|
error_msg(p, "unknown entity &%s;", name); |
|
return; |
|
} |
|
|
|
/* Create, append, and close out an entity node. */ |
|
if ((dat = calloc(1, sizeof(*dat))) == NULL || |
|
(dat->b = dat->real = strdup(entity->roff)) == NULL) { |
|
perror(NULL); |
|
exit(1); |
|
} |
|
dat->node = NODE_ESCAPE; |
|
dat->bsz = strlen(dat->b); |
|
dat->spc = (p->flags & PFLAG_SPC) != 0; |
|
dat->parent = p->cur; |
|
TAILQ_INIT(&dat->childq); |
|
TAILQ_INIT(&dat->attrq); |
|
TAILQ_INSERT_TAIL(&p->cur->childq, dat, child); |
|
p->flags &= ~PFLAG_SPC; |
} |
} |
|
|
/* |
/* |
* Begin an element. |
* Begin an element. |
* If the name is unknown, abort parsing. |
|
*/ |
*/ |
static void |
static void |
xml_elem_start(void *arg, const XML_Char *name, const XML_Char **atts) |
xml_elem_start(struct parse *ps, const char *name) |
{ |
{ |
struct parse *ps; |
const struct element *elem; |
const struct element *elem; |
struct pnode *dat; |
enum attrkey key; |
|
struct pnode *dat; |
|
struct pattr *pattr; |
|
const XML_Char **att; |
|
|
|
ps = arg; |
/* |
if (ps->tree->flags && TREE_FAIL) |
* An ancestor is excluded from the tree; |
|
* keep track of the number of levels excluded. |
|
*/ |
|
if (ps->del > 0) { |
|
if (*name != '!' && *name != '?') |
|
ps->del++; |
return; |
return; |
|
|
/* Close out the text node, if there is one. */ |
|
if (ps->cur != NULL && ps->cur->node == NODE_TEXT) { |
|
pnode_trim(ps->cur); |
|
ps->cur = ps->cur->parent; |
|
} |
} |
|
|
|
pnode_closetext(ps); |
|
|
for (elem = elements; elem->name != NULL; elem++) |
for (elem = elements; elem->name != NULL; elem++) |
if (strcmp(elem->name, name) == 0) |
if (strcmp(elem->name, name) == 0) |
break; |
break; |
|
|
if (elem->name == NULL) { |
if (elem->name == NULL) { |
fprintf(stderr, "%s:%zu:%zu: unknown element \"%s\"\n", |
if (*name == '!' || *name == '?') |
ps->fname, XML_GetCurrentLineNumber(ps->xml), |
return; |
XML_GetCurrentColumnNumber(ps->xml), name); |
error_msg(ps, "unknown element <%s>", name); |
ps->tree->flags |= TREE_FAIL; |
|
return; |
|
} |
} |
|
|
switch (elem->node) { |
ps->ncur = elem->node; |
case NODE_WARN: |
|
if (ps->warn) |
switch (ps->ncur) { |
fprintf(stderr, "%s:%zu:%zu: warning: " |
case NODE_DELETE_WARN: |
"ignoring element <%s>\n", ps->fname, |
warn_msg(ps, "skipping element <%s>", name); |
XML_GetCurrentLineNumber(ps->xml), |
|
XML_GetCurrentColumnNumber(ps->xml), name); |
|
/* FALLTHROUGH */ |
/* FALLTHROUGH */ |
|
case NODE_DELETE: |
|
ps->del = 1; |
|
/* FALLTHROUGH */ |
case NODE_IGNORE: |
case NODE_IGNORE: |
return; |
return; |
case NODE_INLINEEQUATION: |
case NODE_INLINEEQUATION: |
Line 298 xml_elem_start(void *arg, const XML_Char *name, const |
|
Line 500 xml_elem_start(void *arg, const XML_Char *name, const |
|
break; |
break; |
} |
} |
|
|
|
if (ps->tree->flags & TREE_CLOSED && ps->cur->parent == NULL) |
|
warn_msg(ps, "element after end of document: <%s>", name); |
|
|
if ((dat = calloc(1, sizeof(*dat))) == NULL) { |
if ((dat = calloc(1, sizeof(*dat))) == NULL) { |
perror(NULL); |
perror(NULL); |
exit(1); |
exit(1); |
} |
} |
dat->node = elem->node; |
|
|
/* |
|
* Nodes that begin a new macro or request line or start by |
|
* printing text always want whitespace before themselves. |
|
*/ |
|
|
|
switch (dat->node = elem->node) { |
|
case NODE_DOCTYPE: |
|
case NODE_ENTITY: |
|
case NODE_SBR: |
|
ps->flags |= PFLAG_EEND; |
|
/* FALLTHROUGH */ |
|
case NODE_APPENDIX: |
|
case NODE_AUTHORGROUP: |
|
case NODE_BLOCKQUOTE: |
|
case NODE_BOOKINFO: |
|
case NODE_CAUTION: |
|
case NODE_EDITOR: |
|
case NODE_ENTRY: |
|
case NODE_FUNCDEF: |
|
case NODE_FUNCPROTOTYPE: |
|
case NODE_INFORMALEQUATION: |
|
case NODE_INLINEEQUATION: |
|
case NODE_ITEMIZEDLIST: |
|
case NODE_LEGALNOTICE: |
|
case NODE_LITERALLAYOUT: |
|
case NODE_NOTE: |
|
case NODE_ORDEREDLIST: |
|
case NODE_PARA: |
|
case NODE_PREFACE: |
|
case NODE_PROGRAMLISTING: |
|
case NODE_REFMETA: |
|
case NODE_REFNAMEDIV: |
|
case NODE_REFSYNOPSISDIV: |
|
case NODE_ROW: |
|
case NODE_SCREEN: |
|
case NODE_SECTION: |
|
case NODE_SYNOPSIS: |
|
case NODE_TGROUP: |
|
case NODE_TIP: |
|
case NODE_TITLE: |
|
case NODE_VARIABLELIST: |
|
case NODE_VARLISTENTRY: |
|
case NODE_WARNING: |
|
dat->spc = 1; |
|
break; |
|
default: |
|
dat->spc = (ps->flags & PFLAG_SPC) != 0; |
|
break; |
|
} |
dat->parent = ps->cur; |
dat->parent = ps->cur; |
TAILQ_INIT(&dat->childq); |
TAILQ_INIT(&dat->childq); |
TAILQ_INIT(&dat->attrq); |
TAILQ_INIT(&dat->attrq); |
Line 311 xml_elem_start(void *arg, const XML_Char *name, const |
|
Line 565 xml_elem_start(void *arg, const XML_Char *name, const |
|
TAILQ_INSERT_TAIL(&ps->cur->childq, dat, child); |
TAILQ_INSERT_TAIL(&ps->cur->childq, dat, child); |
|
|
ps->cur = dat; |
ps->cur = dat; |
if (ps->tree->root == NULL) |
if (dat->node == NODE_DOCTYPE) { |
|
if (ps->doctype == NULL) |
|
ps->doctype = dat; |
|
else |
|
error_msg(ps, "duplicate doctype"); |
|
} else if (dat->parent == NULL && ps->tree->root == NULL) |
ps->tree->root = dat; |
ps->tree->root = dat; |
|
} |
|
|
/* |
static void |
* Process attributes. |
xml_attrkey(struct parse *ps, const char *name) |
*/ |
{ |
for (att = atts; *att != NULL; att += 2) { |
struct pattr *attr; |
if ((key = attrkey_parse(*att)) == ATTRKEY__MAX) { |
const char *value; |
if (ps->warn) |
enum attrkey key; |
fprintf(stderr, "%s:%zu:%zu: warning: " |
|
"unknown attribute \"%s\"\n", |
if (ps->del > 0 || ps->ncur == NODE_IGNORE || *name == '\0') |
ps->fname, |
return; |
XML_GetCurrentLineNumber(ps->xml), |
|
XML_GetCurrentColumnNumber(ps->xml), |
if ((ps->ncur == NODE_DOCTYPE || ps->ncur == NODE_ENTITY) && |
*att); |
TAILQ_FIRST(&ps->cur->attrq) == NULL) { |
continue; |
value = name; |
|
name = "NAME"; |
|
} else |
|
value = NULL; |
|
|
|
if ((key = attrkey_parse(name)) == ATTRKEY__MAX) { |
|
ps->flags &= ~PFLAG_ATTR; |
|
return; |
|
} |
|
if ((attr = calloc(1, sizeof(*attr))) == NULL) { |
|
perror(NULL); |
|
exit(1); |
|
} |
|
attr->key = key; |
|
attr->val = ATTRVAL__MAX; |
|
if (value == NULL) { |
|
attr->rawval = NULL; |
|
ps->flags |= PFLAG_ATTR; |
|
} else { |
|
if ((attr->rawval = strdup(value)) == NULL) { |
|
perror(NULL); |
|
exit(1); |
} |
} |
pattr = calloc(1, sizeof(*pattr)); |
ps->flags &= ~PFLAG_ATTR; |
pattr->key = key; |
|
if ((pattr->val = attrval_parse(att[1])) == ATTRVAL__MAX) |
|
pattr->rawval = strdup(att[1]); |
|
TAILQ_INSERT_TAIL(&dat->attrq, pattr, child); |
|
} |
} |
|
TAILQ_INSERT_TAIL(&ps->cur->attrq, attr, child); |
|
if (ps->ncur == NODE_ENTITY && key == ATTRKEY_NAME) |
|
xml_attrkey(ps, "DEFINITION"); |
} |
} |
|
|
|
static void |
|
xml_attrval(struct parse *ps, const char *name) |
|
{ |
|
struct pattr *attr; |
|
|
|
if (ps->del > 0 || ps->ncur == NODE_IGNORE || |
|
(ps->flags & PFLAG_ATTR) == 0) |
|
return; |
|
if ((attr = TAILQ_LAST(&ps->cur->attrq, pattrq)) == NULL) |
|
return; |
|
if ((attr->val = attrval_parse(name)) == ATTRVAL__MAX && |
|
(attr->rawval = strdup(name)) == NULL) { |
|
perror(NULL); |
|
exit(1); |
|
} |
|
} |
|
|
/* |
/* |
* Roll up the parse tree. |
* Roll up the parse tree. |
* If we're at a text node, roll that one up first. |
* If we're at a text node, roll that one up first. |
*/ |
*/ |
static void |
static void |
xml_elem_end(void *arg, const XML_Char *name) |
xml_elem_end(struct parse *ps, const char *name) |
{ |
{ |
struct parse *ps; |
const struct element *elem; |
const struct element *elem; |
enum nodeid node; |
|
|
ps = arg; |
/* |
if (ps->tree->flags && TREE_FAIL) |
* An ancestor is excluded from the tree; |
|
* keep track of the number of levels excluded. |
|
*/ |
|
if (ps->del > 1) { |
|
ps->del--; |
return; |
return; |
|
|
/* Close out the text node, if there is one. */ |
|
if (ps->cur->node == NODE_TEXT) { |
|
pnode_trim(ps->cur); |
|
ps->cur = ps->cur->parent; |
|
} |
} |
|
|
for (elem = elements; elem->name != NULL; elem++) |
if (ps->del == 0) |
if (strcmp(elem->name, name) == 0) |
pnode_closetext(ps); |
break; |
|
|
|
switch (elem->node) { |
if (name != NULL) { |
|
for (elem = elements; elem->name != NULL; elem++) |
|
if (strcmp(elem->name, name) == 0) |
|
break; |
|
node = elem->node; |
|
} else |
|
node = ps->ncur; |
|
|
|
switch (node) { |
|
case NODE_DELETE_WARN: |
|
case NODE_DELETE: |
|
if (ps->del > 0) |
|
ps->del--; |
|
break; |
case NODE_IGNORE: |
case NODE_IGNORE: |
case NODE_WARN: |
|
break; |
break; |
|
case NODE_DOCTYPE: |
|
ps->flags &= ~PFLAG_EEND; |
|
/* FALLTHROUGH */ |
default: |
default: |
assert(elem->node == ps->cur->node); |
if (ps->cur == NULL || node != ps->cur->node) { |
ps->cur = ps->cur->parent; |
warn_msg(ps, "element not open: </%s>", name); |
|
break; |
|
} |
|
|
|
/* |
|
* Refrain from actually closing the document element. |
|
* If no more content follows, no harm is done, but if |
|
* some content still follows, simply processing it is |
|
* obviously better than discarding it or crashing. |
|
*/ |
|
|
|
if (ps->cur->parent != NULL || node == NODE_DOCTYPE) { |
|
ps->cur = ps->cur->parent; |
|
if (ps->cur != NULL) |
|
ps->ncur = ps->cur->node; |
|
} else |
|
ps->tree->flags |= TREE_CLOSED; |
|
ps->flags &= ~PFLAG_SPC; |
|
break; |
} |
} |
|
assert(ps->del == 0); |
} |
} |
|
|
struct parse * |
struct parse * |
Line 382 parse_alloc(int warn) |
|
Line 711 parse_alloc(int warn) |
|
free(p); |
free(p); |
return NULL; |
return NULL; |
} |
} |
|
if (warn) |
if ((p->xml = XML_ParserCreate(NULL)) == NULL) { |
p->flags |= PFLAG_WARN; |
free(p->tree); |
else |
free(p); |
p->flags &= ~PFLAG_WARN; |
return NULL; |
|
} |
|
p->warn = warn; |
|
XML_SetCharacterDataHandler(p->xml, xml_char); |
|
XML_SetElementHandler(p->xml, xml_elem_start, xml_elem_end); |
|
XML_SetUserData(p->xml, p); |
|
return p; |
return p; |
} |
} |
|
|
Line 400 parse_free(struct parse *p) |
|
Line 723 parse_free(struct parse *p) |
|
{ |
{ |
if (p == NULL) |
if (p == NULL) |
return; |
return; |
XML_ParserFree(p->xml); |
|
if (p->tree != NULL) { |
if (p->tree != NULL) { |
pnode_unlink(p->tree->root); |
pnode_unlink(p->tree->root); |
free(p->tree); |
free(p->tree); |
Line 408 parse_free(struct parse *p) |
|
Line 730 parse_free(struct parse *p) |
|
free(p); |
free(p); |
} |
} |
|
|
|
static void |
|
increment(struct parse *p, char *b, size_t *pend, int refill) |
|
{ |
|
if (refill) { |
|
if (b[*pend] == '\n') { |
|
p->nline++; |
|
p->ncol = 1; |
|
} else |
|
p->ncol++; |
|
} |
|
++*pend; |
|
} |
|
|
|
/* |
|
* Advance the pend pointer to the next character in the charset. |
|
* If the charset starts with a space, it stands for any whitespace. |
|
* Update the new input file position, used for messages. |
|
* Do not overrun the buffer b of length rlen. |
|
* When reaching the end, NUL-terminate the buffer and return 1; |
|
* otherwise, return 0. |
|
*/ |
|
static int |
|
advance(struct parse *p, char *b, size_t rlen, size_t *pend, |
|
const char *charset, int refill) |
|
{ |
|
int space; |
|
|
|
if (*charset == ' ') { |
|
space = 1; |
|
charset++; |
|
} else |
|
space = 0; |
|
|
|
if (refill) { |
|
p->nline = p->line; |
|
p->ncol = p->col; |
|
} |
|
while (*pend < rlen) { |
|
if (space && isspace((unsigned char)b[*pend])) |
|
break; |
|
if (strchr(charset, b[*pend]) != NULL) |
|
break; |
|
increment(p, b, pend, refill); |
|
} |
|
if (*pend == rlen) { |
|
b[rlen] = '\0'; |
|
return refill; |
|
} else |
|
return 0; |
|
} |
|
|
|
/*
 * The tokenizer.
 * Split the rlen bytes in b[] into tokens and hand them to
 * xml_elem_start(), xml_attrkey(), xml_attrval(), xml_elem_end(),
 * xml_entity(), and xml_char().
 * The buffer is modified in place: delimiters following names and
 * values are overwritten with NUL bytes.
 * The tokenizer state is read from and saved to *pstate, such that
 * parsing can be continued with another call.
 * If refill is non-zero, the position in the input file is tracked
 * and a token extending to the end of the buffer is not processed.
 * Return the offset of the first byte that was not processed.
 */
size_t
parse_string(struct parse *p, char *b, size_t rlen,
    enum pstate *pstate, int refill)
{
	char		*cp;
	size_t		 poff;	/* Parse offset in b[]. */
	size_t		 pend;	/* Offset of the end of the current word. */
	int		 elem_end;

	pend = 0;
	for (;;) {

		/* Proceed to the next token, skipping whitespace. */

		if (refill) {
			p->line = p->nline;
			p->col = p->ncol;
		}
		if ((poff = pend) == rlen)
			break;
		if (isspace((unsigned char)b[pend])) {
			p->flags |= PFLAG_SPC;
			increment(p, b, &pend, refill);
			continue;
		}

		/*
		 * The following four cases (ARG, TAG, and starting an
		 * entity or a tag) all parse a word or quoted string.
		 * If that extends beyond the read buffer and the last
		 * read(2) still got data, they all break out of the
		 * token loop to request more data from the read loop.
		 *
		 * Also, three of them detect self-closing tags, those
		 * ending with "/>", setting the flag elem_end and
		 * calling xml_elem_end() at the very end, after
		 * handling the attribute value, attribute name, or
		 * tag name, respectively.
		 */

		/*
		 * Parse an attribute value.
		 * The comparison relies on PARSE_ARG, PARSE_SQ, and
		 * PARSE_DQ being the last members of enum pstate.
		 */

		if (*pstate >= PARSE_ARG) {
			if (*pstate == PARSE_ARG &&
			    (b[pend] == '\'' || b[pend] == '"')) {
				/* Step over the opening quote. */
				*pstate = b[pend] == '"' ?
				    PARSE_DQ : PARSE_SQ;
				increment(p, b, &pend, refill);
				continue;
			}
			if (advance(p, b, rlen, &pend,
			    *pstate == PARSE_DQ ? "\"" :
			    *pstate == PARSE_SQ ? "'" : " >", refill))
				break;
			*pstate = PARSE_TAG;
			elem_end = 0;
			if (b[pend] == '>') {
				*pstate = PARSE_ELEM;
				if (pend > 0 && b[pend - 1] == '/') {
					b[pend - 1] = '\0';
					elem_end = 1;
				}
				if (p->flags & PFLAG_EEND)
					elem_end = 1;
			}
			/* Terminate the value and step over the delimiter. */
			b[pend] = '\0';
			if (pend < rlen)
				increment(p, b, &pend, refill);
			xml_attrval(p, b + poff);
			if (elem_end)
				xml_elem_end(p, NULL);

		/* Look for an attribute name. */

		} else if (*pstate == PARSE_TAG) {
			switch (p->ncur) {
			case NODE_DOCTYPE:
				/* An opening bracket suspends the tag. */
				if (b[pend] == '[') {
					*pstate = PARSE_ELEM;
					increment(p, b, &pend, refill);
					continue;
				}
				/* FALLTHROUGH */
			case NODE_ENTITY:
				/* A quoted string without a name. */
				if (b[pend] == '"' || b[pend] == '\'') {
					*pstate = PARSE_ARG;
					continue;
				}
				break;
			default:
				break;
			}
			if (advance(p, b, rlen, &pend, " =>", refill))
				break;
			elem_end = 0;
			switch (b[pend]) {
			case '>':
				*pstate = PARSE_ELEM;
				if (pend > 0 && b[pend - 1] == '/') {
					b[pend - 1] = '\0';
					elem_end = 1;
				}
				if (p->flags & PFLAG_EEND)
					elem_end = 1;
				break;
			case '=':
				*pstate = PARSE_ARG;
				break;
			default:
				break;
			}
			b[pend] = '\0';
			if (pend < rlen)
				increment(p, b, &pend, refill);
			xml_attrkey(p, b + poff);
			if (elem_end)
				xml_elem_end(p, NULL);

		/* Begin an opening or closing tag. */

		} else if (b[poff] == '<') {
			if (advance(p, b, rlen, &pend, " >", refill))
				break;
			if (pend > poff + 3 &&
			    strncmp(b + poff, "<!--", 4) == 0) {

				/* Skip a comment. */

				/*
				 * NOTE(review): strstr(3) needs a NUL
				 * terminator, but advance() only writes
				 * b[rlen] when it scans up to the end.
				 * Verify that all callers terminate b[].
				 */

				cp = strstr(b + pend - 2, "-->");
				if (cp == NULL) {
					if (refill)
						break;
					cp = b + rlen;
				} else
					cp += 3;
				while (b + pend < cp)
					increment(p, b, &pend, refill);
				continue;
			}
			elem_end = 0;
			if (b[pend] != '>')
				*pstate = PARSE_TAG;
			else if (pend > 0 && b[pend - 1] == '/') {
				b[pend - 1] = '\0';
				elem_end = 1;
			}
			b[pend] = '\0';
			if (pend < rlen)
				increment(p, b, &pend, refill);
			/* Skip the '<', and the '/' of a closing tag. */
			if (b[++poff] == '/') {
				elem_end = 1;
				poff++;
			} else {
				xml_elem_start(p, b + poff);
				if (*pstate == PARSE_ELEM &&
				    p->flags & PFLAG_EEND)
					elem_end = 1;
			}
			if (elem_end)
				xml_elem_end(p, b + poff);

		/* Close a doctype. */

		} else if (p->ncur == NODE_DOCTYPE && b[poff] == ']') {
			*pstate = PARSE_TAG;
			increment(p, b, &pend, refill);

		/* Process an entity. */

		} else if (b[poff] == '&') {
			if (advance(p, b, rlen, &pend, ";", refill))
				break;
			b[pend] = '\0';
			if (pend < rlen)
				increment(p, b, &pend, refill);
			/* Pass the name without the leading '&'. */
			xml_entity(p, b + poff + 1);

		/* Process text up to the next tag, entity, or EOL. */

		} else {
			/* Text is never deferred; the result is ignored. */
			advance(p, b, rlen, &pend, "<&", refill);
			xml_char(p, b + poff, pend - poff);
		}
	}
	return poff;
}
|
|
|
|
|
/* |
|
* The read loop. |
|
* If the previous token was incomplete and asked for more input, |
|
* we have to enter the read loop once more even on EOF. |
|
* Once rsz is 0, incomplete tokens will no longer ask for more input |
|
* but instead use whatever there is, and then exit the read loop. |
|
* The minus one on the size limit for read(2) is needed such that |
|
* advance() can set b[rlen] to NUL when needed. |
|
*/ |
|
static void |
|
parse_fd(struct parse *p, int fd) |
|
{ |
|
char b[4096]; |
|
ssize_t rsz; /* Return value from read(2). */ |
|
size_t rlen; /* Number of bytes in b[]. */ |
|
size_t poff; /* Parse offset in b[]. */ |
|
enum pstate pstate; |
|
|
|
rlen = 0; |
|
pstate = PARSE_ELEM; |
|
while ((rsz = read(fd, b + rlen, sizeof(b) - rlen - 1)) >= 0 && |
|
(rlen += rsz) > 0) { |
|
poff = parse_string(p, b, rlen, &pstate, rsz > 0); |
|
/* Buffer exhausted; shift left and re-fill. */ |
|
assert(poff > 0); |
|
rlen -= poff; |
|
memmove(b, b + poff, rlen); |
|
} |
|
if (rsz < 0) |
|
error_msg(p, "read: %s", strerror(errno)); |
|
} |
|
|
|
/* |
|
* Open and parse a file. |
|
*/ |
struct ptree * |
struct ptree * |
parse_file(struct parse *p, int fd, const char *fname) |
parse_file(struct parse *p, int fd, const char *fname) |
{ |
{ |
char b[4096]; |
const char *save_fname; |
ssize_t ssz; |
int save_line, save_col; |
|
|
|
/* Save and initialize reporting data. */ |
|
|
|
save_fname = p->fname; |
|
save_line = p->nline; |
|
save_col = p->ncol; |
p->fname = fname; |
p->fname = fname; |
do { |
p->line = 0; |
if ((ssz = read(fd, b, sizeof(b))) < 0) { |
p->col = 0; |
perror(fname); |
|
pnode_unlink(p->tree->root); |
/* Open the file, unless it is already open. */ |
p->tree->root = p->cur = NULL; |
|
p->tree->flags |= TREE_FAIL; |
if (fd == -1 && (fd = open(fname, O_RDONLY, 0)) == -1) { |
return NULL; |
error_msg(p, "open: %s", strerror(errno)); |
} |
p->fname = save_fname; |
if (XML_Parse(p->xml, b, ssz, ssz == 0) == 0) { |
return p->tree; |
fprintf(stderr, "%s:%zu:%zu: %s\n", fname, |
} |
XML_GetCurrentLineNumber(p->xml), |
|
XML_GetCurrentColumnNumber(p->xml), |
/* |
XML_ErrorString(XML_GetErrorCode(p->xml))); |
* After opening the starting file, change to the directory it |
p->tree->flags |= TREE_FAIL; |
* is located in, in case it wants to include any further files, |
} |
* which are typically given with relative paths in DocBook. |
} while (ssz > 0 && (p->tree->flags & TREE_FAIL) == 0); |
* Do this on a best-effort basis; don't complain about failure. |
|
*/ |
|
|
|
if (save_fname == NULL && (fname = dirname(fname)) != NULL && |
|
strcmp(fname, ".") != 0) |
|
(void)chdir(fname); |
|
|
|
/* Run the read loop. */ |
|
|
|
p->nline = 1; |
|
p->ncol = 1; |
|
parse_fd(p, fd); |
|
|
|
/* On the top level, finalize the parse tree. */ |
|
|
|
if (save_fname == NULL) { |
|
pnode_closetext(p); |
|
if (p->tree->root == NULL) |
|
error_msg(p, "empty document"); |
|
else if ((p->tree->flags & TREE_CLOSED) == 0) |
|
warn_msg(p, "document not closed"); |
|
pnode_unlink(p->doctype); |
|
} |
|
|
|
/* Clean up. */ |
|
|
|
if (fd != STDIN_FILENO) |
|
close(fd); |
|
p->fname = save_fname; |
|
p->nline = save_line; |
|
p->ncol = save_col; |
return p->tree; |
return p->tree; |
} |
} |