version 1.7, 2019/03/28 20:41:33 |
version 1.10, 2019/04/03 11:23:48 |
Line 189 static const struct element elements[] = { |
|
Line 189 static const struct element elements[] = { |
|
{ NULL, NODE_IGNORE } |
{ NULL, NODE_IGNORE } |
}; |
}; |
|
|
|
/*
 * One row of the XML entity translation table:
 * maps an entity name to the text emitted in its place.
 */
struct entity {
	const char	*name;	/* Entity name, without the '&' and ';'. */
	const char	*roff;	/* Replacement string copied into the output node. */
};
|
|
|
/* |
|
* XML character entity references found in the wild. |
|
* Those that don't have an exact mandoc_char(7) representation |
|
* are approximated, and the desired codepoint is given as a comment. |
|
* Encoding them as \\[u...] would leave -Tascii out in the cold. |
|
*/ |
|
static const struct entity entities[] = { |
|
{ "alpha", "\\(*a" }, |
|
{ "amp", "&" }, |
|
{ "apos", "'" }, |
|
{ "auml", "\\(:a" }, |
|
{ "beta", "\\(*b" }, |
|
{ "circ", "^" }, /* U+02C6 */ |
|
{ "copy", "\\(co" }, |
|
{ "dagger", "\\(dg" }, |
|
{ "Delta", "\\(*D" }, |
|
{ "eacute", "\\('e" }, |
|
{ "emsp", "\\ " }, /* U+2003 */ |
|
{ "gt", ">" }, |
|
{ "hairsp", "\\^" }, |
|
{ "kappa", "\\(*k" }, |
|
{ "larr", "\\(<-" }, |
|
{ "ldquo", "\\(lq" }, |
|
{ "le", "\\(<=" }, |
|
{ "lowbar", "_" }, |
|
{ "lsqb", "[" }, |
|
{ "lt", "<" }, |
|
{ "mdash", "\\(em" }, |
|
{ "minus", "\\-" }, |
|
{ "ndash", "\\(en" }, |
|
{ "nbsp", "\\ " }, |
|
{ "num", "#" }, |
|
{ "oslash", "\\(/o" }, |
|
{ "ouml", "\\(:o" }, |
|
{ "percnt", "%" }, |
|
{ "quot", "\\(dq" }, |
|
{ "rarr", "\\(->" }, |
|
{ "rArr", "\\(rA" }, |
|
{ "rdquo", "\\(rq" }, |
|
{ "reg", "\\(rg" }, |
|
{ "rho", "\\(*r" }, |
|
{ "rsqb", "]" }, |
|
{ "sigma", "\\(*s" }, |
|
{ "shy", "\\&" }, /* U+00AD */ |
|
{ "tau", "\\(*t" }, |
|
{ "tilde", "\\[u02DC]" }, |
|
{ "times", "\\[tmu]" }, |
|
{ "uuml", "\\(:u" }, |
|
{ NULL, NULL } |
|
}; |
|
|
static void |
static void |
error_msg(struct parse *p, const char *fmt, ...) |
error_msg(struct parse *p, const char *fmt, ...) |
{ |
{ |
Line 275 pnode_trim(struct pnode *pn) |
|
Line 331 pnode_trim(struct pnode *pn) |
|
break; |
break; |
} |
} |
|
|
|
static void |
|
xml_entity(struct parse *p, const char *name) |
|
{ |
|
const struct entity *entity; |
|
struct pnode *dat; |
|
|
|
if (p->del > 0) |
|
return; |
|
|
|
if (p->cur == NULL) { |
|
error_msg(p, "discarding entity before document: &%s;", name); |
|
return; |
|
} |
|
|
|
/* Close out the text node, if there is one. */ |
|
if (p->cur->node == NODE_TEXT) { |
|
pnode_trim(p->cur); |
|
p->cur = p->cur->parent; |
|
} |
|
|
|
if (p->tree->flags & TREE_CLOSED && p->cur == p->tree->root) |
|
warn_msg(p, "entity after end of document: &%s;", name); |
|
|
|
for (entity = entities; entity->name != NULL; entity++) |
|
if (strcmp(name, entity->name) == 0) |
|
break; |
|
|
|
if (entity->roff == NULL) { |
|
error_msg(p, "unknown entity &%s;", name); |
|
return; |
|
} |
|
|
|
/* Create, append, and close out an entity node. */ |
|
if ((dat = calloc(1, sizeof(*dat))) == NULL || |
|
(dat->b = dat->real = strdup(entity->roff)) == NULL) { |
|
perror(NULL); |
|
exit(1); |
|
} |
|
dat->node = NODE_ESCAPE; |
|
dat->bsz = strlen(dat->b); |
|
dat->parent = p->cur; |
|
TAILQ_INIT(&dat->childq); |
|
TAILQ_INIT(&dat->attrq); |
|
TAILQ_INSERT_TAIL(&p->cur->childq, dat, child); |
|
} |
|
|
/* |
/* |
* Begin an element. |
* Begin an element. |
*/ |
*/ |
|
|
parse_file(struct parse *p, int fd, const char *fname) |
parse_file(struct parse *p, int fd, const char *fname) |
{ |
{ |
char b[4096]; |
char b[4096]; |
|
char *cp; |
ssize_t rsz; /* Return value from read(2). */ |
ssize_t rsz; /* Return value from read(2). */ |
size_t rlen; /* Number of bytes in b[]. */ |
size_t rlen; /* Number of bytes in b[]. */ |
size_t poff; /* Parse offset in b[]. */ |
size_t poff; /* Parse offset in b[]. */ |
Line 572 parse_file(struct parse *p, int fd, const char *fname) |
|
Line 675 parse_file(struct parse *p, int fd, const char *fname) |
|
} |
} |
|
|
/* |
/* |
* The following three cases (in_arg, in_tag, |
* The following four cases (in_arg, in_tag, and |
* and starting a tag) all parse a word or |
* starting an entity or a tag) all parse a word |
* quoted string. If that extends beyond the |
* or quoted string. If that extends beyond the |
* read buffer and the last read(2) still got |
* read buffer and the last read(2) still got |
* data, they all break out of the token loop |
* data, they all break out of the token loop |
* to request more data from the read loop. |
* to request more data from the read loop. |
* |
* |
* Also, they all detect self-closing tags, |
* Also, three of them detect self-closing tags, |
* those ending with "/>", setting the flag |
* those ending with "/>", setting the flag |
* elem_end and calling xml_elem_end() at the |
* elem_end and calling xml_elem_end() at the |
* very end, after handling the attribute value, |
* very end, after handling the attribute value, |
Line 589 parse_file(struct parse *p, int fd, const char *fname) |
|
Line 692 parse_file(struct parse *p, int fd, const char *fname) |
|
/* Parse an attribute value. */ |
/* Parse an attribute value. */ |
|
|
if (in_arg) { |
if (in_arg) { |
if (in_quotes == 0 && b[pend] == '"') { |
if (in_quotes == 0 && |
in_quotes = 1; |
(b[pend] == '\'' || b[pend] == '"')) { |
|
in_quotes = b[pend] == '"' ? 2 : 1; |
p->ncol++; |
p->ncol++; |
pend++; |
pend++; |
continue; |
continue; |
} |
} |
if (advance(p, b, rlen, &pend, |
if (advance(p, b, rlen, &pend, |
in_quotes ? "\"" : " >") && rsz > 0) |
in_quotes == 2 ? "\"" : |
|
in_quotes == 1 ? "'" : " >") && rsz > 0) |
break; |
break; |
in_arg = in_quotes = elem_end = 0; |
in_arg = in_quotes = elem_end = 0; |
if (b[pend] == '>') { |
if (b[pend] == '>') { |
Line 647 parse_file(struct parse *p, int fd, const char *fname) |
|
Line 752 parse_file(struct parse *p, int fd, const char *fname) |
|
if (advance(p, b, rlen, &pend, " >") && |
if (advance(p, b, rlen, &pend, " >") && |
rsz > 0) |
rsz > 0) |
break; |
break; |
|
if (pend > poff + 3 && |
|
strncmp(b + poff, "<!--", 4) == 0) { |
|
|
|
/* Skip a comment. */ |
|
|
|
cp = strstr(b + pend - 2, "-->"); |
|
if (cp == NULL) { |
|
if (rsz > 0) { |
|
pend = rlen; |
|
break; |
|
} |
|
cp = b + rlen; |
|
} else |
|
cp += 3; |
|
while (b + pend < cp) { |
|
if (b[++pend] == '\n') { |
|
p->nline++; |
|
p->ncol = 1; |
|
} else |
|
p->ncol++; |
|
} |
|
continue; |
|
} |
elem_end = 0; |
elem_end = 0; |
if (b[pend] != '>') |
if (b[pend] != '>') |
in_tag = 1; |
in_tag = 1; |
Line 665 parse_file(struct parse *p, int fd, const char *fname) |
|
Line 793 parse_file(struct parse *p, int fd, const char *fname) |
|
if (elem_end) |
if (elem_end) |
xml_elem_end(p, b + poff); |
xml_elem_end(p, b + poff); |
|
|
/* Process text up to the next tag. */ |
/* Process an entity. */ |
|
|
|
} else if (b[poff] == '&') { |
|
if (advance(p, b, rlen, &pend, ";") && |
|
rsz > 0) |
|
break; |
|
b[pend] = '\0'; |
|
if (pend < rlen) |
|
pend++; |
|
xml_entity(p, b + poff + 1); |
|
|
|
/* Process text up to the next tag or entity. */ |
|
|
} else { |
} else { |
if (advance(p, b, rlen, &pend, "<") == 0) |
if (advance(p, b, rlen, &pend, "<&") == 0) |
p->ncol--; |
p->ncol--; |
xml_char(p, b + poff, pend - poff); |
xml_char(p, b + poff, pend - poff); |
} |
} |