=================================================================== RCS file: /cvs/docbook2mdoc/parse.c,v retrieving revision 1.8 retrieving revision 1.9 diff -u -p -r1.8 -r1.9 --- docbook2mdoc/parse.c 2019/04/02 13:11:09 1.8 +++ docbook2mdoc/parse.c 2019/04/02 15:53:02 1.9 @@ -1,4 +1,4 @@ -/* $Id: parse.c,v 1.8 2019/04/02 13:11:09 schwarze Exp $ */ +/* $Id: parse.c,v 1.9 2019/04/02 15:53:02 schwarze Exp $ */ /* * Copyright (c) 2014 Kristaps Dzonsons * Copyright (c) 2019 Ingo Schwarze @@ -189,6 +189,62 @@ static const struct element elements[] = { { NULL, NODE_IGNORE } }; +struct entity { + const char *name; + const char *roff; +}; + +/* + * XML character entity references found in the wild. + * Those that don't have an exact mandoc_char(7) representation + * are approximated, and the desired codepoint is given as a comment. + * Encoding them as \\[u...] would leave -Tascii out in the cold. + */ +static const struct entity entities[] = { + { "alpha", "\\(*a" }, + { "amp", "&" }, + { "apos", "'" }, + { "auml", "\\(:a" }, + { "beta", "\\(*b" }, + { "circ", "^" }, /* U+02C6 */ + { "copy", "\\(co" }, + { "dagger", "\\(dg" }, + { "Delta", "\\(*D" }, + { "eacute", "\\('e" }, + { "emsp", "\\ " }, /* U+2003 */ + { "gt", ">" }, + { "hairsp", "\\^" }, + { "kappa", "\\(*k" }, + { "larr", "\\(<-" }, + { "ldquo", "\\(lq" }, + { "le", "\\(<=" }, + { "lowbar", "_" }, + { "lsqb", "[" }, + { "lt", "<" }, + { "mdash", "\\(em" }, + { "minus", "\\-" }, + { "ndash", "\\(en" }, + { "nbsp", "\\ " }, + { "num", "#" }, + { "oslash", "\\(/o" }, + { "ouml", "\\(:o" }, + { "percnt", "%" }, + { "quot", "\\(dq" }, + { "rarr", "\\(->" }, + { "rArr", "\\(rA" }, + { "rdquo", "\\(rq" }, + { "reg", "\\(rg" }, + { "rho", "\\(*r" }, + { "rsqb", "]" }, + { "sigma", "\\(*s" }, + { "shy", "\\&" }, /* U+00AD */ + { "tau", "\\(*t" }, + { "tilde", "\\[u02DC]" }, + { "times", "\\[tmu]" }, + { "uuml", "\\(:u" }, + { NULL, NULL } +}; + static void error_msg(struct parse *p, const char *fmt, ...) { @@ -275,6 +331,52 @@ pnode_trim(struct pnode *pn) break; } +static void +xml_entity(struct parse *p, const char *name) +{ + const struct entity *entity; + struct pnode *dat; + + if (p->del > 0) + return; + + if (p->cur == NULL) { + error_msg(p, "discarding entity before document: &%s;", name); + return; + } + + /* Close out the text node, if there is one. */ + if (p->cur->node == NODE_TEXT) { + pnode_trim(p->cur); + p->cur = p->cur->parent; + } + + if (p->tree->flags & TREE_CLOSED && p->cur == p->tree->root) + warn_msg(p, "entity after end of document: &%s;", name); + + for (entity = entities; entity->name != NULL; entity++) + if (strcmp(name, entity->name) == 0) + break; + + if (entity->roff == NULL) { + error_msg(p, "unknown entity &%s;", name); + return; + } + + /* Create, append, and close out an entity node. */ + if ((dat = calloc(1, sizeof(*dat))) == NULL || + (dat->b = dat->real = strdup(entity->roff)) == NULL) { + perror(NULL); + exit(1); + } + dat->node = NODE_ESCAPE; + dat->bsz = strlen(dat->b); + dat->parent = p->cur; + TAILQ_INIT(&dat->childq); + TAILQ_INIT(&dat->attrq); + TAILQ_INSERT_TAIL(&p->cur->childq, dat, child); +} + /* * Begin an element. */ @@ -573,14 +675,14 @@ parse_file(struct parse *p, int fd, const char *fname) } /* - * The following three cases (in_arg, in_tag, - * and starting a tag) all parse a word or - * quoted string. If that extends beyond the + * The following four cases (in_arg, in_tag, and + * starting an entity or a tag) all parse a word + * or quoted string. If that extends beyond the * read buffer and the last read(2) still got * data, they all break out of the token loop * to request more data from the read loop. * - * Also, they all detect self-closing tags, + * Also, three of them detect self-closing tags, * those ending with "/>", setting the flag * elem_end and calling xml_elem_end() at the * very end, after handling the attribute value, @@ -689,10 +791,21 @@ parse_file(struct parse *p, int fd, const char *fname) if (elem_end) xml_elem_end(p, b + poff); - /* Process text up to the next tag. */ + /* Process an entity. */ + } else if (b[poff] == '&') { + if (advance(p, b, rlen, &pend, ";") && + rsz > 0) + break; + b[pend] = '\0'; + if (pend < rlen) + pend++; + xml_entity(p, b + poff + 1); + + /* Process text up to the next tag or entity. */ + } else { - if (advance(p, b, rlen, &pend, "<") == 0) + if (advance(p, b, rlen, &pend, "<&") == 0) p->ncol--; xml_char(p, b + poff, pend - poff); }