=================================================================== RCS file: /cvs/docbook2mdoc/parse.c,v retrieving revision 1.5 retrieving revision 1.10 diff -u -p -r1.5 -r1.10 --- docbook2mdoc/parse.c 2019/03/28 12:21:10 1.5 +++ docbook2mdoc/parse.c 2019/04/03 11:23:48 1.10 @@ -1,4 +1,4 @@ -/* $Id: parse.c,v 1.5 2019/03/28 12:21:10 schwarze Exp $ */ +/* $Id: parse.c,v 1.10 2019/04/03 11:23:48 schwarze Exp $ */ /* * Copyright (c) 2014 Kristaps Dzonsons * Copyright (c) 2019 Ingo Schwarze @@ -17,6 +17,7 @@ */ #include <assert.h> #include <ctype.h> +#include <stdarg.h> #include <stdio.h> #include <stdlib.h> #include <string.h> @@ -72,6 +73,7 @@ static const struct element elements[] = { { "colspec", NODE_COLSPEC }, { "command", NODE_COMMAND }, { "constant", NODE_CONSTANT }, + { "contrib", NODE_CONTRIB }, { "copyright", NODE_COPYRIGHT }, { "date", NODE_DATE }, { "editor", NODE_EDITOR }, @@ -81,7 +83,7 @@ static const struct element elements[] = { { "envar", NODE_ENVAR }, { "fieldsynopsis", NODE_FIELDSYNOPSIS }, { "filename", NODE_FILENAME }, - { "firstname", NODE_IGNORE }, + { "firstname", NODE_PERSONNAME }, { "firstterm", NODE_FIRSTTERM }, { "footnote", NODE_FOOTNOTE }, { "funcdef", NODE_FUNCDEF }, @@ -121,7 +123,7 @@ static const struct element elements[] = { { "option", NODE_OPTION }, { "orderedlist", NODE_ORDEREDLIST }, { "orgname", NODE_ORGNAME }, - { "othername", NODE_IGNORE }, + { "othername", NODE_PERSONNAME }, { "para", NODE_PARA }, { "paramdef", NODE_PARAMDEF }, { "parameter", NODE_PARAMETER }, @@ -163,7 +165,7 @@ static const struct element elements[] = { { "spanspec", NODE_SPANSPEC }, { "structname", NODE_STRUCTNAME }, { "subtitle", NODE_SUBTITLE }, - { "surname", NODE_IGNORE }, + { "surname", NODE_PERSONNAME }, { "synopsis", NODE_SYNOPSIS }, { "table", NODE_TABLE }, { "tbody", NODE_TBODY }, @@ -187,7 +189,91 @@ static const struct element elements[] = { { NULL, NODE_IGNORE } }; +struct entity { + const char *name; + const char *roff; +}; + /* + * XML character entity references found in the wild. 
+ * Those that don't have an exact mandoc_char(7) representation + * are approximated, and the desired codepoint is given as a comment. + * Encoding them as \\[u...] would leave -Tascii out in the cold. + */ +static const struct entity entities[] = { + { "alpha", "\\(*a" }, + { "amp", "&" }, + { "apos", "'" }, + { "auml", "\\(:a" }, + { "beta", "\\(*b" }, + { "circ", "^" }, /* U+02C6 */ + { "copy", "\\(co" }, + { "dagger", "\\(dg" }, + { "Delta", "\\(*D" }, + { "eacute", "\\('e" }, + { "emsp", "\\ " }, /* U+2003 */ + { "gt", ">" }, + { "hairsp", "\\^" }, + { "kappa", "\\(*k" }, + { "larr", "\\(<-" }, + { "ldquo", "\\(lq" }, + { "le", "\\(<=" }, + { "lowbar", "_" }, + { "lsqb", "[" }, + { "lt", "<" }, + { "mdash", "\\(em" }, + { "minus", "\\-" }, + { "ndash", "\\(en" }, + { "nbsp", "\\ " }, + { "num", "#" }, + { "oslash", "\\(/o" }, + { "ouml", "\\(:o" }, + { "percnt", "%" }, + { "quot", "\\(dq" }, + { "rarr", "\\(->" }, + { "rArr", "\\(rA" }, + { "rdquo", "\\(rq" }, + { "reg", "\\(rg" }, + { "rho", "\\(*r" }, + { "rsqb", "]" }, + { "sigma", "\\(*s" }, + { "shy", "\\&" }, /* U+00AD */ + { "tau", "\\(*t" }, + { "tilde", "\\[u02DC]" }, + { "times", "\\[tmu]" }, + { "uuml", "\\(:u" }, + { NULL, NULL } +}; + +static void +error_msg(struct parse *p, const char *fmt, ...) +{ + va_list ap; + + fprintf(stderr, "%s:%d:%d: ", p->fname, p->line, p->col); + va_start(ap, fmt); + vfprintf(stderr, fmt, ap); + va_end(ap); + fputc('\n', stderr); + p->tree->flags |= TREE_FAIL; +} + +static void +warn_msg(struct parse *p, const char *fmt, ...) +{ + va_list ap; + + if (p->warn == 0) + return; + + fprintf(stderr, "%s:%d:%d: warning: ", p->fname, p->line, p->col); + va_start(ap, fmt); + vfprintf(stderr, fmt, ap); + va_end(ap); + fputc('\n', stderr); +} + +/* * Process a string of characters. * If a text node is already open, append to it. * Otherwise, create a new one as a child of the current node. 
@@ -201,9 +287,7 @@ xml_char(struct parse *ps, const char *p, int sz) return; if (ps->cur == NULL) { - fprintf(stderr, "%s:%d:%d: discarding text before docum" - "ent: %.*s\n", ps->fname, ps->line, ps->col, sz, p); - ps->tree->flags |= TREE_FAIL; + error_msg(ps, "discarding text before document: %.*s", sz, p); return; } @@ -221,10 +305,8 @@ xml_char(struct parse *ps, const char *p, int sz) } if (ps->tree->flags & TREE_CLOSED && - ps->cur->parent == ps->tree->root && ps->warn) - fprintf(stderr, "%s:%d:%d: warning: " - "text after end of document: %.*s\n", - ps->fname, ps->line, ps->col, sz, p); + ps->cur->parent == ps->tree->root) + warn_msg(ps, "text after end of document: %.*s", sz, p); /* Append to the current text node. */ @@ -249,6 +331,52 @@ pnode_trim(struct pnode *pn) break; } +static void +xml_entity(struct parse *p, const char *name) +{ + const struct entity *entity; + struct pnode *dat; + + if (p->del > 0) + return; + + if (p->cur == NULL) { + error_msg(p, "discarding entity before document: &%s;", name); + return; + } + + /* Close out the text node, if there is one. */ + if (p->cur->node == NODE_TEXT) { + pnode_trim(p->cur); + p->cur = p->cur->parent; + } + + if (p->tree->flags & TREE_CLOSED && p->cur == p->tree->root) + warn_msg(p, "entity after end of document: &%s;", name); + + for (entity = entities; entity->name != NULL; entity++) + if (strcmp(name, entity->name) == 0) + break; + + if (entity->roff == NULL) { + error_msg(p, "unknown entity &%s;", name); + return; + } + + /* Create, append, and close out an entity node. */ + if ((dat = calloc(1, sizeof(*dat))) == NULL || + (dat->b = dat->real = strdup(entity->roff)) == NULL) { + perror(NULL); + exit(1); + } + dat->node = NODE_ESCAPE; + dat->bsz = strlen(dat->b); + dat->parent = p->cur; + TAILQ_INIT(&dat->childq); + TAILQ_INIT(&dat->attrq); + TAILQ_INSERT_TAIL(&p->cur->childq, dat, child); +} + /* * Begin an element. 
*/ @@ -280,19 +408,14 @@ xml_elem_start(struct parse *ps, const char *name) if (strcmp(elem->name, name) == 0) break; - if (elem->name == NULL) { - fprintf(stderr, "%s:%d:%d: unknown element <%s>\n", - ps->fname, ps->line, ps->col, name); - ps->tree->flags |= TREE_FAIL; - } + if (elem->name == NULL) + error_msg(ps, "unknown element <%s>", name); + ps->ncur = elem->node; switch (ps->ncur) { case NODE_DELETE_WARN: - if (ps->warn) - fprintf(stderr, "%s:%d:%d: warning: " - "skipping element <%s>\n", - ps->fname, ps->line, ps->col, name); + warn_msg(ps, "skipping element <%s>", name); /* FALLTHROUGH */ case NODE_DELETE: ps->del = 1; @@ -306,11 +429,8 @@ xml_elem_start(struct parse *ps, const char *name) break; } - if (ps->tree->flags & TREE_CLOSED && - ps->cur->parent == NULL && ps->warn) - fprintf(stderr, "%s:%d:%d: warning: " - "element after end of document: %s\n", - ps->fname, ps->line, ps->col, name); + if (ps->tree->flags & TREE_CLOSED && ps->cur->parent == NULL) + warn_msg(ps, "element after end of document: <%s>", name); if ((dat = calloc(1, sizeof(*dat))) == NULL) { perror(NULL); @@ -338,10 +458,6 @@ xml_attrkey(struct parse *ps, const char *name) if (ps->del > 0 || *name == '\0') return; if ((key = attrkey_parse(name)) == ATTRKEY__MAX) { - if (ps->warn) - fprintf(stderr, "%s:%d:%d: warning: " - "unknown attribute \"%s\"\n", - ps->fname, ps->line, ps->col, name); ps->attr = 0; return; } @@ -415,10 +531,7 @@ xml_elem_end(struct parse *ps, const char *name) break; default: if (ps->cur == NULL || node != ps->cur->node) { - if (ps->warn) - fprintf(stderr, "%s:%d:%d: warning: " - "element not open: </%s>\n", - ps->fname, ps->line, ps->col, name); + warn_msg(ps, "element not open: </%s>", name); break; } @@ -511,6 +624,7 @@ struct ptree * parse_file(struct parse *p, int fd, const char *fname) { char b[4096]; + char *cp; ssize_t rsz; /* Return value from read(2). */ size_t rlen; /* Number of bytes in b[]. */ size_t poff; /* Parse offset in b[]. 
*/ @@ -561,14 +675,14 @@ parse_file(struct parse *p, int fd, const char *fname) } /* - * The following three cases (in_arg, in_tag, - * and starting a tag) all parse a word or - * quoted string. If that extends beyond the + * The following four cases (in_arg, in_tag, and + * starting an entity or a tag) all parse a word + * or quoted string. If that extends beyond the * read buffer and the last read(2) still got * data, they all break out of the token loop * to request more data from the read loop. * - * Also, they all detect self-closing tags, + * Also, three of them detect self-closing tags, * those ending with "/>", setting the flag * elem_end and calling xml_elem_end() at the * very end, after handling the attribute value, @@ -578,14 +692,16 @@ parse_file(struct parse *p, int fd, const char *fname) /* Parse an attribute value. */ if (in_arg) { - if (in_quotes == 0 && b[pend] == '"') { - in_quotes = 1; + if (in_quotes == 0 && + (b[pend] == '\'' || b[pend] == '"')) { + in_quotes = b[pend] == '"' ? 2 : 1; p->ncol++; pend++; continue; } if (advance(p, b, rlen, &pend, - in_quotes ? "\"" : " >") && rsz > 0) + in_quotes == 2 ? "\"" : + in_quotes == 1 ? "'" : " >") && rsz > 0) break; in_arg = in_quotes = elem_end = 0; if (b[pend] == '>') { @@ -636,6 +752,29 @@ parse_file(struct parse *p, int fd, const char *fname) if (advance(p, b, rlen, &pend, " >") && rsz > 0) break; + if (pend > poff + 3 && + strncmp(b + poff, "<!--", 4) == 0) { + + /* Skip a comment. */ + + cp = strstr(b + pend - 2, "-->"); + if (cp == NULL) { + if (rsz > 0) { + pend = rlen; + break; + } + cp = b + rlen; + } else + cp += 3; + while (b + pend < cp) { + if (b[++pend] == '\n') { + p->nline++; + p->ncol = 1; + } else + p->ncol++; + } + continue; + } elem_end = 0; if (b[pend] != '>') in_tag = 1; @@ -654,10 +793,21 @@ parse_file(struct parse *p, int fd, const char *fname) if (elem_end) xml_elem_end(p, b + poff); - /* Process text up to the next tag. */ + /* Process an entity. 
*/ + } else if (b[poff] == '&') { + if (advance(p, b, rlen, &pend, ";") && + rsz > 0) + break; + b[pend] = '\0'; + if (pend < rlen) + pend++; + xml_entity(p, b + poff + 1); + + /* Process text up to the next tag or entity. */ + } else { - if (advance(p, b, rlen, &pend, "<") == 0) + if (advance(p, b, rlen, &pend, "<&") == 0) p->ncol--; xml_char(p, b + poff, pend - poff); } @@ -677,8 +827,7 @@ parse_file(struct parse *p, int fd, const char *fname) pnode_trim(p->cur); p->cur = p->cur->parent; } - if ((p->tree->flags & TREE_CLOSED) == 0 && p->warn) - fprintf(stderr, "%s:%d:%d: warning: document not closed\n", - p->fname, p->line, p->col); + if ((p->tree->flags & TREE_CLOSED) == 0) + warn_msg(p, "document not closed"); return p->tree; }