=================================================================== RCS file: /cvs/docbook2mdoc/parse.c,v retrieving revision 1.10 retrieving revision 1.22 diff -u -p -r1.10 -r1.22 --- docbook2mdoc/parse.c 2019/04/03 11:23:48 1.10 +++ docbook2mdoc/parse.c 2019/04/07 19:33:27 1.22 @@ -1,4 +1,4 @@ -/* $Id: parse.c,v 1.10 2019/04/03 11:23:48 schwarze Exp $ */ +/* $Id: parse.c,v 1.22 2019/04/07 19:33:27 schwarze Exp $ */ /* * Copyright (c) 2014 Kristaps Dzonsons * Copyright (c) 2019 Ingo Schwarze @@ -30,6 +30,14 @@ * The implementation of the DocBook parser. */ +enum pstate { + PARSE_ELEM, + PARSE_TAG, + PARSE_ARG, + PARSE_SQ, + PARSE_DQ +}; + /* * Global parse state. * Keep this as simple and small as possible. @@ -44,6 +52,7 @@ struct parse { int nline; /* Line number of next token. */ int ncol; /* Column number of next token. */ int del; /* Levels of nested nodes being deleted. */ + int spc; /* Whitespace before the next element. */ int attr; /* The most recent attribute is valid. */ int warn; }; @@ -57,19 +66,21 @@ static const struct element elements[] = { { "acronym", NODE_IGNORE }, { "affiliation", NODE_AFFILIATION }, { "anchor", NODE_DELETE }, + { "appendix", NODE_APPENDIX }, { "application", NODE_APPLICATION }, { "arg", NODE_ARG }, + { "article", NODE_SECTION }, { "author", NODE_AUTHOR }, { "authorgroup", NODE_AUTHORGROUP }, { "blockquote", NODE_BLOCKQUOTE }, - { "book", NODE_BOOK }, + { "book", NODE_SECTION }, { "bookinfo", NODE_BOOKINFO }, { "caution", NODE_CAUTION }, { "chapter", NODE_SECTION }, { "citerefentry", NODE_CITEREFENTRY }, { "citetitle", NODE_CITETITLE }, { "cmdsynopsis", NODE_CMDSYNOPSIS }, - { "code", NODE_CODE }, + { "code", NODE_LITERAL }, { "colspec", NODE_COLSPEC }, { "command", NODE_COMMAND }, { "constant", NODE_CONSTANT }, @@ -81,6 +92,7 @@ static const struct element elements[] = { { "emphasis", NODE_EMPHASIS }, { "entry", NODE_ENTRY }, { "envar", NODE_ENVAR }, + { "errorname", NODE_ERRORNAME }, { "fieldsynopsis", NODE_FIELDSYNOPSIS }, { 
"filename", NODE_FILENAME }, { "firstname", NODE_PERSONNAME }, @@ -91,6 +103,11 @@ static const struct element elements[] = { { "funcsynopsis", NODE_FUNCSYNOPSIS }, { "funcsynopsisinfo", NODE_FUNCSYNOPSISINFO }, { "function", NODE_FUNCTION }, + { "glossary", NODE_VARIABLELIST }, + { "glossdef", NODE_IGNORE }, + { "glossdiv", NODE_IGNORE }, + { "glossentry", NODE_VARLISTENTRY }, + { "glosslist", NODE_VARIABLELIST }, { "glossterm", NODE_GLOSSTERM }, { "group", NODE_GROUP }, { "holder", NODE_HOLDER }, @@ -98,7 +115,7 @@ static const struct element elements[] = { { "indexterm", NODE_DELETE }, { "info", NODE_INFO }, { "informalequation", NODE_INFORMALEQUATION }, - { "informaltable", NODE_INFORMALTABLE }, + { "informaltable", NODE_TABLE }, { "inlineequation", NODE_INLINEEQUATION }, { "itemizedlist", NODE_ITEMIZEDLIST }, { "keysym", NODE_KEYSYM }, @@ -161,11 +178,14 @@ static const struct element elements[] = { { "sect2", NODE_SECTION }, { "section", NODE_SECTION }, { "sgmltag", NODE_SGMLTAG }, + { "simpara", NODE_PARA }, { "simplelist", NODE_SIMPLELIST }, { "spanspec", NODE_SPANSPEC }, - { "structname", NODE_STRUCTNAME }, + { "structfield", NODE_PARAMETER }, + { "structname", NODE_TYPE }, { "subtitle", NODE_SUBTITLE }, { "surname", NODE_PERSONNAME }, + { "symbol", NODE_CONSTANT }, { "synopsis", NODE_SYNOPSIS }, { "table", NODE_TABLE }, { "tbody", NODE_TBODY }, @@ -177,8 +197,8 @@ static const struct element elements[] = { { "title", NODE_TITLE }, { "trademark", NODE_IGNORE }, { "type", NODE_TYPE }, - { "ulink", NODE_ULINK }, - { "userinput", NODE_USERINPUT }, + { "ulink", NODE_LINK }, + { "userinput", NODE_LITERAL }, { "variablelist", NODE_VARIABLELIST }, { "varlistentry", NODE_VARLISTENTRY }, { "varname", NODE_VARNAME }, @@ -282,6 +302,7 @@ static void xml_char(struct parse *ps, const char *p, int sz) { struct pnode *dat; + size_t newsz; if (ps->del > 0) return; @@ -297,6 +318,7 @@ xml_char(struct parse *ps, const char *p, int sz) exit(1); } dat->node = NODE_TEXT; + 
dat->spc = ps->spc; dat->parent = ps->cur; TAILQ_INIT(&dat->childq); TAILQ_INIT(&dat->attrq); @@ -311,24 +333,35 @@ xml_char(struct parse *ps, const char *p, int sz) /* Append to the current text node. */ assert(sz >= 0); - ps->cur->b = realloc(ps->cur->b, ps->cur->bsz + sz + 1); + newsz = ps->cur->bsz + (ps->cur->bsz && ps->spc) + sz; + ps->cur->b = realloc(ps->cur->b, newsz + 1); if (ps->cur->b == NULL) { perror(NULL); exit(1); } + if (ps->cur->bsz && ps->spc) + ps->cur->b[ps->cur->bsz++] = ' '; memcpy(ps->cur->b + ps->cur->bsz, p, sz); - ps->cur->bsz += sz; - ps->cur->b[ps->cur->bsz] = '\0'; + ps->cur->b[ps->cur->bsz = newsz] = '\0'; ps->cur->real = ps->cur->b; + ps->spc = 0; } +/* + * Close out the text node and strip trailing whitespace, if one is open. + */ static void -pnode_trim(struct pnode *pn) +pnode_closetext(struct parse *p) { - assert(pn->node == NODE_TEXT); - for (; pn->bsz > 0; pn->b[--pn->bsz] = '\0') - if (isspace((unsigned char)pn->b[pn->bsz - 1]) == 0) - break; + struct pnode *n; + + if ((n = p->cur) == NULL || n->node != NODE_TEXT) + return; + p->cur = n->parent; + while (n->bsz > 0 && isspace((unsigned char)n->b[n->bsz - 1])) { + n->b[--n->bsz] = '\0'; + p->spc = 1; + } } static void @@ -345,11 +378,7 @@ xml_entity(struct parse *p, const char *name) return; } - /* Close out the text node, if there is one. */ - if (p->cur->node == NODE_TEXT) { - pnode_trim(p->cur); - p->cur = p->cur->parent; - } + pnode_closetext(p); if (p->tree->flags & TREE_CLOSED && p->cur == p->tree->root) warn_msg(p, "entity after end of document: &%s;", name); @@ -371,10 +400,12 @@ xml_entity(struct parse *p, const char *name) } dat->node = NODE_ESCAPE; dat->bsz = strlen(dat->b); + dat->spc = p->spc; dat->parent = p->cur; TAILQ_INIT(&dat->childq); TAILQ_INIT(&dat->attrq); TAILQ_INSERT_TAIL(&p->cur->childq, dat, child); + p->spc = 0; } /* @@ -398,11 +429,7 @@ xml_elem_start(struct parse *ps, const char *name) return; } - /* Close out the text node, if there is one. 
*/ - if (ps->cur != NULL && ps->cur->node == NODE_TEXT) { - pnode_trim(ps->cur); - ps->cur = ps->cur->parent; - } + pnode_closetext(ps); for (elem = elements; elem->name != NULL; elem++) if (strcmp(elem->name, name) == 0) @@ -436,7 +463,52 @@ xml_elem_start(struct parse *ps, const char *name) perror(NULL); exit(1); } - dat->node = elem->node; + + /* + * Nodes that begin a new macro or request line or start by + * printing text always want whitespace before themselves. + */ + + switch (dat->node = elem->node) { + case NODE_APPENDIX: + case NODE_AUTHORGROUP: + case NODE_BLOCKQUOTE: + case NODE_BOOKINFO: + case NODE_CAUTION: + case NODE_EDITOR: + case NODE_ENTRY: + case NODE_FUNCDEF: + case NODE_FUNCPROTOTYPE: + case NODE_INFORMALEQUATION: + case NODE_INLINEEQUATION: + case NODE_ITEMIZEDLIST: + case NODE_LEGALNOTICE: + case NODE_LITERALLAYOUT: + case NODE_NOTE: + case NODE_ORDEREDLIST: + case NODE_PARA: + case NODE_PREFACE: + case NODE_PROGRAMLISTING: + case NODE_REFMETA: + case NODE_REFNAMEDIV: + case NODE_REFSYNOPSISDIV: + case NODE_ROW: + case NODE_SBR: + case NODE_SCREEN: + case NODE_SECTION: + case NODE_SYNOPSIS: + case NODE_TGROUP: + case NODE_TIP: + case NODE_TITLE: + case NODE_VARIABLELIST: + case NODE_VARLISTENTRY: + case NODE_WARNING: + dat->spc = 1; + break; + default: + dat->spc = ps->spc; + break; + } dat->parent = ps->cur; TAILQ_INIT(&dat->childq); TAILQ_INIT(&dat->attrq); @@ -455,7 +527,7 @@ xml_attrkey(struct parse *ps, const char *name) struct pattr *attr; enum attrkey key; - if (ps->del > 0 || *name == '\0') + if (ps->del > 0 || ps->ncur == NODE_IGNORE || *name == '\0') return; if ((key = attrkey_parse(name)) == ATTRKEY__MAX) { ps->attr = 0; @@ -477,7 +549,7 @@ xml_attrval(struct parse *ps, const char *name) { struct pattr *attr; - if (ps->del > 0 || ps->attr == 0) + if (ps->del > 0 || ps->ncur == NODE_IGNORE || ps->attr == 0) return; if ((attr = TAILQ_LAST(&ps->cur->attrq, pattrq)) == NULL) return; @@ -507,11 +579,8 @@ xml_elem_end(struct parse *ps, 
const char *name) return; } - /* Close out the text node, if there is one. */ - if (ps->del == 0 && ps->cur != NULL && ps->cur->node == NODE_TEXT) { - pnode_trim(ps->cur); - ps->cur = ps->cur->parent; - } + if (ps->del == 0) + pnode_closetext(ps); if (name != NULL) { for (elem = elements; elem->name != NULL; elem++) @@ -546,6 +615,7 @@ xml_elem_end(struct parse *ps, const char *name) ps->tree->flags |= TREE_CLOSED; else ps->cur = ps->cur->parent; + ps->spc = 0; break; } assert(ps->del == 0); @@ -579,6 +649,19 @@ parse_free(struct parse *p) free(p); } +static void +increment(struct parse *p, char *b, size_t *pend, int refill) +{ + if (refill) { + if (b[*pend] == '\n') { + p->nline++; + p->ncol = 1; + } else + p->ncol++; + } + ++*pend; +} + /* * Advance the pend pointer to the next character in the charset. * If the charset starts with a space, it stands for any whitespace. @@ -589,7 +672,7 @@ parse_free(struct parse *p) */ static int advance(struct parse *p, char *b, size_t rlen, size_t *pend, - const char *charset) + const char *charset, int refill) { int space; @@ -599,234 +682,221 @@ advance(struct parse *p, char *b, size_t rlen, size_t } else space = 0; - p->nline = p->line; - p->ncol = p->col; + if (refill) { + p->nline = p->line; + p->ncol = p->col; + } while (*pend < rlen) { - if (b[*pend] == '\n') { - p->nline++; - p->ncol = 1; - } else - p->ncol++; if (space && isspace((unsigned char)b[*pend])) break; if (strchr(charset, b[*pend]) != NULL) break; - ++*pend; + increment(p, b, pend, refill); } if (*pend == rlen) { b[rlen] = '\0'; - return 1; + return refill; } else return 0; } -struct ptree * -parse_file(struct parse *p, int fd, const char *fname) +size_t +parse_string(struct parse *p, char *b, size_t rlen, + enum pstate *pstate, int refill) { - char b[4096]; char *cp; - ssize_t rsz; /* Return value from read(2). */ - size_t rlen; /* Number of bytes in b[]. */ size_t poff; /* Parse offset in b[]. */ size_t pend; /* Offset of the end of the current word. 
*/ - int in_tag, in_arg, in_quotes, elem_end; + int elem_end; - p->fname = fname; - p->nline = 1; - p->ncol = 1; - rlen = 0; - in_tag = in_arg = in_quotes = 0; + p->spc = 0; + pend = 0; + for (;;) { - /* - * Read loop. - * - * We have to enter the read loop once more even on EOF - * because the previous token may have been incomplete, - * such that it asked for more input. - * Once rsz is 0, incomplete tokens will no longer ask - * for more input but instead use whatever there is, - * and then exit the read loop. - * The minus one on the size limit for read(2) is needed - * such that advance() can set b[rlen] to NUL when needed. - */ + /* Proceed to the next token, skipping whitespace. */ - while ((rsz = read(fd, b + rlen, sizeof(b) - rlen - 1)) >= 0) { - if ((rlen += rsz) == 0) + if (refill) { + p->line = p->nline; + p->col = p->ncol; + } + if ((poff = pend) == rlen) break; + if (isspace((unsigned char)b[pend])) { + p->spc = 1; + increment(p, b, &pend, refill); + continue; + } - /* Token loop. */ + /* + * The following four cases (ARG, TAG, and starting an + * entity or a tag) all parse a word or quoted string. + * If that extends beyond the read buffer and the last + * read(2) still got data, they all break out of the + * token loop to request more data from the read loop. + * + * Also, three of them detect self-closing tags, those + * ending with "/>", setting the flag elem_end and + * calling xml_elem_end() at the very end, after + * handling the attribute value, attribute name, or + * tag name, respectively. + */ - pend = 0; - for (;;) { + /* Parse an attribute value. */ - /* Proceed to the next token, skipping whitespace. */ - - p->line = p->nline; - p->col = p->ncol; - if ((poff = pend) == rlen) - break; - if (isspace((unsigned char)b[pend])) { - if (b[pend++] == '\n') { - p->nline++; - p->ncol = 1; - } else - p->ncol++; + if (*pstate >= PARSE_ARG) { + if (*pstate == PARSE_ARG && + (b[pend] == '\'' || b[pend] == '"')) { + *pstate = b[pend] == '"' ? 
+ PARSE_DQ : PARSE_SQ; + increment(p, b, &pend, refill); continue; } - - /* - * The following four cases (in_arg, in_tag, and - * starting an entity or a tag) all parse a word - * or quoted string. If that extends beyond the - * read buffer and the last read(2) still got - * data, they all break out of the token loop - * to request more data from the read loop. - * - * Also, three of them detect self-closing tags, - * those ending with "/>", setting the flag - * elem_end and calling xml_elem_end() at the - * very end, after handling the attribute value, - * attribute name, or tag name, respectively. - */ - - /* Parse an attribute value. */ - - if (in_arg) { - if (in_quotes == 0 && - (b[pend] == '\'' || b[pend] == '"')) { - in_quotes = b[pend] == '"' ? 2 : 1; - p->ncol++; - pend++; - continue; + if (advance(p, b, rlen, &pend, + *pstate == PARSE_DQ ? "\"" : + *pstate == PARSE_SQ ? "'" : " >", refill)) + break; + *pstate = PARSE_TAG; + elem_end = 0; + if (b[pend] == '>') { + *pstate = PARSE_ELEM; + if (pend > 0 && b[pend - 1] == '/') { + b[pend - 1] = '\0'; + elem_end = 1; } - if (advance(p, b, rlen, &pend, - in_quotes == 2 ? "\"" : - in_quotes == 1 ? "'" : " >") && rsz > 0) - break; - in_arg = in_quotes = elem_end = 0; - if (b[pend] == '>') { - in_tag = 0; - if (pend > 0 && b[pend - 1] == '/') { - b[pend - 1] = '\0'; - elem_end = 1; - } - } - b[pend] = '\0'; - if (pend < rlen) - pend++; - xml_attrval(p, b + poff); - if (elem_end) - xml_elem_end(p, NULL); + } + b[pend] = '\0'; + if (pend < rlen) + increment(p, b, &pend, refill); + xml_attrval(p, b + poff); + if (elem_end) + xml_elem_end(p, NULL); - /* Look for an attribute name. */ + /* Look for an attribute name. 
*/ - } else if (in_tag) { - if (advance(p, b, rlen, &pend, " =>") && - rsz > 0) - break; - elem_end = 0; - switch (b[pend]) { - case '>': - in_tag = 0; - if (pend > 0 && b[pend - 1] == '/') { - b[pend - 1] = '\0'; - elem_end = 1; - } - break; - case '=': - in_arg = 1; - break; - default: - break; + } else if (*pstate == PARSE_TAG) { + if (advance(p, b, rlen, &pend, " =>", refill)) + break; + elem_end = 0; + switch (b[pend]) { + case '>': + *pstate = PARSE_ELEM; + if (pend > 0 && b[pend - 1] == '/') { + b[pend - 1] = '\0'; + elem_end = 1; } - b[pend] = '\0'; - if (pend < rlen) - pend++; - xml_attrkey(p, b + poff); - if (elem_end) - xml_elem_end(p, NULL); + break; + case '=': + *pstate = PARSE_ARG; + break; + default: + break; + } + b[pend] = '\0'; + if (pend < rlen) + increment(p, b, &pend, refill); + xml_attrkey(p, b + poff); + if (elem_end) + xml_elem_end(p, NULL); - /* Begin an opening or closing tag. */ + /* Begin an opening or closing tag. */ - } else if (b[poff] == '<') { - if (advance(p, b, rlen, &pend, " >") && - rsz > 0) - break; - if (pend > poff + 3 && - strncmp(b + poff, "<!--", 4) == 0) { + } else if (b[poff] == '<') { + if (advance(p, b, rlen, &pend, " >", refill)) + break; + if (pend > poff + 3 && + strncmp(b + poff, "<!--", 4) == 0) { - /* Skip a comment. */ + /* Skip a comment. */ - cp = strstr(b + pend - 2, "-->"); - if (cp == NULL) { - if (rsz > 0) { - pend = rlen; - break; - } - cp = b + rlen; - } else - cp += 3; - while (b + pend < cp) { - if (b[++pend] == '\n') { - p->nline++; - p->ncol = 1; - } else - p->ncol++; - } - continue; - } - elem_end = 0; - if (b[pend] != '>') - in_tag = 1; - else if (pend > 0 && b[pend - 1] == '/') { - b[pend - 1] = '\0'; - elem_end = 1; - } - b[pend] = '\0'; - if (pend < rlen) - pend++; - if (b[++poff] == '/') { - elem_end = 1; - poff++; + cp = strstr(b + pend - 2, "-->"); + if (cp == NULL) { + if (refill) + break; + cp = b + rlen; } else - xml_elem_start(p, b + poff); - if (elem_end) - xml_elem_end(p, b + poff); + cp += 3; + while (b + pend < cp) + increment(p, b, &pend, refill); + continue; + } + elem_end = 0; + if (b[pend] != '>') + *pstate = PARSE_TAG; + else if (pend > 0 && b[pend - 1] == '/') { + b[pend - 1] = '\0'; + elem_end = 1; + } + b[pend] = '\0'; +
if (pend < rlen) + increment(p, b, &pend, refill); + if (b[++poff] == '/') { + elem_end = 1; + poff++; + } else + xml_elem_start(p, b + poff); + if (elem_end) + xml_elem_end(p, b + poff); - /* Process an entity. */ + /* Process an entity. */ - } else if (b[poff] == '&') { - if (advance(p, b, rlen, &pend, ";") && - rsz > 0) - break; - b[pend] = '\0'; - if (pend < rlen) - pend++; - xml_entity(p, b + poff + 1); + } else if (b[poff] == '&') { + if (advance(p, b, rlen, &pend, ";", refill)) + break; + b[pend] = '\0'; + if (pend < rlen) + increment(p, b, &pend, refill); + xml_entity(p, b + poff + 1); - /* Process text up to the next tag or entity. */ + /* Process text up to the next tag, entity, or EOL. */ - } else { - if (advance(p, b, rlen, &pend, "<&") == 0) - p->ncol--; - xml_char(p, b + poff, pend - poff); - } + } else { + advance(p, b, rlen, &pend, "<&", refill); + xml_char(p, b + poff, pend - poff); } + } + return poff; +} - /* Buffer exhausted; shift left and re-fill. */ +struct ptree * +parse_file(struct parse *p, int fd, const char *fname) +{ + char b[4096]; + ssize_t rsz; /* Return value from read(2). */ + size_t rlen; /* Number of bytes in b[]. */ + size_t poff; /* Parse offset in b[]. */ + enum pstate pstate; + p->fname = fname; + p->nline = 1; + p->ncol = 1; + pstate = PARSE_ELEM; + rlen = 0; + + /* + * Read loop. + * + * If the previous token was incomplete and asked for more + * input, we have to enter the read loop once more even on EOF. + * Once rsz is 0, incomplete tokens will no longer ask + * for more input but instead use whatever there is, + * and then exit the read loop. + * The minus one on the size limit for read(2) is needed + * such that advance() can set b[rlen] to NUL when needed. + */ + + while ((rsz = read(fd, b + rlen, sizeof(b) - rlen - 1)) >= 0 && + (rlen += rsz) > 0) { + poff = parse_string(p, b, rlen, &pstate, rsz > 0); + /* Buffer exhausted; shift left and re-fill. 
*/ assert(poff > 0); - memmove(b, b + poff, rlen - poff); rlen -= poff; + memmove(b, b + poff, rlen); } if (rsz < 0) { perror(fname); p->tree->flags |= TREE_FAIL; } - if (p->cur != NULL && p->cur->node == NODE_TEXT) { - pnode_trim(p->cur); - p->cur = p->cur->parent; - } + pnode_closetext(p); if ((p->tree->flags & TREE_CLOSED) == 0) warn_msg(p, "document not closed"); return p->tree;