=================================================================== RCS file: /cvs/docbook2mdoc/parse.c,v retrieving revision 1.17 retrieving revision 1.23 diff -u -p -r1.17 -r1.23 --- docbook2mdoc/parse.c 2019/04/07 14:49:26 1.17 +++ docbook2mdoc/parse.c 2019/04/08 14:37:31 1.23 @@ -1,4 +1,4 @@ -/* $Id: parse.c,v 1.17 2019/04/07 14:49:26 schwarze Exp $ */ +/* $Id: parse.c,v 1.23 2019/04/08 14:37:31 schwarze Exp $ */ /* * Copyright (c) 2014 Kristaps Dzonsons * Copyright (c) 2019 Ingo Schwarze @@ -45,6 +45,7 @@ enum pstate { struct parse { const char *fname; /* Name of the input file. */ struct ptree *tree; /* Complete parse result. */ + struct pnode *doctype; struct pnode *cur; /* Current node in the tree. */ enum nodeid ncur; /* Type of the current node. */ int line; /* Line number in the input file. */ @@ -52,9 +53,11 @@ struct parse { int nline; /* Line number of next token. */ int ncol; /* Column number of next token. */ int del; /* Levels of nested nodes being deleted. */ - int spc; /* Whitespace before the next element. */ - int attr; /* The most recent attribute is valid. */ - int warn; + int flags; +#define PFLAG_WARN (1 << 0) /* Print warning messages. */ +#define PFLAG_SPC (1 << 1) /* Whitespace before the next element. */ +#define PFLAG_ATTR (1 << 2) /* The most recent attribute is valid. */ +#define PFLAG_EEND (1 << 3) /* This element is self-closing. 
*/ }; struct element { @@ -66,12 +69,14 @@ static const struct element elements[] = { { "acronym", NODE_IGNORE }, { "affiliation", NODE_AFFILIATION }, { "anchor", NODE_DELETE }, + { "appendix", NODE_APPENDIX }, { "application", NODE_APPLICATION }, { "arg", NODE_ARG }, + { "article", NODE_SECTION }, { "author", NODE_AUTHOR }, { "authorgroup", NODE_AUTHORGROUP }, { "blockquote", NODE_BLOCKQUOTE }, - { "book", NODE_BOOK }, + { "book", NODE_SECTION }, { "bookinfo", NODE_BOOKINFO }, { "caution", NODE_CAUTION }, { "chapter", NODE_SECTION }, @@ -85,9 +90,12 @@ static const struct element elements[] = { { "contrib", NODE_CONTRIB }, { "copyright", NODE_COPYRIGHT }, { "date", NODE_DATE }, + { "!doctype", NODE_DOCTYPE }, + { "!DOCTYPE", NODE_DOCTYPE }, { "editor", NODE_EDITOR }, { "email", NODE_EMAIL }, { "emphasis", NODE_EMPHASIS }, + { "!ENTITY", NODE_ENTITY }, { "entry", NODE_ENTRY }, { "envar", NODE_ENVAR }, { "errorname", NODE_ERRORNAME }, @@ -101,6 +109,11 @@ static const struct element elements[] = { { "funcsynopsis", NODE_FUNCSYNOPSIS }, { "funcsynopsisinfo", NODE_FUNCSYNOPSISINFO }, { "function", NODE_FUNCTION }, + { "glossary", NODE_VARIABLELIST }, + { "glossdef", NODE_IGNORE }, + { "glossdiv", NODE_IGNORE }, + { "glossentry", NODE_VARLISTENTRY }, + { "glosslist", NODE_VARIABLELIST }, { "glossterm", NODE_GLOSSTERM }, { "group", NODE_GROUP }, { "holder", NODE_HOLDER }, @@ -190,7 +203,7 @@ static const struct element elements[] = { { "title", NODE_TITLE }, { "trademark", NODE_IGNORE }, { "type", NODE_TYPE }, - { "ulink", NODE_ULINK }, + { "ulink", NODE_LINK }, { "userinput", NODE_LITERAL }, { "variablelist", NODE_VARIABLELIST }, { "varlistentry", NODE_VARLISTENTRY }, @@ -258,6 +271,10 @@ static const struct entity entities[] = { { NULL, NULL } }; +static size_t parse_string(struct parse *, char *, size_t, + enum pstate *, int); + + static void error_msg(struct parse *p, const char *fmt, ...) { @@ -276,7 +293,7 @@ warn_msg(struct parse *p, const char *fmt, ...) 
{ va_list ap; - if (p->warn == 0) + if ((p->flags & PFLAG_WARN) == 0) return; fprintf(stderr, "%s:%d:%d: warning: ", p->fname, p->line, p->col); @@ -311,7 +328,7 @@ xml_char(struct parse *ps, const char *p, int sz) exit(1); } dat->node = NODE_TEXT; - dat->spc = ps->spc; + dat->spc = (ps->flags & PFLAG_SPC) != 0; dat->parent = ps->cur; TAILQ_INIT(&dat->childq); TAILQ_INIT(&dat->attrq); @@ -326,18 +343,18 @@ xml_char(struct parse *ps, const char *p, int sz) /* Append to the current text node. */ assert(sz >= 0); - newsz = ps->cur->bsz + (ps->cur->bsz && ps->spc) + sz; + newsz = ps->cur->bsz + (ps->cur->bsz && (ps->flags & PFLAG_SPC)) + sz; ps->cur->b = realloc(ps->cur->b, newsz + 1); if (ps->cur->b == NULL) { perror(NULL); exit(1); } - if (ps->cur->bsz && ps->spc) + if (ps->cur->bsz && (ps->flags & PFLAG_SPC)) ps->cur->b[ps->cur->bsz++] = ' '; memcpy(ps->cur->b + ps->cur->bsz, p, sz); ps->cur->b[ps->cur->bsz = newsz] = '\0'; ps->cur->real = ps->cur->b; - ps->spc = 0; + ps->flags &= ~PFLAG_SPC; } /* @@ -353,7 +370,7 @@ pnode_closetext(struct parse *p) p->cur = n->parent; while (n->bsz > 0 && isspace((unsigned char)n->b[n->bsz - 1])) { n->b[--n->bsz] = '\0'; - p->spc = 1; + p->flags |= PFLAG_SPC; } } @@ -362,6 +379,9 @@ xml_entity(struct parse *p, const char *name) { const struct entity *entity; struct pnode *dat; + const char *ccp; + char *cp; + enum pstate pstate; if (p->del > 0) return; @@ -381,6 +401,25 @@ xml_entity(struct parse *p, const char *name) break; if (entity->roff == NULL) { + if (p->doctype != NULL) { + TAILQ_FOREACH(dat, &p->doctype->childq, child) { + if ((ccp = pnode_getattr_raw(dat, + ATTRKEY_NAME, NULL)) == NULL || + strcmp(ccp, name) != 0 || + (ccp = pnode_getattr_raw(dat, + ATTRKEY_DEFINITION, NULL)) == NULL) + continue; + if ((cp = strdup(ccp)) == NULL) { + perror(NULL); + exit(1); + } + pstate = PARSE_ELEM; + parse_string(p, cp, strlen(cp), &pstate, 0); + p->flags &= ~PFLAG_SPC; + free(cp); + return; + } + } error_msg(p, "unknown entity &%s;", 
name); return; } @@ -393,12 +432,12 @@ xml_entity(struct parse *p, const char *name) } dat->node = NODE_ESCAPE; dat->bsz = strlen(dat->b); - dat->spc = p->spc; + dat->spc = (p->flags & PFLAG_SPC) != 0; dat->parent = p->cur; TAILQ_INIT(&dat->childq); TAILQ_INIT(&dat->attrq); TAILQ_INSERT_TAIL(&p->cur->childq, dat, child); - p->spc = 0; + p->flags &= ~PFLAG_SPC; } /* @@ -410,15 +449,13 @@ xml_elem_start(struct parse *ps, const char *name) const struct element *elem; struct pnode *dat; - if (*name == '!' || *name == '?') - return; - /* * An ancestor is excluded from the tree; * keep track of the number of levels excluded. */ if (ps->del > 0) { - ps->del++; + if (*name != '!' && *name != '?') + ps->del++; return; } @@ -428,8 +465,11 @@ xml_elem_start(struct parse *ps, const char *name) if (strcmp(elem->name, name) == 0) break; - if (elem->name == NULL) + if (elem->name == NULL) { + if (*name == '!' || *name == '?') + return; error_msg(ps, "unknown element <%s>", name); + } ps->ncur = elem->node; @@ -463,7 +503,14 @@ xml_elem_start(struct parse *ps, const char *name) */ switch (dat->node = elem->node) { + case NODE_DOCTYPE: + case NODE_ENTITY: + case NODE_SBR: + ps->flags |= PFLAG_EEND; + /* FALLTHROUGH */ + case NODE_APPENDIX: case NODE_AUTHORGROUP: + case NODE_BLOCKQUOTE: case NODE_BOOKINFO: case NODE_CAUTION: case NODE_EDITOR: @@ -484,7 +531,6 @@ xml_elem_start(struct parse *ps, const char *name) case NODE_REFNAMEDIV: case NODE_REFSYNOPSISDIV: case NODE_ROW: - case NODE_SBR: case NODE_SCREEN: case NODE_SECTION: case NODE_SYNOPSIS: @@ -497,7 +543,7 @@ xml_elem_start(struct parse *ps, const char *name) dat->spc = 1; break; default: - dat->spc = ps->spc; + dat->spc = (ps->flags & PFLAG_SPC) != 0; break; } dat->parent = ps->cur; @@ -508,7 +554,12 @@ xml_elem_start(struct parse *ps, const char *name) TAILQ_INSERT_TAIL(&ps->cur->childq, dat, child); ps->cur = dat; - if (ps->tree->root == NULL) + if (dat->node == NODE_DOCTYPE) { + if (ps->doctype == NULL) + ps->doctype = 
dat; + else + error_msg(ps, "duplicate doctype"); + } else if (dat->parent == NULL && ps->tree->root == NULL) ps->tree->root = dat; } @@ -516,12 +567,21 @@ static void xml_attrkey(struct parse *ps, const char *name) { struct pattr *attr; + const char *value; enum attrkey key; - if (ps->del > 0 || *name == '\0') + if (ps->del > 0 || ps->ncur == NODE_IGNORE || *name == '\0') return; + + if ((ps->ncur == NODE_DOCTYPE || ps->ncur == NODE_ENTITY) && + TAILQ_FIRST(&ps->cur->attrq) == NULL) { + value = name; + name = "NAME"; + } else + value = NULL; + if ((key = attrkey_parse(name)) == ATTRKEY__MAX) { - ps->attr = 0; + ps->flags &= ~PFLAG_ATTR; return; } if ((attr = calloc(1, sizeof(*attr))) == NULL) { @@ -530,9 +590,19 @@ xml_attrkey(struct parse *ps, const char *name) } attr->key = key; attr->val = ATTRVAL__MAX; - attr->rawval = NULL; + if (value == NULL) { + attr->rawval = NULL; + ps->flags |= PFLAG_ATTR; + } else { + if ((attr->rawval = strdup(value)) == NULL) { + perror(NULL); + exit(1); + } + ps->flags &= ~PFLAG_ATTR; + } TAILQ_INSERT_TAIL(&ps->cur->attrq, attr, child); - ps->attr = 1; + if (ps->ncur == NODE_ENTITY && key == ATTRKEY_NAME) + xml_attrkey(ps, "DEFINITION"); } static void @@ -540,7 +610,8 @@ xml_attrval(struct parse *ps, const char *name) { struct pattr *attr; - if (ps->del > 0 || ps->attr == 0) + if (ps->del > 0 || ps->ncur == NODE_IGNORE || + (ps->flags & PFLAG_ATTR) == 0) return; if ((attr = TAILQ_LAST(&ps->cur->attrq, pattrq)) == NULL) return; @@ -589,6 +660,9 @@ xml_elem_end(struct parse *ps, const char *name) break; case NODE_IGNORE: break; + case NODE_DOCTYPE: + ps->flags &= ~PFLAG_EEND; + /* FALLTHROUGH */ default: if (ps->cur == NULL || node != ps->cur->node) { warn_msg(ps, "element not open: </%s>", name); @@ -602,11 +676,13 @@ xml_elem_end(struct parse *ps, const char *name) * obviously better than discarding it or crashing. 
*/ - if (ps->cur->parent == NULL) - ps->tree->flags |= TREE_CLOSED; - else + if (ps->cur->parent != NULL || node == NODE_DOCTYPE) { ps->cur = ps->cur->parent; - ps->spc = 0; + if (ps->cur != NULL) + ps->ncur = ps->cur->node; + } else + ps->tree->flags |= TREE_CLOSED; + ps->flags &= ~PFLAG_SPC; break; } assert(ps->del == 0); @@ -624,7 +700,10 @@ parse_alloc(int warn) free(p); return NULL; } - p->warn = warn; + if (warn) + p->flags |= PFLAG_WARN; + else + p->flags &= ~PFLAG_WARN; return p; } @@ -700,7 +779,6 @@ parse_string(struct parse *p, char *b, size_t rlen, size_t pend; /* Offset of the end of the current word. */ int elem_end; - p->spc = 0; pend = 0; for (;;) { @@ -713,7 +791,7 @@ parse_string(struct parse *p, char *b, size_t rlen, if ((poff = pend) == rlen) break; if (isspace((unsigned char)b[pend])) { - p->spc = 1; + p->flags |= PFLAG_SPC; increment(p, b, &pend, refill); continue; } @@ -754,6 +832,8 @@ parse_string(struct parse *p, char *b, size_t rlen, b[pend - 1] = '\0'; elem_end = 1; } + if (p->flags & PFLAG_EEND) + elem_end = 1; } b[pend] = '\0'; if (pend < rlen) @@ -765,6 +845,23 @@ parse_string(struct parse *p, char *b, size_t rlen, /* Look for an attribute name. 
*/ } else if (*pstate == PARSE_TAG) { + switch (p->ncur) { + case NODE_DOCTYPE: + if (b[pend] == '[') { + *pstate = PARSE_ELEM; + increment(p, b, &pend, refill); + continue; + } + /* FALLTHROUGH */ + case NODE_ENTITY: + if (b[pend] == '"' || b[pend] == '\'') { + *pstate = PARSE_ARG; + continue; + } + break; + default: + break; + } if (advance(p, b, rlen, &pend, " =>", refill)) break; elem_end = 0; @@ -775,6 +872,8 @@ parse_string(struct parse *p, char *b, size_t rlen, b[pend - 1] = '\0'; elem_end = 1; } + if (p->flags & PFLAG_EEND) + elem_end = 1; break; case '=': *pstate = PARSE_ARG; @@ -823,11 +922,21 @@ parse_string(struct parse *p, char *b, size_t rlen, if (b[++poff] == '/') { elem_end = 1; poff++; - } else + } else { xml_elem_start(p, b + poff); + if (*pstate == PARSE_ELEM && + p->flags & PFLAG_EEND) + elem_end = 1; + } if (elem_end) xml_elem_end(p, b + poff); + /* Close a doctype. */ + + } else if (p->ncur == NODE_DOCTYPE && b[poff] == ']') { + *pstate = PARSE_TAG; + increment(p, b, &pend, refill); + /* Process an entity. */ } else if (b[poff] == '&') { @@ -890,5 +999,6 @@ parse_file(struct parse *p, int fd, const char *fname) pnode_closetext(p); if ((p->tree->flags & TREE_CLOSED) == 0) warn_msg(p, "document not closed"); + pnode_unlink(p->doctype); return p->tree; }