=================================================================== RCS file: /cvs/docbook2mdoc/parse.c,v retrieving revision 1.25 retrieving revision 1.51 diff -u -p -r1.25 -r1.51 --- docbook2mdoc/parse.c 2019/04/08 23:40:17 1.25 +++ docbook2mdoc/parse.c 2019/04/24 18:38:02 1.51 @@ -1,4 +1,4 @@ -/* $Id: parse.c,v 1.25 2019/04/08 23:40:17 schwarze Exp $ */ +/* $Id: parse.c,v 1.51 2019/04/24 18:38:02 schwarze Exp $ */ /* * Copyright (c) 2014 Kristaps Dzonsons * Copyright (c) 2019 Ingo Schwarze @@ -56,165 +56,76 @@ struct parse { int nline; /* Line number of next token. */ int ncol; /* Column number of next token. */ int del; /* Levels of nested nodes being deleted. */ + int nofill; /* Levels of open no-fill displays. */ int flags; #define PFLAG_WARN (1 << 0) /* Print warning messages. */ -#define PFLAG_SPC (1 << 1) /* Whitespace before the next element. */ -#define PFLAG_ATTR (1 << 2) /* The most recent attribute is valid. */ -#define PFLAG_EEND (1 << 3) /* This element is self-closing. */ +#define PFLAG_LINE (1 << 1) /* New line before the next element. */ +#define PFLAG_SPC (1 << 2) /* Whitespace before the next element. */ +#define PFLAG_ATTR (1 << 3) /* The most recent attribute is valid. */ +#define PFLAG_EEND (1 << 4) /* This element is self-closing. */ }; -struct element { +struct alias { const char *name; /* DocBook element name. */ enum nodeid node; /* Node type to generate. 
*/ }; -static const struct element elements[] = { +static const struct alias aliases[] = { { "acronym", NODE_IGNORE }, - { "affiliation", NODE_AFFILIATION }, + { "affiliation", NODE_IGNORE }, { "anchor", NODE_DELETE }, - { "appendix", NODE_APPENDIX }, - { "application", NODE_APPLICATION }, - { "arg", NODE_ARG }, + { "application", NODE_COMMAND }, { "article", NODE_SECTION }, - { "author", NODE_AUTHOR }, - { "authorgroup", NODE_AUTHORGROUP }, - { "blockquote", NODE_BLOCKQUOTE }, + { "articleinfo", NODE_BOOKINFO }, { "book", NODE_SECTION }, - { "bookinfo", NODE_BOOKINFO }, - { "caution", NODE_CAUTION }, { "chapter", NODE_SECTION }, - { "citerefentry", NODE_CITEREFENTRY }, - { "citetitle", NODE_CITETITLE }, - { "cmdsynopsis", NODE_CMDSYNOPSIS }, + { "caption", NODE_IGNORE }, { "code", NODE_LITERAL }, - { "colspec", NODE_COLSPEC }, - { "command", NODE_COMMAND }, - { "constant", NODE_CONSTANT }, - { "contrib", NODE_CONTRIB }, - { "copyright", NODE_COPYRIGHT }, - { "date", NODE_DATE }, + { "computeroutput", NODE_LITERAL }, { "!doctype", NODE_DOCTYPE }, - { "!DOCTYPE", NODE_DOCTYPE }, - { "editor", NODE_EDITOR }, - { "email", NODE_EMAIL }, - { "emphasis", NODE_EMPHASIS }, - { "!ENTITY", NODE_ENTITY }, - { "entry", NODE_ENTRY }, - { "envar", NODE_ENVAR }, - { "errorname", NODE_ERRORNAME }, - { "fieldsynopsis", NODE_FIELDSYNOPSIS }, - { "filename", NODE_FILENAME }, + { "figure", NODE_IGNORE }, { "firstname", NODE_PERSONNAME }, - { "firstterm", NODE_FIRSTTERM }, - { "footnote", NODE_FOOTNOTE }, - { "funcdef", NODE_FUNCDEF }, - { "funcprototype", NODE_FUNCPROTOTYPE }, - { "funcsynopsis", NODE_FUNCSYNOPSIS }, - { "funcsynopsisinfo", NODE_FUNCSYNOPSISINFO }, - { "function", NODE_FUNCTION }, { "glossary", NODE_VARIABLELIST }, { "glossdef", NODE_IGNORE }, { "glossdiv", NODE_IGNORE }, { "glossentry", NODE_VARLISTENTRY }, { "glosslist", NODE_VARIABLELIST }, - { "glossterm", NODE_GLOSSTERM }, - { "group", NODE_GROUP }, - { "holder", NODE_HOLDER }, - { "index", NODE_INDEX }, + { 
"holder", NODE_IGNORE }, + { "imageobject", NODE_IGNORE }, { "indexterm", NODE_DELETE }, - { "info", NODE_INFO }, - { "informalequation", NODE_INFORMALEQUATION }, { "informaltable", NODE_TABLE }, - { "inlineequation", NODE_INLINEEQUATION }, - { "itemizedlist", NODE_ITEMIZEDLIST }, - { "keysym", NODE_KEYSYM }, - { "legalnotice", NODE_LEGALNOTICE }, - { "link", NODE_LINK }, - { "listitem", NODE_LISTITEM }, - { "literal", NODE_LITERAL }, - { "literallayout", NODE_LITERALLAYOUT }, - { "manvolnum", NODE_MANVOLNUM }, - { "member", NODE_MEMBER }, - { "mml:math", NODE_MML_MATH }, - { "mml:mfenced", NODE_MML_MFENCED }, - { "mml:mfrac", NODE_MML_MFRAC }, - { "mml:mi", NODE_MML_MI }, - { "mml:mn", NODE_MML_MN }, - { "mml:mo", NODE_MML_MO }, - { "mml:mrow", NODE_MML_MROW }, - { "mml:msub", NODE_MML_MSUB }, - { "mml:msup", NODE_MML_MSUP }, - { "modifier", NODE_MODIFIER }, - { "note", NODE_NOTE }, - { "option", NODE_OPTION }, - { "orderedlist", NODE_ORDEREDLIST }, - { "orgname", NODE_ORGNAME }, + { "keycap", NODE_KEYSYM }, + { "keycode", NODE_IGNORE }, + { "mediaobject", NODE_BLOCKQUOTE }, + { "orgname", NODE_IGNORE }, + { "othercredit", NODE_AUTHOR }, { "othername", NODE_PERSONNAME }, - { "para", NODE_PARA }, - { "paramdef", NODE_PARAMDEF }, - { "parameter", NODE_PARAMETER }, { "part", NODE_SECTION }, - { "personname", NODE_PERSONNAME }, { "phrase", NODE_IGNORE }, - { "preface", NODE_PREFACE }, { "primary", NODE_DELETE }, - { "programlisting", NODE_PROGRAMLISTING }, - { "prompt", NODE_PROMPT }, - { "quote", NODE_QUOTE }, - { "refclass", NODE_REFCLASS }, - { "refdescriptor", NODE_REFDESCRIPTOR }, - { "refentry", NODE_REFENTRY }, - { "refentryinfo", NODE_REFENTRYINFO }, - { "refentrytitle", NODE_REFENTRYTITLE }, - { "refmeta", NODE_REFMETA }, - { "refmetainfo", NODE_REFMETAINFO }, - { "refmiscinfo", NODE_REFMISCINFO }, - { "refname", NODE_REFNAME }, - { "refnamediv", NODE_REFNAMEDIV }, - { "refpurpose", NODE_REFPURPOSE }, + { "property", NODE_PARAMETER }, { "refsect1", 
NODE_SECTION }, { "refsect2", NODE_SECTION }, { "refsect3", NODE_SECTION }, { "refsection", NODE_SECTION }, - { "refsynopsisdiv", NODE_REFSYNOPSISDIV }, - { "releaseinfo", NODE_RELEASEINFO }, - { "replaceable", NODE_REPLACEABLE }, - { "row", NODE_ROW }, - { "sbr", NODE_SBR }, - { "screen", NODE_SCREEN }, + { "releaseinfo", NODE_IGNORE }, + { "returnvalue", NODE_IGNORE }, { "secondary", NODE_DELETE }, { "sect1", NODE_SECTION }, { "sect2", NODE_SECTION }, - { "section", NODE_SECTION }, - { "sgmltag", NODE_SGMLTAG }, + { "sect3", NODE_SECTION }, + { "sect4", NODE_SECTION }, + { "sgmltag", NODE_MARKUP }, { "simpara", NODE_PARA }, - { "simplelist", NODE_SIMPLELIST }, - { "spanspec", NODE_SPANSPEC }, { "structfield", NODE_PARAMETER }, { "structname", NODE_TYPE }, - { "subtitle", NODE_SUBTITLE }, { "surname", NODE_PERSONNAME }, { "symbol", NODE_CONSTANT }, - { "synopsis", NODE_SYNOPSIS }, - { "table", NODE_TABLE }, - { "tbody", NODE_TBODY }, - { "term", NODE_TERM }, - { "tfoot", NODE_TFOOT }, - { "tgroup", NODE_TGROUP }, - { "thead", NODE_THEAD }, - { "tip", NODE_TIP }, - { "title", NODE_TITLE }, + { "tag", NODE_MARKUP }, { "trademark", NODE_IGNORE }, - { "type", NODE_TYPE }, { "ulink", NODE_LINK }, { "userinput", NODE_LITERAL }, - { "variablelist", NODE_VARIABLELIST }, - { "varlistentry", NODE_VARLISTENTRY }, - { "varname", NODE_VARNAME }, - { "warning", NODE_WARNING }, - { "wordasword", NODE_WORDASWORD }, - { "xi:include", NODE_DELETE_WARN }, - { "year", NODE_YEAR }, + { "year", NODE_IGNORE }, { NULL, NODE_IGNORE } }; @@ -280,16 +191,24 @@ static void parse_fd(struct parse *, int); static void +fatal(struct parse *p) +{ + fprintf(stderr, "%s:%d:%d: FATAL: ", p->fname, p->line, p->col); + perror(NULL); + exit(6); +} + +static void error_msg(struct parse *p, const char *fmt, ...) 
{ va_list ap; - fprintf(stderr, "%s:%d:%d: ", p->fname, p->line, p->col); + fprintf(stderr, "%s:%d:%d: ERROR: ", p->fname, p->line, p->col); va_start(ap, fmt); vfprintf(stderr, fmt, ap); va_end(ap); fputc('\n', stderr); - p->tree->flags |= TREE_FAIL; + p->tree->flags |= TREE_ERROR; } static void @@ -300,11 +219,12 @@ warn_msg(struct parse *p, const char *fmt, ...) if ((p->flags & PFLAG_WARN) == 0) return; - fprintf(stderr, "%s:%d:%d: warning: ", p->fname, p->line, p->col); + fprintf(stderr, "%s:%d:%d: WARNING: ", p->fname, p->line, p->col); va_start(ap, fmt); vfprintf(stderr, fmt, ap); va_end(ap); fputc('\n', stderr); + p->tree->flags |= TREE_WARN; } /* @@ -313,78 +233,159 @@ warn_msg(struct parse *p, const char *fmt, ...) * Otherwise, create a new one as a child of the current node. */ static void -xml_char(struct parse *ps, const char *p, int sz) +xml_text(struct parse *p, const char *word, int sz) { - struct pnode *dat; - size_t newsz; + struct pnode *n, *np; + size_t oldsz, newsz; + int i; - if (ps->del > 0) + assert(sz > 0); + if (p->del > 0) return; - if (ps->cur == NULL) { - error_msg(ps, "discarding text before document: %.*s", sz, p); + if ((n = p->cur) == NULL) { + error_msg(p, "discarding text before document: %.*s", + sz, word); return; } - if (ps->cur->node != NODE_TEXT) { - if ((dat = calloc(1, sizeof(*dat))) == NULL) { - perror(NULL); - exit(1); - } - dat->node = NODE_TEXT; - dat->spc = (ps->flags & PFLAG_SPC) != 0; - dat->parent = ps->cur; - TAILQ_INIT(&dat->childq); - TAILQ_INIT(&dat->attrq); - TAILQ_INSERT_TAIL(&ps->cur->childq, dat, child); - ps->cur = dat; + /* Append to the current text node, if one is open. 
*/ + + if (n->node == NODE_TEXT) { + oldsz = strlen(n->b); + newsz = oldsz + sz; + if (oldsz && (p->flags & PFLAG_SPC)) + newsz++; + if ((n->b = realloc(n->b, newsz + 1)) == NULL) + fatal(p); + if (oldsz && (p->flags & PFLAG_SPC)) + n->b[oldsz++] = ' '; + memcpy(n->b + oldsz, word, sz); + n->b[newsz] = '\0'; + p->flags &= ~(PFLAG_LINE | PFLAG_SPC); + return; } - if (ps->tree->flags & TREE_CLOSED && - ps->cur->parent == ps->tree->root) - warn_msg(ps, "text after end of document: %.*s", sz, p); + if (p->tree->flags & TREE_CLOSED && n == p->tree->root) + warn_msg(p, "text after end of document: %.*s", sz, word); - /* Append to the current text node. */ + /* Create a new text node. */ - assert(sz >= 0); - newsz = ps->cur->bsz + (ps->cur->bsz && (ps->flags & PFLAG_SPC)) + sz; - ps->cur->b = realloc(ps->cur->b, newsz + 1); - if (ps->cur->b == NULL) { - perror(NULL); - exit(1); + if ((n = pnode_alloc(p->cur)) == NULL) + fatal(p); + n->node = NODE_TEXT; + n->flags = ((p->flags & PFLAG_LINE) ? NFLAG_LINE : 0) | + ((p->flags & PFLAG_SPC) ? NFLAG_SPC : 0); + p->flags &= ~(PFLAG_LINE | PFLAG_SPC); + + /* + * If this node follows an in-line macro without intervening + * whitespace, keep the text in it as short as possible, + * and do not keep it open. + */ + + np = n->flags & NFLAG_SPC ? 
NULL : TAILQ_PREV(n, pnodeq, child); + while (np != NULL) { + switch (pnode_class(np->node)) { + case CLASS_VOID: + case CLASS_TEXT: + case CLASS_BLOCK: + case CLASS_NOFILL: + np = NULL; + break; + case CLASS_TRANS: + np = TAILQ_LAST(&np->childq, pnodeq); + continue; + case CLASS_LINE: + case CLASS_ENCL: + break; + } + break; } - if (ps->cur->bsz && (ps->flags & PFLAG_SPC)) - ps->cur->b[ps->cur->bsz++] = ' '; - memcpy(ps->cur->b + ps->cur->bsz, p, sz); - ps->cur->b[ps->cur->bsz = newsz] = '\0'; - ps->cur->real = ps->cur->b; - ps->flags &= ~PFLAG_SPC; + if (np != NULL) { + i = 0; + while (i < sz && !isspace((unsigned char)word[i])) + i++; + if ((n->b = strndup(word, i)) == NULL) + fatal(p); + if (i == sz) + return; + while (i < sz && isspace((unsigned char)word[i])) + i++; + if (i == sz) { + p->flags |= PFLAG_SPC; + return; + } + + /* Put any remaining text into a second node. */ + + if ((n = pnode_alloc(p->cur)) == NULL) + fatal(p); + n->node = NODE_TEXT; + n->flags |= NFLAG_SPC; + word += i; + sz -= i; + } + if ((n->b = strndup(word, sz)) == NULL) + fatal(p); + + /* The new node remains open for later pnode_closetext(). */ + + p->cur = n; } /* * Close out the text node and strip trailing whitespace, if one is open. */ static void -pnode_closetext(struct parse *p) +pnode_closetext(struct parse *p, int check_last_word) { struct pnode *n; + char *cp, *last_word; if ((n = p->cur) == NULL || n->node != NODE_TEXT) return; p->cur = n->parent; - while (n->bsz > 0 && isspace((unsigned char)n->b[n->bsz - 1])) { - n->b[--n->bsz] = '\0'; + for (cp = strchr(n->b, '\0'); + cp > n->b && isspace((unsigned char)cp[-1]); + *--cp = '\0') p->flags |= PFLAG_SPC; - } + + if (p->flags & PFLAG_SPC || !check_last_word) + return; + + /* + * Find the beginning of the last word + * and delete whitespace before it. 
+ */ + + while (cp > n->b && !isspace((unsigned char)cp[-1])) + cp--; + if (cp == n->b) + return; + + last_word = cp; + while (cp > n->b && isspace((unsigned char)cp[-1])) + *--cp = '\0'; + + /* Move the last word into its own node, for use with .Pf. */ + + if ((n = pnode_alloc(p->cur)) == NULL) + fatal(p); + n->node = NODE_TEXT; + n->flags |= NFLAG_SPC; + if ((n->b = strdup(last_word)) == NULL) + fatal(p); } static void xml_entity(struct parse *p, const char *name) { const struct entity *entity; - struct pnode *dat; + struct pnode *n; const char *ccp; char *cp; + unsigned int codepoint; enum pstate pstate; if (p->del > 0) @@ -395,7 +396,7 @@ xml_entity(struct parse *p, const char *name) return; } - pnode_closetext(p); + pnode_closetext(p, 0); if (p->tree->flags & TREE_CLOSED && p->cur == p->tree->root) warn_msg(p, "entity after end of document: &%s;", name); @@ -406,231 +407,216 @@ xml_entity(struct parse *p, const char *name) if (entity->roff == NULL) { if (p->doctype != NULL) { - TAILQ_FOREACH(dat, &p->doctype->childq, child) { - if ((ccp = pnode_getattr_raw(dat, + TAILQ_FOREACH(n, &p->doctype->childq, child) { + if ((ccp = pnode_getattr_raw(n, ATTRKEY_NAME, NULL)) == NULL || strcmp(ccp, name) != 0) continue; - if ((ccp = pnode_getattr_raw(dat, + if ((ccp = pnode_getattr_raw(n, ATTRKEY_SYSTEM, NULL)) != NULL) { parse_file(p, -1, ccp); - p->flags &= ~PFLAG_SPC; + p->flags &= ~(PFLAG_LINE | PFLAG_SPC); return; } - if ((ccp = pnode_getattr_raw(dat, + if ((ccp = pnode_getattr_raw(n, ATTRKEY_DEFINITION, NULL)) == NULL) continue; - if ((cp = strdup(ccp)) == NULL) { - perror(NULL); - exit(1); - } + if ((cp = strdup(ccp)) == NULL) + fatal(p); pstate = PARSE_ELEM; parse_string(p, cp, strlen(cp), &pstate, 0); - p->flags &= ~PFLAG_SPC; + p->flags &= ~(PFLAG_LINE | PFLAG_SPC); free(cp); return; } } + if (*name == '#') { + codepoint = strtonum(name + 1, 0, 0x10ffff, &ccp); + if (ccp == NULL) { + if ((n = pnode_alloc(p->cur)) == NULL || + asprintf(&n->b, "\\[u%4.4X]", + 
codepoint) < 0) + fatal(p); + goto done; + } + } error_msg(p, "unknown entity &%s;", name); return; } /* Create, append, and close out an entity node. */ - if ((dat = calloc(1, sizeof(*dat))) == NULL || - (dat->b = dat->real = strdup(entity->roff)) == NULL) { - perror(NULL); - exit(1); - } - dat->node = NODE_ESCAPE; - dat->bsz = strlen(dat->b); - dat->spc = (p->flags & PFLAG_SPC) != 0; - dat->parent = p->cur; - TAILQ_INIT(&dat->childq); - TAILQ_INIT(&dat->attrq); - TAILQ_INSERT_TAIL(&p->cur->childq, dat, child); - p->flags &= ~PFLAG_SPC; + if ((n = pnode_alloc(p->cur)) == NULL || + (n->b = strdup(entity->roff)) == NULL) + fatal(p); +done: + n->node = NODE_ESCAPE; + n->flags = ((p->flags & PFLAG_LINE) ? NFLAG_LINE : 0) | + ((p->flags & PFLAG_SPC) ? NFLAG_SPC : 0); + p->flags &= ~(PFLAG_LINE | PFLAG_SPC); } /* + * Parse an element name. + */ +static enum nodeid +xml_name2node(struct parse *p, const char *name) +{ + const struct alias *alias; + enum nodeid node; + + if ((node = pnode_parse(name)) < NODE_UNKNOWN) + return node; + + for (alias = aliases; alias->name != NULL; alias++) + if (strcmp(alias->name, name) == 0) + return alias->node; + + return NODE_UNKNOWN; +} + +/* * Begin an element. */ static void -xml_elem_start(struct parse *ps, const char *name) +xml_elem_start(struct parse *p, const char *name) { - const struct element *elem; - struct pnode *dat; + struct pnode *n; /* * An ancestor is excluded from the tree; * keep track of the number of levels excluded. */ - if (ps->del > 0) { + if (p->del > 0) { if (*name != '!' && *name != '?') - ps->del++; + p->del++; return; } - pnode_closetext(ps); - - for (elem = elements; elem->name != NULL; elem++) - if (strcmp(elem->name, name) == 0) - break; - - if (elem->name == NULL) { - if (*name == '!' 
|| *name == '?') - return; - error_msg(ps, "unknown element <%s>", name); - } - - ps->ncur = elem->node; - - switch (ps->ncur) { + switch (p->ncur = xml_name2node(p, name)) { case NODE_DELETE_WARN: - warn_msg(ps, "skipping element <%s>", name); + warn_msg(p, "skipping element <%s>", name); /* FALLTHROUGH */ case NODE_DELETE: - ps->del = 1; + p->del = 1; /* FALLTHROUGH */ case NODE_IGNORE: return; - case NODE_INLINEEQUATION: - ps->tree->flags |= TREE_EQN; - break; + case NODE_UNKNOWN: + if (*name != '!' && *name != '?') + error_msg(p, "unknown element <%s>", name); + return; default: break; } - if (ps->tree->flags & TREE_CLOSED && ps->cur->parent == NULL) - warn_msg(ps, "element after end of document: <%s>", name); + if (p->tree->flags & TREE_CLOSED && p->cur->parent == NULL) + warn_msg(p, "element after end of document: <%s>", name); - if ((dat = calloc(1, sizeof(*dat))) == NULL) { - perror(NULL); - exit(1); + switch (pnode_class(p->ncur)) { + case CLASS_LINE: + case CLASS_ENCL: + pnode_closetext(p, 1); + break; + default: + pnode_closetext(p, 0); + break; } + if ((n = pnode_alloc(p->cur)) == NULL) + fatal(p); + /* + * Some elements are self-closing. * Nodes that begin a new macro or request line or start by * printing text always want whitespace before themselves. 
*/ - switch (dat->node = elem->node) { + switch (n->node = p->ncur) { case NODE_DOCTYPE: case NODE_ENTITY: case NODE_SBR: - ps->flags |= PFLAG_EEND; - /* FALLTHROUGH */ - case NODE_APPENDIX: - case NODE_AUTHORGROUP: - case NODE_BLOCKQUOTE: - case NODE_BOOKINFO: - case NODE_CAUTION: - case NODE_EDITOR: - case NODE_ENTRY: - case NODE_FUNCDEF: - case NODE_FUNCPROTOTYPE: - case NODE_INFORMALEQUATION: - case NODE_INLINEEQUATION: - case NODE_ITEMIZEDLIST: - case NODE_LEGALNOTICE: - case NODE_LITERALLAYOUT: - case NODE_NOTE: - case NODE_ORDEREDLIST: - case NODE_PARA: - case NODE_PREFACE: - case NODE_PROGRAMLISTING: - case NODE_REFMETA: - case NODE_REFNAMEDIV: - case NODE_REFSYNOPSISDIV: - case NODE_ROW: - case NODE_SCREEN: - case NODE_SECTION: - case NODE_SYNOPSIS: - case NODE_TGROUP: - case NODE_TIP: - case NODE_TITLE: - case NODE_VARIABLELIST: - case NODE_VARLISTENTRY: - case NODE_WARNING: - dat->spc = 1; + case NODE_VOID: + p->flags |= PFLAG_EEND; break; default: - dat->spc = (ps->flags & PFLAG_SPC) != 0; break; } - dat->parent = ps->cur; - TAILQ_INIT(&dat->childq); - TAILQ_INIT(&dat->attrq); - - if (ps->cur != NULL) - TAILQ_INSERT_TAIL(&ps->cur->childq, dat, child); - - ps->cur = dat; - if (dat->node == NODE_DOCTYPE) { - if (ps->doctype == NULL) - ps->doctype = dat; + switch (pnode_class(p->ncur)) { + case CLASS_LINE: + case CLASS_ENCL: + n->flags = ((p->flags & PFLAG_LINE) ? NFLAG_LINE : 0) | + ((p->flags & PFLAG_SPC) ? 
NFLAG_SPC : 0); + break; + case CLASS_NOFILL: + p->nofill++; + /* FALLTHROUGH */ + default: + n->flags |= NFLAG_SPC; + break; + } + p->cur = n; + if (n->node == NODE_DOCTYPE) { + if (p->doctype == NULL) + p->doctype = n; else - error_msg(ps, "duplicate doctype"); - } else if (dat->parent == NULL && ps->tree->root == NULL) - ps->tree->root = dat; + error_msg(p, "duplicate doctype"); + } else if (n->parent == NULL && p->tree->root == NULL) + p->tree->root = n; } static void -xml_attrkey(struct parse *ps, const char *name) +xml_attrkey(struct parse *p, const char *name) { - struct pattr *attr; + struct pattr *a; const char *value; enum attrkey key; - if (ps->del > 0 || ps->ncur == NODE_IGNORE || *name == '\0') + if (p->del > 0 || p->ncur >= NODE_UNKNOWN || *name == '\0') return; - if ((ps->ncur == NODE_DOCTYPE || ps->ncur == NODE_ENTITY) && - TAILQ_FIRST(&ps->cur->attrq) == NULL) { + if ((p->ncur == NODE_DOCTYPE || p->ncur == NODE_ENTITY) && + TAILQ_FIRST(&p->cur->attrq) == NULL) { value = name; name = "NAME"; } else value = NULL; if ((key = attrkey_parse(name)) == ATTRKEY__MAX) { - ps->flags &= ~PFLAG_ATTR; + p->flags &= ~PFLAG_ATTR; return; } - if ((attr = calloc(1, sizeof(*attr))) == NULL) { - perror(NULL); - exit(1); - } - attr->key = key; - attr->val = ATTRVAL__MAX; + if ((a = calloc(1, sizeof(*a))) == NULL) + fatal(p); + + a->key = key; + a->val = ATTRVAL__MAX; if (value == NULL) { - attr->rawval = NULL; - ps->flags |= PFLAG_ATTR; + a->rawval = NULL; + p->flags |= PFLAG_ATTR; } else { - if ((attr->rawval = strdup(value)) == NULL) { - perror(NULL); - exit(1); - } - ps->flags &= ~PFLAG_ATTR; + if ((a->rawval = strdup(value)) == NULL) + fatal(p); + p->flags &= ~PFLAG_ATTR; } - TAILQ_INSERT_TAIL(&ps->cur->attrq, attr, child); - if (ps->ncur == NODE_ENTITY && key == ATTRKEY_NAME) - xml_attrkey(ps, "DEFINITION"); + TAILQ_INSERT_TAIL(&p->cur->attrq, a, child); + if (p->ncur == NODE_ENTITY && key == ATTRKEY_NAME) + xml_attrkey(p, "DEFINITION"); } static void 
-xml_attrval(struct parse *ps, const char *name) +xml_attrval(struct parse *p, const char *name) { - struct pattr *attr; + struct pattr *a; - if (ps->del > 0 || ps->ncur == NODE_IGNORE || - (ps->flags & PFLAG_ATTR) == 0) + if (p->del > 0 || p->ncur >= NODE_UNKNOWN || + (p->flags & PFLAG_ATTR) == 0) return; - if ((attr = TAILQ_LAST(&ps->cur->attrq, pattrq)) == NULL) + if ((a = TAILQ_LAST(&p->cur->attrq, pattrq)) == NULL) return; - if ((attr->val = attrval_parse(name)) == ATTRVAL__MAX && - (attr->rawval = strdup(name)) == NULL) { - perror(NULL); - exit(1); - } + if ((a->val = attrval_parse(name)) == ATTRVAL__MAX && + (a->rawval = strdup(name)) == NULL) + fatal(p); + p->flags &= ~PFLAG_ATTR; } /* @@ -638,47 +624,59 @@ xml_attrval(struct parse *ps, const char *name) * If we're at a text node, roll that one up first. */ static void -xml_elem_end(struct parse *ps, const char *name) +xml_elem_end(struct parse *p, const char *name) { - const struct element *elem; + struct pnode *n; + const char *cp; enum nodeid node; /* * An ancestor is excluded from the tree; * keep track of the number of levels excluded. */ - if (ps->del > 1) { - ps->del--; + if (p->del > 1) { + p->del--; return; } - if (ps->del == 0) - pnode_closetext(ps); + if (p->del == 0) + pnode_closetext(p, 0); - if (name != NULL) { - for (elem = elements; elem->name != NULL; elem++) - if (strcmp(elem->name, name) == 0) - break; - node = elem->node; - } else - node = ps->ncur; + n = p->cur; + node = name == NULL ? 
p->ncur : xml_name2node(p, name); switch (node) { case NODE_DELETE_WARN: case NODE_DELETE: - if (ps->del > 0) - ps->del--; + if (p->del > 0) + p->del--; break; case NODE_IGNORE: + case NODE_UNKNOWN: break; + case NODE_INCLUDE: + p->cur = n->parent; + cp = pnode_getattr_raw(n, ATTRKEY_HREF, NULL); + if (cp == NULL) + error_msg(p, "<xi:include> element " + "without href attribute"); + else + parse_file(p, -1, cp); + pnode_unlink(n); + p->flags &= ~(PFLAG_LINE | PFLAG_SPC); + break; case NODE_DOCTYPE: - ps->flags &= ~PFLAG_EEND; + case NODE_SBR: + case NODE_VOID: + p->flags &= ~PFLAG_EEND; /* FALLTHROUGH */ default: - if (ps->cur == NULL || node != ps->cur->node) { - warn_msg(ps, "element not open: </%s>", name); + if (n == NULL || node != n->node) { + warn_msg(p, "element not open: </%s>", name); break; } + if (pnode_class(node) == CLASS_NOFILL) + p->nofill--; /* * Refrain from actually closing the document element. @@ -687,16 +685,24 @@ xml_elem_end(struct parse *ps, const char *name) * obviously better than discarding it or crashing. */ - if (ps->cur->parent != NULL || node == NODE_DOCTYPE) { - ps->cur = ps->cur->parent; - if (ps->cur != NULL) - ps->ncur = ps->cur->node; + if (n->parent != NULL || node == NODE_DOCTYPE) { + p->cur = n->parent; + if (p->cur != NULL) + p->ncur = p->cur->node; } else - ps->tree->flags |= TREE_CLOSED; - ps->flags &= ~PFLAG_SPC; + p->tree->flags |= TREE_CLOSED; + p->flags &= ~(PFLAG_LINE | PFLAG_SPC); + + /* Include a file containing entity declarations. */ + + if (node == NODE_ENTITY && strcmp("%", + pnode_getattr_raw(n, ATTRKEY_NAME, "")) == 0 && + (cp = pnode_getattr_raw(n, ATTRKEY_SYSTEM, NULL)) != NULL) + parse_file(p, -1, cp); + break; } - assert(ps->del == 0); + assert(p->del == 0); } struct parse * @@ -786,11 +792,12 @@ parse_string(struct parse *p, char *b, size_t rlen, enum pstate *pstate, int refill) { char *cp; + size_t pws; /* Parse offset including whitespace. */ size_t poff; /* Parse offset in b[]. 
*/ size_t pend; /* Offset of the end of the current word. */ int elem_end; - pend = 0; + pend = pws = 0; for (;;) { /* Proceed to the next token, skipping whitespace. */ @@ -803,6 +810,10 @@ parse_string(struct parse *p, char *b, size_t rlen, break; if (isspace((unsigned char)b[pend])) { p->flags |= PFLAG_SPC; + if (b[pend] == '\n') { + p->flags |= PFLAG_LINE; + pws = pend + 1; + } increment(p, b, &pend, refill); continue; } @@ -961,9 +972,16 @@ parse_string(struct parse *p, char *b, size_t rlen, /* Process text up to the next tag, entity, or EOL. */ } else { - advance(p, b, rlen, &pend, "<&", refill); - xml_char(p, b + poff, pend - poff); + advance(p, b, rlen, &pend, + p->ncur == NODE_DOCTYPE ? "<&]\n" : "<&\n", + refill); + if (p->nofill) + poff = pws; + xml_text(p, b + poff, pend - poff); + if (b[pend] == '\n') + pnode_closetext(p, 0); } + pws = pend; } return poff; } @@ -1047,7 +1065,7 @@ parse_file(struct parse *p, int fd, const char *fname) /* On the top level, finalize the parse tree. */ if (save_fname == NULL) { - pnode_closetext(p); + pnode_closetext(p, 0); if (p->tree->root == NULL) error_msg(p, "empty document"); else if ((p->tree->flags & TREE_CLOSED) == 0)