=================================================================== RCS file: /cvs/docbook2mdoc/parse.c,v retrieving revision 1.2 retrieving revision 1.52 diff -u -p -r1.2 -r1.52 --- docbook2mdoc/parse.c 2019/03/26 20:54:43 1.2 +++ docbook2mdoc/parse.c 2019/04/25 17:57:59 1.52 @@ -1,4 +1,4 @@ -/* $Id: parse.c,v 1.2 2019/03/26 20:54:43 schwarze Exp $ */ +/* $Id: parse.c,v 1.52 2019/04/25 17:57:59 schwarze Exp $ */ /* * Copyright (c) 2014 Kristaps Dzonsons * Copyright (c) 2019 Ingo Schwarze @@ -17,8 +17,12 @@ */ #include #include -#include +#include +#include +#include +#include #include +#include #include #include @@ -29,345 +33,677 @@ * The implementation of the DocBook parser. */ +enum pstate { + PARSE_ELEM, + PARSE_TAG, + PARSE_ARG, + PARSE_SQ, + PARSE_DQ +}; + /* * Global parse state. * Keep this as simple and small as possible. */ struct parse { - XML_Parser xml; const char *fname; /* Name of the input file. */ struct ptree *tree; /* Complete parse result. */ + struct pnode *doctype; struct pnode *cur; /* Current node in the tree. */ - int warn; + enum nodeid ncur; /* Type of the current node. */ + int line; /* Line number in the input file. */ + int col; /* Column number in the input file. */ + int nline; /* Line number of next token. */ + int ncol; /* Column number of next token. */ + int del; /* Levels of nested nodes being deleted. */ + int nofill; /* Levels of open no-fill displays. */ + int flags; +#define PFLAG_WARN (1 << 0) /* Print warning messages. */ +#define PFLAG_LINE (1 << 1) /* New line before the next element. */ +#define PFLAG_SPC (1 << 2) /* Whitespace before the next element. */ +#define PFLAG_ATTR (1 << 3) /* The most recent attribute is valid. */ +#define PFLAG_EEND (1 << 4) /* This element is self-closing. */ }; -struct element { +struct alias { const char *name; /* DocBook element name. */ enum nodeid node; /* Node type to generate. */ }; -static const struct element elements[] = { - { "acronym", NODE_ACRONYM }, - { "affiliation", NODE_AFFILIATION }, - { "anchor", NODE_ANCHOR }, - { "application", NODE_APPLICATION }, - { "arg", NODE_ARG }, - { "author", NODE_AUTHOR }, - { "authorgroup", NODE_AUTHORGROUP }, - { "blockquote", NODE_BLOCKQUOTE }, - { "book", NODE_BOOK }, - { "bookinfo", NODE_BOOKINFO }, - { "caution", NODE_CAUTION }, +static const struct alias aliases[] = { + { "acronym", NODE_IGNORE }, + { "affiliation", NODE_IGNORE }, + { "anchor", NODE_DELETE }, + { "application", NODE_COMMAND }, + { "article", NODE_SECTION }, + { "articleinfo", NODE_BOOKINFO }, + { "book", NODE_SECTION }, { "chapter", NODE_SECTION }, - { "citerefentry", NODE_CITEREFENTRY }, - { "citetitle", NODE_CITETITLE }, - { "cmdsynopsis", NODE_CMDSYNOPSIS }, - { "code", NODE_CODE }, - { "colspec", NODE_COLSPEC }, - { "command", NODE_COMMAND }, - { "constant", NODE_CONSTANT }, - { "copyright", NODE_COPYRIGHT }, - { "date", NODE_DATE }, - { "editor", NODE_EDITOR }, - { "email", NODE_EMAIL }, - { "emphasis", NODE_EMPHASIS }, - { "entry", NODE_ENTRY }, - { "envar", NODE_ENVAR }, - { "fieldsynopsis", NODE_FIELDSYNOPSIS }, - { "filename", NODE_FILENAME }, - { "firstname", NODE_FIRSTNAME }, - { "firstterm", NODE_FIRSTTERM }, - { "footnote", NODE_FOOTNOTE }, - { "funcdef", NODE_FUNCDEF }, - { "funcprototype", NODE_FUNCPROTOTYPE }, - { "funcsynopsis", NODE_FUNCSYNOPSIS }, - { "funcsynopsisinfo", NODE_FUNCSYNOPSISINFO }, - { "function", NODE_FUNCTION }, - { "glossterm", NODE_GLOSSTERM }, - { "group", NODE_GROUP }, - { "holder", NODE_HOLDER }, - { "index", NODE_INDEX }, - { "indexterm", NODE_INDEXTERM }, - { "info", NODE_INFO }, - { "informalequation", NODE_INFORMALEQUATION }, - { "informaltable", NODE_INFORMALTABLE }, - { "inlineequation", NODE_INLINEEQUATION }, - { "itemizedlist", NODE_ITEMIZEDLIST }, - { "keysym", NODE_KEYSYM }, - { "legalnotice", NODE_LEGALNOTICE }, - { "link", NODE_LINK }, - { "listitem", NODE_LISTITEM }, - { "literal", NODE_LITERAL }, - { "literallayout", NODE_LITERALLAYOUT }, - { "manvolnum", NODE_MANVOLNUM }, - { "member", NODE_MEMBER }, - { "mml:math", NODE_MML_MATH }, - { "mml:mfenced", NODE_MML_MFENCED }, - { "mml:mfrac", NODE_MML_MFRAC }, - { "mml:mi", NODE_MML_MI }, - { "mml:mn", NODE_MML_MN }, - { "mml:mo", NODE_MML_MO }, - { "mml:mrow", NODE_MML_MROW }, - { "mml:msub", NODE_MML_MSUB }, - { "mml:msup", NODE_MML_MSUP }, - { "modifier", NODE_MODIFIER }, - { "note", NODE_NOTE }, - { "option", NODE_OPTION }, - { "orderedlist", NODE_ORDEREDLIST }, - { "orgname", NODE_ORGNAME }, - { "othername", NODE_OTHERNAME }, - { "para", NODE_PARA }, - { "paramdef", NODE_PARAMDEF }, - { "parameter", NODE_PARAMETER }, + { "caption", NODE_IGNORE }, + { "code", NODE_LITERAL }, + { "computeroutput", NODE_LITERAL }, + { "!doctype", NODE_DOCTYPE }, + { "figure", NODE_IGNORE }, + { "firstname", NODE_PERSONNAME }, + { "glossary", NODE_VARIABLELIST }, + { "glossdef", NODE_IGNORE }, + { "glossdiv", NODE_IGNORE }, + { "glossentry", NODE_VARLISTENTRY }, + { "glosslist", NODE_VARIABLELIST }, + { "holder", NODE_IGNORE }, + { "imageobject", NODE_IGNORE }, + { "indexterm", NODE_DELETE }, + { "informaltable", NODE_TABLE }, + { "keycap", NODE_KEYSYM }, + { "keycode", NODE_IGNORE }, + { "mediaobject", NODE_BLOCKQUOTE }, + { "orgname", NODE_IGNORE }, + { "othercredit", NODE_AUTHOR }, + { "othername", NODE_PERSONNAME }, { "part", NODE_SECTION }, - { "personname", NODE_PERSONNAME }, - { "phrase", NODE_PHRASE }, - { "preface", NODE_PREFACE }, - { "primary", NODE_PRIMARY }, - { "programlisting", NODE_PROGRAMLISTING }, - { "prompt", NODE_PROMPT }, - { "quote", NODE_QUOTE }, - { "refclass", NODE_REFCLASS }, - { "refdescriptor", NODE_REFDESCRIPTOR }, - { "refentry", NODE_REFENTRY }, - { "refentryinfo", NODE_REFENTRYINFO }, - { "refentrytitle", NODE_REFENTRYTITLE }, - { "refmeta", NODE_REFMETA }, - { "refmetainfo", NODE_REFMETAINFO }, - { "refmiscinfo", NODE_REFMISCINFO }, - { "refname", NODE_REFNAME }, - { "refnamediv", NODE_REFNAMEDIV }, - { "refpurpose", NODE_REFPURPOSE }, + { "phrase", NODE_IGNORE }, + { "primary", NODE_DELETE }, + { "property", NODE_PARAMETER }, + { "reference", NODE_SECTION }, { "refsect1", NODE_SECTION }, { "refsect2", NODE_SECTION }, { "refsect3", NODE_SECTION }, { "refsection", NODE_SECTION }, - { "refsynopsisdiv", NODE_REFSYNOPSISDIV }, - { "releaseinfo", NODE_RELEASEINFO }, - { "replaceable", NODE_REPLACEABLE }, - { "row", NODE_ROW }, - { "sbr", NODE_SBR }, - { "screen", NODE_SCREEN }, - { "secondary", NODE_SECONDARY }, + { "releaseinfo", NODE_IGNORE }, + { "returnvalue", NODE_IGNORE }, + { "secondary", NODE_DELETE }, { "sect1", NODE_SECTION }, { "sect2", NODE_SECTION }, - { "section", NODE_SECTION }, - { "sgmltag", NODE_SGMLTAG }, - { "simplelist", NODE_SIMPLELIST }, - { "spanspec", NODE_SPANSPEC }, - { "structname", NODE_STRUCTNAME }, - { "subtitle", NODE_SUBTITLE }, - { "surname", NODE_SURNAME }, - { "synopsis", NODE_SYNOPSIS }, - { "table", NODE_TABLE }, - { "tbody", NODE_TBODY }, - { "term", NODE_TERM }, - { "tfoot", NODE_TFOOT }, - { "tgroup", NODE_TGROUP }, - { "thead", NODE_THEAD }, - { "tip", NODE_TIP }, - { "title", NODE_TITLE }, - { "trademark", NODE_TRADEMARK }, - { "type", NODE_TYPE }, - { "ulink", NODE_ULINK }, - { "userinput", NODE_USERINPUT }, - { "variablelist", NODE_VARIABLELIST }, - { "varlistentry", NODE_VARLISTENTRY }, - { "varname", NODE_VARNAME }, - { "warning", NODE_WARNING }, - { "wordasword", NODE_WORDASWORD }, - { "xi:include", NODE_WARN }, - { "year", NODE_YEAR }, - { NULL, NODE__MAX } + { "sect3", NODE_SECTION }, + { "sect4", NODE_SECTION }, + { "sgmltag", NODE_MARKUP }, + { "simpara", NODE_PARA }, + { "structfield", NODE_PARAMETER }, + { "structname", NODE_TYPE }, + { "surname", NODE_PERSONNAME }, + { "symbol", NODE_CONSTANT }, + { "tag", NODE_MARKUP }, + { "trademark", NODE_IGNORE }, + { "ulink", NODE_LINK }, + { "userinput", NODE_LITERAL }, + { "year", NODE_IGNORE }, + { NULL, NODE_IGNORE } }; +struct entity { + const char *name; + const char *roff; +}; + /* + * XML character entity references found in the wild. + * Those that don't have an exact mandoc_char(7) representation + * are approximated, and the desired codepoint is given as a comment. + * Encoding them as \\[u...] would leave -Tascii out in the cold. + */ +static const struct entity entities[] = { + { "alpha", "\\(*a" }, + { "amp", "&" }, + { "apos", "'" }, + { "auml", "\\(:a" }, + { "beta", "\\(*b" }, + { "circ", "^" }, /* U+02C6 */ + { "copy", "\\(co" }, + { "dagger", "\\(dg" }, + { "Delta", "\\(*D" }, + { "eacute", "\\('e" }, + { "emsp", "\\ " }, /* U+2003 */ + { "gt", ">" }, + { "hairsp", "\\^" }, + { "kappa", "\\(*k" }, + { "larr", "\\(<-" }, + { "ldquo", "\\(lq" }, + { "le", "\\(<=" }, + { "lowbar", "_" }, + { "lsqb", "[" }, + { "lt", "<" }, + { "mdash", "\\(em" }, + { "minus", "\\-" }, + { "ndash", "\\(en" }, + { "nbsp", "\\ " }, + { "num", "#" }, + { "oslash", "\\(/o" }, + { "ouml", "\\(:o" }, + { "percnt", "%" }, + { "quot", "\\(dq" }, + { "rarr", "\\(->" }, + { "rArr", "\\(rA" }, + { "rdquo", "\\(rq" }, + { "reg", "\\(rg" }, + { "rho", "\\(*r" }, + { "rsqb", "]" }, + { "sigma", "\\(*s" }, + { "shy", "\\&" }, /* U+00AD */ + { "tau", "\\(*t" }, + { "tilde", "\\[u02DC]" }, + { "times", "\\[tmu]" }, + { "uuml", "\\(:u" }, + { NULL, NULL } +}; + +static size_t parse_string(struct parse *, char *, size_t, + enum pstate *, int); +static void parse_fd(struct parse *, int); + + +static void +fatal(struct parse *p) +{ + fprintf(stderr, "%s:%d:%d: FATAL: ", p->fname, p->line, p->col); + perror(NULL); + exit(6); +} + +static void +error_msg(struct parse *p, const char *fmt, ...) +{ + va_list ap; + + fprintf(stderr, "%s:%d:%d: ERROR: ", p->fname, p->line, p->col); + va_start(ap, fmt); + vfprintf(stderr, fmt, ap); + va_end(ap); + fputc('\n', stderr); + p->tree->flags |= TREE_ERROR; +} + +static void +warn_msg(struct parse *p, const char *fmt, ...) +{ + va_list ap; + + if ((p->flags & PFLAG_WARN) == 0) + return; + + fprintf(stderr, "%s:%d:%d: WARNING: ", p->fname, p->line, p->col); + va_start(ap, fmt); + vfprintf(stderr, fmt, ap); + va_end(ap); + fputc('\n', stderr); + p->tree->flags |= TREE_WARN; +} + +/* * Process a string of characters. * If a text node is already open, append to it. * Otherwise, create a new one as a child of the current node. */ static void -xml_char(void *arg, const XML_Char *p, int sz) +xml_text(struct parse *p, const char *word, int sz) { - struct parse *ps; - struct pnode *dat; + struct pnode *n, *np; + size_t oldsz, newsz; int i; - ps = arg; - if (ps->tree->flags && TREE_FAIL) + assert(sz > 0); + if (p->del > 0) return; + if ((n = p->cur) == NULL) { + error_msg(p, "discarding text before document: %.*s", + sz, word); + return; + } + + /* Append to the current text node, if one is open. */ + + if (n->node == NODE_TEXT) { + oldsz = strlen(n->b); + newsz = oldsz + sz; + if (oldsz && (p->flags & PFLAG_SPC)) + newsz++; + if ((n->b = realloc(n->b, newsz + 1)) == NULL) + fatal(p); + if (oldsz && (p->flags & PFLAG_SPC)) + n->b[oldsz++] = ' '; + memcpy(n->b + oldsz, word, sz); + n->b[newsz] = '\0'; + p->flags &= ~(PFLAG_LINE | PFLAG_SPC); + return; + } + + if (p->tree->flags & TREE_CLOSED && n == p->tree->root) + warn_msg(p, "text after end of document: %.*s", sz, word); + + /* Create a new text node. */ + + if ((n = pnode_alloc(p->cur)) == NULL) + fatal(p); + n->node = NODE_TEXT; + n->flags = ((p->flags & PFLAG_LINE) ? NFLAG_LINE : 0) | + ((p->flags & PFLAG_SPC) ? NFLAG_SPC : 0); + p->flags &= ~(PFLAG_LINE | PFLAG_SPC); + /* - * Only create a new node if there is non-whitespace text. - * Strip all leading whitespace. + * If this node follows an in-line macro without intervening + * whitespace, keep the text in it as short as possible, + * and do not keep it open. */ - if (ps->cur->node != NODE_TEXT) { - for (i = 0; i < sz; i++) - if (isspace((unsigned char)p[i]) == 0) - break; - if (i == sz) - return; - p += i; - sz -= i; - if ((dat = calloc(1, sizeof(*dat))) == NULL) { - perror(NULL); - exit(1); + np = n->flags & NFLAG_SPC ? NULL : TAILQ_PREV(n, pnodeq, child); + while (np != NULL) { + switch (pnode_class(np->node)) { + case CLASS_VOID: + case CLASS_TEXT: + case CLASS_BLOCK: + case CLASS_NOFILL: + np = NULL; + break; + case CLASS_TRANS: + np = TAILQ_LAST(&np->childq, pnodeq); + continue; + case CLASS_LINE: + case CLASS_ENCL: + break; } - dat->node = NODE_TEXT; - dat->parent = ps->cur; - TAILQ_INIT(&dat->childq); - TAILQ_INIT(&dat->attrq); - TAILQ_INSERT_TAIL(&ps->cur->childq, dat, child); - ps->cur = dat; + break; } + if (np != NULL) { + i = 0; + while (i < sz && !isspace((unsigned char)word[i])) + i++; + if ((n->b = strndup(word, i)) == NULL) + fatal(p); + if (i == sz) + return; + while (i < sz && isspace((unsigned char)word[i])) + i++; + if (i == sz) { + p->flags |= PFLAG_SPC; + return; + } - /* Append to the current text node. */ + /* Put any remaining text into a second node. */ - assert(sz >= 0); - ps->cur->b = realloc(ps->cur->b, ps->cur->bsz + sz + 1); - if (ps->cur->b == NULL) { - perror(NULL); - exit(1); + if ((n = pnode_alloc(p->cur)) == NULL) + fatal(p); + n->node = NODE_TEXT; + n->flags |= NFLAG_SPC; + word += i; + sz -= i; } - memcpy(ps->cur->b + ps->cur->bsz, p, sz); - ps->cur->bsz += sz; - ps->cur->b[ps->cur->bsz] = '\0'; - ps->cur->real = ps->cur->b; + if ((n->b = strndup(word, sz)) == NULL) + fatal(p); + + /* The new node remains open for later pnode_closetext(). */ + + p->cur = n; } +/* + * Close out the text node and strip trailing whitespace, if one is open. + */ static void -pnode_trim(struct pnode *pn) +pnode_closetext(struct parse *p, int check_last_word) { - assert(pn->node == NODE_TEXT); - for (; pn->bsz > 0; pn->b[--pn->bsz] = '\0') - if (isspace((unsigned char)pn->b[pn->bsz - 1]) == 0) - break; + struct pnode *n; + char *cp, *last_word; + + if ((n = p->cur) == NULL || n->node != NODE_TEXT) + return; + p->cur = n->parent; + for (cp = strchr(n->b, '\0'); + cp > n->b && isspace((unsigned char)cp[-1]); + *--cp = '\0') + p->flags |= PFLAG_SPC; + + if (p->flags & PFLAG_SPC || !check_last_word) + return; + + /* + * Find the beginning of the last word + * and delete whitespace before it. + */ + + while (cp > n->b && !isspace((unsigned char)cp[-1])) + cp--; + if (cp == n->b) + return; + + last_word = cp; + while (cp > n->b && isspace((unsigned char)cp[-1])) + *--cp = '\0'; + + /* Move the last word into its own node, for use with .Pf. */ + + if ((n = pnode_alloc(p->cur)) == NULL) + fatal(p); + n->node = NODE_TEXT; + n->flags |= NFLAG_SPC; + if ((n->b = strdup(last_word)) == NULL) + fatal(p); } -/* - * Begin an element. - * If the name is unknown, abort parsing. - */ static void -xml_elem_start(void *arg, const XML_Char *name, const XML_Char **atts) +xml_entity(struct parse *p, const char *name) { - struct parse *ps; - const struct element *elem; - enum attrkey key; - struct pnode *dat; - struct pattr *pattr; - const XML_Char **att; + const struct entity *entity; + struct pnode *n; + const char *ccp; + char *cp; + unsigned int codepoint; + enum pstate pstate; - ps = arg; - if (ps->tree->flags && TREE_FAIL) + if (p->del > 0) return; - /* Close out the text node, if there is one. */ - if (ps->cur != NULL && ps->cur->node == NODE_TEXT) { - pnode_trim(ps->cur); - ps->cur = ps->cur->parent; + if (p->cur == NULL) { + error_msg(p, "discarding entity before document: &%s;", name); + return; } - for (elem = elements; elem->name != NULL; elem++) - if (strcmp(elem->name, name) == 0) + pnode_closetext(p, 0); + + if (p->tree->flags & TREE_CLOSED && p->cur == p->tree->root) + warn_msg(p, "entity after end of document: &%s;", name); + + for (entity = entities; entity->name != NULL; entity++) + if (strcmp(name, entity->name) == 0) break; - if (elem->name == NULL) { - fprintf(stderr, "%s:%zu:%zu: unknown element \"%s\"\n", - ps->fname, XML_GetCurrentLineNumber(ps->xml), - XML_GetCurrentColumnNumber(ps->xml), name); - ps->tree->flags |= TREE_FAIL; + if (entity->roff == NULL) { + if (p->doctype != NULL) { + TAILQ_FOREACH(n, &p->doctype->childq, child) { + if ((ccp = pnode_getattr_raw(n, + ATTRKEY_NAME, NULL)) == NULL || + strcmp(ccp, name) != 0) + continue; + if ((ccp = pnode_getattr_raw(n, + ATTRKEY_SYSTEM, NULL)) != NULL) { + parse_file(p, -1, ccp); + p->flags &= ~(PFLAG_LINE | PFLAG_SPC); + return; + } + if ((ccp = pnode_getattr_raw(n, + ATTRKEY_DEFINITION, NULL)) == NULL) + continue; + if ((cp = strdup(ccp)) == NULL) + fatal(p); + pstate = PARSE_ELEM; + parse_string(p, cp, strlen(cp), &pstate, 0); + p->flags &= ~(PFLAG_LINE | PFLAG_SPC); + free(cp); + return; + } + } + if (*name == '#') { + codepoint = strtonum(name + 1, 0, 0x10ffff, &ccp); + if (ccp == NULL) { + if ((n = pnode_alloc(p->cur)) == NULL || + asprintf(&n->b, "\\[u%4.4X]", + codepoint) < 0) + fatal(p); + goto done; + } + } + error_msg(p, "unknown entity &%s;", name); return; } - switch (elem->node) { - case NODE_WARN: - if (ps->warn) - fprintf(stderr, "%s:%zu:%zu: warning: " - "ignoring element <%s>\n", ps->fname, - XML_GetCurrentLineNumber(ps->xml), - XML_GetCurrentColumnNumber(ps->xml), name); + /* Create, append, and close out an entity node. */ + if ((n = pnode_alloc(p->cur)) == NULL || + (n->b = strdup(entity->roff)) == NULL) + fatal(p); +done: + n->node = NODE_ESCAPE; + n->flags = ((p->flags & PFLAG_LINE) ? NFLAG_LINE : 0) | + ((p->flags & PFLAG_SPC) ? NFLAG_SPC : 0); + p->flags &= ~(PFLAG_LINE | PFLAG_SPC); +} + +/* + * Parse an element name. + */ +static enum nodeid +xml_name2node(struct parse *p, const char *name) +{ + const struct alias *alias; + enum nodeid node; + + if ((node = pnode_parse(name)) < NODE_UNKNOWN) + return node; + + for (alias = aliases; alias->name != NULL; alias++) + if (strcmp(alias->name, name) == 0) + return alias->node; + + return NODE_UNKNOWN; +} + +/* + * Begin an element. + */ +static void +xml_elem_start(struct parse *p, const char *name) +{ + struct pnode *n; + + /* + * An ancestor is excluded from the tree; + * keep track of the number of levels excluded. + */ + if (p->del > 0) { + if (*name != '!' && *name != '?') + p->del++; + return; + } + + switch (p->ncur = xml_name2node(p, name)) { + case NODE_DELETE_WARN: + warn_msg(p, "skipping element <%s>", name); /* FALLTHROUGH */ + case NODE_DELETE: + p->del = 1; + /* FALLTHROUGH */ case NODE_IGNORE: return; - case NODE_INLINEEQUATION: - ps->tree->flags |= TREE_EQN; - break; + case NODE_UNKNOWN: + if (*name != '!' && *name != '?') + error_msg(p, "unknown element <%s>", name); + return; default: break; } - if ((dat = calloc(1, sizeof(*dat))) == NULL) { - perror(NULL); - exit(1); + if (p->tree->flags & TREE_CLOSED && p->cur->parent == NULL) + warn_msg(p, "element after end of document: <%s>", name); + + switch (pnode_class(p->ncur)) { + case CLASS_LINE: + case CLASS_ENCL: + pnode_closetext(p, 1); + break; + default: + pnode_closetext(p, 0); + break; } - dat->node = elem->node; - dat->parent = ps->cur; - TAILQ_INIT(&dat->childq); - TAILQ_INIT(&dat->attrq); - if (ps->cur != NULL) - TAILQ_INSERT_TAIL(&ps->cur->childq, dat, child); + if ((n = pnode_alloc(p->cur)) == NULL) + fatal(p); - ps->cur = dat; - if (ps->tree->root == NULL) - ps->tree->root = dat; - /* - * Process attributes. + * Some elements are self-closing. + * Nodes that begin a new macro or request line or start by + * printing text always want whitespace before themselves. */ - for (att = atts; *att != NULL; att += 2) { - if ((key = attrkey_parse(*att)) == ATTRKEY__MAX) { - if (ps->warn) - fprintf(stderr, "%s:%zu:%zu: warning: " - "unknown attribute \"%s\"\n", - ps->fname, - XML_GetCurrentLineNumber(ps->xml), - XML_GetCurrentColumnNumber(ps->xml), - *att); - continue; - } - pattr = calloc(1, sizeof(*pattr)); - pattr->key = key; - if ((pattr->val = attrval_parse(att[1])) == ATTRVAL__MAX) - pattr->rawval = strdup(att[1]); - TAILQ_INSERT_TAIL(&dat->attrq, pattr, child); + + switch (n->node = p->ncur) { + case NODE_DOCTYPE: + case NODE_ENTITY: + case NODE_SBR: + case NODE_VOID: + p->flags |= PFLAG_EEND; + break; + default: + break; } + switch (pnode_class(p->ncur)) { + case CLASS_LINE: + case CLASS_ENCL: + n->flags = ((p->flags & PFLAG_LINE) ? NFLAG_LINE : 0) | + ((p->flags & PFLAG_SPC) ? NFLAG_SPC : 0); + break; + case CLASS_NOFILL: + p->nofill++; + /* FALLTHROUGH */ + default: + n->flags |= NFLAG_SPC; + break; + } + p->cur = n; + if (n->node == NODE_DOCTYPE) { + if (p->doctype == NULL) + p->doctype = n; + else + error_msg(p, "duplicate doctype"); + } else if (n->parent == NULL && p->tree->root == NULL) + p->tree->root = n; } +static void +xml_attrkey(struct parse *p, const char *name) +{ + struct pattr *a; + const char *value; + enum attrkey key; + + if (p->del > 0 || p->ncur >= NODE_UNKNOWN || *name == '\0') + return; + + if ((p->ncur == NODE_DOCTYPE || p->ncur == NODE_ENTITY) && + TAILQ_FIRST(&p->cur->attrq) == NULL) { + value = name; + name = "NAME"; + } else + value = NULL; + + if ((key = attrkey_parse(name)) == ATTRKEY__MAX) { + p->flags &= ~PFLAG_ATTR; + return; + } + if ((a = calloc(1, sizeof(*a))) == NULL) + fatal(p); + + a->key = key; + a->val = ATTRVAL__MAX; + if (value == NULL) { + a->rawval = NULL; + p->flags |= PFLAG_ATTR; + } else { + if ((a->rawval = strdup(value)) == NULL) + fatal(p); + p->flags &= ~PFLAG_ATTR; + } + TAILQ_INSERT_TAIL(&p->cur->attrq, a, child); + if (p->ncur == NODE_ENTITY && key == ATTRKEY_NAME) + xml_attrkey(p, "DEFINITION"); +} + +static void +xml_attrval(struct parse *p, const char *name) +{ + struct pattr *a; + + if (p->del > 0 || p->ncur >= NODE_UNKNOWN || + (p->flags & PFLAG_ATTR) == 0) + return; + if ((a = TAILQ_LAST(&p->cur->attrq, pattrq)) == NULL) + return; + if ((a->val = attrval_parse(name)) == ATTRVAL__MAX && + (a->rawval = strdup(name)) == NULL) + fatal(p); + p->flags &= ~PFLAG_ATTR; +} + /* * Roll up the parse tree. * If we're at a text node, roll that one up first. */ static void -xml_elem_end(void *arg, const XML_Char *name) +xml_elem_end(struct parse *p, const char *name) { - struct parse *ps; - const struct element *elem; + struct pnode *n; + const char *cp; + enum nodeid node; - ps = arg; - if (ps->tree->flags && TREE_FAIL) + /* + * An ancestor is excluded from the tree; + * keep track of the number of levels excluded. + */ + if (p->del > 1) { + p->del--; return; - - /* Close out the text node, if there is one. */ - if (ps->cur->node == NODE_TEXT) { - pnode_trim(ps->cur); - ps->cur = ps->cur->parent; } - for (elem = elements; elem->name != NULL; elem++) - if (strcmp(elem->name, name) == 0) - break; + if (p->del == 0) + pnode_closetext(p, 0); - switch (elem->node) { + n = p->cur; + node = name == NULL ? p->ncur : xml_name2node(p, name); + + switch (node) { + case NODE_DELETE_WARN: + case NODE_DELETE: + if (p->del > 0) + p->del--; + break; case NODE_IGNORE: - case NODE_WARN: + case NODE_UNKNOWN: break; + case NODE_INCLUDE: + p->cur = n->parent; + cp = pnode_getattr_raw(n, ATTRKEY_HREF, NULL); + if (cp == NULL) + error_msg(p, " element " + "without href attribute"); + else + parse_file(p, -1, cp); + pnode_unlink(n); + p->flags &= ~(PFLAG_LINE | PFLAG_SPC); + break; + case NODE_DOCTYPE: + case NODE_SBR: + case NODE_VOID: + p->flags &= ~PFLAG_EEND; + /* FALLTHROUGH */ default: - assert(elem->node == ps->cur->node); - ps->cur = ps->cur->parent; + if (n == NULL || node != n->node) { + warn_msg(p, "element not open: ", name); + break; + } + if (pnode_class(node) == CLASS_NOFILL) + p->nofill--; + + /* + * Refrain from actually closing the document element. + * If no more content follows, no harm is done, but if + * some content still follows, simply processing it is + * obviously better than discarding it or crashing. + */ + + if (n->parent != NULL || node == NODE_DOCTYPE) { + p->cur = n->parent; + if (p->cur != NULL) + p->ncur = p->cur->node; + } else + p->tree->flags |= TREE_CLOSED; + p->flags &= ~(PFLAG_LINE | PFLAG_SPC); + + /* Include a file containing entity declarations. */ + + if (node == NODE_ENTITY && strcmp("%", + pnode_getattr_raw(n, ATTRKEY_NAME, "")) == 0 && + (cp = pnode_getattr_raw(n, ATTRKEY_SYSTEM, NULL)) != NULL) + parse_file(p, -1, cp); + + break; } + assert(p->del == 0); } struct parse * @@ -382,16 +718,10 @@ parse_alloc(int warn) free(p); return NULL; } - - if ((p->xml = XML_ParserCreate(NULL)) == NULL) { - free(p->tree); - free(p); - return NULL; - } - p->warn = warn; - XML_SetCharacterDataHandler(p->xml, xml_char); - XML_SetElementHandler(p->xml, xml_elem_start, xml_elem_end); - XML_SetUserData(p->xml, p); + if (warn) + p->flags |= PFLAG_WARN; + else + p->flags &= ~PFLAG_WARN; return p; } @@ -400,7 +730,6 @@ parse_free(struct parse *p) { if (p == NULL) return; - XML_ParserFree(p->xml); if (p->tree != NULL) { pnode_unlink(p->tree->root); free(p->tree); @@ -408,28 +737,349 @@ parse_free(struct parse *p) free(p); } +static void +increment(struct parse *p, char *b, size_t *pend, int refill) +{ + if (refill) { + if (b[*pend] == '\n') { + p->nline++; + p->ncol = 1; + } else + p->ncol++; + } + ++*pend; +} + +/* + * Advance the pend pointer to the next character in the charset. + * If the charset starts with a space, it stands for any whitespace. + * Update the new input file position, used for messages. + * Do not overrun the buffer b of length rlen. + * When reaching the end, NUL-terminate the buffer and return 1; + * otherwise, return 0. + */ +static int +advance(struct parse *p, char *b, size_t rlen, size_t *pend, + const char *charset, int refill) +{ + int space; + + if (*charset == ' ') { + space = 1; + charset++; + } else + space = 0; + + if (refill) { + p->nline = p->line; + p->ncol = p->col; + } + while (*pend < rlen) { + if (space && isspace((unsigned char)b[*pend])) + break; + if (strchr(charset, b[*pend]) != NULL) + break; + increment(p, b, pend, refill); + } + if (*pend == rlen) { + b[rlen] = '\0'; + return refill; + } else + return 0; +} + +size_t +parse_string(struct parse *p, char *b, size_t rlen, + enum pstate *pstate, int refill) +{ + char *cp; + size_t pws; /* Parse offset including whitespace. */ + size_t poff; /* Parse offset in b[]. */ + size_t pend; /* Offset of the end of the current word. */ + int elem_end; + + pend = pws = 0; + for (;;) { + + /* Proceed to the next token, skipping whitespace. */ + + if (refill) { + p->line = p->nline; + p->col = p->ncol; + } + if ((poff = pend) == rlen) + break; + if (isspace((unsigned char)b[pend])) { + p->flags |= PFLAG_SPC; + if (b[pend] == '\n') { + p->flags |= PFLAG_LINE; + pws = pend + 1; + } + increment(p, b, &pend, refill); + continue; + } + + /* + * The following four cases (ARG, TAG, and starting an + * entity or a tag) all parse a word or quoted string. + * If that extends beyond the read buffer and the last + * read(2) still got data, they all break out of the + * token loop to request more data from the read loop. + * + * Also, three of them detect self-closing tags, those + * ending with "/>", setting the flag elem_end and + * calling xml_elem_end() at the very end, after + * handling the attribute value, attribute name, or + * tag name, respectively. + */ + + /* Parse an attribute value. */ + + if (*pstate >= PARSE_ARG) { + if (*pstate == PARSE_ARG && + (b[pend] == '\'' || b[pend] == '"')) { + *pstate = b[pend] == '"' ? + PARSE_DQ : PARSE_SQ; + increment(p, b, &pend, refill); + continue; + } + if (advance(p, b, rlen, &pend, + *pstate == PARSE_DQ ? "\"" : + *pstate == PARSE_SQ ? "'" : " >", refill)) + break; + *pstate = PARSE_TAG; + elem_end = 0; + if (b[pend] == '>') { + *pstate = PARSE_ELEM; + if (pend > 0 && b[pend - 1] == '/') { + b[pend - 1] = '\0'; + elem_end = 1; + } + if (p->flags & PFLAG_EEND) + elem_end = 1; + } + b[pend] = '\0'; + if (pend < rlen) + increment(p, b, &pend, refill); + xml_attrval(p, b + poff); + if (elem_end) + xml_elem_end(p, NULL); + + /* Look for an attribute name. */ + + } else if (*pstate == PARSE_TAG) { + switch (p->ncur) { + case NODE_DOCTYPE: + if (b[pend] == '[') { + *pstate = PARSE_ELEM; + increment(p, b, &pend, refill); + continue; + } + /* FALLTHROUGH */ + case NODE_ENTITY: + if (b[pend] == '"' || b[pend] == '\'') { + *pstate = PARSE_ARG; + continue; + } + break; + default: + break; + } + if (advance(p, b, rlen, &pend, " =>", refill)) + break; + elem_end = 0; + switch (b[pend]) { + case '>': + *pstate = PARSE_ELEM; + if (pend > 0 && b[pend - 1] == '/') { + b[pend - 1] = '\0'; + elem_end = 1; + } + if (p->flags & PFLAG_EEND) + elem_end = 1; + break; + case '=': + *pstate = PARSE_ARG; + break; + default: + break; + } + b[pend] = '\0'; + if (pend < rlen) + increment(p, b, &pend, refill); + xml_attrkey(p, b + poff); + if (elem_end) + xml_elem_end(p, NULL); + + /* Begin an opening or closing tag. */ + + } else if (b[poff] == '<') { + if (advance(p, b, rlen, &pend, " >", refill)) + break; + if (pend > poff + 3 && + strncmp(b + poff, ""); + if (cp == NULL) { + if (refill) + break; + cp = b + rlen; + } else + cp += 3; + while (b + pend < cp) + increment(p, b, &pend, refill); + continue; + } + elem_end = 0; + if (b[pend] != '>') + *pstate = PARSE_TAG; + else if (pend > 0 && b[pend - 1] == '/') { + b[pend - 1] = '\0'; + elem_end = 1; + } + b[pend] = '\0'; + if (pend < rlen) + increment(p, b, &pend, refill); + if (b[++poff] == '/') { + elem_end = 1; + poff++; + } else { + xml_elem_start(p, b + poff); + if (*pstate == PARSE_ELEM && + p->flags & PFLAG_EEND) + elem_end = 1; + } + if (elem_end) + xml_elem_end(p, b + poff); + + /* Close a doctype. */ + + } else if (p->ncur == NODE_DOCTYPE && b[poff] == ']') { + *pstate = PARSE_TAG; + increment(p, b, &pend, refill); + + /* Process an entity. */ + + } else if (b[poff] == '&') { + if (advance(p, b, rlen, &pend, ";", refill)) + break; + b[pend] = '\0'; + if (pend < rlen) + increment(p, b, &pend, refill); + xml_entity(p, b + poff + 1); + + /* Process text up to the next tag, entity, or EOL. */ + + } else { + advance(p, b, rlen, &pend, + p->ncur == NODE_DOCTYPE ? "<&]\n" : "<&\n", + refill); + if (p->nofill) + poff = pws; + xml_text(p, b + poff, pend - poff); + if (b[pend] == '\n') + pnode_closetext(p, 0); + } + pws = pend; + } + return poff; +} + + +/* + * The read loop. + * If the previous token was incomplete and asked for more input, + * we have to enter the read loop once more even on EOF. + * Once rsz is 0, incomplete tokens will no longer ask for more input + * but instead use whatever there is, and then exit the read loop. + * The minus one on the size limit for read(2) is needed such that + * advance() can set b[rlen] to NUL when needed. + */ +static void +parse_fd(struct parse *p, int fd) +{ + char b[4096]; + ssize_t rsz; /* Return value from read(2). */ + size_t rlen; /* Number of bytes in b[]. */ + size_t poff; /* Parse offset in b[]. */ + enum pstate pstate; + + rlen = 0; + pstate = PARSE_ELEM; + while ((rsz = read(fd, b + rlen, sizeof(b) - rlen - 1)) >= 0 && + (rlen += rsz) > 0) { + poff = parse_string(p, b, rlen, &pstate, rsz > 0); + /* Buffer exhausted; shift left and re-fill. */ + assert(poff > 0); + rlen -= poff; + memmove(b, b + poff, rlen); + } + if (rsz < 0) + error_msg(p, "read: %s", strerror(errno)); +} + +/* + * Open and parse a file. + */ struct ptree * parse_file(struct parse *p, int fd, const char *fname) { - char b[4096]; - ssize_t ssz; + const char *save_fname; + int save_line, save_col; + /* Save and initialize reporting data. */ + + save_fname = p->fname; + save_line = p->nline; + save_col = p->ncol; p->fname = fname; - do { - if ((ssz = read(fd, b, sizeof(b))) < 0) { - perror(fname); - pnode_unlink(p->tree->root); - p->tree->root = p->cur = NULL; - p->tree->flags |= TREE_FAIL; - return NULL; - } - if (XML_Parse(p->xml, b, ssz, ssz == 0) == 0) { - fprintf(stderr, "%s:%zu:%zu: %s\n", fname, - XML_GetCurrentLineNumber(p->xml), - XML_GetCurrentColumnNumber(p->xml), - XML_ErrorString(XML_GetErrorCode(p->xml))); - p->tree->flags |= TREE_FAIL; - } - } while (ssz > 0 && (p->tree->flags & TREE_FAIL) == 0); + p->line = 0; + p->col = 0; + + /* Open the file, unless it is already open. */ + + if (fd == -1 && (fd = open(fname, O_RDONLY, 0)) == -1) { + error_msg(p, "open: %s", strerror(errno)); + p->fname = save_fname; + return p->tree; + } + + /* + * After opening the starting file, change to the directory it + * is located in, in case it wants to include any further files, + * which are typically given with relative paths in DocBook. + * Do this on a best-effort basis; don't complain about failure. + */ + + if (save_fname == NULL && (fname = dirname(fname)) != NULL && + strcmp(fname, ".") != 0) + (void)chdir(fname); + + /* Run the read loop. */ + + p->nline = 1; + p->ncol = 1; + parse_fd(p, fd); + + /* On the top level, finalize the parse tree. */ + + if (save_fname == NULL) { + pnode_closetext(p, 0); + if (p->tree->root == NULL) + error_msg(p, "empty document"); + else if ((p->tree->flags & TREE_CLOSED) == 0) + warn_msg(p, "document not closed"); + pnode_unlink(p->doctype); + } + + /* Clean up. */ + + if (fd != STDIN_FILENO) + close(fd); + p->fname = save_fname; + p->nline = save_line; + p->ncol = save_col; return p->tree; }