/* $Id: parse.c,v 1.4 2019/03/26 22:39:33 schwarze Exp $ */ /* * Copyright (c) 2014 Kristaps Dzonsons * Copyright (c) 2019 Ingo Schwarze * * Permission to use, copy, modify, and distribute this software for any * purpose with or without fee is hereby granted, provided that the above * copyright notice and this permission notice appear in all copies. * * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ #include #include #include #include #include #include #include "node.h" #include "parse.h" /* * The implementation of the DocBook parser. */ /* * Global parse state. * Keep this as simple and small as possible. */ struct parse { XML_Parser xml; const char *fname; /* Name of the input file. */ struct ptree *tree; /* Complete parse result. */ struct pnode *cur; /* Current node in the tree. */ int del; /* Levels of nested nodes being deleted. */ int warn; }; struct element { const char *name; /* DocBook element name. */ enum nodeid node; /* Node type to generate. */ }; static const struct element elements[] = { { "acronym", NODE_IGNORE }, { "affiliation", NODE_AFFILIATION }, { "anchor", NODE_DELETE }, { "application", NODE_APPLICATION }, { "arg", NODE_ARG }, { "author", NODE_AUTHOR }, { "authorgroup", NODE_AUTHORGROUP }, { "blockquote", NODE_BLOCKQUOTE }, { "book", NODE_BOOK }, { "bookinfo", NODE_BOOKINFO }, { "caution", NODE_CAUTION }, { "chapter", NODE_SECTION }, { "citerefentry", NODE_CITEREFENTRY }, { "citetitle", NODE_CITETITLE }, { "cmdsynopsis", NODE_CMDSYNOPSIS }, { "code", NODE_CODE }, { "colspec", NODE_COLSPEC }, { "command", NODE_COMMAND }, { "constant", NODE_CONSTANT }, { "copyright", NODE_COPYRIGHT }, { "date", NODE_DATE }, { "editor", NODE_EDITOR }, { "email", NODE_EMAIL }, { "emphasis", NODE_EMPHASIS }, { "entry", NODE_ENTRY }, { "envar", NODE_ENVAR }, { "fieldsynopsis", NODE_FIELDSYNOPSIS }, { "filename", NODE_FILENAME }, { "firstname", NODE_IGNORE }, { "firstterm", NODE_FIRSTTERM }, { "footnote", NODE_FOOTNOTE }, { "funcdef", NODE_FUNCDEF }, { "funcprototype", NODE_FUNCPROTOTYPE }, { "funcsynopsis", NODE_FUNCSYNOPSIS }, { "funcsynopsisinfo", NODE_FUNCSYNOPSISINFO }, { "function", NODE_FUNCTION }, { "glossterm", NODE_GLOSSTERM }, { "group", NODE_GROUP }, { "holder", NODE_HOLDER }, { "index", NODE_INDEX }, { "indexterm", NODE_DELETE }, { "info", NODE_INFO }, { "informalequation", NODE_INFORMALEQUATION }, { "informaltable", NODE_INFORMALTABLE }, { "inlineequation", NODE_INLINEEQUATION }, { "itemizedlist", NODE_ITEMIZEDLIST }, { "keysym", NODE_KEYSYM }, { "legalnotice", NODE_LEGALNOTICE }, { "link", NODE_LINK }, { "listitem", NODE_LISTITEM }, { "literal", NODE_LITERAL }, { "literallayout", NODE_LITERALLAYOUT }, { "manvolnum", NODE_MANVOLNUM }, { "member", NODE_MEMBER }, { "mml:math", NODE_MML_MATH }, { "mml:mfenced", NODE_MML_MFENCED }, { "mml:mfrac", NODE_MML_MFRAC }, { "mml:mi", NODE_MML_MI }, { "mml:mn", NODE_MML_MN }, { "mml:mo", NODE_MML_MO }, { "mml:mrow", NODE_MML_MROW }, { "mml:msub", NODE_MML_MSUB }, { "mml:msup", NODE_MML_MSUP }, { "modifier", NODE_MODIFIER }, { "note", NODE_NOTE }, { "option", NODE_OPTION }, { "orderedlist", NODE_ORDEREDLIST }, { "orgname", NODE_ORGNAME }, { "othername", NODE_IGNORE }, { "para", NODE_PARA }, { "paramdef", NODE_PARAMDEF }, { "parameter", NODE_PARAMETER }, { "part", NODE_SECTION }, { "personname", NODE_PERSONNAME }, { "phrase", NODE_IGNORE }, { "preface", NODE_PREFACE }, { "primary", NODE_DELETE }, { "programlisting", NODE_PROGRAMLISTING }, { "prompt", NODE_PROMPT }, { "quote", NODE_QUOTE }, { "refclass", NODE_REFCLASS }, { "refdescriptor", NODE_REFDESCRIPTOR }, { "refentry", NODE_REFENTRY }, { "refentryinfo", NODE_REFENTRYINFO }, { "refentrytitle", NODE_REFENTRYTITLE }, { "refmeta", NODE_REFMETA }, { "refmetainfo", NODE_REFMETAINFO }, { "refmiscinfo", NODE_REFMISCINFO }, { "refname", NODE_REFNAME }, { "refnamediv", NODE_REFNAMEDIV }, { "refpurpose", NODE_REFPURPOSE }, { "refsect1", NODE_SECTION }, { "refsect2", NODE_SECTION }, { "refsect3", NODE_SECTION }, { "refsection", NODE_SECTION }, { "refsynopsisdiv", NODE_REFSYNOPSISDIV }, { "releaseinfo", NODE_RELEASEINFO }, { "replaceable", NODE_REPLACEABLE }, { "row", NODE_ROW }, { "sbr", NODE_SBR }, { "screen", NODE_SCREEN }, { "secondary", NODE_DELETE }, { "sect1", NODE_SECTION }, { "sect2", NODE_SECTION }, { "section", NODE_SECTION }, { "sgmltag", NODE_SGMLTAG }, { "simplelist", NODE_SIMPLELIST }, { "spanspec", NODE_SPANSPEC }, { "structname", NODE_STRUCTNAME }, { "subtitle", NODE_SUBTITLE }, { "surname", NODE_IGNORE }, { "synopsis", NODE_SYNOPSIS }, { "table", NODE_TABLE }, { "tbody", NODE_TBODY }, { "term", NODE_TERM }, { "tfoot", NODE_TFOOT }, { "tgroup", NODE_TGROUP }, { "thead", NODE_THEAD }, { "tip", NODE_TIP }, { "title", NODE_TITLE }, { "trademark", NODE_IGNORE }, { "type", NODE_TYPE }, { "ulink", NODE_ULINK }, { "userinput", NODE_USERINPUT }, { "variablelist", NODE_VARIABLELIST }, { "varlistentry", NODE_VARLISTENTRY }, { "varname", NODE_VARNAME }, { "warning", NODE_WARNING }, { "wordasword", NODE_WORDASWORD }, { "xi:include", NODE_DELETE_WARN }, { "year", NODE_YEAR }, { NULL, NODE__MAX } }; /* * Process a string of characters. * If a text node is already open, append to it. * Otherwise, create a new one as a child of the current node. */ static void xml_char(void *arg, const XML_Char *p, int sz) { struct parse *ps; struct pnode *dat; int i; ps = arg; if (ps->del > 0 || ps->tree->flags & TREE_FAIL) return; /* * Only create a new node if there is non-whitespace text. * Strip all leading whitespace. */ if (ps->cur->node != NODE_TEXT) { for (i = 0; i < sz; i++) if (isspace((unsigned char)p[i]) == 0) break; if (i == sz) return; p += i; sz -= i; if ((dat = calloc(1, sizeof(*dat))) == NULL) { perror(NULL); exit(1); } dat->node = NODE_TEXT; dat->parent = ps->cur; TAILQ_INIT(&dat->childq); TAILQ_INIT(&dat->attrq); TAILQ_INSERT_TAIL(&ps->cur->childq, dat, child); ps->cur = dat; } /* Append to the current text node. */ assert(sz >= 0); ps->cur->b = realloc(ps->cur->b, ps->cur->bsz + sz + 1); if (ps->cur->b == NULL) { perror(NULL); exit(1); } memcpy(ps->cur->b + ps->cur->bsz, p, sz); ps->cur->bsz += sz; ps->cur->b[ps->cur->bsz] = '\0'; ps->cur->real = ps->cur->b; } static void pnode_trim(struct pnode *pn) { assert(pn->node == NODE_TEXT); for (; pn->bsz > 0; pn->b[--pn->bsz] = '\0') if (isspace((unsigned char)pn->b[pn->bsz - 1]) == 0) break; } /* * Begin an element. * If the name is unknown, abort parsing. */ static void xml_elem_start(void *arg, const XML_Char *name, const XML_Char **atts) { struct parse *ps; const struct element *elem; enum attrkey key; struct pnode *dat; struct pattr *pattr; const XML_Char **att; ps = arg; if (ps->tree->flags & TREE_FAIL) return; /* * An ancestor is excluded from the tree; * keep track of the number of levels excluded. */ if (ps->del > 0) { ps->del++; return; } /* Close out the text node, if there is one. */ if (ps->cur != NULL && ps->cur->node == NODE_TEXT) { pnode_trim(ps->cur); ps->cur = ps->cur->parent; } for (elem = elements; elem->name != NULL; elem++) if (strcmp(elem->name, name) == 0) break; if (elem->name == NULL) { fprintf(stderr, "%s:%zu:%zu: unknown element \"%s\"\n", ps->fname, XML_GetCurrentLineNumber(ps->xml), XML_GetCurrentColumnNumber(ps->xml), name); ps->tree->flags |= TREE_FAIL; return; } switch (elem->node) { case NODE_DELETE_WARN: if (ps->warn) fprintf(stderr, "%s:%zu:%zu: warning: " "skipping element <%s>\n", ps->fname, XML_GetCurrentLineNumber(ps->xml), XML_GetCurrentColumnNumber(ps->xml), name); /* FALLTHROUGH */ case NODE_DELETE: ps->del = 1; /* FALLTHROUGH */ case NODE_IGNORE: return; case NODE_INLINEEQUATION: ps->tree->flags |= TREE_EQN; break; default: break; } if ((dat = calloc(1, sizeof(*dat))) == NULL) { perror(NULL); exit(1); } dat->node = elem->node; dat->parent = ps->cur; TAILQ_INIT(&dat->childq); TAILQ_INIT(&dat->attrq); if (ps->cur != NULL) TAILQ_INSERT_TAIL(&ps->cur->childq, dat, child); ps->cur = dat; if (ps->tree->root == NULL) ps->tree->root = dat; /* * Process attributes. */ for (att = atts; *att != NULL; att += 2) { if ((key = attrkey_parse(*att)) == ATTRKEY__MAX) { if (ps->warn) fprintf(stderr, "%s:%zu:%zu: warning: " "unknown attribute \"%s\"\n", ps->fname, XML_GetCurrentLineNumber(ps->xml), XML_GetCurrentColumnNumber(ps->xml), *att); continue; } pattr = calloc(1, sizeof(*pattr)); pattr->key = key; if ((pattr->val = attrval_parse(att[1])) == ATTRVAL__MAX) pattr->rawval = strdup(att[1]); TAILQ_INSERT_TAIL(&dat->attrq, pattr, child); } } /* * Roll up the parse tree. * If we're at a text node, roll that one up first. */ static void xml_elem_end(void *arg, const XML_Char *name) { struct parse *ps; const struct element *elem; ps = arg; if (ps->tree->flags & TREE_FAIL) return; /* * An ancestor is excluded from the tree; * keep track of the number of levels excluded. */ if (ps->del > 1) { ps->del--; return; } /* Close out the text node, if there is one. */ if (ps->del == 0 && ps->cur->node == NODE_TEXT) { pnode_trim(ps->cur); ps->cur = ps->cur->parent; } for (elem = elements; elem->name != NULL; elem++) if (strcmp(elem->name, name) == 0) break; switch (elem->node) { case NODE_DELETE_WARN: case NODE_DELETE: ps->del--; break; case NODE_IGNORE: break; default: assert(elem->node == ps->cur->node); ps->cur = ps->cur->parent; break; } assert(ps->del == 0); } struct parse * parse_alloc(int warn) { struct parse *p; if ((p = calloc(1, sizeof(*p))) == NULL) return NULL; if ((p->tree = calloc(1, sizeof(*p->tree))) == NULL) { free(p); return NULL; } if ((p->xml = XML_ParserCreate(NULL)) == NULL) { free(p->tree); free(p); return NULL; } p->warn = warn; XML_SetCharacterDataHandler(p->xml, xml_char); XML_SetElementHandler(p->xml, xml_elem_start, xml_elem_end); XML_SetUserData(p->xml, p); return p; } void parse_free(struct parse *p) { if (p == NULL) return; XML_ParserFree(p->xml); if (p->tree != NULL) { pnode_unlink(p->tree->root); free(p->tree); } free(p); } struct ptree * parse_file(struct parse *p, int fd, const char *fname) { char b[4096]; ssize_t ssz; p->fname = fname; do { if ((ssz = read(fd, b, sizeof(b))) < 0) { perror(fname); pnode_unlink(p->tree->root); p->tree->root = p->cur = NULL; p->tree->flags |= TREE_FAIL; return NULL; } if (XML_Parse(p->xml, b, ssz, ssz == 0) == 0) { fprintf(stderr, "%s:%zu:%zu: %s\n", fname, XML_GetCurrentLineNumber(p->xml), XML_GetCurrentColumnNumber(p->xml), XML_ErrorString(XML_GetErrorCode(p->xml))); p->tree->flags |= TREE_FAIL; } } while (ssz > 0 && (p->tree->flags & TREE_FAIL) == 0); return p->tree; }