File: [cvsweb.bsd.lv] / docbook2mdoc / parse.c (download)
Revision 1.2, Tue Mar 26 20:54:43 2019 UTC (5 years, 2 months ago) by schwarze
Branch: MAIN
Changes since 1.1: +31 -11 lines
Provide an easy way to parse an XML element without generating a node,
either ignoring it outright or emitting a warning if -W was specified.
Use this to handle <xi:include> more cleanly, fixing two FIXMEs.
|
/* $Id: parse.c,v 1.2 2019/03/26 20:54:43 schwarze Exp $ */
/*
* Copyright (c) 2014 Kristaps Dzonsons <kristaps@bsd.lv>
* Copyright (c) 2019 Ingo Schwarze <schwarze@openbsd.org>
*
* Permission to use, copy, modify, and distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
* copyright notice and this permission notice appear in all copies.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/
#include <assert.h>
#include <ctype.h>
#include <expat.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include "node.h"
#include "parse.h"
/*
* The implementation of the DocBook parser.
*/
/*
* Global parse state.
* Keep this as simple and small as possible.
*/
struct parse {
XML_Parser xml; /* Expat parser handle, created in parse_alloc(). */
const char *fname; /* Name of the input file. */
struct ptree *tree; /* Complete parse result. */
struct pnode *cur; /* Current node in the tree. */
int warn; /* If non-zero, print warnings to stderr. */
};
/*
 * One entry of the element lookup table:
 * associates a DocBook element name with a node type.
 */
struct element {
const char *name; /* DocBook element name. */
enum nodeid node; /* Node type to generate. */
};
/*
 * Lookup table from DocBook element names to node types.
 * xml_elem_start() and xml_elem_end() scan it linearly with strcmp(),
 * stopping at the final entry, whose name is NULL.
 * Several element names (chapter, part, refsect1, refsect2, refsect3,
 * refsection, sect1, sect2, section) share the node type NODE_SECTION.
 * An entry with NODE_WARN is parsed without generating a node;
 * a warning is printed for it if warnings are enabled.
 */
static const struct element elements[] = {
{ "acronym", NODE_ACRONYM },
{ "affiliation", NODE_AFFILIATION },
{ "anchor", NODE_ANCHOR },
{ "application", NODE_APPLICATION },
{ "arg", NODE_ARG },
{ "author", NODE_AUTHOR },
{ "authorgroup", NODE_AUTHORGROUP },
{ "blockquote", NODE_BLOCKQUOTE },
{ "book", NODE_BOOK },
{ "bookinfo", NODE_BOOKINFO },
{ "caution", NODE_CAUTION },
{ "chapter", NODE_SECTION },
{ "citerefentry", NODE_CITEREFENTRY },
{ "citetitle", NODE_CITETITLE },
{ "cmdsynopsis", NODE_CMDSYNOPSIS },
{ "code", NODE_CODE },
{ "colspec", NODE_COLSPEC },
{ "command", NODE_COMMAND },
{ "constant", NODE_CONSTANT },
{ "copyright", NODE_COPYRIGHT },
{ "date", NODE_DATE },
{ "editor", NODE_EDITOR },
{ "email", NODE_EMAIL },
{ "emphasis", NODE_EMPHASIS },
{ "entry", NODE_ENTRY },
{ "envar", NODE_ENVAR },
{ "fieldsynopsis", NODE_FIELDSYNOPSIS },
{ "filename", NODE_FILENAME },
{ "firstname", NODE_FIRSTNAME },
{ "firstterm", NODE_FIRSTTERM },
{ "footnote", NODE_FOOTNOTE },
{ "funcdef", NODE_FUNCDEF },
{ "funcprototype", NODE_FUNCPROTOTYPE },
{ "funcsynopsis", NODE_FUNCSYNOPSIS },
{ "funcsynopsisinfo", NODE_FUNCSYNOPSISINFO },
{ "function", NODE_FUNCTION },
{ "glossterm", NODE_GLOSSTERM },
{ "group", NODE_GROUP },
{ "holder", NODE_HOLDER },
{ "index", NODE_INDEX },
{ "indexterm", NODE_INDEXTERM },
{ "info", NODE_INFO },
{ "informalequation", NODE_INFORMALEQUATION },
{ "informaltable", NODE_INFORMALTABLE },
{ "inlineequation", NODE_INLINEEQUATION },
{ "itemizedlist", NODE_ITEMIZEDLIST },
{ "keysym", NODE_KEYSYM },
{ "legalnotice", NODE_LEGALNOTICE },
{ "link", NODE_LINK },
{ "listitem", NODE_LISTITEM },
{ "literal", NODE_LITERAL },
{ "literallayout", NODE_LITERALLAYOUT },
{ "manvolnum", NODE_MANVOLNUM },
{ "member", NODE_MEMBER },
{ "mml:math", NODE_MML_MATH },
{ "mml:mfenced", NODE_MML_MFENCED },
{ "mml:mfrac", NODE_MML_MFRAC },
{ "mml:mi", NODE_MML_MI },
{ "mml:mn", NODE_MML_MN },
{ "mml:mo", NODE_MML_MO },
{ "mml:mrow", NODE_MML_MROW },
{ "mml:msub", NODE_MML_MSUB },
{ "mml:msup", NODE_MML_MSUP },
{ "modifier", NODE_MODIFIER },
{ "note", NODE_NOTE },
{ "option", NODE_OPTION },
{ "orderedlist", NODE_ORDEREDLIST },
{ "orgname", NODE_ORGNAME },
{ "othername", NODE_OTHERNAME },
{ "para", NODE_PARA },
{ "paramdef", NODE_PARAMDEF },
{ "parameter", NODE_PARAMETER },
{ "part", NODE_SECTION },
{ "personname", NODE_PERSONNAME },
{ "phrase", NODE_PHRASE },
{ "preface", NODE_PREFACE },
{ "primary", NODE_PRIMARY },
{ "programlisting", NODE_PROGRAMLISTING },
{ "prompt", NODE_PROMPT },
{ "quote", NODE_QUOTE },
{ "refclass", NODE_REFCLASS },
{ "refdescriptor", NODE_REFDESCRIPTOR },
{ "refentry", NODE_REFENTRY },
{ "refentryinfo", NODE_REFENTRYINFO },
{ "refentrytitle", NODE_REFENTRYTITLE },
{ "refmeta", NODE_REFMETA },
{ "refmetainfo", NODE_REFMETAINFO },
{ "refmiscinfo", NODE_REFMISCINFO },
{ "refname", NODE_REFNAME },
{ "refnamediv", NODE_REFNAMEDIV },
{ "refpurpose", NODE_REFPURPOSE },
{ "refsect1", NODE_SECTION },
{ "refsect2", NODE_SECTION },
{ "refsect3", NODE_SECTION },
{ "refsection", NODE_SECTION },
{ "refsynopsisdiv", NODE_REFSYNOPSISDIV },
{ "releaseinfo", NODE_RELEASEINFO },
{ "replaceable", NODE_REPLACEABLE },
{ "row", NODE_ROW },
{ "sbr", NODE_SBR },
{ "screen", NODE_SCREEN },
{ "secondary", NODE_SECONDARY },
{ "sect1", NODE_SECTION },
{ "sect2", NODE_SECTION },
{ "section", NODE_SECTION },
{ "sgmltag", NODE_SGMLTAG },
{ "simplelist", NODE_SIMPLELIST },
{ "spanspec", NODE_SPANSPEC },
{ "structname", NODE_STRUCTNAME },
{ "subtitle", NODE_SUBTITLE },
{ "surname", NODE_SURNAME },
{ "synopsis", NODE_SYNOPSIS },
{ "table", NODE_TABLE },
{ "tbody", NODE_TBODY },
{ "term", NODE_TERM },
{ "tfoot", NODE_TFOOT },
{ "tgroup", NODE_TGROUP },
{ "thead", NODE_THEAD },
{ "tip", NODE_TIP },
{ "title", NODE_TITLE },
{ "trademark", NODE_TRADEMARK },
{ "type", NODE_TYPE },
{ "ulink", NODE_ULINK },
{ "userinput", NODE_USERINPUT },
{ "variablelist", NODE_VARIABLELIST },
{ "varlistentry", NODE_VARLISTENTRY },
{ "varname", NODE_VARNAME },
{ "warning", NODE_WARNING },
{ "wordasword", NODE_WORDASWORD },
{ "xi:include", NODE_WARN }, /* Parsed, but no node is generated. */
{ "year", NODE_YEAR },
{ NULL, NODE__MAX } /* Sentinel terminating the table. */
};
/*
 * Expat character data handler: process a string of characters.
 * If a text node is already open, append to it.
 * Otherwise, create a new one as a child of the current node.
 *
 * arg is the struct parse registered with XML_SetUserData(),
 * p points to sz bytes of character data (not NUL-terminated).
 * Allocation failure is fatal: the error is reported with perror()
 * and the process exits with status 1.
 */
static void
xml_char(void *arg, const XML_Char *p, int sz)
{
	struct parse	*ps;
	struct pnode	*dat;
	int		 i;

	ps = arg;

	/*
	 * Test the TREE_FAIL bit only: other flags, such as TREE_EQN,
	 * may be set during a successful parse and must not cause
	 * text to be discarded.
	 */
	if (ps->tree->flags & TREE_FAIL)
		return;

	/*
	 * Without a current node there is nothing to attach text to.
	 * This happens for text found inside a top-level element
	 * that did not generate a node (NODE_IGNORE or NODE_WARN).
	 */
	if (ps->cur == NULL)
		return;

	/*
	 * Only create a new node if there is non-whitespace text.
	 * Strip all leading whitespace.
	 */
	if (ps->cur->node != NODE_TEXT) {
		for (i = 0; i < sz; i++)
			if (isspace((unsigned char)p[i]) == 0)
				break;
		if (i == sz)
			return;
		p += i;
		sz -= i;
		if ((dat = calloc(1, sizeof(*dat))) == NULL) {
			perror(NULL);
			exit(1);
		}
		dat->node = NODE_TEXT;
		dat->parent = ps->cur;
		TAILQ_INIT(&dat->childq);
		TAILQ_INIT(&dat->attrq);
		TAILQ_INSERT_TAIL(&ps->cur->childq, dat, child);
		ps->cur = dat;
	}

	/*
	 * Append to the current text node.
	 * One extra byte is reserved for the terminating NUL.
	 * Assigning the realloc() result directly is safe here
	 * because failure terminates the program.
	 */
	assert(sz >= 0);
	ps->cur->b = realloc(ps->cur->b, ps->cur->bsz + sz + 1);
	if (ps->cur->b == NULL) {
		perror(NULL);
		exit(1);
	}
	memcpy(ps->cur->b + ps->cur->bsz, p, sz);
	ps->cur->bsz += sz;
	ps->cur->b[ps->cur->bsz] = '\0';

	/* realloc() may have moved the buffer, so point real at it again. */
	ps->cur->real = ps->cur->b;
}
/*
 * Remove trailing whitespace from the buffer of a text node.
 * Each removed byte is overwritten with NUL and the stored
 * length is decremented, so the buffer stays NUL-terminated.
 */
static void
pnode_trim(struct pnode *pn)
{
	assert(pn->node == NODE_TEXT);

	while (pn->bsz > 0 &&
	    isspace((unsigned char)pn->b[pn->bsz - 1]) != 0)
		pn->b[--pn->bsz] = '\0';
}
/*
 * Expat start tag handler: begin an element.
 * If the name is unknown, report it and flag the tree as failed;
 * all handlers then ignore the remaining input.
 * Elements mapped to NODE_IGNORE or NODE_WARN do not generate a node;
 * for NODE_WARN, a warning is printed if warnings are enabled.
 * Otherwise, a new node is appended to the children of the current
 * node and becomes the current node; the first node created
 * becomes the root of the tree.
 *
 * arg is the struct parse registered with XML_SetUserData(),
 * name is the element name, and atts is a NULL-terminated array
 * of alternating attribute names and values.
 * Allocation failure is fatal: the error is reported with perror()
 * and the process exits with status 1.
 */
static void
xml_elem_start(void *arg, const XML_Char *name, const XML_Char **atts)
{
	struct parse		*ps;
	const struct element	*elem;
	enum attrkey		 key;
	struct pnode		*dat;
	struct pattr		*pattr;
	const XML_Char		**att;

	ps = arg;

	/*
	 * Test the TREE_FAIL bit only: TREE_EQN is set below during
	 * a successful parse and must not stop further processing.
	 */
	if (ps->tree->flags & TREE_FAIL)
		return;

	/* Close out the text node, if there is one. */
	if (ps->cur != NULL && ps->cur->node == NODE_TEXT) {
		pnode_trim(ps->cur);
		ps->cur = ps->cur->parent;
	}

	for (elem = elements; elem->name != NULL; elem++)
		if (strcmp(elem->name, name) == 0)
			break;

	/*
	 * Expat reports positions as XML_Size, which need not be
	 * the same type as size_t; convert to match the %zu format.
	 */
	if (elem->name == NULL) {
		fprintf(stderr, "%s:%zu:%zu: unknown element \"%s\"\n",
		    ps->fname,
		    (size_t)XML_GetCurrentLineNumber(ps->xml),
		    (size_t)XML_GetCurrentColumnNumber(ps->xml), name);
		ps->tree->flags |= TREE_FAIL;
		return;
	}

	switch (elem->node) {
	case NODE_WARN:
		if (ps->warn)
			fprintf(stderr, "%s:%zu:%zu: warning: "
			    "ignoring element <%s>\n", ps->fname,
			    (size_t)XML_GetCurrentLineNumber(ps->xml),
			    (size_t)XML_GetCurrentColumnNumber(ps->xml),
			    name);
		/* FALLTHROUGH */
	case NODE_IGNORE:
		return;
	case NODE_INLINEEQUATION:
		ps->tree->flags |= TREE_EQN;
		break;
	default:
		break;
	}

	if ((dat = calloc(1, sizeof(*dat))) == NULL) {
		perror(NULL);
		exit(1);
	}
	dat->node = elem->node;
	dat->parent = ps->cur;
	TAILQ_INIT(&dat->childq);
	TAILQ_INIT(&dat->attrq);

	if (ps->cur != NULL)
		TAILQ_INSERT_TAIL(&ps->cur->childq, dat, child);

	ps->cur = dat;
	if (ps->tree->root == NULL)
		ps->tree->root = dat;

	/*
	 * Process attributes.
	 * Unknown attribute names are skipped, with a warning if enabled.
	 * A value that is not a known keyword is kept as a raw string copy.
	 */
	for (att = atts; *att != NULL; att += 2) {
		if ((key = attrkey_parse(*att)) == ATTRKEY__MAX) {
			if (ps->warn)
				fprintf(stderr, "%s:%zu:%zu: warning: "
				    "unknown attribute \"%s\"\n",
				    ps->fname,
				    (size_t)XML_GetCurrentLineNumber(ps->xml),
				    (size_t)XML_GetCurrentColumnNumber(ps->xml),
				    *att);
			continue;
		}
		if ((pattr = calloc(1, sizeof(*pattr))) == NULL) {
			perror(NULL);
			exit(1);
		}
		pattr->key = key;
		if ((pattr->val = attrval_parse(att[1])) == ATTRVAL__MAX) {
			if ((pattr->rawval = strdup(att[1])) == NULL) {
				perror(NULL);
				exit(1);
			}
		}
		TAILQ_INSERT_TAIL(&dat->attrq, pattr, child);
	}
}
/*
 * Expat end tag handler: roll up the parse tree.
 * If we're at a text node, roll that one up first.
 * Elements mapped to NODE_IGNORE or NODE_WARN never generated a node
 * in xml_elem_start(), so the current node is left alone for them.
 *
 * arg is the struct parse registered with XML_SetUserData(),
 * name is the name of the element being closed.
 */
static void
xml_elem_end(void *arg, const XML_Char *name)
{
	struct parse		*ps;
	const struct element	*elem;

	ps = arg;

	/*
	 * Test the TREE_FAIL bit only: other flags, such as TREE_EQN,
	 * must not prevent elements from being closed.
	 */
	if (ps->tree->flags & TREE_FAIL)
		return;

	/*
	 * Close out the text node, if there is one.
	 * The current node is NULL while inside a top-level element
	 * that did not generate a node.
	 */
	if (ps->cur != NULL && ps->cur->node == NODE_TEXT) {
		pnode_trim(ps->cur);
		ps->cur = ps->cur->parent;
	}

	for (elem = elements; elem->name != NULL; elem++)
		if (strcmp(elem->name, name) == 0)
			break;

	switch (elem->node) {
	case NODE_IGNORE:
	case NODE_WARN:
		break;
	default:
		/* The element being closed must be the current node. */
		assert(ps->cur != NULL && elem->node == ps->cur->node);
		ps->cur = ps->cur->parent;
		break;
	}
}
/*
 * Allocate a parser object together with an empty parse tree
 * and an Expat parser that has the handlers of this file installed.
 * The warn argument is stored as the warning flag of the parser.
 * Returns NULL if any of the allocations fails; in that case,
 * everything allocated so far has been freed again.
 */
struct parse *
parse_alloc(int warn)
{
	struct parse	*ps;

	ps = calloc(1, sizeof(*ps));
	if (ps == NULL)
		return NULL;

	/* ps->xml remains NULL from calloc() if the tree is missing. */
	ps->tree = calloc(1, sizeof(*ps->tree));
	if (ps->tree != NULL)
		ps->xml = XML_ParserCreate(NULL);

	if (ps->xml == NULL) {
		free(ps->tree);
		free(ps);
		return NULL;
	}

	ps->warn = warn;
	XML_SetCharacterDataHandler(ps->xml, xml_char);
	XML_SetElementHandler(ps->xml, xml_elem_start, xml_elem_end);
	XML_SetUserData(ps->xml, ps);
	return ps;
}
void
parse_free(struct parse *p)
{
if (p == NULL)
return;
XML_ParserFree(p->xml);
if (p->tree != NULL) {
pnode_unlink(p->tree->root);
free(p->tree);
}
free(p);
}
/*
 * Read the file descriptor fd until end of file and feed the data
 * to the Expat parser in chunks; fname is only used for messages.
 *
 * On a read error, the error is reported with perror(), the nodes
 * parsed so far are discarded, TREE_FAIL is set, and NULL is returned.
 * On an XML error, a message with the position is printed, TREE_FAIL
 * is set, reading stops, and the tree is still returned.
 * The returned tree remains owned by the parser object.
 */
struct ptree *
parse_file(struct parse *p, int fd, const char *fname)
{
	char	 b[4096];
	ssize_t	 ssz;

	p->fname = fname;
	do {
		if ((ssz = read(fd, b, sizeof(b))) < 0) {
			perror(fname);
			pnode_unlink(p->tree->root);
			p->tree->root = p->cur = NULL;
			p->tree->flags |= TREE_FAIL;
			return NULL;
		}

		/* A zero-length read marks the final chunk for Expat. */
		if (XML_Parse(p->xml, b, ssz, ssz == 0) == 0) {
			/*
			 * Expat reports positions as XML_Size, which need
			 * not be the same type as size_t; convert to match
			 * the %zu format.
			 */
			fprintf(stderr, "%s:%zu:%zu: %s\n", fname,
			    (size_t)XML_GetCurrentLineNumber(p->xml),
			    (size_t)XML_GetCurrentColumnNumber(p->xml),
			    XML_ErrorString(XML_GetErrorCode(p->xml)));
			p->tree->flags |= TREE_FAIL;
		}
	} while (ssz > 0 && (p->tree->flags & TREE_FAIL) == 0);

	return p->tree;
}