Annotation of docbook2mdoc/parse.c, Revision 1.1
1.1 ! schwarze 1: /* $Id$ */
! 2: /*
! 3: * Copyright (c) 2014 Kristaps Dzonsons <kristaps@bsd.lv>
! 4: * Copyright (c) 2019 Ingo Schwarze <schwarze@openbsd.org>
! 5: *
! 6: * Permission to use, copy, modify, and distribute this software for any
! 7: * purpose with or without fee is hereby granted, provided that the above
! 8: * copyright notice and this permission notice appear in all copies.
! 9: *
! 10: * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
! 11: * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
! 12: * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
! 13: * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
! 14: * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
! 15: * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
! 16: * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
! 17: */
! 18: #include <assert.h>
! 19: #include <ctype.h>
! 20: #include <expat.h>
! 21: #include <stdio.h>
! 22: #include <string.h>
! 23: #include <unistd.h>
! 24:
! 25: #include "node.h"
! 26: #include "parse.h"
! 27:
! 28: /*
! 29: * The implementation of the DocBook parser.
! 30: */
! 31:
! 32: /*
! 33: * Global parse state.
! 34: * Keep this as simple and small as possible.
! 35: */
! 36: struct parse {
! 37: XML_Parser xml;
! 38: const char *fname; /* Name of the input file. */
! 39: struct ptree *tree; /* Complete parse result. */
! 40: struct pnode *cur; /* Current node in the tree. */
! 41: int warn;
! 42: };
! 43:
! 44: struct element {
! 45: const char *name; /* DocBook element name. */
! 46: enum nodeid node; /* Node type to generate. */
! 47: };
! 48:
! 49: static const struct element elements[] = {
! 50: { "acronym", NODE_ACRONYM },
! 51: { "affiliation", NODE_AFFILIATION },
! 52: { "anchor", NODE_ANCHOR },
! 53: { "application", NODE_APPLICATION },
! 54: { "arg", NODE_ARG },
! 55: { "author", NODE_AUTHOR },
! 56: { "authorgroup", NODE_AUTHORGROUP },
! 57: { "blockquote", NODE_BLOCKQUOTE },
! 58: { "book", NODE_BOOK },
! 59: { "bookinfo", NODE_BOOKINFO },
! 60: { "caution", NODE_CAUTION },
! 61: { "chapter", NODE_SECTION },
! 62: { "citerefentry", NODE_CITEREFENTRY },
! 63: { "citetitle", NODE_CITETITLE },
! 64: { "cmdsynopsis", NODE_CMDSYNOPSIS },
! 65: { "code", NODE_CODE },
! 66: { "colspec", NODE_COLSPEC },
! 67: { "command", NODE_COMMAND },
! 68: { "constant", NODE_CONSTANT },
! 69: { "copyright", NODE_COPYRIGHT },
! 70: { "date", NODE_DATE },
! 71: { "editor", NODE_EDITOR },
! 72: { "email", NODE_EMAIL },
! 73: { "emphasis", NODE_EMPHASIS },
! 74: { "entry", NODE_ENTRY },
! 75: { "envar", NODE_ENVAR },
! 76: { "fieldsynopsis", NODE_FIELDSYNOPSIS },
! 77: { "filename", NODE_FILENAME },
! 78: { "firstname", NODE_FIRSTNAME },
! 79: { "firstterm", NODE_FIRSTTERM },
! 80: { "footnote", NODE_FOOTNOTE },
! 81: { "funcdef", NODE_FUNCDEF },
! 82: { "funcprototype", NODE_FUNCPROTOTYPE },
! 83: { "funcsynopsis", NODE_FUNCSYNOPSIS },
! 84: { "funcsynopsisinfo", NODE_FUNCSYNOPSISINFO },
! 85: { "function", NODE_FUNCTION },
! 86: { "glossterm", NODE_GLOSSTERM },
! 87: { "group", NODE_GROUP },
! 88: { "holder", NODE_HOLDER },
! 89: { "index", NODE_INDEX },
! 90: { "indexterm", NODE_INDEXTERM },
! 91: { "info", NODE_INFO },
! 92: { "informalequation", NODE_INFORMALEQUATION },
! 93: { "informaltable", NODE_INFORMALTABLE },
! 94: { "inlineequation", NODE_INLINEEQUATION },
! 95: { "itemizedlist", NODE_ITEMIZEDLIST },
! 96: { "keysym", NODE_KEYSYM },
! 97: { "legalnotice", NODE_LEGALNOTICE },
! 98: { "link", NODE_LINK },
! 99: { "listitem", NODE_LISTITEM },
! 100: { "literal", NODE_LITERAL },
! 101: { "literallayout", NODE_LITERALLAYOUT },
! 102: { "manvolnum", NODE_MANVOLNUM },
! 103: { "member", NODE_MEMBER },
! 104: { "mml:math", NODE_MML_MATH },
! 105: { "mml:mfenced", NODE_MML_MFENCED },
! 106: { "mml:mfrac", NODE_MML_MFRAC },
! 107: { "mml:mi", NODE_MML_MI },
! 108: { "mml:mn", NODE_MML_MN },
! 109: { "mml:mo", NODE_MML_MO },
! 110: { "mml:mrow", NODE_MML_MROW },
! 111: { "mml:msub", NODE_MML_MSUB },
! 112: { "mml:msup", NODE_MML_MSUP },
! 113: { "modifier", NODE_MODIFIER },
! 114: { "note", NODE_NOTE },
! 115: { "option", NODE_OPTION },
! 116: { "orderedlist", NODE_ORDEREDLIST },
! 117: { "orgname", NODE_ORGNAME },
! 118: { "othername", NODE_OTHERNAME },
! 119: { "para", NODE_PARA },
! 120: { "paramdef", NODE_PARAMDEF },
! 121: { "parameter", NODE_PARAMETER },
! 122: { "part", NODE_SECTION },
! 123: { "personname", NODE_PERSONNAME },
! 124: { "phrase", NODE_PHRASE },
! 125: { "preface", NODE_PREFACE },
! 126: { "primary", NODE_PRIMARY },
! 127: { "programlisting", NODE_PROGRAMLISTING },
! 128: { "prompt", NODE_PROMPT },
! 129: { "quote", NODE_QUOTE },
! 130: { "refclass", NODE_REFCLASS },
! 131: { "refdescriptor", NODE_REFDESCRIPTOR },
! 132: { "refentry", NODE_REFENTRY },
! 133: { "refentryinfo", NODE_REFENTRYINFO },
! 134: { "refentrytitle", NODE_REFENTRYTITLE },
! 135: { "refmeta", NODE_REFMETA },
! 136: { "refmetainfo", NODE_REFMETAINFO },
! 137: { "refmiscinfo", NODE_REFMISCINFO },
! 138: { "refname", NODE_REFNAME },
! 139: { "refnamediv", NODE_REFNAMEDIV },
! 140: { "refpurpose", NODE_REFPURPOSE },
! 141: { "refsect1", NODE_SECTION },
! 142: { "refsect2", NODE_SECTION },
! 143: { "refsect3", NODE_SECTION },
! 144: { "refsection", NODE_SECTION },
! 145: { "refsynopsisdiv", NODE_REFSYNOPSISDIV },
! 146: { "releaseinfo", NODE_RELEASEINFO },
! 147: { "replaceable", NODE_REPLACEABLE },
! 148: { "row", NODE_ROW },
! 149: { "sbr", NODE_SBR },
! 150: { "screen", NODE_SCREEN },
! 151: { "secondary", NODE_SECONDARY },
! 152: { "sect1", NODE_SECTION },
! 153: { "sect2", NODE_SECTION },
! 154: { "section", NODE_SECTION },
! 155: { "sgmltag", NODE_SGMLTAG },
! 156: { "simplelist", NODE_SIMPLELIST },
! 157: { "spanspec", NODE_SPANSPEC },
! 158: { "structname", NODE_STRUCTNAME },
! 159: { "subtitle", NODE_SUBTITLE },
! 160: { "surname", NODE_SURNAME },
! 161: { "synopsis", NODE_SYNOPSIS },
! 162: { "table", NODE_TABLE },
! 163: { "tbody", NODE_TBODY },
! 164: { "term", NODE_TERM },
! 165: { "tfoot", NODE_TFOOT },
! 166: { "tgroup", NODE_TGROUP },
! 167: { "thead", NODE_THEAD },
! 168: { "tip", NODE_TIP },
! 169: { "title", NODE_TITLE },
! 170: { "trademark", NODE_TRADEMARK },
! 171: { "type", NODE_TYPE },
! 172: { "ulink", NODE_ULINK },
! 173: { "userinput", NODE_USERINPUT },
! 174: { "variablelist", NODE_VARIABLELIST },
! 175: { "varlistentry", NODE_VARLISTENTRY },
! 176: { "varname", NODE_VARNAME },
! 177: { "warning", NODE_WARNING },
! 178: { "wordasword", NODE_WORDASWORD },
! 179: { "year", NODE_YEAR },
! 180: { NULL, NODE__MAX }
! 181: };
! 182:
! 183: /*
! 184: * Process a string of characters.
! 185: * If a text node is already open, append to it.
! 186: * Otherwise, create a new one as a child of the current node.
! 187: */
! 188: static void
! 189: xml_char(void *arg, const XML_Char *p, int sz)
! 190: {
! 191: struct parse *ps;
! 192: struct pnode *dat;
! 193: int i;
! 194:
! 195: ps = arg;
! 196: if (ps->tree->flags && TREE_FAIL)
! 197: return;
! 198:
! 199: /*
! 200: * Only create a new node if there is non-whitespace text.
! 201: * Strip all leading whitespace.
! 202: */
! 203: if (ps->cur->node != NODE_TEXT) {
! 204: for (i = 0; i < sz; i++)
! 205: if (isspace((unsigned char)p[i]) == 0)
! 206: break;
! 207: if (i == sz)
! 208: return;
! 209: p += i;
! 210: sz -= i;
! 211:
! 212: if ((dat = calloc(1, sizeof(*dat))) == NULL) {
! 213: perror(NULL);
! 214: exit(1);
! 215: }
! 216: dat->node = NODE_TEXT;
! 217: dat->parent = ps->cur;
! 218: TAILQ_INIT(&dat->childq);
! 219: TAILQ_INIT(&dat->attrq);
! 220: TAILQ_INSERT_TAIL(&ps->cur->childq, dat, child);
! 221: ps->cur = dat;
! 222: }
! 223:
! 224: /* Append to the current text node. */
! 225:
! 226: assert(sz >= 0);
! 227: ps->cur->b = realloc(ps->cur->b, ps->cur->bsz + sz + 1);
! 228: if (ps->cur->b == NULL) {
! 229: perror(NULL);
! 230: exit(1);
! 231: }
! 232: memcpy(ps->cur->b + ps->cur->bsz, p, sz);
! 233: ps->cur->bsz += sz;
! 234: ps->cur->b[ps->cur->bsz] = '\0';
! 235: ps->cur->real = ps->cur->b;
! 236: }
! 237:
! 238: static void
! 239: pnode_trim(struct pnode *pn)
! 240: {
! 241: assert(pn->node == NODE_TEXT);
! 242: for (; pn->bsz > 0; pn->b[--pn->bsz] = '\0')
! 243: if (isspace((unsigned char)pn->b[pn->bsz - 1]) == 0)
! 244: break;
! 245: }
! 246:
! 247: /*
! 248: * Begin an element.
! 249: * If the name is unknown, abort parsing.
! 250: */
! 251: static void
! 252: xml_elem_start(void *arg, const XML_Char *name, const XML_Char **atts)
! 253: {
! 254: struct parse *ps;
! 255: const struct element *elem;
! 256: enum attrkey key;
! 257: struct pnode *dat;
! 258: struct pattr *pattr;
! 259: const XML_Char **att;
! 260:
! 261: ps = arg;
! 262: if (ps->tree->flags && TREE_FAIL)
! 263: return;
! 264:
! 265: /* FIXME: find a better way to ditch other namespaces. */
! 266: if (strcmp(name, "xi:include") == 0)
! 267: return;
! 268:
! 269: /* Close out the text node, if there is one. */
! 270: if (ps->cur != NULL && ps->cur->node == NODE_TEXT) {
! 271: pnode_trim(ps->cur);
! 272: ps->cur = ps->cur->parent;
! 273: }
! 274:
! 275: for (elem = elements; elem->name != NULL; elem++)
! 276: if (strcmp(elem->name, name) == 0)
! 277: break;
! 278:
! 279: if (elem->name == NULL) {
! 280: fprintf(stderr, "%s:%zu:%zu: unknown element \"%s\"\n",
! 281: ps->fname, XML_GetCurrentLineNumber(ps->xml),
! 282: XML_GetCurrentColumnNumber(ps->xml), name);
! 283: ps->tree->flags |= TREE_FAIL;
! 284: return;
! 285: }
! 286:
! 287: if (elem->node == NODE_INLINEEQUATION)
! 288: ps->tree->flags |= TREE_EQN;
! 289:
! 290: if ((dat = calloc(1, sizeof(*dat))) == NULL) {
! 291: perror(NULL);
! 292: exit(1);
! 293: }
! 294: dat->node = elem->node;
! 295: dat->parent = ps->cur;
! 296: TAILQ_INIT(&dat->childq);
! 297: TAILQ_INIT(&dat->attrq);
! 298:
! 299: if (ps->cur != NULL)
! 300: TAILQ_INSERT_TAIL(&ps->cur->childq, dat, child);
! 301:
! 302: ps->cur = dat;
! 303: if (ps->tree->root == NULL)
! 304: ps->tree->root = dat;
! 305:
! 306: /*
! 307: * Process attributes.
! 308: */
! 309: for (att = atts; *att != NULL; att += 2) {
! 310: if ((key = attrkey_parse(*att)) == ATTRKEY__MAX) {
! 311: if (ps->warn)
! 312: fprintf(stderr, "%s:%zu:%zu: warning: "
! 313: "unknown attribute \"%s\"\n",
! 314: ps->fname,
! 315: XML_GetCurrentLineNumber(ps->xml),
! 316: XML_GetCurrentColumnNumber(ps->xml),
! 317: *att);
! 318: continue;
! 319: }
! 320: pattr = calloc(1, sizeof(*pattr));
! 321: pattr->key = key;
! 322: if ((pattr->val = attrval_parse(att[1])) == ATTRVAL__MAX)
! 323: pattr->rawval = strdup(att[1]);
! 324: TAILQ_INSERT_TAIL(&dat->attrq, pattr, child);
! 325: }
! 326: }
! 327:
! 328: /*
! 329: * Roll up the parse tree.
! 330: * If we're at a text node, roll that one up first.
! 331: */
! 332: static void
! 333: xml_elem_end(void *arg, const XML_Char *name)
! 334: {
! 335: struct parse *ps;
! 336:
! 337: ps = arg;
! 338: if (ps->tree->flags && TREE_FAIL)
! 339: return;
! 340:
! 341: /* FIXME: find a better way to ditch other namespaces. */
! 342: if (strcmp(name, "xi:include") == 0)
! 343: return;
! 344:
! 345: /* Close out the text node, if there is one. */
! 346: if (ps->cur->node == NODE_TEXT) {
! 347: pnode_trim(ps->cur);
! 348: ps->cur = ps->cur->parent;
! 349: }
! 350: ps->cur = ps->cur->parent;
! 351: }
! 352:
! 353: struct parse *
! 354: parse_alloc(int warn)
! 355: {
! 356: struct parse *p;
! 357:
! 358: if ((p = calloc(1, sizeof(*p))) == NULL)
! 359: return NULL;
! 360:
! 361: if ((p->tree = calloc(1, sizeof(*p->tree))) == NULL) {
! 362: free(p);
! 363: return NULL;
! 364: }
! 365:
! 366: if ((p->xml = XML_ParserCreate(NULL)) == NULL) {
! 367: free(p->tree);
! 368: free(p);
! 369: return NULL;
! 370: }
! 371: p->warn = warn;
! 372: XML_SetCharacterDataHandler(p->xml, xml_char);
! 373: XML_SetElementHandler(p->xml, xml_elem_start, xml_elem_end);
! 374: XML_SetUserData(p->xml, p);
! 375: return p;
! 376: }
! 377:
! 378: void
! 379: parse_free(struct parse *p)
! 380: {
! 381: if (p == NULL)
! 382: return;
! 383: XML_ParserFree(p->xml);
! 384: if (p->tree != NULL) {
! 385: pnode_unlink(p->tree->root);
! 386: free(p->tree);
! 387: }
! 388: free(p);
! 389: }
! 390:
! 391: struct ptree *
! 392: parse_file(struct parse *p, int fd, const char *fname)
! 393: {
! 394: char b[4096];
! 395: ssize_t ssz;
! 396:
! 397: p->fname = fname;
! 398: do {
! 399: if ((ssz = read(fd, b, sizeof(b))) < 0) {
! 400: perror(fname);
! 401: pnode_unlink(p->tree->root);
! 402: p->tree->root = p->cur = NULL;
! 403: p->tree->flags |= TREE_FAIL;
! 404: return NULL;
! 405: }
! 406: if (XML_Parse(p->xml, b, ssz, ssz == 0) == 0) {
! 407: fprintf(stderr, "%s:%zu:%zu: %s\n", fname,
! 408: XML_GetCurrentLineNumber(p->xml),
! 409: XML_GetCurrentColumnNumber(p->xml),
! 410: XML_ErrorString(XML_GetErrorCode(p->xml)));
! 411: p->tree->flags |= TREE_FAIL;
! 412: }
! 413: } while (ssz > 0 && (p->tree->flags & TREE_FAIL) == 0);
! 414: return p->tree;
! 415: }
CVSweb