Annotation of docbook2mdoc/parse.c, Revision 1.4
1.4 ! schwarze 1: /* $Id: parse.c,v 1.3 2019/03/26 21:52:09 schwarze Exp $ */
1.1 schwarze 2: /*
3: * Copyright (c) 2014 Kristaps Dzonsons <kristaps@bsd.lv>
4: * Copyright (c) 2019 Ingo Schwarze <schwarze@openbsd.org>
5: *
6: * Permission to use, copy, modify, and distribute this software for any
7: * purpose with or without fee is hereby granted, provided that the above
8: * copyright notice and this permission notice appear in all copies.
9: *
10: * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
11: * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12: * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
13: * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14: * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15: * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16: * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17: */
18: #include <assert.h>
19: #include <ctype.h>
20: #include <expat.h>
21: #include <stdio.h>
22: #include <string.h>
23: #include <unistd.h>
24:
25: #include "node.h"
26: #include "parse.h"
27:
28: /*
29: * The implementation of the DocBook parser.
30: */
31:
32: /*
33: * Global parse state.
34: * Keep this as simple and small as possible.
35: */
36: struct parse {
37: XML_Parser xml;
38: const char *fname; /* Name of the input file. */
39: struct ptree *tree; /* Complete parse result. */
40: struct pnode *cur; /* Current node in the tree. */
1.4 ! schwarze 41: int del; /* Levels of nested nodes being deleted. */
1.1 schwarze 42: int warn;
43: };
44:
45: struct element {
46: const char *name; /* DocBook element name. */
47: enum nodeid node; /* Node type to generate. */
48: };
49:
50: static const struct element elements[] = {
1.3 schwarze 51: { "acronym", NODE_IGNORE },
1.1 schwarze 52: { "affiliation", NODE_AFFILIATION },
1.4 ! schwarze 53: { "anchor", NODE_DELETE },
1.1 schwarze 54: { "application", NODE_APPLICATION },
55: { "arg", NODE_ARG },
56: { "author", NODE_AUTHOR },
57: { "authorgroup", NODE_AUTHORGROUP },
58: { "blockquote", NODE_BLOCKQUOTE },
59: { "book", NODE_BOOK },
60: { "bookinfo", NODE_BOOKINFO },
61: { "caution", NODE_CAUTION },
62: { "chapter", NODE_SECTION },
63: { "citerefentry", NODE_CITEREFENTRY },
64: { "citetitle", NODE_CITETITLE },
65: { "cmdsynopsis", NODE_CMDSYNOPSIS },
66: { "code", NODE_CODE },
67: { "colspec", NODE_COLSPEC },
68: { "command", NODE_COMMAND },
69: { "constant", NODE_CONSTANT },
70: { "copyright", NODE_COPYRIGHT },
71: { "date", NODE_DATE },
72: { "editor", NODE_EDITOR },
73: { "email", NODE_EMAIL },
74: { "emphasis", NODE_EMPHASIS },
75: { "entry", NODE_ENTRY },
76: { "envar", NODE_ENVAR },
77: { "fieldsynopsis", NODE_FIELDSYNOPSIS },
78: { "filename", NODE_FILENAME },
1.3 schwarze 79: { "firstname", NODE_IGNORE },
1.1 schwarze 80: { "firstterm", NODE_FIRSTTERM },
81: { "footnote", NODE_FOOTNOTE },
82: { "funcdef", NODE_FUNCDEF },
83: { "funcprototype", NODE_FUNCPROTOTYPE },
84: { "funcsynopsis", NODE_FUNCSYNOPSIS },
85: { "funcsynopsisinfo", NODE_FUNCSYNOPSISINFO },
86: { "function", NODE_FUNCTION },
87: { "glossterm", NODE_GLOSSTERM },
88: { "group", NODE_GROUP },
89: { "holder", NODE_HOLDER },
90: { "index", NODE_INDEX },
1.4 ! schwarze 91: { "indexterm", NODE_DELETE },
1.1 schwarze 92: { "info", NODE_INFO },
93: { "informalequation", NODE_INFORMALEQUATION },
94: { "informaltable", NODE_INFORMALTABLE },
95: { "inlineequation", NODE_INLINEEQUATION },
96: { "itemizedlist", NODE_ITEMIZEDLIST },
97: { "keysym", NODE_KEYSYM },
98: { "legalnotice", NODE_LEGALNOTICE },
99: { "link", NODE_LINK },
100: { "listitem", NODE_LISTITEM },
101: { "literal", NODE_LITERAL },
102: { "literallayout", NODE_LITERALLAYOUT },
103: { "manvolnum", NODE_MANVOLNUM },
104: { "member", NODE_MEMBER },
105: { "mml:math", NODE_MML_MATH },
106: { "mml:mfenced", NODE_MML_MFENCED },
107: { "mml:mfrac", NODE_MML_MFRAC },
108: { "mml:mi", NODE_MML_MI },
109: { "mml:mn", NODE_MML_MN },
110: { "mml:mo", NODE_MML_MO },
111: { "mml:mrow", NODE_MML_MROW },
112: { "mml:msub", NODE_MML_MSUB },
113: { "mml:msup", NODE_MML_MSUP },
114: { "modifier", NODE_MODIFIER },
115: { "note", NODE_NOTE },
116: { "option", NODE_OPTION },
117: { "orderedlist", NODE_ORDEREDLIST },
118: { "orgname", NODE_ORGNAME },
1.3 schwarze 119: { "othername", NODE_IGNORE },
1.1 schwarze 120: { "para", NODE_PARA },
121: { "paramdef", NODE_PARAMDEF },
122: { "parameter", NODE_PARAMETER },
123: { "part", NODE_SECTION },
124: { "personname", NODE_PERSONNAME },
1.3 schwarze 125: { "phrase", NODE_IGNORE },
1.1 schwarze 126: { "preface", NODE_PREFACE },
1.4 ! schwarze 127: { "primary", NODE_DELETE },
1.1 schwarze 128: { "programlisting", NODE_PROGRAMLISTING },
129: { "prompt", NODE_PROMPT },
130: { "quote", NODE_QUOTE },
131: { "refclass", NODE_REFCLASS },
132: { "refdescriptor", NODE_REFDESCRIPTOR },
133: { "refentry", NODE_REFENTRY },
134: { "refentryinfo", NODE_REFENTRYINFO },
135: { "refentrytitle", NODE_REFENTRYTITLE },
136: { "refmeta", NODE_REFMETA },
137: { "refmetainfo", NODE_REFMETAINFO },
138: { "refmiscinfo", NODE_REFMISCINFO },
139: { "refname", NODE_REFNAME },
140: { "refnamediv", NODE_REFNAMEDIV },
141: { "refpurpose", NODE_REFPURPOSE },
142: { "refsect1", NODE_SECTION },
143: { "refsect2", NODE_SECTION },
144: { "refsect3", NODE_SECTION },
145: { "refsection", NODE_SECTION },
146: { "refsynopsisdiv", NODE_REFSYNOPSISDIV },
147: { "releaseinfo", NODE_RELEASEINFO },
148: { "replaceable", NODE_REPLACEABLE },
149: { "row", NODE_ROW },
150: { "sbr", NODE_SBR },
151: { "screen", NODE_SCREEN },
1.4 ! schwarze 152: { "secondary", NODE_DELETE },
1.1 schwarze 153: { "sect1", NODE_SECTION },
154: { "sect2", NODE_SECTION },
155: { "section", NODE_SECTION },
156: { "sgmltag", NODE_SGMLTAG },
157: { "simplelist", NODE_SIMPLELIST },
158: { "spanspec", NODE_SPANSPEC },
159: { "structname", NODE_STRUCTNAME },
160: { "subtitle", NODE_SUBTITLE },
1.3 schwarze 161: { "surname", NODE_IGNORE },
1.1 schwarze 162: { "synopsis", NODE_SYNOPSIS },
163: { "table", NODE_TABLE },
164: { "tbody", NODE_TBODY },
165: { "term", NODE_TERM },
166: { "tfoot", NODE_TFOOT },
167: { "tgroup", NODE_TGROUP },
168: { "thead", NODE_THEAD },
169: { "tip", NODE_TIP },
170: { "title", NODE_TITLE },
1.3 schwarze 171: { "trademark", NODE_IGNORE },
1.1 schwarze 172: { "type", NODE_TYPE },
173: { "ulink", NODE_ULINK },
174: { "userinput", NODE_USERINPUT },
175: { "variablelist", NODE_VARIABLELIST },
176: { "varlistentry", NODE_VARLISTENTRY },
177: { "varname", NODE_VARNAME },
178: { "warning", NODE_WARNING },
179: { "wordasword", NODE_WORDASWORD },
1.4 ! schwarze 180: { "xi:include", NODE_DELETE_WARN },
1.1 schwarze 181: { "year", NODE_YEAR },
182: { NULL, NODE__MAX }
183: };
184:
185: /*
186: * Process a string of characters.
187: * If a text node is already open, append to it.
188: * Otherwise, create a new one as a child of the current node.
189: */
190: static void
191: xml_char(void *arg, const XML_Char *p, int sz)
192: {
193: struct parse *ps;
194: struct pnode *dat;
195: int i;
196:
197: ps = arg;
1.4 ! schwarze 198: if (ps->del > 0 || ps->tree->flags & TREE_FAIL)
1.1 schwarze 199: return;
200:
201: /*
202: * Only create a new node if there is non-whitespace text.
203: * Strip all leading whitespace.
204: */
205: if (ps->cur->node != NODE_TEXT) {
206: for (i = 0; i < sz; i++)
207: if (isspace((unsigned char)p[i]) == 0)
208: break;
209: if (i == sz)
210: return;
211: p += i;
212: sz -= i;
213:
214: if ((dat = calloc(1, sizeof(*dat))) == NULL) {
215: perror(NULL);
216: exit(1);
217: }
218: dat->node = NODE_TEXT;
219: dat->parent = ps->cur;
220: TAILQ_INIT(&dat->childq);
221: TAILQ_INIT(&dat->attrq);
222: TAILQ_INSERT_TAIL(&ps->cur->childq, dat, child);
223: ps->cur = dat;
224: }
225:
226: /* Append to the current text node. */
227:
228: assert(sz >= 0);
229: ps->cur->b = realloc(ps->cur->b, ps->cur->bsz + sz + 1);
230: if (ps->cur->b == NULL) {
231: perror(NULL);
232: exit(1);
233: }
234: memcpy(ps->cur->b + ps->cur->bsz, p, sz);
235: ps->cur->bsz += sz;
236: ps->cur->b[ps->cur->bsz] = '\0';
237: ps->cur->real = ps->cur->b;
238: }
239:
240: static void
241: pnode_trim(struct pnode *pn)
242: {
243: assert(pn->node == NODE_TEXT);
244: for (; pn->bsz > 0; pn->b[--pn->bsz] = '\0')
245: if (isspace((unsigned char)pn->b[pn->bsz - 1]) == 0)
246: break;
247: }
248:
249: /*
250: * Begin an element.
251: * If the name is unknown, abort parsing.
252: */
253: static void
254: xml_elem_start(void *arg, const XML_Char *name, const XML_Char **atts)
255: {
256: struct parse *ps;
257: const struct element *elem;
258: enum attrkey key;
259: struct pnode *dat;
260: struct pattr *pattr;
261: const XML_Char **att;
262:
263: ps = arg;
1.4 ! schwarze 264: if (ps->tree->flags & TREE_FAIL)
1.1 schwarze 265: return;
266:
1.4 ! schwarze 267: /*
! 268: * An ancestor is excluded from the tree;
! 269: * keep track of the number of levels excluded.
! 270: */
! 271: if (ps->del > 0) {
! 272: ps->del++;
! 273: return;
! 274: }
! 275:
1.1 schwarze 276: /* Close out the text node, if there is one. */
277: if (ps->cur != NULL && ps->cur->node == NODE_TEXT) {
278: pnode_trim(ps->cur);
279: ps->cur = ps->cur->parent;
280: }
281:
282: for (elem = elements; elem->name != NULL; elem++)
283: if (strcmp(elem->name, name) == 0)
284: break;
285:
286: if (elem->name == NULL) {
287: fprintf(stderr, "%s:%zu:%zu: unknown element \"%s\"\n",
288: ps->fname, XML_GetCurrentLineNumber(ps->xml),
289: XML_GetCurrentColumnNumber(ps->xml), name);
290: ps->tree->flags |= TREE_FAIL;
291: return;
292: }
293:
1.2 schwarze 294: switch (elem->node) {
1.4 ! schwarze 295: case NODE_DELETE_WARN:
1.2 schwarze 296: if (ps->warn)
297: fprintf(stderr, "%s:%zu:%zu: warning: "
1.4 ! schwarze 298: "skipping element <%s>\n", ps->fname,
1.2 schwarze 299: XML_GetCurrentLineNumber(ps->xml),
300: XML_GetCurrentColumnNumber(ps->xml), name);
301: /* FALLTHROUGH */
1.4 ! schwarze 302: case NODE_DELETE:
! 303: ps->del = 1;
! 304: /* FALLTHROUGH */
1.2 schwarze 305: case NODE_IGNORE:
306: return;
307: case NODE_INLINEEQUATION:
1.1 schwarze 308: ps->tree->flags |= TREE_EQN;
1.2 schwarze 309: break;
310: default:
311: break;
312: }
1.1 schwarze 313:
314: if ((dat = calloc(1, sizeof(*dat))) == NULL) {
315: perror(NULL);
316: exit(1);
317: }
318: dat->node = elem->node;
319: dat->parent = ps->cur;
320: TAILQ_INIT(&dat->childq);
321: TAILQ_INIT(&dat->attrq);
322:
323: if (ps->cur != NULL)
324: TAILQ_INSERT_TAIL(&ps->cur->childq, dat, child);
325:
326: ps->cur = dat;
327: if (ps->tree->root == NULL)
328: ps->tree->root = dat;
329:
330: /*
331: * Process attributes.
332: */
333: for (att = atts; *att != NULL; att += 2) {
334: if ((key = attrkey_parse(*att)) == ATTRKEY__MAX) {
335: if (ps->warn)
336: fprintf(stderr, "%s:%zu:%zu: warning: "
337: "unknown attribute \"%s\"\n",
338: ps->fname,
339: XML_GetCurrentLineNumber(ps->xml),
340: XML_GetCurrentColumnNumber(ps->xml),
341: *att);
342: continue;
343: }
344: pattr = calloc(1, sizeof(*pattr));
345: pattr->key = key;
346: if ((pattr->val = attrval_parse(att[1])) == ATTRVAL__MAX)
347: pattr->rawval = strdup(att[1]);
348: TAILQ_INSERT_TAIL(&dat->attrq, pattr, child);
349: }
350: }
351:
352: /*
353: * Roll up the parse tree.
354: * If we're at a text node, roll that one up first.
355: */
356: static void
357: xml_elem_end(void *arg, const XML_Char *name)
358: {
359: struct parse *ps;
1.2 schwarze 360: const struct element *elem;
1.1 schwarze 361:
362: ps = arg;
1.4 ! schwarze 363: if (ps->tree->flags & TREE_FAIL)
1.1 schwarze 364: return;
365:
1.4 ! schwarze 366: /*
! 367: * An ancestor is excluded from the tree;
! 368: * keep track of the number of levels excluded.
! 369: */
! 370: if (ps->del > 1) {
! 371: ps->del--;
! 372: return;
! 373: }
! 374:
1.1 schwarze 375: /* Close out the text node, if there is one. */
1.4 ! schwarze 376: if (ps->del == 0 && ps->cur->node == NODE_TEXT) {
1.1 schwarze 377: pnode_trim(ps->cur);
378: ps->cur = ps->cur->parent;
379: }
1.2 schwarze 380:
381: for (elem = elements; elem->name != NULL; elem++)
382: if (strcmp(elem->name, name) == 0)
383: break;
384:
385: switch (elem->node) {
1.4 ! schwarze 386: case NODE_DELETE_WARN:
! 387: case NODE_DELETE:
! 388: ps->del--;
! 389: break;
1.2 schwarze 390: case NODE_IGNORE:
391: break;
392: default:
393: assert(elem->node == ps->cur->node);
394: ps->cur = ps->cur->parent;
1.4 ! schwarze 395: break;
1.2 schwarze 396: }
1.4 ! schwarze 397: assert(ps->del == 0);
1.1 schwarze 398: }
399:
400: struct parse *
401: parse_alloc(int warn)
402: {
403: struct parse *p;
404:
405: if ((p = calloc(1, sizeof(*p))) == NULL)
406: return NULL;
407:
408: if ((p->tree = calloc(1, sizeof(*p->tree))) == NULL) {
409: free(p);
410: return NULL;
411: }
412:
413: if ((p->xml = XML_ParserCreate(NULL)) == NULL) {
414: free(p->tree);
415: free(p);
416: return NULL;
417: }
418: p->warn = warn;
419: XML_SetCharacterDataHandler(p->xml, xml_char);
420: XML_SetElementHandler(p->xml, xml_elem_start, xml_elem_end);
421: XML_SetUserData(p->xml, p);
422: return p;
423: }
424:
425: void
426: parse_free(struct parse *p)
427: {
428: if (p == NULL)
429: return;
430: XML_ParserFree(p->xml);
431: if (p->tree != NULL) {
432: pnode_unlink(p->tree->root);
433: free(p->tree);
434: }
435: free(p);
436: }
437:
438: struct ptree *
439: parse_file(struct parse *p, int fd, const char *fname)
440: {
441: char b[4096];
442: ssize_t ssz;
443:
444: p->fname = fname;
445: do {
446: if ((ssz = read(fd, b, sizeof(b))) < 0) {
447: perror(fname);
448: pnode_unlink(p->tree->root);
449: p->tree->root = p->cur = NULL;
450: p->tree->flags |= TREE_FAIL;
451: return NULL;
452: }
453: if (XML_Parse(p->xml, b, ssz, ssz == 0) == 0) {
454: fprintf(stderr, "%s:%zu:%zu: %s\n", fname,
455: XML_GetCurrentLineNumber(p->xml),
456: XML_GetCurrentColumnNumber(p->xml),
457: XML_ErrorString(XML_GetErrorCode(p->xml)));
458: p->tree->flags |= TREE_FAIL;
459: }
460: } while (ssz > 0 && (p->tree->flags & TREE_FAIL) == 0);
461: return p->tree;
462: }
CVSweb