Annotation of docbook2mdoc/parse.c, Revision 1.3
1.3 ! schwarze 1: /* $Id: parse.c,v 1.2 2019/03/26 20:54:43 schwarze Exp $ */
1.1 schwarze 2: /*
3: * Copyright (c) 2014 Kristaps Dzonsons <kristaps@bsd.lv>
4: * Copyright (c) 2019 Ingo Schwarze <schwarze@openbsd.org>
5: *
6: * Permission to use, copy, modify, and distribute this software for any
7: * purpose with or without fee is hereby granted, provided that the above
8: * copyright notice and this permission notice appear in all copies.
9: *
10: * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
11: * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12: * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
13: * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14: * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15: * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16: * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17: */
18: #include <assert.h>
19: #include <ctype.h>
20: #include <expat.h>
21: #include <stdio.h>
22: #include <string.h>
23: #include <unistd.h>
24:
25: #include "node.h"
26: #include "parse.h"
27:
28: /*
29: * The implementation of the DocBook parser.
30: */
31:
32: /*
33: * Global parse state.
34: * Keep this as simple and small as possible.
35: */
36: struct parse {
37: XML_Parser xml;
38: const char *fname; /* Name of the input file. */
39: struct ptree *tree; /* Complete parse result. */
40: struct pnode *cur; /* Current node in the tree. */
41: int warn;
42: };
43:
44: struct element {
45: const char *name; /* DocBook element name. */
46: enum nodeid node; /* Node type to generate. */
47: };
48:
49: static const struct element elements[] = {
1.3 ! schwarze 50: { "acronym", NODE_IGNORE },
1.1 schwarze 51: { "affiliation", NODE_AFFILIATION },
1.3 ! schwarze 52: { "anchor", NODE_IGNORE },
1.1 schwarze 53: { "application", NODE_APPLICATION },
54: { "arg", NODE_ARG },
55: { "author", NODE_AUTHOR },
56: { "authorgroup", NODE_AUTHORGROUP },
57: { "blockquote", NODE_BLOCKQUOTE },
58: { "book", NODE_BOOK },
59: { "bookinfo", NODE_BOOKINFO },
60: { "caution", NODE_CAUTION },
61: { "chapter", NODE_SECTION },
62: { "citerefentry", NODE_CITEREFENTRY },
63: { "citetitle", NODE_CITETITLE },
64: { "cmdsynopsis", NODE_CMDSYNOPSIS },
65: { "code", NODE_CODE },
66: { "colspec", NODE_COLSPEC },
67: { "command", NODE_COMMAND },
68: { "constant", NODE_CONSTANT },
69: { "copyright", NODE_COPYRIGHT },
70: { "date", NODE_DATE },
71: { "editor", NODE_EDITOR },
72: { "email", NODE_EMAIL },
73: { "emphasis", NODE_EMPHASIS },
74: { "entry", NODE_ENTRY },
75: { "envar", NODE_ENVAR },
76: { "fieldsynopsis", NODE_FIELDSYNOPSIS },
77: { "filename", NODE_FILENAME },
1.3 ! schwarze 78: { "firstname", NODE_IGNORE },
1.1 schwarze 79: { "firstterm", NODE_FIRSTTERM },
80: { "footnote", NODE_FOOTNOTE },
81: { "funcdef", NODE_FUNCDEF },
82: { "funcprototype", NODE_FUNCPROTOTYPE },
83: { "funcsynopsis", NODE_FUNCSYNOPSIS },
84: { "funcsynopsisinfo", NODE_FUNCSYNOPSISINFO },
85: { "function", NODE_FUNCTION },
86: { "glossterm", NODE_GLOSSTERM },
87: { "group", NODE_GROUP },
88: { "holder", NODE_HOLDER },
89: { "index", NODE_INDEX },
90: { "indexterm", NODE_INDEXTERM },
91: { "info", NODE_INFO },
92: { "informalequation", NODE_INFORMALEQUATION },
93: { "informaltable", NODE_INFORMALTABLE },
94: { "inlineequation", NODE_INLINEEQUATION },
95: { "itemizedlist", NODE_ITEMIZEDLIST },
96: { "keysym", NODE_KEYSYM },
97: { "legalnotice", NODE_LEGALNOTICE },
98: { "link", NODE_LINK },
99: { "listitem", NODE_LISTITEM },
100: { "literal", NODE_LITERAL },
101: { "literallayout", NODE_LITERALLAYOUT },
102: { "manvolnum", NODE_MANVOLNUM },
103: { "member", NODE_MEMBER },
104: { "mml:math", NODE_MML_MATH },
105: { "mml:mfenced", NODE_MML_MFENCED },
106: { "mml:mfrac", NODE_MML_MFRAC },
107: { "mml:mi", NODE_MML_MI },
108: { "mml:mn", NODE_MML_MN },
109: { "mml:mo", NODE_MML_MO },
110: { "mml:mrow", NODE_MML_MROW },
111: { "mml:msub", NODE_MML_MSUB },
112: { "mml:msup", NODE_MML_MSUP },
113: { "modifier", NODE_MODIFIER },
114: { "note", NODE_NOTE },
115: { "option", NODE_OPTION },
116: { "orderedlist", NODE_ORDEREDLIST },
117: { "orgname", NODE_ORGNAME },
1.3 ! schwarze 118: { "othername", NODE_IGNORE },
1.1 schwarze 119: { "para", NODE_PARA },
120: { "paramdef", NODE_PARAMDEF },
121: { "parameter", NODE_PARAMETER },
122: { "part", NODE_SECTION },
123: { "personname", NODE_PERSONNAME },
1.3 ! schwarze 124: { "phrase", NODE_IGNORE },
1.1 schwarze 125: { "preface", NODE_PREFACE },
126: { "primary", NODE_PRIMARY },
127: { "programlisting", NODE_PROGRAMLISTING },
128: { "prompt", NODE_PROMPT },
129: { "quote", NODE_QUOTE },
130: { "refclass", NODE_REFCLASS },
131: { "refdescriptor", NODE_REFDESCRIPTOR },
132: { "refentry", NODE_REFENTRY },
133: { "refentryinfo", NODE_REFENTRYINFO },
134: { "refentrytitle", NODE_REFENTRYTITLE },
135: { "refmeta", NODE_REFMETA },
136: { "refmetainfo", NODE_REFMETAINFO },
137: { "refmiscinfo", NODE_REFMISCINFO },
138: { "refname", NODE_REFNAME },
139: { "refnamediv", NODE_REFNAMEDIV },
140: { "refpurpose", NODE_REFPURPOSE },
141: { "refsect1", NODE_SECTION },
142: { "refsect2", NODE_SECTION },
143: { "refsect3", NODE_SECTION },
144: { "refsection", NODE_SECTION },
145: { "refsynopsisdiv", NODE_REFSYNOPSISDIV },
146: { "releaseinfo", NODE_RELEASEINFO },
147: { "replaceable", NODE_REPLACEABLE },
148: { "row", NODE_ROW },
149: { "sbr", NODE_SBR },
150: { "screen", NODE_SCREEN },
151: { "secondary", NODE_SECONDARY },
152: { "sect1", NODE_SECTION },
153: { "sect2", NODE_SECTION },
154: { "section", NODE_SECTION },
155: { "sgmltag", NODE_SGMLTAG },
156: { "simplelist", NODE_SIMPLELIST },
157: { "spanspec", NODE_SPANSPEC },
158: { "structname", NODE_STRUCTNAME },
159: { "subtitle", NODE_SUBTITLE },
1.3 ! schwarze 160: { "surname", NODE_IGNORE },
1.1 schwarze 161: { "synopsis", NODE_SYNOPSIS },
162: { "table", NODE_TABLE },
163: { "tbody", NODE_TBODY },
164: { "term", NODE_TERM },
165: { "tfoot", NODE_TFOOT },
166: { "tgroup", NODE_TGROUP },
167: { "thead", NODE_THEAD },
168: { "tip", NODE_TIP },
169: { "title", NODE_TITLE },
1.3 ! schwarze 170: { "trademark", NODE_IGNORE },
1.1 schwarze 171: { "type", NODE_TYPE },
172: { "ulink", NODE_ULINK },
173: { "userinput", NODE_USERINPUT },
174: { "variablelist", NODE_VARIABLELIST },
175: { "varlistentry", NODE_VARLISTENTRY },
176: { "varname", NODE_VARNAME },
177: { "warning", NODE_WARNING },
178: { "wordasword", NODE_WORDASWORD },
1.2 schwarze 179: { "xi:include", NODE_WARN },
1.1 schwarze 180: { "year", NODE_YEAR },
181: { NULL, NODE__MAX }
182: };
183:
184: /*
185: * Process a string of characters.
186: * If a text node is already open, append to it.
187: * Otherwise, create a new one as a child of the current node.
188: */
189: static void
190: xml_char(void *arg, const XML_Char *p, int sz)
191: {
192: struct parse *ps;
193: struct pnode *dat;
194: int i;
195:
196: ps = arg;
197: if (ps->tree->flags && TREE_FAIL)
198: return;
199:
200: /*
201: * Only create a new node if there is non-whitespace text.
202: * Strip all leading whitespace.
203: */
204: if (ps->cur->node != NODE_TEXT) {
205: for (i = 0; i < sz; i++)
206: if (isspace((unsigned char)p[i]) == 0)
207: break;
208: if (i == sz)
209: return;
210: p += i;
211: sz -= i;
212:
213: if ((dat = calloc(1, sizeof(*dat))) == NULL) {
214: perror(NULL);
215: exit(1);
216: }
217: dat->node = NODE_TEXT;
218: dat->parent = ps->cur;
219: TAILQ_INIT(&dat->childq);
220: TAILQ_INIT(&dat->attrq);
221: TAILQ_INSERT_TAIL(&ps->cur->childq, dat, child);
222: ps->cur = dat;
223: }
224:
225: /* Append to the current text node. */
226:
227: assert(sz >= 0);
228: ps->cur->b = realloc(ps->cur->b, ps->cur->bsz + sz + 1);
229: if (ps->cur->b == NULL) {
230: perror(NULL);
231: exit(1);
232: }
233: memcpy(ps->cur->b + ps->cur->bsz, p, sz);
234: ps->cur->bsz += sz;
235: ps->cur->b[ps->cur->bsz] = '\0';
236: ps->cur->real = ps->cur->b;
237: }
238:
239: static void
240: pnode_trim(struct pnode *pn)
241: {
242: assert(pn->node == NODE_TEXT);
243: for (; pn->bsz > 0; pn->b[--pn->bsz] = '\0')
244: if (isspace((unsigned char)pn->b[pn->bsz - 1]) == 0)
245: break;
246: }
247:
248: /*
249: * Begin an element.
250: * If the name is unknown, abort parsing.
251: */
252: static void
253: xml_elem_start(void *arg, const XML_Char *name, const XML_Char **atts)
254: {
255: struct parse *ps;
256: const struct element *elem;
257: enum attrkey key;
258: struct pnode *dat;
259: struct pattr *pattr;
260: const XML_Char **att;
261:
262: ps = arg;
263: if (ps->tree->flags && TREE_FAIL)
264: return;
265:
266: /* Close out the text node, if there is one. */
267: if (ps->cur != NULL && ps->cur->node == NODE_TEXT) {
268: pnode_trim(ps->cur);
269: ps->cur = ps->cur->parent;
270: }
271:
272: for (elem = elements; elem->name != NULL; elem++)
273: if (strcmp(elem->name, name) == 0)
274: break;
275:
276: if (elem->name == NULL) {
277: fprintf(stderr, "%s:%zu:%zu: unknown element \"%s\"\n",
278: ps->fname, XML_GetCurrentLineNumber(ps->xml),
279: XML_GetCurrentColumnNumber(ps->xml), name);
280: ps->tree->flags |= TREE_FAIL;
281: return;
282: }
283:
1.2 schwarze 284: switch (elem->node) {
285: case NODE_WARN:
286: if (ps->warn)
287: fprintf(stderr, "%s:%zu:%zu: warning: "
288: "ignoring element <%s>\n", ps->fname,
289: XML_GetCurrentLineNumber(ps->xml),
290: XML_GetCurrentColumnNumber(ps->xml), name);
291: /* FALLTHROUGH */
292: case NODE_IGNORE:
293: return;
294: case NODE_INLINEEQUATION:
1.1 schwarze 295: ps->tree->flags |= TREE_EQN;
1.2 schwarze 296: break;
297: default:
298: break;
299: }
1.1 schwarze 300:
301: if ((dat = calloc(1, sizeof(*dat))) == NULL) {
302: perror(NULL);
303: exit(1);
304: }
305: dat->node = elem->node;
306: dat->parent = ps->cur;
307: TAILQ_INIT(&dat->childq);
308: TAILQ_INIT(&dat->attrq);
309:
310: if (ps->cur != NULL)
311: TAILQ_INSERT_TAIL(&ps->cur->childq, dat, child);
312:
313: ps->cur = dat;
314: if (ps->tree->root == NULL)
315: ps->tree->root = dat;
316:
317: /*
318: * Process attributes.
319: */
320: for (att = atts; *att != NULL; att += 2) {
321: if ((key = attrkey_parse(*att)) == ATTRKEY__MAX) {
322: if (ps->warn)
323: fprintf(stderr, "%s:%zu:%zu: warning: "
324: "unknown attribute \"%s\"\n",
325: ps->fname,
326: XML_GetCurrentLineNumber(ps->xml),
327: XML_GetCurrentColumnNumber(ps->xml),
328: *att);
329: continue;
330: }
331: pattr = calloc(1, sizeof(*pattr));
332: pattr->key = key;
333: if ((pattr->val = attrval_parse(att[1])) == ATTRVAL__MAX)
334: pattr->rawval = strdup(att[1]);
335: TAILQ_INSERT_TAIL(&dat->attrq, pattr, child);
336: }
337: }
338:
339: /*
340: * Roll up the parse tree.
341: * If we're at a text node, roll that one up first.
342: */
343: static void
344: xml_elem_end(void *arg, const XML_Char *name)
345: {
346: struct parse *ps;
1.2 schwarze 347: const struct element *elem;
1.1 schwarze 348:
349: ps = arg;
350: if (ps->tree->flags && TREE_FAIL)
351: return;
352:
353: /* Close out the text node, if there is one. */
354: if (ps->cur->node == NODE_TEXT) {
355: pnode_trim(ps->cur);
356: ps->cur = ps->cur->parent;
357: }
1.2 schwarze 358:
359: for (elem = elements; elem->name != NULL; elem++)
360: if (strcmp(elem->name, name) == 0)
361: break;
362:
363: switch (elem->node) {
364: case NODE_IGNORE:
365: case NODE_WARN:
366: break;
367: default:
368: assert(elem->node == ps->cur->node);
369: ps->cur = ps->cur->parent;
370: }
1.1 schwarze 371: }
372:
373: struct parse *
374: parse_alloc(int warn)
375: {
376: struct parse *p;
377:
378: if ((p = calloc(1, sizeof(*p))) == NULL)
379: return NULL;
380:
381: if ((p->tree = calloc(1, sizeof(*p->tree))) == NULL) {
382: free(p);
383: return NULL;
384: }
385:
386: if ((p->xml = XML_ParserCreate(NULL)) == NULL) {
387: free(p->tree);
388: free(p);
389: return NULL;
390: }
391: p->warn = warn;
392: XML_SetCharacterDataHandler(p->xml, xml_char);
393: XML_SetElementHandler(p->xml, xml_elem_start, xml_elem_end);
394: XML_SetUserData(p->xml, p);
395: return p;
396: }
397:
398: void
399: parse_free(struct parse *p)
400: {
401: if (p == NULL)
402: return;
403: XML_ParserFree(p->xml);
404: if (p->tree != NULL) {
405: pnode_unlink(p->tree->root);
406: free(p->tree);
407: }
408: free(p);
409: }
410:
411: struct ptree *
412: parse_file(struct parse *p, int fd, const char *fname)
413: {
414: char b[4096];
415: ssize_t ssz;
416:
417: p->fname = fname;
418: do {
419: if ((ssz = read(fd, b, sizeof(b))) < 0) {
420: perror(fname);
421: pnode_unlink(p->tree->root);
422: p->tree->root = p->cur = NULL;
423: p->tree->flags |= TREE_FAIL;
424: return NULL;
425: }
426: if (XML_Parse(p->xml, b, ssz, ssz == 0) == 0) {
427: fprintf(stderr, "%s:%zu:%zu: %s\n", fname,
428: XML_GetCurrentLineNumber(p->xml),
429: XML_GetCurrentColumnNumber(p->xml),
430: XML_ErrorString(XML_GetErrorCode(p->xml)));
431: p->tree->flags |= TREE_FAIL;
432: }
433: } while (ssz > 0 && (p->tree->flags & TREE_FAIL) == 0);
434: return p->tree;
435: }
CVSweb