Annotation of docbook2mdoc/parse.c, Revision 1.27
1.27 ! schwarze 1: /* $Id: parse.c,v 1.26 2019/04/09 01:39:09 schwarze Exp $ */
1.1 schwarze 2: /*
3: * Copyright (c) 2014 Kristaps Dzonsons <kristaps@bsd.lv>
4: * Copyright (c) 2019 Ingo Schwarze <schwarze@openbsd.org>
5: *
6: * Permission to use, copy, modify, and distribute this software for any
7: * purpose with or without fee is hereby granted, provided that the above
8: * copyright notice and this permission notice appear in all copies.
9: *
10: * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
11: * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12: * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
13: * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14: * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15: * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16: * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17: */
18: #include <assert.h>
19: #include <ctype.h>
1.24 schwarze 20: #include <errno.h>
21: #include <fcntl.h>
22: #include <libgen.h>
1.6 schwarze 23: #include <stdarg.h>
1.1 schwarze 24: #include <stdio.h>
1.5 schwarze 25: #include <stdlib.h>
1.1 schwarze 26: #include <string.h>
27: #include <unistd.h>
28:
29: #include "node.h"
30: #include "parse.h"
31:
32: /*
33: * The implementation of the DocBook parser.
34: */
35:
/*
 * Tokenizer state of parse_string(), kept across calls.
 * The order of the constants matters: parse_string() tests
 * "*pstate >= PARSE_ARG" to catch all three attribute value states.
 */
1.14 schwarze 36: enum pstate {
37: PARSE_ELEM, /* Outside of tags: text, entities, or a new tag. */
38: PARSE_TAG, /* Inside a tag, looking for an attribute name. */
39: PARSE_ARG, /* After "=", looking for an attribute value. */
40: PARSE_SQ, /* Inside a single-quoted attribute value. */
41: PARSE_DQ /* Inside a double-quoted attribute value. */
42: };
43:
1.1 schwarze 44: /*
45: * Global parse state.
46: * Keep this as simple and small as possible.
47: */
48: struct parse {
49: const char *fname; /* Name of the input file. */
50: struct ptree *tree; /* Complete parse result. */
1.23 schwarze 51: struct pnode *doctype; /* First doctype node seen, or NULL. */
1.1 schwarze 52: struct pnode *cur; /* Current node in the tree. */
1.5 schwarze 53: enum nodeid ncur; /* Type of the current node. */
54: int line; /* Line number in the input file. */
55: int col; /* Column number in the input file. */
56: int nline; /* Line number of next token. */
57: int ncol; /* Column number of next token. */
1.4 schwarze 58: int del; /* Levels of nested nodes being deleted. */
1.23 schwarze 59: int flags; /* Bitwise OR of the PFLAG_* values below. */
60: #define PFLAG_WARN (1 << 0) /* Print warning messages. */
61: #define PFLAG_SPC (1 << 1) /* Whitespace before the next element. */
62: #define PFLAG_ATTR (1 << 2) /* The most recent attribute is valid. */
63: #define PFLAG_EEND (1 << 3) /* This element is self-closing. */
1.1 schwarze 64: };
65:
/* One entry of the elements[] lookup table. */
66: struct element {
67: const char *name; /* DocBook element name. */
68: enum nodeid node; /* Node type to generate. */
69: };
70:
/*
 * Table of known element names and the node types they map to.
 * Several names may share one node type.  The table is searched
 * linearly with strcmp(3) and is terminated by an entry with a NULL
 * name; that sentinel maps unknown names to NODE_IGNORE.
 */
71: static const struct element elements[] = {
1.3 schwarze 72: { "acronym", NODE_IGNORE },
1.1 schwarze 73: { "affiliation", NODE_AFFILIATION },
1.4 schwarze 74: { "anchor", NODE_DELETE },
1.22 schwarze 75: { "appendix", NODE_APPENDIX },
1.1 schwarze 76: { "application", NODE_APPLICATION },
77: { "arg", NODE_ARG },
1.22 schwarze 78: { "article", NODE_SECTION },
1.1 schwarze 79: { "author", NODE_AUTHOR },
80: { "authorgroup", NODE_AUTHORGROUP },
81: { "blockquote", NODE_BLOCKQUOTE },
1.22 schwarze 82: { "book", NODE_SECTION },
1.1 schwarze 83: { "bookinfo", NODE_BOOKINFO },
84: { "caution", NODE_CAUTION },
85: { "chapter", NODE_SECTION },
86: { "citerefentry", NODE_CITEREFENTRY },
87: { "citetitle", NODE_CITETITLE },
88: { "cmdsynopsis", NODE_CMDSYNOPSIS },
1.13 schwarze 89: { "code", NODE_LITERAL },
1.1 schwarze 90: { "colspec", NODE_COLSPEC },
91: { "command", NODE_COMMAND },
92: { "constant", NODE_CONSTANT },
1.7 schwarze 93: { "contrib", NODE_CONTRIB },
1.1 schwarze 94: { "copyright", NODE_COPYRIGHT },
95: { "date", NODE_DATE },
1.23 schwarze 96: { "!doctype", NODE_DOCTYPE },
97: { "!DOCTYPE", NODE_DOCTYPE },
1.1 schwarze 98: { "editor", NODE_EDITOR },
99: { "email", NODE_EMAIL },
100: { "emphasis", NODE_EMPHASIS },
1.23 schwarze 101: { "!ENTITY", NODE_ENTITY },
1.1 schwarze 102: { "entry", NODE_ENTRY },
103: { "envar", NODE_ENVAR },
1.13 schwarze 104: { "errorname", NODE_ERRORNAME },
1.1 schwarze 105: { "fieldsynopsis", NODE_FIELDSYNOPSIS },
106: { "filename", NODE_FILENAME },
1.7 schwarze 107: { "firstname", NODE_PERSONNAME },
1.1 schwarze 108: { "firstterm", NODE_FIRSTTERM },
109: { "footnote", NODE_FOOTNOTE },
110: { "funcdef", NODE_FUNCDEF },
111: { "funcprototype", NODE_FUNCPROTOTYPE },
112: { "funcsynopsis", NODE_FUNCSYNOPSIS },
113: { "funcsynopsisinfo", NODE_FUNCSYNOPSISINFO },
114: { "function", NODE_FUNCTION },
1.21 schwarze 115: { "glossary", NODE_VARIABLELIST },
116: { "glossdef", NODE_IGNORE },
117: { "glossdiv", NODE_IGNORE },
118: { "glossentry", NODE_VARLISTENTRY },
119: { "glosslist", NODE_VARIABLELIST },
1.1 schwarze 120: { "glossterm", NODE_GLOSSTERM },
121: { "group", NODE_GROUP },
122: { "holder", NODE_HOLDER },
123: { "index", NODE_INDEX },
1.4 schwarze 124: { "indexterm", NODE_DELETE },
1.1 schwarze 125: { "info", NODE_INFO },
126: { "informalequation", NODE_INFORMALEQUATION },
1.11 schwarze 127: { "informaltable", NODE_TABLE },
1.1 schwarze 128: { "inlineequation", NODE_INLINEEQUATION },
129: { "itemizedlist", NODE_ITEMIZEDLIST },
130: { "keysym", NODE_KEYSYM },
131: { "legalnotice", NODE_LEGALNOTICE },
132: { "link", NODE_LINK },
133: { "listitem", NODE_LISTITEM },
134: { "literal", NODE_LITERAL },
135: { "literallayout", NODE_LITERALLAYOUT },
136: { "manvolnum", NODE_MANVOLNUM },
137: { "member", NODE_MEMBER },
138: { "mml:math", NODE_MML_MATH },
139: { "mml:mfenced", NODE_MML_MFENCED },
140: { "mml:mfrac", NODE_MML_MFRAC },
141: { "mml:mi", NODE_MML_MI },
142: { "mml:mn", NODE_MML_MN },
143: { "mml:mo", NODE_MML_MO },
144: { "mml:mrow", NODE_MML_MROW },
145: { "mml:msub", NODE_MML_MSUB },
146: { "mml:msup", NODE_MML_MSUP },
147: { "modifier", NODE_MODIFIER },
148: { "note", NODE_NOTE },
149: { "option", NODE_OPTION },
150: { "orderedlist", NODE_ORDEREDLIST },
151: { "orgname", NODE_ORGNAME },
1.7 schwarze 152: { "othername", NODE_PERSONNAME },
1.1 schwarze 153: { "para", NODE_PARA },
154: { "paramdef", NODE_PARAMDEF },
155: { "parameter", NODE_PARAMETER },
156: { "part", NODE_SECTION },
157: { "personname", NODE_PERSONNAME },
1.3 schwarze 158: { "phrase", NODE_IGNORE },
1.1 schwarze 159: { "preface", NODE_PREFACE },
1.4 schwarze 160: { "primary", NODE_DELETE },
1.1 schwarze 161: { "programlisting", NODE_PROGRAMLISTING },
162: { "prompt", NODE_PROMPT },
163: { "quote", NODE_QUOTE },
164: { "refclass", NODE_REFCLASS },
165: { "refdescriptor", NODE_REFDESCRIPTOR },
166: { "refentry", NODE_REFENTRY },
167: { "refentryinfo", NODE_REFENTRYINFO },
168: { "refentrytitle", NODE_REFENTRYTITLE },
169: { "refmeta", NODE_REFMETA },
170: { "refmetainfo", NODE_REFMETAINFO },
171: { "refmiscinfo", NODE_REFMISCINFO },
172: { "refname", NODE_REFNAME },
173: { "refnamediv", NODE_REFNAMEDIV },
174: { "refpurpose", NODE_REFPURPOSE },
175: { "refsect1", NODE_SECTION },
176: { "refsect2", NODE_SECTION },
177: { "refsect3", NODE_SECTION },
178: { "refsection", NODE_SECTION },
179: { "refsynopsisdiv", NODE_REFSYNOPSISDIV },
180: { "releaseinfo", NODE_RELEASEINFO },
181: { "replaceable", NODE_REPLACEABLE },
182: { "row", NODE_ROW },
183: { "sbr", NODE_SBR },
184: { "screen", NODE_SCREEN },
1.4 schwarze 185: { "secondary", NODE_DELETE },
1.1 schwarze 186: { "sect1", NODE_SECTION },
187: { "sect2", NODE_SECTION },
188: { "section", NODE_SECTION },
189: { "sgmltag", NODE_SGMLTAG },
1.15 schwarze 190: { "simpara", NODE_PARA },
1.1 schwarze 191: { "simplelist", NODE_SIMPLELIST },
192: { "spanspec", NODE_SPANSPEC },
1.13 schwarze 193: { "structfield", NODE_PARAMETER },
194: { "structname", NODE_TYPE },
1.1 schwarze 195: { "subtitle", NODE_SUBTITLE },
1.7 schwarze 196: { "surname", NODE_PERSONNAME },
1.12 schwarze 197: { "symbol", NODE_CONSTANT },
1.1 schwarze 198: { "synopsis", NODE_SYNOPSIS },
199: { "table", NODE_TABLE },
200: { "tbody", NODE_TBODY },
201: { "term", NODE_TERM },
202: { "tfoot", NODE_TFOOT },
203: { "tgroup", NODE_TGROUP },
204: { "thead", NODE_THEAD },
205: { "tip", NODE_TIP },
206: { "title", NODE_TITLE },
1.3 schwarze 207: { "trademark", NODE_IGNORE },
1.1 schwarze 208: { "type", NODE_TYPE },
1.18 schwarze 209: { "ulink", NODE_LINK },
1.13 schwarze 210: { "userinput", NODE_LITERAL },
1.1 schwarze 211: { "variablelist", NODE_VARIABLELIST },
212: { "varlistentry", NODE_VARLISTENTRY },
213: { "varname", NODE_VARNAME },
214: { "warning", NODE_WARNING },
215: { "wordasword", NODE_WORDASWORD },
1.26 schwarze 216: { "xi:include", NODE_INCLUDE },
1.1 schwarze 217: { "year", NODE_YEAR },
1.5 schwarze 218: { NULL, NODE_IGNORE } /* Sentinel ending the table. */
1.1 schwarze 219: };
220:
/* One entry of the entities[] lookup table. */
1.9 schwarze 221: struct entity {
222: const char *name; /* Entity name, without the "&" and ";". */
223: const char *roff; /* Text stored in the NODE_ESCAPE node. */
224: };
225:
226: /*
227: * XML character entity references found in the wild.
228: * Those that don't have an exact mandoc_char(7) representation
229: * are approximated, and the desired codepoint is given as a comment.
230: * Encoding them as \\[u...] would leave -Tascii out in the cold.
231: */
232: static const struct entity entities[] = {
233: { "alpha", "\\(*a" },
234: { "amp", "&" },
235: { "apos", "'" },
236: { "auml", "\\(:a" },
237: { "beta", "\\(*b" },
238: { "circ", "^" }, /* U+02C6 */
239: { "copy", "\\(co" },
240: { "dagger", "\\(dg" },
241: { "Delta", "\\(*D" },
242: { "eacute", "\\('e" },
243: { "emsp", "\\ " }, /* U+2003 */
244: { "gt", ">" },
245: { "hairsp", "\\^" },
246: { "kappa", "\\(*k" },
247: { "larr", "\\(<-" },
248: { "ldquo", "\\(lq" },
249: { "le", "\\(<=" },
250: { "lowbar", "_" },
251: { "lsqb", "[" },
252: { "lt", "<" },
253: { "mdash", "\\(em" },
254: { "minus", "\\-" },
255: { "ndash", "\\(en" },
256: { "nbsp", "\\ " },
257: { "num", "#" },
258: { "oslash", "\\(/o" },
259: { "ouml", "\\(:o" },
260: { "percnt", "%" },
261: { "quot", "\\(dq" },
262: { "rarr", "\\(->" },
263: { "rArr", "\\(rA" },
264: { "rdquo", "\\(rq" },
265: { "reg", "\\(rg" },
266: { "rho", "\\(*r" },
267: { "rsqb", "]" },
268: { "sigma", "\\(*s" },
269: { "shy", "\\&" }, /* U+00AD */
270: { "tau", "\\(*t" },
271: { "tilde", "\\[u02DC]" },
272: { "times", "\\[tmu]" },
273: { "uuml", "\\(:u" },
/* Sentinel: the lookup in xml_entity() stops at the NULL name. */
274: { NULL, NULL }
275: };
276:
1.23 schwarze 277: static size_t parse_string(struct parse *, char *, size_t,
278: enum pstate *, int);
1.24 schwarze 279: static void parse_fd(struct parse *, int);
1.23 schwarze 280:
281:
1.6 schwarze 282: static void
283: error_msg(struct parse *p, const char *fmt, ...)
284: {
285: va_list ap;
286:
287: fprintf(stderr, "%s:%d:%d: ", p->fname, p->line, p->col);
288: va_start(ap, fmt);
289: vfprintf(stderr, fmt, ap);
290: va_end(ap);
291: fputc('\n', stderr);
292: p->tree->flags |= TREE_FAIL;
293: }
294:
295: static void
296: warn_msg(struct parse *p, const char *fmt, ...)
297: {
298: va_list ap;
299:
1.23 schwarze 300: if ((p->flags & PFLAG_WARN) == 0)
1.6 schwarze 301: return;
302:
303: fprintf(stderr, "%s:%d:%d: warning: ", p->fname, p->line, p->col);
304: va_start(ap, fmt);
305: vfprintf(stderr, fmt, ap);
306: va_end(ap);
307: fputc('\n', stderr);
308: }
309:
1.1 schwarze 310: /*
311: * Process a string of characters.
312: * If a text node is already open, append to it.
313: * Otherwise, create a new one as a child of the current node.
314: */
315: static void
1.5 schwarze 316: xml_char(struct parse *ps, const char *p, int sz)
1.1 schwarze 317: {
318: struct pnode *dat;
1.16 schwarze 319: size_t newsz;
1.1 schwarze 320:
1.5 schwarze 321: if (ps->del > 0)
1.1 schwarze 322: return;
323:
1.5 schwarze 324: if (ps->cur == NULL) {
1.6 schwarze 325: error_msg(ps, "discarding text before document: %.*s", sz, p);
1.5 schwarze 326: return;
327: }
328:
1.1 schwarze 329: if (ps->cur->node != NODE_TEXT) {
330: if ((dat = calloc(1, sizeof(*dat))) == NULL) {
331: perror(NULL);
332: exit(1);
333: }
334: dat->node = NODE_TEXT;
1.23 schwarze 335: dat->spc = (ps->flags & PFLAG_SPC) != 0;
1.1 schwarze 336: dat->parent = ps->cur;
337: TAILQ_INIT(&dat->childq);
338: TAILQ_INIT(&dat->attrq);
339: TAILQ_INSERT_TAIL(&ps->cur->childq, dat, child);
340: ps->cur = dat;
341: }
342:
1.5 schwarze 343: if (ps->tree->flags & TREE_CLOSED &&
1.6 schwarze 344: ps->cur->parent == ps->tree->root)
345: warn_msg(ps, "text after end of document: %.*s", sz, p);
1.5 schwarze 346:
1.1 schwarze 347: /* Append to the current text node. */
348:
349: assert(sz >= 0);
1.23 schwarze 350: newsz = ps->cur->bsz + (ps->cur->bsz && (ps->flags & PFLAG_SPC)) + sz;
1.16 schwarze 351: ps->cur->b = realloc(ps->cur->b, newsz + 1);
1.1 schwarze 352: if (ps->cur->b == NULL) {
353: perror(NULL);
354: exit(1);
355: }
1.23 schwarze 356: if (ps->cur->bsz && (ps->flags & PFLAG_SPC))
1.16 schwarze 357: ps->cur->b[ps->cur->bsz++] = ' ';
1.1 schwarze 358: memcpy(ps->cur->b + ps->cur->bsz, p, sz);
1.16 schwarze 359: ps->cur->b[ps->cur->bsz = newsz] = '\0';
1.1 schwarze 360: ps->cur->real = ps->cur->b;
1.23 schwarze 361: ps->flags &= ~PFLAG_SPC;
1.1 schwarze 362: }
363:
1.16 schwarze 364: /*
365: * Close out the text node and strip trailing whitespace, if one is open.
366: */
1.1 schwarze 367: static void
1.16 schwarze 368: pnode_closetext(struct parse *p)
1.1 schwarze 369: {
1.16 schwarze 370: struct pnode *n;
371:
372: if ((n = p->cur) == NULL || n->node != NODE_TEXT)
373: return;
374: p->cur = n->parent;
375: while (n->bsz > 0 && isspace((unsigned char)n->b[n->bsz - 1])) {
376: n->b[--n->bsz] = '\0';
1.23 schwarze 377: p->flags |= PFLAG_SPC;
1.16 schwarze 378: }
1.1 schwarze 379: }
380:
1.9 schwarze 381: static void
382: xml_entity(struct parse *p, const char *name)
383: {
384: const struct entity *entity;
385: struct pnode *dat;
1.23 schwarze 386: const char *ccp;
387: char *cp;
388: enum pstate pstate;
1.9 schwarze 389:
390: if (p->del > 0)
391: return;
392:
393: if (p->cur == NULL) {
394: error_msg(p, "discarding entity before document: &%s;", name);
395: return;
396: }
397:
1.16 schwarze 398: pnode_closetext(p);
1.9 schwarze 399:
400: if (p->tree->flags & TREE_CLOSED && p->cur == p->tree->root)
401: warn_msg(p, "entity after end of document: &%s;", name);
402:
403: for (entity = entities; entity->name != NULL; entity++)
404: if (strcmp(name, entity->name) == 0)
405: break;
406:
407: if (entity->roff == NULL) {
1.23 schwarze 408: if (p->doctype != NULL) {
409: TAILQ_FOREACH(dat, &p->doctype->childq, child) {
410: if ((ccp = pnode_getattr_raw(dat,
411: ATTRKEY_NAME, NULL)) == NULL ||
1.25 schwarze 412: strcmp(ccp, name) != 0)
413: continue;
414: if ((ccp = pnode_getattr_raw(dat,
415: ATTRKEY_SYSTEM, NULL)) != NULL) {
416: parse_file(p, -1, ccp);
417: p->flags &= ~PFLAG_SPC;
418: return;
419: }
420: if ((ccp = pnode_getattr_raw(dat,
1.23 schwarze 421: ATTRKEY_DEFINITION, NULL)) == NULL)
422: continue;
423: if ((cp = strdup(ccp)) == NULL) {
424: perror(NULL);
425: exit(1);
426: }
427: pstate = PARSE_ELEM;
428: parse_string(p, cp, strlen(cp), &pstate, 0);
429: p->flags &= ~PFLAG_SPC;
430: free(cp);
431: return;
432: }
433: }
1.9 schwarze 434: error_msg(p, "unknown entity &%s;", name);
435: return;
436: }
437:
438: /* Create, append, and close out an entity node. */
439: if ((dat = calloc(1, sizeof(*dat))) == NULL ||
440: (dat->b = dat->real = strdup(entity->roff)) == NULL) {
441: perror(NULL);
442: exit(1);
443: }
444: dat->node = NODE_ESCAPE;
445: dat->bsz = strlen(dat->b);
1.23 schwarze 446: dat->spc = (p->flags & PFLAG_SPC) != 0;
1.9 schwarze 447: dat->parent = p->cur;
448: TAILQ_INIT(&dat->childq);
449: TAILQ_INIT(&dat->attrq);
450: TAILQ_INSERT_TAIL(&p->cur->childq, dat, child);
1.23 schwarze 451: p->flags &= ~PFLAG_SPC;
1.9 schwarze 452: }
453:
1.1 schwarze 454: /*
455: * Begin an element.
456: */
457: static void
1.5 schwarze 458: xml_elem_start(struct parse *ps, const char *name)
1.1 schwarze 459: {
1.5 schwarze 460: const struct element *elem;
461: struct pnode *dat;
1.1 schwarze 462:
1.4 schwarze 463: /*
464: * An ancestor is excluded from the tree;
465: * keep track of the number of levels excluded.
466: */
467: if (ps->del > 0) {
1.23 schwarze 468: if (*name != '!' && *name != '?')
469: ps->del++;
1.4 schwarze 470: return;
471: }
472:
1.16 schwarze 473: pnode_closetext(ps);
1.1 schwarze 474:
475: for (elem = elements; elem->name != NULL; elem++)
476: if (strcmp(elem->name, name) == 0)
477: break;
478:
1.23 schwarze 479: if (elem->name == NULL) {
480: if (*name == '!' || *name == '?')
481: return;
1.6 schwarze 482: error_msg(ps, "unknown element <%s>", name);
1.23 schwarze 483: }
1.6 schwarze 484:
1.5 schwarze 485: ps->ncur = elem->node;
1.1 schwarze 486:
1.5 schwarze 487: switch (ps->ncur) {
1.4 schwarze 488: case NODE_DELETE_WARN:
1.6 schwarze 489: warn_msg(ps, "skipping element <%s>", name);
1.2 schwarze 490: /* FALLTHROUGH */
1.4 schwarze 491: case NODE_DELETE:
492: ps->del = 1;
493: /* FALLTHROUGH */
1.2 schwarze 494: case NODE_IGNORE:
495: return;
496: case NODE_INLINEEQUATION:
1.1 schwarze 497: ps->tree->flags |= TREE_EQN;
1.2 schwarze 498: break;
499: default:
500: break;
501: }
1.1 schwarze 502:
1.6 schwarze 503: if (ps->tree->flags & TREE_CLOSED && ps->cur->parent == NULL)
504: warn_msg(ps, "element after end of document: <%s>", name);
1.5 schwarze 505:
1.1 schwarze 506: if ((dat = calloc(1, sizeof(*dat))) == NULL) {
507: perror(NULL);
508: exit(1);
509: }
1.17 schwarze 510:
511: /*
512: * Nodes that begin a new macro or request line or start by
513: * printing text always want whitespace before themselves.
514: */
515:
516: switch (dat->node = elem->node) {
1.23 schwarze 517: case NODE_DOCTYPE:
518: case NODE_ENTITY:
519: case NODE_SBR:
520: ps->flags |= PFLAG_EEND;
521: /* FALLTHROUGH */
1.22 schwarze 522: case NODE_APPENDIX:
1.17 schwarze 523: case NODE_AUTHORGROUP:
1.20 schwarze 524: case NODE_BLOCKQUOTE:
1.17 schwarze 525: case NODE_BOOKINFO:
526: case NODE_CAUTION:
527: case NODE_EDITOR:
528: case NODE_ENTRY:
529: case NODE_FUNCDEF:
530: case NODE_FUNCPROTOTYPE:
531: case NODE_INFORMALEQUATION:
532: case NODE_INLINEEQUATION:
533: case NODE_ITEMIZEDLIST:
534: case NODE_LEGALNOTICE:
535: case NODE_LITERALLAYOUT:
536: case NODE_NOTE:
537: case NODE_ORDEREDLIST:
538: case NODE_PARA:
539: case NODE_PREFACE:
540: case NODE_PROGRAMLISTING:
541: case NODE_REFMETA:
542: case NODE_REFNAMEDIV:
543: case NODE_REFSYNOPSISDIV:
544: case NODE_ROW:
545: case NODE_SCREEN:
546: case NODE_SECTION:
547: case NODE_SYNOPSIS:
548: case NODE_TGROUP:
549: case NODE_TIP:
550: case NODE_TITLE:
551: case NODE_VARIABLELIST:
552: case NODE_VARLISTENTRY:
553: case NODE_WARNING:
554: dat->spc = 1;
555: break;
556: default:
1.23 schwarze 557: dat->spc = (ps->flags & PFLAG_SPC) != 0;
1.17 schwarze 558: break;
559: }
1.1 schwarze 560: dat->parent = ps->cur;
561: TAILQ_INIT(&dat->childq);
562: TAILQ_INIT(&dat->attrq);
563:
564: if (ps->cur != NULL)
565: TAILQ_INSERT_TAIL(&ps->cur->childq, dat, child);
566:
567: ps->cur = dat;
1.23 schwarze 568: if (dat->node == NODE_DOCTYPE) {
569: if (ps->doctype == NULL)
570: ps->doctype = dat;
571: else
572: error_msg(ps, "duplicate doctype");
573: } else if (dat->parent == NULL && ps->tree->root == NULL)
1.1 schwarze 574: ps->tree->root = dat;
1.5 schwarze 575: }
576:
577: static void
578: xml_attrkey(struct parse *ps, const char *name)
579: {
580: struct pattr *attr;
1.23 schwarze 581: const char *value;
1.5 schwarze 582: enum attrkey key;
1.1 schwarze 583:
1.19 schwarze 584: if (ps->del > 0 || ps->ncur == NODE_IGNORE || *name == '\0')
1.5 schwarze 585: return;
1.23 schwarze 586:
587: if ((ps->ncur == NODE_DOCTYPE || ps->ncur == NODE_ENTITY) &&
588: TAILQ_FIRST(&ps->cur->attrq) == NULL) {
589: value = name;
590: name = "NAME";
591: } else
592: value = NULL;
593:
1.5 schwarze 594: if ((key = attrkey_parse(name)) == ATTRKEY__MAX) {
1.23 schwarze 595: ps->flags &= ~PFLAG_ATTR;
1.5 schwarze 596: return;
597: }
598: if ((attr = calloc(1, sizeof(*attr))) == NULL) {
599: perror(NULL);
600: exit(1);
601: }
602: attr->key = key;
603: attr->val = ATTRVAL__MAX;
1.23 schwarze 604: if (value == NULL) {
605: attr->rawval = NULL;
606: ps->flags |= PFLAG_ATTR;
607: } else {
608: if ((attr->rawval = strdup(value)) == NULL) {
609: perror(NULL);
610: exit(1);
611: }
612: ps->flags &= ~PFLAG_ATTR;
613: }
1.5 schwarze 614: TAILQ_INSERT_TAIL(&ps->cur->attrq, attr, child);
1.23 schwarze 615: if (ps->ncur == NODE_ENTITY && key == ATTRKEY_NAME)
616: xml_attrkey(ps, "DEFINITION");
1.5 schwarze 617: }
618:
619: static void
620: xml_attrval(struct parse *ps, const char *name)
621: {
622: struct pattr *attr;
623:
1.23 schwarze 624: if (ps->del > 0 || ps->ncur == NODE_IGNORE ||
625: (ps->flags & PFLAG_ATTR) == 0)
1.5 schwarze 626: return;
627: if ((attr = TAILQ_LAST(&ps->cur->attrq, pattrq)) == NULL)
628: return;
629: if ((attr->val = attrval_parse(name)) == ATTRVAL__MAX &&
630: (attr->rawval = strdup(name)) == NULL) {
631: perror(NULL);
632: exit(1);
1.1 schwarze 633: }
1.27 ! schwarze 634: ps->flags &= ~PFLAG_ATTR;
1.1 schwarze 635: }
636:
637: /*
638: * Roll up the parse tree.
639: * If we're at a text node, roll that one up first.
640: */
641: static void
1.5 schwarze 642: xml_elem_end(struct parse *ps, const char *name)
1.1 schwarze 643: {
1.5 schwarze 644: const struct element *elem;
1.26 schwarze 645: struct pnode *n;
646: const char *cp;
1.5 schwarze 647: enum nodeid node;
1.1 schwarze 648:
1.4 schwarze 649: /*
650: * An ancestor is excluded from the tree;
651: * keep track of the number of levels excluded.
652: */
653: if (ps->del > 1) {
654: ps->del--;
655: return;
656: }
657:
1.16 schwarze 658: if (ps->del == 0)
659: pnode_closetext(ps);
1.2 schwarze 660:
1.5 schwarze 661: if (name != NULL) {
662: for (elem = elements; elem->name != NULL; elem++)
663: if (strcmp(elem->name, name) == 0)
664: break;
665: node = elem->node;
666: } else
667: node = ps->ncur;
1.2 schwarze 668:
1.5 schwarze 669: switch (node) {
1.4 schwarze 670: case NODE_DELETE_WARN:
671: case NODE_DELETE:
1.5 schwarze 672: if (ps->del > 0)
673: ps->del--;
1.4 schwarze 674: break;
1.2 schwarze 675: case NODE_IGNORE:
1.26 schwarze 676: break;
677: case NODE_INCLUDE:
678: n = ps->cur;
679: ps->cur = ps->cur->parent;
680: cp = pnode_getattr_raw(n, ATTRKEY_HREF, NULL);
681: if (cp == NULL)
682: error_msg(ps, "<xi:include> element "
683: "without href attribute");
684: else
685: parse_file(ps, -1, cp);
686: pnode_unlink(n);
687: ps->flags &= ~PFLAG_SPC;
1.2 schwarze 688: break;
1.23 schwarze 689: case NODE_DOCTYPE:
690: ps->flags &= ~PFLAG_EEND;
691: /* FALLTHROUGH */
1.2 schwarze 692: default:
1.5 schwarze 693: if (ps->cur == NULL || node != ps->cur->node) {
1.6 schwarze 694: warn_msg(ps, "element not open: </%s>", name);
1.5 schwarze 695: break;
696: }
697:
698: /*
699: * Refrain from actually closing the document element.
700: * If no more content follows, no harm is done, but if
701: * some content still follows, simply processing it is
702: * obviously better than discarding it or crashing.
703: */
704:
1.23 schwarze 705: if (ps->cur->parent != NULL || node == NODE_DOCTYPE) {
706: ps->cur = ps->cur->parent;
707: if (ps->cur != NULL)
708: ps->ncur = ps->cur->node;
709: } else
1.5 schwarze 710: ps->tree->flags |= TREE_CLOSED;
1.23 schwarze 711: ps->flags &= ~PFLAG_SPC;
1.4 schwarze 712: break;
1.2 schwarze 713: }
1.4 schwarze 714: assert(ps->del == 0);
1.1 schwarze 715: }
716:
717: struct parse *
718: parse_alloc(int warn)
719: {
720: struct parse *p;
721:
722: if ((p = calloc(1, sizeof(*p))) == NULL)
723: return NULL;
724:
725: if ((p->tree = calloc(1, sizeof(*p->tree))) == NULL) {
726: free(p);
727: return NULL;
728: }
1.23 schwarze 729: if (warn)
730: p->flags |= PFLAG_WARN;
731: else
732: p->flags &= ~PFLAG_WARN;
1.1 schwarze 733: return p;
734: }
735:
736: void
737: parse_free(struct parse *p)
738: {
739: if (p == NULL)
740: return;
741: if (p->tree != NULL) {
742: pnode_unlink(p->tree->root);
743: free(p->tree);
744: }
745: free(p);
746: }
747:
1.14 schwarze 748: static void
749: increment(struct parse *p, char *b, size_t *pend, int refill)
750: {
751: if (refill) {
752: if (b[*pend] == '\n') {
753: p->nline++;
754: p->ncol = 1;
755: } else
756: p->ncol++;
757: }
758: ++*pend;
759: }
760:
1.5 schwarze 761: /*
762: * Advance the pend pointer to the next character in the charset.
763: * If the charset starts with a space, it stands for any whitespace.
764: * Update the new input file position, used for messages.
765: * Do not overrun the buffer b of length rlen.
766: * When reaching the end, NUL-terminate the buffer and return 1;
767: * otherwise, return 0.
768: */
769: static int
770: advance(struct parse *p, char *b, size_t rlen, size_t *pend,
1.14 schwarze 771: const char *charset, int refill)
1.5 schwarze 772: {
773: int space;
774:
775: if (*charset == ' ') {
776: space = 1;
777: charset++;
778: } else
779: space = 0;
780:
1.14 schwarze 781: if (refill) {
782: p->nline = p->line;
783: p->ncol = p->col;
784: }
1.5 schwarze 785: while (*pend < rlen) {
786: if (space && isspace((unsigned char)b[*pend]))
787: break;
788: if (strchr(charset, b[*pend]) != NULL)
789: break;
1.14 schwarze 790: increment(p, b, pend, refill);
1.5 schwarze 791: }
792: if (*pend == rlen) {
793: b[rlen] = '\0';
1.14 schwarze 794: return refill;
1.5 schwarze 795: } else
796: return 0;
797: }
798:
/*
 * The token loop.
 * Split the first rlen bytes of the buffer b into tokens and pass
 * them to the xml_*() handlers.  The tokenizer state is read from
 * and written back to *pstate, such that parsing can resume in a
 * later call.  If refill is non-zero, the input position in p is
 * tracked, and a token extending to the end of the buffer makes the
 * loop stop early such that the caller can supply more input; if
 * refill is zero, such tokens are processed as they are.
 * The buffer is modified: tokens get NUL-terminated in place,
 * and advance() may write a NUL to b[rlen].
 * Return the offset of the first byte that was not consumed.
 */
1.14 schwarze 799: size_t
800: parse_string(struct parse *p, char *b, size_t rlen,
801: enum pstate *pstate, int refill)
802: {
803: char *cp;
804: size_t poff; /* Parse offset in b[]. */
805: size_t pend; /* Offset of the end of the current word. */
806: int elem_end;
807:
808: pend = 0;
809: for (;;) {
810:
811: /* Proceed to the next token, skipping whitespace. */
812:
813: if (refill) {
814: p->line = p->nline;
815: p->col = p->ncol;
816: }
817: if ((poff = pend) == rlen)
818: break;
819: if (isspace((unsigned char)b[pend])) {
1.23 schwarze 820: p->flags |= PFLAG_SPC;
1.14 schwarze 821: increment(p, b, &pend, refill);
822: continue;
823: }
824:
825: /*
826: * The following four cases (ARG, TAG, and starting an
827: * entity or a tag) all parse a word or quoted string.
828: * If that extends beyond the read buffer and the last
829: * read(2) still got data, they all break out of the
830: * token loop to request more data from the read loop.
831: *
832: * Also, three of them detect self-closing tags, those
833: * ending with "/>", setting the flag elem_end and
834: * calling xml_elem_end() at the very end, after
835: * handling the attribute value, attribute name, or
836: * tag name, respectively.
837: */
838:
839: /* Parse an attribute value. */
840:
841: if (*pstate >= PARSE_ARG) {
842: if (*pstate == PARSE_ARG &&
843: (b[pend] == '\'' || b[pend] == '"')) {
844: *pstate = b[pend] == '"' ?
845: PARSE_DQ : PARSE_SQ;
846: increment(p, b, &pend, refill);
847: continue;
848: }
849: if (advance(p, b, rlen, &pend,
850: *pstate == PARSE_DQ ? "\"" :
851: *pstate == PARSE_SQ ? "'" : " >", refill))
852: break;
853: *pstate = PARSE_TAG;
854: elem_end = 0;
855: if (b[pend] == '>') {
856: *pstate = PARSE_ELEM;
857: if (pend > 0 && b[pend - 1] == '/') {
858: b[pend - 1] = '\0';
859: elem_end = 1;
860: }
1.23 schwarze 861: if (p->flags & PFLAG_EEND)
862: elem_end = 1;
1.14 schwarze 863: }
864: b[pend] = '\0';
865: if (pend < rlen)
866: increment(p, b, &pend, refill);
867: xml_attrval(p, b + poff);
868: if (elem_end)
869: xml_elem_end(p, NULL);
870:
871: /* Look for an attribute name. */
872:
873: } else if (*pstate == PARSE_TAG) {
1.23 schwarze 874: switch (p->ncur) {
875: case NODE_DOCTYPE:
876: if (b[pend] == '[') {
877: *pstate = PARSE_ELEM;
878: increment(p, b, &pend, refill);
879: continue;
880: }
881: /* FALLTHROUGH */
882: case NODE_ENTITY:
883: if (b[pend] == '"' || b[pend] == '\'') {
884: *pstate = PARSE_ARG;
885: continue;
886: }
887: break;
888: default:
889: break;
890: }
1.14 schwarze 891: if (advance(p, b, rlen, &pend, " =>", refill))
892: break;
893: elem_end = 0;
894: switch (b[pend]) {
895: case '>':
896: *pstate = PARSE_ELEM;
897: if (pend > 0 && b[pend - 1] == '/') {
898: b[pend - 1] = '\0';
899: elem_end = 1;
900: }
1.23 schwarze 901: if (p->flags & PFLAG_EEND)
902: elem_end = 1;
1.14 schwarze 903: break;
904: case '=':
905: *pstate = PARSE_ARG;
906: break;
907: default:
908: break;
909: }
910: b[pend] = '\0';
911: if (pend < rlen)
912: increment(p, b, &pend, refill);
913: xml_attrkey(p, b + poff);
914: if (elem_end)
915: xml_elem_end(p, NULL);
916:
917: /* Begin an opening or closing tag. */
918:
919: } else if (b[poff] == '<') {
920: if (advance(p, b, rlen, &pend, " >", refill))
921: break;
922: if (pend > poff + 3 &&
923: strncmp(b + poff, "<!--", 4) == 0) {
924:
925: /* Skip a comment. */
926:
927: cp = strstr(b + pend - 2, "-->");
928: if (cp == NULL) {
929: if (refill)
930: break;
931: cp = b + rlen;
932: } else
933: cp += 3;
934: while (b + pend < cp)
935: increment(p, b, &pend, refill);
936: continue;
937: }
938: elem_end = 0;
939: if (b[pend] != '>')
940: *pstate = PARSE_TAG;
941: else if (pend > 0 && b[pend - 1] == '/') {
942: b[pend - 1] = '\0';
943: elem_end = 1;
944: }
945: b[pend] = '\0';
946: if (pend < rlen)
947: increment(p, b, &pend, refill);
948: if (b[++poff] == '/') {
949: elem_end = 1;
950: poff++;
1.23 schwarze 951: } else {
1.14 schwarze 952: xml_elem_start(p, b + poff);
1.23 schwarze 953: if (*pstate == PARSE_ELEM &&
954: p->flags & PFLAG_EEND)
955: elem_end = 1;
956: }
1.14 schwarze 957: if (elem_end)
958: xml_elem_end(p, b + poff);
959:
1.23 schwarze 960: /* Close a doctype. */
961:
962: } else if (p->ncur == NODE_DOCTYPE && b[poff] == ']') {
963: *pstate = PARSE_TAG;
964: increment(p, b, &pend, refill);
965:
1.14 schwarze 966: /* Process an entity. */
967:
968: } else if (b[poff] == '&') {
969: if (advance(p, b, rlen, &pend, ";", refill))
970: break;
971: b[pend] = '\0';
972: if (pend < rlen)
973: increment(p, b, &pend, refill);
974: xml_entity(p, b + poff + 1);
975:
976: /* Process text up to the next tag, entity, or EOL. */
977:
978: } else {
979: advance(p, b, rlen, &pend, "<&", refill);
980: xml_char(p, b + poff, pend - poff);
981: }
982: }
983: return poff;
984: }
985:
1.24 schwarze 986:
987: /*
988: * The read loop.
989: * If the previous token was incomplete and asked for more input,
990: * we have to enter the read loop once more even on EOF.
991: * Once rsz is 0, incomplete tokens will no longer ask for more input
992: * but instead use whatever there is, and then exit the read loop.
993: * The minus one on the size limit for read(2) is needed such that
994: * advance() can set b[rlen] to NUL when needed.
995: */
996: static void
997: parse_fd(struct parse *p, int fd)
1.1 schwarze 998: {
999: char b[4096];
1.5 schwarze 1000: ssize_t rsz; /* Return value from read(2). */
1.14 schwarze 1001: size_t rlen; /* Number of bytes in b[]. */
1.5 schwarze 1002: size_t poff; /* Parse offset in b[]. */
1.14 schwarze 1003: enum pstate pstate;
1.1 schwarze 1004:
1.24 schwarze 1005: rlen = 0;
1.14 schwarze 1006: pstate = PARSE_ELEM;
1007: while ((rsz = read(fd, b + rlen, sizeof(b) - rlen - 1)) >= 0 &&
1008: (rlen += rsz) > 0) {
1009: poff = parse_string(p, b, rlen, &pstate, rsz > 0);
1.5 schwarze 1010: /* Buffer exhausted; shift left and re-fill. */
1011: assert(poff > 0);
1012: rlen -= poff;
1.14 schwarze 1013: memmove(b, b + poff, rlen);
1.5 schwarze 1014: }
1.24 schwarze 1015: if (rsz < 0)
1016: error_msg(p, "read: %s", strerror(errno));
1017: }
1018:
1019: /*
1020: * Open and parse a file.
1021: */
1022: struct ptree *
1023: parse_file(struct parse *p, int fd, const char *fname)
1024: {
1025: const char *save_fname;
1026: int save_line, save_col;
1027:
1028: /* Save and initialize reporting data. */
1029:
1030: save_fname = p->fname;
1031: save_line = p->nline;
1032: save_col = p->ncol;
1033: p->fname = fname;
1034: p->line = 0;
1035: p->col = 0;
1036:
1037: /* Open the file, unless it is already open. */
1038:
1039: if (fd == -1 && (fd = open(fname, O_RDONLY, 0)) == -1) {
1040: error_msg(p, "open: %s", strerror(errno));
1041: p->fname = save_fname;
1042: return p->tree;
1.5 schwarze 1043: }
1.24 schwarze 1044:
1045: /*
1046: * After opening the starting file, change to the directory it
1047: * is located in, in case it wants to include any further files,
1048: * which are typically given with relative paths in DocBook.
1049: * Do this on a best-effort basis; don't complain about failure.
1050: */
1051:
1052: if (save_fname == NULL && (fname = dirname(fname)) != NULL &&
1053: strcmp(fname, ".") != 0)
1054: (void)chdir(fname);
1055:
1056: /* Run the read loop. */
1057:
1058: p->nline = 1;
1059: p->ncol = 1;
1060: parse_fd(p, fd);
1061:
1062: /* On the top level, finalize the parse tree. */
1063:
1064: if (save_fname == NULL) {
1065: pnode_closetext(p);
1066: if (p->tree->root == NULL)
1067: error_msg(p, "empty document");
1068: else if ((p->tree->flags & TREE_CLOSED) == 0)
1069: warn_msg(p, "document not closed");
1070: pnode_unlink(p->doctype);
1071: }
1072:
1073: /* Clean up. */
1074:
1075: if (fd != STDIN_FILENO)
1076: close(fd);
1077: p->fname = save_fname;
1078: p->nline = save_line;
1079: p->ncol = save_col;
1.1 schwarze 1080: return p->tree;
1081: }
CVSweb