/* Annotation of docbook2mdoc/parse.c, Revision 1.42 */
1.42 ! schwarze 1: /* $Id: parse.c,v 1.41 2019/04/13 13:06:35 schwarze Exp $ */
1.1 schwarze 2: /*
3: * Copyright (c) 2014 Kristaps Dzonsons <kristaps@bsd.lv>
4: * Copyright (c) 2019 Ingo Schwarze <schwarze@openbsd.org>
5: *
6: * Permission to use, copy, modify, and distribute this software for any
7: * purpose with or without fee is hereby granted, provided that the above
8: * copyright notice and this permission notice appear in all copies.
9: *
10: * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
11: * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12: * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
13: * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14: * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15: * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16: * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17: */
18: #include <assert.h>
19: #include <ctype.h>
1.24 schwarze 20: #include <errno.h>
21: #include <fcntl.h>
22: #include <libgen.h>
1.6 schwarze 23: #include <stdarg.h>
1.1 schwarze 24: #include <stdio.h>
1.5 schwarze 25: #include <stdlib.h>
1.1 schwarze 26: #include <string.h>
27: #include <unistd.h>
28:
29: #include "node.h"
30: #include "parse.h"
31:
32: /*
33: * The implementation of the DocBook parser.
34: */
35:
/*
 * States of the tokenizer in parse_string().
 * The order is significant: every state comparing greater than
 * or equal to PARSE_ARG is handled as "parsing an attribute value".
 */
enum	pstate {
	PARSE_ELEM,	/* Outside of tags: text, entities, tag starts. */
	PARSE_TAG,	/* Inside a tag, looking for an attribute name. */
	PARSE_ARG,	/* Expecting an attribute value. */
	PARSE_SQ,	/* Inside a single-quoted attribute value. */
	PARSE_DQ	/* Inside a double-quoted attribute value. */
};
43:
1.1 schwarze 44: /*
45: * Global parse state.
46: * Keep this as simple and small as possible.
47: */
48: struct parse {
49: const char *fname; /* Name of the input file. */
50: struct ptree *tree; /* Complete parse result. */
1.23 schwarze 51: struct pnode *doctype;
1.1 schwarze 52: struct pnode *cur; /* Current node in the tree. */
1.5 schwarze 53: enum nodeid ncur; /* Type of the current node. */
54: int line; /* Line number in the input file. */
55: int col; /* Column number in the input file. */
56: int nline; /* Line number of next token. */
57: int ncol; /* Column number of next token. */
1.4 schwarze 58: int del; /* Levels of nested nodes being deleted. */
1.23 schwarze 59: int flags;
60: #define PFLAG_WARN (1 << 0) /* Print warning messages. */
61: #define PFLAG_SPC (1 << 1) /* Whitespace before the next element. */
62: #define PFLAG_ATTR (1 << 2) /* The most recent attribute is valid. */
63: #define PFLAG_EEND (1 << 3) /* This element is self-closing. */
1.1 schwarze 64: };
65:
1.39 schwarze 66: struct alias {
1.1 schwarze 67: const char *name; /* DocBook element name. */
68: enum nodeid node; /* Node type to generate. */
69: };
70:
1.39 schwarze 71: static const struct alias aliases[] = {
1.3 schwarze 72: { "acronym", NODE_IGNORE },
1.4 schwarze 73: { "anchor", NODE_DELETE },
1.42 ! schwarze 74: { "application", NODE_COMMAND },
1.22 schwarze 75: { "article", NODE_SECTION },
1.41 schwarze 76: { "articleinfo", NODE_BOOKINFO },
1.22 schwarze 77: { "book", NODE_SECTION },
1.1 schwarze 78: { "chapter", NODE_SECTION },
1.13 schwarze 79: { "code", NODE_LITERAL },
1.36 schwarze 80: { "computeroutput", NODE_LITERAL },
1.23 schwarze 81: { "!doctype", NODE_DOCTYPE },
1.7 schwarze 82: { "firstname", NODE_PERSONNAME },
1.21 schwarze 83: { "glossary", NODE_VARIABLELIST },
84: { "glossdef", NODE_IGNORE },
85: { "glossdiv", NODE_IGNORE },
86: { "glossentry", NODE_VARLISTENTRY },
87: { "glosslist", NODE_VARIABLELIST },
1.4 schwarze 88: { "indexterm", NODE_DELETE },
1.11 schwarze 89: { "informaltable", NODE_TABLE },
1.42 ! schwarze 90: { "keycap", NODE_KEYSYM },
! 91: { "keycode", NODE_IGNORE },
1.40 schwarze 92: { "othercredit", NODE_AUTHOR },
1.7 schwarze 93: { "othername", NODE_PERSONNAME },
1.1 schwarze 94: { "part", NODE_SECTION },
1.3 schwarze 95: { "phrase", NODE_IGNORE },
1.4 schwarze 96: { "primary", NODE_DELETE },
1.42 ! schwarze 97: { "property", NODE_PARAMETER },
1.1 schwarze 98: { "refsect1", NODE_SECTION },
99: { "refsect2", NODE_SECTION },
100: { "refsect3", NODE_SECTION },
101: { "refsection", NODE_SECTION },
1.42 ! schwarze 102: { "returnvalue", NODE_IGNORE },
1.4 schwarze 103: { "secondary", NODE_DELETE },
1.1 schwarze 104: { "sect1", NODE_SECTION },
105: { "sect2", NODE_SECTION },
1.36 schwarze 106: { "sgmltag", NODE_MARKUP },
1.15 schwarze 107: { "simpara", NODE_PARA },
1.13 schwarze 108: { "structfield", NODE_PARAMETER },
109: { "structname", NODE_TYPE },
1.7 schwarze 110: { "surname", NODE_PERSONNAME },
1.12 schwarze 111: { "symbol", NODE_CONSTANT },
1.3 schwarze 112: { "trademark", NODE_IGNORE },
1.18 schwarze 113: { "ulink", NODE_LINK },
1.13 schwarze 114: { "userinput", NODE_LITERAL },
1.5 schwarze 115: { NULL, NODE_IGNORE }
1.1 schwarze 116: };
117:
/*
 * One entry of the entity table: the name of an XML character
 * entity and the roff text that xml_entity() emits for it.
 */
struct	entity {
	const char	*name;   /* Entity name, without "&" and ";". */
	const char	*roff;   /* Replacement text for the output. */
};
122:
123: /*
124: * XML character entity references found in the wild.
125: * Those that don't have an exact mandoc_char(7) representation
126: * are approximated, and the desired codepoint is given as a comment.
127: * Encoding them as \\[u...] would leave -Tascii out in the cold.
128: */
129: static const struct entity entities[] = {
130: { "alpha", "\\(*a" },
131: { "amp", "&" },
132: { "apos", "'" },
133: { "auml", "\\(:a" },
134: { "beta", "\\(*b" },
135: { "circ", "^" }, /* U+02C6 */
136: { "copy", "\\(co" },
137: { "dagger", "\\(dg" },
138: { "Delta", "\\(*D" },
139: { "eacute", "\\('e" },
140: { "emsp", "\\ " }, /* U+2003 */
141: { "gt", ">" },
142: { "hairsp", "\\^" },
143: { "kappa", "\\(*k" },
144: { "larr", "\\(<-" },
145: { "ldquo", "\\(lq" },
146: { "le", "\\(<=" },
147: { "lowbar", "_" },
148: { "lsqb", "[" },
149: { "lt", "<" },
150: { "mdash", "\\(em" },
151: { "minus", "\\-" },
152: { "ndash", "\\(en" },
153: { "nbsp", "\\ " },
154: { "num", "#" },
155: { "oslash", "\\(/o" },
156: { "ouml", "\\(:o" },
157: { "percnt", "%" },
158: { "quot", "\\(dq" },
159: { "rarr", "\\(->" },
160: { "rArr", "\\(rA" },
161: { "rdquo", "\\(rq" },
162: { "reg", "\\(rg" },
163: { "rho", "\\(*r" },
164: { "rsqb", "]" },
165: { "sigma", "\\(*s" },
166: { "shy", "\\&" }, /* U+00AD */
167: { "tau", "\\(*t" },
168: { "tilde", "\\[u02DC]" },
169: { "times", "\\[tmu]" },
170: { "uuml", "\\(:u" },
171: { NULL, NULL }
172: };
173:
/*
 * Forward declarations: parse_string() is called by xml_entity()
 * before its definition, and parse_fd() is defined near the end
 * of the file.
 */
static	size_t	 parse_string(struct parse *p, char *b, size_t rlen,
		    enum pstate *pstate, int refill);
static	void	 parse_fd(struct parse *p, int fd);
1.23 schwarze 177:
178:
1.6 schwarze 179: static void
1.29 schwarze 180: fatal(struct parse *p)
181: {
182: fprintf(stderr, "%s:%d:%d: FATAL: ", p->fname, p->line, p->col);
183: perror(NULL);
184: exit(6);
185: }
186:
187: static void
1.6 schwarze 188: error_msg(struct parse *p, const char *fmt, ...)
189: {
190: va_list ap;
191:
1.29 schwarze 192: fprintf(stderr, "%s:%d:%d: ERROR: ", p->fname, p->line, p->col);
1.6 schwarze 193: va_start(ap, fmt);
194: vfprintf(stderr, fmt, ap);
195: va_end(ap);
196: fputc('\n', stderr);
1.29 schwarze 197: p->tree->flags |= TREE_ERROR;
1.6 schwarze 198: }
199:
200: static void
201: warn_msg(struct parse *p, const char *fmt, ...)
202: {
203: va_list ap;
204:
1.23 schwarze 205: if ((p->flags & PFLAG_WARN) == 0)
1.6 schwarze 206: return;
207:
1.29 schwarze 208: fprintf(stderr, "%s:%d:%d: WARNING: ", p->fname, p->line, p->col);
1.6 schwarze 209: va_start(ap, fmt);
210: vfprintf(stderr, fmt, ap);
211: va_end(ap);
212: fputc('\n', stderr);
1.29 schwarze 213: p->tree->flags |= TREE_WARN;
1.6 schwarze 214: }
215:
1.1 schwarze 216: /*
217: * Process a string of characters.
218: * If a text node is already open, append to it.
219: * Otherwise, create a new one as a child of the current node.
220: */
221: static void
1.35 schwarze 222: xml_text(struct parse *p, const char *word, int sz)
1.1 schwarze 223: {
1.35 schwarze 224: struct pnode *n, *np;
1.32 schwarze 225: size_t oldsz, newsz;
1.35 schwarze 226: int i;
1.1 schwarze 227:
1.32 schwarze 228: assert(sz > 0);
1.30 schwarze 229: if (p->del > 0)
1.1 schwarze 230: return;
231:
1.32 schwarze 232: if ((n = p->cur) == NULL) {
1.35 schwarze 233: error_msg(p, "discarding text before document: %.*s",
234: sz, word);
1.5 schwarze 235: return;
236: }
237:
1.35 schwarze 238: /* Append to the current text node, if one is open. */
239:
240: if (n->node == NODE_TEXT) {
241: oldsz = strlen(n->b);
242: newsz = oldsz + sz;
243: if (oldsz && (p->flags & PFLAG_SPC))
244: newsz++;
245: if ((n->b = realloc(n->b, newsz + 1)) == NULL)
1.30 schwarze 246: fatal(p);
1.35 schwarze 247: if (oldsz && (p->flags & PFLAG_SPC))
248: n->b[oldsz++] = ' ';
249: memcpy(n->b + oldsz, word, sz);
250: n->b[newsz] = '\0';
251: p->flags &= ~PFLAG_SPC;
252: return;
1.1 schwarze 253: }
254:
1.35 schwarze 255: if (p->tree->flags & TREE_CLOSED && n == p->tree->root)
1.30 schwarze 256: warn_msg(p, "text after end of document: %.*s", sz, word);
1.5 schwarze 257:
1.35 schwarze 258: /* Create a new text node. */
1.1 schwarze 259:
1.35 schwarze 260: if ((n = pnode_alloc(p->cur)) == NULL)
1.30 schwarze 261: fatal(p);
1.35 schwarze 262: n->node = NODE_TEXT;
263: n->spc = (p->flags & PFLAG_SPC) != 0;
1.30 schwarze 264: p->flags &= ~PFLAG_SPC;
1.35 schwarze 265:
266: /*
1.39 schwarze 267: * If this node follows an in-line macro without intervening
1.35 schwarze 268: * whitespace, keep the text in it as short as possible,
269: * and do not keep it open.
270: */
271:
1.39 schwarze 272: np = n->spc ? NULL : TAILQ_PREV(n, pnodeq, child);
273: while (np != NULL) {
274: switch (pnode_class(np->node)) {
275: case CLASS_VOID:
276: case CLASS_TEXT:
277: case CLASS_BLOCK:
278: np = NULL;
279: break;
280: case CLASS_TRANS:
281: np = TAILQ_LAST(&np->childq, pnodeq);
282: continue;
283: case CLASS_LINE:
284: case CLASS_ENCL:
285: break;
286: }
287: break;
288: }
289: if (np != NULL) {
1.35 schwarze 290: i = 0;
291: while (i < sz && !isspace((unsigned char)word[i]))
292: i++;
293: if ((n->b = strndup(word, i)) == NULL)
294: fatal(p);
295: if (i == sz)
296: return;
297: while (i < sz && isspace((unsigned char)word[i]))
298: i++;
299: if (i == sz) {
300: p->flags |= PFLAG_SPC;
301: return;
302: }
303:
304: /* Put any remaining text into a second node. */
305:
306: if ((n = pnode_alloc(p->cur)) == NULL)
307: fatal(p);
308: n->node = NODE_TEXT;
309: n->spc = 1;
310: word += i;
311: sz -= i;
312: }
313: if ((n->b = strndup(word, sz)) == NULL)
314: fatal(p);
315:
316: /* The new node remains open for later pnode_closetext(). */
317:
318: p->cur = n;
1.1 schwarze 319: }
320:
1.16 schwarze 321: /*
322: * Close out the text node and strip trailing whitespace, if one is open.
323: */
1.1 schwarze 324: static void
1.37 schwarze 325: pnode_closetext(struct parse *p, int check_last_word)
1.1 schwarze 326: {
1.16 schwarze 327: struct pnode *n;
1.37 schwarze 328: char *cp, *last_word;
1.16 schwarze 329:
330: if ((n = p->cur) == NULL || n->node != NODE_TEXT)
331: return;
332: p->cur = n->parent;
1.32 schwarze 333: for (cp = strchr(n->b, '\0');
334: cp > n->b && isspace((unsigned char)cp[-1]);
335: *--cp = '\0')
1.23 schwarze 336: p->flags |= PFLAG_SPC;
1.37 schwarze 337:
338: if (p->flags & PFLAG_SPC || !check_last_word)
339: return;
340:
341: /*
342: * Find the beginning of the last word
343: * and delete whitespace before it.
344: */
345:
346: while (cp > n->b && !isspace((unsigned char)cp[-1]))
347: cp--;
348: if (cp == n->b)
349: return;
350:
351: last_word = cp;
352: while (cp > n->b && isspace((unsigned char)cp[-1]))
353: *--cp = '\0';
354:
355: /* Move the last word into its own node, for use with .Pf. */
356:
357: if ((n = pnode_alloc(p->cur)) == NULL)
358: fatal(p);
359: n->node = NODE_TEXT;
360: n->spc = 1;
361: if ((n->b = strdup(last_word)) == NULL)
362: fatal(p);
1.1 schwarze 363: }
364:
1.9 schwarze 365: static void
366: xml_entity(struct parse *p, const char *name)
367: {
368: const struct entity *entity;
1.30 schwarze 369: struct pnode *n;
1.23 schwarze 370: const char *ccp;
371: char *cp;
372: enum pstate pstate;
1.9 schwarze 373:
374: if (p->del > 0)
375: return;
376:
377: if (p->cur == NULL) {
378: error_msg(p, "discarding entity before document: &%s;", name);
379: return;
380: }
381:
1.37 schwarze 382: pnode_closetext(p, 0);
1.9 schwarze 383:
384: if (p->tree->flags & TREE_CLOSED && p->cur == p->tree->root)
385: warn_msg(p, "entity after end of document: &%s;", name);
386:
387: for (entity = entities; entity->name != NULL; entity++)
388: if (strcmp(name, entity->name) == 0)
389: break;
390:
391: if (entity->roff == NULL) {
1.23 schwarze 392: if (p->doctype != NULL) {
1.30 schwarze 393: TAILQ_FOREACH(n, &p->doctype->childq, child) {
394: if ((ccp = pnode_getattr_raw(n,
1.23 schwarze 395: ATTRKEY_NAME, NULL)) == NULL ||
1.25 schwarze 396: strcmp(ccp, name) != 0)
397: continue;
1.30 schwarze 398: if ((ccp = pnode_getattr_raw(n,
1.25 schwarze 399: ATTRKEY_SYSTEM, NULL)) != NULL) {
400: parse_file(p, -1, ccp);
401: p->flags &= ~PFLAG_SPC;
402: return;
403: }
1.30 schwarze 404: if ((ccp = pnode_getattr_raw(n,
1.23 schwarze 405: ATTRKEY_DEFINITION, NULL)) == NULL)
406: continue;
1.29 schwarze 407: if ((cp = strdup(ccp)) == NULL)
408: fatal(p);
1.23 schwarze 409: pstate = PARSE_ELEM;
410: parse_string(p, cp, strlen(cp), &pstate, 0);
411: p->flags &= ~PFLAG_SPC;
412: free(cp);
413: return;
414: }
415: }
1.9 schwarze 416: error_msg(p, "unknown entity &%s;", name);
417: return;
418: }
419:
420: /* Create, append, and close out an entity node. */
1.34 schwarze 421: if ((n = pnode_alloc(p->cur)) == NULL ||
1.32 schwarze 422: (n->b = strdup(entity->roff)) == NULL)
1.29 schwarze 423: fatal(p);
1.30 schwarze 424: n->node = NODE_ESCAPE;
425: n->spc = (p->flags & PFLAG_SPC) != 0;
1.23 schwarze 426: p->flags &= ~PFLAG_SPC;
1.9 schwarze 427: }
428:
1.1 schwarze 429: /*
1.39 schwarze 430: * Parse an element name.
431: */
432: static enum nodeid
433: xml_name2node(struct parse *p, const char *name)
434: {
435: const struct alias *alias;
436: enum nodeid node;
437:
438: if ((node = pnode_parse(name)) < NODE_UNKNOWN)
439: return node;
440:
441: for (alias = aliases; alias->name != NULL; alias++)
442: if (strcmp(alias->name, name) == 0)
443: return alias->node;
444:
445: return NODE_UNKNOWN;
446: }
447:
448: /*
1.1 schwarze 449: * Begin an element.
450: */
451: static void
1.30 schwarze 452: xml_elem_start(struct parse *p, const char *name)
1.1 schwarze 453: {
1.30 schwarze 454: struct pnode *n;
1.1 schwarze 455:
1.4 schwarze 456: /*
457: * An ancestor is excluded from the tree;
458: * keep track of the number of levels excluded.
459: */
1.30 schwarze 460: if (p->del > 0) {
1.23 schwarze 461: if (*name != '!' && *name != '?')
1.30 schwarze 462: p->del++;
1.4 schwarze 463: return;
464: }
465:
1.39 schwarze 466: switch (p->ncur = xml_name2node(p, name)) {
1.4 schwarze 467: case NODE_DELETE_WARN:
1.30 schwarze 468: warn_msg(p, "skipping element <%s>", name);
1.2 schwarze 469: /* FALLTHROUGH */
1.4 schwarze 470: case NODE_DELETE:
1.30 schwarze 471: p->del = 1;
1.4 schwarze 472: /* FALLTHROUGH */
1.2 schwarze 473: case NODE_IGNORE:
474: return;
1.39 schwarze 475: case NODE_UNKNOWN:
476: if (*name != '!' && *name != '?')
477: error_msg(p, "unknown element <%s>", name);
478: return;
1.2 schwarze 479: default:
480: break;
481: }
1.1 schwarze 482:
1.30 schwarze 483: if (p->tree->flags & TREE_CLOSED && p->cur->parent == NULL)
484: warn_msg(p, "element after end of document: <%s>", name);
1.5 schwarze 485:
1.39 schwarze 486: switch (pnode_class(p->ncur)) {
487: case CLASS_LINE:
488: case CLASS_ENCL:
489: pnode_closetext(p, 1);
490: break;
491: default:
492: pnode_closetext(p, 0);
493: break;
494: }
495:
1.34 schwarze 496: if ((n = pnode_alloc(p->cur)) == NULL)
1.30 schwarze 497: fatal(p);
1.17 schwarze 498:
499: /*
1.39 schwarze 500: * Some elements are self-closing.
1.17 schwarze 501: * Nodes that begin a new macro or request line or start by
502: * printing text always want whitespace before themselves.
503: */
504:
1.39 schwarze 505: switch (n->node = p->ncur) {
1.23 schwarze 506: case NODE_DOCTYPE:
507: case NODE_ENTITY:
508: case NODE_SBR:
1.30 schwarze 509: p->flags |= PFLAG_EEND;
1.17 schwarze 510: break;
511: default:
1.39 schwarze 512: break;
513: }
514: switch (pnode_class(p->ncur)) {
515: case CLASS_LINE:
516: case CLASS_ENCL:
1.30 schwarze 517: n->spc = (p->flags & PFLAG_SPC) != 0;
1.17 schwarze 518: break;
1.39 schwarze 519: default:
520: n->spc = 1;
521: break;
1.17 schwarze 522: }
1.30 schwarze 523: p->cur = n;
524: if (n->node == NODE_DOCTYPE) {
525: if (p->doctype == NULL)
526: p->doctype = n;
1.23 schwarze 527: else
1.30 schwarze 528: error_msg(p, "duplicate doctype");
529: } else if (n->parent == NULL && p->tree->root == NULL)
530: p->tree->root = n;
1.5 schwarze 531: }
532:
533: static void
1.30 schwarze 534: xml_attrkey(struct parse *p, const char *name)
1.5 schwarze 535: {
1.30 schwarze 536: struct pattr *a;
1.23 schwarze 537: const char *value;
1.5 schwarze 538: enum attrkey key;
1.1 schwarze 539:
1.30 schwarze 540: if (p->del > 0 || p->ncur == NODE_IGNORE || *name == '\0')
1.5 schwarze 541: return;
1.23 schwarze 542:
1.30 schwarze 543: if ((p->ncur == NODE_DOCTYPE || p->ncur == NODE_ENTITY) &&
544: TAILQ_FIRST(&p->cur->attrq) == NULL) {
1.23 schwarze 545: value = name;
546: name = "NAME";
547: } else
548: value = NULL;
549:
1.5 schwarze 550: if ((key = attrkey_parse(name)) == ATTRKEY__MAX) {
1.30 schwarze 551: p->flags &= ~PFLAG_ATTR;
1.5 schwarze 552: return;
553: }
1.30 schwarze 554: if ((a = calloc(1, sizeof(*a))) == NULL)
555: fatal(p);
1.29 schwarze 556:
1.30 schwarze 557: a->key = key;
558: a->val = ATTRVAL__MAX;
1.23 schwarze 559: if (value == NULL) {
1.30 schwarze 560: a->rawval = NULL;
561: p->flags |= PFLAG_ATTR;
1.23 schwarze 562: } else {
1.30 schwarze 563: if ((a->rawval = strdup(value)) == NULL)
564: fatal(p);
565: p->flags &= ~PFLAG_ATTR;
566: }
567: TAILQ_INSERT_TAIL(&p->cur->attrq, a, child);
568: if (p->ncur == NODE_ENTITY && key == ATTRKEY_NAME)
569: xml_attrkey(p, "DEFINITION");
1.5 schwarze 570: }
571:
572: static void
1.30 schwarze 573: xml_attrval(struct parse *p, const char *name)
1.5 schwarze 574: {
1.30 schwarze 575: struct pattr *a;
1.5 schwarze 576:
1.30 schwarze 577: if (p->del > 0 || p->ncur == NODE_IGNORE ||
578: (p->flags & PFLAG_ATTR) == 0)
1.5 schwarze 579: return;
1.30 schwarze 580: if ((a = TAILQ_LAST(&p->cur->attrq, pattrq)) == NULL)
1.5 schwarze 581: return;
1.30 schwarze 582: if ((a->val = attrval_parse(name)) == ATTRVAL__MAX &&
583: (a->rawval = strdup(name)) == NULL)
584: fatal(p);
585: p->flags &= ~PFLAG_ATTR;
1.1 schwarze 586: }
587:
588: /*
589: * Roll up the parse tree.
590: * If we're at a text node, roll that one up first.
591: */
592: static void
1.31 schwarze 593: xml_elem_end(struct parse *p, const char *name)
1.1 schwarze 594: {
1.26 schwarze 595: struct pnode *n;
596: const char *cp;
1.5 schwarze 597: enum nodeid node;
1.1 schwarze 598:
1.4 schwarze 599: /*
600: * An ancestor is excluded from the tree;
601: * keep track of the number of levels excluded.
602: */
1.31 schwarze 603: if (p->del > 1) {
604: p->del--;
1.4 schwarze 605: return;
606: }
607:
1.31 schwarze 608: if (p->del == 0)
1.37 schwarze 609: pnode_closetext(p, 0);
1.2 schwarze 610:
1.39 schwarze 611: node = name == NULL ? p->ncur : xml_name2node(p, name);
1.2 schwarze 612:
1.5 schwarze 613: switch (node) {
1.4 schwarze 614: case NODE_DELETE_WARN:
615: case NODE_DELETE:
1.31 schwarze 616: if (p->del > 0)
617: p->del--;
1.4 schwarze 618: break;
1.2 schwarze 619: case NODE_IGNORE:
1.39 schwarze 620: case NODE_UNKNOWN:
1.26 schwarze 621: break;
622: case NODE_INCLUDE:
1.31 schwarze 623: n = p->cur;
624: p->cur = p->cur->parent;
1.26 schwarze 625: cp = pnode_getattr_raw(n, ATTRKEY_HREF, NULL);
626: if (cp == NULL)
1.31 schwarze 627: error_msg(p, "<xi:include> element "
1.26 schwarze 628: "without href attribute");
629: else
1.31 schwarze 630: parse_file(p, -1, cp);
1.26 schwarze 631: pnode_unlink(n);
1.31 schwarze 632: p->flags &= ~PFLAG_SPC;
1.2 schwarze 633: break;
1.23 schwarze 634: case NODE_DOCTYPE:
1.32 schwarze 635: case NODE_SBR:
1.31 schwarze 636: p->flags &= ~PFLAG_EEND;
1.23 schwarze 637: /* FALLTHROUGH */
1.2 schwarze 638: default:
1.31 schwarze 639: if (p->cur == NULL || node != p->cur->node) {
640: warn_msg(p, "element not open: </%s>", name);
1.5 schwarze 641: break;
642: }
643:
644: /*
645: * Refrain from actually closing the document element.
646: * If no more content follows, no harm is done, but if
647: * some content still follows, simply processing it is
648: * obviously better than discarding it or crashing.
649: */
650:
1.31 schwarze 651: if (p->cur->parent != NULL || node == NODE_DOCTYPE) {
652: p->cur = p->cur->parent;
653: if (p->cur != NULL)
654: p->ncur = p->cur->node;
1.23 schwarze 655: } else
1.31 schwarze 656: p->tree->flags |= TREE_CLOSED;
657: p->flags &= ~PFLAG_SPC;
1.4 schwarze 658: break;
1.2 schwarze 659: }
1.31 schwarze 660: assert(p->del == 0);
1.1 schwarze 661: }
662:
663: struct parse *
664: parse_alloc(int warn)
665: {
666: struct parse *p;
667:
668: if ((p = calloc(1, sizeof(*p))) == NULL)
669: return NULL;
670:
671: if ((p->tree = calloc(1, sizeof(*p->tree))) == NULL) {
672: free(p);
673: return NULL;
674: }
1.23 schwarze 675: if (warn)
676: p->flags |= PFLAG_WARN;
677: else
678: p->flags &= ~PFLAG_WARN;
1.1 schwarze 679: return p;
680: }
681:
682: void
683: parse_free(struct parse *p)
684: {
685: if (p == NULL)
686: return;
687: if (p->tree != NULL) {
688: pnode_unlink(p->tree->root);
689: free(p->tree);
690: }
691: free(p);
692: }
693:
1.14 schwarze 694: static void
695: increment(struct parse *p, char *b, size_t *pend, int refill)
696: {
697: if (refill) {
698: if (b[*pend] == '\n') {
699: p->nline++;
700: p->ncol = 1;
701: } else
702: p->ncol++;
703: }
704: ++*pend;
705: }
706:
1.5 schwarze 707: /*
708: * Advance the pend pointer to the next character in the charset.
709: * If the charset starts with a space, it stands for any whitespace.
710: * Update the new input file position, used for messages.
711: * Do not overrun the buffer b of length rlen.
712: * When reaching the end, NUL-terminate the buffer and return 1;
713: * otherwise, return 0.
714: */
715: static int
716: advance(struct parse *p, char *b, size_t rlen, size_t *pend,
1.14 schwarze 717: const char *charset, int refill)
1.5 schwarze 718: {
719: int space;
720:
721: if (*charset == ' ') {
722: space = 1;
723: charset++;
724: } else
725: space = 0;
726:
1.14 schwarze 727: if (refill) {
728: p->nline = p->line;
729: p->ncol = p->col;
730: }
1.5 schwarze 731: while (*pend < rlen) {
732: if (space && isspace((unsigned char)b[*pend]))
733: break;
734: if (strchr(charset, b[*pend]) != NULL)
735: break;
1.14 schwarze 736: increment(p, b, pend, refill);
1.5 schwarze 737: }
738: if (*pend == rlen) {
739: b[rlen] = '\0';
1.14 schwarze 740: return refill;
1.5 schwarze 741: } else
742: return 0;
743: }
744:
1.14 schwarze 745: size_t
746: parse_string(struct parse *p, char *b, size_t rlen,
747: enum pstate *pstate, int refill)
748: {
749: char *cp;
750: size_t poff; /* Parse offset in b[]. */
751: size_t pend; /* Offset of the end of the current word. */
752: int elem_end;
753:
754: pend = 0;
755: for (;;) {
756:
757: /* Proceed to the next token, skipping whitespace. */
758:
759: if (refill) {
760: p->line = p->nline;
761: p->col = p->ncol;
762: }
763: if ((poff = pend) == rlen)
764: break;
765: if (isspace((unsigned char)b[pend])) {
1.23 schwarze 766: p->flags |= PFLAG_SPC;
1.14 schwarze 767: increment(p, b, &pend, refill);
768: continue;
769: }
770:
771: /*
772: * The following four cases (ARG, TAG, and starting an
773: * entity or a tag) all parse a word or quoted string.
774: * If that extends beyond the read buffer and the last
775: * read(2) still got data, they all break out of the
776: * token loop to request more data from the read loop.
777: *
778: * Also, three of them detect self-closing tags, those
779: * ending with "/>", setting the flag elem_end and
780: * calling xml_elem_end() at the very end, after
781: * handling the attribute value, attribute name, or
782: * tag name, respectively.
783: */
784:
785: /* Parse an attribute value. */
786:
787: if (*pstate >= PARSE_ARG) {
788: if (*pstate == PARSE_ARG &&
789: (b[pend] == '\'' || b[pend] == '"')) {
790: *pstate = b[pend] == '"' ?
791: PARSE_DQ : PARSE_SQ;
792: increment(p, b, &pend, refill);
793: continue;
794: }
795: if (advance(p, b, rlen, &pend,
796: *pstate == PARSE_DQ ? "\"" :
797: *pstate == PARSE_SQ ? "'" : " >", refill))
798: break;
799: *pstate = PARSE_TAG;
800: elem_end = 0;
801: if (b[pend] == '>') {
802: *pstate = PARSE_ELEM;
803: if (pend > 0 && b[pend - 1] == '/') {
804: b[pend - 1] = '\0';
805: elem_end = 1;
806: }
1.23 schwarze 807: if (p->flags & PFLAG_EEND)
808: elem_end = 1;
1.14 schwarze 809: }
810: b[pend] = '\0';
811: if (pend < rlen)
812: increment(p, b, &pend, refill);
813: xml_attrval(p, b + poff);
814: if (elem_end)
815: xml_elem_end(p, NULL);
816:
817: /* Look for an attribute name. */
818:
819: } else if (*pstate == PARSE_TAG) {
1.23 schwarze 820: switch (p->ncur) {
821: case NODE_DOCTYPE:
822: if (b[pend] == '[') {
823: *pstate = PARSE_ELEM;
824: increment(p, b, &pend, refill);
825: continue;
826: }
827: /* FALLTHROUGH */
828: case NODE_ENTITY:
829: if (b[pend] == '"' || b[pend] == '\'') {
830: *pstate = PARSE_ARG;
831: continue;
832: }
833: break;
834: default:
835: break;
836: }
1.14 schwarze 837: if (advance(p, b, rlen, &pend, " =>", refill))
838: break;
839: elem_end = 0;
840: switch (b[pend]) {
841: case '>':
842: *pstate = PARSE_ELEM;
843: if (pend > 0 && b[pend - 1] == '/') {
844: b[pend - 1] = '\0';
845: elem_end = 1;
846: }
1.23 schwarze 847: if (p->flags & PFLAG_EEND)
848: elem_end = 1;
1.14 schwarze 849: break;
850: case '=':
851: *pstate = PARSE_ARG;
852: break;
853: default:
854: break;
855: }
856: b[pend] = '\0';
857: if (pend < rlen)
858: increment(p, b, &pend, refill);
859: xml_attrkey(p, b + poff);
860: if (elem_end)
861: xml_elem_end(p, NULL);
862:
863: /* Begin an opening or closing tag. */
864:
865: } else if (b[poff] == '<') {
866: if (advance(p, b, rlen, &pend, " >", refill))
867: break;
868: if (pend > poff + 3 &&
869: strncmp(b + poff, "<!--", 4) == 0) {
870:
871: /* Skip a comment. */
872:
873: cp = strstr(b + pend - 2, "-->");
874: if (cp == NULL) {
875: if (refill)
876: break;
877: cp = b + rlen;
878: } else
879: cp += 3;
880: while (b + pend < cp)
881: increment(p, b, &pend, refill);
882: continue;
883: }
884: elem_end = 0;
885: if (b[pend] != '>')
886: *pstate = PARSE_TAG;
887: else if (pend > 0 && b[pend - 1] == '/') {
888: b[pend - 1] = '\0';
889: elem_end = 1;
890: }
891: b[pend] = '\0';
892: if (pend < rlen)
893: increment(p, b, &pend, refill);
894: if (b[++poff] == '/') {
895: elem_end = 1;
896: poff++;
1.23 schwarze 897: } else {
1.14 schwarze 898: xml_elem_start(p, b + poff);
1.23 schwarze 899: if (*pstate == PARSE_ELEM &&
900: p->flags & PFLAG_EEND)
901: elem_end = 1;
902: }
1.14 schwarze 903: if (elem_end)
904: xml_elem_end(p, b + poff);
905:
1.23 schwarze 906: /* Close a doctype. */
907:
908: } else if (p->ncur == NODE_DOCTYPE && b[poff] == ']') {
909: *pstate = PARSE_TAG;
910: increment(p, b, &pend, refill);
911:
1.14 schwarze 912: /* Process an entity. */
913:
914: } else if (b[poff] == '&') {
915: if (advance(p, b, rlen, &pend, ";", refill))
916: break;
917: b[pend] = '\0';
918: if (pend < rlen)
919: increment(p, b, &pend, refill);
920: xml_entity(p, b + poff + 1);
921:
922: /* Process text up to the next tag, entity, or EOL. */
923:
924: } else {
1.28 schwarze 925: advance(p, b, rlen, &pend,
1.33 schwarze 926: p->ncur == NODE_DOCTYPE ? "<&]\n" : "<&\n",
1.28 schwarze 927: refill);
1.35 schwarze 928: xml_text(p, b + poff, pend - poff);
1.33 schwarze 929: if (b[pend] == '\n')
1.37 schwarze 930: pnode_closetext(p, 0);
1.14 schwarze 931: }
932: }
933: return poff;
934: }
935:
1.24 schwarze 936:
937: /*
938: * The read loop.
939: * If the previous token was incomplete and asked for more input,
940: * we have to enter the read loop once more even on EOF.
941: * Once rsz is 0, incomplete tokens will no longer ask for more input
942: * but instead use whatever there is, and then exit the read loop.
943: * The minus one on the size limit for read(2) is needed such that
944: * advance() can set b[rlen] to NUL when needed.
945: */
946: static void
947: parse_fd(struct parse *p, int fd)
1.1 schwarze 948: {
949: char b[4096];
1.5 schwarze 950: ssize_t rsz; /* Return value from read(2). */
1.14 schwarze 951: size_t rlen; /* Number of bytes in b[]. */
1.5 schwarze 952: size_t poff; /* Parse offset in b[]. */
1.14 schwarze 953: enum pstate pstate;
1.1 schwarze 954:
1.24 schwarze 955: rlen = 0;
1.14 schwarze 956: pstate = PARSE_ELEM;
957: while ((rsz = read(fd, b + rlen, sizeof(b) - rlen - 1)) >= 0 &&
958: (rlen += rsz) > 0) {
959: poff = parse_string(p, b, rlen, &pstate, rsz > 0);
1.5 schwarze 960: /* Buffer exhausted; shift left and re-fill. */
961: assert(poff > 0);
962: rlen -= poff;
1.14 schwarze 963: memmove(b, b + poff, rlen);
1.5 schwarze 964: }
1.24 schwarze 965: if (rsz < 0)
966: error_msg(p, "read: %s", strerror(errno));
967: }
968:
969: /*
970: * Open and parse a file.
971: */
972: struct ptree *
973: parse_file(struct parse *p, int fd, const char *fname)
974: {
975: const char *save_fname;
976: int save_line, save_col;
977:
978: /* Save and initialize reporting data. */
979:
980: save_fname = p->fname;
981: save_line = p->nline;
982: save_col = p->ncol;
983: p->fname = fname;
984: p->line = 0;
985: p->col = 0;
986:
987: /* Open the file, unless it is already open. */
988:
989: if (fd == -1 && (fd = open(fname, O_RDONLY, 0)) == -1) {
990: error_msg(p, "open: %s", strerror(errno));
991: p->fname = save_fname;
992: return p->tree;
1.5 schwarze 993: }
1.24 schwarze 994:
995: /*
996: * After opening the starting file, change to the directory it
997: * is located in, in case it wants to include any further files,
998: * which are typically given with relative paths in DocBook.
999: * Do this on a best-effort basis; don't complain about failure.
1000: */
1001:
1002: if (save_fname == NULL && (fname = dirname(fname)) != NULL &&
1003: strcmp(fname, ".") != 0)
1004: (void)chdir(fname);
1005:
1006: /* Run the read loop. */
1007:
1008: p->nline = 1;
1009: p->ncol = 1;
1010: parse_fd(p, fd);
1011:
1012: /* On the top level, finalize the parse tree. */
1013:
1014: if (save_fname == NULL) {
1.37 schwarze 1015: pnode_closetext(p, 0);
1.24 schwarze 1016: if (p->tree->root == NULL)
1017: error_msg(p, "empty document");
1018: else if ((p->tree->flags & TREE_CLOSED) == 0)
1019: warn_msg(p, "document not closed");
1020: pnode_unlink(p->doctype);
1021: }
1022:
1023: /* Clean up. */
1024:
1025: if (fd != STDIN_FILENO)
1026: close(fd);
1027: p->fname = save_fname;
1028: p->nline = save_line;
1029: p->ncol = save_col;
1.1 schwarze 1030: return p->tree;
1031: }
/* CVSweb */