Annotation of docbook2mdoc/parse.c, Revision 1.41
1.41 ! schwarze 1: /* $Id: parse.c,v 1.40 2019/04/12 21:37:07 schwarze Exp $ */
1.1 schwarze 2: /*
3: * Copyright (c) 2014 Kristaps Dzonsons <kristaps@bsd.lv>
4: * Copyright (c) 2019 Ingo Schwarze <schwarze@openbsd.org>
5: *
6: * Permission to use, copy, modify, and distribute this software for any
7: * purpose with or without fee is hereby granted, provided that the above
8: * copyright notice and this permission notice appear in all copies.
9: *
10: * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
11: * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12: * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
13: * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14: * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15: * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16: * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17: */
18: #include <assert.h>
19: #include <ctype.h>
1.24 schwarze 20: #include <errno.h>
21: #include <fcntl.h>
22: #include <libgen.h>
1.6 schwarze 23: #include <stdarg.h>
1.1 schwarze 24: #include <stdio.h>
1.5 schwarze 25: #include <stdlib.h>
1.1 schwarze 26: #include <string.h>
27: #include <unistd.h>
28:
29: #include "node.h"
30: #include "parse.h"
31:
32: /*
33: * The implementation of the DocBook parser.
34: */
35:
/*
 * Tokenizer states of parse_string().
 * The order is significant: every state from PARSE_ARG onward
 * means that an attribute value is being read.
 */
enum pstate {
	PARSE_ELEM,	/* At content level, not inside a tag. */
	PARSE_TAG,	/* Inside a tag, looking for an attribute name. */
	PARSE_ARG,	/* Looking for an attribute value. */
	PARSE_SQ,	/* Inside a single-quoted attribute value. */
	PARSE_DQ	/* Inside a double-quoted attribute value. */
};
43:
1.1 schwarze 44: /*
45: * Global parse state.
46: * Keep this as simple and small as possible.
47: */
48: struct parse {
49: const char *fname; /* Name of the input file. */
50: struct ptree *tree; /* Complete parse result. */
1.23 schwarze 51: struct pnode *doctype;
1.1 schwarze 52: struct pnode *cur; /* Current node in the tree. */
1.5 schwarze 53: enum nodeid ncur; /* Type of the current node. */
54: int line; /* Line number in the input file. */
55: int col; /* Column number in the input file. */
56: int nline; /* Line number of next token. */
57: int ncol; /* Column number of next token. */
1.4 schwarze 58: int del; /* Levels of nested nodes being deleted. */
1.23 schwarze 59: int flags;
60: #define PFLAG_WARN (1 << 0) /* Print warning messages. */
61: #define PFLAG_SPC (1 << 1) /* Whitespace before the next element. */
62: #define PFLAG_ATTR (1 << 2) /* The most recent attribute is valid. */
63: #define PFLAG_EEND (1 << 3) /* This element is self-closing. */
1.1 schwarze 64: };
65:
1.39 schwarze 66: struct alias {
1.1 schwarze 67: const char *name; /* DocBook element name. */
68: enum nodeid node; /* Node type to generate. */
69: };
70:
1.39 schwarze 71: static const struct alias aliases[] = {
1.3 schwarze 72: { "acronym", NODE_IGNORE },
1.4 schwarze 73: { "anchor", NODE_DELETE },
1.22 schwarze 74: { "article", NODE_SECTION },
1.41 ! schwarze 75: { "articleinfo", NODE_BOOKINFO },
1.22 schwarze 76: { "book", NODE_SECTION },
1.1 schwarze 77: { "chapter", NODE_SECTION },
1.13 schwarze 78: { "code", NODE_LITERAL },
1.36 schwarze 79: { "computeroutput", NODE_LITERAL },
1.23 schwarze 80: { "!doctype", NODE_DOCTYPE },
1.7 schwarze 81: { "firstname", NODE_PERSONNAME },
1.21 schwarze 82: { "glossary", NODE_VARIABLELIST },
83: { "glossdef", NODE_IGNORE },
84: { "glossdiv", NODE_IGNORE },
85: { "glossentry", NODE_VARLISTENTRY },
86: { "glosslist", NODE_VARIABLELIST },
1.4 schwarze 87: { "indexterm", NODE_DELETE },
1.11 schwarze 88: { "informaltable", NODE_TABLE },
1.40 schwarze 89: { "othercredit", NODE_AUTHOR },
1.7 schwarze 90: { "othername", NODE_PERSONNAME },
1.1 schwarze 91: { "part", NODE_SECTION },
1.3 schwarze 92: { "phrase", NODE_IGNORE },
1.4 schwarze 93: { "primary", NODE_DELETE },
1.1 schwarze 94: { "refsect1", NODE_SECTION },
95: { "refsect2", NODE_SECTION },
96: { "refsect3", NODE_SECTION },
97: { "refsection", NODE_SECTION },
1.4 schwarze 98: { "secondary", NODE_DELETE },
1.1 schwarze 99: { "sect1", NODE_SECTION },
100: { "sect2", NODE_SECTION },
1.36 schwarze 101: { "sgmltag", NODE_MARKUP },
1.15 schwarze 102: { "simpara", NODE_PARA },
1.13 schwarze 103: { "structfield", NODE_PARAMETER },
104: { "structname", NODE_TYPE },
1.7 schwarze 105: { "surname", NODE_PERSONNAME },
1.12 schwarze 106: { "symbol", NODE_CONSTANT },
1.3 schwarze 107: { "trademark", NODE_IGNORE },
1.18 schwarze 108: { "ulink", NODE_LINK },
1.13 schwarze 109: { "userinput", NODE_LITERAL },
1.5 schwarze 110: { NULL, NODE_IGNORE }
1.1 schwarze 111: };
112:
/*
 * One entry of the table of predefined character entities.
 */
struct entity {
	const char	*name;	/* Entity name, without "&" and ";". */
	const char	*roff;	/* Text stored in the escape node. */
};
117:
118: /*
119: * XML character entity references found in the wild.
120: * Those that don't have an exact mandoc_char(7) representation
121: * are approximated, and the desired codepoint is given as a comment.
122: * Encoding them as \\[u...] would leave -Tascii out in the cold.
123: */
124: static const struct entity entities[] = {
125: { "alpha", "\\(*a" },
126: { "amp", "&" },
127: { "apos", "'" },
128: { "auml", "\\(:a" },
129: { "beta", "\\(*b" },
130: { "circ", "^" }, /* U+02C6 */
131: { "copy", "\\(co" },
132: { "dagger", "\\(dg" },
133: { "Delta", "\\(*D" },
134: { "eacute", "\\('e" },
135: { "emsp", "\\ " }, /* U+2003 */
136: { "gt", ">" },
137: { "hairsp", "\\^" },
138: { "kappa", "\\(*k" },
139: { "larr", "\\(<-" },
140: { "ldquo", "\\(lq" },
141: { "le", "\\(<=" },
142: { "lowbar", "_" },
143: { "lsqb", "[" },
144: { "lt", "<" },
145: { "mdash", "\\(em" },
146: { "minus", "\\-" },
147: { "ndash", "\\(en" },
148: { "nbsp", "\\ " },
149: { "num", "#" },
150: { "oslash", "\\(/o" },
151: { "ouml", "\\(:o" },
152: { "percnt", "%" },
153: { "quot", "\\(dq" },
154: { "rarr", "\\(->" },
155: { "rArr", "\\(rA" },
156: { "rdquo", "\\(rq" },
157: { "reg", "\\(rg" },
158: { "rho", "\\(*r" },
159: { "rsqb", "]" },
160: { "sigma", "\\(*s" },
161: { "shy", "\\&" }, /* U+00AD */
162: { "tau", "\\(*t" },
163: { "tilde", "\\[u02DC]" },
164: { "times", "\\[tmu]" },
165: { "uuml", "\\(:u" },
166: { NULL, NULL }
167: };
168:
/*
 * Forward declarations.
 * parse_string() is called by xml_entity() ahead of its definition.
 */
static size_t	 parse_string(struct parse *p, char *b, size_t rlen,
		    enum pstate *pstate, int refill);
static void	 parse_fd(struct parse *p, int fd);
1.23 schwarze 172:
173:
1.6 schwarze 174: static void
1.29 schwarze 175: fatal(struct parse *p)
176: {
177: fprintf(stderr, "%s:%d:%d: FATAL: ", p->fname, p->line, p->col);
178: perror(NULL);
179: exit(6);
180: }
181:
182: static void
1.6 schwarze 183: error_msg(struct parse *p, const char *fmt, ...)
184: {
185: va_list ap;
186:
1.29 schwarze 187: fprintf(stderr, "%s:%d:%d: ERROR: ", p->fname, p->line, p->col);
1.6 schwarze 188: va_start(ap, fmt);
189: vfprintf(stderr, fmt, ap);
190: va_end(ap);
191: fputc('\n', stderr);
1.29 schwarze 192: p->tree->flags |= TREE_ERROR;
1.6 schwarze 193: }
194:
195: static void
196: warn_msg(struct parse *p, const char *fmt, ...)
197: {
198: va_list ap;
199:
1.23 schwarze 200: if ((p->flags & PFLAG_WARN) == 0)
1.6 schwarze 201: return;
202:
1.29 schwarze 203: fprintf(stderr, "%s:%d:%d: WARNING: ", p->fname, p->line, p->col);
1.6 schwarze 204: va_start(ap, fmt);
205: vfprintf(stderr, fmt, ap);
206: va_end(ap);
207: fputc('\n', stderr);
1.29 schwarze 208: p->tree->flags |= TREE_WARN;
1.6 schwarze 209: }
210:
1.1 schwarze 211: /*
212: * Process a string of characters.
213: * If a text node is already open, append to it.
214: * Otherwise, create a new one as a child of the current node.
215: */
216: static void
1.35 schwarze 217: xml_text(struct parse *p, const char *word, int sz)
1.1 schwarze 218: {
1.35 schwarze 219: struct pnode *n, *np;
1.32 schwarze 220: size_t oldsz, newsz;
1.35 schwarze 221: int i;
1.1 schwarze 222:
1.32 schwarze 223: assert(sz > 0);
1.30 schwarze 224: if (p->del > 0)
1.1 schwarze 225: return;
226:
1.32 schwarze 227: if ((n = p->cur) == NULL) {
1.35 schwarze 228: error_msg(p, "discarding text before document: %.*s",
229: sz, word);
1.5 schwarze 230: return;
231: }
232:
1.35 schwarze 233: /* Append to the current text node, if one is open. */
234:
235: if (n->node == NODE_TEXT) {
236: oldsz = strlen(n->b);
237: newsz = oldsz + sz;
238: if (oldsz && (p->flags & PFLAG_SPC))
239: newsz++;
240: if ((n->b = realloc(n->b, newsz + 1)) == NULL)
1.30 schwarze 241: fatal(p);
1.35 schwarze 242: if (oldsz && (p->flags & PFLAG_SPC))
243: n->b[oldsz++] = ' ';
244: memcpy(n->b + oldsz, word, sz);
245: n->b[newsz] = '\0';
246: p->flags &= ~PFLAG_SPC;
247: return;
1.1 schwarze 248: }
249:
1.35 schwarze 250: if (p->tree->flags & TREE_CLOSED && n == p->tree->root)
1.30 schwarze 251: warn_msg(p, "text after end of document: %.*s", sz, word);
1.5 schwarze 252:
1.35 schwarze 253: /* Create a new text node. */
1.1 schwarze 254:
1.35 schwarze 255: if ((n = pnode_alloc(p->cur)) == NULL)
1.30 schwarze 256: fatal(p);
1.35 schwarze 257: n->node = NODE_TEXT;
258: n->spc = (p->flags & PFLAG_SPC) != 0;
1.30 schwarze 259: p->flags &= ~PFLAG_SPC;
1.35 schwarze 260:
261: /*
1.39 schwarze 262: * If this node follows an in-line macro without intervening
1.35 schwarze 263: * whitespace, keep the text in it as short as possible,
264: * and do not keep it open.
265: */
266:
1.39 schwarze 267: np = n->spc ? NULL : TAILQ_PREV(n, pnodeq, child);
268: while (np != NULL) {
269: switch (pnode_class(np->node)) {
270: case CLASS_VOID:
271: case CLASS_TEXT:
272: case CLASS_BLOCK:
273: np = NULL;
274: break;
275: case CLASS_TRANS:
276: np = TAILQ_LAST(&np->childq, pnodeq);
277: continue;
278: case CLASS_LINE:
279: case CLASS_ENCL:
280: break;
281: }
282: break;
283: }
284: if (np != NULL) {
1.35 schwarze 285: i = 0;
286: while (i < sz && !isspace((unsigned char)word[i]))
287: i++;
288: if ((n->b = strndup(word, i)) == NULL)
289: fatal(p);
290: if (i == sz)
291: return;
292: while (i < sz && isspace((unsigned char)word[i]))
293: i++;
294: if (i == sz) {
295: p->flags |= PFLAG_SPC;
296: return;
297: }
298:
299: /* Put any remaining text into a second node. */
300:
301: if ((n = pnode_alloc(p->cur)) == NULL)
302: fatal(p);
303: n->node = NODE_TEXT;
304: n->spc = 1;
305: word += i;
306: sz -= i;
307: }
308: if ((n->b = strndup(word, sz)) == NULL)
309: fatal(p);
310:
311: /* The new node remains open for later pnode_closetext(). */
312:
313: p->cur = n;
1.1 schwarze 314: }
315:
1.16 schwarze 316: /*
317: * Close out the text node and strip trailing whitespace, if one is open.
318: */
1.1 schwarze 319: static void
1.37 schwarze 320: pnode_closetext(struct parse *p, int check_last_word)
1.1 schwarze 321: {
1.16 schwarze 322: struct pnode *n;
1.37 schwarze 323: char *cp, *last_word;
1.16 schwarze 324:
325: if ((n = p->cur) == NULL || n->node != NODE_TEXT)
326: return;
327: p->cur = n->parent;
1.32 schwarze 328: for (cp = strchr(n->b, '\0');
329: cp > n->b && isspace((unsigned char)cp[-1]);
330: *--cp = '\0')
1.23 schwarze 331: p->flags |= PFLAG_SPC;
1.37 schwarze 332:
333: if (p->flags & PFLAG_SPC || !check_last_word)
334: return;
335:
336: /*
337: * Find the beginning of the last word
338: * and delete whitespace before it.
339: */
340:
341: while (cp > n->b && !isspace((unsigned char)cp[-1]))
342: cp--;
343: if (cp == n->b)
344: return;
345:
346: last_word = cp;
347: while (cp > n->b && isspace((unsigned char)cp[-1]))
348: *--cp = '\0';
349:
350: /* Move the last word into its own node, for use with .Pf. */
351:
352: if ((n = pnode_alloc(p->cur)) == NULL)
353: fatal(p);
354: n->node = NODE_TEXT;
355: n->spc = 1;
356: if ((n->b = strdup(last_word)) == NULL)
357: fatal(p);
1.1 schwarze 358: }
359:
1.9 schwarze 360: static void
361: xml_entity(struct parse *p, const char *name)
362: {
363: const struct entity *entity;
1.30 schwarze 364: struct pnode *n;
1.23 schwarze 365: const char *ccp;
366: char *cp;
367: enum pstate pstate;
1.9 schwarze 368:
369: if (p->del > 0)
370: return;
371:
372: if (p->cur == NULL) {
373: error_msg(p, "discarding entity before document: &%s;", name);
374: return;
375: }
376:
1.37 schwarze 377: pnode_closetext(p, 0);
1.9 schwarze 378:
379: if (p->tree->flags & TREE_CLOSED && p->cur == p->tree->root)
380: warn_msg(p, "entity after end of document: &%s;", name);
381:
382: for (entity = entities; entity->name != NULL; entity++)
383: if (strcmp(name, entity->name) == 0)
384: break;
385:
386: if (entity->roff == NULL) {
1.23 schwarze 387: if (p->doctype != NULL) {
1.30 schwarze 388: TAILQ_FOREACH(n, &p->doctype->childq, child) {
389: if ((ccp = pnode_getattr_raw(n,
1.23 schwarze 390: ATTRKEY_NAME, NULL)) == NULL ||
1.25 schwarze 391: strcmp(ccp, name) != 0)
392: continue;
1.30 schwarze 393: if ((ccp = pnode_getattr_raw(n,
1.25 schwarze 394: ATTRKEY_SYSTEM, NULL)) != NULL) {
395: parse_file(p, -1, ccp);
396: p->flags &= ~PFLAG_SPC;
397: return;
398: }
1.30 schwarze 399: if ((ccp = pnode_getattr_raw(n,
1.23 schwarze 400: ATTRKEY_DEFINITION, NULL)) == NULL)
401: continue;
1.29 schwarze 402: if ((cp = strdup(ccp)) == NULL)
403: fatal(p);
1.23 schwarze 404: pstate = PARSE_ELEM;
405: parse_string(p, cp, strlen(cp), &pstate, 0);
406: p->flags &= ~PFLAG_SPC;
407: free(cp);
408: return;
409: }
410: }
1.9 schwarze 411: error_msg(p, "unknown entity &%s;", name);
412: return;
413: }
414:
415: /* Create, append, and close out an entity node. */
1.34 schwarze 416: if ((n = pnode_alloc(p->cur)) == NULL ||
1.32 schwarze 417: (n->b = strdup(entity->roff)) == NULL)
1.29 schwarze 418: fatal(p);
1.30 schwarze 419: n->node = NODE_ESCAPE;
420: n->spc = (p->flags & PFLAG_SPC) != 0;
1.23 schwarze 421: p->flags &= ~PFLAG_SPC;
1.9 schwarze 422: }
423:
1.1 schwarze 424: /*
1.39 schwarze 425: * Parse an element name.
426: */
427: static enum nodeid
428: xml_name2node(struct parse *p, const char *name)
429: {
430: const struct alias *alias;
431: enum nodeid node;
432:
433: if ((node = pnode_parse(name)) < NODE_UNKNOWN)
434: return node;
435:
436: for (alias = aliases; alias->name != NULL; alias++)
437: if (strcmp(alias->name, name) == 0)
438: return alias->node;
439:
440: return NODE_UNKNOWN;
441: }
442:
443: /*
1.1 schwarze 444: * Begin an element.
445: */
446: static void
1.30 schwarze 447: xml_elem_start(struct parse *p, const char *name)
1.1 schwarze 448: {
1.30 schwarze 449: struct pnode *n;
1.1 schwarze 450:
1.4 schwarze 451: /*
452: * An ancestor is excluded from the tree;
453: * keep track of the number of levels excluded.
454: */
1.30 schwarze 455: if (p->del > 0) {
1.23 schwarze 456: if (*name != '!' && *name != '?')
1.30 schwarze 457: p->del++;
1.4 schwarze 458: return;
459: }
460:
1.39 schwarze 461: switch (p->ncur = xml_name2node(p, name)) {
1.4 schwarze 462: case NODE_DELETE_WARN:
1.30 schwarze 463: warn_msg(p, "skipping element <%s>", name);
1.2 schwarze 464: /* FALLTHROUGH */
1.4 schwarze 465: case NODE_DELETE:
1.30 schwarze 466: p->del = 1;
1.4 schwarze 467: /* FALLTHROUGH */
1.2 schwarze 468: case NODE_IGNORE:
469: return;
1.39 schwarze 470: case NODE_UNKNOWN:
471: if (*name != '!' && *name != '?')
472: error_msg(p, "unknown element <%s>", name);
473: return;
1.2 schwarze 474: default:
475: break;
476: }
1.1 schwarze 477:
1.30 schwarze 478: if (p->tree->flags & TREE_CLOSED && p->cur->parent == NULL)
479: warn_msg(p, "element after end of document: <%s>", name);
1.5 schwarze 480:
1.39 schwarze 481: switch (pnode_class(p->ncur)) {
482: case CLASS_LINE:
483: case CLASS_ENCL:
484: pnode_closetext(p, 1);
485: break;
486: default:
487: pnode_closetext(p, 0);
488: break;
489: }
490:
1.34 schwarze 491: if ((n = pnode_alloc(p->cur)) == NULL)
1.30 schwarze 492: fatal(p);
1.17 schwarze 493:
494: /*
1.39 schwarze 495: * Some elements are self-closing.
1.17 schwarze 496: * Nodes that begin a new macro or request line or start by
497: * printing text always want whitespace before themselves.
498: */
499:
1.39 schwarze 500: switch (n->node = p->ncur) {
1.23 schwarze 501: case NODE_DOCTYPE:
502: case NODE_ENTITY:
503: case NODE_SBR:
1.30 schwarze 504: p->flags |= PFLAG_EEND;
1.17 schwarze 505: break;
506: default:
1.39 schwarze 507: break;
508: }
509: switch (pnode_class(p->ncur)) {
510: case CLASS_LINE:
511: case CLASS_ENCL:
1.30 schwarze 512: n->spc = (p->flags & PFLAG_SPC) != 0;
1.17 schwarze 513: break;
1.39 schwarze 514: default:
515: n->spc = 1;
516: break;
1.17 schwarze 517: }
1.30 schwarze 518: p->cur = n;
519: if (n->node == NODE_DOCTYPE) {
520: if (p->doctype == NULL)
521: p->doctype = n;
1.23 schwarze 522: else
1.30 schwarze 523: error_msg(p, "duplicate doctype");
524: } else if (n->parent == NULL && p->tree->root == NULL)
525: p->tree->root = n;
1.5 schwarze 526: }
527:
528: static void
1.30 schwarze 529: xml_attrkey(struct parse *p, const char *name)
1.5 schwarze 530: {
1.30 schwarze 531: struct pattr *a;
1.23 schwarze 532: const char *value;
1.5 schwarze 533: enum attrkey key;
1.1 schwarze 534:
1.30 schwarze 535: if (p->del > 0 || p->ncur == NODE_IGNORE || *name == '\0')
1.5 schwarze 536: return;
1.23 schwarze 537:
1.30 schwarze 538: if ((p->ncur == NODE_DOCTYPE || p->ncur == NODE_ENTITY) &&
539: TAILQ_FIRST(&p->cur->attrq) == NULL) {
1.23 schwarze 540: value = name;
541: name = "NAME";
542: } else
543: value = NULL;
544:
1.5 schwarze 545: if ((key = attrkey_parse(name)) == ATTRKEY__MAX) {
1.30 schwarze 546: p->flags &= ~PFLAG_ATTR;
1.5 schwarze 547: return;
548: }
1.30 schwarze 549: if ((a = calloc(1, sizeof(*a))) == NULL)
550: fatal(p);
1.29 schwarze 551:
1.30 schwarze 552: a->key = key;
553: a->val = ATTRVAL__MAX;
1.23 schwarze 554: if (value == NULL) {
1.30 schwarze 555: a->rawval = NULL;
556: p->flags |= PFLAG_ATTR;
1.23 schwarze 557: } else {
1.30 schwarze 558: if ((a->rawval = strdup(value)) == NULL)
559: fatal(p);
560: p->flags &= ~PFLAG_ATTR;
561: }
562: TAILQ_INSERT_TAIL(&p->cur->attrq, a, child);
563: if (p->ncur == NODE_ENTITY && key == ATTRKEY_NAME)
564: xml_attrkey(p, "DEFINITION");
1.5 schwarze 565: }
566:
567: static void
1.30 schwarze 568: xml_attrval(struct parse *p, const char *name)
1.5 schwarze 569: {
1.30 schwarze 570: struct pattr *a;
1.5 schwarze 571:
1.30 schwarze 572: if (p->del > 0 || p->ncur == NODE_IGNORE ||
573: (p->flags & PFLAG_ATTR) == 0)
1.5 schwarze 574: return;
1.30 schwarze 575: if ((a = TAILQ_LAST(&p->cur->attrq, pattrq)) == NULL)
1.5 schwarze 576: return;
1.30 schwarze 577: if ((a->val = attrval_parse(name)) == ATTRVAL__MAX &&
578: (a->rawval = strdup(name)) == NULL)
579: fatal(p);
580: p->flags &= ~PFLAG_ATTR;
1.1 schwarze 581: }
582:
583: /*
584: * Roll up the parse tree.
585: * If we're at a text node, roll that one up first.
586: */
587: static void
1.31 schwarze 588: xml_elem_end(struct parse *p, const char *name)
1.1 schwarze 589: {
1.26 schwarze 590: struct pnode *n;
591: const char *cp;
1.5 schwarze 592: enum nodeid node;
1.1 schwarze 593:
1.4 schwarze 594: /*
595: * An ancestor is excluded from the tree;
596: * keep track of the number of levels excluded.
597: */
1.31 schwarze 598: if (p->del > 1) {
599: p->del--;
1.4 schwarze 600: return;
601: }
602:
1.31 schwarze 603: if (p->del == 0)
1.37 schwarze 604: pnode_closetext(p, 0);
1.2 schwarze 605:
1.39 schwarze 606: node = name == NULL ? p->ncur : xml_name2node(p, name);
1.2 schwarze 607:
1.5 schwarze 608: switch (node) {
1.4 schwarze 609: case NODE_DELETE_WARN:
610: case NODE_DELETE:
1.31 schwarze 611: if (p->del > 0)
612: p->del--;
1.4 schwarze 613: break;
1.2 schwarze 614: case NODE_IGNORE:
1.39 schwarze 615: case NODE_UNKNOWN:
1.26 schwarze 616: break;
617: case NODE_INCLUDE:
1.31 schwarze 618: n = p->cur;
619: p->cur = p->cur->parent;
1.26 schwarze 620: cp = pnode_getattr_raw(n, ATTRKEY_HREF, NULL);
621: if (cp == NULL)
1.31 schwarze 622: error_msg(p, "<xi:include> element "
1.26 schwarze 623: "without href attribute");
624: else
1.31 schwarze 625: parse_file(p, -1, cp);
1.26 schwarze 626: pnode_unlink(n);
1.31 schwarze 627: p->flags &= ~PFLAG_SPC;
1.2 schwarze 628: break;
1.23 schwarze 629: case NODE_DOCTYPE:
1.32 schwarze 630: case NODE_SBR:
1.31 schwarze 631: p->flags &= ~PFLAG_EEND;
1.23 schwarze 632: /* FALLTHROUGH */
1.2 schwarze 633: default:
1.31 schwarze 634: if (p->cur == NULL || node != p->cur->node) {
635: warn_msg(p, "element not open: </%s>", name);
1.5 schwarze 636: break;
637: }
638:
639: /*
640: * Refrain from actually closing the document element.
641: * If no more content follows, no harm is done, but if
642: * some content still follows, simply processing it is
643: * obviously better than discarding it or crashing.
644: */
645:
1.31 schwarze 646: if (p->cur->parent != NULL || node == NODE_DOCTYPE) {
647: p->cur = p->cur->parent;
648: if (p->cur != NULL)
649: p->ncur = p->cur->node;
1.23 schwarze 650: } else
1.31 schwarze 651: p->tree->flags |= TREE_CLOSED;
652: p->flags &= ~PFLAG_SPC;
1.4 schwarze 653: break;
1.2 schwarze 654: }
1.31 schwarze 655: assert(p->del == 0);
1.1 schwarze 656: }
657:
658: struct parse *
659: parse_alloc(int warn)
660: {
661: struct parse *p;
662:
663: if ((p = calloc(1, sizeof(*p))) == NULL)
664: return NULL;
665:
666: if ((p->tree = calloc(1, sizeof(*p->tree))) == NULL) {
667: free(p);
668: return NULL;
669: }
1.23 schwarze 670: if (warn)
671: p->flags |= PFLAG_WARN;
672: else
673: p->flags &= ~PFLAG_WARN;
1.1 schwarze 674: return p;
675: }
676:
677: void
678: parse_free(struct parse *p)
679: {
680: if (p == NULL)
681: return;
682: if (p->tree != NULL) {
683: pnode_unlink(p->tree->root);
684: free(p->tree);
685: }
686: free(p);
687: }
688:
1.14 schwarze 689: static void
690: increment(struct parse *p, char *b, size_t *pend, int refill)
691: {
692: if (refill) {
693: if (b[*pend] == '\n') {
694: p->nline++;
695: p->ncol = 1;
696: } else
697: p->ncol++;
698: }
699: ++*pend;
700: }
701:
1.5 schwarze 702: /*
703: * Advance the pend pointer to the next character in the charset.
704: * If the charset starts with a space, it stands for any whitespace.
705: * Update the new input file position, used for messages.
706: * Do not overrun the buffer b of length rlen.
707: * When reaching the end, NUL-terminate the buffer and return 1;
708: * otherwise, return 0.
709: */
710: static int
711: advance(struct parse *p, char *b, size_t rlen, size_t *pend,
1.14 schwarze 712: const char *charset, int refill)
1.5 schwarze 713: {
714: int space;
715:
716: if (*charset == ' ') {
717: space = 1;
718: charset++;
719: } else
720: space = 0;
721:
1.14 schwarze 722: if (refill) {
723: p->nline = p->line;
724: p->ncol = p->col;
725: }
1.5 schwarze 726: while (*pend < rlen) {
727: if (space && isspace((unsigned char)b[*pend]))
728: break;
729: if (strchr(charset, b[*pend]) != NULL)
730: break;
1.14 schwarze 731: increment(p, b, pend, refill);
1.5 schwarze 732: }
733: if (*pend == rlen) {
734: b[rlen] = '\0';
1.14 schwarze 735: return refill;
1.5 schwarze 736: } else
737: return 0;
738: }
739:
1.14 schwarze 740: size_t
741: parse_string(struct parse *p, char *b, size_t rlen,
742: enum pstate *pstate, int refill)
743: {
744: char *cp;
745: size_t poff; /* Parse offset in b[]. */
746: size_t pend; /* Offset of the end of the current word. */
747: int elem_end;
748:
749: pend = 0;
750: for (;;) {
751:
752: /* Proceed to the next token, skipping whitespace. */
753:
754: if (refill) {
755: p->line = p->nline;
756: p->col = p->ncol;
757: }
758: if ((poff = pend) == rlen)
759: break;
760: if (isspace((unsigned char)b[pend])) {
1.23 schwarze 761: p->flags |= PFLAG_SPC;
1.14 schwarze 762: increment(p, b, &pend, refill);
763: continue;
764: }
765:
766: /*
767: * The following four cases (ARG, TAG, and starting an
768: * entity or a tag) all parse a word or quoted string.
769: * If that extends beyond the read buffer and the last
770: * read(2) still got data, they all break out of the
771: * token loop to request more data from the read loop.
772: *
773: * Also, three of them detect self-closing tags, those
774: * ending with "/>", setting the flag elem_end and
775: * calling xml_elem_end() at the very end, after
776: * handling the attribute value, attribute name, or
777: * tag name, respectively.
778: */
779:
780: /* Parse an attribute value. */
781:
782: if (*pstate >= PARSE_ARG) {
783: if (*pstate == PARSE_ARG &&
784: (b[pend] == '\'' || b[pend] == '"')) {
785: *pstate = b[pend] == '"' ?
786: PARSE_DQ : PARSE_SQ;
787: increment(p, b, &pend, refill);
788: continue;
789: }
790: if (advance(p, b, rlen, &pend,
791: *pstate == PARSE_DQ ? "\"" :
792: *pstate == PARSE_SQ ? "'" : " >", refill))
793: break;
794: *pstate = PARSE_TAG;
795: elem_end = 0;
796: if (b[pend] == '>') {
797: *pstate = PARSE_ELEM;
798: if (pend > 0 && b[pend - 1] == '/') {
799: b[pend - 1] = '\0';
800: elem_end = 1;
801: }
1.23 schwarze 802: if (p->flags & PFLAG_EEND)
803: elem_end = 1;
1.14 schwarze 804: }
805: b[pend] = '\0';
806: if (pend < rlen)
807: increment(p, b, &pend, refill);
808: xml_attrval(p, b + poff);
809: if (elem_end)
810: xml_elem_end(p, NULL);
811:
812: /* Look for an attribute name. */
813:
814: } else if (*pstate == PARSE_TAG) {
1.23 schwarze 815: switch (p->ncur) {
816: case NODE_DOCTYPE:
817: if (b[pend] == '[') {
818: *pstate = PARSE_ELEM;
819: increment(p, b, &pend, refill);
820: continue;
821: }
822: /* FALLTHROUGH */
823: case NODE_ENTITY:
824: if (b[pend] == '"' || b[pend] == '\'') {
825: *pstate = PARSE_ARG;
826: continue;
827: }
828: break;
829: default:
830: break;
831: }
1.14 schwarze 832: if (advance(p, b, rlen, &pend, " =>", refill))
833: break;
834: elem_end = 0;
835: switch (b[pend]) {
836: case '>':
837: *pstate = PARSE_ELEM;
838: if (pend > 0 && b[pend - 1] == '/') {
839: b[pend - 1] = '\0';
840: elem_end = 1;
841: }
1.23 schwarze 842: if (p->flags & PFLAG_EEND)
843: elem_end = 1;
1.14 schwarze 844: break;
845: case '=':
846: *pstate = PARSE_ARG;
847: break;
848: default:
849: break;
850: }
851: b[pend] = '\0';
852: if (pend < rlen)
853: increment(p, b, &pend, refill);
854: xml_attrkey(p, b + poff);
855: if (elem_end)
856: xml_elem_end(p, NULL);
857:
858: /* Begin an opening or closing tag. */
859:
860: } else if (b[poff] == '<') {
861: if (advance(p, b, rlen, &pend, " >", refill))
862: break;
863: if (pend > poff + 3 &&
864: strncmp(b + poff, "<!--", 4) == 0) {
865:
866: /* Skip a comment. */
867:
868: cp = strstr(b + pend - 2, "-->");
869: if (cp == NULL) {
870: if (refill)
871: break;
872: cp = b + rlen;
873: } else
874: cp += 3;
875: while (b + pend < cp)
876: increment(p, b, &pend, refill);
877: continue;
878: }
879: elem_end = 0;
880: if (b[pend] != '>')
881: *pstate = PARSE_TAG;
882: else if (pend > 0 && b[pend - 1] == '/') {
883: b[pend - 1] = '\0';
884: elem_end = 1;
885: }
886: b[pend] = '\0';
887: if (pend < rlen)
888: increment(p, b, &pend, refill);
889: if (b[++poff] == '/') {
890: elem_end = 1;
891: poff++;
1.23 schwarze 892: } else {
1.14 schwarze 893: xml_elem_start(p, b + poff);
1.23 schwarze 894: if (*pstate == PARSE_ELEM &&
895: p->flags & PFLAG_EEND)
896: elem_end = 1;
897: }
1.14 schwarze 898: if (elem_end)
899: xml_elem_end(p, b + poff);
900:
1.23 schwarze 901: /* Close a doctype. */
902:
903: } else if (p->ncur == NODE_DOCTYPE && b[poff] == ']') {
904: *pstate = PARSE_TAG;
905: increment(p, b, &pend, refill);
906:
1.14 schwarze 907: /* Process an entity. */
908:
909: } else if (b[poff] == '&') {
910: if (advance(p, b, rlen, &pend, ";", refill))
911: break;
912: b[pend] = '\0';
913: if (pend < rlen)
914: increment(p, b, &pend, refill);
915: xml_entity(p, b + poff + 1);
916:
917: /* Process text up to the next tag, entity, or EOL. */
918:
919: } else {
1.28 schwarze 920: advance(p, b, rlen, &pend,
1.33 schwarze 921: p->ncur == NODE_DOCTYPE ? "<&]\n" : "<&\n",
1.28 schwarze 922: refill);
1.35 schwarze 923: xml_text(p, b + poff, pend - poff);
1.33 schwarze 924: if (b[pend] == '\n')
1.37 schwarze 925: pnode_closetext(p, 0);
1.14 schwarze 926: }
927: }
928: return poff;
929: }
930:
1.24 schwarze 931:
932: /*
933: * The read loop.
934: * If the previous token was incomplete and asked for more input,
935: * we have to enter the read loop once more even on EOF.
936: * Once rsz is 0, incomplete tokens will no longer ask for more input
937: * but instead use whatever there is, and then exit the read loop.
938: * The minus one on the size limit for read(2) is needed such that
939: * advance() can set b[rlen] to NUL when needed.
940: */
941: static void
942: parse_fd(struct parse *p, int fd)
1.1 schwarze 943: {
944: char b[4096];
1.5 schwarze 945: ssize_t rsz; /* Return value from read(2). */
1.14 schwarze 946: size_t rlen; /* Number of bytes in b[]. */
1.5 schwarze 947: size_t poff; /* Parse offset in b[]. */
1.14 schwarze 948: enum pstate pstate;
1.1 schwarze 949:
1.24 schwarze 950: rlen = 0;
1.14 schwarze 951: pstate = PARSE_ELEM;
952: while ((rsz = read(fd, b + rlen, sizeof(b) - rlen - 1)) >= 0 &&
953: (rlen += rsz) > 0) {
954: poff = parse_string(p, b, rlen, &pstate, rsz > 0);
1.5 schwarze 955: /* Buffer exhausted; shift left and re-fill. */
956: assert(poff > 0);
957: rlen -= poff;
1.14 schwarze 958: memmove(b, b + poff, rlen);
1.5 schwarze 959: }
1.24 schwarze 960: if (rsz < 0)
961: error_msg(p, "read: %s", strerror(errno));
962: }
963:
964: /*
965: * Open and parse a file.
966: */
967: struct ptree *
968: parse_file(struct parse *p, int fd, const char *fname)
969: {
970: const char *save_fname;
971: int save_line, save_col;
972:
973: /* Save and initialize reporting data. */
974:
975: save_fname = p->fname;
976: save_line = p->nline;
977: save_col = p->ncol;
978: p->fname = fname;
979: p->line = 0;
980: p->col = 0;
981:
982: /* Open the file, unless it is already open. */
983:
984: if (fd == -1 && (fd = open(fname, O_RDONLY, 0)) == -1) {
985: error_msg(p, "open: %s", strerror(errno));
986: p->fname = save_fname;
987: return p->tree;
1.5 schwarze 988: }
1.24 schwarze 989:
990: /*
991: * After opening the starting file, change to the directory it
992: * is located in, in case it wants to include any further files,
993: * which are typically given with relative paths in DocBook.
994: * Do this on a best-effort basis; don't complain about failure.
995: */
996:
997: if (save_fname == NULL && (fname = dirname(fname)) != NULL &&
998: strcmp(fname, ".") != 0)
999: (void)chdir(fname);
1000:
1001: /* Run the read loop. */
1002:
1003: p->nline = 1;
1004: p->ncol = 1;
1005: parse_fd(p, fd);
1006:
1007: /* On the top level, finalize the parse tree. */
1008:
1009: if (save_fname == NULL) {
1.37 schwarze 1010: pnode_closetext(p, 0);
1.24 schwarze 1011: if (p->tree->root == NULL)
1012: error_msg(p, "empty document");
1013: else if ((p->tree->flags & TREE_CLOSED) == 0)
1014: warn_msg(p, "document not closed");
1015: pnode_unlink(p->doctype);
1016: }
1017:
1018: /* Clean up. */
1019:
1020: if (fd != STDIN_FILENO)
1021: close(fd);
1022: p->fname = save_fname;
1023: p->nline = save_line;
1024: p->ncol = save_col;
1.1 schwarze 1025: return p->tree;
1026: }
CVSweb