Annotation of docbook2mdoc/parse.c, Revision 1.40
1.40 ! schwarze 1: /* $Id: parse.c,v 1.39 2019/04/12 16:40:53 schwarze Exp $ */
1.1 schwarze 2: /*
3: * Copyright (c) 2014 Kristaps Dzonsons <kristaps@bsd.lv>
4: * Copyright (c) 2019 Ingo Schwarze <schwarze@openbsd.org>
5: *
6: * Permission to use, copy, modify, and distribute this software for any
7: * purpose with or without fee is hereby granted, provided that the above
8: * copyright notice and this permission notice appear in all copies.
9: *
10: * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
11: * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12: * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
13: * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14: * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15: * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16: * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17: */
18: #include <assert.h>
19: #include <ctype.h>
1.24 schwarze 20: #include <errno.h>
21: #include <fcntl.h>
22: #include <libgen.h>
1.6 schwarze 23: #include <stdarg.h>
1.1 schwarze 24: #include <stdio.h>
1.5 schwarze 25: #include <stdlib.h>
1.1 schwarze 26: #include <string.h>
27: #include <unistd.h>
28:
29: #include "node.h"
30: #include "parse.h"
31:
32: /*
33: * The implementation of the DocBook parser.
34: */
35:
/*
 * States of the tokenizer in parse_string().
 * The order is significant: all states from PARSE_ARG onward mean
 * that an attribute value is being parsed, and parse_string() tests
 * for that with ">= PARSE_ARG".
 */
enum	pstate {
	PARSE_ELEM,	/* Outside of tags: text, entity, or new tag. */
	PARSE_TAG,	/* Inside a tag, looking for an attribute name. */
	PARSE_ARG,	/* Attribute value expected, no quote seen yet. */
	PARSE_SQ,	/* Inside a single-quoted attribute value. */
	PARSE_DQ	/* Inside a double-quoted attribute value. */
};
43:
1.1 schwarze 44: /*
45: * Global parse state.
46: * Keep this as simple and small as possible.
47: */
48: struct parse {
49: const char *fname; /* Name of the input file. */
50: struct ptree *tree; /* Complete parse result. */
1.23 schwarze 51: struct pnode *doctype;
1.1 schwarze 52: struct pnode *cur; /* Current node in the tree. */
1.5 schwarze 53: enum nodeid ncur; /* Type of the current node. */
54: int line; /* Line number in the input file. */
55: int col; /* Column number in the input file. */
56: int nline; /* Line number of next token. */
57: int ncol; /* Column number of next token. */
1.4 schwarze 58: int del; /* Levels of nested nodes being deleted. */
1.23 schwarze 59: int flags;
60: #define PFLAG_WARN (1 << 0) /* Print warning messages. */
61: #define PFLAG_SPC (1 << 1) /* Whitespace before the next element. */
62: #define PFLAG_ATTR (1 << 2) /* The most recent attribute is valid. */
63: #define PFLAG_EEND (1 << 3) /* This element is self-closing. */
1.1 schwarze 64: };
65:
1.39 schwarze 66: struct alias {
1.1 schwarze 67: const char *name; /* DocBook element name. */
68: enum nodeid node; /* Node type to generate. */
69: };
70:
1.39 schwarze 71: static const struct alias aliases[] = {
1.3 schwarze 72: { "acronym", NODE_IGNORE },
1.4 schwarze 73: { "anchor", NODE_DELETE },
1.22 schwarze 74: { "article", NODE_SECTION },
75: { "book", NODE_SECTION },
1.1 schwarze 76: { "chapter", NODE_SECTION },
1.13 schwarze 77: { "code", NODE_LITERAL },
1.36 schwarze 78: { "computeroutput", NODE_LITERAL },
1.23 schwarze 79: { "!doctype", NODE_DOCTYPE },
1.7 schwarze 80: { "firstname", NODE_PERSONNAME },
1.21 schwarze 81: { "glossary", NODE_VARIABLELIST },
82: { "glossdef", NODE_IGNORE },
83: { "glossdiv", NODE_IGNORE },
84: { "glossentry", NODE_VARLISTENTRY },
85: { "glosslist", NODE_VARIABLELIST },
1.4 schwarze 86: { "indexterm", NODE_DELETE },
1.11 schwarze 87: { "informaltable", NODE_TABLE },
1.40 ! schwarze 88: { "othercredit", NODE_AUTHOR },
1.7 schwarze 89: { "othername", NODE_PERSONNAME },
1.1 schwarze 90: { "part", NODE_SECTION },
1.3 schwarze 91: { "phrase", NODE_IGNORE },
1.4 schwarze 92: { "primary", NODE_DELETE },
1.1 schwarze 93: { "refsect1", NODE_SECTION },
94: { "refsect2", NODE_SECTION },
95: { "refsect3", NODE_SECTION },
96: { "refsection", NODE_SECTION },
1.4 schwarze 97: { "secondary", NODE_DELETE },
1.1 schwarze 98: { "sect1", NODE_SECTION },
99: { "sect2", NODE_SECTION },
1.36 schwarze 100: { "sgmltag", NODE_MARKUP },
1.15 schwarze 101: { "simpara", NODE_PARA },
1.13 schwarze 102: { "structfield", NODE_PARAMETER },
103: { "structname", NODE_TYPE },
1.7 schwarze 104: { "surname", NODE_PERSONNAME },
1.12 schwarze 105: { "symbol", NODE_CONSTANT },
1.3 schwarze 106: { "trademark", NODE_IGNORE },
1.18 schwarze 107: { "ulink", NODE_LINK },
1.13 schwarze 108: { "userinput", NODE_LITERAL },
1.5 schwarze 109: { NULL, NODE_IGNORE }
1.1 schwarze 110: };
111:
/*
 * Mapping of one XML character entity to roff(7) input.
 */
struct	entity {
	const char	*name;	/* Entity name, without "&" and ";". */
	const char	*roff;	/* Replacement text for the output. */
};

/*
 * XML character entity references found in the wild.
 * Those that don't have an exact mandoc_char(7) representation
 * are approximated, and the desired codepoint is given as a comment.
 * Encoding them as \\[u...] would leave -Tascii out in the cold.
 * The table is searched linearly by xml_entity()
 * and terminated by an entry with a NULL name.
 */
static	const struct entity entities[] = {
	{ "alpha",	"\\(*a" },
	{ "amp",	"&" },
	{ "apos",	"'" },
	{ "auml",	"\\(:a" },
	{ "beta",	"\\(*b" },
	{ "circ",	"^" },      /* U+02C6 */
	{ "copy",	"\\(co" },
	{ "dagger",	"\\(dg" },
	{ "Delta",	"\\(*D" },
	{ "eacute",	"\\('e" },
	{ "emsp",	"\\ " },    /* U+2003 */
	{ "gt",		">" },
	{ "hairsp",	"\\^" },
	{ "kappa",	"\\(*k" },
	{ "larr",	"\\(<-" },
	{ "ldquo",	"\\(lq" },
	{ "le",		"\\(<=" },
	{ "lowbar",	"_" },
	{ "lsqb",	"[" },
	{ "lt",		"<" },
	{ "mdash",	"\\(em" },
	{ "minus",	"\\-" },
	{ "ndash",	"\\(en" },
	{ "nbsp",	"\\ " },
	{ "num",	"#" },
	{ "oslash",	"\\(/o" },
	{ "ouml",	"\\(:o" },
	{ "percnt",	"%" },
	{ "quot",	"\\(dq" },
	{ "rarr",	"\\(->" },
	{ "rArr",	"\\(rA" },
	{ "rdquo",	"\\(rq" },
	{ "reg",	"\\(rg" },
	{ "rho",	"\\(*r" },
	{ "rsqb",	"]" },
	{ "sigma",	"\\(*s" },
	{ "shy",	"\\&" },    /* U+00AD */
	{ "tau",	"\\(*t" },
	{ "tilde",	"\\[u02DC]" },
	{ "times",	"\\[tmu]" },
	{ "uuml",	"\\(:u" },
	{ NULL,		NULL }
};
167:
/*
 * Forward declarations.
 * parse_string() is called by xml_entity() before it is defined.
 */
static	size_t	 parse_string(struct parse *p, char *b, size_t rlen,
			enum pstate *pstate, int refill);
static	void	 parse_fd(struct parse *p, int fd);
1.23 schwarze 171:
172:
1.6 schwarze 173: static void
1.29 schwarze 174: fatal(struct parse *p)
175: {
176: fprintf(stderr, "%s:%d:%d: FATAL: ", p->fname, p->line, p->col);
177: perror(NULL);
178: exit(6);
179: }
180:
181: static void
1.6 schwarze 182: error_msg(struct parse *p, const char *fmt, ...)
183: {
184: va_list ap;
185:
1.29 schwarze 186: fprintf(stderr, "%s:%d:%d: ERROR: ", p->fname, p->line, p->col);
1.6 schwarze 187: va_start(ap, fmt);
188: vfprintf(stderr, fmt, ap);
189: va_end(ap);
190: fputc('\n', stderr);
1.29 schwarze 191: p->tree->flags |= TREE_ERROR;
1.6 schwarze 192: }
193:
194: static void
195: warn_msg(struct parse *p, const char *fmt, ...)
196: {
197: va_list ap;
198:
1.23 schwarze 199: if ((p->flags & PFLAG_WARN) == 0)
1.6 schwarze 200: return;
201:
1.29 schwarze 202: fprintf(stderr, "%s:%d:%d: WARNING: ", p->fname, p->line, p->col);
1.6 schwarze 203: va_start(ap, fmt);
204: vfprintf(stderr, fmt, ap);
205: va_end(ap);
206: fputc('\n', stderr);
1.29 schwarze 207: p->tree->flags |= TREE_WARN;
1.6 schwarze 208: }
209:
1.1 schwarze 210: /*
211: * Process a string of characters.
212: * If a text node is already open, append to it.
213: * Otherwise, create a new one as a child of the current node.
214: */
215: static void
1.35 schwarze 216: xml_text(struct parse *p, const char *word, int sz)
1.1 schwarze 217: {
1.35 schwarze 218: struct pnode *n, *np;
1.32 schwarze 219: size_t oldsz, newsz;
1.35 schwarze 220: int i;
1.1 schwarze 221:
1.32 schwarze 222: assert(sz > 0);
1.30 schwarze 223: if (p->del > 0)
1.1 schwarze 224: return;
225:
1.32 schwarze 226: if ((n = p->cur) == NULL) {
1.35 schwarze 227: error_msg(p, "discarding text before document: %.*s",
228: sz, word);
1.5 schwarze 229: return;
230: }
231:
1.35 schwarze 232: /* Append to the current text node, if one is open. */
233:
234: if (n->node == NODE_TEXT) {
235: oldsz = strlen(n->b);
236: newsz = oldsz + sz;
237: if (oldsz && (p->flags & PFLAG_SPC))
238: newsz++;
239: if ((n->b = realloc(n->b, newsz + 1)) == NULL)
1.30 schwarze 240: fatal(p);
1.35 schwarze 241: if (oldsz && (p->flags & PFLAG_SPC))
242: n->b[oldsz++] = ' ';
243: memcpy(n->b + oldsz, word, sz);
244: n->b[newsz] = '\0';
245: p->flags &= ~PFLAG_SPC;
246: return;
1.1 schwarze 247: }
248:
1.35 schwarze 249: if (p->tree->flags & TREE_CLOSED && n == p->tree->root)
1.30 schwarze 250: warn_msg(p, "text after end of document: %.*s", sz, word);
1.5 schwarze 251:
1.35 schwarze 252: /* Create a new text node. */
1.1 schwarze 253:
1.35 schwarze 254: if ((n = pnode_alloc(p->cur)) == NULL)
1.30 schwarze 255: fatal(p);
1.35 schwarze 256: n->node = NODE_TEXT;
257: n->spc = (p->flags & PFLAG_SPC) != 0;
1.30 schwarze 258: p->flags &= ~PFLAG_SPC;
1.35 schwarze 259:
260: /*
1.39 schwarze 261: * If this node follows an in-line macro without intervening
1.35 schwarze 262: * whitespace, keep the text in it as short as possible,
263: * and do not keep it open.
264: */
265:
1.39 schwarze 266: np = n->spc ? NULL : TAILQ_PREV(n, pnodeq, child);
267: while (np != NULL) {
268: switch (pnode_class(np->node)) {
269: case CLASS_VOID:
270: case CLASS_TEXT:
271: case CLASS_BLOCK:
272: np = NULL;
273: break;
274: case CLASS_TRANS:
275: np = TAILQ_LAST(&np->childq, pnodeq);
276: continue;
277: case CLASS_LINE:
278: case CLASS_ENCL:
279: break;
280: }
281: break;
282: }
283: if (np != NULL) {
1.35 schwarze 284: i = 0;
285: while (i < sz && !isspace((unsigned char)word[i]))
286: i++;
287: if ((n->b = strndup(word, i)) == NULL)
288: fatal(p);
289: if (i == sz)
290: return;
291: while (i < sz && isspace((unsigned char)word[i]))
292: i++;
293: if (i == sz) {
294: p->flags |= PFLAG_SPC;
295: return;
296: }
297:
298: /* Put any remaining text into a second node. */
299:
300: if ((n = pnode_alloc(p->cur)) == NULL)
301: fatal(p);
302: n->node = NODE_TEXT;
303: n->spc = 1;
304: word += i;
305: sz -= i;
306: }
307: if ((n->b = strndup(word, sz)) == NULL)
308: fatal(p);
309:
310: /* The new node remains open for later pnode_closetext(). */
311:
312: p->cur = n;
1.1 schwarze 313: }
314:
1.16 schwarze 315: /*
316: * Close out the text node and strip trailing whitespace, if one is open.
317: */
1.1 schwarze 318: static void
1.37 schwarze 319: pnode_closetext(struct parse *p, int check_last_word)
1.1 schwarze 320: {
1.16 schwarze 321: struct pnode *n;
1.37 schwarze 322: char *cp, *last_word;
1.16 schwarze 323:
324: if ((n = p->cur) == NULL || n->node != NODE_TEXT)
325: return;
326: p->cur = n->parent;
1.32 schwarze 327: for (cp = strchr(n->b, '\0');
328: cp > n->b && isspace((unsigned char)cp[-1]);
329: *--cp = '\0')
1.23 schwarze 330: p->flags |= PFLAG_SPC;
1.37 schwarze 331:
332: if (p->flags & PFLAG_SPC || !check_last_word)
333: return;
334:
335: /*
336: * Find the beginning of the last word
337: * and delete whitespace before it.
338: */
339:
340: while (cp > n->b && !isspace((unsigned char)cp[-1]))
341: cp--;
342: if (cp == n->b)
343: return;
344:
345: last_word = cp;
346: while (cp > n->b && isspace((unsigned char)cp[-1]))
347: *--cp = '\0';
348:
349: /* Move the last word into its own node, for use with .Pf. */
350:
351: if ((n = pnode_alloc(p->cur)) == NULL)
352: fatal(p);
353: n->node = NODE_TEXT;
354: n->spc = 1;
355: if ((n->b = strdup(last_word)) == NULL)
356: fatal(p);
1.1 schwarze 357: }
358:
1.9 schwarze 359: static void
360: xml_entity(struct parse *p, const char *name)
361: {
362: const struct entity *entity;
1.30 schwarze 363: struct pnode *n;
1.23 schwarze 364: const char *ccp;
365: char *cp;
366: enum pstate pstate;
1.9 schwarze 367:
368: if (p->del > 0)
369: return;
370:
371: if (p->cur == NULL) {
372: error_msg(p, "discarding entity before document: &%s;", name);
373: return;
374: }
375:
1.37 schwarze 376: pnode_closetext(p, 0);
1.9 schwarze 377:
378: if (p->tree->flags & TREE_CLOSED && p->cur == p->tree->root)
379: warn_msg(p, "entity after end of document: &%s;", name);
380:
381: for (entity = entities; entity->name != NULL; entity++)
382: if (strcmp(name, entity->name) == 0)
383: break;
384:
385: if (entity->roff == NULL) {
1.23 schwarze 386: if (p->doctype != NULL) {
1.30 schwarze 387: TAILQ_FOREACH(n, &p->doctype->childq, child) {
388: if ((ccp = pnode_getattr_raw(n,
1.23 schwarze 389: ATTRKEY_NAME, NULL)) == NULL ||
1.25 schwarze 390: strcmp(ccp, name) != 0)
391: continue;
1.30 schwarze 392: if ((ccp = pnode_getattr_raw(n,
1.25 schwarze 393: ATTRKEY_SYSTEM, NULL)) != NULL) {
394: parse_file(p, -1, ccp);
395: p->flags &= ~PFLAG_SPC;
396: return;
397: }
1.30 schwarze 398: if ((ccp = pnode_getattr_raw(n,
1.23 schwarze 399: ATTRKEY_DEFINITION, NULL)) == NULL)
400: continue;
1.29 schwarze 401: if ((cp = strdup(ccp)) == NULL)
402: fatal(p);
1.23 schwarze 403: pstate = PARSE_ELEM;
404: parse_string(p, cp, strlen(cp), &pstate, 0);
405: p->flags &= ~PFLAG_SPC;
406: free(cp);
407: return;
408: }
409: }
1.9 schwarze 410: error_msg(p, "unknown entity &%s;", name);
411: return;
412: }
413:
414: /* Create, append, and close out an entity node. */
1.34 schwarze 415: if ((n = pnode_alloc(p->cur)) == NULL ||
1.32 schwarze 416: (n->b = strdup(entity->roff)) == NULL)
1.29 schwarze 417: fatal(p);
1.30 schwarze 418: n->node = NODE_ESCAPE;
419: n->spc = (p->flags & PFLAG_SPC) != 0;
1.23 schwarze 420: p->flags &= ~PFLAG_SPC;
1.9 schwarze 421: }
422:
1.1 schwarze 423: /*
1.39 schwarze 424: * Parse an element name.
425: */
426: static enum nodeid
427: xml_name2node(struct parse *p, const char *name)
428: {
429: const struct alias *alias;
430: enum nodeid node;
431:
432: if ((node = pnode_parse(name)) < NODE_UNKNOWN)
433: return node;
434:
435: for (alias = aliases; alias->name != NULL; alias++)
436: if (strcmp(alias->name, name) == 0)
437: return alias->node;
438:
439: return NODE_UNKNOWN;
440: }
441:
442: /*
1.1 schwarze 443: * Begin an element.
444: */
445: static void
1.30 schwarze 446: xml_elem_start(struct parse *p, const char *name)
1.1 schwarze 447: {
1.30 schwarze 448: struct pnode *n;
1.1 schwarze 449:
1.4 schwarze 450: /*
451: * An ancestor is excluded from the tree;
452: * keep track of the number of levels excluded.
453: */
1.30 schwarze 454: if (p->del > 0) {
1.23 schwarze 455: if (*name != '!' && *name != '?')
1.30 schwarze 456: p->del++;
1.4 schwarze 457: return;
458: }
459:
1.39 schwarze 460: switch (p->ncur = xml_name2node(p, name)) {
1.4 schwarze 461: case NODE_DELETE_WARN:
1.30 schwarze 462: warn_msg(p, "skipping element <%s>", name);
1.2 schwarze 463: /* FALLTHROUGH */
1.4 schwarze 464: case NODE_DELETE:
1.30 schwarze 465: p->del = 1;
1.4 schwarze 466: /* FALLTHROUGH */
1.2 schwarze 467: case NODE_IGNORE:
468: return;
1.39 schwarze 469: case NODE_UNKNOWN:
470: if (*name != '!' && *name != '?')
471: error_msg(p, "unknown element <%s>", name);
472: return;
1.2 schwarze 473: default:
474: break;
475: }
1.1 schwarze 476:
1.30 schwarze 477: if (p->tree->flags & TREE_CLOSED && p->cur->parent == NULL)
478: warn_msg(p, "element after end of document: <%s>", name);
1.5 schwarze 479:
1.39 schwarze 480: switch (pnode_class(p->ncur)) {
481: case CLASS_LINE:
482: case CLASS_ENCL:
483: pnode_closetext(p, 1);
484: break;
485: default:
486: pnode_closetext(p, 0);
487: break;
488: }
489:
1.34 schwarze 490: if ((n = pnode_alloc(p->cur)) == NULL)
1.30 schwarze 491: fatal(p);
1.17 schwarze 492:
493: /*
1.39 schwarze 494: * Some elements are self-closing.
1.17 schwarze 495: * Nodes that begin a new macro or request line or start by
496: * printing text always want whitespace before themselves.
497: */
498:
1.39 schwarze 499: switch (n->node = p->ncur) {
1.23 schwarze 500: case NODE_DOCTYPE:
501: case NODE_ENTITY:
502: case NODE_SBR:
1.30 schwarze 503: p->flags |= PFLAG_EEND;
1.17 schwarze 504: break;
505: default:
1.39 schwarze 506: break;
507: }
508: switch (pnode_class(p->ncur)) {
509: case CLASS_LINE:
510: case CLASS_ENCL:
1.30 schwarze 511: n->spc = (p->flags & PFLAG_SPC) != 0;
1.17 schwarze 512: break;
1.39 schwarze 513: default:
514: n->spc = 1;
515: break;
1.17 schwarze 516: }
1.30 schwarze 517: p->cur = n;
518: if (n->node == NODE_DOCTYPE) {
519: if (p->doctype == NULL)
520: p->doctype = n;
1.23 schwarze 521: else
1.30 schwarze 522: error_msg(p, "duplicate doctype");
523: } else if (n->parent == NULL && p->tree->root == NULL)
524: p->tree->root = n;
1.5 schwarze 525: }
526:
527: static void
1.30 schwarze 528: xml_attrkey(struct parse *p, const char *name)
1.5 schwarze 529: {
1.30 schwarze 530: struct pattr *a;
1.23 schwarze 531: const char *value;
1.5 schwarze 532: enum attrkey key;
1.1 schwarze 533:
1.30 schwarze 534: if (p->del > 0 || p->ncur == NODE_IGNORE || *name == '\0')
1.5 schwarze 535: return;
1.23 schwarze 536:
1.30 schwarze 537: if ((p->ncur == NODE_DOCTYPE || p->ncur == NODE_ENTITY) &&
538: TAILQ_FIRST(&p->cur->attrq) == NULL) {
1.23 schwarze 539: value = name;
540: name = "NAME";
541: } else
542: value = NULL;
543:
1.5 schwarze 544: if ((key = attrkey_parse(name)) == ATTRKEY__MAX) {
1.30 schwarze 545: p->flags &= ~PFLAG_ATTR;
1.5 schwarze 546: return;
547: }
1.30 schwarze 548: if ((a = calloc(1, sizeof(*a))) == NULL)
549: fatal(p);
1.29 schwarze 550:
1.30 schwarze 551: a->key = key;
552: a->val = ATTRVAL__MAX;
1.23 schwarze 553: if (value == NULL) {
1.30 schwarze 554: a->rawval = NULL;
555: p->flags |= PFLAG_ATTR;
1.23 schwarze 556: } else {
1.30 schwarze 557: if ((a->rawval = strdup(value)) == NULL)
558: fatal(p);
559: p->flags &= ~PFLAG_ATTR;
560: }
561: TAILQ_INSERT_TAIL(&p->cur->attrq, a, child);
562: if (p->ncur == NODE_ENTITY && key == ATTRKEY_NAME)
563: xml_attrkey(p, "DEFINITION");
1.5 schwarze 564: }
565:
566: static void
1.30 schwarze 567: xml_attrval(struct parse *p, const char *name)
1.5 schwarze 568: {
1.30 schwarze 569: struct pattr *a;
1.5 schwarze 570:
1.30 schwarze 571: if (p->del > 0 || p->ncur == NODE_IGNORE ||
572: (p->flags & PFLAG_ATTR) == 0)
1.5 schwarze 573: return;
1.30 schwarze 574: if ((a = TAILQ_LAST(&p->cur->attrq, pattrq)) == NULL)
1.5 schwarze 575: return;
1.30 schwarze 576: if ((a->val = attrval_parse(name)) == ATTRVAL__MAX &&
577: (a->rawval = strdup(name)) == NULL)
578: fatal(p);
579: p->flags &= ~PFLAG_ATTR;
1.1 schwarze 580: }
581:
582: /*
583: * Roll up the parse tree.
584: * If we're at a text node, roll that one up first.
585: */
586: static void
1.31 schwarze 587: xml_elem_end(struct parse *p, const char *name)
1.1 schwarze 588: {
1.26 schwarze 589: struct pnode *n;
590: const char *cp;
1.5 schwarze 591: enum nodeid node;
1.1 schwarze 592:
1.4 schwarze 593: /*
594: * An ancestor is excluded from the tree;
595: * keep track of the number of levels excluded.
596: */
1.31 schwarze 597: if (p->del > 1) {
598: p->del--;
1.4 schwarze 599: return;
600: }
601:
1.31 schwarze 602: if (p->del == 0)
1.37 schwarze 603: pnode_closetext(p, 0);
1.2 schwarze 604:
1.39 schwarze 605: node = name == NULL ? p->ncur : xml_name2node(p, name);
1.2 schwarze 606:
1.5 schwarze 607: switch (node) {
1.4 schwarze 608: case NODE_DELETE_WARN:
609: case NODE_DELETE:
1.31 schwarze 610: if (p->del > 0)
611: p->del--;
1.4 schwarze 612: break;
1.2 schwarze 613: case NODE_IGNORE:
1.39 schwarze 614: case NODE_UNKNOWN:
1.26 schwarze 615: break;
616: case NODE_INCLUDE:
1.31 schwarze 617: n = p->cur;
618: p->cur = p->cur->parent;
1.26 schwarze 619: cp = pnode_getattr_raw(n, ATTRKEY_HREF, NULL);
620: if (cp == NULL)
1.31 schwarze 621: error_msg(p, "<xi:include> element "
1.26 schwarze 622: "without href attribute");
623: else
1.31 schwarze 624: parse_file(p, -1, cp);
1.26 schwarze 625: pnode_unlink(n);
1.31 schwarze 626: p->flags &= ~PFLAG_SPC;
1.2 schwarze 627: break;
1.23 schwarze 628: case NODE_DOCTYPE:
1.32 schwarze 629: case NODE_SBR:
1.31 schwarze 630: p->flags &= ~PFLAG_EEND;
1.23 schwarze 631: /* FALLTHROUGH */
1.2 schwarze 632: default:
1.31 schwarze 633: if (p->cur == NULL || node != p->cur->node) {
634: warn_msg(p, "element not open: </%s>", name);
1.5 schwarze 635: break;
636: }
637:
638: /*
639: * Refrain from actually closing the document element.
640: * If no more content follows, no harm is done, but if
641: * some content still follows, simply processing it is
642: * obviously better than discarding it or crashing.
643: */
644:
1.31 schwarze 645: if (p->cur->parent != NULL || node == NODE_DOCTYPE) {
646: p->cur = p->cur->parent;
647: if (p->cur != NULL)
648: p->ncur = p->cur->node;
1.23 schwarze 649: } else
1.31 schwarze 650: p->tree->flags |= TREE_CLOSED;
651: p->flags &= ~PFLAG_SPC;
1.4 schwarze 652: break;
1.2 schwarze 653: }
1.31 schwarze 654: assert(p->del == 0);
1.1 schwarze 655: }
656:
657: struct parse *
658: parse_alloc(int warn)
659: {
660: struct parse *p;
661:
662: if ((p = calloc(1, sizeof(*p))) == NULL)
663: return NULL;
664:
665: if ((p->tree = calloc(1, sizeof(*p->tree))) == NULL) {
666: free(p);
667: return NULL;
668: }
1.23 schwarze 669: if (warn)
670: p->flags |= PFLAG_WARN;
671: else
672: p->flags &= ~PFLAG_WARN;
1.1 schwarze 673: return p;
674: }
675:
676: void
677: parse_free(struct parse *p)
678: {
679: if (p == NULL)
680: return;
681: if (p->tree != NULL) {
682: pnode_unlink(p->tree->root);
683: free(p->tree);
684: }
685: free(p);
686: }
687:
1.14 schwarze 688: static void
689: increment(struct parse *p, char *b, size_t *pend, int refill)
690: {
691: if (refill) {
692: if (b[*pend] == '\n') {
693: p->nline++;
694: p->ncol = 1;
695: } else
696: p->ncol++;
697: }
698: ++*pend;
699: }
700:
1.5 schwarze 701: /*
702: * Advance the pend pointer to the next character in the charset.
703: * If the charset starts with a space, it stands for any whitespace.
704: * Update the new input file position, used for messages.
705: * Do not overrun the buffer b of length rlen.
706: * When reaching the end, NUL-terminate the buffer and return 1;
707: * otherwise, return 0.
708: */
709: static int
710: advance(struct parse *p, char *b, size_t rlen, size_t *pend,
1.14 schwarze 711: const char *charset, int refill)
1.5 schwarze 712: {
713: int space;
714:
715: if (*charset == ' ') {
716: space = 1;
717: charset++;
718: } else
719: space = 0;
720:
1.14 schwarze 721: if (refill) {
722: p->nline = p->line;
723: p->ncol = p->col;
724: }
1.5 schwarze 725: while (*pend < rlen) {
726: if (space && isspace((unsigned char)b[*pend]))
727: break;
728: if (strchr(charset, b[*pend]) != NULL)
729: break;
1.14 schwarze 730: increment(p, b, pend, refill);
1.5 schwarze 731: }
732: if (*pend == rlen) {
733: b[rlen] = '\0';
1.14 schwarze 734: return refill;
1.5 schwarze 735: } else
736: return 0;
737: }
738:
1.14 schwarze 739: size_t
740: parse_string(struct parse *p, char *b, size_t rlen,
741: enum pstate *pstate, int refill)
742: {
743: char *cp;
744: size_t poff; /* Parse offset in b[]. */
745: size_t pend; /* Offset of the end of the current word. */
746: int elem_end;
747:
748: pend = 0;
749: for (;;) {
750:
751: /* Proceed to the next token, skipping whitespace. */
752:
753: if (refill) {
754: p->line = p->nline;
755: p->col = p->ncol;
756: }
757: if ((poff = pend) == rlen)
758: break;
759: if (isspace((unsigned char)b[pend])) {
1.23 schwarze 760: p->flags |= PFLAG_SPC;
1.14 schwarze 761: increment(p, b, &pend, refill);
762: continue;
763: }
764:
765: /*
766: * The following four cases (ARG, TAG, and starting an
767: * entity or a tag) all parse a word or quoted string.
768: * If that extends beyond the read buffer and the last
769: * read(2) still got data, they all break out of the
770: * token loop to request more data from the read loop.
771: *
772: * Also, three of them detect self-closing tags, those
773: * ending with "/>", setting the flag elem_end and
774: * calling xml_elem_end() at the very end, after
775: * handling the attribute value, attribute name, or
776: * tag name, respectively.
777: */
778:
779: /* Parse an attribute value. */
780:
781: if (*pstate >= PARSE_ARG) {
782: if (*pstate == PARSE_ARG &&
783: (b[pend] == '\'' || b[pend] == '"')) {
784: *pstate = b[pend] == '"' ?
785: PARSE_DQ : PARSE_SQ;
786: increment(p, b, &pend, refill);
787: continue;
788: }
789: if (advance(p, b, rlen, &pend,
790: *pstate == PARSE_DQ ? "\"" :
791: *pstate == PARSE_SQ ? "'" : " >", refill))
792: break;
793: *pstate = PARSE_TAG;
794: elem_end = 0;
795: if (b[pend] == '>') {
796: *pstate = PARSE_ELEM;
797: if (pend > 0 && b[pend - 1] == '/') {
798: b[pend - 1] = '\0';
799: elem_end = 1;
800: }
1.23 schwarze 801: if (p->flags & PFLAG_EEND)
802: elem_end = 1;
1.14 schwarze 803: }
804: b[pend] = '\0';
805: if (pend < rlen)
806: increment(p, b, &pend, refill);
807: xml_attrval(p, b + poff);
808: if (elem_end)
809: xml_elem_end(p, NULL);
810:
811: /* Look for an attribute name. */
812:
813: } else if (*pstate == PARSE_TAG) {
1.23 schwarze 814: switch (p->ncur) {
815: case NODE_DOCTYPE:
816: if (b[pend] == '[') {
817: *pstate = PARSE_ELEM;
818: increment(p, b, &pend, refill);
819: continue;
820: }
821: /* FALLTHROUGH */
822: case NODE_ENTITY:
823: if (b[pend] == '"' || b[pend] == '\'') {
824: *pstate = PARSE_ARG;
825: continue;
826: }
827: break;
828: default:
829: break;
830: }
1.14 schwarze 831: if (advance(p, b, rlen, &pend, " =>", refill))
832: break;
833: elem_end = 0;
834: switch (b[pend]) {
835: case '>':
836: *pstate = PARSE_ELEM;
837: if (pend > 0 && b[pend - 1] == '/') {
838: b[pend - 1] = '\0';
839: elem_end = 1;
840: }
1.23 schwarze 841: if (p->flags & PFLAG_EEND)
842: elem_end = 1;
1.14 schwarze 843: break;
844: case '=':
845: *pstate = PARSE_ARG;
846: break;
847: default:
848: break;
849: }
850: b[pend] = '\0';
851: if (pend < rlen)
852: increment(p, b, &pend, refill);
853: xml_attrkey(p, b + poff);
854: if (elem_end)
855: xml_elem_end(p, NULL);
856:
857: /* Begin an opening or closing tag. */
858:
859: } else if (b[poff] == '<') {
860: if (advance(p, b, rlen, &pend, " >", refill))
861: break;
862: if (pend > poff + 3 &&
863: strncmp(b + poff, "<!--", 4) == 0) {
864:
865: /* Skip a comment. */
866:
867: cp = strstr(b + pend - 2, "-->");
868: if (cp == NULL) {
869: if (refill)
870: break;
871: cp = b + rlen;
872: } else
873: cp += 3;
874: while (b + pend < cp)
875: increment(p, b, &pend, refill);
876: continue;
877: }
878: elem_end = 0;
879: if (b[pend] != '>')
880: *pstate = PARSE_TAG;
881: else if (pend > 0 && b[pend - 1] == '/') {
882: b[pend - 1] = '\0';
883: elem_end = 1;
884: }
885: b[pend] = '\0';
886: if (pend < rlen)
887: increment(p, b, &pend, refill);
888: if (b[++poff] == '/') {
889: elem_end = 1;
890: poff++;
1.23 schwarze 891: } else {
1.14 schwarze 892: xml_elem_start(p, b + poff);
1.23 schwarze 893: if (*pstate == PARSE_ELEM &&
894: p->flags & PFLAG_EEND)
895: elem_end = 1;
896: }
1.14 schwarze 897: if (elem_end)
898: xml_elem_end(p, b + poff);
899:
1.23 schwarze 900: /* Close a doctype. */
901:
902: } else if (p->ncur == NODE_DOCTYPE && b[poff] == ']') {
903: *pstate = PARSE_TAG;
904: increment(p, b, &pend, refill);
905:
1.14 schwarze 906: /* Process an entity. */
907:
908: } else if (b[poff] == '&') {
909: if (advance(p, b, rlen, &pend, ";", refill))
910: break;
911: b[pend] = '\0';
912: if (pend < rlen)
913: increment(p, b, &pend, refill);
914: xml_entity(p, b + poff + 1);
915:
916: /* Process text up to the next tag, entity, or EOL. */
917:
918: } else {
1.28 schwarze 919: advance(p, b, rlen, &pend,
1.33 schwarze 920: p->ncur == NODE_DOCTYPE ? "<&]\n" : "<&\n",
1.28 schwarze 921: refill);
1.35 schwarze 922: xml_text(p, b + poff, pend - poff);
1.33 schwarze 923: if (b[pend] == '\n')
1.37 schwarze 924: pnode_closetext(p, 0);
1.14 schwarze 925: }
926: }
927: return poff;
928: }
929:
1.24 schwarze 930:
931: /*
932: * The read loop.
933: * If the previous token was incomplete and asked for more input,
934: * we have to enter the read loop once more even on EOF.
935: * Once rsz is 0, incomplete tokens will no longer ask for more input
936: * but instead use whatever there is, and then exit the read loop.
937: * The minus one on the size limit for read(2) is needed such that
938: * advance() can set b[rlen] to NUL when needed.
939: */
940: static void
941: parse_fd(struct parse *p, int fd)
1.1 schwarze 942: {
943: char b[4096];
1.5 schwarze 944: ssize_t rsz; /* Return value from read(2). */
1.14 schwarze 945: size_t rlen; /* Number of bytes in b[]. */
1.5 schwarze 946: size_t poff; /* Parse offset in b[]. */
1.14 schwarze 947: enum pstate pstate;
1.1 schwarze 948:
1.24 schwarze 949: rlen = 0;
1.14 schwarze 950: pstate = PARSE_ELEM;
951: while ((rsz = read(fd, b + rlen, sizeof(b) - rlen - 1)) >= 0 &&
952: (rlen += rsz) > 0) {
953: poff = parse_string(p, b, rlen, &pstate, rsz > 0);
1.5 schwarze 954: /* Buffer exhausted; shift left and re-fill. */
955: assert(poff > 0);
956: rlen -= poff;
1.14 schwarze 957: memmove(b, b + poff, rlen);
1.5 schwarze 958: }
1.24 schwarze 959: if (rsz < 0)
960: error_msg(p, "read: %s", strerror(errno));
961: }
962:
963: /*
964: * Open and parse a file.
965: */
966: struct ptree *
967: parse_file(struct parse *p, int fd, const char *fname)
968: {
969: const char *save_fname;
970: int save_line, save_col;
971:
972: /* Save and initialize reporting data. */
973:
974: save_fname = p->fname;
975: save_line = p->nline;
976: save_col = p->ncol;
977: p->fname = fname;
978: p->line = 0;
979: p->col = 0;
980:
981: /* Open the file, unless it is already open. */
982:
983: if (fd == -1 && (fd = open(fname, O_RDONLY, 0)) == -1) {
984: error_msg(p, "open: %s", strerror(errno));
985: p->fname = save_fname;
986: return p->tree;
1.5 schwarze 987: }
1.24 schwarze 988:
989: /*
990: * After opening the starting file, change to the directory it
991: * is located in, in case it wants to include any further files,
992: * which are typically given with relative paths in DocBook.
993: * Do this on a best-effort basis; don't complain about failure.
994: */
995:
996: if (save_fname == NULL && (fname = dirname(fname)) != NULL &&
997: strcmp(fname, ".") != 0)
998: (void)chdir(fname);
999:
1000: /* Run the read loop. */
1001:
1002: p->nline = 1;
1003: p->ncol = 1;
1004: parse_fd(p, fd);
1005:
1006: /* On the top level, finalize the parse tree. */
1007:
1008: if (save_fname == NULL) {
1.37 schwarze 1009: pnode_closetext(p, 0);
1.24 schwarze 1010: if (p->tree->root == NULL)
1011: error_msg(p, "empty document");
1012: else if ((p->tree->flags & TREE_CLOSED) == 0)
1013: warn_msg(p, "document not closed");
1014: pnode_unlink(p->doctype);
1015: }
1016:
1017: /* Clean up. */
1018:
1019: if (fd != STDIN_FILENO)
1020: close(fd);
1021: p->fname = save_fname;
1022: p->nline = save_line;
1023: p->ncol = save_col;
1.1 schwarze 1024: return p->tree;
1025: }
CVSweb