Annotation of docbook2mdoc/parse.c, Revision 1.50
1.50 ! schwarze 1: /* $Id: parse.c,v 1.49 2019/04/23 14:01:55 schwarze Exp $ */
1.1 schwarze 2: /*
3: * Copyright (c) 2014 Kristaps Dzonsons <kristaps@bsd.lv>
4: * Copyright (c) 2019 Ingo Schwarze <schwarze@openbsd.org>
5: *
6: * Permission to use, copy, modify, and distribute this software for any
7: * purpose with or without fee is hereby granted, provided that the above
8: * copyright notice and this permission notice appear in all copies.
9: *
10: * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
11: * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12: * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
13: * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14: * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15: * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16: * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17: */
18: #include <assert.h>
19: #include <ctype.h>
1.24 schwarze 20: #include <errno.h>
21: #include <fcntl.h>
22: #include <libgen.h>
1.6 schwarze 23: #include <stdarg.h>
1.1 schwarze 24: #include <stdio.h>
1.5 schwarze 25: #include <stdlib.h>
1.1 schwarze 26: #include <string.h>
27: #include <unistd.h>
28:
29: #include "node.h"
30: #include "parse.h"
31:
32: /*
33: * The implementation of the DocBook parser.
34: */
35:
/*
 * Tokenizer state of parse_string(), preserved across buffer refills.
 * The order matters: parse_string() tests "*pstate >= PARSE_ARG"
 * to recognize all three attribute value states at once.
 */
1.14 schwarze 36: enum pstate {
37: PARSE_ELEM, /* Outside of tags: expecting a tag, an entity, or text. */
38: PARSE_TAG, /* Inside a tag: expecting an attribute name or '>'. */
39: PARSE_ARG, /* After '=': expecting an attribute value. */
40: PARSE_SQ, /* Inside a single-quoted attribute value. */
41: PARSE_DQ /* Inside a double-quoted attribute value. */
42: };
43:
1.1 schwarze 44: /*
45: * Global parse state.
46: * Keep this as simple and small as possible.
47: */
48: struct parse {
49: const char *fname; /* Name of the input file. */
50: struct ptree *tree; /* Complete parse result. */
1.23 schwarze 51: struct pnode *doctype; /* First DOCTYPE node seen, or NULL. */
1.1 schwarze 52: struct pnode *cur; /* Current node in the tree. */
1.5 schwarze 53: enum nodeid ncur; /* Type of the current node. */
54: int line; /* Line number in the input file. */
55: int col; /* Column number in the input file. */
56: int nline; /* Line number of next token. */
57: int ncol; /* Column number of next token. */
1.4 schwarze 58: int del; /* Levels of nested nodes being deleted. */
1.45 schwarze 59: int nofill; /* Levels of open no-fill displays. */
1.23 schwarze 60: int flags; /* Bitwise OR of the PFLAG_* bits below. */
61: #define PFLAG_WARN (1 << 0) /* Print warning messages. */
62: #define PFLAG_SPC (1 << 1) /* Whitespace before the next element. */
63: #define PFLAG_ATTR (1 << 2) /* The most recent attribute is valid. */
64: #define PFLAG_EEND (1 << 3) /* This element is self-closing. */
1.1 schwarze 65: };
66:
/*
 * One entry of the alias table: an element name that xml_name2node()
 * maps to the node type of another element.
 */
1.39 schwarze 67: struct alias {
1.1 schwarze 68: const char *name; /* DocBook element name. */
69: enum nodeid node; /* Node type to generate. */
70: };
71:
/*
 * Element name aliases.
 * xml_name2node() searches this table linearly, with strcmp(3),
 * for names that pnode_parse() did not resolve.
 * The entry with the NULL name terminates the table.
 */
1.39 schwarze 72: static const struct alias aliases[] = {
1.3 schwarze 73: { "acronym", NODE_IGNORE },
1.43 schwarze 74: { "affiliation", NODE_IGNORE },
1.4 schwarze 75: { "anchor", NODE_DELETE },
1.42 schwarze 76: { "application", NODE_COMMAND },
1.22 schwarze 77: { "article", NODE_SECTION },
1.41 schwarze 78: { "articleinfo", NODE_BOOKINFO },
1.22 schwarze 79: { "book", NODE_SECTION },
1.1 schwarze 80: { "chapter", NODE_SECTION },
1.44 schwarze 81: { "caption", NODE_IGNORE },
1.13 schwarze 82: { "code", NODE_LITERAL },
1.36 schwarze 83: { "computeroutput", NODE_LITERAL },
1.23 schwarze 84: { "!doctype", NODE_DOCTYPE },
1.44 schwarze 85: { "figure", NODE_IGNORE },
1.7 schwarze 86: { "firstname", NODE_PERSONNAME },
1.21 schwarze 87: { "glossary", NODE_VARIABLELIST },
88: { "glossdef", NODE_IGNORE },
89: { "glossdiv", NODE_IGNORE },
90: { "glossentry", NODE_VARLISTENTRY },
91: { "glosslist", NODE_VARIABLELIST },
1.43 schwarze 92: { "holder", NODE_IGNORE },
1.44 schwarze 93: { "imageobject", NODE_IGNORE },
1.4 schwarze 94: { "indexterm", NODE_DELETE },
1.11 schwarze 95: { "informaltable", NODE_TABLE },
1.42 schwarze 96: { "keycap", NODE_KEYSYM },
97: { "keycode", NODE_IGNORE },
1.44 schwarze 98: { "mediaobject", NODE_BLOCKQUOTE },
1.43 schwarze 99: { "orgname", NODE_IGNORE },
1.40 schwarze 100: { "othercredit", NODE_AUTHOR },
1.7 schwarze 101: { "othername", NODE_PERSONNAME },
1.1 schwarze 102: { "part", NODE_SECTION },
1.3 schwarze 103: { "phrase", NODE_IGNORE },
1.4 schwarze 104: { "primary", NODE_DELETE },
1.42 schwarze 105: { "property", NODE_PARAMETER },
1.1 schwarze 106: { "refsect1", NODE_SECTION },
107: { "refsect2", NODE_SECTION },
108: { "refsect3", NODE_SECTION },
109: { "refsection", NODE_SECTION },
1.43 schwarze 110: { "releaseinfo", NODE_IGNORE },
1.42 schwarze 111: { "returnvalue", NODE_IGNORE },
1.4 schwarze 112: { "secondary", NODE_DELETE },
1.1 schwarze 113: { "sect1", NODE_SECTION },
114: { "sect2", NODE_SECTION },
1.46 schwarze 115: { "sect3", NODE_SECTION },
116: { "sect4", NODE_SECTION },
1.36 schwarze 117: { "sgmltag", NODE_MARKUP },
1.15 schwarze 118: { "simpara", NODE_PARA },
1.13 schwarze 119: { "structfield", NODE_PARAMETER },
120: { "structname", NODE_TYPE },
1.7 schwarze 121: { "surname", NODE_PERSONNAME },
1.12 schwarze 122: { "symbol", NODE_CONSTANT },
1.48 schwarze 123: { "tag", NODE_MARKUP },
1.3 schwarze 124: { "trademark", NODE_IGNORE },
1.18 schwarze 125: { "ulink", NODE_LINK },
1.13 schwarze 126: { "userinput", NODE_LITERAL },
1.43 schwarze 127: { "year", NODE_IGNORE },
1.5 schwarze 128: { NULL, NODE_IGNORE }
1.1 schwarze 129: };
130:
/*
 * One entry of the table of predefined character entities.
 */
1.9 schwarze 131: struct entity {
132: const char *name; /* Entity name, without the '&' and the ';'. */
133: const char *roff; /* Replacement text copied into the escape node. */
134: };
135:
136: /*
137: * XML character entity references found in the wild.
138: * Those that don't have an exact mandoc_char(7) representation
139: * are approximated, and the desired codepoint is given as a comment.
140: * Encoding them as \\[u...] would leave -Tascii out in the cold.
141: */
142: static const struct entity entities[] = {
143: { "alpha", "\\(*a" },
144: { "amp", "&" },
145: { "apos", "'" },
146: { "auml", "\\(:a" },
147: { "beta", "\\(*b" },
148: { "circ", "^" }, /* U+02C6 */
149: { "copy", "\\(co" },
150: { "dagger", "\\(dg" },
151: { "Delta", "\\(*D" },
152: { "eacute", "\\('e" },
153: { "emsp", "\\ " }, /* U+2003 */
154: { "gt", ">" },
155: { "hairsp", "\\^" },
156: { "kappa", "\\(*k" },
157: { "larr", "\\(<-" },
158: { "ldquo", "\\(lq" },
159: { "le", "\\(<=" },
160: { "lowbar", "_" },
161: { "lsqb", "[" },
162: { "lt", "<" },
163: { "mdash", "\\(em" },
164: { "minus", "\\-" },
165: { "ndash", "\\(en" },
166: { "nbsp", "\\ " },
167: { "num", "#" },
168: { "oslash", "\\(/o" },
169: { "ouml", "\\(:o" },
170: { "percnt", "%" },
171: { "quot", "\\(dq" },
172: { "rarr", "\\(->" },
173: { "rArr", "\\(rA" },
174: { "rdquo", "\\(rq" },
175: { "reg", "\\(rg" },
176: { "rho", "\\(*r" },
177: { "rsqb", "]" },
178: { "sigma", "\\(*s" },
179: { "shy", "\\&" }, /* U+00AD */
180: { "tau", "\\(*t" },
181: { "tilde", "\\[u02DC]" },
182: { "times", "\\[tmu]" },
183: { "uuml", "\\(:u" },
/* End marker: xml_entity() stops its linear search at the NULL name. */
184: { NULL, NULL }
185: };
186:
/*
 * Forward declarations.
 * xml_entity() calls parse_string() ahead of its definition
 * in order to expand entities declared in the document type.
 */
1.23 schwarze 187: static size_t parse_string(struct parse *, char *, size_t,
188: enum pstate *, int);
1.24 schwarze 189: static void parse_fd(struct parse *, int);
1.23 schwarze 190:
191:
1.6 schwarze 192: static void
1.29 schwarze 193: fatal(struct parse *p)
194: {
195: fprintf(stderr, "%s:%d:%d: FATAL: ", p->fname, p->line, p->col);
196: perror(NULL);
197: exit(6);
198: }
199:
200: static void
1.6 schwarze 201: error_msg(struct parse *p, const char *fmt, ...)
202: {
203: va_list ap;
204:
1.29 schwarze 205: fprintf(stderr, "%s:%d:%d: ERROR: ", p->fname, p->line, p->col);
1.6 schwarze 206: va_start(ap, fmt);
207: vfprintf(stderr, fmt, ap);
208: va_end(ap);
209: fputc('\n', stderr);
1.29 schwarze 210: p->tree->flags |= TREE_ERROR;
1.6 schwarze 211: }
212:
213: static void
214: warn_msg(struct parse *p, const char *fmt, ...)
215: {
216: va_list ap;
217:
1.23 schwarze 218: if ((p->flags & PFLAG_WARN) == 0)
1.6 schwarze 219: return;
220:
1.29 schwarze 221: fprintf(stderr, "%s:%d:%d: WARNING: ", p->fname, p->line, p->col);
1.6 schwarze 222: va_start(ap, fmt);
223: vfprintf(stderr, fmt, ap);
224: va_end(ap);
225: fputc('\n', stderr);
1.29 schwarze 226: p->tree->flags |= TREE_WARN;
1.6 schwarze 227: }
228:
1.1 schwarze 229: /*
230: * Process a string of characters.
231: * If a text node is already open, append to it.
232: * Otherwise, create a new one as a child of the current node.
233: */
234: static void
1.35 schwarze 235: xml_text(struct parse *p, const char *word, int sz)
1.1 schwarze 236: {
1.35 schwarze 237: struct pnode *n, *np;
1.32 schwarze 238: size_t oldsz, newsz;
1.35 schwarze 239: int i;
1.1 schwarze 240:
1.32 schwarze 241: assert(sz > 0);
1.30 schwarze 242: if (p->del > 0)
1.1 schwarze 243: return;
244:
1.32 schwarze 245: if ((n = p->cur) == NULL) {
1.35 schwarze 246: error_msg(p, "discarding text before document: %.*s",
247: sz, word);
1.5 schwarze 248: return;
249: }
250:
1.35 schwarze 251: /* Append to the current text node, if one is open. */
252:
253: if (n->node == NODE_TEXT) {
254: oldsz = strlen(n->b);
255: newsz = oldsz + sz;
256: if (oldsz && (p->flags & PFLAG_SPC))
257: newsz++;
258: if ((n->b = realloc(n->b, newsz + 1)) == NULL)
1.30 schwarze 259: fatal(p);
1.35 schwarze 260: if (oldsz && (p->flags & PFLAG_SPC))
261: n->b[oldsz++] = ' ';
262: memcpy(n->b + oldsz, word, sz);
263: n->b[newsz] = '\0';
264: p->flags &= ~PFLAG_SPC;
265: return;
1.1 schwarze 266: }
267:
1.35 schwarze 268: if (p->tree->flags & TREE_CLOSED && n == p->tree->root)
1.30 schwarze 269: warn_msg(p, "text after end of document: %.*s", sz, word);
1.5 schwarze 270:
1.35 schwarze 271: /* Create a new text node. */
1.1 schwarze 272:
1.35 schwarze 273: if ((n = pnode_alloc(p->cur)) == NULL)
1.30 schwarze 274: fatal(p);
1.35 schwarze 275: n->node = NODE_TEXT;
276: n->spc = (p->flags & PFLAG_SPC) != 0;
1.30 schwarze 277: p->flags &= ~PFLAG_SPC;
1.35 schwarze 278:
279: /*
1.39 schwarze 280: * If this node follows an in-line macro without intervening
1.35 schwarze 281: * whitespace, keep the text in it as short as possible,
282: * and do not keep it open.
283: */
284:
1.39 schwarze 285: np = n->spc ? NULL : TAILQ_PREV(n, pnodeq, child);
286: while (np != NULL) {
287: switch (pnode_class(np->node)) {
288: case CLASS_VOID:
289: case CLASS_TEXT:
290: case CLASS_BLOCK:
1.45 schwarze 291: case CLASS_NOFILL:
1.39 schwarze 292: np = NULL;
293: break;
294: case CLASS_TRANS:
295: np = TAILQ_LAST(&np->childq, pnodeq);
296: continue;
297: case CLASS_LINE:
298: case CLASS_ENCL:
299: break;
300: }
301: break;
302: }
303: if (np != NULL) {
1.35 schwarze 304: i = 0;
305: while (i < sz && !isspace((unsigned char)word[i]))
306: i++;
307: if ((n->b = strndup(word, i)) == NULL)
308: fatal(p);
309: if (i == sz)
310: return;
311: while (i < sz && isspace((unsigned char)word[i]))
312: i++;
313: if (i == sz) {
314: p->flags |= PFLAG_SPC;
315: return;
316: }
317:
318: /* Put any remaining text into a second node. */
319:
320: if ((n = pnode_alloc(p->cur)) == NULL)
321: fatal(p);
322: n->node = NODE_TEXT;
323: n->spc = 1;
324: word += i;
325: sz -= i;
326: }
327: if ((n->b = strndup(word, sz)) == NULL)
328: fatal(p);
329:
330: /* The new node remains open for later pnode_closetext(). */
331:
332: p->cur = n;
1.1 schwarze 333: }
334:
1.16 schwarze 335: /*
336: * Close out the text node and strip trailing whitespace, if one is open.
337: */
1.1 schwarze 338: static void
1.37 schwarze 339: pnode_closetext(struct parse *p, int check_last_word)
1.1 schwarze 340: {
1.16 schwarze 341: struct pnode *n;
1.37 schwarze 342: char *cp, *last_word;
1.16 schwarze 343:
344: if ((n = p->cur) == NULL || n->node != NODE_TEXT)
345: return;
346: p->cur = n->parent;
1.32 schwarze 347: for (cp = strchr(n->b, '\0');
348: cp > n->b && isspace((unsigned char)cp[-1]);
349: *--cp = '\0')
1.23 schwarze 350: p->flags |= PFLAG_SPC;
1.37 schwarze 351:
352: if (p->flags & PFLAG_SPC || !check_last_word)
353: return;
354:
355: /*
356: * Find the beginning of the last word
357: * and delete whitespace before it.
358: */
359:
360: while (cp > n->b && !isspace((unsigned char)cp[-1]))
361: cp--;
362: if (cp == n->b)
363: return;
364:
365: last_word = cp;
366: while (cp > n->b && isspace((unsigned char)cp[-1]))
367: *--cp = '\0';
368:
369: /* Move the last word into its own node, for use with .Pf. */
370:
371: if ((n = pnode_alloc(p->cur)) == NULL)
372: fatal(p);
373: n->node = NODE_TEXT;
374: n->spc = 1;
375: if ((n->b = strdup(last_word)) == NULL)
376: fatal(p);
1.1 schwarze 377: }
378:
1.9 schwarze 379: static void
380: xml_entity(struct parse *p, const char *name)
381: {
382: const struct entity *entity;
1.30 schwarze 383: struct pnode *n;
1.23 schwarze 384: const char *ccp;
385: char *cp;
1.49 schwarze 386: unsigned int codepoint;
1.23 schwarze 387: enum pstate pstate;
1.9 schwarze 388:
389: if (p->del > 0)
390: return;
391:
392: if (p->cur == NULL) {
393: error_msg(p, "discarding entity before document: &%s;", name);
394: return;
395: }
396:
1.37 schwarze 397: pnode_closetext(p, 0);
1.9 schwarze 398:
399: if (p->tree->flags & TREE_CLOSED && p->cur == p->tree->root)
400: warn_msg(p, "entity after end of document: &%s;", name);
401:
402: for (entity = entities; entity->name != NULL; entity++)
403: if (strcmp(name, entity->name) == 0)
404: break;
405:
406: if (entity->roff == NULL) {
1.23 schwarze 407: if (p->doctype != NULL) {
1.30 schwarze 408: TAILQ_FOREACH(n, &p->doctype->childq, child) {
409: if ((ccp = pnode_getattr_raw(n,
1.23 schwarze 410: ATTRKEY_NAME, NULL)) == NULL ||
1.25 schwarze 411: strcmp(ccp, name) != 0)
412: continue;
1.30 schwarze 413: if ((ccp = pnode_getattr_raw(n,
1.25 schwarze 414: ATTRKEY_SYSTEM, NULL)) != NULL) {
415: parse_file(p, -1, ccp);
416: p->flags &= ~PFLAG_SPC;
417: return;
418: }
1.30 schwarze 419: if ((ccp = pnode_getattr_raw(n,
1.23 schwarze 420: ATTRKEY_DEFINITION, NULL)) == NULL)
421: continue;
1.29 schwarze 422: if ((cp = strdup(ccp)) == NULL)
423: fatal(p);
1.23 schwarze 424: pstate = PARSE_ELEM;
425: parse_string(p, cp, strlen(cp), &pstate, 0);
426: p->flags &= ~PFLAG_SPC;
427: free(cp);
428: return;
429: }
430: }
1.49 schwarze 431: if (*name == '#') {
432: codepoint = strtonum(name + 1, 0, 0x10ffff, &ccp);
433: if (ccp == NULL) {
434: if ((n = pnode_alloc(p->cur)) == NULL ||
435: asprintf(&n->b, "\\[u%4.4X]",
436: codepoint) < 0)
437: fatal(p);
438: goto done;
439: }
440: }
1.9 schwarze 441: error_msg(p, "unknown entity &%s;", name);
442: return;
443: }
444:
445: /* Create, append, and close out an entity node. */
1.34 schwarze 446: if ((n = pnode_alloc(p->cur)) == NULL ||
1.32 schwarze 447: (n->b = strdup(entity->roff)) == NULL)
1.29 schwarze 448: fatal(p);
1.49 schwarze 449: done:
1.30 schwarze 450: n->node = NODE_ESCAPE;
451: n->spc = (p->flags & PFLAG_SPC) != 0;
1.23 schwarze 452: p->flags &= ~PFLAG_SPC;
1.9 schwarze 453: }
454:
1.1 schwarze 455: /*
1.39 schwarze 456: * Parse an element name.
457: */
458: static enum nodeid
459: xml_name2node(struct parse *p, const char *name)
460: {
461: const struct alias *alias;
462: enum nodeid node;
463:
464: if ((node = pnode_parse(name)) < NODE_UNKNOWN)
465: return node;
466:
467: for (alias = aliases; alias->name != NULL; alias++)
468: if (strcmp(alias->name, name) == 0)
469: return alias->node;
470:
471: return NODE_UNKNOWN;
472: }
473:
474: /*
1.1 schwarze 475: * Begin an element.
476: */
477: static void
1.30 schwarze 478: xml_elem_start(struct parse *p, const char *name)
1.1 schwarze 479: {
1.30 schwarze 480: struct pnode *n;
1.1 schwarze 481:
1.4 schwarze 482: /*
483: * An ancestor is excluded from the tree;
484: * keep track of the number of levels excluded.
485: */
1.30 schwarze 486: if (p->del > 0) {
1.23 schwarze 487: if (*name != '!' && *name != '?')
1.30 schwarze 488: p->del++;
1.4 schwarze 489: return;
490: }
491:
1.39 schwarze 492: switch (p->ncur = xml_name2node(p, name)) {
1.4 schwarze 493: case NODE_DELETE_WARN:
1.30 schwarze 494: warn_msg(p, "skipping element <%s>", name);
1.2 schwarze 495: /* FALLTHROUGH */
1.4 schwarze 496: case NODE_DELETE:
1.30 schwarze 497: p->del = 1;
1.4 schwarze 498: /* FALLTHROUGH */
1.2 schwarze 499: case NODE_IGNORE:
500: return;
1.39 schwarze 501: case NODE_UNKNOWN:
502: if (*name != '!' && *name != '?')
503: error_msg(p, "unknown element <%s>", name);
504: return;
1.2 schwarze 505: default:
506: break;
507: }
1.1 schwarze 508:
1.30 schwarze 509: if (p->tree->flags & TREE_CLOSED && p->cur->parent == NULL)
510: warn_msg(p, "element after end of document: <%s>", name);
1.5 schwarze 511:
1.39 schwarze 512: switch (pnode_class(p->ncur)) {
513: case CLASS_LINE:
514: case CLASS_ENCL:
515: pnode_closetext(p, 1);
516: break;
517: default:
518: pnode_closetext(p, 0);
519: break;
520: }
521:
1.34 schwarze 522: if ((n = pnode_alloc(p->cur)) == NULL)
1.30 schwarze 523: fatal(p);
1.17 schwarze 524:
525: /*
1.39 schwarze 526: * Some elements are self-closing.
1.17 schwarze 527: * Nodes that begin a new macro or request line or start by
528: * printing text always want whitespace before themselves.
529: */
530:
1.39 schwarze 531: switch (n->node = p->ncur) {
1.23 schwarze 532: case NODE_DOCTYPE:
533: case NODE_ENTITY:
534: case NODE_SBR:
1.48 schwarze 535: case NODE_VOID:
1.30 schwarze 536: p->flags |= PFLAG_EEND;
1.17 schwarze 537: break;
538: default:
1.39 schwarze 539: break;
540: }
541: switch (pnode_class(p->ncur)) {
542: case CLASS_LINE:
543: case CLASS_ENCL:
1.30 schwarze 544: n->spc = (p->flags & PFLAG_SPC) != 0;
1.17 schwarze 545: break;
1.45 schwarze 546: case CLASS_NOFILL:
547: p->nofill++;
548: /* FALLTHROUGH */
1.39 schwarze 549: default:
550: n->spc = 1;
551: break;
1.17 schwarze 552: }
1.30 schwarze 553: p->cur = n;
554: if (n->node == NODE_DOCTYPE) {
555: if (p->doctype == NULL)
556: p->doctype = n;
1.23 schwarze 557: else
1.30 schwarze 558: error_msg(p, "duplicate doctype");
559: } else if (n->parent == NULL && p->tree->root == NULL)
560: p->tree->root = n;
1.5 schwarze 561: }
562:
563: static void
1.30 schwarze 564: xml_attrkey(struct parse *p, const char *name)
1.5 schwarze 565: {
1.30 schwarze 566: struct pattr *a;
1.23 schwarze 567: const char *value;
1.5 schwarze 568: enum attrkey key;
1.1 schwarze 569:
1.47 schwarze 570: if (p->del > 0 || p->ncur >= NODE_UNKNOWN || *name == '\0')
1.5 schwarze 571: return;
1.23 schwarze 572:
1.30 schwarze 573: if ((p->ncur == NODE_DOCTYPE || p->ncur == NODE_ENTITY) &&
574: TAILQ_FIRST(&p->cur->attrq) == NULL) {
1.23 schwarze 575: value = name;
576: name = "NAME";
577: } else
578: value = NULL;
579:
1.5 schwarze 580: if ((key = attrkey_parse(name)) == ATTRKEY__MAX) {
1.30 schwarze 581: p->flags &= ~PFLAG_ATTR;
1.5 schwarze 582: return;
583: }
1.30 schwarze 584: if ((a = calloc(1, sizeof(*a))) == NULL)
585: fatal(p);
1.29 schwarze 586:
1.30 schwarze 587: a->key = key;
588: a->val = ATTRVAL__MAX;
1.23 schwarze 589: if (value == NULL) {
1.30 schwarze 590: a->rawval = NULL;
591: p->flags |= PFLAG_ATTR;
1.23 schwarze 592: } else {
1.30 schwarze 593: if ((a->rawval = strdup(value)) == NULL)
594: fatal(p);
595: p->flags &= ~PFLAG_ATTR;
596: }
597: TAILQ_INSERT_TAIL(&p->cur->attrq, a, child);
598: if (p->ncur == NODE_ENTITY && key == ATTRKEY_NAME)
599: xml_attrkey(p, "DEFINITION");
1.5 schwarze 600: }
601:
602: static void
1.30 schwarze 603: xml_attrval(struct parse *p, const char *name)
1.5 schwarze 604: {
1.30 schwarze 605: struct pattr *a;
1.5 schwarze 606:
1.47 schwarze 607: if (p->del > 0 || p->ncur >= NODE_UNKNOWN ||
1.30 schwarze 608: (p->flags & PFLAG_ATTR) == 0)
1.5 schwarze 609: return;
1.30 schwarze 610: if ((a = TAILQ_LAST(&p->cur->attrq, pattrq)) == NULL)
1.5 schwarze 611: return;
1.30 schwarze 612: if ((a->val = attrval_parse(name)) == ATTRVAL__MAX &&
613: (a->rawval = strdup(name)) == NULL)
614: fatal(p);
615: p->flags &= ~PFLAG_ATTR;
1.1 schwarze 616: }
617:
618: /*
619: * Roll up the parse tree.
620: * If we're at a text node, roll that one up first.
621: */
622: static void
1.31 schwarze 623: xml_elem_end(struct parse *p, const char *name)
1.1 schwarze 624: {
1.26 schwarze 625: struct pnode *n;
626: const char *cp;
1.5 schwarze 627: enum nodeid node;
1.1 schwarze 628:
1.4 schwarze 629: /*
630: * An ancestor is excluded from the tree;
631: * keep track of the number of levels excluded.
632: */
1.31 schwarze 633: if (p->del > 1) {
634: p->del--;
1.4 schwarze 635: return;
636: }
637:
1.31 schwarze 638: if (p->del == 0)
1.37 schwarze 639: pnode_closetext(p, 0);
1.2 schwarze 640:
1.50 ! schwarze 641: n = p->cur;
1.39 schwarze 642: node = name == NULL ? p->ncur : xml_name2node(p, name);
1.2 schwarze 643:
1.5 schwarze 644: switch (node) {
1.4 schwarze 645: case NODE_DELETE_WARN:
646: case NODE_DELETE:
1.31 schwarze 647: if (p->del > 0)
648: p->del--;
1.4 schwarze 649: break;
1.2 schwarze 650: case NODE_IGNORE:
1.39 schwarze 651: case NODE_UNKNOWN:
1.26 schwarze 652: break;
653: case NODE_INCLUDE:
1.50 ! schwarze 654: p->cur = n->parent;
1.26 schwarze 655: cp = pnode_getattr_raw(n, ATTRKEY_HREF, NULL);
656: if (cp == NULL)
1.31 schwarze 657: error_msg(p, "<xi:include> element "
1.26 schwarze 658: "without href attribute");
659: else
1.31 schwarze 660: parse_file(p, -1, cp);
1.26 schwarze 661: pnode_unlink(n);
1.31 schwarze 662: p->flags &= ~PFLAG_SPC;
1.2 schwarze 663: break;
1.23 schwarze 664: case NODE_DOCTYPE:
1.32 schwarze 665: case NODE_SBR:
1.48 schwarze 666: case NODE_VOID:
1.31 schwarze 667: p->flags &= ~PFLAG_EEND;
1.23 schwarze 668: /* FALLTHROUGH */
1.2 schwarze 669: default:
1.50 ! schwarze 670: if (n == NULL || node != n->node) {
1.31 schwarze 671: warn_msg(p, "element not open: </%s>", name);
1.5 schwarze 672: break;
673: }
1.45 schwarze 674: if (pnode_class(node) == CLASS_NOFILL)
675: p->nofill--;
1.5 schwarze 676:
677: /*
678: * Refrain from actually closing the document element.
679: * If no more content follows, no harm is done, but if
680: * some content still follows, simply processing it is
681: * obviously better than discarding it or crashing.
682: */
683:
1.50 ! schwarze 684: if (n->parent != NULL || node == NODE_DOCTYPE) {
! 685: p->cur = n->parent;
1.31 schwarze 686: if (p->cur != NULL)
687: p->ncur = p->cur->node;
1.23 schwarze 688: } else
1.31 schwarze 689: p->tree->flags |= TREE_CLOSED;
690: p->flags &= ~PFLAG_SPC;
1.50 ! schwarze 691:
! 692: /* Include a file containing entity declarations. */
! 693:
! 694: if (node == NODE_ENTITY && strcmp("%",
! 695: pnode_getattr_raw(n, ATTRKEY_NAME, "")) == 0 &&
! 696: (cp = pnode_getattr_raw(n, ATTRKEY_SYSTEM, NULL)) != NULL)
! 697: parse_file(p, -1, cp);
! 698:
1.4 schwarze 699: break;
1.2 schwarze 700: }
1.31 schwarze 701: assert(p->del == 0);
1.1 schwarze 702: }
703:
704: struct parse *
705: parse_alloc(int warn)
706: {
707: struct parse *p;
708:
709: if ((p = calloc(1, sizeof(*p))) == NULL)
710: return NULL;
711:
712: if ((p->tree = calloc(1, sizeof(*p->tree))) == NULL) {
713: free(p);
714: return NULL;
715: }
1.23 schwarze 716: if (warn)
717: p->flags |= PFLAG_WARN;
718: else
719: p->flags &= ~PFLAG_WARN;
1.1 schwarze 720: return p;
721: }
722:
723: void
724: parse_free(struct parse *p)
725: {
726: if (p == NULL)
727: return;
728: if (p->tree != NULL) {
729: pnode_unlink(p->tree->root);
730: free(p->tree);
731: }
732: free(p);
733: }
734:
1.14 schwarze 735: static void
736: increment(struct parse *p, char *b, size_t *pend, int refill)
737: {
738: if (refill) {
739: if (b[*pend] == '\n') {
740: p->nline++;
741: p->ncol = 1;
742: } else
743: p->ncol++;
744: }
745: ++*pend;
746: }
747:
1.5 schwarze 748: /*
749: * Advance the pend pointer to the next character in the charset.
750: * If the charset starts with a space, it stands for any whitespace.
751: * Update the new input file position, used for messages.
752: * Do not overrun the buffer b of length rlen.
753: * When reaching the end, NUL-terminate the buffer and return 1;
754: * otherwise, return 0.
755: */
756: static int
757: advance(struct parse *p, char *b, size_t rlen, size_t *pend,
1.14 schwarze 758: const char *charset, int refill)
1.5 schwarze 759: {
760: int space;
761:
762: if (*charset == ' ') {
763: space = 1;
764: charset++;
765: } else
766: space = 0;
767:
1.14 schwarze 768: if (refill) {
769: p->nline = p->line;
770: p->ncol = p->col;
771: }
1.5 schwarze 772: while (*pend < rlen) {
773: if (space && isspace((unsigned char)b[*pend]))
774: break;
775: if (strchr(charset, b[*pend]) != NULL)
776: break;
1.14 schwarze 777: increment(p, b, pend, refill);
1.5 schwarze 778: }
779: if (*pend == rlen) {
780: b[rlen] = '\0';
1.14 schwarze 781: return refill;
1.5 schwarze 782: } else
783: return 0;
784: }
785:
/*
 * The token loop.
 * Split the rlen bytes in the buffer b into tags, attribute names,
 * attribute values, entities, and text, and pass each token to the
 * matching xml_*() handler.
 * The tokenizer state is kept in *pstate, such that parsing can
 * continue in the middle of a tag when called again.
 * If refill is non-zero, input positions are tracked in p, and a
 * tag, attribute, entity, or comment reaching the end of the buffer
 * is left unparsed, such that the caller can supply more data.
 * If refill is zero, the whole buffer is consumed.
 * Tokens are NUL-terminated in place, so the buffer gets modified,
 * and b[rlen] has to be writable.
 * Returns the offset of the first byte that was not consumed.
 */
1.14 schwarze 786: size_t
787: parse_string(struct parse *p, char *b, size_t rlen,
788: enum pstate *pstate, int refill)
789: {
790: char *cp;
1.45 schwarze 791: size_t pws; /* Parse offset including whitespace. */
1.14 schwarze 792: size_t poff; /* Parse offset in b[]. */
793: size_t pend; /* Offset of the end of the current word. */
794: int elem_end;
795:
1.45 schwarze 796: pend = pws = 0;
1.14 schwarze 797: for (;;) {
798:
799: /* Proceed to the next token, skipping whitespace. */
800:
801: if (refill) {
802: p->line = p->nline;
803: p->col = p->ncol;
804: }
805: if ((poff = pend) == rlen)
806: break;
807: if (isspace((unsigned char)b[pend])) {
1.23 schwarze 808: p->flags |= PFLAG_SPC;
1.45 schwarze 809: if (b[pend] == '\n')
810: pws = pend + 1;
1.14 schwarze 811: increment(p, b, &pend, refill);
812: continue;
813: }
814:
815: /*
816: * The following four cases (ARG, TAG, and starting an
817: * entity or a tag) all parse a word or quoted string.
818: * If that extends beyond the read buffer and the last
819: * read(2) still got data, they all break out of the
820: * token loop to request more data from the read loop.
821: *
822: * Also, three of them detect self-closing tags, those
823: * ending with "/>", setting the flag elem_end and
824: * calling xml_elem_end() at the very end, after
825: * handling the attribute value, attribute name, or
826: * tag name, respectively.
827: */
828:
829: /* Parse an attribute value. */
830:
831: if (*pstate >= PARSE_ARG) {
832: if (*pstate == PARSE_ARG &&
833: (b[pend] == '\'' || b[pend] == '"')) {
834: *pstate = b[pend] == '"' ?
835: PARSE_DQ : PARSE_SQ;
836: increment(p, b, &pend, refill);
837: continue;
838: }
839: if (advance(p, b, rlen, &pend,
840: *pstate == PARSE_DQ ? "\"" :
841: *pstate == PARSE_SQ ? "'" : " >", refill))
842: break;
843: *pstate = PARSE_TAG;
844: elem_end = 0;
845: if (b[pend] == '>') {
846: *pstate = PARSE_ELEM;
847: if (pend > 0 && b[pend - 1] == '/') {
848: b[pend - 1] = '\0';
849: elem_end = 1;
850: }
1.23 schwarze 851: if (p->flags & PFLAG_EEND)
852: elem_end = 1;
1.14 schwarze 853: }
854: b[pend] = '\0';
855: if (pend < rlen)
856: increment(p, b, &pend, refill);
857: xml_attrval(p, b + poff);
858: if (elem_end)
859: xml_elem_end(p, NULL);
860:
861: /* Look for an attribute name. */
862:
863: } else if (*pstate == PARSE_TAG) {
1.23 schwarze 864: switch (p->ncur) {
865: case NODE_DOCTYPE:
866: if (b[pend] == '[') {
867: *pstate = PARSE_ELEM;
868: increment(p, b, &pend, refill);
869: continue;
870: }
871: /* FALLTHROUGH */
872: case NODE_ENTITY:
873: if (b[pend] == '"' || b[pend] == '\'') {
874: *pstate = PARSE_ARG;
875: continue;
876: }
877: break;
878: default:
879: break;
880: }
1.14 schwarze 881: if (advance(p, b, rlen, &pend, " =>", refill))
882: break;
883: elem_end = 0;
884: switch (b[pend]) {
885: case '>':
886: *pstate = PARSE_ELEM;
887: if (pend > 0 && b[pend - 1] == '/') {
888: b[pend - 1] = '\0';
889: elem_end = 1;
890: }
1.23 schwarze 891: if (p->flags & PFLAG_EEND)
892: elem_end = 1;
1.14 schwarze 893: break;
894: case '=':
895: *pstate = PARSE_ARG;
896: break;
897: default:
898: break;
899: }
900: b[pend] = '\0';
901: if (pend < rlen)
902: increment(p, b, &pend, refill);
903: xml_attrkey(p, b + poff);
904: if (elem_end)
905: xml_elem_end(p, NULL);
906:
907: /* Begin an opening or closing tag. */
908:
909: } else if (b[poff] == '<') {
910: if (advance(p, b, rlen, &pend, " >", refill))
911: break;
912: if (pend > poff + 3 &&
913: strncmp(b + poff, "<!--", 4) == 0) {
914:
915: /* Skip a comment. */
916:
/*
 * Start searching two bytes before pend, because the '>' found
 * by advance() may already be the end of the terminating "-->".
 */
917: cp = strstr(b + pend - 2, "-->");
918: if (cp == NULL) {
919: if (refill)
920: break;
921: cp = b + rlen;
922: } else
923: cp += 3;
924: while (b + pend < cp)
925: increment(p, b, &pend, refill);
926: continue;
927: }
928: elem_end = 0;
929: if (b[pend] != '>')
930: *pstate = PARSE_TAG;
931: else if (pend > 0 && b[pend - 1] == '/') {
932: b[pend - 1] = '\0';
933: elem_end = 1;
934: }
935: b[pend] = '\0';
936: if (pend < rlen)
937: increment(p, b, &pend, refill);
938: if (b[++poff] == '/') {
939: elem_end = 1;
940: poff++;
1.23 schwarze 941: } else {
1.14 schwarze 942: xml_elem_start(p, b + poff);
1.23 schwarze 943: if (*pstate == PARSE_ELEM &&
944: p->flags & PFLAG_EEND)
945: elem_end = 1;
946: }
1.14 schwarze 947: if (elem_end)
948: xml_elem_end(p, b + poff);
949:
1.23 schwarze 950: /* Close a doctype. */
951:
952: } else if (p->ncur == NODE_DOCTYPE && b[poff] == ']') {
953: *pstate = PARSE_TAG;
954: increment(p, b, &pend, refill);
955:
1.14 schwarze 956: /* Process an entity. */
957:
958: } else if (b[poff] == '&') {
959: if (advance(p, b, rlen, &pend, ";", refill))
960: break;
961: b[pend] = '\0';
962: if (pend < rlen)
963: increment(p, b, &pend, refill);
964: xml_entity(p, b + poff + 1);
965:
966: /* Process text up to the next tag, entity, or EOL. */
967:
968: } else {
1.28 schwarze 969: advance(p, b, rlen, &pend,
1.33 schwarze 970: p->ncur == NODE_DOCTYPE ? "<&]\n" : "<&\n",
1.28 schwarze 971: refill);
/*
 * In no-fill mode, start the text at pws rather than at poff,
 * such that the whitespace skipped before the text is retained.
 */
1.45 schwarze 972: if (p->nofill)
973: poff = pws;
1.35 schwarze 974: xml_text(p, b + poff, pend - poff);
/* A text node never extends beyond the end of an input line. */
1.33 schwarze 975: if (b[pend] == '\n')
1.37 schwarze 976: pnode_closetext(p, 0);
1.14 schwarze 977: }
1.45 schwarze 978: pws = pend;
1.14 schwarze 979: }
980: return poff;
981: }
982:
1.24 schwarze 983:
984: /*
985: * The read loop.
986: * If the previous token was incomplete and asked for more input,
987: * we have to enter the read loop once more even on EOF.
988: * Once rsz is 0, incomplete tokens will no longer ask for more input
989: * but instead use whatever there is, and then exit the read loop.
990: * The minus one on the size limit for read(2) is needed such that
991: * advance() can set b[rlen] to NUL when needed.
992: */
993: static void
994: parse_fd(struct parse *p, int fd)
1.1 schwarze 995: {
996: char b[4096];
1.5 schwarze 997: ssize_t rsz; /* Return value from read(2). */
1.14 schwarze 998: size_t rlen; /* Number of bytes in b[]. */
1.5 schwarze 999: size_t poff; /* Parse offset in b[]. */
1.14 schwarze 1000: enum pstate pstate;
1.1 schwarze 1001:
1.24 schwarze 1002: rlen = 0;
1.14 schwarze 1003: pstate = PARSE_ELEM;
1004: while ((rsz = read(fd, b + rlen, sizeof(b) - rlen - 1)) >= 0 &&
1005: (rlen += rsz) > 0) {
1006: poff = parse_string(p, b, rlen, &pstate, rsz > 0);
1.5 schwarze 1007: /* Buffer exhausted; shift left and re-fill. */
1008: assert(poff > 0);
1009: rlen -= poff;
1.14 schwarze 1010: memmove(b, b + poff, rlen);
1.5 schwarze 1011: }
1.24 schwarze 1012: if (rsz < 0)
1013: error_msg(p, "read: %s", strerror(errno));
1014: }
1015:
1016: /*
1017: * Open and parse a file.
1018: */
1019: struct ptree *
1020: parse_file(struct parse *p, int fd, const char *fname)
1021: {
1022: const char *save_fname;
1023: int save_line, save_col;
1024:
1025: /* Save and initialize reporting data. */
1026:
1027: save_fname = p->fname;
1028: save_line = p->nline;
1029: save_col = p->ncol;
1030: p->fname = fname;
1031: p->line = 0;
1032: p->col = 0;
1033:
1034: /* Open the file, unless it is already open. */
1035:
1036: if (fd == -1 && (fd = open(fname, O_RDONLY, 0)) == -1) {
1037: error_msg(p, "open: %s", strerror(errno));
1038: p->fname = save_fname;
1039: return p->tree;
1.5 schwarze 1040: }
1.24 schwarze 1041:
1042: /*
1043: * After opening the starting file, change to the directory it
1044: * is located in, in case it wants to include any further files,
1045: * which are typically given with relative paths in DocBook.
1046: * Do this on a best-effort basis; don't complain about failure.
1047: */
1048:
1049: if (save_fname == NULL && (fname = dirname(fname)) != NULL &&
1050: strcmp(fname, ".") != 0)
1051: (void)chdir(fname);
1052:
1053: /* Run the read loop. */
1054:
1055: p->nline = 1;
1056: p->ncol = 1;
1057: parse_fd(p, fd);
1058:
1059: /* On the top level, finalize the parse tree. */
1060:
1061: if (save_fname == NULL) {
1.37 schwarze 1062: pnode_closetext(p, 0);
1.24 schwarze 1063: if (p->tree->root == NULL)
1064: error_msg(p, "empty document");
1065: else if ((p->tree->flags & TREE_CLOSED) == 0)
1066: warn_msg(p, "document not closed");
1067: pnode_unlink(p->doctype);
1068: }
1069:
1070: /* Clean up. */
1071:
1072: if (fd != STDIN_FILENO)
1073: close(fd);
1074: p->fname = save_fname;
1075: p->nline = save_line;
1076: p->ncol = save_col;
1.1 schwarze 1077: return p->tree;
1078: }
CVSweb