Annotation of docbook2mdoc/parse.c, Revision 1.52
/* $Id: parse.c,v 1.51 2019/04/24 18:38:02 schwarze Exp $ */
/*
 * Copyright (c) 2014 Kristaps Dzonsons <kristaps@bsd.lv>
 * Copyright (c) 2019 Ingo Schwarze <schwarze@openbsd.org>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */
18: #include <assert.h>
19: #include <ctype.h>
1.24 schwarze 20: #include <errno.h>
21: #include <fcntl.h>
22: #include <libgen.h>
1.6 schwarze 23: #include <stdarg.h>
1.1 schwarze 24: #include <stdio.h>
1.5 schwarze 25: #include <stdlib.h>
1.1 schwarze 26: #include <string.h>
27: #include <unistd.h>
28:
29: #include "node.h"
30: #include "parse.h"
31:
32: /*
33: * The implementation of the DocBook parser.
34: */
35:
/*
 * Lexer states of parse_string().
 * The order matters: all states from PARSE_ARG onwards mean that
 * an attribute value is being read, and parse_string() tests for
 * that with a single ">=" comparison.
 */
enum pstate {
	PARSE_ELEM = 0,	/* Outside of tags: text, entities, tag starts. */
	PARSE_TAG,	/* Inside a tag, looking for an attribute name. */
	PARSE_ARG,	/* After "=", expecting an attribute value. */
	PARSE_SQ,	/* Inside a single-quoted attribute value. */
	PARSE_DQ	/* Inside a double-quoted attribute value. */
};
43:
1.1 schwarze 44: /*
45: * Global parse state.
46: * Keep this as simple and small as possible.
47: */
48: struct parse {
49: const char *fname; /* Name of the input file. */
50: struct ptree *tree; /* Complete parse result. */
1.23 schwarze 51: struct pnode *doctype;
1.1 schwarze 52: struct pnode *cur; /* Current node in the tree. */
1.5 schwarze 53: enum nodeid ncur; /* Type of the current node. */
54: int line; /* Line number in the input file. */
55: int col; /* Column number in the input file. */
56: int nline; /* Line number of next token. */
57: int ncol; /* Column number of next token. */
1.4 schwarze 58: int del; /* Levels of nested nodes being deleted. */
1.45 schwarze 59: int nofill; /* Levels of open no-fill displays. */
1.23 schwarze 60: int flags;
61: #define PFLAG_WARN (1 << 0) /* Print warning messages. */
1.51 schwarze 62: #define PFLAG_LINE (1 << 1) /* New line before the next element. */
63: #define PFLAG_SPC (1 << 2) /* Whitespace before the next element. */
64: #define PFLAG_ATTR (1 << 3) /* The most recent attribute is valid. */
65: #define PFLAG_EEND (1 << 4) /* This element is self-closing. */
1.1 schwarze 66: };
67:
1.39 schwarze 68: struct alias {
1.1 schwarze 69: const char *name; /* DocBook element name. */
70: enum nodeid node; /* Node type to generate. */
71: };
72:
1.39 schwarze 73: static const struct alias aliases[] = {
1.3 schwarze 74: { "acronym", NODE_IGNORE },
1.43 schwarze 75: { "affiliation", NODE_IGNORE },
1.4 schwarze 76: { "anchor", NODE_DELETE },
1.42 schwarze 77: { "application", NODE_COMMAND },
1.22 schwarze 78: { "article", NODE_SECTION },
1.41 schwarze 79: { "articleinfo", NODE_BOOKINFO },
1.22 schwarze 80: { "book", NODE_SECTION },
1.1 schwarze 81: { "chapter", NODE_SECTION },
1.44 schwarze 82: { "caption", NODE_IGNORE },
1.13 schwarze 83: { "code", NODE_LITERAL },
1.36 schwarze 84: { "computeroutput", NODE_LITERAL },
1.23 schwarze 85: { "!doctype", NODE_DOCTYPE },
1.44 schwarze 86: { "figure", NODE_IGNORE },
1.7 schwarze 87: { "firstname", NODE_PERSONNAME },
1.21 schwarze 88: { "glossary", NODE_VARIABLELIST },
89: { "glossdef", NODE_IGNORE },
90: { "glossdiv", NODE_IGNORE },
91: { "glossentry", NODE_VARLISTENTRY },
92: { "glosslist", NODE_VARIABLELIST },
1.43 schwarze 93: { "holder", NODE_IGNORE },
1.44 schwarze 94: { "imageobject", NODE_IGNORE },
1.4 schwarze 95: { "indexterm", NODE_DELETE },
1.11 schwarze 96: { "informaltable", NODE_TABLE },
1.42 schwarze 97: { "keycap", NODE_KEYSYM },
98: { "keycode", NODE_IGNORE },
1.44 schwarze 99: { "mediaobject", NODE_BLOCKQUOTE },
1.43 schwarze 100: { "orgname", NODE_IGNORE },
1.40 schwarze 101: { "othercredit", NODE_AUTHOR },
1.7 schwarze 102: { "othername", NODE_PERSONNAME },
1.1 schwarze 103: { "part", NODE_SECTION },
1.3 schwarze 104: { "phrase", NODE_IGNORE },
1.4 schwarze 105: { "primary", NODE_DELETE },
1.42 schwarze 106: { "property", NODE_PARAMETER },
1.52 ! schwarze 107: { "reference", NODE_SECTION },
1.1 schwarze 108: { "refsect1", NODE_SECTION },
109: { "refsect2", NODE_SECTION },
110: { "refsect3", NODE_SECTION },
111: { "refsection", NODE_SECTION },
1.43 schwarze 112: { "releaseinfo", NODE_IGNORE },
1.42 schwarze 113: { "returnvalue", NODE_IGNORE },
1.4 schwarze 114: { "secondary", NODE_DELETE },
1.1 schwarze 115: { "sect1", NODE_SECTION },
116: { "sect2", NODE_SECTION },
1.46 schwarze 117: { "sect3", NODE_SECTION },
118: { "sect4", NODE_SECTION },
1.36 schwarze 119: { "sgmltag", NODE_MARKUP },
1.15 schwarze 120: { "simpara", NODE_PARA },
1.13 schwarze 121: { "structfield", NODE_PARAMETER },
122: { "structname", NODE_TYPE },
1.7 schwarze 123: { "surname", NODE_PERSONNAME },
1.12 schwarze 124: { "symbol", NODE_CONSTANT },
1.48 schwarze 125: { "tag", NODE_MARKUP },
1.3 schwarze 126: { "trademark", NODE_IGNORE },
1.18 schwarze 127: { "ulink", NODE_LINK },
1.13 schwarze 128: { "userinput", NODE_LITERAL },
1.43 schwarze 129: { "year", NODE_IGNORE },
1.5 schwarze 130: { NULL, NODE_IGNORE }
1.1 schwarze 131: };
132:
/*
 * A named XML character entity and the roff(7) text replacing it.
 */
struct entity {
	const char	*name;   /* Entity name without "&" and ";". */
	const char	*roff;   /* Replacement text for the output. */
};

/*
 * XML character entity references found in the wild.
 * Those that don't have an exact mandoc_char(7) representation
 * are approximated, and the desired codepoint is given as a comment.
 * Encoding them as \\[u...] would leave -Tascii out in the cold.
 * The table is searched linearly and ends with a NULL entry.
 */
static const struct entity entities[] = {
	{ "alpha",	"\\(*a" },
	{ "amp",	"&" },
	{ "apos",	"'" },
	{ "auml",	"\\(:a" },
	{ "beta",	"\\(*b" },
	{ "circ",	"^" },		/* U+02C6 */
	{ "copy",	"\\(co" },
	{ "dagger",	"\\(dg" },
	{ "Delta",	"\\(*D" },
	{ "eacute",	"\\('e" },
	{ "emsp",	"\\ " },	/* U+2003 */
	{ "gt",		">" },
	{ "hairsp",	"\\^" },
	{ "kappa",	"\\(*k" },
	{ "larr",	"\\(<-" },
	{ "ldquo",	"\\(lq" },
	{ "le",		"\\(<=" },
	{ "lowbar",	"_" },
	{ "lsqb",	"[" },
	{ "lt",		"<" },
	{ "mdash",	"\\(em" },
	{ "minus",	"\\-" },
	{ "ndash",	"\\(en" },
	{ "nbsp",	"\\ " },
	{ "num",	"#" },
	{ "oslash",	"\\(/o" },
	{ "ouml",	"\\(:o" },
	{ "percnt",	"%" },
	{ "quot",	"\\(dq" },
	{ "rarr",	"\\(->" },
	{ "rArr",	"\\(rA" },
	{ "rdquo",	"\\(rq" },
	{ "reg",	"\\(rg" },
	{ "rho",	"\\(*r" },
	{ "rsqb",	"]" },
	{ "sigma",	"\\(*s" },
	{ "shy",	"\\&" },	/* U+00AD */
	{ "tau",	"\\(*t" },
	{ "tilde",	"\\[u02DC]" },
	{ "times",	"\\[tmu]" },
	{ "uuml",	"\\(:u" },
	{ NULL,		NULL }
};
188:
/* Forward declarations of the tokenizer and of the read loop. */
static size_t	 parse_string(struct parse *p, char *b, size_t rlen,
		    enum pstate *pstate, int refill);
static void	 parse_fd(struct parse *p, int fd);
1.23 schwarze 192:
193:
1.6 schwarze 194: static void
1.29 schwarze 195: fatal(struct parse *p)
196: {
197: fprintf(stderr, "%s:%d:%d: FATAL: ", p->fname, p->line, p->col);
198: perror(NULL);
199: exit(6);
200: }
201:
202: static void
1.6 schwarze 203: error_msg(struct parse *p, const char *fmt, ...)
204: {
205: va_list ap;
206:
1.29 schwarze 207: fprintf(stderr, "%s:%d:%d: ERROR: ", p->fname, p->line, p->col);
1.6 schwarze 208: va_start(ap, fmt);
209: vfprintf(stderr, fmt, ap);
210: va_end(ap);
211: fputc('\n', stderr);
1.29 schwarze 212: p->tree->flags |= TREE_ERROR;
1.6 schwarze 213: }
214:
215: static void
216: warn_msg(struct parse *p, const char *fmt, ...)
217: {
218: va_list ap;
219:
1.23 schwarze 220: if ((p->flags & PFLAG_WARN) == 0)
1.6 schwarze 221: return;
222:
1.29 schwarze 223: fprintf(stderr, "%s:%d:%d: WARNING: ", p->fname, p->line, p->col);
1.6 schwarze 224: va_start(ap, fmt);
225: vfprintf(stderr, fmt, ap);
226: va_end(ap);
227: fputc('\n', stderr);
1.29 schwarze 228: p->tree->flags |= TREE_WARN;
1.6 schwarze 229: }
230:
1.1 schwarze 231: /*
232: * Process a string of characters.
233: * If a text node is already open, append to it.
234: * Otherwise, create a new one as a child of the current node.
235: */
236: static void
1.35 schwarze 237: xml_text(struct parse *p, const char *word, int sz)
1.1 schwarze 238: {
1.35 schwarze 239: struct pnode *n, *np;
1.32 schwarze 240: size_t oldsz, newsz;
1.35 schwarze 241: int i;
1.1 schwarze 242:
1.32 schwarze 243: assert(sz > 0);
1.30 schwarze 244: if (p->del > 0)
1.1 schwarze 245: return;
246:
1.32 schwarze 247: if ((n = p->cur) == NULL) {
1.35 schwarze 248: error_msg(p, "discarding text before document: %.*s",
249: sz, word);
1.5 schwarze 250: return;
251: }
252:
1.35 schwarze 253: /* Append to the current text node, if one is open. */
254:
255: if (n->node == NODE_TEXT) {
256: oldsz = strlen(n->b);
257: newsz = oldsz + sz;
258: if (oldsz && (p->flags & PFLAG_SPC))
259: newsz++;
260: if ((n->b = realloc(n->b, newsz + 1)) == NULL)
1.30 schwarze 261: fatal(p);
1.35 schwarze 262: if (oldsz && (p->flags & PFLAG_SPC))
263: n->b[oldsz++] = ' ';
264: memcpy(n->b + oldsz, word, sz);
265: n->b[newsz] = '\0';
1.51 schwarze 266: p->flags &= ~(PFLAG_LINE | PFLAG_SPC);
1.35 schwarze 267: return;
1.1 schwarze 268: }
269:
1.35 schwarze 270: if (p->tree->flags & TREE_CLOSED && n == p->tree->root)
1.30 schwarze 271: warn_msg(p, "text after end of document: %.*s", sz, word);
1.5 schwarze 272:
1.35 schwarze 273: /* Create a new text node. */
1.1 schwarze 274:
1.35 schwarze 275: if ((n = pnode_alloc(p->cur)) == NULL)
1.30 schwarze 276: fatal(p);
1.35 schwarze 277: n->node = NODE_TEXT;
1.51 schwarze 278: n->flags = ((p->flags & PFLAG_LINE) ? NFLAG_LINE : 0) |
279: ((p->flags & PFLAG_SPC) ? NFLAG_SPC : 0);
280: p->flags &= ~(PFLAG_LINE | PFLAG_SPC);
1.35 schwarze 281:
282: /*
1.39 schwarze 283: * If this node follows an in-line macro without intervening
1.35 schwarze 284: * whitespace, keep the text in it as short as possible,
285: * and do not keep it open.
286: */
287:
1.51 schwarze 288: np = n->flags & NFLAG_SPC ? NULL : TAILQ_PREV(n, pnodeq, child);
1.39 schwarze 289: while (np != NULL) {
290: switch (pnode_class(np->node)) {
291: case CLASS_VOID:
292: case CLASS_TEXT:
293: case CLASS_BLOCK:
1.45 schwarze 294: case CLASS_NOFILL:
1.39 schwarze 295: np = NULL;
296: break;
297: case CLASS_TRANS:
298: np = TAILQ_LAST(&np->childq, pnodeq);
299: continue;
300: case CLASS_LINE:
301: case CLASS_ENCL:
302: break;
303: }
304: break;
305: }
306: if (np != NULL) {
1.35 schwarze 307: i = 0;
308: while (i < sz && !isspace((unsigned char)word[i]))
309: i++;
310: if ((n->b = strndup(word, i)) == NULL)
311: fatal(p);
312: if (i == sz)
313: return;
314: while (i < sz && isspace((unsigned char)word[i]))
315: i++;
316: if (i == sz) {
317: p->flags |= PFLAG_SPC;
318: return;
319: }
320:
321: /* Put any remaining text into a second node. */
322:
323: if ((n = pnode_alloc(p->cur)) == NULL)
324: fatal(p);
325: n->node = NODE_TEXT;
1.51 schwarze 326: n->flags |= NFLAG_SPC;
1.35 schwarze 327: word += i;
328: sz -= i;
329: }
330: if ((n->b = strndup(word, sz)) == NULL)
331: fatal(p);
332:
333: /* The new node remains open for later pnode_closetext(). */
334:
335: p->cur = n;
1.1 schwarze 336: }
337:
1.16 schwarze 338: /*
339: * Close out the text node and strip trailing whitespace, if one is open.
340: */
1.1 schwarze 341: static void
1.37 schwarze 342: pnode_closetext(struct parse *p, int check_last_word)
1.1 schwarze 343: {
1.16 schwarze 344: struct pnode *n;
1.37 schwarze 345: char *cp, *last_word;
1.16 schwarze 346:
347: if ((n = p->cur) == NULL || n->node != NODE_TEXT)
348: return;
349: p->cur = n->parent;
1.32 schwarze 350: for (cp = strchr(n->b, '\0');
351: cp > n->b && isspace((unsigned char)cp[-1]);
352: *--cp = '\0')
1.23 schwarze 353: p->flags |= PFLAG_SPC;
1.37 schwarze 354:
355: if (p->flags & PFLAG_SPC || !check_last_word)
356: return;
357:
358: /*
359: * Find the beginning of the last word
360: * and delete whitespace before it.
361: */
362:
363: while (cp > n->b && !isspace((unsigned char)cp[-1]))
364: cp--;
365: if (cp == n->b)
366: return;
367:
368: last_word = cp;
369: while (cp > n->b && isspace((unsigned char)cp[-1]))
370: *--cp = '\0';
371:
372: /* Move the last word into its own node, for use with .Pf. */
373:
374: if ((n = pnode_alloc(p->cur)) == NULL)
375: fatal(p);
376: n->node = NODE_TEXT;
1.51 schwarze 377: n->flags |= NFLAG_SPC;
1.37 schwarze 378: if ((n->b = strdup(last_word)) == NULL)
379: fatal(p);
1.1 schwarze 380: }
381:
1.9 schwarze 382: static void
383: xml_entity(struct parse *p, const char *name)
384: {
385: const struct entity *entity;
1.30 schwarze 386: struct pnode *n;
1.23 schwarze 387: const char *ccp;
388: char *cp;
1.49 schwarze 389: unsigned int codepoint;
1.23 schwarze 390: enum pstate pstate;
1.9 schwarze 391:
392: if (p->del > 0)
393: return;
394:
395: if (p->cur == NULL) {
396: error_msg(p, "discarding entity before document: &%s;", name);
397: return;
398: }
399:
1.37 schwarze 400: pnode_closetext(p, 0);
1.9 schwarze 401:
402: if (p->tree->flags & TREE_CLOSED && p->cur == p->tree->root)
403: warn_msg(p, "entity after end of document: &%s;", name);
404:
405: for (entity = entities; entity->name != NULL; entity++)
406: if (strcmp(name, entity->name) == 0)
407: break;
408:
409: if (entity->roff == NULL) {
1.23 schwarze 410: if (p->doctype != NULL) {
1.30 schwarze 411: TAILQ_FOREACH(n, &p->doctype->childq, child) {
412: if ((ccp = pnode_getattr_raw(n,
1.23 schwarze 413: ATTRKEY_NAME, NULL)) == NULL ||
1.25 schwarze 414: strcmp(ccp, name) != 0)
415: continue;
1.30 schwarze 416: if ((ccp = pnode_getattr_raw(n,
1.25 schwarze 417: ATTRKEY_SYSTEM, NULL)) != NULL) {
418: parse_file(p, -1, ccp);
1.51 schwarze 419: p->flags &= ~(PFLAG_LINE | PFLAG_SPC);
1.25 schwarze 420: return;
421: }
1.30 schwarze 422: if ((ccp = pnode_getattr_raw(n,
1.23 schwarze 423: ATTRKEY_DEFINITION, NULL)) == NULL)
424: continue;
1.29 schwarze 425: if ((cp = strdup(ccp)) == NULL)
426: fatal(p);
1.23 schwarze 427: pstate = PARSE_ELEM;
428: parse_string(p, cp, strlen(cp), &pstate, 0);
1.51 schwarze 429: p->flags &= ~(PFLAG_LINE | PFLAG_SPC);
1.23 schwarze 430: free(cp);
431: return;
432: }
433: }
1.49 schwarze 434: if (*name == '#') {
435: codepoint = strtonum(name + 1, 0, 0x10ffff, &ccp);
436: if (ccp == NULL) {
437: if ((n = pnode_alloc(p->cur)) == NULL ||
438: asprintf(&n->b, "\\[u%4.4X]",
439: codepoint) < 0)
440: fatal(p);
441: goto done;
442: }
443: }
1.9 schwarze 444: error_msg(p, "unknown entity &%s;", name);
445: return;
446: }
447:
448: /* Create, append, and close out an entity node. */
1.34 schwarze 449: if ((n = pnode_alloc(p->cur)) == NULL ||
1.32 schwarze 450: (n->b = strdup(entity->roff)) == NULL)
1.29 schwarze 451: fatal(p);
1.49 schwarze 452: done:
1.30 schwarze 453: n->node = NODE_ESCAPE;
1.51 schwarze 454: n->flags = ((p->flags & PFLAG_LINE) ? NFLAG_LINE : 0) |
455: ((p->flags & PFLAG_SPC) ? NFLAG_SPC : 0);
456: p->flags &= ~(PFLAG_LINE | PFLAG_SPC);
1.9 schwarze 457: }
458:
1.1 schwarze 459: /*
1.39 schwarze 460: * Parse an element name.
461: */
462: static enum nodeid
463: xml_name2node(struct parse *p, const char *name)
464: {
465: const struct alias *alias;
466: enum nodeid node;
467:
468: if ((node = pnode_parse(name)) < NODE_UNKNOWN)
469: return node;
470:
471: for (alias = aliases; alias->name != NULL; alias++)
472: if (strcmp(alias->name, name) == 0)
473: return alias->node;
474:
475: return NODE_UNKNOWN;
476: }
477:
478: /*
1.1 schwarze 479: * Begin an element.
480: */
481: static void
1.30 schwarze 482: xml_elem_start(struct parse *p, const char *name)
1.1 schwarze 483: {
1.30 schwarze 484: struct pnode *n;
1.1 schwarze 485:
1.4 schwarze 486: /*
487: * An ancestor is excluded from the tree;
488: * keep track of the number of levels excluded.
489: */
1.30 schwarze 490: if (p->del > 0) {
1.23 schwarze 491: if (*name != '!' && *name != '?')
1.30 schwarze 492: p->del++;
1.4 schwarze 493: return;
494: }
495:
1.39 schwarze 496: switch (p->ncur = xml_name2node(p, name)) {
1.4 schwarze 497: case NODE_DELETE_WARN:
1.30 schwarze 498: warn_msg(p, "skipping element <%s>", name);
1.2 schwarze 499: /* FALLTHROUGH */
1.4 schwarze 500: case NODE_DELETE:
1.30 schwarze 501: p->del = 1;
1.4 schwarze 502: /* FALLTHROUGH */
1.2 schwarze 503: case NODE_IGNORE:
504: return;
1.39 schwarze 505: case NODE_UNKNOWN:
506: if (*name != '!' && *name != '?')
507: error_msg(p, "unknown element <%s>", name);
508: return;
1.2 schwarze 509: default:
510: break;
511: }
1.1 schwarze 512:
1.30 schwarze 513: if (p->tree->flags & TREE_CLOSED && p->cur->parent == NULL)
514: warn_msg(p, "element after end of document: <%s>", name);
1.5 schwarze 515:
1.39 schwarze 516: switch (pnode_class(p->ncur)) {
517: case CLASS_LINE:
518: case CLASS_ENCL:
519: pnode_closetext(p, 1);
520: break;
521: default:
522: pnode_closetext(p, 0);
523: break;
524: }
525:
1.34 schwarze 526: if ((n = pnode_alloc(p->cur)) == NULL)
1.30 schwarze 527: fatal(p);
1.17 schwarze 528:
529: /*
1.39 schwarze 530: * Some elements are self-closing.
1.17 schwarze 531: * Nodes that begin a new macro or request line or start by
532: * printing text always want whitespace before themselves.
533: */
534:
1.39 schwarze 535: switch (n->node = p->ncur) {
1.23 schwarze 536: case NODE_DOCTYPE:
537: case NODE_ENTITY:
538: case NODE_SBR:
1.48 schwarze 539: case NODE_VOID:
1.30 schwarze 540: p->flags |= PFLAG_EEND;
1.17 schwarze 541: break;
542: default:
1.39 schwarze 543: break;
544: }
545: switch (pnode_class(p->ncur)) {
546: case CLASS_LINE:
547: case CLASS_ENCL:
1.51 schwarze 548: n->flags = ((p->flags & PFLAG_LINE) ? NFLAG_LINE : 0) |
549: ((p->flags & PFLAG_SPC) ? NFLAG_SPC : 0);
1.17 schwarze 550: break;
1.45 schwarze 551: case CLASS_NOFILL:
552: p->nofill++;
553: /* FALLTHROUGH */
1.39 schwarze 554: default:
1.51 schwarze 555: n->flags |= NFLAG_SPC;
1.39 schwarze 556: break;
1.17 schwarze 557: }
1.30 schwarze 558: p->cur = n;
559: if (n->node == NODE_DOCTYPE) {
560: if (p->doctype == NULL)
561: p->doctype = n;
1.23 schwarze 562: else
1.30 schwarze 563: error_msg(p, "duplicate doctype");
564: } else if (n->parent == NULL && p->tree->root == NULL)
565: p->tree->root = n;
1.5 schwarze 566: }
567:
568: static void
1.30 schwarze 569: xml_attrkey(struct parse *p, const char *name)
1.5 schwarze 570: {
1.30 schwarze 571: struct pattr *a;
1.23 schwarze 572: const char *value;
1.5 schwarze 573: enum attrkey key;
1.1 schwarze 574:
1.47 schwarze 575: if (p->del > 0 || p->ncur >= NODE_UNKNOWN || *name == '\0')
1.5 schwarze 576: return;
1.23 schwarze 577:
1.30 schwarze 578: if ((p->ncur == NODE_DOCTYPE || p->ncur == NODE_ENTITY) &&
579: TAILQ_FIRST(&p->cur->attrq) == NULL) {
1.23 schwarze 580: value = name;
581: name = "NAME";
582: } else
583: value = NULL;
584:
1.5 schwarze 585: if ((key = attrkey_parse(name)) == ATTRKEY__MAX) {
1.30 schwarze 586: p->flags &= ~PFLAG_ATTR;
1.5 schwarze 587: return;
588: }
1.30 schwarze 589: if ((a = calloc(1, sizeof(*a))) == NULL)
590: fatal(p);
1.29 schwarze 591:
1.30 schwarze 592: a->key = key;
593: a->val = ATTRVAL__MAX;
1.23 schwarze 594: if (value == NULL) {
1.30 schwarze 595: a->rawval = NULL;
596: p->flags |= PFLAG_ATTR;
1.23 schwarze 597: } else {
1.30 schwarze 598: if ((a->rawval = strdup(value)) == NULL)
599: fatal(p);
600: p->flags &= ~PFLAG_ATTR;
601: }
602: TAILQ_INSERT_TAIL(&p->cur->attrq, a, child);
603: if (p->ncur == NODE_ENTITY && key == ATTRKEY_NAME)
604: xml_attrkey(p, "DEFINITION");
1.5 schwarze 605: }
606:
607: static void
1.30 schwarze 608: xml_attrval(struct parse *p, const char *name)
1.5 schwarze 609: {
1.30 schwarze 610: struct pattr *a;
1.5 schwarze 611:
1.47 schwarze 612: if (p->del > 0 || p->ncur >= NODE_UNKNOWN ||
1.30 schwarze 613: (p->flags & PFLAG_ATTR) == 0)
1.5 schwarze 614: return;
1.30 schwarze 615: if ((a = TAILQ_LAST(&p->cur->attrq, pattrq)) == NULL)
1.5 schwarze 616: return;
1.30 schwarze 617: if ((a->val = attrval_parse(name)) == ATTRVAL__MAX &&
618: (a->rawval = strdup(name)) == NULL)
619: fatal(p);
620: p->flags &= ~PFLAG_ATTR;
1.1 schwarze 621: }
622:
623: /*
624: * Roll up the parse tree.
625: * If we're at a text node, roll that one up first.
626: */
627: static void
1.31 schwarze 628: xml_elem_end(struct parse *p, const char *name)
1.1 schwarze 629: {
1.26 schwarze 630: struct pnode *n;
631: const char *cp;
1.5 schwarze 632: enum nodeid node;
1.1 schwarze 633:
1.4 schwarze 634: /*
635: * An ancestor is excluded from the tree;
636: * keep track of the number of levels excluded.
637: */
1.31 schwarze 638: if (p->del > 1) {
639: p->del--;
1.4 schwarze 640: return;
641: }
642:
1.31 schwarze 643: if (p->del == 0)
1.37 schwarze 644: pnode_closetext(p, 0);
1.2 schwarze 645:
1.50 schwarze 646: n = p->cur;
1.39 schwarze 647: node = name == NULL ? p->ncur : xml_name2node(p, name);
1.2 schwarze 648:
1.5 schwarze 649: switch (node) {
1.4 schwarze 650: case NODE_DELETE_WARN:
651: case NODE_DELETE:
1.31 schwarze 652: if (p->del > 0)
653: p->del--;
1.4 schwarze 654: break;
1.2 schwarze 655: case NODE_IGNORE:
1.39 schwarze 656: case NODE_UNKNOWN:
1.26 schwarze 657: break;
658: case NODE_INCLUDE:
1.50 schwarze 659: p->cur = n->parent;
1.26 schwarze 660: cp = pnode_getattr_raw(n, ATTRKEY_HREF, NULL);
661: if (cp == NULL)
1.31 schwarze 662: error_msg(p, "<xi:include> element "
1.26 schwarze 663: "without href attribute");
664: else
1.31 schwarze 665: parse_file(p, -1, cp);
1.26 schwarze 666: pnode_unlink(n);
1.51 schwarze 667: p->flags &= ~(PFLAG_LINE | PFLAG_SPC);
1.2 schwarze 668: break;
1.23 schwarze 669: case NODE_DOCTYPE:
1.32 schwarze 670: case NODE_SBR:
1.48 schwarze 671: case NODE_VOID:
1.31 schwarze 672: p->flags &= ~PFLAG_EEND;
1.23 schwarze 673: /* FALLTHROUGH */
1.2 schwarze 674: default:
1.50 schwarze 675: if (n == NULL || node != n->node) {
1.31 schwarze 676: warn_msg(p, "element not open: </%s>", name);
1.5 schwarze 677: break;
678: }
1.45 schwarze 679: if (pnode_class(node) == CLASS_NOFILL)
680: p->nofill--;
1.5 schwarze 681:
682: /*
683: * Refrain from actually closing the document element.
684: * If no more content follows, no harm is done, but if
685: * some content still follows, simply processing it is
686: * obviously better than discarding it or crashing.
687: */
688:
1.50 schwarze 689: if (n->parent != NULL || node == NODE_DOCTYPE) {
690: p->cur = n->parent;
1.31 schwarze 691: if (p->cur != NULL)
692: p->ncur = p->cur->node;
1.23 schwarze 693: } else
1.31 schwarze 694: p->tree->flags |= TREE_CLOSED;
1.51 schwarze 695: p->flags &= ~(PFLAG_LINE | PFLAG_SPC);
1.50 schwarze 696:
697: /* Include a file containing entity declarations. */
698:
699: if (node == NODE_ENTITY && strcmp("%",
700: pnode_getattr_raw(n, ATTRKEY_NAME, "")) == 0 &&
701: (cp = pnode_getattr_raw(n, ATTRKEY_SYSTEM, NULL)) != NULL)
702: parse_file(p, -1, cp);
703:
1.4 schwarze 704: break;
1.2 schwarze 705: }
1.31 schwarze 706: assert(p->del == 0);
1.1 schwarze 707: }
708:
709: struct parse *
710: parse_alloc(int warn)
711: {
712: struct parse *p;
713:
714: if ((p = calloc(1, sizeof(*p))) == NULL)
715: return NULL;
716:
717: if ((p->tree = calloc(1, sizeof(*p->tree))) == NULL) {
718: free(p);
719: return NULL;
720: }
1.23 schwarze 721: if (warn)
722: p->flags |= PFLAG_WARN;
723: else
724: p->flags &= ~PFLAG_WARN;
1.1 schwarze 725: return p;
726: }
727:
728: void
729: parse_free(struct parse *p)
730: {
731: if (p == NULL)
732: return;
733: if (p->tree != NULL) {
734: pnode_unlink(p->tree->root);
735: free(p->tree);
736: }
737: free(p);
738: }
739:
1.14 schwarze 740: static void
741: increment(struct parse *p, char *b, size_t *pend, int refill)
742: {
743: if (refill) {
744: if (b[*pend] == '\n') {
745: p->nline++;
746: p->ncol = 1;
747: } else
748: p->ncol++;
749: }
750: ++*pend;
751: }
752:
1.5 schwarze 753: /*
754: * Advance the pend pointer to the next character in the charset.
755: * If the charset starts with a space, it stands for any whitespace.
756: * Update the new input file position, used for messages.
757: * Do not overrun the buffer b of length rlen.
758: * When reaching the end, NUL-terminate the buffer and return 1;
759: * otherwise, return 0.
760: */
761: static int
762: advance(struct parse *p, char *b, size_t rlen, size_t *pend,
1.14 schwarze 763: const char *charset, int refill)
1.5 schwarze 764: {
765: int space;
766:
767: if (*charset == ' ') {
768: space = 1;
769: charset++;
770: } else
771: space = 0;
772:
1.14 schwarze 773: if (refill) {
774: p->nline = p->line;
775: p->ncol = p->col;
776: }
1.5 schwarze 777: while (*pend < rlen) {
778: if (space && isspace((unsigned char)b[*pend]))
779: break;
780: if (strchr(charset, b[*pend]) != NULL)
781: break;
1.14 schwarze 782: increment(p, b, pend, refill);
1.5 schwarze 783: }
784: if (*pend == rlen) {
785: b[rlen] = '\0';
1.14 schwarze 786: return refill;
1.5 schwarze 787: } else
788: return 0;
789: }
790:
1.14 schwarze 791: size_t
792: parse_string(struct parse *p, char *b, size_t rlen,
793: enum pstate *pstate, int refill)
794: {
795: char *cp;
1.45 schwarze 796: size_t pws; /* Parse offset including whitespace. */
1.14 schwarze 797: size_t poff; /* Parse offset in b[]. */
798: size_t pend; /* Offset of the end of the current word. */
799: int elem_end;
800:
1.45 schwarze 801: pend = pws = 0;
1.14 schwarze 802: for (;;) {
803:
804: /* Proceed to the next token, skipping whitespace. */
805:
806: if (refill) {
807: p->line = p->nline;
808: p->col = p->ncol;
809: }
810: if ((poff = pend) == rlen)
811: break;
812: if (isspace((unsigned char)b[pend])) {
1.23 schwarze 813: p->flags |= PFLAG_SPC;
1.51 schwarze 814: if (b[pend] == '\n') {
815: p->flags |= PFLAG_LINE;
1.45 schwarze 816: pws = pend + 1;
1.51 schwarze 817: }
1.14 schwarze 818: increment(p, b, &pend, refill);
819: continue;
820: }
821:
822: /*
823: * The following four cases (ARG, TAG, and starting an
824: * entity or a tag) all parse a word or quoted string.
825: * If that extends beyond the read buffer and the last
826: * read(2) still got data, they all break out of the
827: * token loop to request more data from the read loop.
828: *
829: * Also, three of them detect self-closing tags, those
830: * ending with "/>", setting the flag elem_end and
831: * calling xml_elem_end() at the very end, after
832: * handling the attribute value, attribute name, or
833: * tag name, respectively.
834: */
835:
836: /* Parse an attribute value. */
837:
838: if (*pstate >= PARSE_ARG) {
839: if (*pstate == PARSE_ARG &&
840: (b[pend] == '\'' || b[pend] == '"')) {
841: *pstate = b[pend] == '"' ?
842: PARSE_DQ : PARSE_SQ;
843: increment(p, b, &pend, refill);
844: continue;
845: }
846: if (advance(p, b, rlen, &pend,
847: *pstate == PARSE_DQ ? "\"" :
848: *pstate == PARSE_SQ ? "'" : " >", refill))
849: break;
850: *pstate = PARSE_TAG;
851: elem_end = 0;
852: if (b[pend] == '>') {
853: *pstate = PARSE_ELEM;
854: if (pend > 0 && b[pend - 1] == '/') {
855: b[pend - 1] = '\0';
856: elem_end = 1;
857: }
1.23 schwarze 858: if (p->flags & PFLAG_EEND)
859: elem_end = 1;
1.14 schwarze 860: }
861: b[pend] = '\0';
862: if (pend < rlen)
863: increment(p, b, &pend, refill);
864: xml_attrval(p, b + poff);
865: if (elem_end)
866: xml_elem_end(p, NULL);
867:
868: /* Look for an attribute name. */
869:
870: } else if (*pstate == PARSE_TAG) {
1.23 schwarze 871: switch (p->ncur) {
872: case NODE_DOCTYPE:
873: if (b[pend] == '[') {
874: *pstate = PARSE_ELEM;
875: increment(p, b, &pend, refill);
876: continue;
877: }
878: /* FALLTHROUGH */
879: case NODE_ENTITY:
880: if (b[pend] == '"' || b[pend] == '\'') {
881: *pstate = PARSE_ARG;
882: continue;
883: }
884: break;
885: default:
886: break;
887: }
1.14 schwarze 888: if (advance(p, b, rlen, &pend, " =>", refill))
889: break;
890: elem_end = 0;
891: switch (b[pend]) {
892: case '>':
893: *pstate = PARSE_ELEM;
894: if (pend > 0 && b[pend - 1] == '/') {
895: b[pend - 1] = '\0';
896: elem_end = 1;
897: }
1.23 schwarze 898: if (p->flags & PFLAG_EEND)
899: elem_end = 1;
1.14 schwarze 900: break;
901: case '=':
902: *pstate = PARSE_ARG;
903: break;
904: default:
905: break;
906: }
907: b[pend] = '\0';
908: if (pend < rlen)
909: increment(p, b, &pend, refill);
910: xml_attrkey(p, b + poff);
911: if (elem_end)
912: xml_elem_end(p, NULL);
913:
914: /* Begin an opening or closing tag. */
915:
916: } else if (b[poff] == '<') {
917: if (advance(p, b, rlen, &pend, " >", refill))
918: break;
919: if (pend > poff + 3 &&
920: strncmp(b + poff, "<!--", 4) == 0) {
921:
922: /* Skip a comment. */
923:
924: cp = strstr(b + pend - 2, "-->");
925: if (cp == NULL) {
926: if (refill)
927: break;
928: cp = b + rlen;
929: } else
930: cp += 3;
931: while (b + pend < cp)
932: increment(p, b, &pend, refill);
933: continue;
934: }
935: elem_end = 0;
936: if (b[pend] != '>')
937: *pstate = PARSE_TAG;
938: else if (pend > 0 && b[pend - 1] == '/') {
939: b[pend - 1] = '\0';
940: elem_end = 1;
941: }
942: b[pend] = '\0';
943: if (pend < rlen)
944: increment(p, b, &pend, refill);
945: if (b[++poff] == '/') {
946: elem_end = 1;
947: poff++;
1.23 schwarze 948: } else {
1.14 schwarze 949: xml_elem_start(p, b + poff);
1.23 schwarze 950: if (*pstate == PARSE_ELEM &&
951: p->flags & PFLAG_EEND)
952: elem_end = 1;
953: }
1.14 schwarze 954: if (elem_end)
955: xml_elem_end(p, b + poff);
956:
1.23 schwarze 957: /* Close a doctype. */
958:
959: } else if (p->ncur == NODE_DOCTYPE && b[poff] == ']') {
960: *pstate = PARSE_TAG;
961: increment(p, b, &pend, refill);
962:
1.14 schwarze 963: /* Process an entity. */
964:
965: } else if (b[poff] == '&') {
966: if (advance(p, b, rlen, &pend, ";", refill))
967: break;
968: b[pend] = '\0';
969: if (pend < rlen)
970: increment(p, b, &pend, refill);
971: xml_entity(p, b + poff + 1);
972:
973: /* Process text up to the next tag, entity, or EOL. */
974:
975: } else {
1.28 schwarze 976: advance(p, b, rlen, &pend,
1.33 schwarze 977: p->ncur == NODE_DOCTYPE ? "<&]\n" : "<&\n",
1.28 schwarze 978: refill);
1.45 schwarze 979: if (p->nofill)
980: poff = pws;
1.35 schwarze 981: xml_text(p, b + poff, pend - poff);
1.33 schwarze 982: if (b[pend] == '\n')
1.37 schwarze 983: pnode_closetext(p, 0);
1.14 schwarze 984: }
1.45 schwarze 985: pws = pend;
1.14 schwarze 986: }
987: return poff;
988: }
989:
1.24 schwarze 990:
991: /*
992: * The read loop.
993: * If the previous token was incomplete and asked for more input,
994: * we have to enter the read loop once more even on EOF.
995: * Once rsz is 0, incomplete tokens will no longer ask for more input
996: * but instead use whatever there is, and then exit the read loop.
997: * The minus one on the size limit for read(2) is needed such that
998: * advance() can set b[rlen] to NUL when needed.
999: */
1000: static void
1001: parse_fd(struct parse *p, int fd)
1.1 schwarze 1002: {
1003: char b[4096];
1.5 schwarze 1004: ssize_t rsz; /* Return value from read(2). */
1.14 schwarze 1005: size_t rlen; /* Number of bytes in b[]. */
1.5 schwarze 1006: size_t poff; /* Parse offset in b[]. */
1.14 schwarze 1007: enum pstate pstate;
1.1 schwarze 1008:
1.24 schwarze 1009: rlen = 0;
1.14 schwarze 1010: pstate = PARSE_ELEM;
1011: while ((rsz = read(fd, b + rlen, sizeof(b) - rlen - 1)) >= 0 &&
1012: (rlen += rsz) > 0) {
1013: poff = parse_string(p, b, rlen, &pstate, rsz > 0);
1.5 schwarze 1014: /* Buffer exhausted; shift left and re-fill. */
1015: assert(poff > 0);
1016: rlen -= poff;
1.14 schwarze 1017: memmove(b, b + poff, rlen);
1.5 schwarze 1018: }
1.24 schwarze 1019: if (rsz < 0)
1020: error_msg(p, "read: %s", strerror(errno));
1021: }
1022:
1023: /*
1024: * Open and parse a file.
1025: */
1026: struct ptree *
1027: parse_file(struct parse *p, int fd, const char *fname)
1028: {
1029: const char *save_fname;
1030: int save_line, save_col;
1031:
1032: /* Save and initialize reporting data. */
1033:
1034: save_fname = p->fname;
1035: save_line = p->nline;
1036: save_col = p->ncol;
1037: p->fname = fname;
1038: p->line = 0;
1039: p->col = 0;
1040:
1041: /* Open the file, unless it is already open. */
1042:
1043: if (fd == -1 && (fd = open(fname, O_RDONLY, 0)) == -1) {
1044: error_msg(p, "open: %s", strerror(errno));
1045: p->fname = save_fname;
1046: return p->tree;
1.5 schwarze 1047: }
1.24 schwarze 1048:
1049: /*
1050: * After opening the starting file, change to the directory it
1051: * is located in, in case it wants to include any further files,
1052: * which are typically given with relative paths in DocBook.
1053: * Do this on a best-effort basis; don't complain about failure.
1054: */
1055:
1056: if (save_fname == NULL && (fname = dirname(fname)) != NULL &&
1057: strcmp(fname, ".") != 0)
1058: (void)chdir(fname);
1059:
1060: /* Run the read loop. */
1061:
1062: p->nline = 1;
1063: p->ncol = 1;
1064: parse_fd(p, fd);
1065:
1066: /* On the top level, finalize the parse tree. */
1067:
1068: if (save_fname == NULL) {
1.37 schwarze 1069: pnode_closetext(p, 0);
1.24 schwarze 1070: if (p->tree->root == NULL)
1071: error_msg(p, "empty document");
1072: else if ((p->tree->flags & TREE_CLOSED) == 0)
1073: warn_msg(p, "document not closed");
1074: pnode_unlink(p->doctype);
1075: }
1076:
1077: /* Clean up. */
1078:
1079: if (fd != STDIN_FILENO)
1080: close(fd);
1081: p->fname = save_fname;
1082: p->nline = save_line;
1083: p->ncol = save_col;
1.1 schwarze 1084: return p->tree;
1085: }
CVSweb