Annotation of docbook2mdoc/parse.c, Revision 1.55
1.55 ! schwarze 1: /* $Id: parse.c,v 1.54 2019/04/28 15:32:05 schwarze Exp $ */
1.1 schwarze 2: /*
3: * Copyright (c) 2014 Kristaps Dzonsons <kristaps@bsd.lv>
4: * Copyright (c) 2019 Ingo Schwarze <schwarze@openbsd.org>
5: *
6: * Permission to use, copy, modify, and distribute this software for any
7: * purpose with or without fee is hereby granted, provided that the above
8: * copyright notice and this permission notice appear in all copies.
9: *
10: * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
11: * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12: * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
13: * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14: * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15: * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16: * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17: */
1.53 schwarze 18: #include <sys/types.h>
19:
1.1 schwarze 20: #include <assert.h>
21: #include <ctype.h>
1.24 schwarze 22: #include <errno.h>
23: #include <fcntl.h>
24: #include <libgen.h>
1.6 schwarze 25: #include <stdarg.h>
1.1 schwarze 26: #include <stdio.h>
1.5 schwarze 27: #include <stdlib.h>
1.1 schwarze 28: #include <string.h>
29: #include <unistd.h>
30:
1.53 schwarze 31: #include "xmalloc.h"
1.1 schwarze 32: #include "node.h"
33: #include "parse.h"
34:
35: /*
36: * The implementation of the DocBook parser.
37: */
38:
/*
 * State of the tokenizer in parse_string(), preserved across calls.
 * The order of the enumerators matters: parse_string() tests
 * "*pstate >= PARSE_ARG" to recognize all three attribute value
 * states at once.
 */
1.14 schwarze 39: enum pstate {
40: PARSE_ELEM, /* Outside of tags: expecting a tag, an entity, or text. */
41: PARSE_TAG, /* Inside a tag: expecting an attribute name. */
42: PARSE_ARG, /* Expecting an attribute value, possibly quoted. */
43: PARSE_SQ, /* Inside a single-quoted attribute value. */
44: PARSE_DQ /* Inside a double-quoted attribute value. */
45: };
46:
1.1 schwarze 47: /*
48: * Global parse state.
49: * Keep this as simple and small as possible.
50: */
51: struct parse {
52: const char *fname; /* Name of the input file. */
53: struct ptree *tree; /* Complete parse result. */
1.23 schwarze 54: struct pnode *doctype; /* First <!DOCTYPE> node seen, or NULL. */
1.1 schwarze 55: struct pnode *cur; /* Current node in the tree. */
1.5 schwarze 56: enum nodeid ncur; /* Type of the current node. */
57: int line; /* Line number in the input file. */
58: int col; /* Column number in the input file. */
59: int nline; /* Line number of next token. */
60: int ncol; /* Column number of next token. */
1.4 schwarze 61: int del; /* Levels of nested nodes being deleted. */
1.45 schwarze 62: int nofill; /* Levels of open no-fill displays. */
1.23 schwarze 63: int flags; /* Bitwise OR of the PFLAG_* values below. */
64: #define PFLAG_WARN (1 << 0) /* Print warning messages. */
1.51 schwarze 65: #define PFLAG_LINE (1 << 1) /* New line before the next element. */
66: #define PFLAG_SPC (1 << 2) /* Whitespace before the next element. */
67: #define PFLAG_ATTR (1 << 3) /* The most recent attribute is valid. */
68: #define PFLAG_EEND (1 << 4) /* This element is self-closing. */
1.1 schwarze 69: };
70:
1.39 schwarze 71: struct alias {
1.1 schwarze 72: const char *name; /* DocBook element name. */
73: enum nodeid node; /* Node type to generate. */
74: };
75:
1.39 schwarze 76: static const struct alias aliases[] = {
1.3 schwarze 77: { "acronym", NODE_IGNORE },
1.43 schwarze 78: { "affiliation", NODE_IGNORE },
1.4 schwarze 79: { "anchor", NODE_DELETE },
1.42 schwarze 80: { "application", NODE_COMMAND },
1.22 schwarze 81: { "article", NODE_SECTION },
1.41 schwarze 82: { "articleinfo", NODE_BOOKINFO },
1.22 schwarze 83: { "book", NODE_SECTION },
1.1 schwarze 84: { "chapter", NODE_SECTION },
1.44 schwarze 85: { "caption", NODE_IGNORE },
1.13 schwarze 86: { "code", NODE_LITERAL },
1.36 schwarze 87: { "computeroutput", NODE_LITERAL },
1.23 schwarze 88: { "!doctype", NODE_DOCTYPE },
1.44 schwarze 89: { "figure", NODE_IGNORE },
1.7 schwarze 90: { "firstname", NODE_PERSONNAME },
1.21 schwarze 91: { "glossary", NODE_VARIABLELIST },
92: { "glossdef", NODE_IGNORE },
93: { "glossdiv", NODE_IGNORE },
94: { "glossentry", NODE_VARLISTENTRY },
95: { "glosslist", NODE_VARIABLELIST },
1.43 schwarze 96: { "holder", NODE_IGNORE },
1.44 schwarze 97: { "imageobject", NODE_IGNORE },
1.4 schwarze 98: { "indexterm", NODE_DELETE },
1.11 schwarze 99: { "informaltable", NODE_TABLE },
1.42 schwarze 100: { "keycap", NODE_KEYSYM },
101: { "keycode", NODE_IGNORE },
1.55 ! schwarze 102: { "keycombo", NODE_IGNORE },
1.44 schwarze 103: { "mediaobject", NODE_BLOCKQUOTE },
1.43 schwarze 104: { "orgname", NODE_IGNORE },
1.40 schwarze 105: { "othercredit", NODE_AUTHOR },
1.7 schwarze 106: { "othername", NODE_PERSONNAME },
1.1 schwarze 107: { "part", NODE_SECTION },
1.3 schwarze 108: { "phrase", NODE_IGNORE },
1.4 schwarze 109: { "primary", NODE_DELETE },
1.42 schwarze 110: { "property", NODE_PARAMETER },
1.52 schwarze 111: { "reference", NODE_SECTION },
1.1 schwarze 112: { "refsect1", NODE_SECTION },
113: { "refsect2", NODE_SECTION },
114: { "refsect3", NODE_SECTION },
115: { "refsection", NODE_SECTION },
1.43 schwarze 116: { "releaseinfo", NODE_IGNORE },
1.42 schwarze 117: { "returnvalue", NODE_IGNORE },
1.4 schwarze 118: { "secondary", NODE_DELETE },
1.1 schwarze 119: { "sect1", NODE_SECTION },
120: { "sect2", NODE_SECTION },
1.46 schwarze 121: { "sect3", NODE_SECTION },
122: { "sect4", NODE_SECTION },
1.36 schwarze 123: { "sgmltag", NODE_MARKUP },
1.15 schwarze 124: { "simpara", NODE_PARA },
1.13 schwarze 125: { "structfield", NODE_PARAMETER },
126: { "structname", NODE_TYPE },
1.7 schwarze 127: { "surname", NODE_PERSONNAME },
1.12 schwarze 128: { "symbol", NODE_CONSTANT },
1.48 schwarze 129: { "tag", NODE_MARKUP },
1.3 schwarze 130: { "trademark", NODE_IGNORE },
1.18 schwarze 131: { "ulink", NODE_LINK },
1.13 schwarze 132: { "userinput", NODE_LITERAL },
1.43 schwarze 133: { "year", NODE_IGNORE },
1.5 schwarze 134: { NULL, NODE_IGNORE }
1.1 schwarze 135: };
136:
/*
 * Mapping of an XML character entity name
 * onto the roff(7) text that represents it.
 */
struct entity {
	const char	*name;
	const char	*roff;
};

/*
 * XML character entity references found in the wild.
 * Those that don't have an exact mandoc_char(7) representation
 * are approximated, and the desired codepoint is given as a comment.
 * Encoding them as \\[u...] would leave -Tascii out in the cold.
 * The table is terminated by an entry with a NULL name.
 */
static const struct entity entities[] = {
	{ .name = "alpha",	.roff = "\\(*a" },
	{ .name = "amp",	.roff = "&" },
	{ .name = "apos",	.roff = "'" },
	{ .name = "auml",	.roff = "\\(:a" },
	{ .name = "beta",	.roff = "\\(*b" },
	{ .name = "circ",	.roff = "^" },      /* U+02C6 */
	{ .name = "copy",	.roff = "\\(co" },
	{ .name = "dagger",	.roff = "\\(dg" },
	{ .name = "Delta",	.roff = "\\(*D" },
	{ .name = "eacute",	.roff = "\\('e" },
	{ .name = "emsp",	.roff = "\\ " },    /* U+2003 */
	{ .name = "gt",		.roff = ">" },
	{ .name = "hairsp",	.roff = "\\^" },
	{ .name = "kappa",	.roff = "\\(*k" },
	{ .name = "larr",	.roff = "\\(<-" },
	{ .name = "ldquo",	.roff = "\\(lq" },
	{ .name = "le",		.roff = "\\(<=" },
	{ .name = "lowbar",	.roff = "_" },
	{ .name = "lsqb",	.roff = "[" },
	{ .name = "lt",		.roff = "<" },
	{ .name = "mdash",	.roff = "\\(em" },
	{ .name = "minus",	.roff = "\\-" },
	{ .name = "ndash",	.roff = "\\(en" },
	{ .name = "nbsp",	.roff = "\\ " },
	{ .name = "num",	.roff = "#" },
	{ .name = "oslash",	.roff = "\\(/o" },
	{ .name = "ouml",	.roff = "\\(:o" },
	{ .name = "percnt",	.roff = "%" },
	{ .name = "quot",	.roff = "\\(dq" },
	{ .name = "rarr",	.roff = "\\(->" },
	{ .name = "rArr",	.roff = "\\(rA" },
	{ .name = "rdquo",	.roff = "\\(rq" },
	{ .name = "reg",	.roff = "\\(rg" },
	{ .name = "rho",	.roff = "\\(*r" },
	{ .name = "rsqb",	.roff = "]" },
	{ .name = "sigma",	.roff = "\\(*s" },
	{ .name = "shy",	.roff = "\\&" },    /* U+00AD */
	{ .name = "tau",	.roff = "\\(*t" },
	{ .name = "tilde",	.roff = "\\[u02DC]" },
	{ .name = "times",	.roff = "\\[tmu]" },
	{ .name = "uuml",	.roff = "\\(:u" },
	{ .name = NULL,		.roff = NULL }
};
192:
/*
 * Forward declarations.
 * parse_string() has to be declared early because xml_entity()
 * feeds entity definitions back into the tokenizer.
 */
1.23 schwarze 193: static size_t parse_string(struct parse *, char *, size_t,
194: enum pstate *, int);
1.24 schwarze 195: static void parse_fd(struct parse *, int);
1.23 schwarze 196:
197:
1.6 schwarze 198: static void
199: error_msg(struct parse *p, const char *fmt, ...)
200: {
201: va_list ap;
202:
1.29 schwarze 203: fprintf(stderr, "%s:%d:%d: ERROR: ", p->fname, p->line, p->col);
1.6 schwarze 204: va_start(ap, fmt);
205: vfprintf(stderr, fmt, ap);
206: va_end(ap);
207: fputc('\n', stderr);
1.29 schwarze 208: p->tree->flags |= TREE_ERROR;
1.6 schwarze 209: }
210:
211: static void
212: warn_msg(struct parse *p, const char *fmt, ...)
213: {
214: va_list ap;
215:
1.23 schwarze 216: if ((p->flags & PFLAG_WARN) == 0)
1.6 schwarze 217: return;
218:
1.29 schwarze 219: fprintf(stderr, "%s:%d:%d: WARNING: ", p->fname, p->line, p->col);
1.6 schwarze 220: va_start(ap, fmt);
221: vfprintf(stderr, fmt, ap);
222: va_end(ap);
223: fputc('\n', stderr);
1.29 schwarze 224: p->tree->flags |= TREE_WARN;
1.6 schwarze 225: }
226:
1.1 schwarze 227: /*
228: * Process a string of characters.
229: * If a text node is already open, append to it.
230: * Otherwise, create a new one as a child of the current node.
231: */
232: static void
1.35 schwarze 233: xml_text(struct parse *p, const char *word, int sz)
1.1 schwarze 234: {
1.35 schwarze 235: struct pnode *n, *np;
1.32 schwarze 236: size_t oldsz, newsz;
1.35 schwarze 237: int i;
1.1 schwarze 238:
1.32 schwarze 239: assert(sz > 0);
1.30 schwarze 240: if (p->del > 0)
1.1 schwarze 241: return;
242:
1.32 schwarze 243: if ((n = p->cur) == NULL) {
1.35 schwarze 244: error_msg(p, "discarding text before document: %.*s",
245: sz, word);
1.5 schwarze 246: return;
247: }
248:
1.35 schwarze 249: /* Append to the current text node, if one is open. */
250:
251: if (n->node == NODE_TEXT) {
252: oldsz = strlen(n->b);
253: newsz = oldsz + sz;
254: if (oldsz && (p->flags & PFLAG_SPC))
255: newsz++;
1.53 schwarze 256: n->b = xrealloc(n->b, newsz + 1);
1.35 schwarze 257: if (oldsz && (p->flags & PFLAG_SPC))
258: n->b[oldsz++] = ' ';
259: memcpy(n->b + oldsz, word, sz);
260: n->b[newsz] = '\0';
1.51 schwarze 261: p->flags &= ~(PFLAG_LINE | PFLAG_SPC);
1.35 schwarze 262: return;
1.1 schwarze 263: }
264:
1.35 schwarze 265: if (p->tree->flags & TREE_CLOSED && n == p->tree->root)
1.30 schwarze 266: warn_msg(p, "text after end of document: %.*s", sz, word);
1.5 schwarze 267:
1.35 schwarze 268: /* Create a new text node. */
1.1 schwarze 269:
1.53 schwarze 270: n = pnode_alloc(p->cur);
1.35 schwarze 271: n->node = NODE_TEXT;
1.51 schwarze 272: n->flags = ((p->flags & PFLAG_LINE) ? NFLAG_LINE : 0) |
273: ((p->flags & PFLAG_SPC) ? NFLAG_SPC : 0);
274: p->flags &= ~(PFLAG_LINE | PFLAG_SPC);
1.35 schwarze 275:
276: /*
1.39 schwarze 277: * If this node follows an in-line macro without intervening
1.35 schwarze 278: * whitespace, keep the text in it as short as possible,
279: * and do not keep it open.
280: */
281:
1.51 schwarze 282: np = n->flags & NFLAG_SPC ? NULL : TAILQ_PREV(n, pnodeq, child);
1.39 schwarze 283: while (np != NULL) {
284: switch (pnode_class(np->node)) {
285: case CLASS_VOID:
286: case CLASS_TEXT:
287: case CLASS_BLOCK:
1.45 schwarze 288: case CLASS_NOFILL:
1.39 schwarze 289: np = NULL;
290: break;
291: case CLASS_TRANS:
292: np = TAILQ_LAST(&np->childq, pnodeq);
293: continue;
294: case CLASS_LINE:
295: case CLASS_ENCL:
296: break;
297: }
298: break;
299: }
300: if (np != NULL) {
1.35 schwarze 301: i = 0;
302: while (i < sz && !isspace((unsigned char)word[i]))
303: i++;
1.53 schwarze 304: n->b = xstrndup(word, i);
1.35 schwarze 305: if (i == sz)
306: return;
307: while (i < sz && isspace((unsigned char)word[i]))
308: i++;
309: if (i == sz) {
310: p->flags |= PFLAG_SPC;
311: return;
312: }
313:
314: /* Put any remaining text into a second node. */
315:
1.53 schwarze 316: n = pnode_alloc(p->cur);
1.35 schwarze 317: n->node = NODE_TEXT;
1.51 schwarze 318: n->flags |= NFLAG_SPC;
1.35 schwarze 319: word += i;
320: sz -= i;
321: }
1.53 schwarze 322: n->b = xstrndup(word, sz);
1.35 schwarze 323:
324: /* The new node remains open for later pnode_closetext(). */
325:
326: p->cur = n;
1.1 schwarze 327: }
328:
1.16 schwarze 329: /*
330: * Close out the text node and strip trailing whitespace, if one is open.
331: */
1.1 schwarze 332: static void
1.37 schwarze 333: pnode_closetext(struct parse *p, int check_last_word)
1.1 schwarze 334: {
1.16 schwarze 335: struct pnode *n;
1.37 schwarze 336: char *cp, *last_word;
1.16 schwarze 337:
338: if ((n = p->cur) == NULL || n->node != NODE_TEXT)
339: return;
340: p->cur = n->parent;
1.32 schwarze 341: for (cp = strchr(n->b, '\0');
342: cp > n->b && isspace((unsigned char)cp[-1]);
343: *--cp = '\0')
1.23 schwarze 344: p->flags |= PFLAG_SPC;
1.37 schwarze 345:
346: if (p->flags & PFLAG_SPC || !check_last_word)
347: return;
348:
349: /*
350: * Find the beginning of the last word
351: * and delete whitespace before it.
352: */
353:
354: while (cp > n->b && !isspace((unsigned char)cp[-1]))
355: cp--;
356: if (cp == n->b)
357: return;
358:
359: last_word = cp;
360: while (cp > n->b && isspace((unsigned char)cp[-1]))
361: *--cp = '\0';
362:
363: /* Move the last word into its own node, for use with .Pf. */
364:
1.54 schwarze 365: n = pnode_alloc_text(p->cur, last_word);
1.51 schwarze 366: n->flags |= NFLAG_SPC;
1.1 schwarze 367: }
368:
1.9 schwarze 369: static void
370: xml_entity(struct parse *p, const char *name)
371: {
372: const struct entity *entity;
1.30 schwarze 373: struct pnode *n;
1.23 schwarze 374: const char *ccp;
375: char *cp;
1.49 schwarze 376: unsigned int codepoint;
1.23 schwarze 377: enum pstate pstate;
1.9 schwarze 378:
379: if (p->del > 0)
380: return;
381:
382: if (p->cur == NULL) {
383: error_msg(p, "discarding entity before document: &%s;", name);
384: return;
385: }
386:
1.37 schwarze 387: pnode_closetext(p, 0);
1.9 schwarze 388:
389: if (p->tree->flags & TREE_CLOSED && p->cur == p->tree->root)
390: warn_msg(p, "entity after end of document: &%s;", name);
391:
392: for (entity = entities; entity->name != NULL; entity++)
393: if (strcmp(name, entity->name) == 0)
394: break;
395:
396: if (entity->roff == NULL) {
1.23 schwarze 397: if (p->doctype != NULL) {
1.30 schwarze 398: TAILQ_FOREACH(n, &p->doctype->childq, child) {
399: if ((ccp = pnode_getattr_raw(n,
1.23 schwarze 400: ATTRKEY_NAME, NULL)) == NULL ||
1.25 schwarze 401: strcmp(ccp, name) != 0)
402: continue;
1.30 schwarze 403: if ((ccp = pnode_getattr_raw(n,
1.25 schwarze 404: ATTRKEY_SYSTEM, NULL)) != NULL) {
405: parse_file(p, -1, ccp);
1.51 schwarze 406: p->flags &= ~(PFLAG_LINE | PFLAG_SPC);
1.25 schwarze 407: return;
408: }
1.30 schwarze 409: if ((ccp = pnode_getattr_raw(n,
1.23 schwarze 410: ATTRKEY_DEFINITION, NULL)) == NULL)
411: continue;
1.53 schwarze 412: cp = xstrdup(ccp);
1.23 schwarze 413: pstate = PARSE_ELEM;
414: parse_string(p, cp, strlen(cp), &pstate, 0);
1.51 schwarze 415: p->flags &= ~(PFLAG_LINE | PFLAG_SPC);
1.23 schwarze 416: free(cp);
417: return;
418: }
419: }
1.49 schwarze 420: if (*name == '#') {
421: codepoint = strtonum(name + 1, 0, 0x10ffff, &ccp);
422: if (ccp == NULL) {
1.53 schwarze 423: n = pnode_alloc(p->cur);
424: xasprintf(&n->b, "\\[u%4.4X]", codepoint);
1.49 schwarze 425: goto done;
426: }
427: }
1.9 schwarze 428: error_msg(p, "unknown entity &%s;", name);
429: return;
430: }
431:
432: /* Create, append, and close out an entity node. */
1.53 schwarze 433: n = pnode_alloc(p->cur);
434: n->b = xstrdup(entity->roff);
1.49 schwarze 435: done:
1.30 schwarze 436: n->node = NODE_ESCAPE;
1.51 schwarze 437: n->flags = ((p->flags & PFLAG_LINE) ? NFLAG_LINE : 0) |
438: ((p->flags & PFLAG_SPC) ? NFLAG_SPC : 0);
439: p->flags &= ~(PFLAG_LINE | PFLAG_SPC);
1.9 schwarze 440: }
441:
1.1 schwarze 442: /*
1.39 schwarze 443: * Parse an element name.
444: */
445: static enum nodeid
446: xml_name2node(struct parse *p, const char *name)
447: {
448: const struct alias *alias;
449: enum nodeid node;
450:
451: if ((node = pnode_parse(name)) < NODE_UNKNOWN)
452: return node;
453:
454: for (alias = aliases; alias->name != NULL; alias++)
455: if (strcmp(alias->name, name) == 0)
456: return alias->node;
457:
458: return NODE_UNKNOWN;
459: }
460:
461: /*
1.1 schwarze 462: * Begin an element.
463: */
464: static void
1.30 schwarze 465: xml_elem_start(struct parse *p, const char *name)
1.1 schwarze 466: {
1.30 schwarze 467: struct pnode *n;
1.1 schwarze 468:
1.4 schwarze 469: /*
470: * An ancestor is excluded from the tree;
471: * keep track of the number of levels excluded.
472: */
1.30 schwarze 473: if (p->del > 0) {
1.23 schwarze 474: if (*name != '!' && *name != '?')
1.30 schwarze 475: p->del++;
1.4 schwarze 476: return;
477: }
478:
1.39 schwarze 479: switch (p->ncur = xml_name2node(p, name)) {
1.4 schwarze 480: case NODE_DELETE_WARN:
1.30 schwarze 481: warn_msg(p, "skipping element <%s>", name);
1.2 schwarze 482: /* FALLTHROUGH */
1.4 schwarze 483: case NODE_DELETE:
1.30 schwarze 484: p->del = 1;
1.4 schwarze 485: /* FALLTHROUGH */
1.2 schwarze 486: case NODE_IGNORE:
487: return;
1.39 schwarze 488: case NODE_UNKNOWN:
489: if (*name != '!' && *name != '?')
490: error_msg(p, "unknown element <%s>", name);
491: return;
1.2 schwarze 492: default:
493: break;
494: }
1.1 schwarze 495:
1.30 schwarze 496: if (p->tree->flags & TREE_CLOSED && p->cur->parent == NULL)
497: warn_msg(p, "element after end of document: <%s>", name);
1.5 schwarze 498:
1.39 schwarze 499: switch (pnode_class(p->ncur)) {
500: case CLASS_LINE:
501: case CLASS_ENCL:
502: pnode_closetext(p, 1);
503: break;
504: default:
505: pnode_closetext(p, 0);
506: break;
507: }
508:
1.53 schwarze 509: n = pnode_alloc(p->cur);
1.17 schwarze 510:
511: /*
1.39 schwarze 512: * Some elements are self-closing.
1.17 schwarze 513: * Nodes that begin a new macro or request line or start by
514: * printing text always want whitespace before themselves.
515: */
516:
1.39 schwarze 517: switch (n->node = p->ncur) {
1.23 schwarze 518: case NODE_DOCTYPE:
519: case NODE_ENTITY:
520: case NODE_SBR:
1.48 schwarze 521: case NODE_VOID:
1.30 schwarze 522: p->flags |= PFLAG_EEND;
1.17 schwarze 523: break;
524: default:
1.39 schwarze 525: break;
526: }
527: switch (pnode_class(p->ncur)) {
528: case CLASS_LINE:
529: case CLASS_ENCL:
1.51 schwarze 530: n->flags = ((p->flags & PFLAG_LINE) ? NFLAG_LINE : 0) |
531: ((p->flags & PFLAG_SPC) ? NFLAG_SPC : 0);
1.17 schwarze 532: break;
1.45 schwarze 533: case CLASS_NOFILL:
534: p->nofill++;
535: /* FALLTHROUGH */
1.39 schwarze 536: default:
1.51 schwarze 537: n->flags |= NFLAG_SPC;
1.39 schwarze 538: break;
1.17 schwarze 539: }
1.30 schwarze 540: p->cur = n;
541: if (n->node == NODE_DOCTYPE) {
542: if (p->doctype == NULL)
543: p->doctype = n;
1.23 schwarze 544: else
1.30 schwarze 545: error_msg(p, "duplicate doctype");
546: } else if (n->parent == NULL && p->tree->root == NULL)
547: p->tree->root = n;
1.5 schwarze 548: }
549:
550: static void
1.30 schwarze 551: xml_attrkey(struct parse *p, const char *name)
1.5 schwarze 552: {
1.30 schwarze 553: struct pattr *a;
1.23 schwarze 554: const char *value;
1.5 schwarze 555: enum attrkey key;
1.1 schwarze 556:
1.47 schwarze 557: if (p->del > 0 || p->ncur >= NODE_UNKNOWN || *name == '\0')
1.5 schwarze 558: return;
1.23 schwarze 559:
1.30 schwarze 560: if ((p->ncur == NODE_DOCTYPE || p->ncur == NODE_ENTITY) &&
561: TAILQ_FIRST(&p->cur->attrq) == NULL) {
1.23 schwarze 562: value = name;
563: name = "NAME";
564: } else
565: value = NULL;
566:
1.5 schwarze 567: if ((key = attrkey_parse(name)) == ATTRKEY__MAX) {
1.30 schwarze 568: p->flags &= ~PFLAG_ATTR;
1.5 schwarze 569: return;
570: }
1.53 schwarze 571: a = xcalloc(1, sizeof(*a));
1.30 schwarze 572: a->key = key;
573: a->val = ATTRVAL__MAX;
1.23 schwarze 574: if (value == NULL) {
1.30 schwarze 575: a->rawval = NULL;
576: p->flags |= PFLAG_ATTR;
1.23 schwarze 577: } else {
1.53 schwarze 578: a->rawval = xstrdup(value);
1.30 schwarze 579: p->flags &= ~PFLAG_ATTR;
580: }
581: TAILQ_INSERT_TAIL(&p->cur->attrq, a, child);
582: if (p->ncur == NODE_ENTITY && key == ATTRKEY_NAME)
583: xml_attrkey(p, "DEFINITION");
1.5 schwarze 584: }
585:
586: static void
1.30 schwarze 587: xml_attrval(struct parse *p, const char *name)
1.5 schwarze 588: {
1.30 schwarze 589: struct pattr *a;
1.5 schwarze 590:
1.47 schwarze 591: if (p->del > 0 || p->ncur >= NODE_UNKNOWN ||
1.30 schwarze 592: (p->flags & PFLAG_ATTR) == 0)
1.5 schwarze 593: return;
1.30 schwarze 594: if ((a = TAILQ_LAST(&p->cur->attrq, pattrq)) == NULL)
1.5 schwarze 595: return;
1.53 schwarze 596: if ((a->val = attrval_parse(name)) == ATTRVAL__MAX)
597: a->rawval = xstrdup(name);
1.30 schwarze 598: p->flags &= ~PFLAG_ATTR;
1.1 schwarze 599: }
600:
601: /*
602: * Roll up the parse tree.
603: * If we're at a text node, roll that one up first.
604: */
605: static void
1.31 schwarze 606: xml_elem_end(struct parse *p, const char *name)
1.1 schwarze 607: {
1.26 schwarze 608: struct pnode *n;
609: const char *cp;
1.5 schwarze 610: enum nodeid node;
1.1 schwarze 611:
1.4 schwarze 612: /*
613: * An ancestor is excluded from the tree;
614: * keep track of the number of levels excluded.
615: */
1.31 schwarze 616: if (p->del > 1) {
617: p->del--;
1.4 schwarze 618: return;
619: }
620:
1.31 schwarze 621: if (p->del == 0)
1.37 schwarze 622: pnode_closetext(p, 0);
1.2 schwarze 623:
1.50 schwarze 624: n = p->cur;
1.39 schwarze 625: node = name == NULL ? p->ncur : xml_name2node(p, name);
1.2 schwarze 626:
1.5 schwarze 627: switch (node) {
1.4 schwarze 628: case NODE_DELETE_WARN:
629: case NODE_DELETE:
1.31 schwarze 630: if (p->del > 0)
631: p->del--;
1.4 schwarze 632: break;
1.2 schwarze 633: case NODE_IGNORE:
1.39 schwarze 634: case NODE_UNKNOWN:
1.26 schwarze 635: break;
636: case NODE_INCLUDE:
1.50 schwarze 637: p->cur = n->parent;
1.26 schwarze 638: cp = pnode_getattr_raw(n, ATTRKEY_HREF, NULL);
639: if (cp == NULL)
1.31 schwarze 640: error_msg(p, "<xi:include> element "
1.26 schwarze 641: "without href attribute");
642: else
1.31 schwarze 643: parse_file(p, -1, cp);
1.26 schwarze 644: pnode_unlink(n);
1.51 schwarze 645: p->flags &= ~(PFLAG_LINE | PFLAG_SPC);
1.2 schwarze 646: break;
1.23 schwarze 647: case NODE_DOCTYPE:
1.32 schwarze 648: case NODE_SBR:
1.48 schwarze 649: case NODE_VOID:
1.31 schwarze 650: p->flags &= ~PFLAG_EEND;
1.23 schwarze 651: /* FALLTHROUGH */
1.2 schwarze 652: default:
1.50 schwarze 653: if (n == NULL || node != n->node) {
1.31 schwarze 654: warn_msg(p, "element not open: </%s>", name);
1.5 schwarze 655: break;
656: }
1.45 schwarze 657: if (pnode_class(node) == CLASS_NOFILL)
658: p->nofill--;
1.5 schwarze 659:
660: /*
661: * Refrain from actually closing the document element.
662: * If no more content follows, no harm is done, but if
663: * some content still follows, simply processing it is
664: * obviously better than discarding it or crashing.
665: */
666:
1.50 schwarze 667: if (n->parent != NULL || node == NODE_DOCTYPE) {
668: p->cur = n->parent;
1.31 schwarze 669: if (p->cur != NULL)
670: p->ncur = p->cur->node;
1.23 schwarze 671: } else
1.31 schwarze 672: p->tree->flags |= TREE_CLOSED;
1.51 schwarze 673: p->flags &= ~(PFLAG_LINE | PFLAG_SPC);
1.50 schwarze 674:
675: /* Include a file containing entity declarations. */
676:
677: if (node == NODE_ENTITY && strcmp("%",
678: pnode_getattr_raw(n, ATTRKEY_NAME, "")) == 0 &&
679: (cp = pnode_getattr_raw(n, ATTRKEY_SYSTEM, NULL)) != NULL)
680: parse_file(p, -1, cp);
681:
1.4 schwarze 682: break;
1.2 schwarze 683: }
1.31 schwarze 684: assert(p->del == 0);
1.1 schwarze 685: }
686:
687: struct parse *
688: parse_alloc(int warn)
689: {
690: struct parse *p;
691:
1.53 schwarze 692: p = xcalloc(1, sizeof(*p));
693: p->tree = xcalloc(1, sizeof(*p->tree));
1.23 schwarze 694: if (warn)
695: p->flags |= PFLAG_WARN;
696: else
697: p->flags &= ~PFLAG_WARN;
1.1 schwarze 698: return p;
699: }
700:
701: void
702: parse_free(struct parse *p)
703: {
704: if (p == NULL)
705: return;
706: if (p->tree != NULL) {
707: pnode_unlink(p->tree->root);
708: free(p->tree);
709: }
710: free(p);
711: }
712:
1.14 schwarze 713: static void
714: increment(struct parse *p, char *b, size_t *pend, int refill)
715: {
716: if (refill) {
717: if (b[*pend] == '\n') {
718: p->nline++;
719: p->ncol = 1;
720: } else
721: p->ncol++;
722: }
723: ++*pend;
724: }
725:
1.5 schwarze 726: /*
727: * Advance the pend pointer to the next character in the charset.
728: * If the charset starts with a space, it stands for any whitespace.
729: * Update the new input file position, used for messages.
730: * Do not overrun the buffer b of length rlen.
731: * When reaching the end, NUL-terminate the buffer and return 1;
732: * otherwise, return 0.
733: */
734: static int
735: advance(struct parse *p, char *b, size_t rlen, size_t *pend,
1.14 schwarze 736: const char *charset, int refill)
1.5 schwarze 737: {
738: int space;
739:
740: if (*charset == ' ') {
741: space = 1;
742: charset++;
743: } else
744: space = 0;
745:
1.14 schwarze 746: if (refill) {
747: p->nline = p->line;
748: p->ncol = p->col;
749: }
1.5 schwarze 750: while (*pend < rlen) {
751: if (space && isspace((unsigned char)b[*pend]))
752: break;
753: if (strchr(charset, b[*pend]) != NULL)
754: break;
1.14 schwarze 755: increment(p, b, pend, refill);
1.5 schwarze 756: }
757: if (*pend == rlen) {
758: b[rlen] = '\0';
1.14 schwarze 759: return refill;
1.5 schwarze 760: } else
761: return 0;
762: }
763:
1.14 schwarze 764: size_t
765: parse_string(struct parse *p, char *b, size_t rlen,
766: enum pstate *pstate, int refill)
767: {
768: char *cp;
1.45 schwarze 769: size_t pws; /* Parse offset including whitespace. */
1.14 schwarze 770: size_t poff; /* Parse offset in b[]. */
771: size_t pend; /* Offset of the end of the current word. */
772: int elem_end;
773:
1.45 schwarze 774: pend = pws = 0;
1.14 schwarze 775: for (;;) {
776:
777: /* Proceed to the next token, skipping whitespace. */
778:
779: if (refill) {
780: p->line = p->nline;
781: p->col = p->ncol;
782: }
783: if ((poff = pend) == rlen)
784: break;
785: if (isspace((unsigned char)b[pend])) {
1.23 schwarze 786: p->flags |= PFLAG_SPC;
1.51 schwarze 787: if (b[pend] == '\n') {
788: p->flags |= PFLAG_LINE;
1.45 schwarze 789: pws = pend + 1;
1.51 schwarze 790: }
1.14 schwarze 791: increment(p, b, &pend, refill);
792: continue;
793: }
794:
795: /*
796: * The following four cases (ARG, TAG, and starting an
797: * entity or a tag) all parse a word or quoted string.
798: * If that extends beyond the read buffer and the last
799: * read(2) still got data, they all break out of the
800: * token loop to request more data from the read loop.
801: *
802: * Also, three of them detect self-closing tags, those
803: * ending with "/>", setting the flag elem_end and
804: * calling xml_elem_end() at the very end, after
805: * handling the attribute value, attribute name, or
806: * tag name, respectively.
807: */
808:
809: /* Parse an attribute value. */
810:
811: if (*pstate >= PARSE_ARG) {
812: if (*pstate == PARSE_ARG &&
813: (b[pend] == '\'' || b[pend] == '"')) {
814: *pstate = b[pend] == '"' ?
815: PARSE_DQ : PARSE_SQ;
816: increment(p, b, &pend, refill);
817: continue;
818: }
819: if (advance(p, b, rlen, &pend,
820: *pstate == PARSE_DQ ? "\"" :
821: *pstate == PARSE_SQ ? "'" : " >", refill))
822: break;
823: *pstate = PARSE_TAG;
824: elem_end = 0;
825: if (b[pend] == '>') {
826: *pstate = PARSE_ELEM;
827: if (pend > 0 && b[pend - 1] == '/') {
828: b[pend - 1] = '\0';
829: elem_end = 1;
830: }
1.23 schwarze 831: if (p->flags & PFLAG_EEND)
832: elem_end = 1;
1.14 schwarze 833: }
834: b[pend] = '\0';
835: if (pend < rlen)
836: increment(p, b, &pend, refill);
837: xml_attrval(p, b + poff);
838: if (elem_end)
839: xml_elem_end(p, NULL);
840:
841: /* Look for an attribute name. */
842:
843: } else if (*pstate == PARSE_TAG) {
1.23 schwarze 844: switch (p->ncur) {
845: case NODE_DOCTYPE:
846: if (b[pend] == '[') {
847: *pstate = PARSE_ELEM;
848: increment(p, b, &pend, refill);
849: continue;
850: }
851: /* FALLTHROUGH */
852: case NODE_ENTITY:
853: if (b[pend] == '"' || b[pend] == '\'') {
854: *pstate = PARSE_ARG;
855: continue;
856: }
857: break;
858: default:
859: break;
860: }
1.14 schwarze 861: if (advance(p, b, rlen, &pend, " =>", refill))
862: break;
863: elem_end = 0;
864: switch (b[pend]) {
865: case '>':
866: *pstate = PARSE_ELEM;
867: if (pend > 0 && b[pend - 1] == '/') {
868: b[pend - 1] = '\0';
869: elem_end = 1;
870: }
1.23 schwarze 871: if (p->flags & PFLAG_EEND)
872: elem_end = 1;
1.14 schwarze 873: break;
874: case '=':
875: *pstate = PARSE_ARG;
876: break;
877: default:
878: break;
879: }
880: b[pend] = '\0';
881: if (pend < rlen)
882: increment(p, b, &pend, refill);
883: xml_attrkey(p, b + poff);
884: if (elem_end)
885: xml_elem_end(p, NULL);
886:
887: /* Begin an opening or closing tag. */
888:
889: } else if (b[poff] == '<') {
890: if (advance(p, b, rlen, &pend, " >", refill))
891: break;
892: if (pend > poff + 3 &&
893: strncmp(b + poff, "<!--", 4) == 0) {
894:
895: /* Skip a comment. */
896:
897: cp = strstr(b + pend - 2, "-->");
898: if (cp == NULL) {
899: if (refill)
900: break;
901: cp = b + rlen;
902: } else
903: cp += 3;
904: while (b + pend < cp)
905: increment(p, b, &pend, refill);
906: continue;
907: }
908: elem_end = 0;
909: if (b[pend] != '>')
910: *pstate = PARSE_TAG;
911: else if (pend > 0 && b[pend - 1] == '/') {
912: b[pend - 1] = '\0';
913: elem_end = 1;
914: }
915: b[pend] = '\0';
916: if (pend < rlen)
917: increment(p, b, &pend, refill);
918: if (b[++poff] == '/') {
919: elem_end = 1;
920: poff++;
1.23 schwarze 921: } else {
1.14 schwarze 922: xml_elem_start(p, b + poff);
1.23 schwarze 923: if (*pstate == PARSE_ELEM &&
924: p->flags & PFLAG_EEND)
925: elem_end = 1;
926: }
1.14 schwarze 927: if (elem_end)
928: xml_elem_end(p, b + poff);
929:
1.23 schwarze 930: /* Close a doctype. */
931:
932: } else if (p->ncur == NODE_DOCTYPE && b[poff] == ']') {
933: *pstate = PARSE_TAG;
934: increment(p, b, &pend, refill);
935:
1.14 schwarze 936: /* Process an entity. */
937:
938: } else if (b[poff] == '&') {
939: if (advance(p, b, rlen, &pend, ";", refill))
940: break;
941: b[pend] = '\0';
942: if (pend < rlen)
943: increment(p, b, &pend, refill);
944: xml_entity(p, b + poff + 1);
945:
946: /* Process text up to the next tag, entity, or EOL. */
947:
948: } else {
1.28 schwarze 949: advance(p, b, rlen, &pend,
1.33 schwarze 950: p->ncur == NODE_DOCTYPE ? "<&]\n" : "<&\n",
1.28 schwarze 951: refill);
1.45 schwarze 952: if (p->nofill)
953: poff = pws;
1.35 schwarze 954: xml_text(p, b + poff, pend - poff);
1.33 schwarze 955: if (b[pend] == '\n')
1.37 schwarze 956: pnode_closetext(p, 0);
1.14 schwarze 957: }
1.45 schwarze 958: pws = pend;
1.14 schwarze 959: }
960: return poff;
961: }
962:
1.24 schwarze 963:
964: /*
965: * The read loop.
966: * If the previous token was incomplete and asked for more input,
967: * we have to enter the read loop once more even on EOF.
968: * Once rsz is 0, incomplete tokens will no longer ask for more input
969: * but instead use whatever there is, and then exit the read loop.
970: * The minus one on the size limit for read(2) is needed such that
971: * advance() can set b[rlen] to NUL when needed.
972: */
973: static void
974: parse_fd(struct parse *p, int fd)
1.1 schwarze 975: {
976: char b[4096];
1.5 schwarze 977: ssize_t rsz; /* Return value from read(2). */
1.14 schwarze 978: size_t rlen; /* Number of bytes in b[]. */
1.5 schwarze 979: size_t poff; /* Parse offset in b[]. */
1.14 schwarze 980: enum pstate pstate;
1.1 schwarze 981:
1.24 schwarze 982: rlen = 0;
1.14 schwarze 983: pstate = PARSE_ELEM;
984: while ((rsz = read(fd, b + rlen, sizeof(b) - rlen - 1)) >= 0 &&
985: (rlen += rsz) > 0) {
986: poff = parse_string(p, b, rlen, &pstate, rsz > 0);
1.5 schwarze 987: /* Buffer exhausted; shift left and re-fill. */
988: assert(poff > 0);
989: rlen -= poff;
1.14 schwarze 990: memmove(b, b + poff, rlen);
1.5 schwarze 991: }
1.24 schwarze 992: if (rsz < 0)
993: error_msg(p, "read: %s", strerror(errno));
994: }
995:
996: /*
997: * Open and parse a file.
998: */
999: struct ptree *
1000: parse_file(struct parse *p, int fd, const char *fname)
1001: {
1002: const char *save_fname;
1003: int save_line, save_col;
1004:
1005: /* Save and initialize reporting data. */
1006:
1007: save_fname = p->fname;
1008: save_line = p->nline;
1009: save_col = p->ncol;
1010: p->fname = fname;
1011: p->line = 0;
1012: p->col = 0;
1013:
1014: /* Open the file, unless it is already open. */
1015:
1016: if (fd == -1 && (fd = open(fname, O_RDONLY, 0)) == -1) {
1017: error_msg(p, "open: %s", strerror(errno));
1018: p->fname = save_fname;
1019: return p->tree;
1.5 schwarze 1020: }
1.24 schwarze 1021:
1022: /*
1023: * After opening the starting file, change to the directory it
1024: * is located in, in case it wants to include any further files,
1025: * which are typically given with relative paths in DocBook.
1026: * Do this on a best-effort basis; don't complain about failure.
1027: */
1028:
1029: if (save_fname == NULL && (fname = dirname(fname)) != NULL &&
1030: strcmp(fname, ".") != 0)
1031: (void)chdir(fname);
1032:
1033: /* Run the read loop. */
1034:
1035: p->nline = 1;
1036: p->ncol = 1;
1037: parse_fd(p, fd);
1038:
1039: /* On the top level, finalize the parse tree. */
1040:
1041: if (save_fname == NULL) {
1.37 schwarze 1042: pnode_closetext(p, 0);
1.24 schwarze 1043: if (p->tree->root == NULL)
1044: error_msg(p, "empty document");
1045: else if ((p->tree->flags & TREE_CLOSED) == 0)
1046: warn_msg(p, "document not closed");
1047: pnode_unlink(p->doctype);
1048: }
1049:
1050: /* Clean up. */
1051:
1052: if (fd != STDIN_FILENO)
1053: close(fd);
1054: p->fname = save_fname;
1055: p->nline = save_line;
1056: p->ncol = save_col;
1.1 schwarze 1057: return p->tree;
1058: }
CVSweb