Annotation of docbook2mdoc/parse.c, Revision 1.53
1.53 ! schwarze 1: /* $Id: parse.c,v 1.52 2019/04/25 17:57:59 schwarze Exp $ */
1.1 schwarze 2: /*
3: * Copyright (c) 2014 Kristaps Dzonsons <kristaps@bsd.lv>
4: * Copyright (c) 2019 Ingo Schwarze <schwarze@openbsd.org>
5: *
6: * Permission to use, copy, modify, and distribute this software for any
7: * purpose with or without fee is hereby granted, provided that the above
8: * copyright notice and this permission notice appear in all copies.
9: *
10: * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
11: * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12: * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
13: * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14: * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15: * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16: * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17: */
1.53 ! schwarze 18: #include <sys/types.h>
! 19:
1.1 schwarze 20: #include <assert.h>
21: #include <ctype.h>
1.24 schwarze 22: #include <errno.h>
23: #include <fcntl.h>
24: #include <libgen.h>
1.6 schwarze 25: #include <stdarg.h>
1.1 schwarze 26: #include <stdio.h>
1.5 schwarze 27: #include <stdlib.h>
1.1 schwarze 28: #include <string.h>
29: #include <unistd.h>
30:
1.53 ! schwarze 31: #include "xmalloc.h"
1.1 schwarze 32: #include "node.h"
33: #include "parse.h"
34:
35: /*
36: * The implementation of the DocBook parser.
37: */
38:
/*
 * States of the tokenizer in parse_string().
 * The order is significant: every state that compares greater than
 * or equal to PARSE_ARG is handled as "parsing an attribute value".
 */
enum pstate {
	PARSE_ELEM,	/* Content: text, entities, or the start of a tag. */
	PARSE_TAG,	/* Inside a tag, looking for an attribute name. */
	PARSE_ARG,	/* After '=', expecting an attribute value. */
	PARSE_SQ,	/* Inside a single-quoted attribute value. */
	PARSE_DQ	/* Inside a double-quoted attribute value. */
};
46:
1.1 schwarze 47: /*
48: * Global parse state.
49: * Keep this as simple and small as possible.
50: */
51: struct parse {
52: const char *fname; /* Name of the input file. */
53: struct ptree *tree; /* Complete parse result. */
1.23 schwarze 54: struct pnode *doctype;
1.1 schwarze 55: struct pnode *cur; /* Current node in the tree. */
1.5 schwarze 56: enum nodeid ncur; /* Type of the current node. */
57: int line; /* Line number in the input file. */
58: int col; /* Column number in the input file. */
59: int nline; /* Line number of next token. */
60: int ncol; /* Column number of next token. */
1.4 schwarze 61: int del; /* Levels of nested nodes being deleted. */
1.45 schwarze 62: int nofill; /* Levels of open no-fill displays. */
1.23 schwarze 63: int flags;
64: #define PFLAG_WARN (1 << 0) /* Print warning messages. */
1.51 schwarze 65: #define PFLAG_LINE (1 << 1) /* New line before the next element. */
66: #define PFLAG_SPC (1 << 2) /* Whitespace before the next element. */
67: #define PFLAG_ATTR (1 << 3) /* The most recent attribute is valid. */
68: #define PFLAG_EEND (1 << 4) /* This element is self-closing. */
1.1 schwarze 69: };
70:
1.39 schwarze 71: struct alias {
1.1 schwarze 72: const char *name; /* DocBook element name. */
73: enum nodeid node; /* Node type to generate. */
74: };
75:
1.39 schwarze 76: static const struct alias aliases[] = {
1.3 schwarze 77: { "acronym", NODE_IGNORE },
1.43 schwarze 78: { "affiliation", NODE_IGNORE },
1.4 schwarze 79: { "anchor", NODE_DELETE },
1.42 schwarze 80: { "application", NODE_COMMAND },
1.22 schwarze 81: { "article", NODE_SECTION },
1.41 schwarze 82: { "articleinfo", NODE_BOOKINFO },
1.22 schwarze 83: { "book", NODE_SECTION },
1.1 schwarze 84: { "chapter", NODE_SECTION },
1.44 schwarze 85: { "caption", NODE_IGNORE },
1.13 schwarze 86: { "code", NODE_LITERAL },
1.36 schwarze 87: { "computeroutput", NODE_LITERAL },
1.23 schwarze 88: { "!doctype", NODE_DOCTYPE },
1.44 schwarze 89: { "figure", NODE_IGNORE },
1.7 schwarze 90: { "firstname", NODE_PERSONNAME },
1.21 schwarze 91: { "glossary", NODE_VARIABLELIST },
92: { "glossdef", NODE_IGNORE },
93: { "glossdiv", NODE_IGNORE },
94: { "glossentry", NODE_VARLISTENTRY },
95: { "glosslist", NODE_VARIABLELIST },
1.43 schwarze 96: { "holder", NODE_IGNORE },
1.44 schwarze 97: { "imageobject", NODE_IGNORE },
1.4 schwarze 98: { "indexterm", NODE_DELETE },
1.11 schwarze 99: { "informaltable", NODE_TABLE },
1.42 schwarze 100: { "keycap", NODE_KEYSYM },
101: { "keycode", NODE_IGNORE },
1.44 schwarze 102: { "mediaobject", NODE_BLOCKQUOTE },
1.43 schwarze 103: { "orgname", NODE_IGNORE },
1.40 schwarze 104: { "othercredit", NODE_AUTHOR },
1.7 schwarze 105: { "othername", NODE_PERSONNAME },
1.1 schwarze 106: { "part", NODE_SECTION },
1.3 schwarze 107: { "phrase", NODE_IGNORE },
1.4 schwarze 108: { "primary", NODE_DELETE },
1.42 schwarze 109: { "property", NODE_PARAMETER },
1.52 schwarze 110: { "reference", NODE_SECTION },
1.1 schwarze 111: { "refsect1", NODE_SECTION },
112: { "refsect2", NODE_SECTION },
113: { "refsect3", NODE_SECTION },
114: { "refsection", NODE_SECTION },
1.43 schwarze 115: { "releaseinfo", NODE_IGNORE },
1.42 schwarze 116: { "returnvalue", NODE_IGNORE },
1.4 schwarze 117: { "secondary", NODE_DELETE },
1.1 schwarze 118: { "sect1", NODE_SECTION },
119: { "sect2", NODE_SECTION },
1.46 schwarze 120: { "sect3", NODE_SECTION },
121: { "sect4", NODE_SECTION },
1.36 schwarze 122: { "sgmltag", NODE_MARKUP },
1.15 schwarze 123: { "simpara", NODE_PARA },
1.13 schwarze 124: { "structfield", NODE_PARAMETER },
125: { "structname", NODE_TYPE },
1.7 schwarze 126: { "surname", NODE_PERSONNAME },
1.12 schwarze 127: { "symbol", NODE_CONSTANT },
1.48 schwarze 128: { "tag", NODE_MARKUP },
1.3 schwarze 129: { "trademark", NODE_IGNORE },
1.18 schwarze 130: { "ulink", NODE_LINK },
1.13 schwarze 131: { "userinput", NODE_LITERAL },
1.43 schwarze 132: { "year", NODE_IGNORE },
1.5 schwarze 133: { NULL, NODE_IGNORE }
1.1 schwarze 134: };
135:
/*
 * Maps the name of an XML character entity (without the leading '&'
 * and the trailing ';') to the roff(7) input that represents it.
 */
struct entity {
	const char	*name;	/* Entity name as written in the document. */
	const char	*roff;	/* Replacement text for the output. */
};

/*
 * XML character entity references found in the wild.
 * Where mandoc_char(7) offers no exact representation, an
 * approximation is used and the intended codepoint is noted in a
 * comment.  Encoding those as \[u...] would leave -Tascii out in
 * the cold.  The entry with a NULL name terminates the table.
 */
static const struct entity entities[] = {
	{ "alpha",	"\\(*a" },
	{ "amp",	"&" },
	{ "apos",	"'" },
	{ "auml",	"\\(:a" },
	{ "beta",	"\\(*b" },
	{ "circ",	"^" },		/* U+02C6 */
	{ "copy",	"\\(co" },
	{ "dagger",	"\\(dg" },
	{ "Delta",	"\\(*D" },
	{ "eacute",	"\\('e" },
	{ "emsp",	"\\ " },	/* U+2003 */
	{ "gt",		">" },
	{ "hairsp",	"\\^" },
	{ "kappa",	"\\(*k" },
	{ "larr",	"\\(<-" },
	{ "ldquo",	"\\(lq" },
	{ "le",		"\\(<=" },
	{ "lowbar",	"_" },
	{ "lsqb",	"[" },
	{ "lt",		"<" },
	{ "mdash",	"\\(em" },
	{ "minus",	"\\-" },
	{ "ndash",	"\\(en" },
	{ "nbsp",	"\\ " },
	{ "num",	"#" },
	{ "oslash",	"\\(/o" },
	{ "ouml",	"\\(:o" },
	{ "percnt",	"%" },
	{ "quot",	"\\(dq" },
	{ "rarr",	"\\(->" },
	{ "rArr",	"\\(rA" },
	{ "rdquo",	"\\(rq" },
	{ "reg",	"\\(rg" },
	{ "rho",	"\\(*r" },
	{ "rsqb",	"]" },
	{ "sigma",	"\\(*s" },
	{ "shy",	"\\&" },	/* U+00AD */
	{ "tau",	"\\(*t" },
	{ "tilde",	"\\[u02DC]" },
	{ "times",	"\\[tmu]" },
	{ "uuml",	"\\(:u" },
	{ NULL,		NULL }
};
191:
/* Forward declarations for the mutually recursive parts of the parser. */
static size_t	 parse_string(struct parse *p, char *b, size_t rlen,
		    enum pstate *pstate, int refill);
static void	 parse_fd(struct parse *p, int fd);
1.23 schwarze 195:
196:
1.6 schwarze 197: static void
198: error_msg(struct parse *p, const char *fmt, ...)
199: {
200: va_list ap;
201:
1.29 schwarze 202: fprintf(stderr, "%s:%d:%d: ERROR: ", p->fname, p->line, p->col);
1.6 schwarze 203: va_start(ap, fmt);
204: vfprintf(stderr, fmt, ap);
205: va_end(ap);
206: fputc('\n', stderr);
1.29 schwarze 207: p->tree->flags |= TREE_ERROR;
1.6 schwarze 208: }
209:
210: static void
211: warn_msg(struct parse *p, const char *fmt, ...)
212: {
213: va_list ap;
214:
1.23 schwarze 215: if ((p->flags & PFLAG_WARN) == 0)
1.6 schwarze 216: return;
217:
1.29 schwarze 218: fprintf(stderr, "%s:%d:%d: WARNING: ", p->fname, p->line, p->col);
1.6 schwarze 219: va_start(ap, fmt);
220: vfprintf(stderr, fmt, ap);
221: va_end(ap);
222: fputc('\n', stderr);
1.29 schwarze 223: p->tree->flags |= TREE_WARN;
1.6 schwarze 224: }
225:
1.1 schwarze 226: /*
227: * Process a string of characters.
228: * If a text node is already open, append to it.
229: * Otherwise, create a new one as a child of the current node.
230: */
231: static void
1.35 schwarze 232: xml_text(struct parse *p, const char *word, int sz)
1.1 schwarze 233: {
1.35 schwarze 234: struct pnode *n, *np;
1.32 schwarze 235: size_t oldsz, newsz;
1.35 schwarze 236: int i;
1.1 schwarze 237:
1.32 schwarze 238: assert(sz > 0);
1.30 schwarze 239: if (p->del > 0)
1.1 schwarze 240: return;
241:
1.32 schwarze 242: if ((n = p->cur) == NULL) {
1.35 schwarze 243: error_msg(p, "discarding text before document: %.*s",
244: sz, word);
1.5 schwarze 245: return;
246: }
247:
1.35 schwarze 248: /* Append to the current text node, if one is open. */
249:
250: if (n->node == NODE_TEXT) {
251: oldsz = strlen(n->b);
252: newsz = oldsz + sz;
253: if (oldsz && (p->flags & PFLAG_SPC))
254: newsz++;
1.53 ! schwarze 255: n->b = xrealloc(n->b, newsz + 1);
1.35 schwarze 256: if (oldsz && (p->flags & PFLAG_SPC))
257: n->b[oldsz++] = ' ';
258: memcpy(n->b + oldsz, word, sz);
259: n->b[newsz] = '\0';
1.51 schwarze 260: p->flags &= ~(PFLAG_LINE | PFLAG_SPC);
1.35 schwarze 261: return;
1.1 schwarze 262: }
263:
1.35 schwarze 264: if (p->tree->flags & TREE_CLOSED && n == p->tree->root)
1.30 schwarze 265: warn_msg(p, "text after end of document: %.*s", sz, word);
1.5 schwarze 266:
1.35 schwarze 267: /* Create a new text node. */
1.1 schwarze 268:
1.53 ! schwarze 269: n = pnode_alloc(p->cur);
1.35 schwarze 270: n->node = NODE_TEXT;
1.51 schwarze 271: n->flags = ((p->flags & PFLAG_LINE) ? NFLAG_LINE : 0) |
272: ((p->flags & PFLAG_SPC) ? NFLAG_SPC : 0);
273: p->flags &= ~(PFLAG_LINE | PFLAG_SPC);
1.35 schwarze 274:
275: /*
1.39 schwarze 276: * If this node follows an in-line macro without intervening
1.35 schwarze 277: * whitespace, keep the text in it as short as possible,
278: * and do not keep it open.
279: */
280:
1.51 schwarze 281: np = n->flags & NFLAG_SPC ? NULL : TAILQ_PREV(n, pnodeq, child);
1.39 schwarze 282: while (np != NULL) {
283: switch (pnode_class(np->node)) {
284: case CLASS_VOID:
285: case CLASS_TEXT:
286: case CLASS_BLOCK:
1.45 schwarze 287: case CLASS_NOFILL:
1.39 schwarze 288: np = NULL;
289: break;
290: case CLASS_TRANS:
291: np = TAILQ_LAST(&np->childq, pnodeq);
292: continue;
293: case CLASS_LINE:
294: case CLASS_ENCL:
295: break;
296: }
297: break;
298: }
299: if (np != NULL) {
1.35 schwarze 300: i = 0;
301: while (i < sz && !isspace((unsigned char)word[i]))
302: i++;
1.53 ! schwarze 303: n->b = xstrndup(word, i);
1.35 schwarze 304: if (i == sz)
305: return;
306: while (i < sz && isspace((unsigned char)word[i]))
307: i++;
308: if (i == sz) {
309: p->flags |= PFLAG_SPC;
310: return;
311: }
312:
313: /* Put any remaining text into a second node. */
314:
1.53 ! schwarze 315: n = pnode_alloc(p->cur);
1.35 schwarze 316: n->node = NODE_TEXT;
1.51 schwarze 317: n->flags |= NFLAG_SPC;
1.35 schwarze 318: word += i;
319: sz -= i;
320: }
1.53 ! schwarze 321: n->b = xstrndup(word, sz);
1.35 schwarze 322:
323: /* The new node remains open for later pnode_closetext(). */
324:
325: p->cur = n;
1.1 schwarze 326: }
327:
1.16 schwarze 328: /*
329: * Close out the text node and strip trailing whitespace, if one is open.
330: */
1.1 schwarze 331: static void
1.37 schwarze 332: pnode_closetext(struct parse *p, int check_last_word)
1.1 schwarze 333: {
1.16 schwarze 334: struct pnode *n;
1.37 schwarze 335: char *cp, *last_word;
1.16 schwarze 336:
337: if ((n = p->cur) == NULL || n->node != NODE_TEXT)
338: return;
339: p->cur = n->parent;
1.32 schwarze 340: for (cp = strchr(n->b, '\0');
341: cp > n->b && isspace((unsigned char)cp[-1]);
342: *--cp = '\0')
1.23 schwarze 343: p->flags |= PFLAG_SPC;
1.37 schwarze 344:
345: if (p->flags & PFLAG_SPC || !check_last_word)
346: return;
347:
348: /*
349: * Find the beginning of the last word
350: * and delete whitespace before it.
351: */
352:
353: while (cp > n->b && !isspace((unsigned char)cp[-1]))
354: cp--;
355: if (cp == n->b)
356: return;
357:
358: last_word = cp;
359: while (cp > n->b && isspace((unsigned char)cp[-1]))
360: *--cp = '\0';
361:
362: /* Move the last word into its own node, for use with .Pf. */
363:
1.53 ! schwarze 364: n = pnode_alloc(p->cur);
1.37 schwarze 365: n->node = NODE_TEXT;
1.51 schwarze 366: n->flags |= NFLAG_SPC;
1.53 ! schwarze 367: n->b = xstrdup(last_word);
1.1 schwarze 368: }
369:
1.9 schwarze 370: static void
371: xml_entity(struct parse *p, const char *name)
372: {
373: const struct entity *entity;
1.30 schwarze 374: struct pnode *n;
1.23 schwarze 375: const char *ccp;
376: char *cp;
1.49 schwarze 377: unsigned int codepoint;
1.23 schwarze 378: enum pstate pstate;
1.9 schwarze 379:
380: if (p->del > 0)
381: return;
382:
383: if (p->cur == NULL) {
384: error_msg(p, "discarding entity before document: &%s;", name);
385: return;
386: }
387:
1.37 schwarze 388: pnode_closetext(p, 0);
1.9 schwarze 389:
390: if (p->tree->flags & TREE_CLOSED && p->cur == p->tree->root)
391: warn_msg(p, "entity after end of document: &%s;", name);
392:
393: for (entity = entities; entity->name != NULL; entity++)
394: if (strcmp(name, entity->name) == 0)
395: break;
396:
397: if (entity->roff == NULL) {
1.23 schwarze 398: if (p->doctype != NULL) {
1.30 schwarze 399: TAILQ_FOREACH(n, &p->doctype->childq, child) {
400: if ((ccp = pnode_getattr_raw(n,
1.23 schwarze 401: ATTRKEY_NAME, NULL)) == NULL ||
1.25 schwarze 402: strcmp(ccp, name) != 0)
403: continue;
1.30 schwarze 404: if ((ccp = pnode_getattr_raw(n,
1.25 schwarze 405: ATTRKEY_SYSTEM, NULL)) != NULL) {
406: parse_file(p, -1, ccp);
1.51 schwarze 407: p->flags &= ~(PFLAG_LINE | PFLAG_SPC);
1.25 schwarze 408: return;
409: }
1.30 schwarze 410: if ((ccp = pnode_getattr_raw(n,
1.23 schwarze 411: ATTRKEY_DEFINITION, NULL)) == NULL)
412: continue;
1.53 ! schwarze 413: cp = xstrdup(ccp);
1.23 schwarze 414: pstate = PARSE_ELEM;
415: parse_string(p, cp, strlen(cp), &pstate, 0);
1.51 schwarze 416: p->flags &= ~(PFLAG_LINE | PFLAG_SPC);
1.23 schwarze 417: free(cp);
418: return;
419: }
420: }
1.49 schwarze 421: if (*name == '#') {
422: codepoint = strtonum(name + 1, 0, 0x10ffff, &ccp);
423: if (ccp == NULL) {
1.53 ! schwarze 424: n = pnode_alloc(p->cur);
! 425: xasprintf(&n->b, "\\[u%4.4X]", codepoint);
1.49 schwarze 426: goto done;
427: }
428: }
1.9 schwarze 429: error_msg(p, "unknown entity &%s;", name);
430: return;
431: }
432:
433: /* Create, append, and close out an entity node. */
1.53 ! schwarze 434: n = pnode_alloc(p->cur);
! 435: n->b = xstrdup(entity->roff);
1.49 schwarze 436: done:
1.30 schwarze 437: n->node = NODE_ESCAPE;
1.51 schwarze 438: n->flags = ((p->flags & PFLAG_LINE) ? NFLAG_LINE : 0) |
439: ((p->flags & PFLAG_SPC) ? NFLAG_SPC : 0);
440: p->flags &= ~(PFLAG_LINE | PFLAG_SPC);
1.9 schwarze 441: }
442:
1.1 schwarze 443: /*
1.39 schwarze 444: * Parse an element name.
445: */
446: static enum nodeid
447: xml_name2node(struct parse *p, const char *name)
448: {
449: const struct alias *alias;
450: enum nodeid node;
451:
452: if ((node = pnode_parse(name)) < NODE_UNKNOWN)
453: return node;
454:
455: for (alias = aliases; alias->name != NULL; alias++)
456: if (strcmp(alias->name, name) == 0)
457: return alias->node;
458:
459: return NODE_UNKNOWN;
460: }
461:
462: /*
1.1 schwarze 463: * Begin an element.
464: */
465: static void
1.30 schwarze 466: xml_elem_start(struct parse *p, const char *name)
1.1 schwarze 467: {
1.30 schwarze 468: struct pnode *n;
1.1 schwarze 469:
1.4 schwarze 470: /*
471: * An ancestor is excluded from the tree;
472: * keep track of the number of levels excluded.
473: */
1.30 schwarze 474: if (p->del > 0) {
1.23 schwarze 475: if (*name != '!' && *name != '?')
1.30 schwarze 476: p->del++;
1.4 schwarze 477: return;
478: }
479:
1.39 schwarze 480: switch (p->ncur = xml_name2node(p, name)) {
1.4 schwarze 481: case NODE_DELETE_WARN:
1.30 schwarze 482: warn_msg(p, "skipping element <%s>", name);
1.2 schwarze 483: /* FALLTHROUGH */
1.4 schwarze 484: case NODE_DELETE:
1.30 schwarze 485: p->del = 1;
1.4 schwarze 486: /* FALLTHROUGH */
1.2 schwarze 487: case NODE_IGNORE:
488: return;
1.39 schwarze 489: case NODE_UNKNOWN:
490: if (*name != '!' && *name != '?')
491: error_msg(p, "unknown element <%s>", name);
492: return;
1.2 schwarze 493: default:
494: break;
495: }
1.1 schwarze 496:
1.30 schwarze 497: if (p->tree->flags & TREE_CLOSED && p->cur->parent == NULL)
498: warn_msg(p, "element after end of document: <%s>", name);
1.5 schwarze 499:
1.39 schwarze 500: switch (pnode_class(p->ncur)) {
501: case CLASS_LINE:
502: case CLASS_ENCL:
503: pnode_closetext(p, 1);
504: break;
505: default:
506: pnode_closetext(p, 0);
507: break;
508: }
509:
1.53 ! schwarze 510: n = pnode_alloc(p->cur);
1.17 schwarze 511:
512: /*
1.39 schwarze 513: * Some elements are self-closing.
1.17 schwarze 514: * Nodes that begin a new macro or request line or start by
515: * printing text always want whitespace before themselves.
516: */
517:
1.39 schwarze 518: switch (n->node = p->ncur) {
1.23 schwarze 519: case NODE_DOCTYPE:
520: case NODE_ENTITY:
521: case NODE_SBR:
1.48 schwarze 522: case NODE_VOID:
1.30 schwarze 523: p->flags |= PFLAG_EEND;
1.17 schwarze 524: break;
525: default:
1.39 schwarze 526: break;
527: }
528: switch (pnode_class(p->ncur)) {
529: case CLASS_LINE:
530: case CLASS_ENCL:
1.51 schwarze 531: n->flags = ((p->flags & PFLAG_LINE) ? NFLAG_LINE : 0) |
532: ((p->flags & PFLAG_SPC) ? NFLAG_SPC : 0);
1.17 schwarze 533: break;
1.45 schwarze 534: case CLASS_NOFILL:
535: p->nofill++;
536: /* FALLTHROUGH */
1.39 schwarze 537: default:
1.51 schwarze 538: n->flags |= NFLAG_SPC;
1.39 schwarze 539: break;
1.17 schwarze 540: }
1.30 schwarze 541: p->cur = n;
542: if (n->node == NODE_DOCTYPE) {
543: if (p->doctype == NULL)
544: p->doctype = n;
1.23 schwarze 545: else
1.30 schwarze 546: error_msg(p, "duplicate doctype");
547: } else if (n->parent == NULL && p->tree->root == NULL)
548: p->tree->root = n;
1.5 schwarze 549: }
550:
551: static void
1.30 schwarze 552: xml_attrkey(struct parse *p, const char *name)
1.5 schwarze 553: {
1.30 schwarze 554: struct pattr *a;
1.23 schwarze 555: const char *value;
1.5 schwarze 556: enum attrkey key;
1.1 schwarze 557:
1.47 schwarze 558: if (p->del > 0 || p->ncur >= NODE_UNKNOWN || *name == '\0')
1.5 schwarze 559: return;
1.23 schwarze 560:
1.30 schwarze 561: if ((p->ncur == NODE_DOCTYPE || p->ncur == NODE_ENTITY) &&
562: TAILQ_FIRST(&p->cur->attrq) == NULL) {
1.23 schwarze 563: value = name;
564: name = "NAME";
565: } else
566: value = NULL;
567:
1.5 schwarze 568: if ((key = attrkey_parse(name)) == ATTRKEY__MAX) {
1.30 schwarze 569: p->flags &= ~PFLAG_ATTR;
1.5 schwarze 570: return;
571: }
1.53 ! schwarze 572: a = xcalloc(1, sizeof(*a));
1.30 schwarze 573: a->key = key;
574: a->val = ATTRVAL__MAX;
1.23 schwarze 575: if (value == NULL) {
1.30 schwarze 576: a->rawval = NULL;
577: p->flags |= PFLAG_ATTR;
1.23 schwarze 578: } else {
1.53 ! schwarze 579: a->rawval = xstrdup(value);
1.30 schwarze 580: p->flags &= ~PFLAG_ATTR;
581: }
582: TAILQ_INSERT_TAIL(&p->cur->attrq, a, child);
583: if (p->ncur == NODE_ENTITY && key == ATTRKEY_NAME)
584: xml_attrkey(p, "DEFINITION");
1.5 schwarze 585: }
586:
587: static void
1.30 schwarze 588: xml_attrval(struct parse *p, const char *name)
1.5 schwarze 589: {
1.30 schwarze 590: struct pattr *a;
1.5 schwarze 591:
1.47 schwarze 592: if (p->del > 0 || p->ncur >= NODE_UNKNOWN ||
1.30 schwarze 593: (p->flags & PFLAG_ATTR) == 0)
1.5 schwarze 594: return;
1.30 schwarze 595: if ((a = TAILQ_LAST(&p->cur->attrq, pattrq)) == NULL)
1.5 schwarze 596: return;
1.53 ! schwarze 597: if ((a->val = attrval_parse(name)) == ATTRVAL__MAX)
! 598: a->rawval = xstrdup(name);
1.30 schwarze 599: p->flags &= ~PFLAG_ATTR;
1.1 schwarze 600: }
601:
602: /*
603: * Roll up the parse tree.
604: * If we're at a text node, roll that one up first.
605: */
606: static void
1.31 schwarze 607: xml_elem_end(struct parse *p, const char *name)
1.1 schwarze 608: {
1.26 schwarze 609: struct pnode *n;
610: const char *cp;
1.5 schwarze 611: enum nodeid node;
1.1 schwarze 612:
1.4 schwarze 613: /*
614: * An ancestor is excluded from the tree;
615: * keep track of the number of levels excluded.
616: */
1.31 schwarze 617: if (p->del > 1) {
618: p->del--;
1.4 schwarze 619: return;
620: }
621:
1.31 schwarze 622: if (p->del == 0)
1.37 schwarze 623: pnode_closetext(p, 0);
1.2 schwarze 624:
1.50 schwarze 625: n = p->cur;
1.39 schwarze 626: node = name == NULL ? p->ncur : xml_name2node(p, name);
1.2 schwarze 627:
1.5 schwarze 628: switch (node) {
1.4 schwarze 629: case NODE_DELETE_WARN:
630: case NODE_DELETE:
1.31 schwarze 631: if (p->del > 0)
632: p->del--;
1.4 schwarze 633: break;
1.2 schwarze 634: case NODE_IGNORE:
1.39 schwarze 635: case NODE_UNKNOWN:
1.26 schwarze 636: break;
637: case NODE_INCLUDE:
1.50 schwarze 638: p->cur = n->parent;
1.26 schwarze 639: cp = pnode_getattr_raw(n, ATTRKEY_HREF, NULL);
640: if (cp == NULL)
1.31 schwarze 641: error_msg(p, "<xi:include> element "
1.26 schwarze 642: "without href attribute");
643: else
1.31 schwarze 644: parse_file(p, -1, cp);
1.26 schwarze 645: pnode_unlink(n);
1.51 schwarze 646: p->flags &= ~(PFLAG_LINE | PFLAG_SPC);
1.2 schwarze 647: break;
1.23 schwarze 648: case NODE_DOCTYPE:
1.32 schwarze 649: case NODE_SBR:
1.48 schwarze 650: case NODE_VOID:
1.31 schwarze 651: p->flags &= ~PFLAG_EEND;
1.23 schwarze 652: /* FALLTHROUGH */
1.2 schwarze 653: default:
1.50 schwarze 654: if (n == NULL || node != n->node) {
1.31 schwarze 655: warn_msg(p, "element not open: </%s>", name);
1.5 schwarze 656: break;
657: }
1.45 schwarze 658: if (pnode_class(node) == CLASS_NOFILL)
659: p->nofill--;
1.5 schwarze 660:
661: /*
662: * Refrain from actually closing the document element.
663: * If no more content follows, no harm is done, but if
664: * some content still follows, simply processing it is
665: * obviously better than discarding it or crashing.
666: */
667:
1.50 schwarze 668: if (n->parent != NULL || node == NODE_DOCTYPE) {
669: p->cur = n->parent;
1.31 schwarze 670: if (p->cur != NULL)
671: p->ncur = p->cur->node;
1.23 schwarze 672: } else
1.31 schwarze 673: p->tree->flags |= TREE_CLOSED;
1.51 schwarze 674: p->flags &= ~(PFLAG_LINE | PFLAG_SPC);
1.50 schwarze 675:
676: /* Include a file containing entity declarations. */
677:
678: if (node == NODE_ENTITY && strcmp("%",
679: pnode_getattr_raw(n, ATTRKEY_NAME, "")) == 0 &&
680: (cp = pnode_getattr_raw(n, ATTRKEY_SYSTEM, NULL)) != NULL)
681: parse_file(p, -1, cp);
682:
1.4 schwarze 683: break;
1.2 schwarze 684: }
1.31 schwarze 685: assert(p->del == 0);
1.1 schwarze 686: }
687:
688: struct parse *
689: parse_alloc(int warn)
690: {
691: struct parse *p;
692:
1.53 ! schwarze 693: p = xcalloc(1, sizeof(*p));
! 694: p->tree = xcalloc(1, sizeof(*p->tree));
1.23 schwarze 695: if (warn)
696: p->flags |= PFLAG_WARN;
697: else
698: p->flags &= ~PFLAG_WARN;
1.1 schwarze 699: return p;
700: }
701:
702: void
703: parse_free(struct parse *p)
704: {
705: if (p == NULL)
706: return;
707: if (p->tree != NULL) {
708: pnode_unlink(p->tree->root);
709: free(p->tree);
710: }
711: free(p);
712: }
713:
1.14 schwarze 714: static void
715: increment(struct parse *p, char *b, size_t *pend, int refill)
716: {
717: if (refill) {
718: if (b[*pend] == '\n') {
719: p->nline++;
720: p->ncol = 1;
721: } else
722: p->ncol++;
723: }
724: ++*pend;
725: }
726:
1.5 schwarze 727: /*
728: * Advance the pend pointer to the next character in the charset.
729: * If the charset starts with a space, it stands for any whitespace.
730: * Update the new input file position, used for messages.
731: * Do not overrun the buffer b of length rlen.
732: * When reaching the end, NUL-terminate the buffer and return 1;
733: * otherwise, return 0.
734: */
735: static int
736: advance(struct parse *p, char *b, size_t rlen, size_t *pend,
1.14 schwarze 737: const char *charset, int refill)
1.5 schwarze 738: {
739: int space;
740:
741: if (*charset == ' ') {
742: space = 1;
743: charset++;
744: } else
745: space = 0;
746:
1.14 schwarze 747: if (refill) {
748: p->nline = p->line;
749: p->ncol = p->col;
750: }
1.5 schwarze 751: while (*pend < rlen) {
752: if (space && isspace((unsigned char)b[*pend]))
753: break;
754: if (strchr(charset, b[*pend]) != NULL)
755: break;
1.14 schwarze 756: increment(p, b, pend, refill);
1.5 schwarze 757: }
758: if (*pend == rlen) {
759: b[rlen] = '\0';
1.14 schwarze 760: return refill;
1.5 schwarze 761: } else
762: return 0;
763: }
764:
1.14 schwarze 765: size_t
766: parse_string(struct parse *p, char *b, size_t rlen,
767: enum pstate *pstate, int refill)
768: {
769: char *cp;
1.45 schwarze 770: size_t pws; /* Parse offset including whitespace. */
1.14 schwarze 771: size_t poff; /* Parse offset in b[]. */
772: size_t pend; /* Offset of the end of the current word. */
773: int elem_end;
774:
1.45 schwarze 775: pend = pws = 0;
1.14 schwarze 776: for (;;) {
777:
778: /* Proceed to the next token, skipping whitespace. */
779:
780: if (refill) {
781: p->line = p->nline;
782: p->col = p->ncol;
783: }
784: if ((poff = pend) == rlen)
785: break;
786: if (isspace((unsigned char)b[pend])) {
1.23 schwarze 787: p->flags |= PFLAG_SPC;
1.51 schwarze 788: if (b[pend] == '\n') {
789: p->flags |= PFLAG_LINE;
1.45 schwarze 790: pws = pend + 1;
1.51 schwarze 791: }
1.14 schwarze 792: increment(p, b, &pend, refill);
793: continue;
794: }
795:
796: /*
797: * The following four cases (ARG, TAG, and starting an
798: * entity or a tag) all parse a word or quoted string.
799: * If that extends beyond the read buffer and the last
800: * read(2) still got data, they all break out of the
801: * token loop to request more data from the read loop.
802: *
803: * Also, three of them detect self-closing tags, those
804: * ending with "/>", setting the flag elem_end and
805: * calling xml_elem_end() at the very end, after
806: * handling the attribute value, attribute name, or
807: * tag name, respectively.
808: */
809:
810: /* Parse an attribute value. */
811:
812: if (*pstate >= PARSE_ARG) {
813: if (*pstate == PARSE_ARG &&
814: (b[pend] == '\'' || b[pend] == '"')) {
815: *pstate = b[pend] == '"' ?
816: PARSE_DQ : PARSE_SQ;
817: increment(p, b, &pend, refill);
818: continue;
819: }
820: if (advance(p, b, rlen, &pend,
821: *pstate == PARSE_DQ ? "\"" :
822: *pstate == PARSE_SQ ? "'" : " >", refill))
823: break;
824: *pstate = PARSE_TAG;
825: elem_end = 0;
826: if (b[pend] == '>') {
827: *pstate = PARSE_ELEM;
828: if (pend > 0 && b[pend - 1] == '/') {
829: b[pend - 1] = '\0';
830: elem_end = 1;
831: }
1.23 schwarze 832: if (p->flags & PFLAG_EEND)
833: elem_end = 1;
1.14 schwarze 834: }
835: b[pend] = '\0';
836: if (pend < rlen)
837: increment(p, b, &pend, refill);
838: xml_attrval(p, b + poff);
839: if (elem_end)
840: xml_elem_end(p, NULL);
841:
842: /* Look for an attribute name. */
843:
844: } else if (*pstate == PARSE_TAG) {
1.23 schwarze 845: switch (p->ncur) {
846: case NODE_DOCTYPE:
847: if (b[pend] == '[') {
848: *pstate = PARSE_ELEM;
849: increment(p, b, &pend, refill);
850: continue;
851: }
852: /* FALLTHROUGH */
853: case NODE_ENTITY:
854: if (b[pend] == '"' || b[pend] == '\'') {
855: *pstate = PARSE_ARG;
856: continue;
857: }
858: break;
859: default:
860: break;
861: }
1.14 schwarze 862: if (advance(p, b, rlen, &pend, " =>", refill))
863: break;
864: elem_end = 0;
865: switch (b[pend]) {
866: case '>':
867: *pstate = PARSE_ELEM;
868: if (pend > 0 && b[pend - 1] == '/') {
869: b[pend - 1] = '\0';
870: elem_end = 1;
871: }
1.23 schwarze 872: if (p->flags & PFLAG_EEND)
873: elem_end = 1;
1.14 schwarze 874: break;
875: case '=':
876: *pstate = PARSE_ARG;
877: break;
878: default:
879: break;
880: }
881: b[pend] = '\0';
882: if (pend < rlen)
883: increment(p, b, &pend, refill);
884: xml_attrkey(p, b + poff);
885: if (elem_end)
886: xml_elem_end(p, NULL);
887:
888: /* Begin an opening or closing tag. */
889:
890: } else if (b[poff] == '<') {
891: if (advance(p, b, rlen, &pend, " >", refill))
892: break;
893: if (pend > poff + 3 &&
894: strncmp(b + poff, "<!--", 4) == 0) {
895:
896: /* Skip a comment. */
897:
898: cp = strstr(b + pend - 2, "-->");
899: if (cp == NULL) {
900: if (refill)
901: break;
902: cp = b + rlen;
903: } else
904: cp += 3;
905: while (b + pend < cp)
906: increment(p, b, &pend, refill);
907: continue;
908: }
909: elem_end = 0;
910: if (b[pend] != '>')
911: *pstate = PARSE_TAG;
912: else if (pend > 0 && b[pend - 1] == '/') {
913: b[pend - 1] = '\0';
914: elem_end = 1;
915: }
916: b[pend] = '\0';
917: if (pend < rlen)
918: increment(p, b, &pend, refill);
919: if (b[++poff] == '/') {
920: elem_end = 1;
921: poff++;
1.23 schwarze 922: } else {
1.14 schwarze 923: xml_elem_start(p, b + poff);
1.23 schwarze 924: if (*pstate == PARSE_ELEM &&
925: p->flags & PFLAG_EEND)
926: elem_end = 1;
927: }
1.14 schwarze 928: if (elem_end)
929: xml_elem_end(p, b + poff);
930:
1.23 schwarze 931: /* Close a doctype. */
932:
933: } else if (p->ncur == NODE_DOCTYPE && b[poff] == ']') {
934: *pstate = PARSE_TAG;
935: increment(p, b, &pend, refill);
936:
1.14 schwarze 937: /* Process an entity. */
938:
939: } else if (b[poff] == '&') {
940: if (advance(p, b, rlen, &pend, ";", refill))
941: break;
942: b[pend] = '\0';
943: if (pend < rlen)
944: increment(p, b, &pend, refill);
945: xml_entity(p, b + poff + 1);
946:
947: /* Process text up to the next tag, entity, or EOL. */
948:
949: } else {
1.28 schwarze 950: advance(p, b, rlen, &pend,
1.33 schwarze 951: p->ncur == NODE_DOCTYPE ? "<&]\n" : "<&\n",
1.28 schwarze 952: refill);
1.45 schwarze 953: if (p->nofill)
954: poff = pws;
1.35 schwarze 955: xml_text(p, b + poff, pend - poff);
1.33 schwarze 956: if (b[pend] == '\n')
1.37 schwarze 957: pnode_closetext(p, 0);
1.14 schwarze 958: }
1.45 schwarze 959: pws = pend;
1.14 schwarze 960: }
961: return poff;
962: }
963:
1.24 schwarze 964:
965: /*
966: * The read loop.
967: * If the previous token was incomplete and asked for more input,
968: * we have to enter the read loop once more even on EOF.
969: * Once rsz is 0, incomplete tokens will no longer ask for more input
970: * but instead use whatever there is, and then exit the read loop.
971: * The minus one on the size limit for read(2) is needed such that
972: * advance() can set b[rlen] to NUL when needed.
973: */
974: static void
975: parse_fd(struct parse *p, int fd)
1.1 schwarze 976: {
977: char b[4096];
1.5 schwarze 978: ssize_t rsz; /* Return value from read(2). */
1.14 schwarze 979: size_t rlen; /* Number of bytes in b[]. */
1.5 schwarze 980: size_t poff; /* Parse offset in b[]. */
1.14 schwarze 981: enum pstate pstate;
1.1 schwarze 982:
1.24 schwarze 983: rlen = 0;
1.14 schwarze 984: pstate = PARSE_ELEM;
985: while ((rsz = read(fd, b + rlen, sizeof(b) - rlen - 1)) >= 0 &&
986: (rlen += rsz) > 0) {
987: poff = parse_string(p, b, rlen, &pstate, rsz > 0);
1.5 schwarze 988: /* Buffer exhausted; shift left and re-fill. */
989: assert(poff > 0);
990: rlen -= poff;
1.14 schwarze 991: memmove(b, b + poff, rlen);
1.5 schwarze 992: }
1.24 schwarze 993: if (rsz < 0)
994: error_msg(p, "read: %s", strerror(errno));
995: }
996:
997: /*
998: * Open and parse a file.
999: */
1000: struct ptree *
1001: parse_file(struct parse *p, int fd, const char *fname)
1002: {
1003: const char *save_fname;
1004: int save_line, save_col;
1005:
1006: /* Save and initialize reporting data. */
1007:
1008: save_fname = p->fname;
1009: save_line = p->nline;
1010: save_col = p->ncol;
1011: p->fname = fname;
1012: p->line = 0;
1013: p->col = 0;
1014:
1015: /* Open the file, unless it is already open. */
1016:
1017: if (fd == -1 && (fd = open(fname, O_RDONLY, 0)) == -1) {
1018: error_msg(p, "open: %s", strerror(errno));
1019: p->fname = save_fname;
1020: return p->tree;
1.5 schwarze 1021: }
1.24 schwarze 1022:
1023: /*
1024: * After opening the starting file, change to the directory it
1025: * is located in, in case it wants to include any further files,
1026: * which are typically given with relative paths in DocBook.
1027: * Do this on a best-effort basis; don't complain about failure.
1028: */
1029:
1030: if (save_fname == NULL && (fname = dirname(fname)) != NULL &&
1031: strcmp(fname, ".") != 0)
1032: (void)chdir(fname);
1033:
1034: /* Run the read loop. */
1035:
1036: p->nline = 1;
1037: p->ncol = 1;
1038: parse_fd(p, fd);
1039:
1040: /* On the top level, finalize the parse tree. */
1041:
1042: if (save_fname == NULL) {
1.37 schwarze 1043: pnode_closetext(p, 0);
1.24 schwarze 1044: if (p->tree->root == NULL)
1045: error_msg(p, "empty document");
1046: else if ((p->tree->flags & TREE_CLOSED) == 0)
1047: warn_msg(p, "document not closed");
1048: pnode_unlink(p->doctype);
1049: }
1050:
1051: /* Clean up. */
1052:
1053: if (fd != STDIN_FILENO)
1054: close(fd);
1055: p->fname = save_fname;
1056: p->nline = save_line;
1057: p->ncol = save_col;
1.1 schwarze 1058: return p->tree;
1059: }
CVSweb