Annotation of docbook2mdoc/parse.c, Revision 1.60
1.60 ! schwarze 1: /* $Id: parse.c,v 1.59 2019/05/02 11:58:18 schwarze Exp $ */
1.1 schwarze 2: /*
3: * Copyright (c) 2014 Kristaps Dzonsons <kristaps@bsd.lv>
4: * Copyright (c) 2019 Ingo Schwarze <schwarze@openbsd.org>
5: *
6: * Permission to use, copy, modify, and distribute this software for any
7: * purpose with or without fee is hereby granted, provided that the above
8: * copyright notice and this permission notice appear in all copies.
9: *
10: * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
11: * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12: * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
13: * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14: * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15: * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16: * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17: */
1.53 schwarze 18: #include <sys/types.h>
19:
1.1 schwarze 20: #include <assert.h>
21: #include <ctype.h>
1.24 schwarze 22: #include <errno.h>
23: #include <fcntl.h>
24: #include <libgen.h>
1.6 schwarze 25: #include <stdarg.h>
1.1 schwarze 26: #include <stdio.h>
1.5 schwarze 27: #include <stdlib.h>
1.1 schwarze 28: #include <string.h>
29: #include <unistd.h>
30:
1.53 schwarze 31: #include "xmalloc.h"
1.1 schwarze 32: #include "node.h"
33: #include "parse.h"
34:
35: /*
36: * The implementation of the DocBook parser.
37: */
38:
/*
 * States of the tokenizer in parse_string().
 * The order is significant: every state from PARSE_ARG onwards
 * means that an attribute value is being parsed.
 */
enum pstate {
	PARSE_ELEM = 0,	/* Outside of tags: expect a tag, entity, or text. */
	PARSE_TAG,	/* Inside a tag: expect an attribute name. */
	PARSE_ARG,	/* Expect an attribute value, quoted or not. */
	PARSE_SQ,	/* Inside a single-quoted attribute value. */
	PARSE_DQ	/* Inside a double-quoted attribute value. */
};
46:
1.1 schwarze 47: /*
48: * Global parse state.
49: * Keep this as simple and small as possible.
50: */
51: struct parse {
52: const char *fname; /* Name of the input file. */
53: struct ptree *tree; /* Complete parse result. */
1.23 schwarze 54: struct pnode *doctype;
1.1 schwarze 55: struct pnode *cur; /* Current node in the tree. */
1.5 schwarze 56: enum nodeid ncur; /* Type of the current node. */
57: int line; /* Line number in the input file. */
58: int col; /* Column number in the input file. */
59: int nline; /* Line number of next token. */
60: int ncol; /* Column number of next token. */
1.4 schwarze 61: int del; /* Levels of nested nodes being deleted. */
1.45 schwarze 62: int nofill; /* Levels of open no-fill displays. */
1.23 schwarze 63: int flags;
64: #define PFLAG_WARN (1 << 0) /* Print warning messages. */
1.51 schwarze 65: #define PFLAG_LINE (1 << 1) /* New line before the next element. */
66: #define PFLAG_SPC (1 << 2) /* Whitespace before the next element. */
67: #define PFLAG_ATTR (1 << 3) /* The most recent attribute is valid. */
68: #define PFLAG_EEND (1 << 4) /* This element is self-closing. */
1.1 schwarze 69: };
70:
1.39 schwarze 71: struct alias {
1.1 schwarze 72: const char *name; /* DocBook element name. */
73: enum nodeid node; /* Node type to generate. */
74: };
75:
1.39 schwarze 76: static const struct alias aliases[] = {
1.3 schwarze 77: { "acronym", NODE_IGNORE },
1.43 schwarze 78: { "affiliation", NODE_IGNORE },
1.4 schwarze 79: { "anchor", NODE_DELETE },
1.42 schwarze 80: { "application", NODE_COMMAND },
1.22 schwarze 81: { "article", NODE_SECTION },
1.41 schwarze 82: { "articleinfo", NODE_BOOKINFO },
1.22 schwarze 83: { "book", NODE_SECTION },
1.1 schwarze 84: { "chapter", NODE_SECTION },
1.44 schwarze 85: { "caption", NODE_IGNORE },
1.13 schwarze 86: { "code", NODE_LITERAL },
1.36 schwarze 87: { "computeroutput", NODE_LITERAL },
1.23 schwarze 88: { "!doctype", NODE_DOCTYPE },
1.44 schwarze 89: { "figure", NODE_IGNORE },
1.7 schwarze 90: { "firstname", NODE_PERSONNAME },
1.21 schwarze 91: { "glossary", NODE_VARIABLELIST },
92: { "glossdef", NODE_IGNORE },
93: { "glossdiv", NODE_IGNORE },
94: { "glossentry", NODE_VARLISTENTRY },
95: { "glosslist", NODE_VARIABLELIST },
1.43 schwarze 96: { "holder", NODE_IGNORE },
1.44 schwarze 97: { "imageobject", NODE_IGNORE },
1.4 schwarze 98: { "indexterm", NODE_DELETE },
1.11 schwarze 99: { "informaltable", NODE_TABLE },
1.59 schwarze 100: { "jobtitle", NODE_IGNORE },
1.42 schwarze 101: { "keycap", NODE_KEYSYM },
102: { "keycode", NODE_IGNORE },
1.55 schwarze 103: { "keycombo", NODE_IGNORE },
1.44 schwarze 104: { "mediaobject", NODE_BLOCKQUOTE },
1.59 schwarze 105: { "orgdiv", NODE_IGNORE },
1.43 schwarze 106: { "orgname", NODE_IGNORE },
1.40 schwarze 107: { "othercredit", NODE_AUTHOR },
1.7 schwarze 108: { "othername", NODE_PERSONNAME },
1.1 schwarze 109: { "part", NODE_SECTION },
1.3 schwarze 110: { "phrase", NODE_IGNORE },
1.4 schwarze 111: { "primary", NODE_DELETE },
1.42 schwarze 112: { "property", NODE_PARAMETER },
1.52 schwarze 113: { "reference", NODE_SECTION },
1.1 schwarze 114: { "refsect1", NODE_SECTION },
115: { "refsect2", NODE_SECTION },
116: { "refsect3", NODE_SECTION },
117: { "refsection", NODE_SECTION },
1.43 schwarze 118: { "releaseinfo", NODE_IGNORE },
1.42 schwarze 119: { "returnvalue", NODE_IGNORE },
1.4 schwarze 120: { "secondary", NODE_DELETE },
1.1 schwarze 121: { "sect1", NODE_SECTION },
122: { "sect2", NODE_SECTION },
1.46 schwarze 123: { "sect3", NODE_SECTION },
124: { "sect4", NODE_SECTION },
1.36 schwarze 125: { "sgmltag", NODE_MARKUP },
1.15 schwarze 126: { "simpara", NODE_PARA },
1.13 schwarze 127: { "structfield", NODE_PARAMETER },
128: { "structname", NODE_TYPE },
1.7 schwarze 129: { "surname", NODE_PERSONNAME },
1.12 schwarze 130: { "symbol", NODE_CONSTANT },
1.48 schwarze 131: { "tag", NODE_MARKUP },
1.3 schwarze 132: { "trademark", NODE_IGNORE },
1.18 schwarze 133: { "ulink", NODE_LINK },
1.13 schwarze 134: { "userinput", NODE_LITERAL },
1.5 schwarze 135: { NULL, NODE_IGNORE }
1.1 schwarze 136: };
137:
/*
 * One entry of the entity table: the name of an XML character
 * entity and the roff(7) input used to render it.
 */
struct entity {
	const char	*name;   /* Entity name without "&" and ";". */
	const char	*roff;   /* Replacement text for the output. */
};

/*
 * XML character entity references found in the wild.
 * Where no exact mandoc_char(7) representation exists, an
 * approximation is used and the intended codepoint is noted in a
 * comment; a Unicode escape would not work for -Tascii output.
 * The table is searched linearly by xml_entity() and is
 * terminated by an entry with a NULL name.
 */
static const struct entity entities[] = {
	{ "alpha",	"\\(*a" },
	{ "amp",	"&" },
	{ "apos",	"'" },
	{ "auml",	"\\(:a" },
	{ "beta",	"\\(*b" },
	{ "circ",	"^" },		/* U+02C6 */
	{ "copy",	"\\(co" },
	{ "dagger",	"\\(dg" },
	{ "Delta",	"\\(*D" },
	{ "eacute",	"\\('e" },
	{ "emsp",	"\\ " },	/* U+2003 */
	{ "gt",		">" },
	{ "hairsp",	"\\^" },
	{ "kappa",	"\\(*k" },
	{ "larr",	"\\(<-" },
	{ "ldquo",	"\\(lq" },
	{ "le",		"\\(<=" },
	{ "lowbar",	"_" },
	{ "lsqb",	"[" },
	{ "lt",		"<" },
	{ "mdash",	"\\(em" },
	{ "minus",	"\\-" },
	{ "ndash",	"\\(en" },
	{ "nbsp",	"\\ " },
	{ "num",	"#" },
	{ "oslash",	"\\(/o" },
	{ "ouml",	"\\(:o" },
	{ "percnt",	"%" },
	{ "quot",	"\\(dq" },
	{ "rarr",	"\\(->" },
	{ "rArr",	"\\(rA" },
	{ "rdquo",	"\\(rq" },
	{ "reg",	"\\(rg" },
	{ "rho",	"\\(*r" },
	{ "rsqb",	"]" },
	{ "sigma",	"\\(*s" },
	{ "shy",	"\\&" },	/* U+00AD */
	{ "tau",	"\\(*t" },
	{ "tilde",	"\\[u02DC]" },
	{ "times",	"\\[tmu]" },
	{ "uuml",	"\\(:u" },
	{ NULL,		NULL }
};
193:
/*
 * Forward declarations for functions that are used
 * before their definitions further down in this file.
 */
static size_t	 parse_string(struct parse *p, char *b, size_t rlen,
		    enum pstate *pstate, int refill);
static void	 parse_fd(struct parse *p, int fd);
1.23 schwarze 197:
198:
1.6 schwarze 199: static void
200: error_msg(struct parse *p, const char *fmt, ...)
201: {
202: va_list ap;
203:
1.29 schwarze 204: fprintf(stderr, "%s:%d:%d: ERROR: ", p->fname, p->line, p->col);
1.6 schwarze 205: va_start(ap, fmt);
206: vfprintf(stderr, fmt, ap);
207: va_end(ap);
208: fputc('\n', stderr);
1.29 schwarze 209: p->tree->flags |= TREE_ERROR;
1.6 schwarze 210: }
211:
212: static void
213: warn_msg(struct parse *p, const char *fmt, ...)
214: {
215: va_list ap;
216:
1.23 schwarze 217: if ((p->flags & PFLAG_WARN) == 0)
1.6 schwarze 218: return;
219:
1.29 schwarze 220: fprintf(stderr, "%s:%d:%d: WARNING: ", p->fname, p->line, p->col);
1.6 schwarze 221: va_start(ap, fmt);
222: vfprintf(stderr, fmt, ap);
223: va_end(ap);
224: fputc('\n', stderr);
1.29 schwarze 225: p->tree->flags |= TREE_WARN;
1.6 schwarze 226: }
227:
1.1 schwarze 228: /*
229: * Process a string of characters.
230: * If a text node is already open, append to it.
231: * Otherwise, create a new one as a child of the current node.
232: */
233: static void
1.35 schwarze 234: xml_text(struct parse *p, const char *word, int sz)
1.1 schwarze 235: {
1.35 schwarze 236: struct pnode *n, *np;
1.32 schwarze 237: size_t oldsz, newsz;
1.35 schwarze 238: int i;
1.1 schwarze 239:
1.32 schwarze 240: assert(sz > 0);
1.30 schwarze 241: if (p->del > 0)
1.1 schwarze 242: return;
243:
1.32 schwarze 244: if ((n = p->cur) == NULL) {
1.35 schwarze 245: error_msg(p, "discarding text before document: %.*s",
246: sz, word);
1.5 schwarze 247: return;
248: }
249:
1.35 schwarze 250: /* Append to the current text node, if one is open. */
251:
252: if (n->node == NODE_TEXT) {
253: oldsz = strlen(n->b);
254: newsz = oldsz + sz;
255: if (oldsz && (p->flags & PFLAG_SPC))
256: newsz++;
1.53 schwarze 257: n->b = xrealloc(n->b, newsz + 1);
1.35 schwarze 258: if (oldsz && (p->flags & PFLAG_SPC))
259: n->b[oldsz++] = ' ';
260: memcpy(n->b + oldsz, word, sz);
261: n->b[newsz] = '\0';
1.51 schwarze 262: p->flags &= ~(PFLAG_LINE | PFLAG_SPC);
1.35 schwarze 263: return;
1.1 schwarze 264: }
265:
1.35 schwarze 266: if (p->tree->flags & TREE_CLOSED && n == p->tree->root)
1.30 schwarze 267: warn_msg(p, "text after end of document: %.*s", sz, word);
1.5 schwarze 268:
1.35 schwarze 269: /* Create a new text node. */
1.1 schwarze 270:
1.53 schwarze 271: n = pnode_alloc(p->cur);
1.35 schwarze 272: n->node = NODE_TEXT;
1.57 schwarze 273: if (p->flags & PFLAG_LINE && TAILQ_PREV(n, pnodeq, child) != NULL)
274: n->flags |= NFLAG_LINE;
275: if (p->flags & PFLAG_SPC)
276: n->flags |= NFLAG_SPC;
1.51 schwarze 277: p->flags &= ~(PFLAG_LINE | PFLAG_SPC);
1.35 schwarze 278:
279: /*
1.39 schwarze 280: * If this node follows an in-line macro without intervening
1.35 schwarze 281: * whitespace, keep the text in it as short as possible,
282: * and do not keep it open.
283: */
284:
1.51 schwarze 285: np = n->flags & NFLAG_SPC ? NULL : TAILQ_PREV(n, pnodeq, child);
1.39 schwarze 286: while (np != NULL) {
287: switch (pnode_class(np->node)) {
288: case CLASS_VOID:
289: case CLASS_TEXT:
290: case CLASS_BLOCK:
1.45 schwarze 291: case CLASS_NOFILL:
1.39 schwarze 292: np = NULL;
293: break;
294: case CLASS_TRANS:
295: np = TAILQ_LAST(&np->childq, pnodeq);
296: continue;
297: case CLASS_LINE:
298: case CLASS_ENCL:
299: break;
300: }
301: break;
302: }
303: if (np != NULL) {
1.35 schwarze 304: i = 0;
305: while (i < sz && !isspace((unsigned char)word[i]))
306: i++;
1.53 schwarze 307: n->b = xstrndup(word, i);
1.35 schwarze 308: if (i == sz)
309: return;
310: while (i < sz && isspace((unsigned char)word[i]))
311: i++;
312: if (i == sz) {
313: p->flags |= PFLAG_SPC;
314: return;
315: }
316:
317: /* Put any remaining text into a second node. */
318:
1.53 schwarze 319: n = pnode_alloc(p->cur);
1.35 schwarze 320: n->node = NODE_TEXT;
1.51 schwarze 321: n->flags |= NFLAG_SPC;
1.35 schwarze 322: word += i;
323: sz -= i;
324: }
1.53 schwarze 325: n->b = xstrndup(word, sz);
1.35 schwarze 326:
327: /* The new node remains open for later pnode_closetext(). */
328:
329: p->cur = n;
1.1 schwarze 330: }
331:
1.16 schwarze 332: /*
333: * Close out the text node and strip trailing whitespace, if one is open.
334: */
1.1 schwarze 335: static void
1.37 schwarze 336: pnode_closetext(struct parse *p, int check_last_word)
1.1 schwarze 337: {
1.16 schwarze 338: struct pnode *n;
1.37 schwarze 339: char *cp, *last_word;
1.16 schwarze 340:
341: if ((n = p->cur) == NULL || n->node != NODE_TEXT)
342: return;
343: p->cur = n->parent;
1.32 schwarze 344: for (cp = strchr(n->b, '\0');
345: cp > n->b && isspace((unsigned char)cp[-1]);
346: *--cp = '\0')
1.23 schwarze 347: p->flags |= PFLAG_SPC;
1.37 schwarze 348:
349: if (p->flags & PFLAG_SPC || !check_last_word)
350: return;
351:
352: /*
353: * Find the beginning of the last word
354: * and delete whitespace before it.
355: */
356:
357: while (cp > n->b && !isspace((unsigned char)cp[-1]))
358: cp--;
1.58 schwarze 359: last_word = cp;
360: while (cp > n->b && isspace((unsigned char)cp[-1]))
361: cp--;
1.37 schwarze 362: if (cp == n->b)
363: return;
1.58 schwarze 364: *cp = '\0';
1.37 schwarze 365:
366: /* Move the last word into its own node, for use with .Pf. */
367:
1.54 schwarze 368: n = pnode_alloc_text(p->cur, last_word);
1.51 schwarze 369: n->flags |= NFLAG_SPC;
1.1 schwarze 370: }
371:
1.9 schwarze 372: static void
373: xml_entity(struct parse *p, const char *name)
374: {
375: const struct entity *entity;
1.30 schwarze 376: struct pnode *n;
1.23 schwarze 377: const char *ccp;
378: char *cp;
1.60 ! schwarze 379: long codepoint;
1.23 schwarze 380: enum pstate pstate;
1.9 schwarze 381:
382: if (p->del > 0)
383: return;
384:
385: if (p->cur == NULL) {
386: error_msg(p, "discarding entity before document: &%s;", name);
387: return;
388: }
389:
1.37 schwarze 390: pnode_closetext(p, 0);
1.9 schwarze 391:
392: if (p->tree->flags & TREE_CLOSED && p->cur == p->tree->root)
393: warn_msg(p, "entity after end of document: &%s;", name);
394:
395: for (entity = entities; entity->name != NULL; entity++)
396: if (strcmp(name, entity->name) == 0)
397: break;
398:
399: if (entity->roff == NULL) {
1.23 schwarze 400: if (p->doctype != NULL) {
1.30 schwarze 401: TAILQ_FOREACH(n, &p->doctype->childq, child) {
402: if ((ccp = pnode_getattr_raw(n,
1.23 schwarze 403: ATTRKEY_NAME, NULL)) == NULL ||
1.25 schwarze 404: strcmp(ccp, name) != 0)
405: continue;
1.30 schwarze 406: if ((ccp = pnode_getattr_raw(n,
1.25 schwarze 407: ATTRKEY_SYSTEM, NULL)) != NULL) {
408: parse_file(p, -1, ccp);
1.51 schwarze 409: p->flags &= ~(PFLAG_LINE | PFLAG_SPC);
1.25 schwarze 410: return;
411: }
1.30 schwarze 412: if ((ccp = pnode_getattr_raw(n,
1.23 schwarze 413: ATTRKEY_DEFINITION, NULL)) == NULL)
414: continue;
1.53 schwarze 415: cp = xstrdup(ccp);
1.23 schwarze 416: pstate = PARSE_ELEM;
417: parse_string(p, cp, strlen(cp), &pstate, 0);
1.51 schwarze 418: p->flags &= ~(PFLAG_LINE | PFLAG_SPC);
1.23 schwarze 419: free(cp);
420: return;
421: }
422: }
1.49 schwarze 423: if (*name == '#') {
1.60 ! schwarze 424: errno = 0;
! 425: if (name[1] == 'x') {
! 426: ccp = name + 2;
! 427: codepoint = strtol(ccp, &cp, 16);
! 428: } else {
! 429: ccp = name + 1;
! 430: codepoint = strtol(ccp, &cp, 10);
! 431: }
! 432: if (*ccp != '\0' && *cp == '\0' && errno != ERANGE &&
! 433: codepoint >= 0 && codepoint <= 0x10ffff) {
1.53 schwarze 434: n = pnode_alloc(p->cur);
435: xasprintf(&n->b, "\\[u%4.4X]", codepoint);
1.49 schwarze 436: goto done;
437: }
438: }
1.9 schwarze 439: error_msg(p, "unknown entity &%s;", name);
440: return;
441: }
442:
443: /* Create, append, and close out an entity node. */
1.53 schwarze 444: n = pnode_alloc(p->cur);
445: n->b = xstrdup(entity->roff);
1.49 schwarze 446: done:
1.30 schwarze 447: n->node = NODE_ESCAPE;
1.57 schwarze 448: if (p->flags & PFLAG_LINE && TAILQ_PREV(n, pnodeq, child) != NULL)
449: n->flags |= NFLAG_LINE;
450: if (p->flags & PFLAG_SPC)
451: n->flags |= NFLAG_SPC;
1.51 schwarze 452: p->flags &= ~(PFLAG_LINE | PFLAG_SPC);
1.9 schwarze 453: }
454:
1.1 schwarze 455: /*
1.39 schwarze 456: * Parse an element name.
457: */
458: static enum nodeid
459: xml_name2node(struct parse *p, const char *name)
460: {
461: const struct alias *alias;
462: enum nodeid node;
463:
464: if ((node = pnode_parse(name)) < NODE_UNKNOWN)
465: return node;
466:
467: for (alias = aliases; alias->name != NULL; alias++)
468: if (strcmp(alias->name, name) == 0)
469: return alias->node;
470:
471: return NODE_UNKNOWN;
472: }
473:
474: /*
1.1 schwarze 475: * Begin an element.
476: */
477: static void
1.30 schwarze 478: xml_elem_start(struct parse *p, const char *name)
1.1 schwarze 479: {
1.30 schwarze 480: struct pnode *n;
1.1 schwarze 481:
1.4 schwarze 482: /*
483: * An ancestor is excluded from the tree;
484: * keep track of the number of levels excluded.
485: */
1.30 schwarze 486: if (p->del > 0) {
1.23 schwarze 487: if (*name != '!' && *name != '?')
1.30 schwarze 488: p->del++;
1.4 schwarze 489: return;
490: }
491:
1.39 schwarze 492: switch (p->ncur = xml_name2node(p, name)) {
1.4 schwarze 493: case NODE_DELETE_WARN:
1.30 schwarze 494: warn_msg(p, "skipping element <%s>", name);
1.2 schwarze 495: /* FALLTHROUGH */
1.4 schwarze 496: case NODE_DELETE:
1.30 schwarze 497: p->del = 1;
1.4 schwarze 498: /* FALLTHROUGH */
1.2 schwarze 499: case NODE_IGNORE:
500: return;
1.39 schwarze 501: case NODE_UNKNOWN:
502: if (*name != '!' && *name != '?')
503: error_msg(p, "unknown element <%s>", name);
504: return;
1.2 schwarze 505: default:
506: break;
507: }
1.1 schwarze 508:
1.30 schwarze 509: if (p->tree->flags & TREE_CLOSED && p->cur->parent == NULL)
510: warn_msg(p, "element after end of document: <%s>", name);
1.5 schwarze 511:
1.39 schwarze 512: switch (pnode_class(p->ncur)) {
513: case CLASS_LINE:
514: case CLASS_ENCL:
515: pnode_closetext(p, 1);
516: break;
517: default:
518: pnode_closetext(p, 0);
519: break;
520: }
521:
1.53 schwarze 522: n = pnode_alloc(p->cur);
1.57 schwarze 523: if (p->flags & PFLAG_LINE && p->cur != NULL &&
524: TAILQ_PREV(n, pnodeq, child) != NULL)
525: n->flags |= NFLAG_LINE;
526: p->flags &= ~PFLAG_LINE;
1.17 schwarze 527:
528: /*
1.39 schwarze 529: * Some elements are self-closing.
1.17 schwarze 530: * Nodes that begin a new macro or request line or start by
531: * printing text always want whitespace before themselves.
532: */
533:
1.39 schwarze 534: switch (n->node = p->ncur) {
1.23 schwarze 535: case NODE_DOCTYPE:
536: case NODE_ENTITY:
537: case NODE_SBR:
1.48 schwarze 538: case NODE_VOID:
1.30 schwarze 539: p->flags |= PFLAG_EEND;
1.17 schwarze 540: break;
541: default:
1.39 schwarze 542: break;
543: }
544: switch (pnode_class(p->ncur)) {
545: case CLASS_LINE:
546: case CLASS_ENCL:
1.57 schwarze 547: if (p->flags & PFLAG_SPC)
548: n->flags |= NFLAG_SPC;
1.17 schwarze 549: break;
1.45 schwarze 550: case CLASS_NOFILL:
551: p->nofill++;
552: /* FALLTHROUGH */
1.39 schwarze 553: default:
1.51 schwarze 554: n->flags |= NFLAG_SPC;
1.39 schwarze 555: break;
1.17 schwarze 556: }
1.30 schwarze 557: p->cur = n;
558: if (n->node == NODE_DOCTYPE) {
559: if (p->doctype == NULL)
560: p->doctype = n;
1.23 schwarze 561: else
1.30 schwarze 562: error_msg(p, "duplicate doctype");
563: } else if (n->parent == NULL && p->tree->root == NULL)
564: p->tree->root = n;
1.5 schwarze 565: }
566:
567: static void
1.30 schwarze 568: xml_attrkey(struct parse *p, const char *name)
1.5 schwarze 569: {
1.30 schwarze 570: struct pattr *a;
1.23 schwarze 571: const char *value;
1.5 schwarze 572: enum attrkey key;
1.1 schwarze 573:
1.47 schwarze 574: if (p->del > 0 || p->ncur >= NODE_UNKNOWN || *name == '\0')
1.5 schwarze 575: return;
1.23 schwarze 576:
1.30 schwarze 577: if ((p->ncur == NODE_DOCTYPE || p->ncur == NODE_ENTITY) &&
578: TAILQ_FIRST(&p->cur->attrq) == NULL) {
1.23 schwarze 579: value = name;
580: name = "NAME";
581: } else
582: value = NULL;
583:
1.5 schwarze 584: if ((key = attrkey_parse(name)) == ATTRKEY__MAX) {
1.30 schwarze 585: p->flags &= ~PFLAG_ATTR;
1.5 schwarze 586: return;
587: }
1.53 schwarze 588: a = xcalloc(1, sizeof(*a));
1.30 schwarze 589: a->key = key;
590: a->val = ATTRVAL__MAX;
1.23 schwarze 591: if (value == NULL) {
1.30 schwarze 592: a->rawval = NULL;
593: p->flags |= PFLAG_ATTR;
1.23 schwarze 594: } else {
1.53 schwarze 595: a->rawval = xstrdup(value);
1.30 schwarze 596: p->flags &= ~PFLAG_ATTR;
597: }
598: TAILQ_INSERT_TAIL(&p->cur->attrq, a, child);
599: if (p->ncur == NODE_ENTITY && key == ATTRKEY_NAME)
600: xml_attrkey(p, "DEFINITION");
1.5 schwarze 601: }
602:
603: static void
1.30 schwarze 604: xml_attrval(struct parse *p, const char *name)
1.5 schwarze 605: {
1.30 schwarze 606: struct pattr *a;
1.5 schwarze 607:
1.47 schwarze 608: if (p->del > 0 || p->ncur >= NODE_UNKNOWN ||
1.30 schwarze 609: (p->flags & PFLAG_ATTR) == 0)
1.5 schwarze 610: return;
1.30 schwarze 611: if ((a = TAILQ_LAST(&p->cur->attrq, pattrq)) == NULL)
1.5 schwarze 612: return;
1.53 schwarze 613: if ((a->val = attrval_parse(name)) == ATTRVAL__MAX)
614: a->rawval = xstrdup(name);
1.30 schwarze 615: p->flags &= ~PFLAG_ATTR;
1.1 schwarze 616: }
617:
618: /*
619: * Roll up the parse tree.
620: * If we're at a text node, roll that one up first.
621: */
622: static void
1.31 schwarze 623: xml_elem_end(struct parse *p, const char *name)
1.1 schwarze 624: {
1.26 schwarze 625: struct pnode *n;
626: const char *cp;
1.5 schwarze 627: enum nodeid node;
1.1 schwarze 628:
1.4 schwarze 629: /*
630: * An ancestor is excluded from the tree;
631: * keep track of the number of levels excluded.
632: */
1.31 schwarze 633: if (p->del > 1) {
634: p->del--;
1.4 schwarze 635: return;
636: }
637:
1.31 schwarze 638: if (p->del == 0)
1.37 schwarze 639: pnode_closetext(p, 0);
1.2 schwarze 640:
1.50 schwarze 641: n = p->cur;
1.39 schwarze 642: node = name == NULL ? p->ncur : xml_name2node(p, name);
1.2 schwarze 643:
1.5 schwarze 644: switch (node) {
1.4 schwarze 645: case NODE_DELETE_WARN:
646: case NODE_DELETE:
1.31 schwarze 647: if (p->del > 0)
648: p->del--;
1.4 schwarze 649: break;
1.2 schwarze 650: case NODE_IGNORE:
1.39 schwarze 651: case NODE_UNKNOWN:
1.26 schwarze 652: break;
653: case NODE_INCLUDE:
1.50 schwarze 654: p->cur = n->parent;
1.26 schwarze 655: cp = pnode_getattr_raw(n, ATTRKEY_HREF, NULL);
656: if (cp == NULL)
1.31 schwarze 657: error_msg(p, "<xi:include> element "
1.26 schwarze 658: "without href attribute");
659: else
1.31 schwarze 660: parse_file(p, -1, cp);
1.26 schwarze 661: pnode_unlink(n);
1.51 schwarze 662: p->flags &= ~(PFLAG_LINE | PFLAG_SPC);
1.2 schwarze 663: break;
1.23 schwarze 664: case NODE_DOCTYPE:
1.32 schwarze 665: case NODE_SBR:
1.48 schwarze 666: case NODE_VOID:
1.31 schwarze 667: p->flags &= ~PFLAG_EEND;
1.23 schwarze 668: /* FALLTHROUGH */
1.2 schwarze 669: default:
1.50 schwarze 670: if (n == NULL || node != n->node) {
1.31 schwarze 671: warn_msg(p, "element not open: </%s>", name);
1.5 schwarze 672: break;
673: }
1.45 schwarze 674: if (pnode_class(node) == CLASS_NOFILL)
675: p->nofill--;
1.5 schwarze 676:
677: /*
678: * Refrain from actually closing the document element.
679: * If no more content follows, no harm is done, but if
680: * some content still follows, simply processing it is
681: * obviously better than discarding it or crashing.
682: */
683:
1.50 schwarze 684: if (n->parent != NULL || node == NODE_DOCTYPE) {
685: p->cur = n->parent;
1.31 schwarze 686: if (p->cur != NULL)
687: p->ncur = p->cur->node;
1.23 schwarze 688: } else
1.31 schwarze 689: p->tree->flags |= TREE_CLOSED;
1.51 schwarze 690: p->flags &= ~(PFLAG_LINE | PFLAG_SPC);
1.50 schwarze 691:
692: /* Include a file containing entity declarations. */
693:
694: if (node == NODE_ENTITY && strcmp("%",
695: pnode_getattr_raw(n, ATTRKEY_NAME, "")) == 0 &&
696: (cp = pnode_getattr_raw(n, ATTRKEY_SYSTEM, NULL)) != NULL)
697: parse_file(p, -1, cp);
698:
1.4 schwarze 699: break;
1.2 schwarze 700: }
1.31 schwarze 701: assert(p->del == 0);
1.1 schwarze 702: }
703:
704: struct parse *
705: parse_alloc(int warn)
706: {
707: struct parse *p;
708:
1.53 schwarze 709: p = xcalloc(1, sizeof(*p));
710: p->tree = xcalloc(1, sizeof(*p->tree));
1.23 schwarze 711: if (warn)
712: p->flags |= PFLAG_WARN;
713: else
714: p->flags &= ~PFLAG_WARN;
1.1 schwarze 715: return p;
716: }
717:
718: void
719: parse_free(struct parse *p)
720: {
721: if (p == NULL)
722: return;
723: if (p->tree != NULL) {
724: pnode_unlink(p->tree->root);
725: free(p->tree);
726: }
727: free(p);
728: }
729:
1.14 schwarze 730: static void
731: increment(struct parse *p, char *b, size_t *pend, int refill)
732: {
733: if (refill) {
734: if (b[*pend] == '\n') {
735: p->nline++;
736: p->ncol = 1;
737: } else
738: p->ncol++;
739: }
740: ++*pend;
741: }
742:
1.5 schwarze 743: /*
744: * Advance the pend pointer to the next character in the charset.
745: * If the charset starts with a space, it stands for any whitespace.
746: * Update the new input file position, used for messages.
747: * Do not overrun the buffer b of length rlen.
748: * When reaching the end, NUL-terminate the buffer and return 1;
749: * otherwise, return 0.
750: */
751: static int
752: advance(struct parse *p, char *b, size_t rlen, size_t *pend,
1.14 schwarze 753: const char *charset, int refill)
1.5 schwarze 754: {
755: int space;
756:
757: if (*charset == ' ') {
758: space = 1;
759: charset++;
760: } else
761: space = 0;
762:
1.14 schwarze 763: if (refill) {
764: p->nline = p->line;
765: p->ncol = p->col;
766: }
1.5 schwarze 767: while (*pend < rlen) {
768: if (space && isspace((unsigned char)b[*pend]))
769: break;
770: if (strchr(charset, b[*pend]) != NULL)
771: break;
1.14 schwarze 772: increment(p, b, pend, refill);
1.5 schwarze 773: }
774: if (*pend == rlen) {
775: b[rlen] = '\0';
1.14 schwarze 776: return refill;
1.5 schwarze 777: } else
778: return 0;
779: }
780:
1.14 schwarze 781: size_t
782: parse_string(struct parse *p, char *b, size_t rlen,
783: enum pstate *pstate, int refill)
784: {
785: char *cp;
1.45 schwarze 786: size_t pws; /* Parse offset including whitespace. */
1.14 schwarze 787: size_t poff; /* Parse offset in b[]. */
788: size_t pend; /* Offset of the end of the current word. */
789: int elem_end;
790:
1.45 schwarze 791: pend = pws = 0;
1.14 schwarze 792: for (;;) {
793:
794: /* Proceed to the next token, skipping whitespace. */
795:
796: if (refill) {
797: p->line = p->nline;
798: p->col = p->ncol;
799: }
800: if ((poff = pend) == rlen)
801: break;
802: if (isspace((unsigned char)b[pend])) {
1.23 schwarze 803: p->flags |= PFLAG_SPC;
1.51 schwarze 804: if (b[pend] == '\n') {
805: p->flags |= PFLAG_LINE;
1.45 schwarze 806: pws = pend + 1;
1.51 schwarze 807: }
1.14 schwarze 808: increment(p, b, &pend, refill);
809: continue;
810: }
811:
812: /*
813: * The following four cases (ARG, TAG, and starting an
814: * entity or a tag) all parse a word or quoted string.
815: * If that extends beyond the read buffer and the last
816: * read(2) still got data, they all break out of the
817: * token loop to request more data from the read loop.
818: *
819: * Also, three of them detect self-closing tags, those
820: * ending with "/>", setting the flag elem_end and
821: * calling xml_elem_end() at the very end, after
822: * handling the attribute value, attribute name, or
823: * tag name, respectively.
824: */
825:
826: /* Parse an attribute value. */
827:
828: if (*pstate >= PARSE_ARG) {
829: if (*pstate == PARSE_ARG &&
830: (b[pend] == '\'' || b[pend] == '"')) {
831: *pstate = b[pend] == '"' ?
832: PARSE_DQ : PARSE_SQ;
833: increment(p, b, &pend, refill);
834: continue;
835: }
836: if (advance(p, b, rlen, &pend,
837: *pstate == PARSE_DQ ? "\"" :
838: *pstate == PARSE_SQ ? "'" : " >", refill))
839: break;
840: *pstate = PARSE_TAG;
841: elem_end = 0;
842: if (b[pend] == '>') {
843: *pstate = PARSE_ELEM;
844: if (pend > 0 && b[pend - 1] == '/') {
845: b[pend - 1] = '\0';
846: elem_end = 1;
847: }
1.23 schwarze 848: if (p->flags & PFLAG_EEND)
849: elem_end = 1;
1.14 schwarze 850: }
851: b[pend] = '\0';
852: if (pend < rlen)
853: increment(p, b, &pend, refill);
854: xml_attrval(p, b + poff);
855: if (elem_end)
856: xml_elem_end(p, NULL);
857:
858: /* Look for an attribute name. */
859:
860: } else if (*pstate == PARSE_TAG) {
1.23 schwarze 861: switch (p->ncur) {
862: case NODE_DOCTYPE:
863: if (b[pend] == '[') {
864: *pstate = PARSE_ELEM;
865: increment(p, b, &pend, refill);
866: continue;
867: }
868: /* FALLTHROUGH */
869: case NODE_ENTITY:
870: if (b[pend] == '"' || b[pend] == '\'') {
871: *pstate = PARSE_ARG;
872: continue;
873: }
874: break;
875: default:
876: break;
877: }
1.14 schwarze 878: if (advance(p, b, rlen, &pend, " =>", refill))
879: break;
880: elem_end = 0;
881: switch (b[pend]) {
882: case '>':
883: *pstate = PARSE_ELEM;
884: if (pend > 0 && b[pend - 1] == '/') {
885: b[pend - 1] = '\0';
886: elem_end = 1;
887: }
1.23 schwarze 888: if (p->flags & PFLAG_EEND)
889: elem_end = 1;
1.14 schwarze 890: break;
891: case '=':
892: *pstate = PARSE_ARG;
893: break;
894: default:
895: break;
896: }
897: b[pend] = '\0';
898: if (pend < rlen)
899: increment(p, b, &pend, refill);
900: xml_attrkey(p, b + poff);
901: if (elem_end)
902: xml_elem_end(p, NULL);
903:
904: /* Begin an opening or closing tag. */
905:
906: } else if (b[poff] == '<') {
907: if (advance(p, b, rlen, &pend, " >", refill))
908: break;
909: if (pend > poff + 3 &&
910: strncmp(b + poff, "<!--", 4) == 0) {
911:
912: /* Skip a comment. */
913:
914: cp = strstr(b + pend - 2, "-->");
915: if (cp == NULL) {
916: if (refill)
917: break;
918: cp = b + rlen;
919: } else
920: cp += 3;
921: while (b + pend < cp)
922: increment(p, b, &pend, refill);
923: continue;
924: }
925: elem_end = 0;
926: if (b[pend] != '>')
927: *pstate = PARSE_TAG;
928: else if (pend > 0 && b[pend - 1] == '/') {
929: b[pend - 1] = '\0';
930: elem_end = 1;
931: }
932: b[pend] = '\0';
933: if (pend < rlen)
934: increment(p, b, &pend, refill);
935: if (b[++poff] == '/') {
936: elem_end = 1;
937: poff++;
1.23 schwarze 938: } else {
1.14 schwarze 939: xml_elem_start(p, b + poff);
1.23 schwarze 940: if (*pstate == PARSE_ELEM &&
941: p->flags & PFLAG_EEND)
942: elem_end = 1;
943: }
1.14 schwarze 944: if (elem_end)
945: xml_elem_end(p, b + poff);
946:
1.23 schwarze 947: /* Close a doctype. */
948:
949: } else if (p->ncur == NODE_DOCTYPE && b[poff] == ']') {
950: *pstate = PARSE_TAG;
951: increment(p, b, &pend, refill);
952:
1.14 schwarze 953: /* Process an entity. */
954:
955: } else if (b[poff] == '&') {
956: if (advance(p, b, rlen, &pend, ";", refill))
957: break;
958: b[pend] = '\0';
959: if (pend < rlen)
960: increment(p, b, &pend, refill);
961: xml_entity(p, b + poff + 1);
962:
963: /* Process text up to the next tag, entity, or EOL. */
964:
965: } else {
1.28 schwarze 966: advance(p, b, rlen, &pend,
1.33 schwarze 967: p->ncur == NODE_DOCTYPE ? "<&]\n" : "<&\n",
1.28 schwarze 968: refill);
1.45 schwarze 969: if (p->nofill)
970: poff = pws;
1.35 schwarze 971: xml_text(p, b + poff, pend - poff);
1.33 schwarze 972: if (b[pend] == '\n')
1.37 schwarze 973: pnode_closetext(p, 0);
1.14 schwarze 974: }
1.45 schwarze 975: pws = pend;
1.14 schwarze 976: }
977: return poff;
978: }
979:
1.24 schwarze 980:
981: /*
982: * The read loop.
983: * If the previous token was incomplete and asked for more input,
984: * we have to enter the read loop once more even on EOF.
985: * Once rsz is 0, incomplete tokens will no longer ask for more input
986: * but instead use whatever there is, and then exit the read loop.
987: * The minus one on the size limit for read(2) is needed such that
988: * advance() can set b[rlen] to NUL when needed.
989: */
990: static void
991: parse_fd(struct parse *p, int fd)
1.1 schwarze 992: {
993: char b[4096];
1.5 schwarze 994: ssize_t rsz; /* Return value from read(2). */
1.14 schwarze 995: size_t rlen; /* Number of bytes in b[]. */
1.5 schwarze 996: size_t poff; /* Parse offset in b[]. */
1.14 schwarze 997: enum pstate pstate;
1.1 schwarze 998:
1.24 schwarze 999: rlen = 0;
1.14 schwarze 1000: pstate = PARSE_ELEM;
1001: while ((rsz = read(fd, b + rlen, sizeof(b) - rlen - 1)) >= 0 &&
1002: (rlen += rsz) > 0) {
1003: poff = parse_string(p, b, rlen, &pstate, rsz > 0);
1.5 schwarze 1004: /* Buffer exhausted; shift left and re-fill. */
1005: assert(poff > 0);
1006: rlen -= poff;
1.14 schwarze 1007: memmove(b, b + poff, rlen);
1.5 schwarze 1008: }
1.24 schwarze 1009: if (rsz < 0)
1010: error_msg(p, "read: %s", strerror(errno));
1011: }
1012:
1013: /*
1014: * Open and parse a file.
1015: */
1016: struct ptree *
1017: parse_file(struct parse *p, int fd, const char *fname)
1018: {
1019: const char *save_fname;
1020: int save_line, save_col;
1021:
1022: /* Save and initialize reporting data. */
1023:
1024: save_fname = p->fname;
1025: save_line = p->nline;
1026: save_col = p->ncol;
1027: p->fname = fname;
1028: p->line = 0;
1029: p->col = 0;
1030:
1031: /* Open the file, unless it is already open. */
1032:
1033: if (fd == -1 && (fd = open(fname, O_RDONLY, 0)) == -1) {
1034: error_msg(p, "open: %s", strerror(errno));
1035: p->fname = save_fname;
1036: return p->tree;
1.5 schwarze 1037: }
1.24 schwarze 1038:
1039: /*
1040: * After opening the starting file, change to the directory it
1041: * is located in, in case it wants to include any further files,
1042: * which are typically given with relative paths in DocBook.
1043: * Do this on a best-effort basis; don't complain about failure.
1044: */
1045:
1046: if (save_fname == NULL && (fname = dirname(fname)) != NULL &&
1047: strcmp(fname, ".") != 0)
1048: (void)chdir(fname);
1049:
1050: /* Run the read loop. */
1051:
1052: p->nline = 1;
1053: p->ncol = 1;
1054: parse_fd(p, fd);
1055:
1056: /* On the top level, finalize the parse tree. */
1057:
1058: if (save_fname == NULL) {
1.37 schwarze 1059: pnode_closetext(p, 0);
1.24 schwarze 1060: if (p->tree->root == NULL)
1061: error_msg(p, "empty document");
1062: else if ((p->tree->flags & TREE_CLOSED) == 0)
1063: warn_msg(p, "document not closed");
1064: pnode_unlink(p->doctype);
1065: }
1066:
1067: /* Clean up. */
1068:
1069: if (fd != STDIN_FILENO)
1070: close(fd);
1071: p->fname = save_fname;
1072: p->nline = save_line;
1073: p->ncol = save_col;
1.1 schwarze 1074: return p->tree;
1075: }
CVSweb