Annotation of docbook2mdoc/parse.c, Revision 1.57
1.57 ! schwarze 1: /* $Id: parse.c,v 1.56 2019/05/01 11:34:20 schwarze Exp $ */
1.1 schwarze 2: /*
3: * Copyright (c) 2014 Kristaps Dzonsons <kristaps@bsd.lv>
4: * Copyright (c) 2019 Ingo Schwarze <schwarze@openbsd.org>
5: *
6: * Permission to use, copy, modify, and distribute this software for any
7: * purpose with or without fee is hereby granted, provided that the above
8: * copyright notice and this permission notice appear in all copies.
9: *
10: * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
11: * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12: * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
13: * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14: * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15: * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16: * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17: */
1.53 schwarze 18: #include <sys/types.h>
19:
1.1 schwarze 20: #include <assert.h>
21: #include <ctype.h>
1.24 schwarze 22: #include <errno.h>
23: #include <fcntl.h>
24: #include <libgen.h>
1.6 schwarze 25: #include <stdarg.h>
1.1 schwarze 26: #include <stdio.h>
1.5 schwarze 27: #include <stdlib.h>
1.1 schwarze 28: #include <string.h>
29: #include <unistd.h>
30:
1.53 schwarze 31: #include "xmalloc.h"
1.1 schwarze 32: #include "node.h"
33: #include "parse.h"
34:
35: /*
36: * The implementation of the DocBook parser.
37: */
38:
/*
 * Tokenizer state of parse_string().
 * The order matters: parse_string() treats every state that
 * compares >= PARSE_ARG as "inside an attribute value".
 */
enum	pstate {
	PARSE_ELEM,	/* Outside tags: text, entities, tag starts. */
	PARSE_TAG,	/* Inside a tag: looking for an attribute name. */
	PARSE_ARG,	/* Expecting an attribute value, maybe quoted. */
	PARSE_SQ,	/* Inside a single-quoted attribute value. */
	PARSE_DQ	/* Inside a double-quoted attribute value. */
};
46:
1.1 schwarze 47: /*
48: * Global parse state.
49: * Keep this as simple and small as possible.
50: */
51: struct parse {
52: const char *fname; /* Name of the input file. */
53: struct ptree *tree; /* Complete parse result. */
1.23 schwarze 54: struct pnode *doctype;
1.1 schwarze 55: struct pnode *cur; /* Current node in the tree. */
1.5 schwarze 56: enum nodeid ncur; /* Type of the current node. */
57: int line; /* Line number in the input file. */
58: int col; /* Column number in the input file. */
59: int nline; /* Line number of next token. */
60: int ncol; /* Column number of next token. */
1.4 schwarze 61: int del; /* Levels of nested nodes being deleted. */
1.45 schwarze 62: int nofill; /* Levels of open no-fill displays. */
1.23 schwarze 63: int flags;
64: #define PFLAG_WARN (1 << 0) /* Print warning messages. */
1.51 schwarze 65: #define PFLAG_LINE (1 << 1) /* New line before the next element. */
66: #define PFLAG_SPC (1 << 2) /* Whitespace before the next element. */
67: #define PFLAG_ATTR (1 << 3) /* The most recent attribute is valid. */
68: #define PFLAG_EEND (1 << 4) /* This element is self-closing. */
1.1 schwarze 69: };
70:
1.39 schwarze 71: struct alias {
1.1 schwarze 72: const char *name; /* DocBook element name. */
73: enum nodeid node; /* Node type to generate. */
74: };
75:
1.39 schwarze 76: static const struct alias aliases[] = {
1.3 schwarze 77: { "acronym", NODE_IGNORE },
1.43 schwarze 78: { "affiliation", NODE_IGNORE },
1.4 schwarze 79: { "anchor", NODE_DELETE },
1.42 schwarze 80: { "application", NODE_COMMAND },
1.22 schwarze 81: { "article", NODE_SECTION },
1.41 schwarze 82: { "articleinfo", NODE_BOOKINFO },
1.22 schwarze 83: { "book", NODE_SECTION },
1.1 schwarze 84: { "chapter", NODE_SECTION },
1.44 schwarze 85: { "caption", NODE_IGNORE },
1.13 schwarze 86: { "code", NODE_LITERAL },
1.36 schwarze 87: { "computeroutput", NODE_LITERAL },
1.23 schwarze 88: { "!doctype", NODE_DOCTYPE },
1.44 schwarze 89: { "figure", NODE_IGNORE },
1.7 schwarze 90: { "firstname", NODE_PERSONNAME },
1.21 schwarze 91: { "glossary", NODE_VARIABLELIST },
92: { "glossdef", NODE_IGNORE },
93: { "glossdiv", NODE_IGNORE },
94: { "glossentry", NODE_VARLISTENTRY },
95: { "glosslist", NODE_VARIABLELIST },
1.43 schwarze 96: { "holder", NODE_IGNORE },
1.44 schwarze 97: { "imageobject", NODE_IGNORE },
1.4 schwarze 98: { "indexterm", NODE_DELETE },
1.11 schwarze 99: { "informaltable", NODE_TABLE },
1.42 schwarze 100: { "keycap", NODE_KEYSYM },
101: { "keycode", NODE_IGNORE },
1.55 schwarze 102: { "keycombo", NODE_IGNORE },
1.44 schwarze 103: { "mediaobject", NODE_BLOCKQUOTE },
1.43 schwarze 104: { "orgname", NODE_IGNORE },
1.40 schwarze 105: { "othercredit", NODE_AUTHOR },
1.7 schwarze 106: { "othername", NODE_PERSONNAME },
1.1 schwarze 107: { "part", NODE_SECTION },
1.3 schwarze 108: { "phrase", NODE_IGNORE },
1.4 schwarze 109: { "primary", NODE_DELETE },
1.42 schwarze 110: { "property", NODE_PARAMETER },
1.52 schwarze 111: { "reference", NODE_SECTION },
1.1 schwarze 112: { "refsect1", NODE_SECTION },
113: { "refsect2", NODE_SECTION },
114: { "refsect3", NODE_SECTION },
115: { "refsection", NODE_SECTION },
1.43 schwarze 116: { "releaseinfo", NODE_IGNORE },
1.42 schwarze 117: { "returnvalue", NODE_IGNORE },
1.4 schwarze 118: { "secondary", NODE_DELETE },
1.1 schwarze 119: { "sect1", NODE_SECTION },
120: { "sect2", NODE_SECTION },
1.46 schwarze 121: { "sect3", NODE_SECTION },
122: { "sect4", NODE_SECTION },
1.36 schwarze 123: { "sgmltag", NODE_MARKUP },
1.15 schwarze 124: { "simpara", NODE_PARA },
1.13 schwarze 125: { "structfield", NODE_PARAMETER },
126: { "structname", NODE_TYPE },
1.7 schwarze 127: { "surname", NODE_PERSONNAME },
1.12 schwarze 128: { "symbol", NODE_CONSTANT },
1.48 schwarze 129: { "tag", NODE_MARKUP },
1.3 schwarze 130: { "trademark", NODE_IGNORE },
1.18 schwarze 131: { "ulink", NODE_LINK },
1.13 schwarze 132: { "userinput", NODE_LITERAL },
1.5 schwarze 133: { NULL, NODE_IGNORE }
1.1 schwarze 134: };
135:
/*
 * One entry of the character entity table.
 */
struct	entity {
	const char	*name;   /* Entity name as written between & and ;. */
	const char	*roff;   /* Replacement text stored in the node. */
};
140:
141: /*
142: * XML character entity references found in the wild.
143: * Those that don't have an exact mandoc_char(7) representation
144: * are approximated, and the desired codepoint is given as a comment.
145: * Encoding them as \\[u...] would leave -Tascii out in the cold.
146: */
147: static const struct entity entities[] = {
148: { "alpha", "\\(*a" },
149: { "amp", "&" },
150: { "apos", "'" },
151: { "auml", "\\(:a" },
152: { "beta", "\\(*b" },
153: { "circ", "^" }, /* U+02C6 */
154: { "copy", "\\(co" },
155: { "dagger", "\\(dg" },
156: { "Delta", "\\(*D" },
157: { "eacute", "\\('e" },
158: { "emsp", "\\ " }, /* U+2003 */
159: { "gt", ">" },
160: { "hairsp", "\\^" },
161: { "kappa", "\\(*k" },
162: { "larr", "\\(<-" },
163: { "ldquo", "\\(lq" },
164: { "le", "\\(<=" },
165: { "lowbar", "_" },
166: { "lsqb", "[" },
167: { "lt", "<" },
168: { "mdash", "\\(em" },
169: { "minus", "\\-" },
170: { "ndash", "\\(en" },
171: { "nbsp", "\\ " },
172: { "num", "#" },
173: { "oslash", "\\(/o" },
174: { "ouml", "\\(:o" },
175: { "percnt", "%" },
176: { "quot", "\\(dq" },
177: { "rarr", "\\(->" },
178: { "rArr", "\\(rA" },
179: { "rdquo", "\\(rq" },
180: { "reg", "\\(rg" },
181: { "rho", "\\(*r" },
182: { "rsqb", "]" },
183: { "sigma", "\\(*s" },
184: { "shy", "\\&" }, /* U+00AD */
185: { "tau", "\\(*t" },
186: { "tilde", "\\[u02DC]" },
187: { "times", "\\[tmu]" },
188: { "uuml", "\\(:u" },
189: { NULL, NULL }
190: };
191:
/* Forward declarations: both are used before they are defined. */
static	size_t	 parse_string(struct parse *p, char *b, size_t rlen,
			enum pstate *pstate, int refill);
static	void	 parse_fd(struct parse *p, int fd);
1.23 schwarze 195:
196:
1.6 schwarze 197: static void
198: error_msg(struct parse *p, const char *fmt, ...)
199: {
200: va_list ap;
201:
1.29 schwarze 202: fprintf(stderr, "%s:%d:%d: ERROR: ", p->fname, p->line, p->col);
1.6 schwarze 203: va_start(ap, fmt);
204: vfprintf(stderr, fmt, ap);
205: va_end(ap);
206: fputc('\n', stderr);
1.29 schwarze 207: p->tree->flags |= TREE_ERROR;
1.6 schwarze 208: }
209:
210: static void
211: warn_msg(struct parse *p, const char *fmt, ...)
212: {
213: va_list ap;
214:
1.23 schwarze 215: if ((p->flags & PFLAG_WARN) == 0)
1.6 schwarze 216: return;
217:
1.29 schwarze 218: fprintf(stderr, "%s:%d:%d: WARNING: ", p->fname, p->line, p->col);
1.6 schwarze 219: va_start(ap, fmt);
220: vfprintf(stderr, fmt, ap);
221: va_end(ap);
222: fputc('\n', stderr);
1.29 schwarze 223: p->tree->flags |= TREE_WARN;
1.6 schwarze 224: }
225:
1.1 schwarze 226: /*
227: * Process a string of characters.
228: * If a text node is already open, append to it.
229: * Otherwise, create a new one as a child of the current node.
230: */
231: static void
1.35 schwarze 232: xml_text(struct parse *p, const char *word, int sz)
1.1 schwarze 233: {
1.35 schwarze 234: struct pnode *n, *np;
1.32 schwarze 235: size_t oldsz, newsz;
1.35 schwarze 236: int i;
1.1 schwarze 237:
1.32 schwarze 238: assert(sz > 0);
1.30 schwarze 239: if (p->del > 0)
1.1 schwarze 240: return;
241:
1.32 schwarze 242: if ((n = p->cur) == NULL) {
1.35 schwarze 243: error_msg(p, "discarding text before document: %.*s",
244: sz, word);
1.5 schwarze 245: return;
246: }
247:
1.35 schwarze 248: /* Append to the current text node, if one is open. */
249:
250: if (n->node == NODE_TEXT) {
251: oldsz = strlen(n->b);
252: newsz = oldsz + sz;
253: if (oldsz && (p->flags & PFLAG_SPC))
254: newsz++;
1.53 schwarze 255: n->b = xrealloc(n->b, newsz + 1);
1.35 schwarze 256: if (oldsz && (p->flags & PFLAG_SPC))
257: n->b[oldsz++] = ' ';
258: memcpy(n->b + oldsz, word, sz);
259: n->b[newsz] = '\0';
1.51 schwarze 260: p->flags &= ~(PFLAG_LINE | PFLAG_SPC);
1.35 schwarze 261: return;
1.1 schwarze 262: }
263:
1.35 schwarze 264: if (p->tree->flags & TREE_CLOSED && n == p->tree->root)
1.30 schwarze 265: warn_msg(p, "text after end of document: %.*s", sz, word);
1.5 schwarze 266:
1.35 schwarze 267: /* Create a new text node. */
1.1 schwarze 268:
1.53 schwarze 269: n = pnode_alloc(p->cur);
1.35 schwarze 270: n->node = NODE_TEXT;
1.57 ! schwarze 271: if (p->flags & PFLAG_LINE && TAILQ_PREV(n, pnodeq, child) != NULL)
! 272: n->flags |= NFLAG_LINE;
! 273: if (p->flags & PFLAG_SPC)
! 274: n->flags |= NFLAG_SPC;
1.51 schwarze 275: p->flags &= ~(PFLAG_LINE | PFLAG_SPC);
1.35 schwarze 276:
277: /*
1.39 schwarze 278: * If this node follows an in-line macro without intervening
1.35 schwarze 279: * whitespace, keep the text in it as short as possible,
280: * and do not keep it open.
281: */
282:
1.51 schwarze 283: np = n->flags & NFLAG_SPC ? NULL : TAILQ_PREV(n, pnodeq, child);
1.39 schwarze 284: while (np != NULL) {
285: switch (pnode_class(np->node)) {
286: case CLASS_VOID:
287: case CLASS_TEXT:
288: case CLASS_BLOCK:
1.45 schwarze 289: case CLASS_NOFILL:
1.39 schwarze 290: np = NULL;
291: break;
292: case CLASS_TRANS:
293: np = TAILQ_LAST(&np->childq, pnodeq);
294: continue;
295: case CLASS_LINE:
296: case CLASS_ENCL:
297: break;
298: }
299: break;
300: }
301: if (np != NULL) {
1.35 schwarze 302: i = 0;
303: while (i < sz && !isspace((unsigned char)word[i]))
304: i++;
1.53 schwarze 305: n->b = xstrndup(word, i);
1.35 schwarze 306: if (i == sz)
307: return;
308: while (i < sz && isspace((unsigned char)word[i]))
309: i++;
310: if (i == sz) {
311: p->flags |= PFLAG_SPC;
312: return;
313: }
314:
315: /* Put any remaining text into a second node. */
316:
1.53 schwarze 317: n = pnode_alloc(p->cur);
1.35 schwarze 318: n->node = NODE_TEXT;
1.51 schwarze 319: n->flags |= NFLAG_SPC;
1.35 schwarze 320: word += i;
321: sz -= i;
322: }
1.53 schwarze 323: n->b = xstrndup(word, sz);
1.35 schwarze 324:
325: /* The new node remains open for later pnode_closetext(). */
326:
327: p->cur = n;
1.1 schwarze 328: }
329:
1.16 schwarze 330: /*
331: * Close out the text node and strip trailing whitespace, if one is open.
332: */
1.1 schwarze 333: static void
1.37 schwarze 334: pnode_closetext(struct parse *p, int check_last_word)
1.1 schwarze 335: {
1.16 schwarze 336: struct pnode *n;
1.37 schwarze 337: char *cp, *last_word;
1.16 schwarze 338:
339: if ((n = p->cur) == NULL || n->node != NODE_TEXT)
340: return;
341: p->cur = n->parent;
1.32 schwarze 342: for (cp = strchr(n->b, '\0');
343: cp > n->b && isspace((unsigned char)cp[-1]);
344: *--cp = '\0')
1.23 schwarze 345: p->flags |= PFLAG_SPC;
1.37 schwarze 346:
347: if (p->flags & PFLAG_SPC || !check_last_word)
348: return;
349:
350: /*
351: * Find the beginning of the last word
352: * and delete whitespace before it.
353: */
354:
355: while (cp > n->b && !isspace((unsigned char)cp[-1]))
356: cp--;
357: if (cp == n->b)
358: return;
359:
360: last_word = cp;
361: while (cp > n->b && isspace((unsigned char)cp[-1]))
362: *--cp = '\0';
363:
364: /* Move the last word into its own node, for use with .Pf. */
365:
1.54 schwarze 366: n = pnode_alloc_text(p->cur, last_word);
1.51 schwarze 367: n->flags |= NFLAG_SPC;
1.1 schwarze 368: }
369:
1.9 schwarze 370: static void
371: xml_entity(struct parse *p, const char *name)
372: {
373: const struct entity *entity;
1.30 schwarze 374: struct pnode *n;
1.23 schwarze 375: const char *ccp;
376: char *cp;
1.49 schwarze 377: unsigned int codepoint;
1.23 schwarze 378: enum pstate pstate;
1.9 schwarze 379:
380: if (p->del > 0)
381: return;
382:
383: if (p->cur == NULL) {
384: error_msg(p, "discarding entity before document: &%s;", name);
385: return;
386: }
387:
1.37 schwarze 388: pnode_closetext(p, 0);
1.9 schwarze 389:
390: if (p->tree->flags & TREE_CLOSED && p->cur == p->tree->root)
391: warn_msg(p, "entity after end of document: &%s;", name);
392:
393: for (entity = entities; entity->name != NULL; entity++)
394: if (strcmp(name, entity->name) == 0)
395: break;
396:
397: if (entity->roff == NULL) {
1.23 schwarze 398: if (p->doctype != NULL) {
1.30 schwarze 399: TAILQ_FOREACH(n, &p->doctype->childq, child) {
400: if ((ccp = pnode_getattr_raw(n,
1.23 schwarze 401: ATTRKEY_NAME, NULL)) == NULL ||
1.25 schwarze 402: strcmp(ccp, name) != 0)
403: continue;
1.30 schwarze 404: if ((ccp = pnode_getattr_raw(n,
1.25 schwarze 405: ATTRKEY_SYSTEM, NULL)) != NULL) {
406: parse_file(p, -1, ccp);
1.51 schwarze 407: p->flags &= ~(PFLAG_LINE | PFLAG_SPC);
1.25 schwarze 408: return;
409: }
1.30 schwarze 410: if ((ccp = pnode_getattr_raw(n,
1.23 schwarze 411: ATTRKEY_DEFINITION, NULL)) == NULL)
412: continue;
1.53 schwarze 413: cp = xstrdup(ccp);
1.23 schwarze 414: pstate = PARSE_ELEM;
415: parse_string(p, cp, strlen(cp), &pstate, 0);
1.51 schwarze 416: p->flags &= ~(PFLAG_LINE | PFLAG_SPC);
1.23 schwarze 417: free(cp);
418: return;
419: }
420: }
1.49 schwarze 421: if (*name == '#') {
422: codepoint = strtonum(name + 1, 0, 0x10ffff, &ccp);
423: if (ccp == NULL) {
1.53 schwarze 424: n = pnode_alloc(p->cur);
425: xasprintf(&n->b, "\\[u%4.4X]", codepoint);
1.49 schwarze 426: goto done;
427: }
428: }
1.9 schwarze 429: error_msg(p, "unknown entity &%s;", name);
430: return;
431: }
432:
433: /* Create, append, and close out an entity node. */
1.53 schwarze 434: n = pnode_alloc(p->cur);
435: n->b = xstrdup(entity->roff);
1.49 schwarze 436: done:
1.30 schwarze 437: n->node = NODE_ESCAPE;
1.57 ! schwarze 438: if (p->flags & PFLAG_LINE && TAILQ_PREV(n, pnodeq, child) != NULL)
! 439: n->flags |= NFLAG_LINE;
! 440: if (p->flags & PFLAG_SPC)
! 441: n->flags |= NFLAG_SPC;
1.51 schwarze 442: p->flags &= ~(PFLAG_LINE | PFLAG_SPC);
1.9 schwarze 443: }
444:
1.1 schwarze 445: /*
1.39 schwarze 446: * Parse an element name.
447: */
448: static enum nodeid
449: xml_name2node(struct parse *p, const char *name)
450: {
451: const struct alias *alias;
452: enum nodeid node;
453:
454: if ((node = pnode_parse(name)) < NODE_UNKNOWN)
455: return node;
456:
457: for (alias = aliases; alias->name != NULL; alias++)
458: if (strcmp(alias->name, name) == 0)
459: return alias->node;
460:
461: return NODE_UNKNOWN;
462: }
463:
464: /*
1.1 schwarze 465: * Begin an element.
466: */
467: static void
1.30 schwarze 468: xml_elem_start(struct parse *p, const char *name)
1.1 schwarze 469: {
1.30 schwarze 470: struct pnode *n;
1.1 schwarze 471:
1.4 schwarze 472: /*
473: * An ancestor is excluded from the tree;
474: * keep track of the number of levels excluded.
475: */
1.30 schwarze 476: if (p->del > 0) {
1.23 schwarze 477: if (*name != '!' && *name != '?')
1.30 schwarze 478: p->del++;
1.4 schwarze 479: return;
480: }
481:
1.39 schwarze 482: switch (p->ncur = xml_name2node(p, name)) {
1.4 schwarze 483: case NODE_DELETE_WARN:
1.30 schwarze 484: warn_msg(p, "skipping element <%s>", name);
1.2 schwarze 485: /* FALLTHROUGH */
1.4 schwarze 486: case NODE_DELETE:
1.30 schwarze 487: p->del = 1;
1.4 schwarze 488: /* FALLTHROUGH */
1.2 schwarze 489: case NODE_IGNORE:
490: return;
1.39 schwarze 491: case NODE_UNKNOWN:
492: if (*name != '!' && *name != '?')
493: error_msg(p, "unknown element <%s>", name);
494: return;
1.2 schwarze 495: default:
496: break;
497: }
1.1 schwarze 498:
1.30 schwarze 499: if (p->tree->flags & TREE_CLOSED && p->cur->parent == NULL)
500: warn_msg(p, "element after end of document: <%s>", name);
1.5 schwarze 501:
1.39 schwarze 502: switch (pnode_class(p->ncur)) {
503: case CLASS_LINE:
504: case CLASS_ENCL:
505: pnode_closetext(p, 1);
506: break;
507: default:
508: pnode_closetext(p, 0);
509: break;
510: }
511:
1.53 schwarze 512: n = pnode_alloc(p->cur);
1.57 ! schwarze 513: if (p->flags & PFLAG_LINE && p->cur != NULL &&
! 514: TAILQ_PREV(n, pnodeq, child) != NULL)
! 515: n->flags |= NFLAG_LINE;
! 516: p->flags &= ~PFLAG_LINE;
1.17 schwarze 517:
518: /*
1.39 schwarze 519: * Some elements are self-closing.
1.17 schwarze 520: * Nodes that begin a new macro or request line or start by
521: * printing text always want whitespace before themselves.
522: */
523:
1.39 schwarze 524: switch (n->node = p->ncur) {
1.23 schwarze 525: case NODE_DOCTYPE:
526: case NODE_ENTITY:
527: case NODE_SBR:
1.48 schwarze 528: case NODE_VOID:
1.30 schwarze 529: p->flags |= PFLAG_EEND;
1.17 schwarze 530: break;
531: default:
1.39 schwarze 532: break;
533: }
534: switch (pnode_class(p->ncur)) {
535: case CLASS_LINE:
536: case CLASS_ENCL:
1.57 ! schwarze 537: if (p->flags & PFLAG_SPC)
! 538: n->flags |= NFLAG_SPC;
1.17 schwarze 539: break;
1.45 schwarze 540: case CLASS_NOFILL:
541: p->nofill++;
542: /* FALLTHROUGH */
1.39 schwarze 543: default:
1.51 schwarze 544: n->flags |= NFLAG_SPC;
1.39 schwarze 545: break;
1.17 schwarze 546: }
1.30 schwarze 547: p->cur = n;
548: if (n->node == NODE_DOCTYPE) {
549: if (p->doctype == NULL)
550: p->doctype = n;
1.23 schwarze 551: else
1.30 schwarze 552: error_msg(p, "duplicate doctype");
553: } else if (n->parent == NULL && p->tree->root == NULL)
554: p->tree->root = n;
1.5 schwarze 555: }
556:
557: static void
1.30 schwarze 558: xml_attrkey(struct parse *p, const char *name)
1.5 schwarze 559: {
1.30 schwarze 560: struct pattr *a;
1.23 schwarze 561: const char *value;
1.5 schwarze 562: enum attrkey key;
1.1 schwarze 563:
1.47 schwarze 564: if (p->del > 0 || p->ncur >= NODE_UNKNOWN || *name == '\0')
1.5 schwarze 565: return;
1.23 schwarze 566:
1.30 schwarze 567: if ((p->ncur == NODE_DOCTYPE || p->ncur == NODE_ENTITY) &&
568: TAILQ_FIRST(&p->cur->attrq) == NULL) {
1.23 schwarze 569: value = name;
570: name = "NAME";
571: } else
572: value = NULL;
573:
1.5 schwarze 574: if ((key = attrkey_parse(name)) == ATTRKEY__MAX) {
1.30 schwarze 575: p->flags &= ~PFLAG_ATTR;
1.5 schwarze 576: return;
577: }
1.53 schwarze 578: a = xcalloc(1, sizeof(*a));
1.30 schwarze 579: a->key = key;
580: a->val = ATTRVAL__MAX;
1.23 schwarze 581: if (value == NULL) {
1.30 schwarze 582: a->rawval = NULL;
583: p->flags |= PFLAG_ATTR;
1.23 schwarze 584: } else {
1.53 schwarze 585: a->rawval = xstrdup(value);
1.30 schwarze 586: p->flags &= ~PFLAG_ATTR;
587: }
588: TAILQ_INSERT_TAIL(&p->cur->attrq, a, child);
589: if (p->ncur == NODE_ENTITY && key == ATTRKEY_NAME)
590: xml_attrkey(p, "DEFINITION");
1.5 schwarze 591: }
592:
593: static void
1.30 schwarze 594: xml_attrval(struct parse *p, const char *name)
1.5 schwarze 595: {
1.30 schwarze 596: struct pattr *a;
1.5 schwarze 597:
1.47 schwarze 598: if (p->del > 0 || p->ncur >= NODE_UNKNOWN ||
1.30 schwarze 599: (p->flags & PFLAG_ATTR) == 0)
1.5 schwarze 600: return;
1.30 schwarze 601: if ((a = TAILQ_LAST(&p->cur->attrq, pattrq)) == NULL)
1.5 schwarze 602: return;
1.53 schwarze 603: if ((a->val = attrval_parse(name)) == ATTRVAL__MAX)
604: a->rawval = xstrdup(name);
1.30 schwarze 605: p->flags &= ~PFLAG_ATTR;
1.1 schwarze 606: }
607:
608: /*
609: * Roll up the parse tree.
610: * If we're at a text node, roll that one up first.
611: */
612: static void
1.31 schwarze 613: xml_elem_end(struct parse *p, const char *name)
1.1 schwarze 614: {
1.26 schwarze 615: struct pnode *n;
616: const char *cp;
1.5 schwarze 617: enum nodeid node;
1.1 schwarze 618:
1.4 schwarze 619: /*
620: * An ancestor is excluded from the tree;
621: * keep track of the number of levels excluded.
622: */
1.31 schwarze 623: if (p->del > 1) {
624: p->del--;
1.4 schwarze 625: return;
626: }
627:
1.31 schwarze 628: if (p->del == 0)
1.37 schwarze 629: pnode_closetext(p, 0);
1.2 schwarze 630:
1.50 schwarze 631: n = p->cur;
1.39 schwarze 632: node = name == NULL ? p->ncur : xml_name2node(p, name);
1.2 schwarze 633:
1.5 schwarze 634: switch (node) {
1.4 schwarze 635: case NODE_DELETE_WARN:
636: case NODE_DELETE:
1.31 schwarze 637: if (p->del > 0)
638: p->del--;
1.4 schwarze 639: break;
1.2 schwarze 640: case NODE_IGNORE:
1.39 schwarze 641: case NODE_UNKNOWN:
1.26 schwarze 642: break;
643: case NODE_INCLUDE:
1.50 schwarze 644: p->cur = n->parent;
1.26 schwarze 645: cp = pnode_getattr_raw(n, ATTRKEY_HREF, NULL);
646: if (cp == NULL)
1.31 schwarze 647: error_msg(p, "<xi:include> element "
1.26 schwarze 648: "without href attribute");
649: else
1.31 schwarze 650: parse_file(p, -1, cp);
1.26 schwarze 651: pnode_unlink(n);
1.51 schwarze 652: p->flags &= ~(PFLAG_LINE | PFLAG_SPC);
1.2 schwarze 653: break;
1.23 schwarze 654: case NODE_DOCTYPE:
1.32 schwarze 655: case NODE_SBR:
1.48 schwarze 656: case NODE_VOID:
1.31 schwarze 657: p->flags &= ~PFLAG_EEND;
1.23 schwarze 658: /* FALLTHROUGH */
1.2 schwarze 659: default:
1.50 schwarze 660: if (n == NULL || node != n->node) {
1.31 schwarze 661: warn_msg(p, "element not open: </%s>", name);
1.5 schwarze 662: break;
663: }
1.45 schwarze 664: if (pnode_class(node) == CLASS_NOFILL)
665: p->nofill--;
1.5 schwarze 666:
667: /*
668: * Refrain from actually closing the document element.
669: * If no more content follows, no harm is done, but if
670: * some content still follows, simply processing it is
671: * obviously better than discarding it or crashing.
672: */
673:
1.50 schwarze 674: if (n->parent != NULL || node == NODE_DOCTYPE) {
675: p->cur = n->parent;
1.31 schwarze 676: if (p->cur != NULL)
677: p->ncur = p->cur->node;
1.23 schwarze 678: } else
1.31 schwarze 679: p->tree->flags |= TREE_CLOSED;
1.51 schwarze 680: p->flags &= ~(PFLAG_LINE | PFLAG_SPC);
1.50 schwarze 681:
682: /* Include a file containing entity declarations. */
683:
684: if (node == NODE_ENTITY && strcmp("%",
685: pnode_getattr_raw(n, ATTRKEY_NAME, "")) == 0 &&
686: (cp = pnode_getattr_raw(n, ATTRKEY_SYSTEM, NULL)) != NULL)
687: parse_file(p, -1, cp);
688:
1.4 schwarze 689: break;
1.2 schwarze 690: }
1.31 schwarze 691: assert(p->del == 0);
1.1 schwarze 692: }
693:
694: struct parse *
695: parse_alloc(int warn)
696: {
697: struct parse *p;
698:
1.53 schwarze 699: p = xcalloc(1, sizeof(*p));
700: p->tree = xcalloc(1, sizeof(*p->tree));
1.23 schwarze 701: if (warn)
702: p->flags |= PFLAG_WARN;
703: else
704: p->flags &= ~PFLAG_WARN;
1.1 schwarze 705: return p;
706: }
707:
708: void
709: parse_free(struct parse *p)
710: {
711: if (p == NULL)
712: return;
713: if (p->tree != NULL) {
714: pnode_unlink(p->tree->root);
715: free(p->tree);
716: }
717: free(p);
718: }
719:
1.14 schwarze 720: static void
721: increment(struct parse *p, char *b, size_t *pend, int refill)
722: {
723: if (refill) {
724: if (b[*pend] == '\n') {
725: p->nline++;
726: p->ncol = 1;
727: } else
728: p->ncol++;
729: }
730: ++*pend;
731: }
732:
1.5 schwarze 733: /*
734: * Advance the pend pointer to the next character in the charset.
735: * If the charset starts with a space, it stands for any whitespace.
736: * Update the new input file position, used for messages.
737: * Do not overrun the buffer b of length rlen.
738: * When reaching the end, NUL-terminate the buffer and return 1;
739: * otherwise, return 0.
740: */
741: static int
742: advance(struct parse *p, char *b, size_t rlen, size_t *pend,
1.14 schwarze 743: const char *charset, int refill)
1.5 schwarze 744: {
745: int space;
746:
747: if (*charset == ' ') {
748: space = 1;
749: charset++;
750: } else
751: space = 0;
752:
1.14 schwarze 753: if (refill) {
754: p->nline = p->line;
755: p->ncol = p->col;
756: }
1.5 schwarze 757: while (*pend < rlen) {
758: if (space && isspace((unsigned char)b[*pend]))
759: break;
760: if (strchr(charset, b[*pend]) != NULL)
761: break;
1.14 schwarze 762: increment(p, b, pend, refill);
1.5 schwarze 763: }
764: if (*pend == rlen) {
765: b[rlen] = '\0';
1.14 schwarze 766: return refill;
1.5 schwarze 767: } else
768: return 0;
769: }
770:
1.14 schwarze 771: size_t
772: parse_string(struct parse *p, char *b, size_t rlen,
773: enum pstate *pstate, int refill)
774: {
775: char *cp;
1.45 schwarze 776: size_t pws; /* Parse offset including whitespace. */
1.14 schwarze 777: size_t poff; /* Parse offset in b[]. */
778: size_t pend; /* Offset of the end of the current word. */
779: int elem_end;
780:
1.45 schwarze 781: pend = pws = 0;
1.14 schwarze 782: for (;;) {
783:
784: /* Proceed to the next token, skipping whitespace. */
785:
786: if (refill) {
787: p->line = p->nline;
788: p->col = p->ncol;
789: }
790: if ((poff = pend) == rlen)
791: break;
792: if (isspace((unsigned char)b[pend])) {
1.23 schwarze 793: p->flags |= PFLAG_SPC;
1.51 schwarze 794: if (b[pend] == '\n') {
795: p->flags |= PFLAG_LINE;
1.45 schwarze 796: pws = pend + 1;
1.51 schwarze 797: }
1.14 schwarze 798: increment(p, b, &pend, refill);
799: continue;
800: }
801:
802: /*
803: * The following four cases (ARG, TAG, and starting an
804: * entity or a tag) all parse a word or quoted string.
805: * If that extends beyond the read buffer and the last
806: * read(2) still got data, they all break out of the
807: * token loop to request more data from the read loop.
808: *
809: * Also, three of them detect self-closing tags, those
810: * ending with "/>", setting the flag elem_end and
811: * calling xml_elem_end() at the very end, after
812: * handling the attribute value, attribute name, or
813: * tag name, respectively.
814: */
815:
816: /* Parse an attribute value. */
817:
818: if (*pstate >= PARSE_ARG) {
819: if (*pstate == PARSE_ARG &&
820: (b[pend] == '\'' || b[pend] == '"')) {
821: *pstate = b[pend] == '"' ?
822: PARSE_DQ : PARSE_SQ;
823: increment(p, b, &pend, refill);
824: continue;
825: }
826: if (advance(p, b, rlen, &pend,
827: *pstate == PARSE_DQ ? "\"" :
828: *pstate == PARSE_SQ ? "'" : " >", refill))
829: break;
830: *pstate = PARSE_TAG;
831: elem_end = 0;
832: if (b[pend] == '>') {
833: *pstate = PARSE_ELEM;
834: if (pend > 0 && b[pend - 1] == '/') {
835: b[pend - 1] = '\0';
836: elem_end = 1;
837: }
1.23 schwarze 838: if (p->flags & PFLAG_EEND)
839: elem_end = 1;
1.14 schwarze 840: }
841: b[pend] = '\0';
842: if (pend < rlen)
843: increment(p, b, &pend, refill);
844: xml_attrval(p, b + poff);
845: if (elem_end)
846: xml_elem_end(p, NULL);
847:
848: /* Look for an attribute name. */
849:
850: } else if (*pstate == PARSE_TAG) {
1.23 schwarze 851: switch (p->ncur) {
852: case NODE_DOCTYPE:
853: if (b[pend] == '[') {
854: *pstate = PARSE_ELEM;
855: increment(p, b, &pend, refill);
856: continue;
857: }
858: /* FALLTHROUGH */
859: case NODE_ENTITY:
860: if (b[pend] == '"' || b[pend] == '\'') {
861: *pstate = PARSE_ARG;
862: continue;
863: }
864: break;
865: default:
866: break;
867: }
1.14 schwarze 868: if (advance(p, b, rlen, &pend, " =>", refill))
869: break;
870: elem_end = 0;
871: switch (b[pend]) {
872: case '>':
873: *pstate = PARSE_ELEM;
874: if (pend > 0 && b[pend - 1] == '/') {
875: b[pend - 1] = '\0';
876: elem_end = 1;
877: }
1.23 schwarze 878: if (p->flags & PFLAG_EEND)
879: elem_end = 1;
1.14 schwarze 880: break;
881: case '=':
882: *pstate = PARSE_ARG;
883: break;
884: default:
885: break;
886: }
887: b[pend] = '\0';
888: if (pend < rlen)
889: increment(p, b, &pend, refill);
890: xml_attrkey(p, b + poff);
891: if (elem_end)
892: xml_elem_end(p, NULL);
893:
894: /* Begin an opening or closing tag. */
895:
896: } else if (b[poff] == '<') {
897: if (advance(p, b, rlen, &pend, " >", refill))
898: break;
899: if (pend > poff + 3 &&
900: strncmp(b + poff, "<!--", 4) == 0) {
901:
902: /* Skip a comment. */
903:
904: cp = strstr(b + pend - 2, "-->");
905: if (cp == NULL) {
906: if (refill)
907: break;
908: cp = b + rlen;
909: } else
910: cp += 3;
911: while (b + pend < cp)
912: increment(p, b, &pend, refill);
913: continue;
914: }
915: elem_end = 0;
916: if (b[pend] != '>')
917: *pstate = PARSE_TAG;
918: else if (pend > 0 && b[pend - 1] == '/') {
919: b[pend - 1] = '\0';
920: elem_end = 1;
921: }
922: b[pend] = '\0';
923: if (pend < rlen)
924: increment(p, b, &pend, refill);
925: if (b[++poff] == '/') {
926: elem_end = 1;
927: poff++;
1.23 schwarze 928: } else {
1.14 schwarze 929: xml_elem_start(p, b + poff);
1.23 schwarze 930: if (*pstate == PARSE_ELEM &&
931: p->flags & PFLAG_EEND)
932: elem_end = 1;
933: }
1.14 schwarze 934: if (elem_end)
935: xml_elem_end(p, b + poff);
936:
1.23 schwarze 937: /* Close a doctype. */
938:
939: } else if (p->ncur == NODE_DOCTYPE && b[poff] == ']') {
940: *pstate = PARSE_TAG;
941: increment(p, b, &pend, refill);
942:
1.14 schwarze 943: /* Process an entity. */
944:
945: } else if (b[poff] == '&') {
946: if (advance(p, b, rlen, &pend, ";", refill))
947: break;
948: b[pend] = '\0';
949: if (pend < rlen)
950: increment(p, b, &pend, refill);
951: xml_entity(p, b + poff + 1);
952:
953: /* Process text up to the next tag, entity, or EOL. */
954:
955: } else {
1.28 schwarze 956: advance(p, b, rlen, &pend,
1.33 schwarze 957: p->ncur == NODE_DOCTYPE ? "<&]\n" : "<&\n",
1.28 schwarze 958: refill);
1.45 schwarze 959: if (p->nofill)
960: poff = pws;
1.35 schwarze 961: xml_text(p, b + poff, pend - poff);
1.33 schwarze 962: if (b[pend] == '\n')
1.37 schwarze 963: pnode_closetext(p, 0);
1.14 schwarze 964: }
1.45 schwarze 965: pws = pend;
1.14 schwarze 966: }
967: return poff;
968: }
969:
1.24 schwarze 970:
971: /*
972: * The read loop.
973: * If the previous token was incomplete and asked for more input,
974: * we have to enter the read loop once more even on EOF.
975: * Once rsz is 0, incomplete tokens will no longer ask for more input
976: * but instead use whatever there is, and then exit the read loop.
977: * The minus one on the size limit for read(2) is needed such that
978: * advance() can set b[rlen] to NUL when needed.
979: */
980: static void
981: parse_fd(struct parse *p, int fd)
1.1 schwarze 982: {
983: char b[4096];
1.5 schwarze 984: ssize_t rsz; /* Return value from read(2). */
1.14 schwarze 985: size_t rlen; /* Number of bytes in b[]. */
1.5 schwarze 986: size_t poff; /* Parse offset in b[]. */
1.14 schwarze 987: enum pstate pstate;
1.1 schwarze 988:
1.24 schwarze 989: rlen = 0;
1.14 schwarze 990: pstate = PARSE_ELEM;
991: while ((rsz = read(fd, b + rlen, sizeof(b) - rlen - 1)) >= 0 &&
992: (rlen += rsz) > 0) {
993: poff = parse_string(p, b, rlen, &pstate, rsz > 0);
1.5 schwarze 994: /* Buffer exhausted; shift left and re-fill. */
995: assert(poff > 0);
996: rlen -= poff;
1.14 schwarze 997: memmove(b, b + poff, rlen);
1.5 schwarze 998: }
1.24 schwarze 999: if (rsz < 0)
1000: error_msg(p, "read: %s", strerror(errno));
1001: }
1002:
1003: /*
1004: * Open and parse a file.
1005: */
1006: struct ptree *
1007: parse_file(struct parse *p, int fd, const char *fname)
1008: {
1009: const char *save_fname;
1010: int save_line, save_col;
1011:
1012: /* Save and initialize reporting data. */
1013:
1014: save_fname = p->fname;
1015: save_line = p->nline;
1016: save_col = p->ncol;
1017: p->fname = fname;
1018: p->line = 0;
1019: p->col = 0;
1020:
1021: /* Open the file, unless it is already open. */
1022:
1023: if (fd == -1 && (fd = open(fname, O_RDONLY, 0)) == -1) {
1024: error_msg(p, "open: %s", strerror(errno));
1025: p->fname = save_fname;
1026: return p->tree;
1.5 schwarze 1027: }
1.24 schwarze 1028:
1029: /*
1030: * After opening the starting file, change to the directory it
1031: * is located in, in case it wants to include any further files,
1032: * which are typically given with relative paths in DocBook.
1033: * Do this on a best-effort basis; don't complain about failure.
1034: */
1035:
1036: if (save_fname == NULL && (fname = dirname(fname)) != NULL &&
1037: strcmp(fname, ".") != 0)
1038: (void)chdir(fname);
1039:
1040: /* Run the read loop. */
1041:
1042: p->nline = 1;
1043: p->ncol = 1;
1044: parse_fd(p, fd);
1045:
1046: /* On the top level, finalize the parse tree. */
1047:
1048: if (save_fname == NULL) {
1.37 schwarze 1049: pnode_closetext(p, 0);
1.24 schwarze 1050: if (p->tree->root == NULL)
1051: error_msg(p, "empty document");
1052: else if ((p->tree->flags & TREE_CLOSED) == 0)
1053: warn_msg(p, "document not closed");
1054: pnode_unlink(p->doctype);
1055: }
1056:
1057: /* Clean up. */
1058:
1059: if (fd != STDIN_FILENO)
1060: close(fd);
1061: p->fname = save_fname;
1062: p->nline = save_line;
1063: p->ncol = save_col;
1.1 schwarze 1064: return p->tree;
1065: }
CVSweb