Annotation of docbook2mdoc/parse.c, Revision 1.51
1.51 ! schwarze 1: /* $Id: parse.c,v 1.50 2019/04/23 15:47:23 schwarze Exp $ */
1.1 schwarze 2: /*
3: * Copyright (c) 2014 Kristaps Dzonsons <kristaps@bsd.lv>
4: * Copyright (c) 2019 Ingo Schwarze <schwarze@openbsd.org>
5: *
6: * Permission to use, copy, modify, and distribute this software for any
7: * purpose with or without fee is hereby granted, provided that the above
8: * copyright notice and this permission notice appear in all copies.
9: *
10: * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
11: * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12: * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
13: * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14: * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15: * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16: * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17: */
18: #include <assert.h>
19: #include <ctype.h>
1.24 schwarze 20: #include <errno.h>
21: #include <fcntl.h>
22: #include <libgen.h>
1.6 schwarze 23: #include <stdarg.h>
1.1 schwarze 24: #include <stdio.h>
1.5 schwarze 25: #include <stdlib.h>
1.1 schwarze 26: #include <string.h>
27: #include <unistd.h>
28:
29: #include "node.h"
30: #include "parse.h"
31:
32: /*
33: * The implementation of the DocBook parser.
34: */
35:
/*
 * Tokenizer state, carried across calls to parse_string().
 * The order is significant: every state that compares greater than
 * or equal to PARSE_ARG means that an attribute value is expected.
 */
enum	pstate {
	PARSE_ELEM = 0,	/* Outside of tags: text, entity, or tag start. */
	PARSE_TAG,	/* Inside a tag: looking for an attribute name. */
	PARSE_ARG,	/* After "=": looking for an attribute value. */
	PARSE_SQ,	/* Inside a single-quoted attribute value. */
	PARSE_DQ	/* Inside a double-quoted attribute value. */
};
43:
1.1 schwarze 44: /*
45: * Global parse state.
46: * Keep this as simple and small as possible.
47: */
48: struct parse {
49: const char *fname; /* Name of the input file. */
50: struct ptree *tree; /* Complete parse result. */
1.23 schwarze 51: struct pnode *doctype;
1.1 schwarze 52: struct pnode *cur; /* Current node in the tree. */
1.5 schwarze 53: enum nodeid ncur; /* Type of the current node. */
54: int line; /* Line number in the input file. */
55: int col; /* Column number in the input file. */
56: int nline; /* Line number of next token. */
57: int ncol; /* Column number of next token. */
1.4 schwarze 58: int del; /* Levels of nested nodes being deleted. */
1.45 schwarze 59: int nofill; /* Levels of open no-fill displays. */
1.23 schwarze 60: int flags;
61: #define PFLAG_WARN (1 << 0) /* Print warning messages. */
1.51 ! schwarze 62: #define PFLAG_LINE (1 << 1) /* New line before the next element. */
! 63: #define PFLAG_SPC (1 << 2) /* Whitespace before the next element. */
! 64: #define PFLAG_ATTR (1 << 3) /* The most recent attribute is valid. */
! 65: #define PFLAG_EEND (1 << 4) /* This element is self-closing. */
1.1 schwarze 66: };
67:
1.39 schwarze 68: struct alias {
1.1 schwarze 69: const char *name; /* DocBook element name. */
70: enum nodeid node; /* Node type to generate. */
71: };
72:
1.39 schwarze 73: static const struct alias aliases[] = {
1.3 schwarze 74: { "acronym", NODE_IGNORE },
1.43 schwarze 75: { "affiliation", NODE_IGNORE },
1.4 schwarze 76: { "anchor", NODE_DELETE },
1.42 schwarze 77: { "application", NODE_COMMAND },
1.22 schwarze 78: { "article", NODE_SECTION },
1.41 schwarze 79: { "articleinfo", NODE_BOOKINFO },
1.22 schwarze 80: { "book", NODE_SECTION },
1.1 schwarze 81: { "chapter", NODE_SECTION },
1.44 schwarze 82: { "caption", NODE_IGNORE },
1.13 schwarze 83: { "code", NODE_LITERAL },
1.36 schwarze 84: { "computeroutput", NODE_LITERAL },
1.23 schwarze 85: { "!doctype", NODE_DOCTYPE },
1.44 schwarze 86: { "figure", NODE_IGNORE },
1.7 schwarze 87: { "firstname", NODE_PERSONNAME },
1.21 schwarze 88: { "glossary", NODE_VARIABLELIST },
89: { "glossdef", NODE_IGNORE },
90: { "glossdiv", NODE_IGNORE },
91: { "glossentry", NODE_VARLISTENTRY },
92: { "glosslist", NODE_VARIABLELIST },
1.43 schwarze 93: { "holder", NODE_IGNORE },
1.44 schwarze 94: { "imageobject", NODE_IGNORE },
1.4 schwarze 95: { "indexterm", NODE_DELETE },
1.11 schwarze 96: { "informaltable", NODE_TABLE },
1.42 schwarze 97: { "keycap", NODE_KEYSYM },
98: { "keycode", NODE_IGNORE },
1.44 schwarze 99: { "mediaobject", NODE_BLOCKQUOTE },
1.43 schwarze 100: { "orgname", NODE_IGNORE },
1.40 schwarze 101: { "othercredit", NODE_AUTHOR },
1.7 schwarze 102: { "othername", NODE_PERSONNAME },
1.1 schwarze 103: { "part", NODE_SECTION },
1.3 schwarze 104: { "phrase", NODE_IGNORE },
1.4 schwarze 105: { "primary", NODE_DELETE },
1.42 schwarze 106: { "property", NODE_PARAMETER },
1.1 schwarze 107: { "refsect1", NODE_SECTION },
108: { "refsect2", NODE_SECTION },
109: { "refsect3", NODE_SECTION },
110: { "refsection", NODE_SECTION },
1.43 schwarze 111: { "releaseinfo", NODE_IGNORE },
1.42 schwarze 112: { "returnvalue", NODE_IGNORE },
1.4 schwarze 113: { "secondary", NODE_DELETE },
1.1 schwarze 114: { "sect1", NODE_SECTION },
115: { "sect2", NODE_SECTION },
1.46 schwarze 116: { "sect3", NODE_SECTION },
117: { "sect4", NODE_SECTION },
1.36 schwarze 118: { "sgmltag", NODE_MARKUP },
1.15 schwarze 119: { "simpara", NODE_PARA },
1.13 schwarze 120: { "structfield", NODE_PARAMETER },
121: { "structname", NODE_TYPE },
1.7 schwarze 122: { "surname", NODE_PERSONNAME },
1.12 schwarze 123: { "symbol", NODE_CONSTANT },
1.48 schwarze 124: { "tag", NODE_MARKUP },
1.3 schwarze 125: { "trademark", NODE_IGNORE },
1.18 schwarze 126: { "ulink", NODE_LINK },
1.13 schwarze 127: { "userinput", NODE_LITERAL },
1.43 schwarze 128: { "year", NODE_IGNORE },
1.5 schwarze 129: { NULL, NODE_IGNORE }
1.1 schwarze 130: };
131:
/*
 * Mapping of one XML character entity name to the roff(7) text
 * that is emitted in its place.
 */
struct	entity {
	const char	*name;	/* Entity name, without "&" and ";". */
	const char	*roff;	/* Replacement text. */
};

/*
 * XML character entity references found in the wild.
 * Those that don't have an exact mandoc_char(7) representation
 * are approximated, and the desired codepoint is given as a comment.
 * Encoding them as \[u...] would leave -Tascii out in the cold.
 * The table is searched linearly and ends with a NULL name.
 */
static	const struct entity entities[] = {
	{ "alpha",	"\\(*a" },
	{ "amp",	"&" },
	{ "apos",	"'" },
	{ "auml",	"\\(:a" },
	{ "beta",	"\\(*b" },
	{ "circ",	"^" },      /* U+02C6 */
	{ "copy",	"\\(co" },
	{ "dagger",	"\\(dg" },
	{ "Delta",	"\\(*D" },
	{ "eacute",	"\\('e" },
	{ "emsp",	"\\ " },    /* U+2003 */
	{ "gt",		">" },
	{ "hairsp",	"\\^" },
	{ "kappa",	"\\(*k" },
	{ "larr",	"\\(<-" },
	{ "ldquo",	"\\(lq" },
	{ "le",		"\\(<=" },
	{ "lowbar",	"_" },
	{ "lsqb",	"[" },
	{ "lt",		"<" },
	{ "mdash",	"\\(em" },
	{ "minus",	"\\-" },
	{ "ndash",	"\\(en" },
	{ "nbsp",	"\\ " },
	{ "num",	"#" },
	{ "oslash",	"\\(/o" },
	{ "ouml",	"\\(:o" },
	{ "percnt",	"%" },
	{ "quot",	"\\(dq" },
	{ "rarr",	"\\(->" },
	{ "rArr",	"\\(rA" },
	{ "rdquo",	"\\(rq" },
	{ "reg",	"\\(rg" },
	{ "rho",	"\\(*r" },
	{ "rsqb",	"]" },
	{ "sigma",	"\\(*s" },
	{ "shy",	"\\&" },    /* U+00AD */
	{ "tau",	"\\(*t" },
	{ "tilde",	"\\[u02DC]" },
	{ "times",	"\\[tmu]" },
	{ "uuml",	"\\(:u" },
	{ NULL,		NULL }
};
187:
/* Forward declarations of functions used before their definition. */
static	size_t	 parse_string(struct parse *p, char *b, size_t rlen,
			enum pstate *pstate, int refill);
static	void	 parse_fd(struct parse *p, int fd);
1.23 schwarze 191:
192:
1.6 schwarze 193: static void
1.29 schwarze 194: fatal(struct parse *p)
195: {
196: fprintf(stderr, "%s:%d:%d: FATAL: ", p->fname, p->line, p->col);
197: perror(NULL);
198: exit(6);
199: }
200:
201: static void
1.6 schwarze 202: error_msg(struct parse *p, const char *fmt, ...)
203: {
204: va_list ap;
205:
1.29 schwarze 206: fprintf(stderr, "%s:%d:%d: ERROR: ", p->fname, p->line, p->col);
1.6 schwarze 207: va_start(ap, fmt);
208: vfprintf(stderr, fmt, ap);
209: va_end(ap);
210: fputc('\n', stderr);
1.29 schwarze 211: p->tree->flags |= TREE_ERROR;
1.6 schwarze 212: }
213:
214: static void
215: warn_msg(struct parse *p, const char *fmt, ...)
216: {
217: va_list ap;
218:
1.23 schwarze 219: if ((p->flags & PFLAG_WARN) == 0)
1.6 schwarze 220: return;
221:
1.29 schwarze 222: fprintf(stderr, "%s:%d:%d: WARNING: ", p->fname, p->line, p->col);
1.6 schwarze 223: va_start(ap, fmt);
224: vfprintf(stderr, fmt, ap);
225: va_end(ap);
226: fputc('\n', stderr);
1.29 schwarze 227: p->tree->flags |= TREE_WARN;
1.6 schwarze 228: }
229:
1.1 schwarze 230: /*
231: * Process a string of characters.
232: * If a text node is already open, append to it.
233: * Otherwise, create a new one as a child of the current node.
234: */
235: static void
1.35 schwarze 236: xml_text(struct parse *p, const char *word, int sz)
1.1 schwarze 237: {
1.35 schwarze 238: struct pnode *n, *np;
1.32 schwarze 239: size_t oldsz, newsz;
1.35 schwarze 240: int i;
1.1 schwarze 241:
1.32 schwarze 242: assert(sz > 0);
1.30 schwarze 243: if (p->del > 0)
1.1 schwarze 244: return;
245:
1.32 schwarze 246: if ((n = p->cur) == NULL) {
1.35 schwarze 247: error_msg(p, "discarding text before document: %.*s",
248: sz, word);
1.5 schwarze 249: return;
250: }
251:
1.35 schwarze 252: /* Append to the current text node, if one is open. */
253:
254: if (n->node == NODE_TEXT) {
255: oldsz = strlen(n->b);
256: newsz = oldsz + sz;
257: if (oldsz && (p->flags & PFLAG_SPC))
258: newsz++;
259: if ((n->b = realloc(n->b, newsz + 1)) == NULL)
1.30 schwarze 260: fatal(p);
1.35 schwarze 261: if (oldsz && (p->flags & PFLAG_SPC))
262: n->b[oldsz++] = ' ';
263: memcpy(n->b + oldsz, word, sz);
264: n->b[newsz] = '\0';
1.51 ! schwarze 265: p->flags &= ~(PFLAG_LINE | PFLAG_SPC);
1.35 schwarze 266: return;
1.1 schwarze 267: }
268:
1.35 schwarze 269: if (p->tree->flags & TREE_CLOSED && n == p->tree->root)
1.30 schwarze 270: warn_msg(p, "text after end of document: %.*s", sz, word);
1.5 schwarze 271:
1.35 schwarze 272: /* Create a new text node. */
1.1 schwarze 273:
1.35 schwarze 274: if ((n = pnode_alloc(p->cur)) == NULL)
1.30 schwarze 275: fatal(p);
1.35 schwarze 276: n->node = NODE_TEXT;
1.51 ! schwarze 277: n->flags = ((p->flags & PFLAG_LINE) ? NFLAG_LINE : 0) |
! 278: ((p->flags & PFLAG_SPC) ? NFLAG_SPC : 0);
! 279: p->flags &= ~(PFLAG_LINE | PFLAG_SPC);
1.35 schwarze 280:
281: /*
1.39 schwarze 282: * If this node follows an in-line macro without intervening
1.35 schwarze 283: * whitespace, keep the text in it as short as possible,
284: * and do not keep it open.
285: */
286:
1.51 ! schwarze 287: np = n->flags & NFLAG_SPC ? NULL : TAILQ_PREV(n, pnodeq, child);
1.39 schwarze 288: while (np != NULL) {
289: switch (pnode_class(np->node)) {
290: case CLASS_VOID:
291: case CLASS_TEXT:
292: case CLASS_BLOCK:
1.45 schwarze 293: case CLASS_NOFILL:
1.39 schwarze 294: np = NULL;
295: break;
296: case CLASS_TRANS:
297: np = TAILQ_LAST(&np->childq, pnodeq);
298: continue;
299: case CLASS_LINE:
300: case CLASS_ENCL:
301: break;
302: }
303: break;
304: }
305: if (np != NULL) {
1.35 schwarze 306: i = 0;
307: while (i < sz && !isspace((unsigned char)word[i]))
308: i++;
309: if ((n->b = strndup(word, i)) == NULL)
310: fatal(p);
311: if (i == sz)
312: return;
313: while (i < sz && isspace((unsigned char)word[i]))
314: i++;
315: if (i == sz) {
316: p->flags |= PFLAG_SPC;
317: return;
318: }
319:
320: /* Put any remaining text into a second node. */
321:
322: if ((n = pnode_alloc(p->cur)) == NULL)
323: fatal(p);
324: n->node = NODE_TEXT;
1.51 ! schwarze 325: n->flags |= NFLAG_SPC;
1.35 schwarze 326: word += i;
327: sz -= i;
328: }
329: if ((n->b = strndup(word, sz)) == NULL)
330: fatal(p);
331:
332: /* The new node remains open for later pnode_closetext(). */
333:
334: p->cur = n;
1.1 schwarze 335: }
336:
1.16 schwarze 337: /*
338: * Close out the text node and strip trailing whitespace, if one is open.
339: */
1.1 schwarze 340: static void
1.37 schwarze 341: pnode_closetext(struct parse *p, int check_last_word)
1.1 schwarze 342: {
1.16 schwarze 343: struct pnode *n;
1.37 schwarze 344: char *cp, *last_word;
1.16 schwarze 345:
346: if ((n = p->cur) == NULL || n->node != NODE_TEXT)
347: return;
348: p->cur = n->parent;
1.32 schwarze 349: for (cp = strchr(n->b, '\0');
350: cp > n->b && isspace((unsigned char)cp[-1]);
351: *--cp = '\0')
1.23 schwarze 352: p->flags |= PFLAG_SPC;
1.37 schwarze 353:
354: if (p->flags & PFLAG_SPC || !check_last_word)
355: return;
356:
357: /*
358: * Find the beginning of the last word
359: * and delete whitespace before it.
360: */
361:
362: while (cp > n->b && !isspace((unsigned char)cp[-1]))
363: cp--;
364: if (cp == n->b)
365: return;
366:
367: last_word = cp;
368: while (cp > n->b && isspace((unsigned char)cp[-1]))
369: *--cp = '\0';
370:
371: /* Move the last word into its own node, for use with .Pf. */
372:
373: if ((n = pnode_alloc(p->cur)) == NULL)
374: fatal(p);
375: n->node = NODE_TEXT;
1.51 ! schwarze 376: n->flags |= NFLAG_SPC;
1.37 schwarze 377: if ((n->b = strdup(last_word)) == NULL)
378: fatal(p);
1.1 schwarze 379: }
380:
1.9 schwarze 381: static void
382: xml_entity(struct parse *p, const char *name)
383: {
384: const struct entity *entity;
1.30 schwarze 385: struct pnode *n;
1.23 schwarze 386: const char *ccp;
387: char *cp;
1.49 schwarze 388: unsigned int codepoint;
1.23 schwarze 389: enum pstate pstate;
1.9 schwarze 390:
391: if (p->del > 0)
392: return;
393:
394: if (p->cur == NULL) {
395: error_msg(p, "discarding entity before document: &%s;", name);
396: return;
397: }
398:
1.37 schwarze 399: pnode_closetext(p, 0);
1.9 schwarze 400:
401: if (p->tree->flags & TREE_CLOSED && p->cur == p->tree->root)
402: warn_msg(p, "entity after end of document: &%s;", name);
403:
404: for (entity = entities; entity->name != NULL; entity++)
405: if (strcmp(name, entity->name) == 0)
406: break;
407:
408: if (entity->roff == NULL) {
1.23 schwarze 409: if (p->doctype != NULL) {
1.30 schwarze 410: TAILQ_FOREACH(n, &p->doctype->childq, child) {
411: if ((ccp = pnode_getattr_raw(n,
1.23 schwarze 412: ATTRKEY_NAME, NULL)) == NULL ||
1.25 schwarze 413: strcmp(ccp, name) != 0)
414: continue;
1.30 schwarze 415: if ((ccp = pnode_getattr_raw(n,
1.25 schwarze 416: ATTRKEY_SYSTEM, NULL)) != NULL) {
417: parse_file(p, -1, ccp);
1.51 ! schwarze 418: p->flags &= ~(PFLAG_LINE | PFLAG_SPC);
1.25 schwarze 419: return;
420: }
1.30 schwarze 421: if ((ccp = pnode_getattr_raw(n,
1.23 schwarze 422: ATTRKEY_DEFINITION, NULL)) == NULL)
423: continue;
1.29 schwarze 424: if ((cp = strdup(ccp)) == NULL)
425: fatal(p);
1.23 schwarze 426: pstate = PARSE_ELEM;
427: parse_string(p, cp, strlen(cp), &pstate, 0);
1.51 ! schwarze 428: p->flags &= ~(PFLAG_LINE | PFLAG_SPC);
1.23 schwarze 429: free(cp);
430: return;
431: }
432: }
1.49 schwarze 433: if (*name == '#') {
434: codepoint = strtonum(name + 1, 0, 0x10ffff, &ccp);
435: if (ccp == NULL) {
436: if ((n = pnode_alloc(p->cur)) == NULL ||
437: asprintf(&n->b, "\\[u%4.4X]",
438: codepoint) < 0)
439: fatal(p);
440: goto done;
441: }
442: }
1.9 schwarze 443: error_msg(p, "unknown entity &%s;", name);
444: return;
445: }
446:
447: /* Create, append, and close out an entity node. */
1.34 schwarze 448: if ((n = pnode_alloc(p->cur)) == NULL ||
1.32 schwarze 449: (n->b = strdup(entity->roff)) == NULL)
1.29 schwarze 450: fatal(p);
1.49 schwarze 451: done:
1.30 schwarze 452: n->node = NODE_ESCAPE;
1.51 ! schwarze 453: n->flags = ((p->flags & PFLAG_LINE) ? NFLAG_LINE : 0) |
! 454: ((p->flags & PFLAG_SPC) ? NFLAG_SPC : 0);
! 455: p->flags &= ~(PFLAG_LINE | PFLAG_SPC);
1.9 schwarze 456: }
457:
1.1 schwarze 458: /*
1.39 schwarze 459: * Parse an element name.
460: */
461: static enum nodeid
462: xml_name2node(struct parse *p, const char *name)
463: {
464: const struct alias *alias;
465: enum nodeid node;
466:
467: if ((node = pnode_parse(name)) < NODE_UNKNOWN)
468: return node;
469:
470: for (alias = aliases; alias->name != NULL; alias++)
471: if (strcmp(alias->name, name) == 0)
472: return alias->node;
473:
474: return NODE_UNKNOWN;
475: }
476:
477: /*
1.1 schwarze 478: * Begin an element.
479: */
480: static void
1.30 schwarze 481: xml_elem_start(struct parse *p, const char *name)
1.1 schwarze 482: {
1.30 schwarze 483: struct pnode *n;
1.1 schwarze 484:
1.4 schwarze 485: /*
486: * An ancestor is excluded from the tree;
487: * keep track of the number of levels excluded.
488: */
1.30 schwarze 489: if (p->del > 0) {
1.23 schwarze 490: if (*name != '!' && *name != '?')
1.30 schwarze 491: p->del++;
1.4 schwarze 492: return;
493: }
494:
1.39 schwarze 495: switch (p->ncur = xml_name2node(p, name)) {
1.4 schwarze 496: case NODE_DELETE_WARN:
1.30 schwarze 497: warn_msg(p, "skipping element <%s>", name);
1.2 schwarze 498: /* FALLTHROUGH */
1.4 schwarze 499: case NODE_DELETE:
1.30 schwarze 500: p->del = 1;
1.4 schwarze 501: /* FALLTHROUGH */
1.2 schwarze 502: case NODE_IGNORE:
503: return;
1.39 schwarze 504: case NODE_UNKNOWN:
505: if (*name != '!' && *name != '?')
506: error_msg(p, "unknown element <%s>", name);
507: return;
1.2 schwarze 508: default:
509: break;
510: }
1.1 schwarze 511:
1.30 schwarze 512: if (p->tree->flags & TREE_CLOSED && p->cur->parent == NULL)
513: warn_msg(p, "element after end of document: <%s>", name);
1.5 schwarze 514:
1.39 schwarze 515: switch (pnode_class(p->ncur)) {
516: case CLASS_LINE:
517: case CLASS_ENCL:
518: pnode_closetext(p, 1);
519: break;
520: default:
521: pnode_closetext(p, 0);
522: break;
523: }
524:
1.34 schwarze 525: if ((n = pnode_alloc(p->cur)) == NULL)
1.30 schwarze 526: fatal(p);
1.17 schwarze 527:
528: /*
1.39 schwarze 529: * Some elements are self-closing.
1.17 schwarze 530: * Nodes that begin a new macro or request line or start by
531: * printing text always want whitespace before themselves.
532: */
533:
1.39 schwarze 534: switch (n->node = p->ncur) {
1.23 schwarze 535: case NODE_DOCTYPE:
536: case NODE_ENTITY:
537: case NODE_SBR:
1.48 schwarze 538: case NODE_VOID:
1.30 schwarze 539: p->flags |= PFLAG_EEND;
1.17 schwarze 540: break;
541: default:
1.39 schwarze 542: break;
543: }
544: switch (pnode_class(p->ncur)) {
545: case CLASS_LINE:
546: case CLASS_ENCL:
1.51 ! schwarze 547: n->flags = ((p->flags & PFLAG_LINE) ? NFLAG_LINE : 0) |
! 548: ((p->flags & PFLAG_SPC) ? NFLAG_SPC : 0);
1.17 schwarze 549: break;
1.45 schwarze 550: case CLASS_NOFILL:
551: p->nofill++;
552: /* FALLTHROUGH */
1.39 schwarze 553: default:
1.51 ! schwarze 554: n->flags |= NFLAG_SPC;
1.39 schwarze 555: break;
1.17 schwarze 556: }
1.30 schwarze 557: p->cur = n;
558: if (n->node == NODE_DOCTYPE) {
559: if (p->doctype == NULL)
560: p->doctype = n;
1.23 schwarze 561: else
1.30 schwarze 562: error_msg(p, "duplicate doctype");
563: } else if (n->parent == NULL && p->tree->root == NULL)
564: p->tree->root = n;
1.5 schwarze 565: }
566:
567: static void
1.30 schwarze 568: xml_attrkey(struct parse *p, const char *name)
1.5 schwarze 569: {
1.30 schwarze 570: struct pattr *a;
1.23 schwarze 571: const char *value;
1.5 schwarze 572: enum attrkey key;
1.1 schwarze 573:
1.47 schwarze 574: if (p->del > 0 || p->ncur >= NODE_UNKNOWN || *name == '\0')
1.5 schwarze 575: return;
1.23 schwarze 576:
1.30 schwarze 577: if ((p->ncur == NODE_DOCTYPE || p->ncur == NODE_ENTITY) &&
578: TAILQ_FIRST(&p->cur->attrq) == NULL) {
1.23 schwarze 579: value = name;
580: name = "NAME";
581: } else
582: value = NULL;
583:
1.5 schwarze 584: if ((key = attrkey_parse(name)) == ATTRKEY__MAX) {
1.30 schwarze 585: p->flags &= ~PFLAG_ATTR;
1.5 schwarze 586: return;
587: }
1.30 schwarze 588: if ((a = calloc(1, sizeof(*a))) == NULL)
589: fatal(p);
1.29 schwarze 590:
1.30 schwarze 591: a->key = key;
592: a->val = ATTRVAL__MAX;
1.23 schwarze 593: if (value == NULL) {
1.30 schwarze 594: a->rawval = NULL;
595: p->flags |= PFLAG_ATTR;
1.23 schwarze 596: } else {
1.30 schwarze 597: if ((a->rawval = strdup(value)) == NULL)
598: fatal(p);
599: p->flags &= ~PFLAG_ATTR;
600: }
601: TAILQ_INSERT_TAIL(&p->cur->attrq, a, child);
602: if (p->ncur == NODE_ENTITY && key == ATTRKEY_NAME)
603: xml_attrkey(p, "DEFINITION");
1.5 schwarze 604: }
605:
606: static void
1.30 schwarze 607: xml_attrval(struct parse *p, const char *name)
1.5 schwarze 608: {
1.30 schwarze 609: struct pattr *a;
1.5 schwarze 610:
1.47 schwarze 611: if (p->del > 0 || p->ncur >= NODE_UNKNOWN ||
1.30 schwarze 612: (p->flags & PFLAG_ATTR) == 0)
1.5 schwarze 613: return;
1.30 schwarze 614: if ((a = TAILQ_LAST(&p->cur->attrq, pattrq)) == NULL)
1.5 schwarze 615: return;
1.30 schwarze 616: if ((a->val = attrval_parse(name)) == ATTRVAL__MAX &&
617: (a->rawval = strdup(name)) == NULL)
618: fatal(p);
619: p->flags &= ~PFLAG_ATTR;
1.1 schwarze 620: }
621:
622: /*
623: * Roll up the parse tree.
624: * If we're at a text node, roll that one up first.
625: */
626: static void
1.31 schwarze 627: xml_elem_end(struct parse *p, const char *name)
1.1 schwarze 628: {
1.26 schwarze 629: struct pnode *n;
630: const char *cp;
1.5 schwarze 631: enum nodeid node;
1.1 schwarze 632:
1.4 schwarze 633: /*
634: * An ancestor is excluded from the tree;
635: * keep track of the number of levels excluded.
636: */
1.31 schwarze 637: if (p->del > 1) {
638: p->del--;
1.4 schwarze 639: return;
640: }
641:
1.31 schwarze 642: if (p->del == 0)
1.37 schwarze 643: pnode_closetext(p, 0);
1.2 schwarze 644:
1.50 schwarze 645: n = p->cur;
1.39 schwarze 646: node = name == NULL ? p->ncur : xml_name2node(p, name);
1.2 schwarze 647:
1.5 schwarze 648: switch (node) {
1.4 schwarze 649: case NODE_DELETE_WARN:
650: case NODE_DELETE:
1.31 schwarze 651: if (p->del > 0)
652: p->del--;
1.4 schwarze 653: break;
1.2 schwarze 654: case NODE_IGNORE:
1.39 schwarze 655: case NODE_UNKNOWN:
1.26 schwarze 656: break;
657: case NODE_INCLUDE:
1.50 schwarze 658: p->cur = n->parent;
1.26 schwarze 659: cp = pnode_getattr_raw(n, ATTRKEY_HREF, NULL);
660: if (cp == NULL)
1.31 schwarze 661: error_msg(p, "<xi:include> element "
1.26 schwarze 662: "without href attribute");
663: else
1.31 schwarze 664: parse_file(p, -1, cp);
1.26 schwarze 665: pnode_unlink(n);
1.51 ! schwarze 666: p->flags &= ~(PFLAG_LINE | PFLAG_SPC);
1.2 schwarze 667: break;
1.23 schwarze 668: case NODE_DOCTYPE:
1.32 schwarze 669: case NODE_SBR:
1.48 schwarze 670: case NODE_VOID:
1.31 schwarze 671: p->flags &= ~PFLAG_EEND;
1.23 schwarze 672: /* FALLTHROUGH */
1.2 schwarze 673: default:
1.50 schwarze 674: if (n == NULL || node != n->node) {
1.31 schwarze 675: warn_msg(p, "element not open: </%s>", name);
1.5 schwarze 676: break;
677: }
1.45 schwarze 678: if (pnode_class(node) == CLASS_NOFILL)
679: p->nofill--;
1.5 schwarze 680:
681: /*
682: * Refrain from actually closing the document element.
683: * If no more content follows, no harm is done, but if
684: * some content still follows, simply processing it is
685: * obviously better than discarding it or crashing.
686: */
687:
1.50 schwarze 688: if (n->parent != NULL || node == NODE_DOCTYPE) {
689: p->cur = n->parent;
1.31 schwarze 690: if (p->cur != NULL)
691: p->ncur = p->cur->node;
1.23 schwarze 692: } else
1.31 schwarze 693: p->tree->flags |= TREE_CLOSED;
1.51 ! schwarze 694: p->flags &= ~(PFLAG_LINE | PFLAG_SPC);
1.50 schwarze 695:
696: /* Include a file containing entity declarations. */
697:
698: if (node == NODE_ENTITY && strcmp("%",
699: pnode_getattr_raw(n, ATTRKEY_NAME, "")) == 0 &&
700: (cp = pnode_getattr_raw(n, ATTRKEY_SYSTEM, NULL)) != NULL)
701: parse_file(p, -1, cp);
702:
1.4 schwarze 703: break;
1.2 schwarze 704: }
1.31 schwarze 705: assert(p->del == 0);
1.1 schwarze 706: }
707:
708: struct parse *
709: parse_alloc(int warn)
710: {
711: struct parse *p;
712:
713: if ((p = calloc(1, sizeof(*p))) == NULL)
714: return NULL;
715:
716: if ((p->tree = calloc(1, sizeof(*p->tree))) == NULL) {
717: free(p);
718: return NULL;
719: }
1.23 schwarze 720: if (warn)
721: p->flags |= PFLAG_WARN;
722: else
723: p->flags &= ~PFLAG_WARN;
1.1 schwarze 724: return p;
725: }
726:
727: void
728: parse_free(struct parse *p)
729: {
730: if (p == NULL)
731: return;
732: if (p->tree != NULL) {
733: pnode_unlink(p->tree->root);
734: free(p->tree);
735: }
736: free(p);
737: }
738:
1.14 schwarze 739: static void
740: increment(struct parse *p, char *b, size_t *pend, int refill)
741: {
742: if (refill) {
743: if (b[*pend] == '\n') {
744: p->nline++;
745: p->ncol = 1;
746: } else
747: p->ncol++;
748: }
749: ++*pend;
750: }
751:
1.5 schwarze 752: /*
753: * Advance the pend pointer to the next character in the charset.
754: * If the charset starts with a space, it stands for any whitespace.
755: * Update the new input file position, used for messages.
756: * Do not overrun the buffer b of length rlen.
757: * When reaching the end, NUL-terminate the buffer and return 1;
758: * otherwise, return 0.
759: */
760: static int
761: advance(struct parse *p, char *b, size_t rlen, size_t *pend,
1.14 schwarze 762: const char *charset, int refill)
1.5 schwarze 763: {
764: int space;
765:
766: if (*charset == ' ') {
767: space = 1;
768: charset++;
769: } else
770: space = 0;
771:
1.14 schwarze 772: if (refill) {
773: p->nline = p->line;
774: p->ncol = p->col;
775: }
1.5 schwarze 776: while (*pend < rlen) {
777: if (space && isspace((unsigned char)b[*pend]))
778: break;
779: if (strchr(charset, b[*pend]) != NULL)
780: break;
1.14 schwarze 781: increment(p, b, pend, refill);
1.5 schwarze 782: }
783: if (*pend == rlen) {
784: b[rlen] = '\0';
1.14 schwarze 785: return refill;
1.5 schwarze 786: } else
787: return 0;
788: }
789:
1.14 schwarze 790: size_t
791: parse_string(struct parse *p, char *b, size_t rlen,
792: enum pstate *pstate, int refill)
793: {
794: char *cp;
1.45 schwarze 795: size_t pws; /* Parse offset including whitespace. */
1.14 schwarze 796: size_t poff; /* Parse offset in b[]. */
797: size_t pend; /* Offset of the end of the current word. */
798: int elem_end;
799:
1.45 schwarze 800: pend = pws = 0;
1.14 schwarze 801: for (;;) {
802:
803: /* Proceed to the next token, skipping whitespace. */
804:
805: if (refill) {
806: p->line = p->nline;
807: p->col = p->ncol;
808: }
809: if ((poff = pend) == rlen)
810: break;
811: if (isspace((unsigned char)b[pend])) {
1.23 schwarze 812: p->flags |= PFLAG_SPC;
1.51 ! schwarze 813: if (b[pend] == '\n') {
! 814: p->flags |= PFLAG_LINE;
1.45 schwarze 815: pws = pend + 1;
1.51 ! schwarze 816: }
1.14 schwarze 817: increment(p, b, &pend, refill);
818: continue;
819: }
820:
821: /*
822: * The following four cases (ARG, TAG, and starting an
823: * entity or a tag) all parse a word or quoted string.
824: * If that extends beyond the read buffer and the last
825: * read(2) still got data, they all break out of the
826: * token loop to request more data from the read loop.
827: *
828: * Also, three of them detect self-closing tags, those
829: * ending with "/>", setting the flag elem_end and
830: * calling xml_elem_end() at the very end, after
831: * handling the attribute value, attribute name, or
832: * tag name, respectively.
833: */
834:
835: /* Parse an attribute value. */
836:
837: if (*pstate >= PARSE_ARG) {
838: if (*pstate == PARSE_ARG &&
839: (b[pend] == '\'' || b[pend] == '"')) {
840: *pstate = b[pend] == '"' ?
841: PARSE_DQ : PARSE_SQ;
842: increment(p, b, &pend, refill);
843: continue;
844: }
845: if (advance(p, b, rlen, &pend,
846: *pstate == PARSE_DQ ? "\"" :
847: *pstate == PARSE_SQ ? "'" : " >", refill))
848: break;
849: *pstate = PARSE_TAG;
850: elem_end = 0;
851: if (b[pend] == '>') {
852: *pstate = PARSE_ELEM;
853: if (pend > 0 && b[pend - 1] == '/') {
854: b[pend - 1] = '\0';
855: elem_end = 1;
856: }
1.23 schwarze 857: if (p->flags & PFLAG_EEND)
858: elem_end = 1;
1.14 schwarze 859: }
860: b[pend] = '\0';
861: if (pend < rlen)
862: increment(p, b, &pend, refill);
863: xml_attrval(p, b + poff);
864: if (elem_end)
865: xml_elem_end(p, NULL);
866:
867: /* Look for an attribute name. */
868:
869: } else if (*pstate == PARSE_TAG) {
1.23 schwarze 870: switch (p->ncur) {
871: case NODE_DOCTYPE:
872: if (b[pend] == '[') {
873: *pstate = PARSE_ELEM;
874: increment(p, b, &pend, refill);
875: continue;
876: }
877: /* FALLTHROUGH */
878: case NODE_ENTITY:
879: if (b[pend] == '"' || b[pend] == '\'') {
880: *pstate = PARSE_ARG;
881: continue;
882: }
883: break;
884: default:
885: break;
886: }
1.14 schwarze 887: if (advance(p, b, rlen, &pend, " =>", refill))
888: break;
889: elem_end = 0;
890: switch (b[pend]) {
891: case '>':
892: *pstate = PARSE_ELEM;
893: if (pend > 0 && b[pend - 1] == '/') {
894: b[pend - 1] = '\0';
895: elem_end = 1;
896: }
1.23 schwarze 897: if (p->flags & PFLAG_EEND)
898: elem_end = 1;
1.14 schwarze 899: break;
900: case '=':
901: *pstate = PARSE_ARG;
902: break;
903: default:
904: break;
905: }
906: b[pend] = '\0';
907: if (pend < rlen)
908: increment(p, b, &pend, refill);
909: xml_attrkey(p, b + poff);
910: if (elem_end)
911: xml_elem_end(p, NULL);
912:
913: /* Begin an opening or closing tag. */
914:
915: } else if (b[poff] == '<') {
916: if (advance(p, b, rlen, &pend, " >", refill))
917: break;
918: if (pend > poff + 3 &&
919: strncmp(b + poff, "<!--", 4) == 0) {
920:
921: /* Skip a comment. */
922:
923: cp = strstr(b + pend - 2, "-->");
924: if (cp == NULL) {
925: if (refill)
926: break;
927: cp = b + rlen;
928: } else
929: cp += 3;
930: while (b + pend < cp)
931: increment(p, b, &pend, refill);
932: continue;
933: }
934: elem_end = 0;
935: if (b[pend] != '>')
936: *pstate = PARSE_TAG;
937: else if (pend > 0 && b[pend - 1] == '/') {
938: b[pend - 1] = '\0';
939: elem_end = 1;
940: }
941: b[pend] = '\0';
942: if (pend < rlen)
943: increment(p, b, &pend, refill);
944: if (b[++poff] == '/') {
945: elem_end = 1;
946: poff++;
1.23 schwarze 947: } else {
1.14 schwarze 948: xml_elem_start(p, b + poff);
1.23 schwarze 949: if (*pstate == PARSE_ELEM &&
950: p->flags & PFLAG_EEND)
951: elem_end = 1;
952: }
1.14 schwarze 953: if (elem_end)
954: xml_elem_end(p, b + poff);
955:
1.23 schwarze 956: /* Close a doctype. */
957:
958: } else if (p->ncur == NODE_DOCTYPE && b[poff] == ']') {
959: *pstate = PARSE_TAG;
960: increment(p, b, &pend, refill);
961:
1.14 schwarze 962: /* Process an entity. */
963:
964: } else if (b[poff] == '&') {
965: if (advance(p, b, rlen, &pend, ";", refill))
966: break;
967: b[pend] = '\0';
968: if (pend < rlen)
969: increment(p, b, &pend, refill);
970: xml_entity(p, b + poff + 1);
971:
972: /* Process text up to the next tag, entity, or EOL. */
973:
974: } else {
1.28 schwarze 975: advance(p, b, rlen, &pend,
1.33 schwarze 976: p->ncur == NODE_DOCTYPE ? "<&]\n" : "<&\n",
1.28 schwarze 977: refill);
1.45 schwarze 978: if (p->nofill)
979: poff = pws;
1.35 schwarze 980: xml_text(p, b + poff, pend - poff);
1.33 schwarze 981: if (b[pend] == '\n')
1.37 schwarze 982: pnode_closetext(p, 0);
1.14 schwarze 983: }
1.45 schwarze 984: pws = pend;
1.14 schwarze 985: }
986: return poff;
987: }
988:
1.24 schwarze 989:
990: /*
991: * The read loop.
992: * If the previous token was incomplete and asked for more input,
993: * we have to enter the read loop once more even on EOF.
994: * Once rsz is 0, incomplete tokens will no longer ask for more input
995: * but instead use whatever there is, and then exit the read loop.
996: * The minus one on the size limit for read(2) is needed such that
997: * advance() can set b[rlen] to NUL when needed.
998: */
999: static void
1000: parse_fd(struct parse *p, int fd)
1.1 schwarze 1001: {
1002: char b[4096];
1.5 schwarze 1003: ssize_t rsz; /* Return value from read(2). */
1.14 schwarze 1004: size_t rlen; /* Number of bytes in b[]. */
1.5 schwarze 1005: size_t poff; /* Parse offset in b[]. */
1.14 schwarze 1006: enum pstate pstate;
1.1 schwarze 1007:
1.24 schwarze 1008: rlen = 0;
1.14 schwarze 1009: pstate = PARSE_ELEM;
1010: while ((rsz = read(fd, b + rlen, sizeof(b) - rlen - 1)) >= 0 &&
1011: (rlen += rsz) > 0) {
1012: poff = parse_string(p, b, rlen, &pstate, rsz > 0);
1.5 schwarze 1013: /* Buffer exhausted; shift left and re-fill. */
1014: assert(poff > 0);
1015: rlen -= poff;
1.14 schwarze 1016: memmove(b, b + poff, rlen);
1.5 schwarze 1017: }
1.24 schwarze 1018: if (rsz < 0)
1019: error_msg(p, "read: %s", strerror(errno));
1020: }
1021:
1022: /*
1023: * Open and parse a file.
1024: */
1025: struct ptree *
1026: parse_file(struct parse *p, int fd, const char *fname)
1027: {
1028: const char *save_fname;
1029: int save_line, save_col;
1030:
1031: /* Save and initialize reporting data. */
1032:
1033: save_fname = p->fname;
1034: save_line = p->nline;
1035: save_col = p->ncol;
1036: p->fname = fname;
1037: p->line = 0;
1038: p->col = 0;
1039:
1040: /* Open the file, unless it is already open. */
1041:
1042: if (fd == -1 && (fd = open(fname, O_RDONLY, 0)) == -1) {
1043: error_msg(p, "open: %s", strerror(errno));
1044: p->fname = save_fname;
1045: return p->tree;
1.5 schwarze 1046: }
1.24 schwarze 1047:
1048: /*
1049: * After opening the starting file, change to the directory it
1050: * is located in, in case it wants to include any further files,
1051: * which are typically given with relative paths in DocBook.
1052: * Do this on a best-effort basis; don't complain about failure.
1053: */
1054:
1055: if (save_fname == NULL && (fname = dirname(fname)) != NULL &&
1056: strcmp(fname, ".") != 0)
1057: (void)chdir(fname);
1058:
1059: /* Run the read loop. */
1060:
1061: p->nline = 1;
1062: p->ncol = 1;
1063: parse_fd(p, fd);
1064:
1065: /* On the top level, finalize the parse tree. */
1066:
1067: if (save_fname == NULL) {
1.37 schwarze 1068: pnode_closetext(p, 0);
1.24 schwarze 1069: if (p->tree->root == NULL)
1070: error_msg(p, "empty document");
1071: else if ((p->tree->flags & TREE_CLOSED) == 0)
1072: warn_msg(p, "document not closed");
1073: pnode_unlink(p->doctype);
1074: }
1075:
1076: /* Clean up. */
1077:
1078: if (fd != STDIN_FILENO)
1079: close(fd);
1080: p->fname = save_fname;
1081: p->nline = save_line;
1082: p->ncol = save_col;
1.1 schwarze 1083: return p->tree;
1084: }
CVSweb