Annotation of docbook2mdoc/parse.c, Revision 1.43
1.43 ! schwarze 1: /* $Id: parse.c,v 1.42 2019/04/14 14:00:16 schwarze Exp $ */
1.1 schwarze 2: /*
3: * Copyright (c) 2014 Kristaps Dzonsons <kristaps@bsd.lv>
4: * Copyright (c) 2019 Ingo Schwarze <schwarze@openbsd.org>
5: *
6: * Permission to use, copy, modify, and distribute this software for any
7: * purpose with or without fee is hereby granted, provided that the above
8: * copyright notice and this permission notice appear in all copies.
9: *
10: * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
11: * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12: * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
13: * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14: * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15: * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16: * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17: */
18: #include <assert.h>
19: #include <ctype.h>
1.24 schwarze 20: #include <errno.h>
21: #include <fcntl.h>
22: #include <libgen.h>
1.6 schwarze 23: #include <stdarg.h>
1.1 schwarze 24: #include <stdio.h>
1.5 schwarze 25: #include <stdlib.h>
1.1 schwarze 26: #include <string.h>
27: #include <unistd.h>
28:
29: #include "node.h"
30: #include "parse.h"
31:
32: /*
33: * The implementation of the DocBook parser.
34: */
35:
/*
 * Tokenizer states of parse_string().
 * The order matters: all states from PARSE_ARG onwards
 * mean that an attribute value is being parsed.
 */
enum pstate {
	PARSE_ELEM = 0,	/* Outside of any tag; the initial state. */
	PARSE_TAG,	/* Inside a tag, looking for an attribute name. */
	PARSE_ARG,	/* After '=', looking for an attribute value. */
	PARSE_SQ,	/* Inside a single-quoted attribute value. */
	PARSE_DQ	/* Inside a double-quoted attribute value. */
};
43:
1.1 schwarze 44: /*
45: * Global parse state.
46: * Keep this as simple and small as possible.
47: */
48: struct parse {
49: const char *fname; /* Name of the input file. */
50: struct ptree *tree; /* Complete parse result. */
1.23 schwarze 51: struct pnode *doctype;
1.1 schwarze 52: struct pnode *cur; /* Current node in the tree. */
1.5 schwarze 53: enum nodeid ncur; /* Type of the current node. */
54: int line; /* Line number in the input file. */
55: int col; /* Column number in the input file. */
56: int nline; /* Line number of next token. */
57: int ncol; /* Column number of next token. */
1.4 schwarze 58: int del; /* Levels of nested nodes being deleted. */
1.23 schwarze 59: int flags;
60: #define PFLAG_WARN (1 << 0) /* Print warning messages. */
61: #define PFLAG_SPC (1 << 1) /* Whitespace before the next element. */
62: #define PFLAG_ATTR (1 << 2) /* The most recent attribute is valid. */
63: #define PFLAG_EEND (1 << 3) /* This element is self-closing. */
1.1 schwarze 64: };
65:
1.39 schwarze 66: struct alias {
1.1 schwarze 67: const char *name; /* DocBook element name. */
68: enum nodeid node; /* Node type to generate. */
69: };
70:
1.39 schwarze 71: static const struct alias aliases[] = {
1.3 schwarze 72: { "acronym", NODE_IGNORE },
1.43 ! schwarze 73: { "affiliation", NODE_IGNORE },
1.4 schwarze 74: { "anchor", NODE_DELETE },
1.42 schwarze 75: { "application", NODE_COMMAND },
1.22 schwarze 76: { "article", NODE_SECTION },
1.41 schwarze 77: { "articleinfo", NODE_BOOKINFO },
1.22 schwarze 78: { "book", NODE_SECTION },
1.1 schwarze 79: { "chapter", NODE_SECTION },
1.13 schwarze 80: { "code", NODE_LITERAL },
1.36 schwarze 81: { "computeroutput", NODE_LITERAL },
1.23 schwarze 82: { "!doctype", NODE_DOCTYPE },
1.7 schwarze 83: { "firstname", NODE_PERSONNAME },
1.21 schwarze 84: { "glossary", NODE_VARIABLELIST },
85: { "glossdef", NODE_IGNORE },
86: { "glossdiv", NODE_IGNORE },
87: { "glossentry", NODE_VARLISTENTRY },
88: { "glosslist", NODE_VARIABLELIST },
1.43 ! schwarze 89: { "holder", NODE_IGNORE },
1.4 schwarze 90: { "indexterm", NODE_DELETE },
1.11 schwarze 91: { "informaltable", NODE_TABLE },
1.42 schwarze 92: { "keycap", NODE_KEYSYM },
93: { "keycode", NODE_IGNORE },
1.43 ! schwarze 94: { "orgname", NODE_IGNORE },
1.40 schwarze 95: { "othercredit", NODE_AUTHOR },
1.7 schwarze 96: { "othername", NODE_PERSONNAME },
1.1 schwarze 97: { "part", NODE_SECTION },
1.3 schwarze 98: { "phrase", NODE_IGNORE },
1.4 schwarze 99: { "primary", NODE_DELETE },
1.42 schwarze 100: { "property", NODE_PARAMETER },
1.1 schwarze 101: { "refsect1", NODE_SECTION },
102: { "refsect2", NODE_SECTION },
103: { "refsect3", NODE_SECTION },
104: { "refsection", NODE_SECTION },
1.43 ! schwarze 105: { "releaseinfo", NODE_IGNORE },
1.42 schwarze 106: { "returnvalue", NODE_IGNORE },
1.4 schwarze 107: { "secondary", NODE_DELETE },
1.1 schwarze 108: { "sect1", NODE_SECTION },
109: { "sect2", NODE_SECTION },
1.36 schwarze 110: { "sgmltag", NODE_MARKUP },
1.15 schwarze 111: { "simpara", NODE_PARA },
1.13 schwarze 112: { "structfield", NODE_PARAMETER },
113: { "structname", NODE_TYPE },
1.7 schwarze 114: { "surname", NODE_PERSONNAME },
1.12 schwarze 115: { "symbol", NODE_CONSTANT },
1.3 schwarze 116: { "trademark", NODE_IGNORE },
1.18 schwarze 117: { "ulink", NODE_LINK },
1.13 schwarze 118: { "userinput", NODE_LITERAL },
1.43 ! schwarze 119: { "year", NODE_IGNORE },
1.5 schwarze 120: { NULL, NODE_IGNORE }
1.1 schwarze 121: };
122:
/*
 * Mapping of an XML character entity name to the roff(7)
 * input that xml_entity() stores in the escape node.
 */
struct entity {
	const char	*name;   /* Entity name, without '&' and ';'. */
	const char	*roff;   /* Replacement text. */
};

/*
 * XML character entity references found in the wild.
 * Those that don't have an exact mandoc_char(7) representation
 * are approximated, and the desired codepoint is given as a comment.
 * Encoding them as \\[u...] would leave -Tascii out in the cold.
 * The table is searched linearly and ends with a NULL entry.
 */
static const struct entity entities[] = {
	{ "alpha",	"\\(*a" },
	{ "amp",	"&" },
	{ "apos",	"'" },
	{ "auml",	"\\(:a" },
	{ "beta",	"\\(*b" },
	{ "circ",	"^" },      /* U+02C6 */
	{ "copy",	"\\(co" },
	{ "dagger",	"\\(dg" },
	{ "Delta",	"\\(*D" },
	{ "eacute",	"\\('e" },
	{ "emsp",	"\\ " },    /* U+2003 */
	{ "gt",		">" },
	{ "hairsp",	"\\^" },
	{ "kappa",	"\\(*k" },
	{ "larr",	"\\(<-" },
	{ "ldquo",	"\\(lq" },
	{ "le",		"\\(<=" },
	{ "lowbar",	"_" },
	{ "lsqb",	"[" },
	{ "lt",		"<" },
	{ "mdash",	"\\(em" },
	{ "minus",	"\\-" },
	{ "ndash",	"\\(en" },
	{ "nbsp",	"\\ " },
	{ "num",	"#" },
	{ "oslash",	"\\(/o" },
	{ "ouml",	"\\(:o" },
	{ "percnt",	"%" },
	{ "quot",	"\\(dq" },
	{ "rarr",	"\\(->" },
	{ "rArr",	"\\(rA" },
	{ "rdquo",	"\\(rq" },
	{ "reg",	"\\(rg" },
	{ "rho",	"\\(*r" },
	{ "rsqb",	"]" },
	{ "sigma",	"\\(*s" },
	{ "shy",	"\\&" },    /* U+00AD */
	{ "tau",	"\\(*t" },
	{ "tilde",	"\\[u02DC]" },
	{ "times",	"\\[tmu]" },
	{ "uuml",	"\\(:u" },
	{ NULL,		NULL }
};
178:
/* Forward declarations of the tokenizer and of the read loop. */
static size_t	 parse_string(struct parse *p, char *b, size_t rlen,
		    enum pstate *pstate, int refill);
static void	 parse_fd(struct parse *p, int fd);
1.23 schwarze 182:
183:
1.6 schwarze 184: static void
1.29 schwarze 185: fatal(struct parse *p)
186: {
187: fprintf(stderr, "%s:%d:%d: FATAL: ", p->fname, p->line, p->col);
188: perror(NULL);
189: exit(6);
190: }
191:
192: static void
1.6 schwarze 193: error_msg(struct parse *p, const char *fmt, ...)
194: {
195: va_list ap;
196:
1.29 schwarze 197: fprintf(stderr, "%s:%d:%d: ERROR: ", p->fname, p->line, p->col);
1.6 schwarze 198: va_start(ap, fmt);
199: vfprintf(stderr, fmt, ap);
200: va_end(ap);
201: fputc('\n', stderr);
1.29 schwarze 202: p->tree->flags |= TREE_ERROR;
1.6 schwarze 203: }
204:
205: static void
206: warn_msg(struct parse *p, const char *fmt, ...)
207: {
208: va_list ap;
209:
1.23 schwarze 210: if ((p->flags & PFLAG_WARN) == 0)
1.6 schwarze 211: return;
212:
1.29 schwarze 213: fprintf(stderr, "%s:%d:%d: WARNING: ", p->fname, p->line, p->col);
1.6 schwarze 214: va_start(ap, fmt);
215: vfprintf(stderr, fmt, ap);
216: va_end(ap);
217: fputc('\n', stderr);
1.29 schwarze 218: p->tree->flags |= TREE_WARN;
1.6 schwarze 219: }
220:
1.1 schwarze 221: /*
222: * Process a string of characters.
223: * If a text node is already open, append to it.
224: * Otherwise, create a new one as a child of the current node.
225: */
226: static void
1.35 schwarze 227: xml_text(struct parse *p, const char *word, int sz)
1.1 schwarze 228: {
1.35 schwarze 229: struct pnode *n, *np;
1.32 schwarze 230: size_t oldsz, newsz;
1.35 schwarze 231: int i;
1.1 schwarze 232:
1.32 schwarze 233: assert(sz > 0);
1.30 schwarze 234: if (p->del > 0)
1.1 schwarze 235: return;
236:
1.32 schwarze 237: if ((n = p->cur) == NULL) {
1.35 schwarze 238: error_msg(p, "discarding text before document: %.*s",
239: sz, word);
1.5 schwarze 240: return;
241: }
242:
1.35 schwarze 243: /* Append to the current text node, if one is open. */
244:
245: if (n->node == NODE_TEXT) {
246: oldsz = strlen(n->b);
247: newsz = oldsz + sz;
248: if (oldsz && (p->flags & PFLAG_SPC))
249: newsz++;
250: if ((n->b = realloc(n->b, newsz + 1)) == NULL)
1.30 schwarze 251: fatal(p);
1.35 schwarze 252: if (oldsz && (p->flags & PFLAG_SPC))
253: n->b[oldsz++] = ' ';
254: memcpy(n->b + oldsz, word, sz);
255: n->b[newsz] = '\0';
256: p->flags &= ~PFLAG_SPC;
257: return;
1.1 schwarze 258: }
259:
1.35 schwarze 260: if (p->tree->flags & TREE_CLOSED && n == p->tree->root)
1.30 schwarze 261: warn_msg(p, "text after end of document: %.*s", sz, word);
1.5 schwarze 262:
1.35 schwarze 263: /* Create a new text node. */
1.1 schwarze 264:
1.35 schwarze 265: if ((n = pnode_alloc(p->cur)) == NULL)
1.30 schwarze 266: fatal(p);
1.35 schwarze 267: n->node = NODE_TEXT;
268: n->spc = (p->flags & PFLAG_SPC) != 0;
1.30 schwarze 269: p->flags &= ~PFLAG_SPC;
1.35 schwarze 270:
271: /*
1.39 schwarze 272: * If this node follows an in-line macro without intervening
1.35 schwarze 273: * whitespace, keep the text in it as short as possible,
274: * and do not keep it open.
275: */
276:
1.39 schwarze 277: np = n->spc ? NULL : TAILQ_PREV(n, pnodeq, child);
278: while (np != NULL) {
279: switch (pnode_class(np->node)) {
280: case CLASS_VOID:
281: case CLASS_TEXT:
282: case CLASS_BLOCK:
283: np = NULL;
284: break;
285: case CLASS_TRANS:
286: np = TAILQ_LAST(&np->childq, pnodeq);
287: continue;
288: case CLASS_LINE:
289: case CLASS_ENCL:
290: break;
291: }
292: break;
293: }
294: if (np != NULL) {
1.35 schwarze 295: i = 0;
296: while (i < sz && !isspace((unsigned char)word[i]))
297: i++;
298: if ((n->b = strndup(word, i)) == NULL)
299: fatal(p);
300: if (i == sz)
301: return;
302: while (i < sz && isspace((unsigned char)word[i]))
303: i++;
304: if (i == sz) {
305: p->flags |= PFLAG_SPC;
306: return;
307: }
308:
309: /* Put any remaining text into a second node. */
310:
311: if ((n = pnode_alloc(p->cur)) == NULL)
312: fatal(p);
313: n->node = NODE_TEXT;
314: n->spc = 1;
315: word += i;
316: sz -= i;
317: }
318: if ((n->b = strndup(word, sz)) == NULL)
319: fatal(p);
320:
321: /* The new node remains open for later pnode_closetext(). */
322:
323: p->cur = n;
1.1 schwarze 324: }
325:
1.16 schwarze 326: /*
327: * Close out the text node and strip trailing whitespace, if one is open.
328: */
1.1 schwarze 329: static void
1.37 schwarze 330: pnode_closetext(struct parse *p, int check_last_word)
1.1 schwarze 331: {
1.16 schwarze 332: struct pnode *n;
1.37 schwarze 333: char *cp, *last_word;
1.16 schwarze 334:
335: if ((n = p->cur) == NULL || n->node != NODE_TEXT)
336: return;
337: p->cur = n->parent;
1.32 schwarze 338: for (cp = strchr(n->b, '\0');
339: cp > n->b && isspace((unsigned char)cp[-1]);
340: *--cp = '\0')
1.23 schwarze 341: p->flags |= PFLAG_SPC;
1.37 schwarze 342:
343: if (p->flags & PFLAG_SPC || !check_last_word)
344: return;
345:
346: /*
347: * Find the beginning of the last word
348: * and delete whitespace before it.
349: */
350:
351: while (cp > n->b && !isspace((unsigned char)cp[-1]))
352: cp--;
353: if (cp == n->b)
354: return;
355:
356: last_word = cp;
357: while (cp > n->b && isspace((unsigned char)cp[-1]))
358: *--cp = '\0';
359:
360: /* Move the last word into its own node, for use with .Pf. */
361:
362: if ((n = pnode_alloc(p->cur)) == NULL)
363: fatal(p);
364: n->node = NODE_TEXT;
365: n->spc = 1;
366: if ((n->b = strdup(last_word)) == NULL)
367: fatal(p);
1.1 schwarze 368: }
369:
1.9 schwarze 370: static void
371: xml_entity(struct parse *p, const char *name)
372: {
373: const struct entity *entity;
1.30 schwarze 374: struct pnode *n;
1.23 schwarze 375: const char *ccp;
376: char *cp;
377: enum pstate pstate;
1.9 schwarze 378:
379: if (p->del > 0)
380: return;
381:
382: if (p->cur == NULL) {
383: error_msg(p, "discarding entity before document: &%s;", name);
384: return;
385: }
386:
1.37 schwarze 387: pnode_closetext(p, 0);
1.9 schwarze 388:
389: if (p->tree->flags & TREE_CLOSED && p->cur == p->tree->root)
390: warn_msg(p, "entity after end of document: &%s;", name);
391:
392: for (entity = entities; entity->name != NULL; entity++)
393: if (strcmp(name, entity->name) == 0)
394: break;
395:
396: if (entity->roff == NULL) {
1.23 schwarze 397: if (p->doctype != NULL) {
1.30 schwarze 398: TAILQ_FOREACH(n, &p->doctype->childq, child) {
399: if ((ccp = pnode_getattr_raw(n,
1.23 schwarze 400: ATTRKEY_NAME, NULL)) == NULL ||
1.25 schwarze 401: strcmp(ccp, name) != 0)
402: continue;
1.30 schwarze 403: if ((ccp = pnode_getattr_raw(n,
1.25 schwarze 404: ATTRKEY_SYSTEM, NULL)) != NULL) {
405: parse_file(p, -1, ccp);
406: p->flags &= ~PFLAG_SPC;
407: return;
408: }
1.30 schwarze 409: if ((ccp = pnode_getattr_raw(n,
1.23 schwarze 410: ATTRKEY_DEFINITION, NULL)) == NULL)
411: continue;
1.29 schwarze 412: if ((cp = strdup(ccp)) == NULL)
413: fatal(p);
1.23 schwarze 414: pstate = PARSE_ELEM;
415: parse_string(p, cp, strlen(cp), &pstate, 0);
416: p->flags &= ~PFLAG_SPC;
417: free(cp);
418: return;
419: }
420: }
1.9 schwarze 421: error_msg(p, "unknown entity &%s;", name);
422: return;
423: }
424:
425: /* Create, append, and close out an entity node. */
1.34 schwarze 426: if ((n = pnode_alloc(p->cur)) == NULL ||
1.32 schwarze 427: (n->b = strdup(entity->roff)) == NULL)
1.29 schwarze 428: fatal(p);
1.30 schwarze 429: n->node = NODE_ESCAPE;
430: n->spc = (p->flags & PFLAG_SPC) != 0;
1.23 schwarze 431: p->flags &= ~PFLAG_SPC;
1.9 schwarze 432: }
433:
1.1 schwarze 434: /*
1.39 schwarze 435: * Parse an element name.
436: */
437: static enum nodeid
438: xml_name2node(struct parse *p, const char *name)
439: {
440: const struct alias *alias;
441: enum nodeid node;
442:
443: if ((node = pnode_parse(name)) < NODE_UNKNOWN)
444: return node;
445:
446: for (alias = aliases; alias->name != NULL; alias++)
447: if (strcmp(alias->name, name) == 0)
448: return alias->node;
449:
450: return NODE_UNKNOWN;
451: }
452:
453: /*
1.1 schwarze 454: * Begin an element.
455: */
456: static void
1.30 schwarze 457: xml_elem_start(struct parse *p, const char *name)
1.1 schwarze 458: {
1.30 schwarze 459: struct pnode *n;
1.1 schwarze 460:
1.4 schwarze 461: /*
462: * An ancestor is excluded from the tree;
463: * keep track of the number of levels excluded.
464: */
1.30 schwarze 465: if (p->del > 0) {
1.23 schwarze 466: if (*name != '!' && *name != '?')
1.30 schwarze 467: p->del++;
1.4 schwarze 468: return;
469: }
470:
1.39 schwarze 471: switch (p->ncur = xml_name2node(p, name)) {
1.4 schwarze 472: case NODE_DELETE_WARN:
1.30 schwarze 473: warn_msg(p, "skipping element <%s>", name);
1.2 schwarze 474: /* FALLTHROUGH */
1.4 schwarze 475: case NODE_DELETE:
1.30 schwarze 476: p->del = 1;
1.4 schwarze 477: /* FALLTHROUGH */
1.2 schwarze 478: case NODE_IGNORE:
479: return;
1.39 schwarze 480: case NODE_UNKNOWN:
481: if (*name != '!' && *name != '?')
482: error_msg(p, "unknown element <%s>", name);
483: return;
1.2 schwarze 484: default:
485: break;
486: }
1.1 schwarze 487:
1.30 schwarze 488: if (p->tree->flags & TREE_CLOSED && p->cur->parent == NULL)
489: warn_msg(p, "element after end of document: <%s>", name);
1.5 schwarze 490:
1.39 schwarze 491: switch (pnode_class(p->ncur)) {
492: case CLASS_LINE:
493: case CLASS_ENCL:
494: pnode_closetext(p, 1);
495: break;
496: default:
497: pnode_closetext(p, 0);
498: break;
499: }
500:
1.34 schwarze 501: if ((n = pnode_alloc(p->cur)) == NULL)
1.30 schwarze 502: fatal(p);
1.17 schwarze 503:
504: /*
1.39 schwarze 505: * Some elements are self-closing.
1.17 schwarze 506: * Nodes that begin a new macro or request line or start by
507: * printing text always want whitespace before themselves.
508: */
509:
1.39 schwarze 510: switch (n->node = p->ncur) {
1.23 schwarze 511: case NODE_DOCTYPE:
512: case NODE_ENTITY:
513: case NODE_SBR:
1.30 schwarze 514: p->flags |= PFLAG_EEND;
1.17 schwarze 515: break;
516: default:
1.39 schwarze 517: break;
518: }
519: switch (pnode_class(p->ncur)) {
520: case CLASS_LINE:
521: case CLASS_ENCL:
1.30 schwarze 522: n->spc = (p->flags & PFLAG_SPC) != 0;
1.17 schwarze 523: break;
1.39 schwarze 524: default:
525: n->spc = 1;
526: break;
1.17 schwarze 527: }
1.30 schwarze 528: p->cur = n;
529: if (n->node == NODE_DOCTYPE) {
530: if (p->doctype == NULL)
531: p->doctype = n;
1.23 schwarze 532: else
1.30 schwarze 533: error_msg(p, "duplicate doctype");
534: } else if (n->parent == NULL && p->tree->root == NULL)
535: p->tree->root = n;
1.5 schwarze 536: }
537:
538: static void
1.30 schwarze 539: xml_attrkey(struct parse *p, const char *name)
1.5 schwarze 540: {
1.30 schwarze 541: struct pattr *a;
1.23 schwarze 542: const char *value;
1.5 schwarze 543: enum attrkey key;
1.1 schwarze 544:
1.30 schwarze 545: if (p->del > 0 || p->ncur == NODE_IGNORE || *name == '\0')
1.5 schwarze 546: return;
1.23 schwarze 547:
1.30 schwarze 548: if ((p->ncur == NODE_DOCTYPE || p->ncur == NODE_ENTITY) &&
549: TAILQ_FIRST(&p->cur->attrq) == NULL) {
1.23 schwarze 550: value = name;
551: name = "NAME";
552: } else
553: value = NULL;
554:
1.5 schwarze 555: if ((key = attrkey_parse(name)) == ATTRKEY__MAX) {
1.30 schwarze 556: p->flags &= ~PFLAG_ATTR;
1.5 schwarze 557: return;
558: }
1.30 schwarze 559: if ((a = calloc(1, sizeof(*a))) == NULL)
560: fatal(p);
1.29 schwarze 561:
1.30 schwarze 562: a->key = key;
563: a->val = ATTRVAL__MAX;
1.23 schwarze 564: if (value == NULL) {
1.30 schwarze 565: a->rawval = NULL;
566: p->flags |= PFLAG_ATTR;
1.23 schwarze 567: } else {
1.30 schwarze 568: if ((a->rawval = strdup(value)) == NULL)
569: fatal(p);
570: p->flags &= ~PFLAG_ATTR;
571: }
572: TAILQ_INSERT_TAIL(&p->cur->attrq, a, child);
573: if (p->ncur == NODE_ENTITY && key == ATTRKEY_NAME)
574: xml_attrkey(p, "DEFINITION");
1.5 schwarze 575: }
576:
577: static void
1.30 schwarze 578: xml_attrval(struct parse *p, const char *name)
1.5 schwarze 579: {
1.30 schwarze 580: struct pattr *a;
1.5 schwarze 581:
1.30 schwarze 582: if (p->del > 0 || p->ncur == NODE_IGNORE ||
583: (p->flags & PFLAG_ATTR) == 0)
1.5 schwarze 584: return;
1.30 schwarze 585: if ((a = TAILQ_LAST(&p->cur->attrq, pattrq)) == NULL)
1.5 schwarze 586: return;
1.30 schwarze 587: if ((a->val = attrval_parse(name)) == ATTRVAL__MAX &&
588: (a->rawval = strdup(name)) == NULL)
589: fatal(p);
590: p->flags &= ~PFLAG_ATTR;
1.1 schwarze 591: }
592:
593: /*
594: * Roll up the parse tree.
595: * If we're at a text node, roll that one up first.
596: */
597: static void
1.31 schwarze 598: xml_elem_end(struct parse *p, const char *name)
1.1 schwarze 599: {
1.26 schwarze 600: struct pnode *n;
601: const char *cp;
1.5 schwarze 602: enum nodeid node;
1.1 schwarze 603:
1.4 schwarze 604: /*
605: * An ancestor is excluded from the tree;
606: * keep track of the number of levels excluded.
607: */
1.31 schwarze 608: if (p->del > 1) {
609: p->del--;
1.4 schwarze 610: return;
611: }
612:
1.31 schwarze 613: if (p->del == 0)
1.37 schwarze 614: pnode_closetext(p, 0);
1.2 schwarze 615:
1.39 schwarze 616: node = name == NULL ? p->ncur : xml_name2node(p, name);
1.2 schwarze 617:
1.5 schwarze 618: switch (node) {
1.4 schwarze 619: case NODE_DELETE_WARN:
620: case NODE_DELETE:
1.31 schwarze 621: if (p->del > 0)
622: p->del--;
1.4 schwarze 623: break;
1.2 schwarze 624: case NODE_IGNORE:
1.39 schwarze 625: case NODE_UNKNOWN:
1.26 schwarze 626: break;
627: case NODE_INCLUDE:
1.31 schwarze 628: n = p->cur;
629: p->cur = p->cur->parent;
1.26 schwarze 630: cp = pnode_getattr_raw(n, ATTRKEY_HREF, NULL);
631: if (cp == NULL)
1.31 schwarze 632: error_msg(p, "<xi:include> element "
1.26 schwarze 633: "without href attribute");
634: else
1.31 schwarze 635: parse_file(p, -1, cp);
1.26 schwarze 636: pnode_unlink(n);
1.31 schwarze 637: p->flags &= ~PFLAG_SPC;
1.2 schwarze 638: break;
1.23 schwarze 639: case NODE_DOCTYPE:
1.32 schwarze 640: case NODE_SBR:
1.31 schwarze 641: p->flags &= ~PFLAG_EEND;
1.23 schwarze 642: /* FALLTHROUGH */
1.2 schwarze 643: default:
1.31 schwarze 644: if (p->cur == NULL || node != p->cur->node) {
645: warn_msg(p, "element not open: </%s>", name);
1.5 schwarze 646: break;
647: }
648:
649: /*
650: * Refrain from actually closing the document element.
651: * If no more content follows, no harm is done, but if
652: * some content still follows, simply processing it is
653: * obviously better than discarding it or crashing.
654: */
655:
1.31 schwarze 656: if (p->cur->parent != NULL || node == NODE_DOCTYPE) {
657: p->cur = p->cur->parent;
658: if (p->cur != NULL)
659: p->ncur = p->cur->node;
1.23 schwarze 660: } else
1.31 schwarze 661: p->tree->flags |= TREE_CLOSED;
662: p->flags &= ~PFLAG_SPC;
1.4 schwarze 663: break;
1.2 schwarze 664: }
1.31 schwarze 665: assert(p->del == 0);
1.1 schwarze 666: }
667:
668: struct parse *
669: parse_alloc(int warn)
670: {
671: struct parse *p;
672:
673: if ((p = calloc(1, sizeof(*p))) == NULL)
674: return NULL;
675:
676: if ((p->tree = calloc(1, sizeof(*p->tree))) == NULL) {
677: free(p);
678: return NULL;
679: }
1.23 schwarze 680: if (warn)
681: p->flags |= PFLAG_WARN;
682: else
683: p->flags &= ~PFLAG_WARN;
1.1 schwarze 684: return p;
685: }
686:
687: void
688: parse_free(struct parse *p)
689: {
690: if (p == NULL)
691: return;
692: if (p->tree != NULL) {
693: pnode_unlink(p->tree->root);
694: free(p->tree);
695: }
696: free(p);
697: }
698:
1.14 schwarze 699: static void
700: increment(struct parse *p, char *b, size_t *pend, int refill)
701: {
702: if (refill) {
703: if (b[*pend] == '\n') {
704: p->nline++;
705: p->ncol = 1;
706: } else
707: p->ncol++;
708: }
709: ++*pend;
710: }
711:
1.5 schwarze 712: /*
713: * Advance the pend pointer to the next character in the charset.
714: * If the charset starts with a space, it stands for any whitespace.
715: * Update the new input file position, used for messages.
716: * Do not overrun the buffer b of length rlen.
717: * When reaching the end, NUL-terminate the buffer and return 1;
718: * otherwise, return 0.
719: */
720: static int
721: advance(struct parse *p, char *b, size_t rlen, size_t *pend,
1.14 schwarze 722: const char *charset, int refill)
1.5 schwarze 723: {
724: int space;
725:
726: if (*charset == ' ') {
727: space = 1;
728: charset++;
729: } else
730: space = 0;
731:
1.14 schwarze 732: if (refill) {
733: p->nline = p->line;
734: p->ncol = p->col;
735: }
1.5 schwarze 736: while (*pend < rlen) {
737: if (space && isspace((unsigned char)b[*pend]))
738: break;
739: if (strchr(charset, b[*pend]) != NULL)
740: break;
1.14 schwarze 741: increment(p, b, pend, refill);
1.5 schwarze 742: }
743: if (*pend == rlen) {
744: b[rlen] = '\0';
1.14 schwarze 745: return refill;
1.5 schwarze 746: } else
747: return 0;
748: }
749:
1.14 schwarze 750: size_t
751: parse_string(struct parse *p, char *b, size_t rlen,
752: enum pstate *pstate, int refill)
753: {
754: char *cp;
755: size_t poff; /* Parse offset in b[]. */
756: size_t pend; /* Offset of the end of the current word. */
757: int elem_end;
758:
759: pend = 0;
760: for (;;) {
761:
762: /* Proceed to the next token, skipping whitespace. */
763:
764: if (refill) {
765: p->line = p->nline;
766: p->col = p->ncol;
767: }
768: if ((poff = pend) == rlen)
769: break;
770: if (isspace((unsigned char)b[pend])) {
1.23 schwarze 771: p->flags |= PFLAG_SPC;
1.14 schwarze 772: increment(p, b, &pend, refill);
773: continue;
774: }
775:
776: /*
777: * The following four cases (ARG, TAG, and starting an
778: * entity or a tag) all parse a word or quoted string.
779: * If that extends beyond the read buffer and the last
780: * read(2) still got data, they all break out of the
781: * token loop to request more data from the read loop.
782: *
783: * Also, three of them detect self-closing tags, those
784: * ending with "/>", setting the flag elem_end and
785: * calling xml_elem_end() at the very end, after
786: * handling the attribute value, attribute name, or
787: * tag name, respectively.
788: */
789:
790: /* Parse an attribute value. */
791:
792: if (*pstate >= PARSE_ARG) {
793: if (*pstate == PARSE_ARG &&
794: (b[pend] == '\'' || b[pend] == '"')) {
795: *pstate = b[pend] == '"' ?
796: PARSE_DQ : PARSE_SQ;
797: increment(p, b, &pend, refill);
798: continue;
799: }
800: if (advance(p, b, rlen, &pend,
801: *pstate == PARSE_DQ ? "\"" :
802: *pstate == PARSE_SQ ? "'" : " >", refill))
803: break;
804: *pstate = PARSE_TAG;
805: elem_end = 0;
806: if (b[pend] == '>') {
807: *pstate = PARSE_ELEM;
808: if (pend > 0 && b[pend - 1] == '/') {
809: b[pend - 1] = '\0';
810: elem_end = 1;
811: }
1.23 schwarze 812: if (p->flags & PFLAG_EEND)
813: elem_end = 1;
1.14 schwarze 814: }
815: b[pend] = '\0';
816: if (pend < rlen)
817: increment(p, b, &pend, refill);
818: xml_attrval(p, b + poff);
819: if (elem_end)
820: xml_elem_end(p, NULL);
821:
822: /* Look for an attribute name. */
823:
824: } else if (*pstate == PARSE_TAG) {
1.23 schwarze 825: switch (p->ncur) {
826: case NODE_DOCTYPE:
827: if (b[pend] == '[') {
828: *pstate = PARSE_ELEM;
829: increment(p, b, &pend, refill);
830: continue;
831: }
832: /* FALLTHROUGH */
833: case NODE_ENTITY:
834: if (b[pend] == '"' || b[pend] == '\'') {
835: *pstate = PARSE_ARG;
836: continue;
837: }
838: break;
839: default:
840: break;
841: }
1.14 schwarze 842: if (advance(p, b, rlen, &pend, " =>", refill))
843: break;
844: elem_end = 0;
845: switch (b[pend]) {
846: case '>':
847: *pstate = PARSE_ELEM;
848: if (pend > 0 && b[pend - 1] == '/') {
849: b[pend - 1] = '\0';
850: elem_end = 1;
851: }
1.23 schwarze 852: if (p->flags & PFLAG_EEND)
853: elem_end = 1;
1.14 schwarze 854: break;
855: case '=':
856: *pstate = PARSE_ARG;
857: break;
858: default:
859: break;
860: }
861: b[pend] = '\0';
862: if (pend < rlen)
863: increment(p, b, &pend, refill);
864: xml_attrkey(p, b + poff);
865: if (elem_end)
866: xml_elem_end(p, NULL);
867:
868: /* Begin an opening or closing tag. */
869:
870: } else if (b[poff] == '<') {
871: if (advance(p, b, rlen, &pend, " >", refill))
872: break;
873: if (pend > poff + 3 &&
874: strncmp(b + poff, "<!--", 4) == 0) {
875:
876: /* Skip a comment. */
877:
878: cp = strstr(b + pend - 2, "-->");
879: if (cp == NULL) {
880: if (refill)
881: break;
882: cp = b + rlen;
883: } else
884: cp += 3;
885: while (b + pend < cp)
886: increment(p, b, &pend, refill);
887: continue;
888: }
889: elem_end = 0;
890: if (b[pend] != '>')
891: *pstate = PARSE_TAG;
892: else if (pend > 0 && b[pend - 1] == '/') {
893: b[pend - 1] = '\0';
894: elem_end = 1;
895: }
896: b[pend] = '\0';
897: if (pend < rlen)
898: increment(p, b, &pend, refill);
899: if (b[++poff] == '/') {
900: elem_end = 1;
901: poff++;
1.23 schwarze 902: } else {
1.14 schwarze 903: xml_elem_start(p, b + poff);
1.23 schwarze 904: if (*pstate == PARSE_ELEM &&
905: p->flags & PFLAG_EEND)
906: elem_end = 1;
907: }
1.14 schwarze 908: if (elem_end)
909: xml_elem_end(p, b + poff);
910:
1.23 schwarze 911: /* Close a doctype. */
912:
913: } else if (p->ncur == NODE_DOCTYPE && b[poff] == ']') {
914: *pstate = PARSE_TAG;
915: increment(p, b, &pend, refill);
916:
1.14 schwarze 917: /* Process an entity. */
918:
919: } else if (b[poff] == '&') {
920: if (advance(p, b, rlen, &pend, ";", refill))
921: break;
922: b[pend] = '\0';
923: if (pend < rlen)
924: increment(p, b, &pend, refill);
925: xml_entity(p, b + poff + 1);
926:
927: /* Process text up to the next tag, entity, or EOL. */
928:
929: } else {
1.28 schwarze 930: advance(p, b, rlen, &pend,
1.33 schwarze 931: p->ncur == NODE_DOCTYPE ? "<&]\n" : "<&\n",
1.28 schwarze 932: refill);
1.35 schwarze 933: xml_text(p, b + poff, pend - poff);
1.33 schwarze 934: if (b[pend] == '\n')
1.37 schwarze 935: pnode_closetext(p, 0);
1.14 schwarze 936: }
937: }
938: return poff;
939: }
940:
1.24 schwarze 941:
942: /*
943: * The read loop.
944: * If the previous token was incomplete and asked for more input,
945: * we have to enter the read loop once more even on EOF.
946: * Once rsz is 0, incomplete tokens will no longer ask for more input
947: * but instead use whatever there is, and then exit the read loop.
948: * The minus one on the size limit for read(2) is needed such that
949: * advance() can set b[rlen] to NUL when needed.
950: */
951: static void
952: parse_fd(struct parse *p, int fd)
1.1 schwarze 953: {
954: char b[4096];
1.5 schwarze 955: ssize_t rsz; /* Return value from read(2). */
1.14 schwarze 956: size_t rlen; /* Number of bytes in b[]. */
1.5 schwarze 957: size_t poff; /* Parse offset in b[]. */
1.14 schwarze 958: enum pstate pstate;
1.1 schwarze 959:
1.24 schwarze 960: rlen = 0;
1.14 schwarze 961: pstate = PARSE_ELEM;
962: while ((rsz = read(fd, b + rlen, sizeof(b) - rlen - 1)) >= 0 &&
963: (rlen += rsz) > 0) {
964: poff = parse_string(p, b, rlen, &pstate, rsz > 0);
1.5 schwarze 965: /* Buffer exhausted; shift left and re-fill. */
966: assert(poff > 0);
967: rlen -= poff;
1.14 schwarze 968: memmove(b, b + poff, rlen);
1.5 schwarze 969: }
1.24 schwarze 970: if (rsz < 0)
971: error_msg(p, "read: %s", strerror(errno));
972: }
973:
974: /*
975: * Open and parse a file.
976: */
977: struct ptree *
978: parse_file(struct parse *p, int fd, const char *fname)
979: {
980: const char *save_fname;
981: int save_line, save_col;
982:
983: /* Save and initialize reporting data. */
984:
985: save_fname = p->fname;
986: save_line = p->nline;
987: save_col = p->ncol;
988: p->fname = fname;
989: p->line = 0;
990: p->col = 0;
991:
992: /* Open the file, unless it is already open. */
993:
994: if (fd == -1 && (fd = open(fname, O_RDONLY, 0)) == -1) {
995: error_msg(p, "open: %s", strerror(errno));
996: p->fname = save_fname;
997: return p->tree;
1.5 schwarze 998: }
1.24 schwarze 999:
1000: /*
1001: * After opening the starting file, change to the directory it
1002: * is located in, in case it wants to include any further files,
1003: * which are typically given with relative paths in DocBook.
1004: * Do this on a best-effort basis; don't complain about failure.
1005: */
1006:
1007: if (save_fname == NULL && (fname = dirname(fname)) != NULL &&
1008: strcmp(fname, ".") != 0)
1009: (void)chdir(fname);
1010:
1011: /* Run the read loop. */
1012:
1013: p->nline = 1;
1014: p->ncol = 1;
1015: parse_fd(p, fd);
1016:
1017: /* On the top level, finalize the parse tree. */
1018:
1019: if (save_fname == NULL) {
1.37 schwarze 1020: pnode_closetext(p, 0);
1.24 schwarze 1021: if (p->tree->root == NULL)
1022: error_msg(p, "empty document");
1023: else if ((p->tree->flags & TREE_CLOSED) == 0)
1024: warn_msg(p, "document not closed");
1025: pnode_unlink(p->doctype);
1026: }
1027:
1028: /* Clean up. */
1029:
1030: if (fd != STDIN_FILENO)
1031: close(fd);
1032: p->fname = save_fname;
1033: p->nline = save_line;
1034: p->ncol = save_col;
1.1 schwarze 1035: return p->tree;
1036: }
CVSweb