Annotation of docbook2mdoc/parse.c, Revision 1.47
1.47 ! schwarze 1: /* $Id: parse.c,v 1.46 2019/04/16 21:19:54 schwarze Exp $ */
1.1 schwarze 2: /*
3: * Copyright (c) 2014 Kristaps Dzonsons <kristaps@bsd.lv>
4: * Copyright (c) 2019 Ingo Schwarze <schwarze@openbsd.org>
5: *
6: * Permission to use, copy, modify, and distribute this software for any
7: * purpose with or without fee is hereby granted, provided that the above
8: * copyright notice and this permission notice appear in all copies.
9: *
10: * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
11: * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12: * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
13: * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14: * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15: * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16: * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17: */
18: #include <assert.h>
19: #include <ctype.h>
1.24 schwarze 20: #include <errno.h>
21: #include <fcntl.h>
22: #include <libgen.h>
1.6 schwarze 23: #include <stdarg.h>
1.1 schwarze 24: #include <stdio.h>
1.5 schwarze 25: #include <stdlib.h>
1.1 schwarze 26: #include <string.h>
27: #include <unistd.h>
28:
29: #include "node.h"
30: #include "parse.h"
31:
32: /*
33: * The implementation of the DocBook parser.
34: */
35:
/*
 * Tokenizer states of parse_string().
 * The numeric order matters: every state from PARSE_ARG upwards
 * means that an attribute value is being parsed.
 */
enum	pstate {
	PARSE_ELEM = 0,	/* Outside of tags: text, entities, tag starts. */
	PARSE_TAG = 1,	/* Inside a tag, looking for an attribute name. */
	PARSE_ARG = 2,	/* After '=', looking for an attribute value. */
	PARSE_SQ = 3,	/* Inside a single-quoted attribute value. */
	PARSE_DQ = 4	/* Inside a double-quoted attribute value. */
};
43:
1.1 schwarze 44: /*
45: * Global parse state.
46: * Keep this as simple and small as possible.
47: */
48: struct parse {
49: const char *fname; /* Name of the input file. */
50: struct ptree *tree; /* Complete parse result. */
1.23 schwarze 51: struct pnode *doctype;
1.1 schwarze 52: struct pnode *cur; /* Current node in the tree. */
1.5 schwarze 53: enum nodeid ncur; /* Type of the current node. */
54: int line; /* Line number in the input file. */
55: int col; /* Column number in the input file. */
56: int nline; /* Line number of next token. */
57: int ncol; /* Column number of next token. */
1.4 schwarze 58: int del; /* Levels of nested nodes being deleted. */
1.45 schwarze 59: int nofill; /* Levels of open no-fill displays. */
1.23 schwarze 60: int flags;
61: #define PFLAG_WARN (1 << 0) /* Print warning messages. */
62: #define PFLAG_SPC (1 << 1) /* Whitespace before the next element. */
63: #define PFLAG_ATTR (1 << 2) /* The most recent attribute is valid. */
64: #define PFLAG_EEND (1 << 3) /* This element is self-closing. */
1.1 schwarze 65: };
66:
1.39 schwarze 67: struct alias {
1.1 schwarze 68: const char *name; /* DocBook element name. */
69: enum nodeid node; /* Node type to generate. */
70: };
71:
1.39 schwarze 72: static const struct alias aliases[] = {
1.3 schwarze 73: { "acronym", NODE_IGNORE },
1.43 schwarze 74: { "affiliation", NODE_IGNORE },
1.4 schwarze 75: { "anchor", NODE_DELETE },
1.42 schwarze 76: { "application", NODE_COMMAND },
1.22 schwarze 77: { "article", NODE_SECTION },
1.41 schwarze 78: { "articleinfo", NODE_BOOKINFO },
1.22 schwarze 79: { "book", NODE_SECTION },
1.1 schwarze 80: { "chapter", NODE_SECTION },
1.44 schwarze 81: { "caption", NODE_IGNORE },
1.13 schwarze 82: { "code", NODE_LITERAL },
1.36 schwarze 83: { "computeroutput", NODE_LITERAL },
1.23 schwarze 84: { "!doctype", NODE_DOCTYPE },
1.44 schwarze 85: { "figure", NODE_IGNORE },
1.7 schwarze 86: { "firstname", NODE_PERSONNAME },
1.21 schwarze 87: { "glossary", NODE_VARIABLELIST },
88: { "glossdef", NODE_IGNORE },
89: { "glossdiv", NODE_IGNORE },
90: { "glossentry", NODE_VARLISTENTRY },
91: { "glosslist", NODE_VARIABLELIST },
1.43 schwarze 92: { "holder", NODE_IGNORE },
1.44 schwarze 93: { "imageobject", NODE_IGNORE },
1.4 schwarze 94: { "indexterm", NODE_DELETE },
1.11 schwarze 95: { "informaltable", NODE_TABLE },
1.42 schwarze 96: { "keycap", NODE_KEYSYM },
97: { "keycode", NODE_IGNORE },
1.44 schwarze 98: { "mediaobject", NODE_BLOCKQUOTE },
1.43 schwarze 99: { "orgname", NODE_IGNORE },
1.40 schwarze 100: { "othercredit", NODE_AUTHOR },
1.7 schwarze 101: { "othername", NODE_PERSONNAME },
1.1 schwarze 102: { "part", NODE_SECTION },
1.3 schwarze 103: { "phrase", NODE_IGNORE },
1.4 schwarze 104: { "primary", NODE_DELETE },
1.42 schwarze 105: { "property", NODE_PARAMETER },
1.1 schwarze 106: { "refsect1", NODE_SECTION },
107: { "refsect2", NODE_SECTION },
108: { "refsect3", NODE_SECTION },
109: { "refsection", NODE_SECTION },
1.43 schwarze 110: { "releaseinfo", NODE_IGNORE },
1.42 schwarze 111: { "returnvalue", NODE_IGNORE },
1.4 schwarze 112: { "secondary", NODE_DELETE },
1.1 schwarze 113: { "sect1", NODE_SECTION },
114: { "sect2", NODE_SECTION },
1.46 schwarze 115: { "sect3", NODE_SECTION },
116: { "sect4", NODE_SECTION },
1.36 schwarze 117: { "sgmltag", NODE_MARKUP },
1.15 schwarze 118: { "simpara", NODE_PARA },
1.13 schwarze 119: { "structfield", NODE_PARAMETER },
120: { "structname", NODE_TYPE },
1.7 schwarze 121: { "surname", NODE_PERSONNAME },
1.12 schwarze 122: { "symbol", NODE_CONSTANT },
1.3 schwarze 123: { "trademark", NODE_IGNORE },
1.18 schwarze 124: { "ulink", NODE_LINK },
1.13 schwarze 125: { "userinput", NODE_LITERAL },
1.43 schwarze 126: { "year", NODE_IGNORE },
1.5 schwarze 127: { NULL, NODE_IGNORE }
1.1 schwarze 128: };
129:
/*
 * One XML character entity and the roff(7) text emitted for it.
 */
struct entity {
	const char	*name;  /* Entity name without '&' and ';'. */
	const char	*roff;  /* Replacement text for the output. */
};

/*
 * XML character entity references found in the wild.
 * Entities lacking an exact mandoc_char(7) representation are
 * approximated, with the intended codepoint noted in a comment.
 * Encoding them as \\[u...] would leave -Tascii out in the cold.
 * The list is terminated by an entry of two NULL pointers.
 */
static const struct entity entities[] = {
	{ "alpha",	"\\(*a" },
	{ "amp",	"&" },
	{ "apos",	"'" },
	{ "auml",	"\\(:a" },
	{ "beta",	"\\(*b" },
	{ "circ",	"^" },		/* U+02C6 */
	{ "copy",	"\\(co" },
	{ "dagger",	"\\(dg" },
	{ "Delta",	"\\(*D" },
	{ "eacute",	"\\('e" },
	{ "emsp",	"\\ " },	/* U+2003 */
	{ "gt",		">" },
	{ "hairsp",	"\\^" },
	{ "kappa",	"\\(*k" },
	{ "larr",	"\\(<-" },
	{ "ldquo",	"\\(lq" },
	{ "le",		"\\(<=" },
	{ "lowbar",	"_" },
	{ "lsqb",	"[" },
	{ "lt",		"<" },
	{ "mdash",	"\\(em" },
	{ "minus",	"\\-" },
	{ "ndash",	"\\(en" },
	{ "nbsp",	"\\ " },
	{ "num",	"#" },
	{ "oslash",	"\\(/o" },
	{ "ouml",	"\\(:o" },
	{ "percnt",	"%" },
	{ "quot",	"\\(dq" },
	{ "rarr",	"\\(->" },
	{ "rArr",	"\\(rA" },
	{ "rdquo",	"\\(rq" },
	{ "reg",	"\\(rg" },
	{ "rho",	"\\(*r" },
	{ "rsqb",	"]" },
	{ "sigma",	"\\(*s" },
	{ "shy",	"\\&" },	/* U+00AD */
	{ "tau",	"\\(*t" },
	{ "tilde",	"\\[u02DC]" },
	{ "times",	"\\[tmu]" },
	{ "uuml",	"\\(:u" },
	{ NULL,		NULL }
};
185:
/* Forward declarations of the tokenizer and of the read loop. */
static size_t	 parse_string(struct parse *p, char *b, size_t rlen,
		    enum pstate *pstate, int refill);
static void	 parse_fd(struct parse *p, int fd);
1.23 schwarze 189:
190:
1.6 schwarze 191: static void
1.29 schwarze 192: fatal(struct parse *p)
193: {
194: fprintf(stderr, "%s:%d:%d: FATAL: ", p->fname, p->line, p->col);
195: perror(NULL);
196: exit(6);
197: }
198:
199: static void
1.6 schwarze 200: error_msg(struct parse *p, const char *fmt, ...)
201: {
202: va_list ap;
203:
1.29 schwarze 204: fprintf(stderr, "%s:%d:%d: ERROR: ", p->fname, p->line, p->col);
1.6 schwarze 205: va_start(ap, fmt);
206: vfprintf(stderr, fmt, ap);
207: va_end(ap);
208: fputc('\n', stderr);
1.29 schwarze 209: p->tree->flags |= TREE_ERROR;
1.6 schwarze 210: }
211:
212: static void
213: warn_msg(struct parse *p, const char *fmt, ...)
214: {
215: va_list ap;
216:
1.23 schwarze 217: if ((p->flags & PFLAG_WARN) == 0)
1.6 schwarze 218: return;
219:
1.29 schwarze 220: fprintf(stderr, "%s:%d:%d: WARNING: ", p->fname, p->line, p->col);
1.6 schwarze 221: va_start(ap, fmt);
222: vfprintf(stderr, fmt, ap);
223: va_end(ap);
224: fputc('\n', stderr);
1.29 schwarze 225: p->tree->flags |= TREE_WARN;
1.6 schwarze 226: }
227:
1.1 schwarze 228: /*
229: * Process a string of characters.
230: * If a text node is already open, append to it.
231: * Otherwise, create a new one as a child of the current node.
232: */
233: static void
1.35 schwarze 234: xml_text(struct parse *p, const char *word, int sz)
1.1 schwarze 235: {
1.35 schwarze 236: struct pnode *n, *np;
1.32 schwarze 237: size_t oldsz, newsz;
1.35 schwarze 238: int i;
1.1 schwarze 239:
1.32 schwarze 240: assert(sz > 0);
1.30 schwarze 241: if (p->del > 0)
1.1 schwarze 242: return;
243:
1.32 schwarze 244: if ((n = p->cur) == NULL) {
1.35 schwarze 245: error_msg(p, "discarding text before document: %.*s",
246: sz, word);
1.5 schwarze 247: return;
248: }
249:
1.35 schwarze 250: /* Append to the current text node, if one is open. */
251:
252: if (n->node == NODE_TEXT) {
253: oldsz = strlen(n->b);
254: newsz = oldsz + sz;
255: if (oldsz && (p->flags & PFLAG_SPC))
256: newsz++;
257: if ((n->b = realloc(n->b, newsz + 1)) == NULL)
1.30 schwarze 258: fatal(p);
1.35 schwarze 259: if (oldsz && (p->flags & PFLAG_SPC))
260: n->b[oldsz++] = ' ';
261: memcpy(n->b + oldsz, word, sz);
262: n->b[newsz] = '\0';
263: p->flags &= ~PFLAG_SPC;
264: return;
1.1 schwarze 265: }
266:
1.35 schwarze 267: if (p->tree->flags & TREE_CLOSED && n == p->tree->root)
1.30 schwarze 268: warn_msg(p, "text after end of document: %.*s", sz, word);
1.5 schwarze 269:
1.35 schwarze 270: /* Create a new text node. */
1.1 schwarze 271:
1.35 schwarze 272: if ((n = pnode_alloc(p->cur)) == NULL)
1.30 schwarze 273: fatal(p);
1.35 schwarze 274: n->node = NODE_TEXT;
275: n->spc = (p->flags & PFLAG_SPC) != 0;
1.30 schwarze 276: p->flags &= ~PFLAG_SPC;
1.35 schwarze 277:
278: /*
1.39 schwarze 279: * If this node follows an in-line macro without intervening
1.35 schwarze 280: * whitespace, keep the text in it as short as possible,
281: * and do not keep it open.
282: */
283:
1.39 schwarze 284: np = n->spc ? NULL : TAILQ_PREV(n, pnodeq, child);
285: while (np != NULL) {
286: switch (pnode_class(np->node)) {
287: case CLASS_VOID:
288: case CLASS_TEXT:
289: case CLASS_BLOCK:
1.45 schwarze 290: case CLASS_NOFILL:
1.39 schwarze 291: np = NULL;
292: break;
293: case CLASS_TRANS:
294: np = TAILQ_LAST(&np->childq, pnodeq);
295: continue;
296: case CLASS_LINE:
297: case CLASS_ENCL:
298: break;
299: }
300: break;
301: }
302: if (np != NULL) {
1.35 schwarze 303: i = 0;
304: while (i < sz && !isspace((unsigned char)word[i]))
305: i++;
306: if ((n->b = strndup(word, i)) == NULL)
307: fatal(p);
308: if (i == sz)
309: return;
310: while (i < sz && isspace((unsigned char)word[i]))
311: i++;
312: if (i == sz) {
313: p->flags |= PFLAG_SPC;
314: return;
315: }
316:
317: /* Put any remaining text into a second node. */
318:
319: if ((n = pnode_alloc(p->cur)) == NULL)
320: fatal(p);
321: n->node = NODE_TEXT;
322: n->spc = 1;
323: word += i;
324: sz -= i;
325: }
326: if ((n->b = strndup(word, sz)) == NULL)
327: fatal(p);
328:
329: /* The new node remains open for later pnode_closetext(). */
330:
331: p->cur = n;
1.1 schwarze 332: }
333:
1.16 schwarze 334: /*
335: * Close out the text node and strip trailing whitespace, if one is open.
336: */
1.1 schwarze 337: static void
1.37 schwarze 338: pnode_closetext(struct parse *p, int check_last_word)
1.1 schwarze 339: {
1.16 schwarze 340: struct pnode *n;
1.37 schwarze 341: char *cp, *last_word;
1.16 schwarze 342:
343: if ((n = p->cur) == NULL || n->node != NODE_TEXT)
344: return;
345: p->cur = n->parent;
1.32 schwarze 346: for (cp = strchr(n->b, '\0');
347: cp > n->b && isspace((unsigned char)cp[-1]);
348: *--cp = '\0')
1.23 schwarze 349: p->flags |= PFLAG_SPC;
1.37 schwarze 350:
351: if (p->flags & PFLAG_SPC || !check_last_word)
352: return;
353:
354: /*
355: * Find the beginning of the last word
356: * and delete whitespace before it.
357: */
358:
359: while (cp > n->b && !isspace((unsigned char)cp[-1]))
360: cp--;
361: if (cp == n->b)
362: return;
363:
364: last_word = cp;
365: while (cp > n->b && isspace((unsigned char)cp[-1]))
366: *--cp = '\0';
367:
368: /* Move the last word into its own node, for use with .Pf. */
369:
370: if ((n = pnode_alloc(p->cur)) == NULL)
371: fatal(p);
372: n->node = NODE_TEXT;
373: n->spc = 1;
374: if ((n->b = strdup(last_word)) == NULL)
375: fatal(p);
1.1 schwarze 376: }
377:
1.9 schwarze 378: static void
379: xml_entity(struct parse *p, const char *name)
380: {
381: const struct entity *entity;
1.30 schwarze 382: struct pnode *n;
1.23 schwarze 383: const char *ccp;
384: char *cp;
385: enum pstate pstate;
1.9 schwarze 386:
387: if (p->del > 0)
388: return;
389:
390: if (p->cur == NULL) {
391: error_msg(p, "discarding entity before document: &%s;", name);
392: return;
393: }
394:
1.37 schwarze 395: pnode_closetext(p, 0);
1.9 schwarze 396:
397: if (p->tree->flags & TREE_CLOSED && p->cur == p->tree->root)
398: warn_msg(p, "entity after end of document: &%s;", name);
399:
400: for (entity = entities; entity->name != NULL; entity++)
401: if (strcmp(name, entity->name) == 0)
402: break;
403:
404: if (entity->roff == NULL) {
1.23 schwarze 405: if (p->doctype != NULL) {
1.30 schwarze 406: TAILQ_FOREACH(n, &p->doctype->childq, child) {
407: if ((ccp = pnode_getattr_raw(n,
1.23 schwarze 408: ATTRKEY_NAME, NULL)) == NULL ||
1.25 schwarze 409: strcmp(ccp, name) != 0)
410: continue;
1.30 schwarze 411: if ((ccp = pnode_getattr_raw(n,
1.25 schwarze 412: ATTRKEY_SYSTEM, NULL)) != NULL) {
413: parse_file(p, -1, ccp);
414: p->flags &= ~PFLAG_SPC;
415: return;
416: }
1.30 schwarze 417: if ((ccp = pnode_getattr_raw(n,
1.23 schwarze 418: ATTRKEY_DEFINITION, NULL)) == NULL)
419: continue;
1.29 schwarze 420: if ((cp = strdup(ccp)) == NULL)
421: fatal(p);
1.23 schwarze 422: pstate = PARSE_ELEM;
423: parse_string(p, cp, strlen(cp), &pstate, 0);
424: p->flags &= ~PFLAG_SPC;
425: free(cp);
426: return;
427: }
428: }
1.9 schwarze 429: error_msg(p, "unknown entity &%s;", name);
430: return;
431: }
432:
433: /* Create, append, and close out an entity node. */
1.34 schwarze 434: if ((n = pnode_alloc(p->cur)) == NULL ||
1.32 schwarze 435: (n->b = strdup(entity->roff)) == NULL)
1.29 schwarze 436: fatal(p);
1.30 schwarze 437: n->node = NODE_ESCAPE;
438: n->spc = (p->flags & PFLAG_SPC) != 0;
1.23 schwarze 439: p->flags &= ~PFLAG_SPC;
1.9 schwarze 440: }
441:
1.1 schwarze 442: /*
1.39 schwarze 443: * Parse an element name.
444: */
445: static enum nodeid
446: xml_name2node(struct parse *p, const char *name)
447: {
448: const struct alias *alias;
449: enum nodeid node;
450:
451: if ((node = pnode_parse(name)) < NODE_UNKNOWN)
452: return node;
453:
454: for (alias = aliases; alias->name != NULL; alias++)
455: if (strcmp(alias->name, name) == 0)
456: return alias->node;
457:
458: return NODE_UNKNOWN;
459: }
460:
461: /*
1.1 schwarze 462: * Begin an element.
463: */
464: static void
1.30 schwarze 465: xml_elem_start(struct parse *p, const char *name)
1.1 schwarze 466: {
1.30 schwarze 467: struct pnode *n;
1.1 schwarze 468:
1.4 schwarze 469: /*
470: * An ancestor is excluded from the tree;
471: * keep track of the number of levels excluded.
472: */
1.30 schwarze 473: if (p->del > 0) {
1.23 schwarze 474: if (*name != '!' && *name != '?')
1.30 schwarze 475: p->del++;
1.4 schwarze 476: return;
477: }
478:
1.39 schwarze 479: switch (p->ncur = xml_name2node(p, name)) {
1.4 schwarze 480: case NODE_DELETE_WARN:
1.30 schwarze 481: warn_msg(p, "skipping element <%s>", name);
1.2 schwarze 482: /* FALLTHROUGH */
1.4 schwarze 483: case NODE_DELETE:
1.30 schwarze 484: p->del = 1;
1.4 schwarze 485: /* FALLTHROUGH */
1.2 schwarze 486: case NODE_IGNORE:
487: return;
1.39 schwarze 488: case NODE_UNKNOWN:
489: if (*name != '!' && *name != '?')
490: error_msg(p, "unknown element <%s>", name);
491: return;
1.2 schwarze 492: default:
493: break;
494: }
1.1 schwarze 495:
1.30 schwarze 496: if (p->tree->flags & TREE_CLOSED && p->cur->parent == NULL)
497: warn_msg(p, "element after end of document: <%s>", name);
1.5 schwarze 498:
1.39 schwarze 499: switch (pnode_class(p->ncur)) {
500: case CLASS_LINE:
501: case CLASS_ENCL:
502: pnode_closetext(p, 1);
503: break;
504: default:
505: pnode_closetext(p, 0);
506: break;
507: }
508:
1.34 schwarze 509: if ((n = pnode_alloc(p->cur)) == NULL)
1.30 schwarze 510: fatal(p);
1.17 schwarze 511:
512: /*
1.39 schwarze 513: * Some elements are self-closing.
1.17 schwarze 514: * Nodes that begin a new macro or request line or start by
515: * printing text always want whitespace before themselves.
516: */
517:
1.39 schwarze 518: switch (n->node = p->ncur) {
1.23 schwarze 519: case NODE_DOCTYPE:
520: case NODE_ENTITY:
521: case NODE_SBR:
1.30 schwarze 522: p->flags |= PFLAG_EEND;
1.17 schwarze 523: break;
524: default:
1.39 schwarze 525: break;
526: }
527: switch (pnode_class(p->ncur)) {
528: case CLASS_LINE:
529: case CLASS_ENCL:
1.30 schwarze 530: n->spc = (p->flags & PFLAG_SPC) != 0;
1.17 schwarze 531: break;
1.45 schwarze 532: case CLASS_NOFILL:
533: p->nofill++;
534: /* FALLTHROUGH */
1.39 schwarze 535: default:
536: n->spc = 1;
537: break;
1.17 schwarze 538: }
1.30 schwarze 539: p->cur = n;
540: if (n->node == NODE_DOCTYPE) {
541: if (p->doctype == NULL)
542: p->doctype = n;
1.23 schwarze 543: else
1.30 schwarze 544: error_msg(p, "duplicate doctype");
545: } else if (n->parent == NULL && p->tree->root == NULL)
546: p->tree->root = n;
1.5 schwarze 547: }
548:
549: static void
1.30 schwarze 550: xml_attrkey(struct parse *p, const char *name)
1.5 schwarze 551: {
1.30 schwarze 552: struct pattr *a;
1.23 schwarze 553: const char *value;
1.5 schwarze 554: enum attrkey key;
1.1 schwarze 555:
1.47 ! schwarze 556: if (p->del > 0 || p->ncur >= NODE_UNKNOWN || *name == '\0')
1.5 schwarze 557: return;
1.23 schwarze 558:
1.30 schwarze 559: if ((p->ncur == NODE_DOCTYPE || p->ncur == NODE_ENTITY) &&
560: TAILQ_FIRST(&p->cur->attrq) == NULL) {
1.23 schwarze 561: value = name;
562: name = "NAME";
563: } else
564: value = NULL;
565:
1.5 schwarze 566: if ((key = attrkey_parse(name)) == ATTRKEY__MAX) {
1.30 schwarze 567: p->flags &= ~PFLAG_ATTR;
1.5 schwarze 568: return;
569: }
1.30 schwarze 570: if ((a = calloc(1, sizeof(*a))) == NULL)
571: fatal(p);
1.29 schwarze 572:
1.30 schwarze 573: a->key = key;
574: a->val = ATTRVAL__MAX;
1.23 schwarze 575: if (value == NULL) {
1.30 schwarze 576: a->rawval = NULL;
577: p->flags |= PFLAG_ATTR;
1.23 schwarze 578: } else {
1.30 schwarze 579: if ((a->rawval = strdup(value)) == NULL)
580: fatal(p);
581: p->flags &= ~PFLAG_ATTR;
582: }
583: TAILQ_INSERT_TAIL(&p->cur->attrq, a, child);
584: if (p->ncur == NODE_ENTITY && key == ATTRKEY_NAME)
585: xml_attrkey(p, "DEFINITION");
1.5 schwarze 586: }
587:
588: static void
1.30 schwarze 589: xml_attrval(struct parse *p, const char *name)
1.5 schwarze 590: {
1.30 schwarze 591: struct pattr *a;
1.5 schwarze 592:
1.47 ! schwarze 593: if (p->del > 0 || p->ncur >= NODE_UNKNOWN ||
1.30 schwarze 594: (p->flags & PFLAG_ATTR) == 0)
1.5 schwarze 595: return;
1.30 schwarze 596: if ((a = TAILQ_LAST(&p->cur->attrq, pattrq)) == NULL)
1.5 schwarze 597: return;
1.30 schwarze 598: if ((a->val = attrval_parse(name)) == ATTRVAL__MAX &&
599: (a->rawval = strdup(name)) == NULL)
600: fatal(p);
601: p->flags &= ~PFLAG_ATTR;
1.1 schwarze 602: }
603:
604: /*
605: * Roll up the parse tree.
606: * If we're at a text node, roll that one up first.
607: */
608: static void
1.31 schwarze 609: xml_elem_end(struct parse *p, const char *name)
1.1 schwarze 610: {
1.26 schwarze 611: struct pnode *n;
612: const char *cp;
1.5 schwarze 613: enum nodeid node;
1.1 schwarze 614:
1.4 schwarze 615: /*
616: * An ancestor is excluded from the tree;
617: * keep track of the number of levels excluded.
618: */
1.31 schwarze 619: if (p->del > 1) {
620: p->del--;
1.4 schwarze 621: return;
622: }
623:
1.31 schwarze 624: if (p->del == 0)
1.37 schwarze 625: pnode_closetext(p, 0);
1.2 schwarze 626:
1.39 schwarze 627: node = name == NULL ? p->ncur : xml_name2node(p, name);
1.2 schwarze 628:
1.5 schwarze 629: switch (node) {
1.4 schwarze 630: case NODE_DELETE_WARN:
631: case NODE_DELETE:
1.31 schwarze 632: if (p->del > 0)
633: p->del--;
1.4 schwarze 634: break;
1.2 schwarze 635: case NODE_IGNORE:
1.39 schwarze 636: case NODE_UNKNOWN:
1.26 schwarze 637: break;
638: case NODE_INCLUDE:
1.31 schwarze 639: n = p->cur;
640: p->cur = p->cur->parent;
1.26 schwarze 641: cp = pnode_getattr_raw(n, ATTRKEY_HREF, NULL);
642: if (cp == NULL)
1.31 schwarze 643: error_msg(p, "<xi:include> element "
1.26 schwarze 644: "without href attribute");
645: else
1.31 schwarze 646: parse_file(p, -1, cp);
1.26 schwarze 647: pnode_unlink(n);
1.31 schwarze 648: p->flags &= ~PFLAG_SPC;
1.2 schwarze 649: break;
1.23 schwarze 650: case NODE_DOCTYPE:
1.32 schwarze 651: case NODE_SBR:
1.31 schwarze 652: p->flags &= ~PFLAG_EEND;
1.23 schwarze 653: /* FALLTHROUGH */
1.2 schwarze 654: default:
1.31 schwarze 655: if (p->cur == NULL || node != p->cur->node) {
656: warn_msg(p, "element not open: </%s>", name);
1.5 schwarze 657: break;
658: }
1.45 schwarze 659: if (pnode_class(node) == CLASS_NOFILL)
660: p->nofill--;
1.5 schwarze 661:
662: /*
663: * Refrain from actually closing the document element.
664: * If no more content follows, no harm is done, but if
665: * some content still follows, simply processing it is
666: * obviously better than discarding it or crashing.
667: */
668:
1.31 schwarze 669: if (p->cur->parent != NULL || node == NODE_DOCTYPE) {
670: p->cur = p->cur->parent;
671: if (p->cur != NULL)
672: p->ncur = p->cur->node;
1.23 schwarze 673: } else
1.31 schwarze 674: p->tree->flags |= TREE_CLOSED;
675: p->flags &= ~PFLAG_SPC;
1.4 schwarze 676: break;
1.2 schwarze 677: }
1.31 schwarze 678: assert(p->del == 0);
1.1 schwarze 679: }
680:
681: struct parse *
682: parse_alloc(int warn)
683: {
684: struct parse *p;
685:
686: if ((p = calloc(1, sizeof(*p))) == NULL)
687: return NULL;
688:
689: if ((p->tree = calloc(1, sizeof(*p->tree))) == NULL) {
690: free(p);
691: return NULL;
692: }
1.23 schwarze 693: if (warn)
694: p->flags |= PFLAG_WARN;
695: else
696: p->flags &= ~PFLAG_WARN;
1.1 schwarze 697: return p;
698: }
699:
700: void
701: parse_free(struct parse *p)
702: {
703: if (p == NULL)
704: return;
705: if (p->tree != NULL) {
706: pnode_unlink(p->tree->root);
707: free(p->tree);
708: }
709: free(p);
710: }
711:
1.14 schwarze 712: static void
713: increment(struct parse *p, char *b, size_t *pend, int refill)
714: {
715: if (refill) {
716: if (b[*pend] == '\n') {
717: p->nline++;
718: p->ncol = 1;
719: } else
720: p->ncol++;
721: }
722: ++*pend;
723: }
724:
1.5 schwarze 725: /*
726: * Advance the pend pointer to the next character in the charset.
727: * If the charset starts with a space, it stands for any whitespace.
728: * Update the new input file position, used for messages.
729: * Do not overrun the buffer b of length rlen.
730: * When reaching the end, NUL-terminate the buffer and return 1;
731: * otherwise, return 0.
732: */
733: static int
734: advance(struct parse *p, char *b, size_t rlen, size_t *pend,
1.14 schwarze 735: const char *charset, int refill)
1.5 schwarze 736: {
737: int space;
738:
739: if (*charset == ' ') {
740: space = 1;
741: charset++;
742: } else
743: space = 0;
744:
1.14 schwarze 745: if (refill) {
746: p->nline = p->line;
747: p->ncol = p->col;
748: }
1.5 schwarze 749: while (*pend < rlen) {
750: if (space && isspace((unsigned char)b[*pend]))
751: break;
752: if (strchr(charset, b[*pend]) != NULL)
753: break;
1.14 schwarze 754: increment(p, b, pend, refill);
1.5 schwarze 755: }
756: if (*pend == rlen) {
757: b[rlen] = '\0';
1.14 schwarze 758: return refill;
1.5 schwarze 759: } else
760: return 0;
761: }
762:
1.14 schwarze 763: size_t
764: parse_string(struct parse *p, char *b, size_t rlen,
765: enum pstate *pstate, int refill)
766: {
767: char *cp;
1.45 schwarze 768: size_t pws; /* Parse offset including whitespace. */
1.14 schwarze 769: size_t poff; /* Parse offset in b[]. */
770: size_t pend; /* Offset of the end of the current word. */
771: int elem_end;
772:
1.45 schwarze 773: pend = pws = 0;
1.14 schwarze 774: for (;;) {
775:
776: /* Proceed to the next token, skipping whitespace. */
777:
778: if (refill) {
779: p->line = p->nline;
780: p->col = p->ncol;
781: }
782: if ((poff = pend) == rlen)
783: break;
784: if (isspace((unsigned char)b[pend])) {
1.23 schwarze 785: p->flags |= PFLAG_SPC;
1.45 schwarze 786: if (b[pend] == '\n')
787: pws = pend + 1;
1.14 schwarze 788: increment(p, b, &pend, refill);
789: continue;
790: }
791:
792: /*
793: * The following four cases (ARG, TAG, and starting an
794: * entity or a tag) all parse a word or quoted string.
795: * If that extends beyond the read buffer and the last
796: * read(2) still got data, they all break out of the
797: * token loop to request more data from the read loop.
798: *
799: * Also, three of them detect self-closing tags, those
800: * ending with "/>", setting the flag elem_end and
801: * calling xml_elem_end() at the very end, after
802: * handling the attribute value, attribute name, or
803: * tag name, respectively.
804: */
805:
806: /* Parse an attribute value. */
807:
808: if (*pstate >= PARSE_ARG) {
809: if (*pstate == PARSE_ARG &&
810: (b[pend] == '\'' || b[pend] == '"')) {
811: *pstate = b[pend] == '"' ?
812: PARSE_DQ : PARSE_SQ;
813: increment(p, b, &pend, refill);
814: continue;
815: }
816: if (advance(p, b, rlen, &pend,
817: *pstate == PARSE_DQ ? "\"" :
818: *pstate == PARSE_SQ ? "'" : " >", refill))
819: break;
820: *pstate = PARSE_TAG;
821: elem_end = 0;
822: if (b[pend] == '>') {
823: *pstate = PARSE_ELEM;
824: if (pend > 0 && b[pend - 1] == '/') {
825: b[pend - 1] = '\0';
826: elem_end = 1;
827: }
1.23 schwarze 828: if (p->flags & PFLAG_EEND)
829: elem_end = 1;
1.14 schwarze 830: }
831: b[pend] = '\0';
832: if (pend < rlen)
833: increment(p, b, &pend, refill);
834: xml_attrval(p, b + poff);
835: if (elem_end)
836: xml_elem_end(p, NULL);
837:
838: /* Look for an attribute name. */
839:
840: } else if (*pstate == PARSE_TAG) {
1.23 schwarze 841: switch (p->ncur) {
842: case NODE_DOCTYPE:
843: if (b[pend] == '[') {
844: *pstate = PARSE_ELEM;
845: increment(p, b, &pend, refill);
846: continue;
847: }
848: /* FALLTHROUGH */
849: case NODE_ENTITY:
850: if (b[pend] == '"' || b[pend] == '\'') {
851: *pstate = PARSE_ARG;
852: continue;
853: }
854: break;
855: default:
856: break;
857: }
1.14 schwarze 858: if (advance(p, b, rlen, &pend, " =>", refill))
859: break;
860: elem_end = 0;
861: switch (b[pend]) {
862: case '>':
863: *pstate = PARSE_ELEM;
864: if (pend > 0 && b[pend - 1] == '/') {
865: b[pend - 1] = '\0';
866: elem_end = 1;
867: }
1.23 schwarze 868: if (p->flags & PFLAG_EEND)
869: elem_end = 1;
1.14 schwarze 870: break;
871: case '=':
872: *pstate = PARSE_ARG;
873: break;
874: default:
875: break;
876: }
877: b[pend] = '\0';
878: if (pend < rlen)
879: increment(p, b, &pend, refill);
880: xml_attrkey(p, b + poff);
881: if (elem_end)
882: xml_elem_end(p, NULL);
883:
884: /* Begin an opening or closing tag. */
885:
886: } else if (b[poff] == '<') {
887: if (advance(p, b, rlen, &pend, " >", refill))
888: break;
889: if (pend > poff + 3 &&
890: strncmp(b + poff, "<!--", 4) == 0) {
891:
892: /* Skip a comment. */
893:
894: cp = strstr(b + pend - 2, "-->");
895: if (cp == NULL) {
896: if (refill)
897: break;
898: cp = b + rlen;
899: } else
900: cp += 3;
901: while (b + pend < cp)
902: increment(p, b, &pend, refill);
903: continue;
904: }
905: elem_end = 0;
906: if (b[pend] != '>')
907: *pstate = PARSE_TAG;
908: else if (pend > 0 && b[pend - 1] == '/') {
909: b[pend - 1] = '\0';
910: elem_end = 1;
911: }
912: b[pend] = '\0';
913: if (pend < rlen)
914: increment(p, b, &pend, refill);
915: if (b[++poff] == '/') {
916: elem_end = 1;
917: poff++;
1.23 schwarze 918: } else {
1.14 schwarze 919: xml_elem_start(p, b + poff);
1.23 schwarze 920: if (*pstate == PARSE_ELEM &&
921: p->flags & PFLAG_EEND)
922: elem_end = 1;
923: }
1.14 schwarze 924: if (elem_end)
925: xml_elem_end(p, b + poff);
926:
1.23 schwarze 927: /* Close a doctype. */
928:
929: } else if (p->ncur == NODE_DOCTYPE && b[poff] == ']') {
930: *pstate = PARSE_TAG;
931: increment(p, b, &pend, refill);
932:
1.14 schwarze 933: /* Process an entity. */
934:
935: } else if (b[poff] == '&') {
936: if (advance(p, b, rlen, &pend, ";", refill))
937: break;
938: b[pend] = '\0';
939: if (pend < rlen)
940: increment(p, b, &pend, refill);
941: xml_entity(p, b + poff + 1);
942:
943: /* Process text up to the next tag, entity, or EOL. */
944:
945: } else {
1.28 schwarze 946: advance(p, b, rlen, &pend,
1.33 schwarze 947: p->ncur == NODE_DOCTYPE ? "<&]\n" : "<&\n",
1.28 schwarze 948: refill);
1.45 schwarze 949: if (p->nofill)
950: poff = pws;
1.35 schwarze 951: xml_text(p, b + poff, pend - poff);
1.33 schwarze 952: if (b[pend] == '\n')
1.37 schwarze 953: pnode_closetext(p, 0);
1.14 schwarze 954: }
1.45 schwarze 955: pws = pend;
1.14 schwarze 956: }
957: return poff;
958: }
959:
1.24 schwarze 960:
961: /*
962: * The read loop.
963: * If the previous token was incomplete and asked for more input,
964: * we have to enter the read loop once more even on EOF.
965: * Once rsz is 0, incomplete tokens will no longer ask for more input
966: * but instead use whatever there is, and then exit the read loop.
967: * The minus one on the size limit for read(2) is needed such that
968: * advance() can set b[rlen] to NUL when needed.
969: */
970: static void
971: parse_fd(struct parse *p, int fd)
1.1 schwarze 972: {
973: char b[4096];
1.5 schwarze 974: ssize_t rsz; /* Return value from read(2). */
1.14 schwarze 975: size_t rlen; /* Number of bytes in b[]. */
1.5 schwarze 976: size_t poff; /* Parse offset in b[]. */
1.14 schwarze 977: enum pstate pstate;
1.1 schwarze 978:
1.24 schwarze 979: rlen = 0;
1.14 schwarze 980: pstate = PARSE_ELEM;
981: while ((rsz = read(fd, b + rlen, sizeof(b) - rlen - 1)) >= 0 &&
982: (rlen += rsz) > 0) {
983: poff = parse_string(p, b, rlen, &pstate, rsz > 0);
1.5 schwarze 984: /* Buffer exhausted; shift left and re-fill. */
985: assert(poff > 0);
986: rlen -= poff;
1.14 schwarze 987: memmove(b, b + poff, rlen);
1.5 schwarze 988: }
1.24 schwarze 989: if (rsz < 0)
990: error_msg(p, "read: %s", strerror(errno));
991: }
992:
993: /*
994: * Open and parse a file.
995: */
996: struct ptree *
997: parse_file(struct parse *p, int fd, const char *fname)
998: {
999: const char *save_fname;
1000: int save_line, save_col;
1001:
1002: /* Save and initialize reporting data. */
1003:
1004: save_fname = p->fname;
1005: save_line = p->nline;
1006: save_col = p->ncol;
1007: p->fname = fname;
1008: p->line = 0;
1009: p->col = 0;
1010:
1011: /* Open the file, unless it is already open. */
1012:
1013: if (fd == -1 && (fd = open(fname, O_RDONLY, 0)) == -1) {
1014: error_msg(p, "open: %s", strerror(errno));
1015: p->fname = save_fname;
1016: return p->tree;
1.5 schwarze 1017: }
1.24 schwarze 1018:
1019: /*
1020: * After opening the starting file, change to the directory it
1021: * is located in, in case it wants to include any further files,
1022: * which are typically given with relative paths in DocBook.
1023: * Do this on a best-effort basis; don't complain about failure.
1024: */
1025:
1026: if (save_fname == NULL && (fname = dirname(fname)) != NULL &&
1027: strcmp(fname, ".") != 0)
1028: (void)chdir(fname);
1029:
1030: /* Run the read loop. */
1031:
1032: p->nline = 1;
1033: p->ncol = 1;
1034: parse_fd(p, fd);
1035:
1036: /* On the top level, finalize the parse tree. */
1037:
1038: if (save_fname == NULL) {
1.37 schwarze 1039: pnode_closetext(p, 0);
1.24 schwarze 1040: if (p->tree->root == NULL)
1041: error_msg(p, "empty document");
1042: else if ((p->tree->flags & TREE_CLOSED) == 0)
1043: warn_msg(p, "document not closed");
1044: pnode_unlink(p->doctype);
1045: }
1046:
1047: /* Clean up. */
1048:
1049: if (fd != STDIN_FILENO)
1050: close(fd);
1051: p->fname = save_fname;
1052: p->nline = save_line;
1053: p->ncol = save_col;
1.1 schwarze 1054: return p->tree;
1055: }
CVSweb