Annotation of docbook2mdoc/parse.c, Revision 1.44
1.44 ! schwarze 1: /* $Id: parse.c,v 1.43 2019/04/14 18:07:35 schwarze Exp $ */
1.1 schwarze 2: /*
3: * Copyright (c) 2014 Kristaps Dzonsons <kristaps@bsd.lv>
4: * Copyright (c) 2019 Ingo Schwarze <schwarze@openbsd.org>
5: *
6: * Permission to use, copy, modify, and distribute this software for any
7: * purpose with or without fee is hereby granted, provided that the above
8: * copyright notice and this permission notice appear in all copies.
9: *
10: * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
11: * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12: * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
13: * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14: * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15: * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16: * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17: */
18: #include <assert.h>
19: #include <ctype.h>
1.24 schwarze 20: #include <errno.h>
21: #include <fcntl.h>
22: #include <libgen.h>
1.6 schwarze 23: #include <stdarg.h>
1.1 schwarze 24: #include <stdio.h>
1.5 schwarze 25: #include <stdlib.h>
1.1 schwarze 26: #include <string.h>
27: #include <unistd.h>
28:
29: #include "node.h"
30: #include "parse.h"
31:
32: /*
33: * The implementation of the DocBook parser.
34: */
35:
1.14 schwarze 36: enum pstate {
/*
 * Tokenizer state of parse_string(), kept across calls.
 * The order matters: parse_string() tests "*pstate >= PARSE_ARG"
 * to recognize all three attribute value states at once.
 */
37: PARSE_ELEM, /* Outside of tags: text, entity, or start of a tag. */
38: PARSE_TAG, /* Inside a tag, looking for an attribute name. */
39: PARSE_ARG, /* Expecting an attribute value, quoted or not. */
40: PARSE_SQ, /* Inside a single-quoted attribute value. */
41: PARSE_DQ /* Inside a double-quoted attribute value. */
42: };
43:
1.1 schwarze 44: /*
45: * Global parse state.
46: * Keep this as simple and small as possible.
47: */
48: struct parse {
49: const char *fname; /* Name of the input file. */
50: struct ptree *tree; /* Complete parse result. */
1.23 schwarze 51: struct pnode *doctype; /* First NODE_DOCTYPE node seen, or NULL. */
1.1 schwarze 52: struct pnode *cur; /* Current node in the tree. */
1.5 schwarze 53: enum nodeid ncur; /* Type of the current node. */
54: int line; /* Line number in the input file. */
55: int col; /* Column number in the input file. */
56: int nline; /* Line number of next token. */
57: int ncol; /* Column number of next token. */
1.4 schwarze 58: int del; /* Levels of nested nodes being deleted. */
1.23 schwarze 59: int flags; /* Bitwise OR of the PFLAG_* values below. */
60: #define PFLAG_WARN (1 << 0) /* Print warning messages. */
61: #define PFLAG_SPC (1 << 1) /* Whitespace before the next element. */
62: #define PFLAG_ATTR (1 << 2) /* The most recent attribute is valid. */
63: #define PFLAG_EEND (1 << 3) /* This element is self-closing. */
1.1 schwarze 64: };
65:
1.39 schwarze 66: struct alias {
/*
 * One entry of the aliases[] table, consulted by xml_name2node()
 * for element names that pnode_parse() does not resolve.
 */
1.1 schwarze 67: const char *name; /* DocBook element name. */
68: enum nodeid node; /* Node type to generate. */
69: };
70:
1.39 schwarze 71: static const struct alias aliases[] = {
/*
 * Element names handled like another node type, ignored, or deleted.
 * The table is searched linearly by xml_name2node(), so the order of
 * the entries does not affect the result; the entry with a NULL name
 * terminates the table.
 */
1.3 schwarze 72: { "acronym", NODE_IGNORE },
1.43 schwarze 73: { "affiliation", NODE_IGNORE },
1.4 schwarze 74: { "anchor", NODE_DELETE },
1.42 schwarze 75: { "application", NODE_COMMAND },
1.22 schwarze 76: { "article", NODE_SECTION },
1.41 schwarze 77: { "articleinfo", NODE_BOOKINFO },
1.22 schwarze 78: { "book", NODE_SECTION },
1.1 schwarze 79: { "chapter", NODE_SECTION },
1.44 ! schwarze 80: { "caption", NODE_IGNORE },
1.13 schwarze 81: { "code", NODE_LITERAL },
1.36 schwarze 82: { "computeroutput", NODE_LITERAL },
1.23 schwarze 83: { "!doctype", NODE_DOCTYPE },
1.44 ! schwarze 84: { "figure", NODE_IGNORE },
1.7 schwarze 85: { "firstname", NODE_PERSONNAME },
1.21 schwarze 86: { "glossary", NODE_VARIABLELIST },
87: { "glossdef", NODE_IGNORE },
88: { "glossdiv", NODE_IGNORE },
89: { "glossentry", NODE_VARLISTENTRY },
90: { "glosslist", NODE_VARIABLELIST },
1.43 schwarze 91: { "holder", NODE_IGNORE },
1.44 ! schwarze 92: { "imageobject", NODE_IGNORE },
1.4 schwarze 93: { "indexterm", NODE_DELETE },
1.11 schwarze 94: { "informaltable", NODE_TABLE },
1.42 schwarze 95: { "keycap", NODE_KEYSYM },
96: { "keycode", NODE_IGNORE },
1.44 ! schwarze 97: { "mediaobject", NODE_BLOCKQUOTE },
1.43 schwarze 98: { "orgname", NODE_IGNORE },
1.40 schwarze 99: { "othercredit", NODE_AUTHOR },
1.7 schwarze 100: { "othername", NODE_PERSONNAME },
1.1 schwarze 101: { "part", NODE_SECTION },
1.3 schwarze 102: { "phrase", NODE_IGNORE },
1.4 schwarze 103: { "primary", NODE_DELETE },
1.42 schwarze 104: { "property", NODE_PARAMETER },
1.1 schwarze 105: { "refsect1", NODE_SECTION },
106: { "refsect2", NODE_SECTION },
107: { "refsect3", NODE_SECTION },
108: { "refsection", NODE_SECTION },
1.43 schwarze 109: { "releaseinfo", NODE_IGNORE },
1.42 schwarze 110: { "returnvalue", NODE_IGNORE },
1.4 schwarze 111: { "secondary", NODE_DELETE },
1.1 schwarze 112: { "sect1", NODE_SECTION },
113: { "sect2", NODE_SECTION },
1.36 schwarze 114: { "sgmltag", NODE_MARKUP },
1.15 schwarze 115: { "simpara", NODE_PARA },
1.13 schwarze 116: { "structfield", NODE_PARAMETER },
117: { "structname", NODE_TYPE },
1.7 schwarze 118: { "surname", NODE_PERSONNAME },
1.12 schwarze 119: { "symbol", NODE_CONSTANT },
1.3 schwarze 120: { "trademark", NODE_IGNORE },
1.18 schwarze 121: { "ulink", NODE_LINK },
1.13 schwarze 122: { "userinput", NODE_LITERAL },
1.43 schwarze 123: { "year", NODE_IGNORE },
1.5 schwarze 124: { NULL, NODE_IGNORE }
1.1 schwarze 125: };
126:
1.9 schwarze 127: struct entity {
/* One entry of the entities[] table used by xml_entity(). */
128: const char *name; /* Entity name, without the "&" and ";". */
129: const char *roff; /* Text copied into the NODE_ESCAPE node. */
130: };
131:
132: /*
133: * XML character entity references found in the wild.
134: * Those that don't have an exact mandoc_char(7) representation
135: * are approximated, and the desired codepoint is given as a comment.
136: * Encoding them as \\[u...] would leave -Tascii out in the cold.
137: */
138: static const struct entity entities[] = {
/*
 * Searched linearly by xml_entity() with strcmp(3), so names are
 * case-sensitive ("Delta" versus "rarr"/"rArr").  The entry with
 * NULL members terminates the table and doubles as "not found".
 */
139: { "alpha", "\\(*a" },
140: { "amp", "&" },
141: { "apos", "'" },
142: { "auml", "\\(:a" },
143: { "beta", "\\(*b" },
144: { "circ", "^" }, /* U+02C6 */
145: { "copy", "\\(co" },
146: { "dagger", "\\(dg" },
147: { "Delta", "\\(*D" },
148: { "eacute", "\\('e" },
149: { "emsp", "\\ " }, /* U+2003 */
150: { "gt", ">" },
151: { "hairsp", "\\^" },
152: { "kappa", "\\(*k" },
153: { "larr", "\\(<-" },
154: { "ldquo", "\\(lq" },
155: { "le", "\\(<=" },
156: { "lowbar", "_" },
157: { "lsqb", "[" },
158: { "lt", "<" },
159: { "mdash", "\\(em" },
160: { "minus", "\\-" },
161: { "ndash", "\\(en" },
162: { "nbsp", "\\ " },
163: { "num", "#" },
164: { "oslash", "\\(/o" },
165: { "ouml", "\\(:o" },
166: { "percnt", "%" },
167: { "quot", "\\(dq" },
168: { "rarr", "\\(->" },
169: { "rArr", "\\(rA" },
170: { "rdquo", "\\(rq" },
171: { "reg", "\\(rg" },
172: { "rho", "\\(*r" },
173: { "rsqb", "]" },
174: { "sigma", "\\(*s" },
175: { "shy", "\\&" }, /* U+00AD */
176: { "tau", "\\(*t" },
177: { "tilde", "\\[u02DC]" },
178: { "times", "\\[tmu]" },
179: { "uuml", "\\(:u" },
180: { NULL, NULL }
181: };
182:
1.23 schwarze 183: static size_t parse_string(struct parse *, char *, size_t,
184: enum pstate *, int);
1.24 schwarze 185: static void parse_fd(struct parse *, int);
/*
 * Forward declarations.  parse_string() is called from xml_entity(),
 * which is defined before it; both functions are file-local.
 */
1.23 schwarze 186:
187:
1.6 schwarze 188: static void
1.29 schwarze 189: fatal(struct parse *p)
190: {
191: fprintf(stderr, "%s:%d:%d: FATAL: ", p->fname, p->line, p->col);
192: perror(NULL);
193: exit(6);
194: }
195:
196: static void
1.6 schwarze 197: error_msg(struct parse *p, const char *fmt, ...)
198: {
199: va_list ap;
200:
1.29 schwarze 201: fprintf(stderr, "%s:%d:%d: ERROR: ", p->fname, p->line, p->col);
1.6 schwarze 202: va_start(ap, fmt);
203: vfprintf(stderr, fmt, ap);
204: va_end(ap);
205: fputc('\n', stderr);
1.29 schwarze 206: p->tree->flags |= TREE_ERROR;
1.6 schwarze 207: }
208:
209: static void
210: warn_msg(struct parse *p, const char *fmt, ...)
211: {
212: va_list ap;
213:
1.23 schwarze 214: if ((p->flags & PFLAG_WARN) == 0)
1.6 schwarze 215: return;
216:
1.29 schwarze 217: fprintf(stderr, "%s:%d:%d: WARNING: ", p->fname, p->line, p->col);
1.6 schwarze 218: va_start(ap, fmt);
219: vfprintf(stderr, fmt, ap);
220: va_end(ap);
221: fputc('\n', stderr);
1.29 schwarze 222: p->tree->flags |= TREE_WARN;
1.6 schwarze 223: }
224:
1.1 schwarze 225: /*
226: * Process a string of characters.
227: * If a text node is already open, append to it.
228: * Otherwise, create a new one as a child of the current node.
229: */
230: static void
1.35 schwarze 231: xml_text(struct parse *p, const char *word, int sz)
1.1 schwarze 232: {
/*
 * word points to sz bytes of text; it is only accessed with explicit
 * lengths here, so it need not be NUL-terminated.
 * On return, p->cur may point to a text node that remains open.
 */
1.35 schwarze 233: struct pnode *n, *np;
1.32 schwarze 234: size_t oldsz, newsz;
1.35 schwarze 235: int i;
1.1 schwarze 236:
1.32 schwarze 237: assert(sz > 0);
/* Text inside a subtree that is being deleted is dropped silently. */
1.30 schwarze 238: if (p->del > 0)
1.1 schwarze 239: return;
240:
1.32 schwarze 241: if ((n = p->cur) == NULL) {
1.35 schwarze 242: error_msg(p, "discarding text before document: %.*s",
243: sz, word);
1.5 schwarze 244: return;
245: }
246:
1.35 schwarze 247: /* Append to the current text node, if one is open. */
248:
249: if (n->node == NODE_TEXT) {
250: oldsz = strlen(n->b);
251: newsz = oldsz + sz;
/* Skipped whitespace is represented by one blank between the parts. */
252: if (oldsz && (p->flags & PFLAG_SPC))
253: newsz++;
254: if ((n->b = realloc(n->b, newsz + 1)) == NULL)
1.30 schwarze 255: fatal(p);
1.35 schwarze 256: if (oldsz && (p->flags & PFLAG_SPC))
257: n->b[oldsz++] = ' ';
258: memcpy(n->b + oldsz, word, sz);
259: n->b[newsz] = '\0';
260: p->flags &= ~PFLAG_SPC;
261: return;
1.1 schwarze 262: }
263:
/* The root element was already closed; the text is still processed. */
1.35 schwarze 264: if (p->tree->flags & TREE_CLOSED && n == p->tree->root)
1.30 schwarze 265: warn_msg(p, "text after end of document: %.*s", sz, word);
1.5 schwarze 266:
1.35 schwarze 267: /* Create a new text node. */
1.1 schwarze 268:
1.35 schwarze 269: if ((n = pnode_alloc(p->cur)) == NULL)
1.30 schwarze 270: fatal(p);
1.35 schwarze 271: n->node = NODE_TEXT;
272: n->spc = (p->flags & PFLAG_SPC) != 0;
1.30 schwarze 273: p->flags &= ~PFLAG_SPC;
1.35 schwarze 274:
275: /*
1.39 schwarze 276: * If this node follows an in-line macro without intervening
1.35 schwarze 277: * whitespace, keep the text in it as short as possible,
278: * and do not keep it open.
279: */
280:
/*
 * Inspect the preceding sibling, descending into the last child of
 * CLASS_TRANS nodes.  np ends up non-NULL only if a CLASS_LINE or
 * CLASS_ENCL node is found and no whitespace precedes the new text.
 */
1.39 schwarze 281: np = n->spc ? NULL : TAILQ_PREV(n, pnodeq, child);
282: while (np != NULL) {
283: switch (pnode_class(np->node)) {
284: case CLASS_VOID:
285: case CLASS_TEXT:
286: case CLASS_BLOCK:
287: np = NULL;
288: break;
289: case CLASS_TRANS:
290: np = TAILQ_LAST(&np->childq, pnodeq);
291: continue;
292: case CLASS_LINE:
293: case CLASS_ENCL:
294: break;
295: }
296: break;
297: }
298: if (np != NULL) {
/* Store only the first word, up to the first whitespace character. */
1.35 schwarze 299: i = 0;
300: while (i < sz && !isspace((unsigned char)word[i]))
301: i++;
302: if ((n->b = strndup(word, i)) == NULL)
303: fatal(p);
/* No whitespace at all: the short node holds everything. */
304: if (i == sz)
305: return;
306: while (i < sz && isspace((unsigned char)word[i]))
307: i++;
/* Only trailing whitespace follows: remember it for the next node. */
308: if (i == sz) {
309: p->flags |= PFLAG_SPC;
310: return;
311: }
312:
313: /* Put any remaining text into a second node. */
314:
315: if ((n = pnode_alloc(p->cur)) == NULL)
316: fatal(p);
317: n->node = NODE_TEXT;
318: n->spc = 1;
319: word += i;
320: sz -= i;
321: }
322: if ((n->b = strndup(word, sz)) == NULL)
323: fatal(p);
324:
325: /* The new node remains open for later pnode_closetext(). */
326:
327: p->cur = n;
1.1 schwarze 328: }
329:
1.16 schwarze 330: /*
331: * Close out the text node and strip trailing whitespace, if one is open.
332: */
1.1 schwarze 333: static void
1.37 schwarze 334: pnode_closetext(struct parse *p, int check_last_word)
1.1 schwarze 335: {
1.16 schwarze 336: struct pnode *n;
1.37 schwarze 337: char *cp, *last_word;
1.16 schwarze 338:
339: if ((n = p->cur) == NULL || n->node != NODE_TEXT)
340: return;
341: p->cur = n->parent;
1.32 schwarze 342: for (cp = strchr(n->b, '\0');
343: cp > n->b && isspace((unsigned char)cp[-1]);
344: *--cp = '\0')
1.23 schwarze 345: p->flags |= PFLAG_SPC;
1.37 schwarze 346:
347: if (p->flags & PFLAG_SPC || !check_last_word)
348: return;
349:
350: /*
351: * Find the beginning of the last word
352: * and delete whitespace before it.
353: */
354:
355: while (cp > n->b && !isspace((unsigned char)cp[-1]))
356: cp--;
357: if (cp == n->b)
358: return;
359:
360: last_word = cp;
361: while (cp > n->b && isspace((unsigned char)cp[-1]))
362: *--cp = '\0';
363:
364: /* Move the last word into its own node, for use with .Pf. */
365:
366: if ((n = pnode_alloc(p->cur)) == NULL)
367: fatal(p);
368: n->node = NODE_TEXT;
369: n->spc = 1;
370: if ((n->b = strdup(last_word)) == NULL)
371: fatal(p);
1.1 schwarze 372: }
373:
1.9 schwarze 374: static void
375: xml_entity(struct parse *p, const char *name)
376: {
377: const struct entity *entity;
1.30 schwarze 378: struct pnode *n;
1.23 schwarze 379: const char *ccp;
380: char *cp;
381: enum pstate pstate;
1.9 schwarze 382:
383: if (p->del > 0)
384: return;
385:
386: if (p->cur == NULL) {
387: error_msg(p, "discarding entity before document: &%s;", name);
388: return;
389: }
390:
1.37 schwarze 391: pnode_closetext(p, 0);
1.9 schwarze 392:
393: if (p->tree->flags & TREE_CLOSED && p->cur == p->tree->root)
394: warn_msg(p, "entity after end of document: &%s;", name);
395:
396: for (entity = entities; entity->name != NULL; entity++)
397: if (strcmp(name, entity->name) == 0)
398: break;
399:
400: if (entity->roff == NULL) {
1.23 schwarze 401: if (p->doctype != NULL) {
1.30 schwarze 402: TAILQ_FOREACH(n, &p->doctype->childq, child) {
403: if ((ccp = pnode_getattr_raw(n,
1.23 schwarze 404: ATTRKEY_NAME, NULL)) == NULL ||
1.25 schwarze 405: strcmp(ccp, name) != 0)
406: continue;
1.30 schwarze 407: if ((ccp = pnode_getattr_raw(n,
1.25 schwarze 408: ATTRKEY_SYSTEM, NULL)) != NULL) {
409: parse_file(p, -1, ccp);
410: p->flags &= ~PFLAG_SPC;
411: return;
412: }
1.30 schwarze 413: if ((ccp = pnode_getattr_raw(n,
1.23 schwarze 414: ATTRKEY_DEFINITION, NULL)) == NULL)
415: continue;
1.29 schwarze 416: if ((cp = strdup(ccp)) == NULL)
417: fatal(p);
1.23 schwarze 418: pstate = PARSE_ELEM;
419: parse_string(p, cp, strlen(cp), &pstate, 0);
420: p->flags &= ~PFLAG_SPC;
421: free(cp);
422: return;
423: }
424: }
1.9 schwarze 425: error_msg(p, "unknown entity &%s;", name);
426: return;
427: }
428:
429: /* Create, append, and close out an entity node. */
1.34 schwarze 430: if ((n = pnode_alloc(p->cur)) == NULL ||
1.32 schwarze 431: (n->b = strdup(entity->roff)) == NULL)
1.29 schwarze 432: fatal(p);
1.30 schwarze 433: n->node = NODE_ESCAPE;
434: n->spc = (p->flags & PFLAG_SPC) != 0;
1.23 schwarze 435: p->flags &= ~PFLAG_SPC;
1.9 schwarze 436: }
437:
1.1 schwarze 438: /*
1.39 schwarze 439: * Parse an element name.
440: */
441: static enum nodeid
442: xml_name2node(struct parse *p, const char *name)
443: {
444: const struct alias *alias;
445: enum nodeid node;
446:
447: if ((node = pnode_parse(name)) < NODE_UNKNOWN)
448: return node;
449:
450: for (alias = aliases; alias->name != NULL; alias++)
451: if (strcmp(alias->name, name) == 0)
452: return alias->node;
453:
454: return NODE_UNKNOWN;
455: }
456:
457: /*
1.1 schwarze 458: * Begin an element.
459: */
460: static void
1.30 schwarze 461: xml_elem_start(struct parse *p, const char *name)
1.1 schwarze 462: {
1.30 schwarze 463: struct pnode *n;
1.1 schwarze 464:
1.4 schwarze 465: /*
466: * An ancestor is excluded from the tree;
467: * keep track of the number of levels excluded.
468: */
1.30 schwarze 469: if (p->del > 0) {
1.23 schwarze 470: if (*name != '!' && *name != '?')
1.30 schwarze 471: p->del++;
1.4 schwarze 472: return;
473: }
474:
1.39 schwarze 475: switch (p->ncur = xml_name2node(p, name)) {
1.4 schwarze 476: case NODE_DELETE_WARN:
1.30 schwarze 477: warn_msg(p, "skipping element <%s>", name);
1.2 schwarze 478: /* FALLTHROUGH */
1.4 schwarze 479: case NODE_DELETE:
1.30 schwarze 480: p->del = 1;
1.4 schwarze 481: /* FALLTHROUGH */
1.2 schwarze 482: case NODE_IGNORE:
483: return;
1.39 schwarze 484: case NODE_UNKNOWN:
485: if (*name != '!' && *name != '?')
486: error_msg(p, "unknown element <%s>", name);
487: return;
1.2 schwarze 488: default:
489: break;
490: }
1.1 schwarze 491:
1.30 schwarze 492: if (p->tree->flags & TREE_CLOSED && p->cur->parent == NULL)
493: warn_msg(p, "element after end of document: <%s>", name);
1.5 schwarze 494:
1.39 schwarze 495: switch (pnode_class(p->ncur)) {
496: case CLASS_LINE:
497: case CLASS_ENCL:
498: pnode_closetext(p, 1);
499: break;
500: default:
501: pnode_closetext(p, 0);
502: break;
503: }
504:
1.34 schwarze 505: if ((n = pnode_alloc(p->cur)) == NULL)
1.30 schwarze 506: fatal(p);
1.17 schwarze 507:
508: /*
1.39 schwarze 509: * Some elements are self-closing.
1.17 schwarze 510: * Nodes that begin a new macro or request line or start by
511: * printing text always want whitespace before themselves.
512: */
513:
1.39 schwarze 514: switch (n->node = p->ncur) {
1.23 schwarze 515: case NODE_DOCTYPE:
516: case NODE_ENTITY:
517: case NODE_SBR:
1.30 schwarze 518: p->flags |= PFLAG_EEND;
1.17 schwarze 519: break;
520: default:
1.39 schwarze 521: break;
522: }
523: switch (pnode_class(p->ncur)) {
524: case CLASS_LINE:
525: case CLASS_ENCL:
1.30 schwarze 526: n->spc = (p->flags & PFLAG_SPC) != 0;
1.17 schwarze 527: break;
1.39 schwarze 528: default:
529: n->spc = 1;
530: break;
1.17 schwarze 531: }
1.30 schwarze 532: p->cur = n;
533: if (n->node == NODE_DOCTYPE) {
534: if (p->doctype == NULL)
535: p->doctype = n;
1.23 schwarze 536: else
1.30 schwarze 537: error_msg(p, "duplicate doctype");
538: } else if (n->parent == NULL && p->tree->root == NULL)
539: p->tree->root = n;
1.5 schwarze 540: }
541:
542: static void
1.30 schwarze 543: xml_attrkey(struct parse *p, const char *name)
1.5 schwarze 544: {
1.30 schwarze 545: struct pattr *a;
1.23 schwarze 546: const char *value;
1.5 schwarze 547: enum attrkey key;
1.1 schwarze 548:
1.30 schwarze 549: if (p->del > 0 || p->ncur == NODE_IGNORE || *name == '\0')
1.5 schwarze 550: return;
1.23 schwarze 551:
1.30 schwarze 552: if ((p->ncur == NODE_DOCTYPE || p->ncur == NODE_ENTITY) &&
553: TAILQ_FIRST(&p->cur->attrq) == NULL) {
1.23 schwarze 554: value = name;
555: name = "NAME";
556: } else
557: value = NULL;
558:
1.5 schwarze 559: if ((key = attrkey_parse(name)) == ATTRKEY__MAX) {
1.30 schwarze 560: p->flags &= ~PFLAG_ATTR;
1.5 schwarze 561: return;
562: }
1.30 schwarze 563: if ((a = calloc(1, sizeof(*a))) == NULL)
564: fatal(p);
1.29 schwarze 565:
1.30 schwarze 566: a->key = key;
567: a->val = ATTRVAL__MAX;
1.23 schwarze 568: if (value == NULL) {
1.30 schwarze 569: a->rawval = NULL;
570: p->flags |= PFLAG_ATTR;
1.23 schwarze 571: } else {
1.30 schwarze 572: if ((a->rawval = strdup(value)) == NULL)
573: fatal(p);
574: p->flags &= ~PFLAG_ATTR;
575: }
576: TAILQ_INSERT_TAIL(&p->cur->attrq, a, child);
577: if (p->ncur == NODE_ENTITY && key == ATTRKEY_NAME)
578: xml_attrkey(p, "DEFINITION");
1.5 schwarze 579: }
580:
581: static void
1.30 schwarze 582: xml_attrval(struct parse *p, const char *name)
1.5 schwarze 583: {
1.30 schwarze 584: struct pattr *a;
1.5 schwarze 585:
1.30 schwarze 586: if (p->del > 0 || p->ncur == NODE_IGNORE ||
587: (p->flags & PFLAG_ATTR) == 0)
1.5 schwarze 588: return;
1.30 schwarze 589: if ((a = TAILQ_LAST(&p->cur->attrq, pattrq)) == NULL)
1.5 schwarze 590: return;
1.30 schwarze 591: if ((a->val = attrval_parse(name)) == ATTRVAL__MAX &&
592: (a->rawval = strdup(name)) == NULL)
593: fatal(p);
594: p->flags &= ~PFLAG_ATTR;
1.1 schwarze 595: }
596:
597: /*
598: * Roll up the parse tree.
599: * If we're at a text node, roll that one up first.
600: */
601: static void
1.31 schwarze 602: xml_elem_end(struct parse *p, const char *name)
1.1 schwarze 603: {
1.26 schwarze 604: struct pnode *n;
605: const char *cp;
1.5 schwarze 606: enum nodeid node;
1.1 schwarze 607:
1.4 schwarze 608: /*
609: * An ancestor is excluded from the tree;
610: * keep track of the number of levels excluded.
611: */
1.31 schwarze 612: if (p->del > 1) {
613: p->del--;
1.4 schwarze 614: return;
615: }
616:
1.31 schwarze 617: if (p->del == 0)
1.37 schwarze 618: pnode_closetext(p, 0);
1.2 schwarze 619:
1.39 schwarze 620: node = name == NULL ? p->ncur : xml_name2node(p, name);
1.2 schwarze 621:
1.5 schwarze 622: switch (node) {
1.4 schwarze 623: case NODE_DELETE_WARN:
624: case NODE_DELETE:
1.31 schwarze 625: if (p->del > 0)
626: p->del--;
1.4 schwarze 627: break;
1.2 schwarze 628: case NODE_IGNORE:
1.39 schwarze 629: case NODE_UNKNOWN:
1.26 schwarze 630: break;
631: case NODE_INCLUDE:
1.31 schwarze 632: n = p->cur;
633: p->cur = p->cur->parent;
1.26 schwarze 634: cp = pnode_getattr_raw(n, ATTRKEY_HREF, NULL);
635: if (cp == NULL)
1.31 schwarze 636: error_msg(p, "<xi:include> element "
1.26 schwarze 637: "without href attribute");
638: else
1.31 schwarze 639: parse_file(p, -1, cp);
1.26 schwarze 640: pnode_unlink(n);
1.31 schwarze 641: p->flags &= ~PFLAG_SPC;
1.2 schwarze 642: break;
1.23 schwarze 643: case NODE_DOCTYPE:
1.32 schwarze 644: case NODE_SBR:
1.31 schwarze 645: p->flags &= ~PFLAG_EEND;
1.23 schwarze 646: /* FALLTHROUGH */
1.2 schwarze 647: default:
1.31 schwarze 648: if (p->cur == NULL || node != p->cur->node) {
649: warn_msg(p, "element not open: </%s>", name);
1.5 schwarze 650: break;
651: }
652:
653: /*
654: * Refrain from actually closing the document element.
655: * If no more content follows, no harm is done, but if
656: * some content still follows, simply processing it is
657: * obviously better than discarding it or crashing.
658: */
659:
1.31 schwarze 660: if (p->cur->parent != NULL || node == NODE_DOCTYPE) {
661: p->cur = p->cur->parent;
662: if (p->cur != NULL)
663: p->ncur = p->cur->node;
1.23 schwarze 664: } else
1.31 schwarze 665: p->tree->flags |= TREE_CLOSED;
666: p->flags &= ~PFLAG_SPC;
1.4 schwarze 667: break;
1.2 schwarze 668: }
1.31 schwarze 669: assert(p->del == 0);
1.1 schwarze 670: }
671:
672: struct parse *
673: parse_alloc(int warn)
674: {
675: struct parse *p;
676:
677: if ((p = calloc(1, sizeof(*p))) == NULL)
678: return NULL;
679:
680: if ((p->tree = calloc(1, sizeof(*p->tree))) == NULL) {
681: free(p);
682: return NULL;
683: }
1.23 schwarze 684: if (warn)
685: p->flags |= PFLAG_WARN;
686: else
687: p->flags &= ~PFLAG_WARN;
1.1 schwarze 688: return p;
689: }
690:
691: void
692: parse_free(struct parse *p)
693: {
694: if (p == NULL)
695: return;
696: if (p->tree != NULL) {
697: pnode_unlink(p->tree->root);
698: free(p->tree);
699: }
700: free(p);
701: }
702:
1.14 schwarze 703: static void
704: increment(struct parse *p, char *b, size_t *pend, int refill)
705: {
706: if (refill) {
707: if (b[*pend] == '\n') {
708: p->nline++;
709: p->ncol = 1;
710: } else
711: p->ncol++;
712: }
713: ++*pend;
714: }
715:
1.5 schwarze 716: /*
717: * Advance the pend pointer to the next character in the charset.
718: * If the charset starts with a space, it stands for any whitespace.
719: * Update the new input file position, used for messages.
720: * Do not overrun the buffer b of length rlen.
721: * When reaching the end, NUL-terminate the buffer and return 1;
722: * otherwise, return 0.
723: */
724: static int
725: advance(struct parse *p, char *b, size_t rlen, size_t *pend,
1.14 schwarze 726: const char *charset, int refill)
1.5 schwarze 727: {
728: int space;
729:
730: if (*charset == ' ') {
731: space = 1;
732: charset++;
733: } else
734: space = 0;
735:
1.14 schwarze 736: if (refill) {
737: p->nline = p->line;
738: p->ncol = p->col;
739: }
1.5 schwarze 740: while (*pend < rlen) {
741: if (space && isspace((unsigned char)b[*pend]))
742: break;
743: if (strchr(charset, b[*pend]) != NULL)
744: break;
1.14 schwarze 745: increment(p, b, pend, refill);
1.5 schwarze 746: }
747: if (*pend == rlen) {
748: b[rlen] = '\0';
1.14 schwarze 749: return refill;
1.5 schwarze 750: } else
751: return 0;
752: }
753:
1.14 schwarze 754: size_t
755: parse_string(struct parse *p, char *b, size_t rlen,
756: enum pstate *pstate, int refill)
757: {
/*
 * Split the rlen bytes in b[] into tokens and pass them to the
 * xml_*() handlers.  The buffer is modified in place: tokens are
 * terminated by overwriting the delimiter following them with NUL.
 * *pstate holds the tokenizer state and is updated for the next call.
 * If refill is non-zero, input positions are tracked and a token
 * that extends to the end of the buffer is left unparsed such that
 * the caller can supply more data; otherwise, everything is used.
 * Returns the offset of the first byte that was not consumed.
 */
758: char *cp;
759: size_t poff; /* Parse offset in b[]. */
760: size_t pend; /* Offset of the end of the current word. */
761: int elem_end;
762:
763: pend = 0;
764: for (;;) {
765:
766: /* Proceed to the next token, skipping whitespace. */
767:
768: if (refill) {
769: p->line = p->nline;
770: p->col = p->ncol;
771: }
772: if ((poff = pend) == rlen)
773: break;
774: if (isspace((unsigned char)b[pend])) {
1.23 schwarze 775: p->flags |= PFLAG_SPC;
1.14 schwarze 776: increment(p, b, &pend, refill);
777: continue;
778: }
779:
780: /*
781: * The following four cases (ARG, TAG, and starting an
782: * entity or a tag) all parse a word or quoted string.
783: * If that extends beyond the read buffer and the last
784: * read(2) still got data, they all break out of the
785: * token loop to request more data from the read loop.
786: *
787: * Also, three of them detect self-closing tags, those
788: * ending with "/>", setting the flag elem_end and
789: * calling xml_elem_end() at the very end, after
790: * handling the attribute value, attribute name, or
791: * tag name, respectively.
792: */
793:
794: /* Parse an attribute value. */
795:
796: if (*pstate >= PARSE_ARG) {
/* An opening quote only selects the quoting state. */
797: if (*pstate == PARSE_ARG &&
798: (b[pend] == '\'' || b[pend] == '"')) {
799: *pstate = b[pend] == '"' ?
800: PARSE_DQ : PARSE_SQ;
801: increment(p, b, &pend, refill);
802: continue;
803: }
/* Quoted values end at the matching quote, others at space or '>'. */
804: if (advance(p, b, rlen, &pend,
805: *pstate == PARSE_DQ ? "\"" :
806: *pstate == PARSE_SQ ? "'" : " >", refill))
807: break;
808: *pstate = PARSE_TAG;
809: elem_end = 0;
810: if (b[pend] == '>') {
811: *pstate = PARSE_ELEM;
812: if (pend > 0 && b[pend - 1] == '/') {
813: b[pend - 1] = '\0';
814: elem_end = 1;
815: }
1.23 schwarze 816: if (p->flags & PFLAG_EEND)
817: elem_end = 1;
1.14 schwarze 818: }
819: b[pend] = '\0';
820: if (pend < rlen)
821: increment(p, b, &pend, refill);
822: xml_attrval(p, b + poff);
823: if (elem_end)
824: xml_elem_end(p, NULL);
825:
826: /* Look for an attribute name. */
827:
828: } else if (*pstate == PARSE_TAG) {
/*
 * Declarations contain quoted strings without a preceding "name=",
 * and a doctype may open an internal subset with '['.
 */
1.23 schwarze 829: switch (p->ncur) {
830: case NODE_DOCTYPE:
831: if (b[pend] == '[') {
832: *pstate = PARSE_ELEM;
833: increment(p, b, &pend, refill);
834: continue;
835: }
836: /* FALLTHROUGH */
837: case NODE_ENTITY:
838: if (b[pend] == '"' || b[pend] == '\'') {
839: *pstate = PARSE_ARG;
840: continue;
841: }
842: break;
843: default:
844: break;
845: }
1.14 schwarze 846: if (advance(p, b, rlen, &pend, " =>", refill))
847: break;
848: elem_end = 0;
849: switch (b[pend]) {
850: case '>':
851: *pstate = PARSE_ELEM;
852: if (pend > 0 && b[pend - 1] == '/') {
853: b[pend - 1] = '\0';
854: elem_end = 1;
855: }
1.23 schwarze 856: if (p->flags & PFLAG_EEND)
857: elem_end = 1;
1.14 schwarze 858: break;
859: case '=':
860: *pstate = PARSE_ARG;
861: break;
862: default:
863: break;
864: }
865: b[pend] = '\0';
866: if (pend < rlen)
867: increment(p, b, &pend, refill);
868: xml_attrkey(p, b + poff);
869: if (elem_end)
870: xml_elem_end(p, NULL);
871:
872: /* Begin an opening or closing tag. */
873:
874: } else if (b[poff] == '<') {
875: if (advance(p, b, rlen, &pend, " >", refill))
876: break;
877: if (pend > poff + 3 &&
878: strncmp(b + poff, "<!--", 4) == 0) {
879:
880: /* Skip a comment. */
881:
/*
 * NOTE(review): strstr(3) scans up to the next NUL byte, but on this
 * path advance() did not write one at b[rlen]; this presumably relies
 * on the caller passing a NUL-terminated buffer -- confirm.
 */
882: cp = strstr(b + pend - 2, "-->");
883: if (cp == NULL) {
884: if (refill)
885: break;
886: cp = b + rlen;
887: } else
888: cp += 3;
889: while (b + pend < cp)
890: increment(p, b, &pend, refill);
891: continue;
892: }
893: elem_end = 0;
894: if (b[pend] != '>')
895: *pstate = PARSE_TAG;
896: else if (pend > 0 && b[pend - 1] == '/') {
897: b[pend - 1] = '\0';
898: elem_end = 1;
899: }
900: b[pend] = '\0';
901: if (pend < rlen)
902: increment(p, b, &pend, refill);
/* Skip the '<'; a following '/' marks a closing tag. */
903: if (b[++poff] == '/') {
904: elem_end = 1;
905: poff++;
1.23 schwarze 906: } else {
1.14 schwarze 907: xml_elem_start(p, b + poff);
1.23 schwarze 908: if (*pstate == PARSE_ELEM &&
909: p->flags & PFLAG_EEND)
910: elem_end = 1;
911: }
1.14 schwarze 912: if (elem_end)
913: xml_elem_end(p, b + poff);
914:
1.23 schwarze 915: /* Close a doctype. */
916:
917: } else if (p->ncur == NODE_DOCTYPE && b[poff] == ']') {
918: *pstate = PARSE_TAG;
919: increment(p, b, &pend, refill);
920:
1.14 schwarze 921: /* Process an entity. */
922:
923: } else if (b[poff] == '&') {
924: if (advance(p, b, rlen, &pend, ";", refill))
925: break;
926: b[pend] = '\0';
927: if (pend < rlen)
928: increment(p, b, &pend, refill);
/* Pass the name without the leading '&'. */
929: xml_entity(p, b + poff + 1);
930:
931: /* Process text up to the next tag, entity, or EOL. */
932:
933: } else {
/*
 * The return value of advance() is ignored on purpose here:
 * text reaching the end of the buffer is passed on as it is.
 */
1.28 schwarze 934: advance(p, b, rlen, &pend,
1.33 schwarze 935: p->ncur == NODE_DOCTYPE ? "<&]\n" : "<&\n",
1.28 schwarze 936: refill);
1.35 schwarze 937: xml_text(p, b + poff, pend - poff);
1.33 schwarze 938: if (b[pend] == '\n')
1.37 schwarze 939: pnode_closetext(p, 0);
1.14 schwarze 940: }
941: }
942: return poff;
943: }
944:
1.24 schwarze 945:
946: /*
947: * The read loop.
948: * If the previous token was incomplete and asked for more input,
949: * we have to enter the read loop once more even on EOF.
950: * Once rsz is 0, incomplete tokens will no longer ask for more input
951: * but instead use whatever there is, and then exit the read loop.
952: * The minus one on the size limit for read(2) is needed such that
953: * advance() can set b[rlen] to NUL when needed.
954: */
955: static void
956: parse_fd(struct parse *p, int fd)
1.1 schwarze 957: {
958: char b[4096];
1.5 schwarze 959: ssize_t rsz; /* Return value from read(2). */
1.14 schwarze 960: size_t rlen; /* Number of bytes in b[]. */
1.5 schwarze 961: size_t poff; /* Parse offset in b[]. */
1.14 schwarze 962: enum pstate pstate;
1.1 schwarze 963:
1.24 schwarze 964: rlen = 0;
1.14 schwarze 965: pstate = PARSE_ELEM;
966: while ((rsz = read(fd, b + rlen, sizeof(b) - rlen - 1)) >= 0 &&
967: (rlen += rsz) > 0) {
968: poff = parse_string(p, b, rlen, &pstate, rsz > 0);
1.5 schwarze 969: /* Buffer exhausted; shift left and re-fill. */
970: assert(poff > 0);
971: rlen -= poff;
1.14 schwarze 972: memmove(b, b + poff, rlen);
1.5 schwarze 973: }
1.24 schwarze 974: if (rsz < 0)
975: error_msg(p, "read: %s", strerror(errno));
976: }
977:
978: /*
979: * Open and parse a file.
980: */
981: struct ptree *
982: parse_file(struct parse *p, int fd, const char *fname)
983: {
984: const char *save_fname;
985: int save_line, save_col;
986:
987: /* Save and initialize reporting data. */
988:
989: save_fname = p->fname;
990: save_line = p->nline;
991: save_col = p->ncol;
992: p->fname = fname;
993: p->line = 0;
994: p->col = 0;
995:
996: /* Open the file, unless it is already open. */
997:
998: if (fd == -1 && (fd = open(fname, O_RDONLY, 0)) == -1) {
999: error_msg(p, "open: %s", strerror(errno));
1000: p->fname = save_fname;
1001: return p->tree;
1.5 schwarze 1002: }
1.24 schwarze 1003:
1004: /*
1005: * After opening the starting file, change to the directory it
1006: * is located in, in case it wants to include any further files,
1007: * which are typically given with relative paths in DocBook.
1008: * Do this on a best-effort basis; don't complain about failure.
1009: */
1010:
1011: if (save_fname == NULL && (fname = dirname(fname)) != NULL &&
1012: strcmp(fname, ".") != 0)
1013: (void)chdir(fname);
1014:
1015: /* Run the read loop. */
1016:
1017: p->nline = 1;
1018: p->ncol = 1;
1019: parse_fd(p, fd);
1020:
1021: /* On the top level, finalize the parse tree. */
1022:
1023: if (save_fname == NULL) {
1.37 schwarze 1024: pnode_closetext(p, 0);
1.24 schwarze 1025: if (p->tree->root == NULL)
1026: error_msg(p, "empty document");
1027: else if ((p->tree->flags & TREE_CLOSED) == 0)
1028: warn_msg(p, "document not closed");
1029: pnode_unlink(p->doctype);
1030: }
1031:
1032: /* Clean up. */
1033:
1034: if (fd != STDIN_FILENO)
1035: close(fd);
1036: p->fname = save_fname;
1037: p->nline = save_line;
1038: p->ncol = save_col;
1.1 schwarze 1039: return p->tree;
1040: }
CVSweb