Annotation of docbook2mdoc/parse.c, Revision 1.45
1.45 ! schwarze 1: /* $Id: parse.c,v 1.44 2019/04/14 22:37:56 schwarze Exp $ */
1.1 schwarze 2: /*
3: * Copyright (c) 2014 Kristaps Dzonsons <kristaps@bsd.lv>
4: * Copyright (c) 2019 Ingo Schwarze <schwarze@openbsd.org>
5: *
6: * Permission to use, copy, modify, and distribute this software for any
7: * purpose with or without fee is hereby granted, provided that the above
8: * copyright notice and this permission notice appear in all copies.
9: *
10: * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
11: * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12: * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
13: * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14: * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15: * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16: * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17: */
18: #include <assert.h>
19: #include <ctype.h>
1.24 schwarze 20: #include <errno.h>
21: #include <fcntl.h>
22: #include <libgen.h>
1.6 schwarze 23: #include <stdarg.h>
1.1 schwarze 24: #include <stdio.h>
1.5 schwarze 25: #include <stdlib.h>
1.1 schwarze 26: #include <string.h>
27: #include <unistd.h>
28:
29: #include "node.h"
30: #include "parse.h"
31:
32: /*
33: * The implementation of the DocBook parser.
34: */
35:
/*
 * Tokenizer states used by parse_string().
 * The numeric order is significant: every state that compares
 * greater than or equal to PARSE_ARG means that an attribute
 * value is being parsed.
 */
enum pstate {
	PARSE_ELEM,	/* Outside of a tag: text, entity, or tag start. */
	PARSE_TAG,	/* Inside a tag, looking for an attribute name. */
	PARSE_ARG,	/* Expecting an attribute value; no quote seen yet. */
	PARSE_SQ,	/* Inside a single-quoted attribute value. */
	PARSE_DQ	/* Inside a double-quoted attribute value. */
};
43:
1.1 schwarze 44: /*
45: * Global parse state.
46: * Keep this as simple and small as possible.
47: */
48: struct parse {
49: const char *fname; /* Name of the input file. */
50: struct ptree *tree; /* Complete parse result. */
1.23 schwarze 51: struct pnode *doctype;
1.1 schwarze 52: struct pnode *cur; /* Current node in the tree. */
1.5 schwarze 53: enum nodeid ncur; /* Type of the current node. */
54: int line; /* Line number in the input file. */
55: int col; /* Column number in the input file. */
56: int nline; /* Line number of next token. */
57: int ncol; /* Column number of next token. */
1.4 schwarze 58: int del; /* Levels of nested nodes being deleted. */
1.45 ! schwarze 59: int nofill; /* Levels of open no-fill displays. */
1.23 schwarze 60: int flags;
61: #define PFLAG_WARN (1 << 0) /* Print warning messages. */
62: #define PFLAG_SPC (1 << 1) /* Whitespace before the next element. */
63: #define PFLAG_ATTR (1 << 2) /* The most recent attribute is valid. */
64: #define PFLAG_EEND (1 << 3) /* This element is self-closing. */
1.1 schwarze 65: };
66:
1.39 schwarze 67: struct alias {
1.1 schwarze 68: const char *name; /* DocBook element name. */
69: enum nodeid node; /* Node type to generate. */
70: };
71:
1.39 schwarze 72: static const struct alias aliases[] = {
1.3 schwarze 73: { "acronym", NODE_IGNORE },
1.43 schwarze 74: { "affiliation", NODE_IGNORE },
1.4 schwarze 75: { "anchor", NODE_DELETE },
1.42 schwarze 76: { "application", NODE_COMMAND },
1.22 schwarze 77: { "article", NODE_SECTION },
1.41 schwarze 78: { "articleinfo", NODE_BOOKINFO },
1.22 schwarze 79: { "book", NODE_SECTION },
1.1 schwarze 80: { "chapter", NODE_SECTION },
1.44 schwarze 81: { "caption", NODE_IGNORE },
1.13 schwarze 82: { "code", NODE_LITERAL },
1.36 schwarze 83: { "computeroutput", NODE_LITERAL },
1.23 schwarze 84: { "!doctype", NODE_DOCTYPE },
1.44 schwarze 85: { "figure", NODE_IGNORE },
1.7 schwarze 86: { "firstname", NODE_PERSONNAME },
1.21 schwarze 87: { "glossary", NODE_VARIABLELIST },
88: { "glossdef", NODE_IGNORE },
89: { "glossdiv", NODE_IGNORE },
90: { "glossentry", NODE_VARLISTENTRY },
91: { "glosslist", NODE_VARIABLELIST },
1.43 schwarze 92: { "holder", NODE_IGNORE },
1.44 schwarze 93: { "imageobject", NODE_IGNORE },
1.4 schwarze 94: { "indexterm", NODE_DELETE },
1.11 schwarze 95: { "informaltable", NODE_TABLE },
1.42 schwarze 96: { "keycap", NODE_KEYSYM },
97: { "keycode", NODE_IGNORE },
1.44 schwarze 98: { "mediaobject", NODE_BLOCKQUOTE },
1.43 schwarze 99: { "orgname", NODE_IGNORE },
1.40 schwarze 100: { "othercredit", NODE_AUTHOR },
1.7 schwarze 101: { "othername", NODE_PERSONNAME },
1.1 schwarze 102: { "part", NODE_SECTION },
1.3 schwarze 103: { "phrase", NODE_IGNORE },
1.4 schwarze 104: { "primary", NODE_DELETE },
1.42 schwarze 105: { "property", NODE_PARAMETER },
1.1 schwarze 106: { "refsect1", NODE_SECTION },
107: { "refsect2", NODE_SECTION },
108: { "refsect3", NODE_SECTION },
109: { "refsection", NODE_SECTION },
1.43 schwarze 110: { "releaseinfo", NODE_IGNORE },
1.42 schwarze 111: { "returnvalue", NODE_IGNORE },
1.4 schwarze 112: { "secondary", NODE_DELETE },
1.1 schwarze 113: { "sect1", NODE_SECTION },
114: { "sect2", NODE_SECTION },
1.36 schwarze 115: { "sgmltag", NODE_MARKUP },
1.15 schwarze 116: { "simpara", NODE_PARA },
1.13 schwarze 117: { "structfield", NODE_PARAMETER },
118: { "structname", NODE_TYPE },
1.7 schwarze 119: { "surname", NODE_PERSONNAME },
1.12 schwarze 120: { "symbol", NODE_CONSTANT },
1.3 schwarze 121: { "trademark", NODE_IGNORE },
1.18 schwarze 122: { "ulink", NODE_LINK },
1.13 schwarze 123: { "userinput", NODE_LITERAL },
1.43 schwarze 124: { "year", NODE_IGNORE },
1.5 schwarze 125: { NULL, NODE_IGNORE }
1.1 schwarze 126: };
127:
/*
 * Mapping of an XML character entity name (without the leading
 * ampersand and the trailing semicolon) onto the roff(7) input
 * that is emitted for it.
 */
struct entity {
	const char	*name;	/* Entity name as written in the XML input. */
	const char	*roff;	/* Replacement text for the output. */
};

/*
 * XML character entity references found in the wild.
 * Those that don't have an exact mandoc_char(7) representation
 * are approximated, and the desired codepoint is given as a comment.
 * Encoding them as \[u...] would leave -Tascii out in the cold.
 * The table is terminated by an entry with both members NULL.
 */
static const struct entity entities[] = {
	{ "alpha",	"\\(*a" },
	{ "amp",	"&" },
	{ "apos",	"'" },
	{ "auml",	"\\(:a" },
	{ "beta",	"\\(*b" },
	{ "circ",	"^" },		/* U+02C6 */
	{ "copy",	"\\(co" },
	{ "dagger",	"\\(dg" },
	{ "Delta",	"\\(*D" },
	{ "eacute",	"\\('e" },
	{ "emsp",	"\\ " },	/* U+2003 */
	{ "gt",		">" },
	{ "hairsp",	"\\^" },
	{ "kappa",	"\\(*k" },
	{ "larr",	"\\(<-" },
	{ "ldquo",	"\\(lq" },
	{ "le",		"\\(<=" },
	{ "lowbar",	"_" },
	{ "lsqb",	"[" },
	{ "lt",		"<" },
	{ "mdash",	"\\(em" },
	{ "minus",	"\\-" },
	{ "ndash",	"\\(en" },
	{ "nbsp",	"\\ " },
	{ "num",	"#" },
	{ "oslash",	"\\(/o" },
	{ "ouml",	"\\(:o" },
	{ "percnt",	"%" },
	{ "quot",	"\\(dq" },
	{ "rarr",	"\\(->" },
	{ "rArr",	"\\(rA" },
	{ "rdquo",	"\\(rq" },
	{ "reg",	"\\(rg" },
	{ "rho",	"\\(*r" },
	{ "rsqb",	"]" },
	{ "sigma",	"\\(*s" },
	{ "shy",	"\\&" },	/* U+00AD */
	{ "tau",	"\\(*t" },
	{ "tilde",	"\\[u02DC]" },
	{ "times",	"\\[tmu]" },
	{ "uuml",	"\\(:u" },
	{ NULL,		NULL }
};
183:
/* Forward declarations; both functions are used before their definitions. */
static size_t	 parse_string(struct parse *p, char *b, size_t rlen,
			enum pstate *pstate, int refill);
static void	 parse_fd(struct parse *p, int fd);
1.23 schwarze 187:
188:
1.6 schwarze 189: static void
1.29 schwarze 190: fatal(struct parse *p)
191: {
192: fprintf(stderr, "%s:%d:%d: FATAL: ", p->fname, p->line, p->col);
193: perror(NULL);
194: exit(6);
195: }
196:
197: static void
1.6 schwarze 198: error_msg(struct parse *p, const char *fmt, ...)
199: {
200: va_list ap;
201:
1.29 schwarze 202: fprintf(stderr, "%s:%d:%d: ERROR: ", p->fname, p->line, p->col);
1.6 schwarze 203: va_start(ap, fmt);
204: vfprintf(stderr, fmt, ap);
205: va_end(ap);
206: fputc('\n', stderr);
1.29 schwarze 207: p->tree->flags |= TREE_ERROR;
1.6 schwarze 208: }
209:
210: static void
211: warn_msg(struct parse *p, const char *fmt, ...)
212: {
213: va_list ap;
214:
1.23 schwarze 215: if ((p->flags & PFLAG_WARN) == 0)
1.6 schwarze 216: return;
217:
1.29 schwarze 218: fprintf(stderr, "%s:%d:%d: WARNING: ", p->fname, p->line, p->col);
1.6 schwarze 219: va_start(ap, fmt);
220: vfprintf(stderr, fmt, ap);
221: va_end(ap);
222: fputc('\n', stderr);
1.29 schwarze 223: p->tree->flags |= TREE_WARN;
1.6 schwarze 224: }
225:
1.1 schwarze 226: /*
227: * Process a string of characters.
228: * If a text node is already open, append to it.
229: * Otherwise, create a new one as a child of the current node.
230: */
231: static void
1.35 schwarze 232: xml_text(struct parse *p, const char *word, int sz)
1.1 schwarze 233: {
1.35 schwarze 234: struct pnode *n, *np;
1.32 schwarze 235: size_t oldsz, newsz;
1.35 schwarze 236: int i;
1.1 schwarze 237:
1.32 schwarze 238: assert(sz > 0);
1.30 schwarze 239: if (p->del > 0)
1.1 schwarze 240: return;
241:
1.32 schwarze 242: if ((n = p->cur) == NULL) {
1.35 schwarze 243: error_msg(p, "discarding text before document: %.*s",
244: sz, word);
1.5 schwarze 245: return;
246: }
247:
1.35 schwarze 248: /* Append to the current text node, if one is open. */
249:
250: if (n->node == NODE_TEXT) {
251: oldsz = strlen(n->b);
252: newsz = oldsz + sz;
253: if (oldsz && (p->flags & PFLAG_SPC))
254: newsz++;
255: if ((n->b = realloc(n->b, newsz + 1)) == NULL)
1.30 schwarze 256: fatal(p);
1.35 schwarze 257: if (oldsz && (p->flags & PFLAG_SPC))
258: n->b[oldsz++] = ' ';
259: memcpy(n->b + oldsz, word, sz);
260: n->b[newsz] = '\0';
261: p->flags &= ~PFLAG_SPC;
262: return;
1.1 schwarze 263: }
264:
1.35 schwarze 265: if (p->tree->flags & TREE_CLOSED && n == p->tree->root)
1.30 schwarze 266: warn_msg(p, "text after end of document: %.*s", sz, word);
1.5 schwarze 267:
1.35 schwarze 268: /* Create a new text node. */
1.1 schwarze 269:
1.35 schwarze 270: if ((n = pnode_alloc(p->cur)) == NULL)
1.30 schwarze 271: fatal(p);
1.35 schwarze 272: n->node = NODE_TEXT;
273: n->spc = (p->flags & PFLAG_SPC) != 0;
1.30 schwarze 274: p->flags &= ~PFLAG_SPC;
1.35 schwarze 275:
276: /*
1.39 schwarze 277: * If this node follows an in-line macro without intervening
1.35 schwarze 278: * whitespace, keep the text in it as short as possible,
279: * and do not keep it open.
280: */
281:
1.39 schwarze 282: np = n->spc ? NULL : TAILQ_PREV(n, pnodeq, child);
283: while (np != NULL) {
284: switch (pnode_class(np->node)) {
285: case CLASS_VOID:
286: case CLASS_TEXT:
287: case CLASS_BLOCK:
1.45 ! schwarze 288: case CLASS_NOFILL:
1.39 schwarze 289: np = NULL;
290: break;
291: case CLASS_TRANS:
292: np = TAILQ_LAST(&np->childq, pnodeq);
293: continue;
294: case CLASS_LINE:
295: case CLASS_ENCL:
296: break;
297: }
298: break;
299: }
300: if (np != NULL) {
1.35 schwarze 301: i = 0;
302: while (i < sz && !isspace((unsigned char)word[i]))
303: i++;
304: if ((n->b = strndup(word, i)) == NULL)
305: fatal(p);
306: if (i == sz)
307: return;
308: while (i < sz && isspace((unsigned char)word[i]))
309: i++;
310: if (i == sz) {
311: p->flags |= PFLAG_SPC;
312: return;
313: }
314:
315: /* Put any remaining text into a second node. */
316:
317: if ((n = pnode_alloc(p->cur)) == NULL)
318: fatal(p);
319: n->node = NODE_TEXT;
320: n->spc = 1;
321: word += i;
322: sz -= i;
323: }
324: if ((n->b = strndup(word, sz)) == NULL)
325: fatal(p);
326:
327: /* The new node remains open for later pnode_closetext(). */
328:
329: p->cur = n;
1.1 schwarze 330: }
331:
1.16 schwarze 332: /*
333: * Close out the text node and strip trailing whitespace, if one is open.
334: */
1.1 schwarze 335: static void
1.37 schwarze 336: pnode_closetext(struct parse *p, int check_last_word)
1.1 schwarze 337: {
1.16 schwarze 338: struct pnode *n;
1.37 schwarze 339: char *cp, *last_word;
1.16 schwarze 340:
341: if ((n = p->cur) == NULL || n->node != NODE_TEXT)
342: return;
343: p->cur = n->parent;
1.32 schwarze 344: for (cp = strchr(n->b, '\0');
345: cp > n->b && isspace((unsigned char)cp[-1]);
346: *--cp = '\0')
1.23 schwarze 347: p->flags |= PFLAG_SPC;
1.37 schwarze 348:
349: if (p->flags & PFLAG_SPC || !check_last_word)
350: return;
351:
352: /*
353: * Find the beginning of the last word
354: * and delete whitespace before it.
355: */
356:
357: while (cp > n->b && !isspace((unsigned char)cp[-1]))
358: cp--;
359: if (cp == n->b)
360: return;
361:
362: last_word = cp;
363: while (cp > n->b && isspace((unsigned char)cp[-1]))
364: *--cp = '\0';
365:
366: /* Move the last word into its own node, for use with .Pf. */
367:
368: if ((n = pnode_alloc(p->cur)) == NULL)
369: fatal(p);
370: n->node = NODE_TEXT;
371: n->spc = 1;
372: if ((n->b = strdup(last_word)) == NULL)
373: fatal(p);
1.1 schwarze 374: }
375:
1.9 schwarze 376: static void
377: xml_entity(struct parse *p, const char *name)
378: {
379: const struct entity *entity;
1.30 schwarze 380: struct pnode *n;
1.23 schwarze 381: const char *ccp;
382: char *cp;
383: enum pstate pstate;
1.9 schwarze 384:
385: if (p->del > 0)
386: return;
387:
388: if (p->cur == NULL) {
389: error_msg(p, "discarding entity before document: &%s;", name);
390: return;
391: }
392:
1.37 schwarze 393: pnode_closetext(p, 0);
1.9 schwarze 394:
395: if (p->tree->flags & TREE_CLOSED && p->cur == p->tree->root)
396: warn_msg(p, "entity after end of document: &%s;", name);
397:
398: for (entity = entities; entity->name != NULL; entity++)
399: if (strcmp(name, entity->name) == 0)
400: break;
401:
402: if (entity->roff == NULL) {
1.23 schwarze 403: if (p->doctype != NULL) {
1.30 schwarze 404: TAILQ_FOREACH(n, &p->doctype->childq, child) {
405: if ((ccp = pnode_getattr_raw(n,
1.23 schwarze 406: ATTRKEY_NAME, NULL)) == NULL ||
1.25 schwarze 407: strcmp(ccp, name) != 0)
408: continue;
1.30 schwarze 409: if ((ccp = pnode_getattr_raw(n,
1.25 schwarze 410: ATTRKEY_SYSTEM, NULL)) != NULL) {
411: parse_file(p, -1, ccp);
412: p->flags &= ~PFLAG_SPC;
413: return;
414: }
1.30 schwarze 415: if ((ccp = pnode_getattr_raw(n,
1.23 schwarze 416: ATTRKEY_DEFINITION, NULL)) == NULL)
417: continue;
1.29 schwarze 418: if ((cp = strdup(ccp)) == NULL)
419: fatal(p);
1.23 schwarze 420: pstate = PARSE_ELEM;
421: parse_string(p, cp, strlen(cp), &pstate, 0);
422: p->flags &= ~PFLAG_SPC;
423: free(cp);
424: return;
425: }
426: }
1.9 schwarze 427: error_msg(p, "unknown entity &%s;", name);
428: return;
429: }
430:
431: /* Create, append, and close out an entity node. */
1.34 schwarze 432: if ((n = pnode_alloc(p->cur)) == NULL ||
1.32 schwarze 433: (n->b = strdup(entity->roff)) == NULL)
1.29 schwarze 434: fatal(p);
1.30 schwarze 435: n->node = NODE_ESCAPE;
436: n->spc = (p->flags & PFLAG_SPC) != 0;
1.23 schwarze 437: p->flags &= ~PFLAG_SPC;
1.9 schwarze 438: }
439:
1.1 schwarze 440: /*
1.39 schwarze 441: * Parse an element name.
442: */
443: static enum nodeid
444: xml_name2node(struct parse *p, const char *name)
445: {
446: const struct alias *alias;
447: enum nodeid node;
448:
449: if ((node = pnode_parse(name)) < NODE_UNKNOWN)
450: return node;
451:
452: for (alias = aliases; alias->name != NULL; alias++)
453: if (strcmp(alias->name, name) == 0)
454: return alias->node;
455:
456: return NODE_UNKNOWN;
457: }
458:
459: /*
1.1 schwarze 460: * Begin an element.
461: */
462: static void
1.30 schwarze 463: xml_elem_start(struct parse *p, const char *name)
1.1 schwarze 464: {
1.30 schwarze 465: struct pnode *n;
1.1 schwarze 466:
1.4 schwarze 467: /*
468: * An ancestor is excluded from the tree;
469: * keep track of the number of levels excluded.
470: */
1.30 schwarze 471: if (p->del > 0) {
1.23 schwarze 472: if (*name != '!' && *name != '?')
1.30 schwarze 473: p->del++;
1.4 schwarze 474: return;
475: }
476:
1.39 schwarze 477: switch (p->ncur = xml_name2node(p, name)) {
1.4 schwarze 478: case NODE_DELETE_WARN:
1.30 schwarze 479: warn_msg(p, "skipping element <%s>", name);
1.2 schwarze 480: /* FALLTHROUGH */
1.4 schwarze 481: case NODE_DELETE:
1.30 schwarze 482: p->del = 1;
1.4 schwarze 483: /* FALLTHROUGH */
1.2 schwarze 484: case NODE_IGNORE:
485: return;
1.39 schwarze 486: case NODE_UNKNOWN:
487: if (*name != '!' && *name != '?')
488: error_msg(p, "unknown element <%s>", name);
489: return;
1.2 schwarze 490: default:
491: break;
492: }
1.1 schwarze 493:
1.30 schwarze 494: if (p->tree->flags & TREE_CLOSED && p->cur->parent == NULL)
495: warn_msg(p, "element after end of document: <%s>", name);
1.5 schwarze 496:
1.39 schwarze 497: switch (pnode_class(p->ncur)) {
498: case CLASS_LINE:
499: case CLASS_ENCL:
500: pnode_closetext(p, 1);
501: break;
502: default:
503: pnode_closetext(p, 0);
504: break;
505: }
506:
1.34 schwarze 507: if ((n = pnode_alloc(p->cur)) == NULL)
1.30 schwarze 508: fatal(p);
1.17 schwarze 509:
510: /*
1.39 schwarze 511: * Some elements are self-closing.
1.17 schwarze 512: * Nodes that begin a new macro or request line or start by
513: * printing text always want whitespace before themselves.
514: */
515:
1.39 schwarze 516: switch (n->node = p->ncur) {
1.23 schwarze 517: case NODE_DOCTYPE:
518: case NODE_ENTITY:
519: case NODE_SBR:
1.30 schwarze 520: p->flags |= PFLAG_EEND;
1.17 schwarze 521: break;
522: default:
1.39 schwarze 523: break;
524: }
525: switch (pnode_class(p->ncur)) {
526: case CLASS_LINE:
527: case CLASS_ENCL:
1.30 schwarze 528: n->spc = (p->flags & PFLAG_SPC) != 0;
1.17 schwarze 529: break;
1.45 ! schwarze 530: case CLASS_NOFILL:
! 531: p->nofill++;
! 532: /* FALLTHROUGH */
1.39 schwarze 533: default:
534: n->spc = 1;
535: break;
1.17 schwarze 536: }
1.30 schwarze 537: p->cur = n;
538: if (n->node == NODE_DOCTYPE) {
539: if (p->doctype == NULL)
540: p->doctype = n;
1.23 schwarze 541: else
1.30 schwarze 542: error_msg(p, "duplicate doctype");
543: } else if (n->parent == NULL && p->tree->root == NULL)
544: p->tree->root = n;
1.5 schwarze 545: }
546:
547: static void
1.30 schwarze 548: xml_attrkey(struct parse *p, const char *name)
1.5 schwarze 549: {
1.30 schwarze 550: struct pattr *a;
1.23 schwarze 551: const char *value;
1.5 schwarze 552: enum attrkey key;
1.1 schwarze 553:
1.30 schwarze 554: if (p->del > 0 || p->ncur == NODE_IGNORE || *name == '\0')
1.5 schwarze 555: return;
1.23 schwarze 556:
1.30 schwarze 557: if ((p->ncur == NODE_DOCTYPE || p->ncur == NODE_ENTITY) &&
558: TAILQ_FIRST(&p->cur->attrq) == NULL) {
1.23 schwarze 559: value = name;
560: name = "NAME";
561: } else
562: value = NULL;
563:
1.5 schwarze 564: if ((key = attrkey_parse(name)) == ATTRKEY__MAX) {
1.30 schwarze 565: p->flags &= ~PFLAG_ATTR;
1.5 schwarze 566: return;
567: }
1.30 schwarze 568: if ((a = calloc(1, sizeof(*a))) == NULL)
569: fatal(p);
1.29 schwarze 570:
1.30 schwarze 571: a->key = key;
572: a->val = ATTRVAL__MAX;
1.23 schwarze 573: if (value == NULL) {
1.30 schwarze 574: a->rawval = NULL;
575: p->flags |= PFLAG_ATTR;
1.23 schwarze 576: } else {
1.30 schwarze 577: if ((a->rawval = strdup(value)) == NULL)
578: fatal(p);
579: p->flags &= ~PFLAG_ATTR;
580: }
581: TAILQ_INSERT_TAIL(&p->cur->attrq, a, child);
582: if (p->ncur == NODE_ENTITY && key == ATTRKEY_NAME)
583: xml_attrkey(p, "DEFINITION");
1.5 schwarze 584: }
585:
586: static void
1.30 schwarze 587: xml_attrval(struct parse *p, const char *name)
1.5 schwarze 588: {
1.30 schwarze 589: struct pattr *a;
1.5 schwarze 590:
1.30 schwarze 591: if (p->del > 0 || p->ncur == NODE_IGNORE ||
592: (p->flags & PFLAG_ATTR) == 0)
1.5 schwarze 593: return;
1.30 schwarze 594: if ((a = TAILQ_LAST(&p->cur->attrq, pattrq)) == NULL)
1.5 schwarze 595: return;
1.30 schwarze 596: if ((a->val = attrval_parse(name)) == ATTRVAL__MAX &&
597: (a->rawval = strdup(name)) == NULL)
598: fatal(p);
599: p->flags &= ~PFLAG_ATTR;
1.1 schwarze 600: }
601:
602: /*
603: * Roll up the parse tree.
604: * If we're at a text node, roll that one up first.
605: */
606: static void
1.31 schwarze 607: xml_elem_end(struct parse *p, const char *name)
1.1 schwarze 608: {
1.26 schwarze 609: struct pnode *n;
610: const char *cp;
1.5 schwarze 611: enum nodeid node;
1.1 schwarze 612:
1.4 schwarze 613: /*
614: * An ancestor is excluded from the tree;
615: * keep track of the number of levels excluded.
616: */
1.31 schwarze 617: if (p->del > 1) {
618: p->del--;
1.4 schwarze 619: return;
620: }
621:
1.31 schwarze 622: if (p->del == 0)
1.37 schwarze 623: pnode_closetext(p, 0);
1.2 schwarze 624:
1.39 schwarze 625: node = name == NULL ? p->ncur : xml_name2node(p, name);
1.2 schwarze 626:
1.5 schwarze 627: switch (node) {
1.4 schwarze 628: case NODE_DELETE_WARN:
629: case NODE_DELETE:
1.31 schwarze 630: if (p->del > 0)
631: p->del--;
1.4 schwarze 632: break;
1.2 schwarze 633: case NODE_IGNORE:
1.39 schwarze 634: case NODE_UNKNOWN:
1.26 schwarze 635: break;
636: case NODE_INCLUDE:
1.31 schwarze 637: n = p->cur;
638: p->cur = p->cur->parent;
1.26 schwarze 639: cp = pnode_getattr_raw(n, ATTRKEY_HREF, NULL);
640: if (cp == NULL)
1.31 schwarze 641: error_msg(p, "<xi:include> element "
1.26 schwarze 642: "without href attribute");
643: else
1.31 schwarze 644: parse_file(p, -1, cp);
1.26 schwarze 645: pnode_unlink(n);
1.31 schwarze 646: p->flags &= ~PFLAG_SPC;
1.2 schwarze 647: break;
1.23 schwarze 648: case NODE_DOCTYPE:
1.32 schwarze 649: case NODE_SBR:
1.31 schwarze 650: p->flags &= ~PFLAG_EEND;
1.23 schwarze 651: /* FALLTHROUGH */
1.2 schwarze 652: default:
1.31 schwarze 653: if (p->cur == NULL || node != p->cur->node) {
654: warn_msg(p, "element not open: </%s>", name);
1.5 schwarze 655: break;
656: }
1.45 ! schwarze 657: if (pnode_class(node) == CLASS_NOFILL)
! 658: p->nofill--;
1.5 schwarze 659:
660: /*
661: * Refrain from actually closing the document element.
662: * If no more content follows, no harm is done, but if
663: * some content still follows, simply processing it is
664: * obviously better than discarding it or crashing.
665: */
666:
1.31 schwarze 667: if (p->cur->parent != NULL || node == NODE_DOCTYPE) {
668: p->cur = p->cur->parent;
669: if (p->cur != NULL)
670: p->ncur = p->cur->node;
1.23 schwarze 671: } else
1.31 schwarze 672: p->tree->flags |= TREE_CLOSED;
673: p->flags &= ~PFLAG_SPC;
1.4 schwarze 674: break;
1.2 schwarze 675: }
1.31 schwarze 676: assert(p->del == 0);
1.1 schwarze 677: }
678:
679: struct parse *
680: parse_alloc(int warn)
681: {
682: struct parse *p;
683:
684: if ((p = calloc(1, sizeof(*p))) == NULL)
685: return NULL;
686:
687: if ((p->tree = calloc(1, sizeof(*p->tree))) == NULL) {
688: free(p);
689: return NULL;
690: }
1.23 schwarze 691: if (warn)
692: p->flags |= PFLAG_WARN;
693: else
694: p->flags &= ~PFLAG_WARN;
1.1 schwarze 695: return p;
696: }
697:
698: void
699: parse_free(struct parse *p)
700: {
701: if (p == NULL)
702: return;
703: if (p->tree != NULL) {
704: pnode_unlink(p->tree->root);
705: free(p->tree);
706: }
707: free(p);
708: }
709:
1.14 schwarze 710: static void
711: increment(struct parse *p, char *b, size_t *pend, int refill)
712: {
713: if (refill) {
714: if (b[*pend] == '\n') {
715: p->nline++;
716: p->ncol = 1;
717: } else
718: p->ncol++;
719: }
720: ++*pend;
721: }
722:
1.5 schwarze 723: /*
724: * Advance the pend pointer to the next character in the charset.
725: * If the charset starts with a space, it stands for any whitespace.
726: * Update the new input file position, used for messages.
727: * Do not overrun the buffer b of length rlen.
728: * When reaching the end, NUL-terminate the buffer and return 1;
729: * otherwise, return 0.
730: */
731: static int
732: advance(struct parse *p, char *b, size_t rlen, size_t *pend,
1.14 schwarze 733: const char *charset, int refill)
1.5 schwarze 734: {
735: int space;
736:
737: if (*charset == ' ') {
738: space = 1;
739: charset++;
740: } else
741: space = 0;
742:
1.14 schwarze 743: if (refill) {
744: p->nline = p->line;
745: p->ncol = p->col;
746: }
1.5 schwarze 747: while (*pend < rlen) {
748: if (space && isspace((unsigned char)b[*pend]))
749: break;
750: if (strchr(charset, b[*pend]) != NULL)
751: break;
1.14 schwarze 752: increment(p, b, pend, refill);
1.5 schwarze 753: }
754: if (*pend == rlen) {
755: b[rlen] = '\0';
1.14 schwarze 756: return refill;
1.5 schwarze 757: } else
758: return 0;
759: }
760:
1.14 schwarze 761: size_t
762: parse_string(struct parse *p, char *b, size_t rlen,
763: enum pstate *pstate, int refill)
764: {
765: char *cp;
1.45 ! schwarze 766: size_t pws; /* Parse offset including whitespace. */
1.14 schwarze 767: size_t poff; /* Parse offset in b[]. */
768: size_t pend; /* Offset of the end of the current word. */
769: int elem_end;
770:
1.45 ! schwarze 771: pend = pws = 0;
1.14 schwarze 772: for (;;) {
773:
774: /* Proceed to the next token, skipping whitespace. */
775:
776: if (refill) {
777: p->line = p->nline;
778: p->col = p->ncol;
779: }
780: if ((poff = pend) == rlen)
781: break;
782: if (isspace((unsigned char)b[pend])) {
1.23 schwarze 783: p->flags |= PFLAG_SPC;
1.45 ! schwarze 784: if (b[pend] == '\n')
! 785: pws = pend + 1;
1.14 schwarze 786: increment(p, b, &pend, refill);
787: continue;
788: }
789:
790: /*
791: * The following four cases (ARG, TAG, and starting an
792: * entity or a tag) all parse a word or quoted string.
793: * If that extends beyond the read buffer and the last
794: * read(2) still got data, they all break out of the
795: * token loop to request more data from the read loop.
796: *
797: * Also, three of them detect self-closing tags, those
798: * ending with "/>", setting the flag elem_end and
799: * calling xml_elem_end() at the very end, after
800: * handling the attribute value, attribute name, or
801: * tag name, respectively.
802: */
803:
804: /* Parse an attribute value. */
805:
806: if (*pstate >= PARSE_ARG) {
807: if (*pstate == PARSE_ARG &&
808: (b[pend] == '\'' || b[pend] == '"')) {
809: *pstate = b[pend] == '"' ?
810: PARSE_DQ : PARSE_SQ;
811: increment(p, b, &pend, refill);
812: continue;
813: }
814: if (advance(p, b, rlen, &pend,
815: *pstate == PARSE_DQ ? "\"" :
816: *pstate == PARSE_SQ ? "'" : " >", refill))
817: break;
818: *pstate = PARSE_TAG;
819: elem_end = 0;
820: if (b[pend] == '>') {
821: *pstate = PARSE_ELEM;
822: if (pend > 0 && b[pend - 1] == '/') {
823: b[pend - 1] = '\0';
824: elem_end = 1;
825: }
1.23 schwarze 826: if (p->flags & PFLAG_EEND)
827: elem_end = 1;
1.14 schwarze 828: }
829: b[pend] = '\0';
830: if (pend < rlen)
831: increment(p, b, &pend, refill);
832: xml_attrval(p, b + poff);
833: if (elem_end)
834: xml_elem_end(p, NULL);
835:
836: /* Look for an attribute name. */
837:
838: } else if (*pstate == PARSE_TAG) {
1.23 schwarze 839: switch (p->ncur) {
840: case NODE_DOCTYPE:
841: if (b[pend] == '[') {
842: *pstate = PARSE_ELEM;
843: increment(p, b, &pend, refill);
844: continue;
845: }
846: /* FALLTHROUGH */
847: case NODE_ENTITY:
848: if (b[pend] == '"' || b[pend] == '\'') {
849: *pstate = PARSE_ARG;
850: continue;
851: }
852: break;
853: default:
854: break;
855: }
1.14 schwarze 856: if (advance(p, b, rlen, &pend, " =>", refill))
857: break;
858: elem_end = 0;
859: switch (b[pend]) {
860: case '>':
861: *pstate = PARSE_ELEM;
862: if (pend > 0 && b[pend - 1] == '/') {
863: b[pend - 1] = '\0';
864: elem_end = 1;
865: }
1.23 schwarze 866: if (p->flags & PFLAG_EEND)
867: elem_end = 1;
1.14 schwarze 868: break;
869: case '=':
870: *pstate = PARSE_ARG;
871: break;
872: default:
873: break;
874: }
875: b[pend] = '\0';
876: if (pend < rlen)
877: increment(p, b, &pend, refill);
878: xml_attrkey(p, b + poff);
879: if (elem_end)
880: xml_elem_end(p, NULL);
881:
882: /* Begin an opening or closing tag. */
883:
884: } else if (b[poff] == '<') {
885: if (advance(p, b, rlen, &pend, " >", refill))
886: break;
887: if (pend > poff + 3 &&
888: strncmp(b + poff, "<!--", 4) == 0) {
889:
890: /* Skip a comment. */
891:
892: cp = strstr(b + pend - 2, "-->");
893: if (cp == NULL) {
894: if (refill)
895: break;
896: cp = b + rlen;
897: } else
898: cp += 3;
899: while (b + pend < cp)
900: increment(p, b, &pend, refill);
901: continue;
902: }
903: elem_end = 0;
904: if (b[pend] != '>')
905: *pstate = PARSE_TAG;
906: else if (pend > 0 && b[pend - 1] == '/') {
907: b[pend - 1] = '\0';
908: elem_end = 1;
909: }
910: b[pend] = '\0';
911: if (pend < rlen)
912: increment(p, b, &pend, refill);
913: if (b[++poff] == '/') {
914: elem_end = 1;
915: poff++;
1.23 schwarze 916: } else {
1.14 schwarze 917: xml_elem_start(p, b + poff);
1.23 schwarze 918: if (*pstate == PARSE_ELEM &&
919: p->flags & PFLAG_EEND)
920: elem_end = 1;
921: }
1.14 schwarze 922: if (elem_end)
923: xml_elem_end(p, b + poff);
924:
1.23 schwarze 925: /* Close a doctype. */
926:
927: } else if (p->ncur == NODE_DOCTYPE && b[poff] == ']') {
928: *pstate = PARSE_TAG;
929: increment(p, b, &pend, refill);
930:
1.14 schwarze 931: /* Process an entity. */
932:
933: } else if (b[poff] == '&') {
934: if (advance(p, b, rlen, &pend, ";", refill))
935: break;
936: b[pend] = '\0';
937: if (pend < rlen)
938: increment(p, b, &pend, refill);
939: xml_entity(p, b + poff + 1);
940:
941: /* Process text up to the next tag, entity, or EOL. */
942:
943: } else {
1.28 schwarze 944: advance(p, b, rlen, &pend,
1.33 schwarze 945: p->ncur == NODE_DOCTYPE ? "<&]\n" : "<&\n",
1.28 schwarze 946: refill);
1.45 ! schwarze 947: if (p->nofill)
! 948: poff = pws;
1.35 schwarze 949: xml_text(p, b + poff, pend - poff);
1.33 schwarze 950: if (b[pend] == '\n')
1.37 schwarze 951: pnode_closetext(p, 0);
1.14 schwarze 952: }
1.45 ! schwarze 953: pws = pend;
1.14 schwarze 954: }
955: return poff;
956: }
957:
1.24 schwarze 958:
959: /*
960: * The read loop.
961: * If the previous token was incomplete and asked for more input,
962: * we have to enter the read loop once more even on EOF.
963: * Once rsz is 0, incomplete tokens will no longer ask for more input
964: * but instead use whatever there is, and then exit the read loop.
965: * The minus one on the size limit for read(2) is needed such that
966: * advance() can set b[rlen] to NUL when needed.
967: */
968: static void
969: parse_fd(struct parse *p, int fd)
1.1 schwarze 970: {
971: char b[4096];
1.5 schwarze 972: ssize_t rsz; /* Return value from read(2). */
1.14 schwarze 973: size_t rlen; /* Number of bytes in b[]. */
1.5 schwarze 974: size_t poff; /* Parse offset in b[]. */
1.14 schwarze 975: enum pstate pstate;
1.1 schwarze 976:
1.24 schwarze 977: rlen = 0;
1.14 schwarze 978: pstate = PARSE_ELEM;
979: while ((rsz = read(fd, b + rlen, sizeof(b) - rlen - 1)) >= 0 &&
980: (rlen += rsz) > 0) {
981: poff = parse_string(p, b, rlen, &pstate, rsz > 0);
1.5 schwarze 982: /* Buffer exhausted; shift left and re-fill. */
983: assert(poff > 0);
984: rlen -= poff;
1.14 schwarze 985: memmove(b, b + poff, rlen);
1.5 schwarze 986: }
1.24 schwarze 987: if (rsz < 0)
988: error_msg(p, "read: %s", strerror(errno));
989: }
990:
991: /*
992: * Open and parse a file.
993: */
994: struct ptree *
995: parse_file(struct parse *p, int fd, const char *fname)
996: {
997: const char *save_fname;
998: int save_line, save_col;
999:
1000: /* Save and initialize reporting data. */
1001:
1002: save_fname = p->fname;
1003: save_line = p->nline;
1004: save_col = p->ncol;
1005: p->fname = fname;
1006: p->line = 0;
1007: p->col = 0;
1008:
1009: /* Open the file, unless it is already open. */
1010:
1011: if (fd == -1 && (fd = open(fname, O_RDONLY, 0)) == -1) {
1012: error_msg(p, "open: %s", strerror(errno));
1013: p->fname = save_fname;
1014: return p->tree;
1.5 schwarze 1015: }
1.24 schwarze 1016:
1017: /*
1018: * After opening the starting file, change to the directory it
1019: * is located in, in case it wants to include any further files,
1020: * which are typically given with relative paths in DocBook.
1021: * Do this on a best-effort basis; don't complain about failure.
1022: */
1023:
1024: if (save_fname == NULL && (fname = dirname(fname)) != NULL &&
1025: strcmp(fname, ".") != 0)
1026: (void)chdir(fname);
1027:
1028: /* Run the read loop. */
1029:
1030: p->nline = 1;
1031: p->ncol = 1;
1032: parse_fd(p, fd);
1033:
1034: /* On the top level, finalize the parse tree. */
1035:
1036: if (save_fname == NULL) {
1.37 schwarze 1037: pnode_closetext(p, 0);
1.24 schwarze 1038: if (p->tree->root == NULL)
1039: error_msg(p, "empty document");
1040: else if ((p->tree->flags & TREE_CLOSED) == 0)
1041: warn_msg(p, "document not closed");
1042: pnode_unlink(p->doctype);
1043: }
1044:
1045: /* Clean up. */
1046:
1047: if (fd != STDIN_FILENO)
1048: close(fd);
1049: p->fname = save_fname;
1050: p->nline = save_line;
1051: p->ncol = save_col;
1.1 schwarze 1052: return p->tree;
1053: }
CVSweb