Annotation of docbook2mdoc/parse.c, Revision 1.39
1.39 ! schwarze 1: /* $Id: parse.c,v 1.38 2019/04/12 11:37:09 schwarze Exp $ */
1.1 schwarze 2: /*
3: * Copyright (c) 2014 Kristaps Dzonsons <kristaps@bsd.lv>
4: * Copyright (c) 2019 Ingo Schwarze <schwarze@openbsd.org>
5: *
6: * Permission to use, copy, modify, and distribute this software for any
7: * purpose with or without fee is hereby granted, provided that the above
8: * copyright notice and this permission notice appear in all copies.
9: *
10: * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
11: * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12: * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
13: * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14: * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15: * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16: * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17: */
18: #include <assert.h>
19: #include <ctype.h>
1.24 schwarze 20: #include <errno.h>
21: #include <fcntl.h>
22: #include <libgen.h>
1.6 schwarze 23: #include <stdarg.h>
1.1 schwarze 24: #include <stdio.h>
1.5 schwarze 25: #include <stdlib.h>
1.1 schwarze 26: #include <string.h>
27: #include <unistd.h>
28:
29: #include "node.h"
30: #include "parse.h"
31:
32: /*
33: * The implementation of the DocBook parser.
34: */
35:
/*
 * Tokenizer states used by parse_string().
 * The numeric order is significant: every state that is
 * greater than or equal to PARSE_ARG parses an attribute value.
 */
enum pstate {
	PARSE_ELEM,	/* Outside of tags: text, entities, tag starts. */
	PARSE_TAG,	/* Inside a tag, looking for an attribute name. */
	PARSE_ARG,	/* Expecting an attribute value, possibly quoted. */
	PARSE_SQ,	/* Inside a single-quoted attribute value. */
	PARSE_DQ	/* Inside a double-quoted attribute value. */
};
43:
1.1 schwarze 44: /*
45: * Global parse state.
46: * Keep this as simple and small as possible.
47: */
48: struct parse {
49: const char *fname; /* Name of the input file. */
50: struct ptree *tree; /* Complete parse result. */
1.23 schwarze 51: struct pnode *doctype;
1.1 schwarze 52: struct pnode *cur; /* Current node in the tree. */
1.5 schwarze 53: enum nodeid ncur; /* Type of the current node. */
54: int line; /* Line number in the input file. */
55: int col; /* Column number in the input file. */
56: int nline; /* Line number of next token. */
57: int ncol; /* Column number of next token. */
1.4 schwarze 58: int del; /* Levels of nested nodes being deleted. */
1.23 schwarze 59: int flags;
60: #define PFLAG_WARN (1 << 0) /* Print warning messages. */
61: #define PFLAG_SPC (1 << 1) /* Whitespace before the next element. */
62: #define PFLAG_ATTR (1 << 2) /* The most recent attribute is valid. */
63: #define PFLAG_EEND (1 << 3) /* This element is self-closing. */
1.1 schwarze 64: };
65:
1.39 ! schwarze 66: struct alias {
1.1 schwarze 67: const char *name; /* DocBook element name. */
68: enum nodeid node; /* Node type to generate. */
69: };
70:
1.39 ! schwarze 71: static const struct alias aliases[] = {
1.3 schwarze 72: { "acronym", NODE_IGNORE },
1.4 schwarze 73: { "anchor", NODE_DELETE },
1.22 schwarze 74: { "article", NODE_SECTION },
75: { "book", NODE_SECTION },
1.1 schwarze 76: { "chapter", NODE_SECTION },
1.13 schwarze 77: { "code", NODE_LITERAL },
1.36 schwarze 78: { "computeroutput", NODE_LITERAL },
1.23 schwarze 79: { "!doctype", NODE_DOCTYPE },
1.7 schwarze 80: { "firstname", NODE_PERSONNAME },
1.21 schwarze 81: { "glossary", NODE_VARIABLELIST },
82: { "glossdef", NODE_IGNORE },
83: { "glossdiv", NODE_IGNORE },
84: { "glossentry", NODE_VARLISTENTRY },
85: { "glosslist", NODE_VARIABLELIST },
1.4 schwarze 86: { "indexterm", NODE_DELETE },
1.11 schwarze 87: { "informaltable", NODE_TABLE },
1.7 schwarze 88: { "othername", NODE_PERSONNAME },
1.1 schwarze 89: { "part", NODE_SECTION },
1.3 schwarze 90: { "phrase", NODE_IGNORE },
1.4 schwarze 91: { "primary", NODE_DELETE },
1.1 schwarze 92: { "refsect1", NODE_SECTION },
93: { "refsect2", NODE_SECTION },
94: { "refsect3", NODE_SECTION },
95: { "refsection", NODE_SECTION },
1.4 schwarze 96: { "secondary", NODE_DELETE },
1.1 schwarze 97: { "sect1", NODE_SECTION },
98: { "sect2", NODE_SECTION },
1.36 schwarze 99: { "sgmltag", NODE_MARKUP },
1.15 schwarze 100: { "simpara", NODE_PARA },
1.13 schwarze 101: { "structfield", NODE_PARAMETER },
102: { "structname", NODE_TYPE },
1.7 schwarze 103: { "surname", NODE_PERSONNAME },
1.12 schwarze 104: { "symbol", NODE_CONSTANT },
1.3 schwarze 105: { "trademark", NODE_IGNORE },
1.18 schwarze 106: { "ulink", NODE_LINK },
1.13 schwarze 107: { "userinput", NODE_LITERAL },
1.5 schwarze 108: { NULL, NODE_IGNORE }
1.1 schwarze 109: };
110:
/*
 * One XML character entity and the roff(7) text that replaces it.
 */
struct entity {
	const char	*name;   /* Entity name, without '&' and ';'. */
	const char	*roff;   /* Replacement text for the output. */
};

/*
 * XML character entity references found in the wild.
 * Those that don't have an exact mandoc_char(7) representation
 * are approximated, and the desired codepoint is given as a comment.
 * Encoding them as \[u...] would leave -Tascii out in the cold.
 * The table is searched linearly by xml_entity() and is terminated
 * by an entry consisting of two NULL pointers.
 */
static const struct entity entities[] = {
	{ "alpha",	"\\(*a" },
	{ "amp",	"&" },
	{ "apos",	"'" },
	{ "auml",	"\\(:a" },
	{ "beta",	"\\(*b" },
	{ "circ",	"^" },		/* U+02C6 */
	{ "copy",	"\\(co" },
	{ "dagger",	"\\(dg" },
	{ "Delta",	"\\(*D" },
	{ "eacute",	"\\('e" },
	{ "emsp",	"\\ " },	/* U+2003 */
	{ "gt",		">" },
	{ "hairsp",	"\\^" },
	{ "kappa",	"\\(*k" },
	{ "larr",	"\\(<-" },
	{ "ldquo",	"\\(lq" },
	{ "le",		"\\(<=" },
	{ "lowbar",	"_" },
	{ "lsqb",	"[" },
	{ "lt",		"<" },
	{ "mdash",	"\\(em" },
	{ "minus",	"\\-" },
	{ "ndash",	"\\(en" },
	{ "nbsp",	"\\ " },
	{ "num",	"#" },
	{ "oslash",	"\\(/o" },
	{ "ouml",	"\\(:o" },
	{ "percnt",	"%" },
	{ "quot",	"\\(dq" },
	{ "rarr",	"\\(->" },
	{ "rArr",	"\\(rA" },
	{ "rdquo",	"\\(rq" },
	{ "reg",	"\\(rg" },
	{ "rho",	"\\(*r" },
	{ "rsqb",	"]" },
	{ "sigma",	"\\(*s" },
	{ "shy",	"\\&" },	/* U+00AD */
	{ "tau",	"\\(*t" },
	{ "tilde",	"\\[u02DC]" },
	{ "times",	"\\[tmu]" },
	{ "uuml",	"\\(:u" },
	{ NULL,		NULL }
};
166:
/*
 * Forward declarations.
 * xml_entity() calls parse_string() before it is defined.
 */
static size_t	 parse_string(struct parse *p, char *b, size_t rlen,
		    enum pstate *pstate, int refill);
static void	 parse_fd(struct parse *p, int fd);
1.23 schwarze 170:
171:
1.6 schwarze 172: static void
1.29 schwarze 173: fatal(struct parse *p)
174: {
175: fprintf(stderr, "%s:%d:%d: FATAL: ", p->fname, p->line, p->col);
176: perror(NULL);
177: exit(6);
178: }
179:
180: static void
1.6 schwarze 181: error_msg(struct parse *p, const char *fmt, ...)
182: {
183: va_list ap;
184:
1.29 schwarze 185: fprintf(stderr, "%s:%d:%d: ERROR: ", p->fname, p->line, p->col);
1.6 schwarze 186: va_start(ap, fmt);
187: vfprintf(stderr, fmt, ap);
188: va_end(ap);
189: fputc('\n', stderr);
1.29 schwarze 190: p->tree->flags |= TREE_ERROR;
1.6 schwarze 191: }
192:
193: static void
194: warn_msg(struct parse *p, const char *fmt, ...)
195: {
196: va_list ap;
197:
1.23 schwarze 198: if ((p->flags & PFLAG_WARN) == 0)
1.6 schwarze 199: return;
200:
1.29 schwarze 201: fprintf(stderr, "%s:%d:%d: WARNING: ", p->fname, p->line, p->col);
1.6 schwarze 202: va_start(ap, fmt);
203: vfprintf(stderr, fmt, ap);
204: va_end(ap);
205: fputc('\n', stderr);
1.29 schwarze 206: p->tree->flags |= TREE_WARN;
1.6 schwarze 207: }
208:
1.1 schwarze 209: /*
210: * Process a string of characters.
211: * If a text node is already open, append to it.
212: * Otherwise, create a new one as a child of the current node.
213: */
214: static void
1.35 schwarze 215: xml_text(struct parse *p, const char *word, int sz)
1.1 schwarze 216: {
1.35 schwarze 217: struct pnode *n, *np;
1.32 schwarze 218: size_t oldsz, newsz;
1.35 schwarze 219: int i;
1.1 schwarze 220:
1.32 schwarze 221: assert(sz > 0);
1.30 schwarze 222: if (p->del > 0)
1.1 schwarze 223: return;
224:
1.32 schwarze 225: if ((n = p->cur) == NULL) {
1.35 schwarze 226: error_msg(p, "discarding text before document: %.*s",
227: sz, word);
1.5 schwarze 228: return;
229: }
230:
1.35 schwarze 231: /* Append to the current text node, if one is open. */
232:
233: if (n->node == NODE_TEXT) {
234: oldsz = strlen(n->b);
235: newsz = oldsz + sz;
236: if (oldsz && (p->flags & PFLAG_SPC))
237: newsz++;
238: if ((n->b = realloc(n->b, newsz + 1)) == NULL)
1.30 schwarze 239: fatal(p);
1.35 schwarze 240: if (oldsz && (p->flags & PFLAG_SPC))
241: n->b[oldsz++] = ' ';
242: memcpy(n->b + oldsz, word, sz);
243: n->b[newsz] = '\0';
244: p->flags &= ~PFLAG_SPC;
245: return;
1.1 schwarze 246: }
247:
1.35 schwarze 248: if (p->tree->flags & TREE_CLOSED && n == p->tree->root)
1.30 schwarze 249: warn_msg(p, "text after end of document: %.*s", sz, word);
1.5 schwarze 250:
1.35 schwarze 251: /* Create a new text node. */
1.1 schwarze 252:
1.35 schwarze 253: if ((n = pnode_alloc(p->cur)) == NULL)
1.30 schwarze 254: fatal(p);
1.35 schwarze 255: n->node = NODE_TEXT;
256: n->spc = (p->flags & PFLAG_SPC) != 0;
1.30 schwarze 257: p->flags &= ~PFLAG_SPC;
1.35 schwarze 258:
259: /*
1.39 ! schwarze 260: * If this node follows an in-line macro without intervening
1.35 schwarze 261: * whitespace, keep the text in it as short as possible,
262: * and do not keep it open.
263: */
264:
1.39 ! schwarze 265: np = n->spc ? NULL : TAILQ_PREV(n, pnodeq, child);
! 266: while (np != NULL) {
! 267: switch (pnode_class(np->node)) {
! 268: case CLASS_VOID:
! 269: case CLASS_TEXT:
! 270: case CLASS_BLOCK:
! 271: np = NULL;
! 272: break;
! 273: case CLASS_TRANS:
! 274: np = TAILQ_LAST(&np->childq, pnodeq);
! 275: continue;
! 276: case CLASS_LINE:
! 277: case CLASS_ENCL:
! 278: break;
! 279: }
! 280: break;
! 281: }
! 282: if (np != NULL) {
1.35 schwarze 283: i = 0;
284: while (i < sz && !isspace((unsigned char)word[i]))
285: i++;
286: if ((n->b = strndup(word, i)) == NULL)
287: fatal(p);
288: if (i == sz)
289: return;
290: while (i < sz && isspace((unsigned char)word[i]))
291: i++;
292: if (i == sz) {
293: p->flags |= PFLAG_SPC;
294: return;
295: }
296:
297: /* Put any remaining text into a second node. */
298:
299: if ((n = pnode_alloc(p->cur)) == NULL)
300: fatal(p);
301: n->node = NODE_TEXT;
302: n->spc = 1;
303: word += i;
304: sz -= i;
305: }
306: if ((n->b = strndup(word, sz)) == NULL)
307: fatal(p);
308:
309: /* The new node remains open for later pnode_closetext(). */
310:
311: p->cur = n;
1.1 schwarze 312: }
313:
1.16 schwarze 314: /*
315: * Close out the text node and strip trailing whitespace, if one is open.
316: */
1.1 schwarze 317: static void
1.37 schwarze 318: pnode_closetext(struct parse *p, int check_last_word)
1.1 schwarze 319: {
1.16 schwarze 320: struct pnode *n;
1.37 schwarze 321: char *cp, *last_word;
1.16 schwarze 322:
323: if ((n = p->cur) == NULL || n->node != NODE_TEXT)
324: return;
325: p->cur = n->parent;
1.32 schwarze 326: for (cp = strchr(n->b, '\0');
327: cp > n->b && isspace((unsigned char)cp[-1]);
328: *--cp = '\0')
1.23 schwarze 329: p->flags |= PFLAG_SPC;
1.37 schwarze 330:
331: if (p->flags & PFLAG_SPC || !check_last_word)
332: return;
333:
334: /*
335: * Find the beginning of the last word
336: * and delete whitespace before it.
337: */
338:
339: while (cp > n->b && !isspace((unsigned char)cp[-1]))
340: cp--;
341: if (cp == n->b)
342: return;
343:
344: last_word = cp;
345: while (cp > n->b && isspace((unsigned char)cp[-1]))
346: *--cp = '\0';
347:
348: /* Move the last word into its own node, for use with .Pf. */
349:
350: if ((n = pnode_alloc(p->cur)) == NULL)
351: fatal(p);
352: n->node = NODE_TEXT;
353: n->spc = 1;
354: if ((n->b = strdup(last_word)) == NULL)
355: fatal(p);
1.1 schwarze 356: }
357:
1.9 schwarze 358: static void
359: xml_entity(struct parse *p, const char *name)
360: {
361: const struct entity *entity;
1.30 schwarze 362: struct pnode *n;
1.23 schwarze 363: const char *ccp;
364: char *cp;
365: enum pstate pstate;
1.9 schwarze 366:
367: if (p->del > 0)
368: return;
369:
370: if (p->cur == NULL) {
371: error_msg(p, "discarding entity before document: &%s;", name);
372: return;
373: }
374:
1.37 schwarze 375: pnode_closetext(p, 0);
1.9 schwarze 376:
377: if (p->tree->flags & TREE_CLOSED && p->cur == p->tree->root)
378: warn_msg(p, "entity after end of document: &%s;", name);
379:
380: for (entity = entities; entity->name != NULL; entity++)
381: if (strcmp(name, entity->name) == 0)
382: break;
383:
384: if (entity->roff == NULL) {
1.23 schwarze 385: if (p->doctype != NULL) {
1.30 schwarze 386: TAILQ_FOREACH(n, &p->doctype->childq, child) {
387: if ((ccp = pnode_getattr_raw(n,
1.23 schwarze 388: ATTRKEY_NAME, NULL)) == NULL ||
1.25 schwarze 389: strcmp(ccp, name) != 0)
390: continue;
1.30 schwarze 391: if ((ccp = pnode_getattr_raw(n,
1.25 schwarze 392: ATTRKEY_SYSTEM, NULL)) != NULL) {
393: parse_file(p, -1, ccp);
394: p->flags &= ~PFLAG_SPC;
395: return;
396: }
1.30 schwarze 397: if ((ccp = pnode_getattr_raw(n,
1.23 schwarze 398: ATTRKEY_DEFINITION, NULL)) == NULL)
399: continue;
1.29 schwarze 400: if ((cp = strdup(ccp)) == NULL)
401: fatal(p);
1.23 schwarze 402: pstate = PARSE_ELEM;
403: parse_string(p, cp, strlen(cp), &pstate, 0);
404: p->flags &= ~PFLAG_SPC;
405: free(cp);
406: return;
407: }
408: }
1.9 schwarze 409: error_msg(p, "unknown entity &%s;", name);
410: return;
411: }
412:
413: /* Create, append, and close out an entity node. */
1.34 schwarze 414: if ((n = pnode_alloc(p->cur)) == NULL ||
1.32 schwarze 415: (n->b = strdup(entity->roff)) == NULL)
1.29 schwarze 416: fatal(p);
1.30 schwarze 417: n->node = NODE_ESCAPE;
418: n->spc = (p->flags & PFLAG_SPC) != 0;
1.23 schwarze 419: p->flags &= ~PFLAG_SPC;
1.9 schwarze 420: }
421:
1.1 schwarze 422: /*
1.39 ! schwarze 423: * Parse an element name.
! 424: */
! 425: static enum nodeid
! 426: xml_name2node(struct parse *p, const char *name)
! 427: {
! 428: const struct alias *alias;
! 429: enum nodeid node;
! 430:
! 431: if ((node = pnode_parse(name)) < NODE_UNKNOWN)
! 432: return node;
! 433:
! 434: for (alias = aliases; alias->name != NULL; alias++)
! 435: if (strcmp(alias->name, name) == 0)
! 436: return alias->node;
! 437:
! 438: return NODE_UNKNOWN;
! 439: }
! 440:
! 441: /*
1.1 schwarze 442: * Begin an element.
443: */
444: static void
1.30 schwarze 445: xml_elem_start(struct parse *p, const char *name)
1.1 schwarze 446: {
1.30 schwarze 447: struct pnode *n;
1.1 schwarze 448:
1.4 schwarze 449: /*
450: * An ancestor is excluded from the tree;
451: * keep track of the number of levels excluded.
452: */
1.30 schwarze 453: if (p->del > 0) {
1.23 schwarze 454: if (*name != '!' && *name != '?')
1.30 schwarze 455: p->del++;
1.4 schwarze 456: return;
457: }
458:
1.39 ! schwarze 459: switch (p->ncur = xml_name2node(p, name)) {
1.4 schwarze 460: case NODE_DELETE_WARN:
1.30 schwarze 461: warn_msg(p, "skipping element <%s>", name);
1.2 schwarze 462: /* FALLTHROUGH */
1.4 schwarze 463: case NODE_DELETE:
1.30 schwarze 464: p->del = 1;
1.4 schwarze 465: /* FALLTHROUGH */
1.2 schwarze 466: case NODE_IGNORE:
467: return;
1.39 ! schwarze 468: case NODE_UNKNOWN:
! 469: if (*name != '!' && *name != '?')
! 470: error_msg(p, "unknown element <%s>", name);
! 471: return;
1.2 schwarze 472: default:
473: break;
474: }
1.1 schwarze 475:
1.30 schwarze 476: if (p->tree->flags & TREE_CLOSED && p->cur->parent == NULL)
477: warn_msg(p, "element after end of document: <%s>", name);
1.5 schwarze 478:
1.39 ! schwarze 479: switch (pnode_class(p->ncur)) {
! 480: case CLASS_LINE:
! 481: case CLASS_ENCL:
! 482: pnode_closetext(p, 1);
! 483: break;
! 484: default:
! 485: pnode_closetext(p, 0);
! 486: break;
! 487: }
! 488:
1.34 schwarze 489: if ((n = pnode_alloc(p->cur)) == NULL)
1.30 schwarze 490: fatal(p);
1.17 schwarze 491:
492: /*
1.39 ! schwarze 493: * Some elements are self-closing.
1.17 schwarze 494: * Nodes that begin a new macro or request line or start by
495: * printing text always want whitespace before themselves.
496: */
497:
1.39 ! schwarze 498: switch (n->node = p->ncur) {
1.23 schwarze 499: case NODE_DOCTYPE:
500: case NODE_ENTITY:
501: case NODE_SBR:
1.30 schwarze 502: p->flags |= PFLAG_EEND;
1.17 schwarze 503: break;
504: default:
1.39 ! schwarze 505: break;
! 506: }
! 507: switch (pnode_class(p->ncur)) {
! 508: case CLASS_LINE:
! 509: case CLASS_ENCL:
1.30 schwarze 510: n->spc = (p->flags & PFLAG_SPC) != 0;
1.17 schwarze 511: break;
1.39 ! schwarze 512: default:
! 513: n->spc = 1;
! 514: break;
1.17 schwarze 515: }
1.30 schwarze 516: p->cur = n;
517: if (n->node == NODE_DOCTYPE) {
518: if (p->doctype == NULL)
519: p->doctype = n;
1.23 schwarze 520: else
1.30 schwarze 521: error_msg(p, "duplicate doctype");
522: } else if (n->parent == NULL && p->tree->root == NULL)
523: p->tree->root = n;
1.5 schwarze 524: }
525:
526: static void
1.30 schwarze 527: xml_attrkey(struct parse *p, const char *name)
1.5 schwarze 528: {
1.30 schwarze 529: struct pattr *a;
1.23 schwarze 530: const char *value;
1.5 schwarze 531: enum attrkey key;
1.1 schwarze 532:
1.30 schwarze 533: if (p->del > 0 || p->ncur == NODE_IGNORE || *name == '\0')
1.5 schwarze 534: return;
1.23 schwarze 535:
1.30 schwarze 536: if ((p->ncur == NODE_DOCTYPE || p->ncur == NODE_ENTITY) &&
537: TAILQ_FIRST(&p->cur->attrq) == NULL) {
1.23 schwarze 538: value = name;
539: name = "NAME";
540: } else
541: value = NULL;
542:
1.5 schwarze 543: if ((key = attrkey_parse(name)) == ATTRKEY__MAX) {
1.30 schwarze 544: p->flags &= ~PFLAG_ATTR;
1.5 schwarze 545: return;
546: }
1.30 schwarze 547: if ((a = calloc(1, sizeof(*a))) == NULL)
548: fatal(p);
1.29 schwarze 549:
1.30 schwarze 550: a->key = key;
551: a->val = ATTRVAL__MAX;
1.23 schwarze 552: if (value == NULL) {
1.30 schwarze 553: a->rawval = NULL;
554: p->flags |= PFLAG_ATTR;
1.23 schwarze 555: } else {
1.30 schwarze 556: if ((a->rawval = strdup(value)) == NULL)
557: fatal(p);
558: p->flags &= ~PFLAG_ATTR;
559: }
560: TAILQ_INSERT_TAIL(&p->cur->attrq, a, child);
561: if (p->ncur == NODE_ENTITY && key == ATTRKEY_NAME)
562: xml_attrkey(p, "DEFINITION");
1.5 schwarze 563: }
564:
565: static void
1.30 schwarze 566: xml_attrval(struct parse *p, const char *name)
1.5 schwarze 567: {
1.30 schwarze 568: struct pattr *a;
1.5 schwarze 569:
1.30 schwarze 570: if (p->del > 0 || p->ncur == NODE_IGNORE ||
571: (p->flags & PFLAG_ATTR) == 0)
1.5 schwarze 572: return;
1.30 schwarze 573: if ((a = TAILQ_LAST(&p->cur->attrq, pattrq)) == NULL)
1.5 schwarze 574: return;
1.30 schwarze 575: if ((a->val = attrval_parse(name)) == ATTRVAL__MAX &&
576: (a->rawval = strdup(name)) == NULL)
577: fatal(p);
578: p->flags &= ~PFLAG_ATTR;
1.1 schwarze 579: }
580:
581: /*
582: * Roll up the parse tree.
583: * If we're at a text node, roll that one up first.
584: */
585: static void
1.31 schwarze 586: xml_elem_end(struct parse *p, const char *name)
1.1 schwarze 587: {
1.26 schwarze 588: struct pnode *n;
589: const char *cp;
1.5 schwarze 590: enum nodeid node;
1.1 schwarze 591:
1.4 schwarze 592: /*
593: * An ancestor is excluded from the tree;
594: * keep track of the number of levels excluded.
595: */
1.31 schwarze 596: if (p->del > 1) {
597: p->del--;
1.4 schwarze 598: return;
599: }
600:
1.31 schwarze 601: if (p->del == 0)
1.37 schwarze 602: pnode_closetext(p, 0);
1.2 schwarze 603:
1.39 ! schwarze 604: node = name == NULL ? p->ncur : xml_name2node(p, name);
1.2 schwarze 605:
1.5 schwarze 606: switch (node) {
1.4 schwarze 607: case NODE_DELETE_WARN:
608: case NODE_DELETE:
1.31 schwarze 609: if (p->del > 0)
610: p->del--;
1.4 schwarze 611: break;
1.2 schwarze 612: case NODE_IGNORE:
1.39 ! schwarze 613: case NODE_UNKNOWN:
1.26 schwarze 614: break;
615: case NODE_INCLUDE:
1.31 schwarze 616: n = p->cur;
617: p->cur = p->cur->parent;
1.26 schwarze 618: cp = pnode_getattr_raw(n, ATTRKEY_HREF, NULL);
619: if (cp == NULL)
1.31 schwarze 620: error_msg(p, "<xi:include> element "
1.26 schwarze 621: "without href attribute");
622: else
1.31 schwarze 623: parse_file(p, -1, cp);
1.26 schwarze 624: pnode_unlink(n);
1.31 schwarze 625: p->flags &= ~PFLAG_SPC;
1.2 schwarze 626: break;
1.23 schwarze 627: case NODE_DOCTYPE:
1.32 schwarze 628: case NODE_SBR:
1.31 schwarze 629: p->flags &= ~PFLAG_EEND;
1.23 schwarze 630: /* FALLTHROUGH */
1.2 schwarze 631: default:
1.31 schwarze 632: if (p->cur == NULL || node != p->cur->node) {
633: warn_msg(p, "element not open: </%s>", name);
1.5 schwarze 634: break;
635: }
636:
637: /*
638: * Refrain from actually closing the document element.
639: * If no more content follows, no harm is done, but if
640: * some content still follows, simply processing it is
641: * obviously better than discarding it or crashing.
642: */
643:
1.31 schwarze 644: if (p->cur->parent != NULL || node == NODE_DOCTYPE) {
645: p->cur = p->cur->parent;
646: if (p->cur != NULL)
647: p->ncur = p->cur->node;
1.23 schwarze 648: } else
1.31 schwarze 649: p->tree->flags |= TREE_CLOSED;
650: p->flags &= ~PFLAG_SPC;
1.4 schwarze 651: break;
1.2 schwarze 652: }
1.31 schwarze 653: assert(p->del == 0);
1.1 schwarze 654: }
655:
656: struct parse *
657: parse_alloc(int warn)
658: {
659: struct parse *p;
660:
661: if ((p = calloc(1, sizeof(*p))) == NULL)
662: return NULL;
663:
664: if ((p->tree = calloc(1, sizeof(*p->tree))) == NULL) {
665: free(p);
666: return NULL;
667: }
1.23 schwarze 668: if (warn)
669: p->flags |= PFLAG_WARN;
670: else
671: p->flags &= ~PFLAG_WARN;
1.1 schwarze 672: return p;
673: }
674:
675: void
676: parse_free(struct parse *p)
677: {
678: if (p == NULL)
679: return;
680: if (p->tree != NULL) {
681: pnode_unlink(p->tree->root);
682: free(p->tree);
683: }
684: free(p);
685: }
686:
1.14 schwarze 687: static void
688: increment(struct parse *p, char *b, size_t *pend, int refill)
689: {
690: if (refill) {
691: if (b[*pend] == '\n') {
692: p->nline++;
693: p->ncol = 1;
694: } else
695: p->ncol++;
696: }
697: ++*pend;
698: }
699:
1.5 schwarze 700: /*
701: * Advance the pend pointer to the next character in the charset.
702: * If the charset starts with a space, it stands for any whitespace.
703: * Update the new input file position, used for messages.
704: * Do not overrun the buffer b of length rlen.
705: * When reaching the end, NUL-terminate the buffer and return 1;
706: * otherwise, return 0.
707: */
708: static int
709: advance(struct parse *p, char *b, size_t rlen, size_t *pend,
1.14 schwarze 710: const char *charset, int refill)
1.5 schwarze 711: {
712: int space;
713:
714: if (*charset == ' ') {
715: space = 1;
716: charset++;
717: } else
718: space = 0;
719:
1.14 schwarze 720: if (refill) {
721: p->nline = p->line;
722: p->ncol = p->col;
723: }
1.5 schwarze 724: while (*pend < rlen) {
725: if (space && isspace((unsigned char)b[*pend]))
726: break;
727: if (strchr(charset, b[*pend]) != NULL)
728: break;
1.14 schwarze 729: increment(p, b, pend, refill);
1.5 schwarze 730: }
731: if (*pend == rlen) {
732: b[rlen] = '\0';
1.14 schwarze 733: return refill;
1.5 schwarze 734: } else
735: return 0;
736: }
737:
1.14 schwarze 738: size_t
739: parse_string(struct parse *p, char *b, size_t rlen,
740: enum pstate *pstate, int refill)
741: {
742: char *cp;
743: size_t poff; /* Parse offset in b[]. */
744: size_t pend; /* Offset of the end of the current word. */
745: int elem_end;
746:
747: pend = 0;
748: for (;;) {
749:
750: /* Proceed to the next token, skipping whitespace. */
751:
752: if (refill) {
753: p->line = p->nline;
754: p->col = p->ncol;
755: }
756: if ((poff = pend) == rlen)
757: break;
758: if (isspace((unsigned char)b[pend])) {
1.23 schwarze 759: p->flags |= PFLAG_SPC;
1.14 schwarze 760: increment(p, b, &pend, refill);
761: continue;
762: }
763:
764: /*
765: * The following four cases (ARG, TAG, and starting an
766: * entity or a tag) all parse a word or quoted string.
767: * If that extends beyond the read buffer and the last
768: * read(2) still got data, they all break out of the
769: * token loop to request more data from the read loop.
770: *
771: * Also, three of them detect self-closing tags, those
772: * ending with "/>", setting the flag elem_end and
773: * calling xml_elem_end() at the very end, after
774: * handling the attribute value, attribute name, or
775: * tag name, respectively.
776: */
777:
778: /* Parse an attribute value. */
779:
780: if (*pstate >= PARSE_ARG) {
781: if (*pstate == PARSE_ARG &&
782: (b[pend] == '\'' || b[pend] == '"')) {
783: *pstate = b[pend] == '"' ?
784: PARSE_DQ : PARSE_SQ;
785: increment(p, b, &pend, refill);
786: continue;
787: }
788: if (advance(p, b, rlen, &pend,
789: *pstate == PARSE_DQ ? "\"" :
790: *pstate == PARSE_SQ ? "'" : " >", refill))
791: break;
792: *pstate = PARSE_TAG;
793: elem_end = 0;
794: if (b[pend] == '>') {
795: *pstate = PARSE_ELEM;
796: if (pend > 0 && b[pend - 1] == '/') {
797: b[pend - 1] = '\0';
798: elem_end = 1;
799: }
1.23 schwarze 800: if (p->flags & PFLAG_EEND)
801: elem_end = 1;
1.14 schwarze 802: }
803: b[pend] = '\0';
804: if (pend < rlen)
805: increment(p, b, &pend, refill);
806: xml_attrval(p, b + poff);
807: if (elem_end)
808: xml_elem_end(p, NULL);
809:
810: /* Look for an attribute name. */
811:
812: } else if (*pstate == PARSE_TAG) {
1.23 schwarze 813: switch (p->ncur) {
814: case NODE_DOCTYPE:
815: if (b[pend] == '[') {
816: *pstate = PARSE_ELEM;
817: increment(p, b, &pend, refill);
818: continue;
819: }
820: /* FALLTHROUGH */
821: case NODE_ENTITY:
822: if (b[pend] == '"' || b[pend] == '\'') {
823: *pstate = PARSE_ARG;
824: continue;
825: }
826: break;
827: default:
828: break;
829: }
1.14 schwarze 830: if (advance(p, b, rlen, &pend, " =>", refill))
831: break;
832: elem_end = 0;
833: switch (b[pend]) {
834: case '>':
835: *pstate = PARSE_ELEM;
836: if (pend > 0 && b[pend - 1] == '/') {
837: b[pend - 1] = '\0';
838: elem_end = 1;
839: }
1.23 schwarze 840: if (p->flags & PFLAG_EEND)
841: elem_end = 1;
1.14 schwarze 842: break;
843: case '=':
844: *pstate = PARSE_ARG;
845: break;
846: default:
847: break;
848: }
849: b[pend] = '\0';
850: if (pend < rlen)
851: increment(p, b, &pend, refill);
852: xml_attrkey(p, b + poff);
853: if (elem_end)
854: xml_elem_end(p, NULL);
855:
856: /* Begin an opening or closing tag. */
857:
858: } else if (b[poff] == '<') {
859: if (advance(p, b, rlen, &pend, " >", refill))
860: break;
861: if (pend > poff + 3 &&
862: strncmp(b + poff, "<!--", 4) == 0) {
863:
864: /* Skip a comment. */
865:
866: cp = strstr(b + pend - 2, "-->");
867: if (cp == NULL) {
868: if (refill)
869: break;
870: cp = b + rlen;
871: } else
872: cp += 3;
873: while (b + pend < cp)
874: increment(p, b, &pend, refill);
875: continue;
876: }
877: elem_end = 0;
878: if (b[pend] != '>')
879: *pstate = PARSE_TAG;
880: else if (pend > 0 && b[pend - 1] == '/') {
881: b[pend - 1] = '\0';
882: elem_end = 1;
883: }
884: b[pend] = '\0';
885: if (pend < rlen)
886: increment(p, b, &pend, refill);
887: if (b[++poff] == '/') {
888: elem_end = 1;
889: poff++;
1.23 schwarze 890: } else {
1.14 schwarze 891: xml_elem_start(p, b + poff);
1.23 schwarze 892: if (*pstate == PARSE_ELEM &&
893: p->flags & PFLAG_EEND)
894: elem_end = 1;
895: }
1.14 schwarze 896: if (elem_end)
897: xml_elem_end(p, b + poff);
898:
1.23 schwarze 899: /* Close a doctype. */
900:
901: } else if (p->ncur == NODE_DOCTYPE && b[poff] == ']') {
902: *pstate = PARSE_TAG;
903: increment(p, b, &pend, refill);
904:
1.14 schwarze 905: /* Process an entity. */
906:
907: } else if (b[poff] == '&') {
908: if (advance(p, b, rlen, &pend, ";", refill))
909: break;
910: b[pend] = '\0';
911: if (pend < rlen)
912: increment(p, b, &pend, refill);
913: xml_entity(p, b + poff + 1);
914:
915: /* Process text up to the next tag, entity, or EOL. */
916:
917: } else {
1.28 schwarze 918: advance(p, b, rlen, &pend,
1.33 schwarze 919: p->ncur == NODE_DOCTYPE ? "<&]\n" : "<&\n",
1.28 schwarze 920: refill);
1.35 schwarze 921: xml_text(p, b + poff, pend - poff);
1.33 schwarze 922: if (b[pend] == '\n')
1.37 schwarze 923: pnode_closetext(p, 0);
1.14 schwarze 924: }
925: }
926: return poff;
927: }
928:
1.24 schwarze 929:
930: /*
931: * The read loop.
932: * If the previous token was incomplete and asked for more input,
933: * we have to enter the read loop once more even on EOF.
934: * Once rsz is 0, incomplete tokens will no longer ask for more input
935: * but instead use whatever there is, and then exit the read loop.
936: * The minus one on the size limit for read(2) is needed such that
937: * advance() can set b[rlen] to NUL when needed.
938: */
939: static void
940: parse_fd(struct parse *p, int fd)
1.1 schwarze 941: {
942: char b[4096];
1.5 schwarze 943: ssize_t rsz; /* Return value from read(2). */
1.14 schwarze 944: size_t rlen; /* Number of bytes in b[]. */
1.5 schwarze 945: size_t poff; /* Parse offset in b[]. */
1.14 schwarze 946: enum pstate pstate;
1.1 schwarze 947:
1.24 schwarze 948: rlen = 0;
1.14 schwarze 949: pstate = PARSE_ELEM;
950: while ((rsz = read(fd, b + rlen, sizeof(b) - rlen - 1)) >= 0 &&
951: (rlen += rsz) > 0) {
952: poff = parse_string(p, b, rlen, &pstate, rsz > 0);
1.5 schwarze 953: /* Buffer exhausted; shift left and re-fill. */
954: assert(poff > 0);
955: rlen -= poff;
1.14 schwarze 956: memmove(b, b + poff, rlen);
1.5 schwarze 957: }
1.24 schwarze 958: if (rsz < 0)
959: error_msg(p, "read: %s", strerror(errno));
960: }
961:
962: /*
963: * Open and parse a file.
964: */
965: struct ptree *
966: parse_file(struct parse *p, int fd, const char *fname)
967: {
968: const char *save_fname;
969: int save_line, save_col;
970:
971: /* Save and initialize reporting data. */
972:
973: save_fname = p->fname;
974: save_line = p->nline;
975: save_col = p->ncol;
976: p->fname = fname;
977: p->line = 0;
978: p->col = 0;
979:
980: /* Open the file, unless it is already open. */
981:
982: if (fd == -1 && (fd = open(fname, O_RDONLY, 0)) == -1) {
983: error_msg(p, "open: %s", strerror(errno));
984: p->fname = save_fname;
985: return p->tree;
1.5 schwarze 986: }
1.24 schwarze 987:
988: /*
989: * After opening the starting file, change to the directory it
990: * is located in, in case it wants to include any further files,
991: * which are typically given with relative paths in DocBook.
992: * Do this on a best-effort basis; don't complain about failure.
993: */
994:
995: if (save_fname == NULL && (fname = dirname(fname)) != NULL &&
996: strcmp(fname, ".") != 0)
997: (void)chdir(fname);
998:
999: /* Run the read loop. */
1000:
1001: p->nline = 1;
1002: p->ncol = 1;
1003: parse_fd(p, fd);
1004:
1005: /* On the top level, finalize the parse tree. */
1006:
1007: if (save_fname == NULL) {
1.37 schwarze 1008: pnode_closetext(p, 0);
1.24 schwarze 1009: if (p->tree->root == NULL)
1010: error_msg(p, "empty document");
1011: else if ((p->tree->flags & TREE_CLOSED) == 0)
1012: warn_msg(p, "document not closed");
1013: pnode_unlink(p->doctype);
1014: }
1015:
1016: /* Clean up. */
1017:
1018: if (fd != STDIN_FILENO)
1019: close(fd);
1020: p->fname = save_fname;
1021: p->nline = save_line;
1022: p->ncol = save_col;
1.1 schwarze 1023: return p->tree;
1024: }
CVSweb