Annotation of docbook2mdoc/parse.c, Revision 1.37
1.37 ! schwarze 1: /* $Id: parse.c,v 1.36 2019/04/12 07:05:19 schwarze Exp $ */
1.1 schwarze 2: /*
3: * Copyright (c) 2014 Kristaps Dzonsons <kristaps@bsd.lv>
4: * Copyright (c) 2019 Ingo Schwarze <schwarze@openbsd.org>
5: *
6: * Permission to use, copy, modify, and distribute this software for any
7: * purpose with or without fee is hereby granted, provided that the above
8: * copyright notice and this permission notice appear in all copies.
9: *
10: * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
11: * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12: * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
13: * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14: * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15: * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16: * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17: */
18: #include <assert.h>
19: #include <ctype.h>
1.24 schwarze 20: #include <errno.h>
21: #include <fcntl.h>
22: #include <libgen.h>
1.6 schwarze 23: #include <stdarg.h>
1.1 schwarze 24: #include <stdio.h>
1.5 schwarze 25: #include <stdlib.h>
1.1 schwarze 26: #include <string.h>
27: #include <unistd.h>
28:
29: #include "node.h"
30: #include "parse.h"
31:
32: /*
33: * The implementation of the DocBook parser.
34: */
35:
/*
 * Tokenizer state kept across calls to parse_string().
 * The order matters: parse_string() tests "*pstate >= PARSE_ARG"
 * to recognize all three attribute value states at once.
 */
1.14 schwarze 36: enum pstate {
37: PARSE_ELEM, /* Outside of tags: text, entities, or a new tag. */
38: PARSE_TAG, /* Inside a tag, looking for an attribute name. */
39: PARSE_ARG, /* Expecting an attribute value, quoted or not. */
40: PARSE_SQ, /* Inside a single-quoted attribute value. */
41: PARSE_DQ /* Inside a double-quoted attribute value. */
42: };
43:
43:
1.1 schwarze 44: /*
45: * Global parse state.
46: * Keep this as simple and small as possible.
47: */
48: struct parse {
49: const char *fname; /* Name of the input file. */
50: struct ptree *tree; /* Complete parse result. */
1.23 schwarze 51: struct pnode *doctype; /* First DOCTYPE node seen, or NULL. */
1.1 schwarze 52: struct pnode *cur; /* Current node in the tree. */
1.5 schwarze 53: enum nodeid ncur; /* Type of the current node. */
54: int line; /* Line number in the input file. */
55: int col; /* Column number in the input file. */
56: int nline; /* Line number of next token. */
57: int ncol; /* Column number of next token. */
1.4 schwarze 58: int del; /* Levels of nested nodes being deleted. */
1.23 schwarze 59: int flags; /* Bitwise OR of the PFLAG_* values below. */
60: #define PFLAG_WARN (1 << 0) /* Print warning messages. */
61: #define PFLAG_SPC (1 << 1) /* Whitespace before the next element. */
62: #define PFLAG_ATTR (1 << 2) /* The most recent attribute is valid. */
63: #define PFLAG_EEND (1 << 3) /* This element is self-closing. */
1.1 schwarze 64: };
65:
/* One row of the elements[] lookup table. */
66: struct element {
67: const char *name; /* DocBook element name. */
68: enum nodeid node; /* Node type to generate. */
69: };
70:
/*
 * Mapping from DocBook element names to node types.
 * xml_elem_start() and xml_elem_end() search this table linearly
 * with strcmp(3), so names are case-sensitive.
 * The final entry with a NULL name terminates the table; its node
 * type, NODE_IGNORE, is what callers end up with for unknown names.
 */
71: static const struct element elements[] = {
1.3 schwarze 72: { "acronym", NODE_IGNORE },
1.1 schwarze 73: { "affiliation", NODE_AFFILIATION },
1.4 schwarze 74: { "anchor", NODE_DELETE },
1.22 schwarze 75: { "appendix", NODE_APPENDIX },
1.1 schwarze 76: { "application", NODE_APPLICATION },
77: { "arg", NODE_ARG },
1.22 schwarze 78: { "article", NODE_SECTION },
1.1 schwarze 79: { "author", NODE_AUTHOR },
80: { "authorgroup", NODE_AUTHORGROUP },
81: { "blockquote", NODE_BLOCKQUOTE },
1.22 schwarze 82: { "book", NODE_SECTION },
1.1 schwarze 83: { "bookinfo", NODE_BOOKINFO },
84: { "caution", NODE_CAUTION },
85: { "chapter", NODE_SECTION },
86: { "citerefentry", NODE_CITEREFENTRY },
87: { "citetitle", NODE_CITETITLE },
88: { "cmdsynopsis", NODE_CMDSYNOPSIS },
1.13 schwarze 89: { "code", NODE_LITERAL },
1.1 schwarze 90: { "colspec", NODE_COLSPEC },
91: { "command", NODE_COMMAND },
1.36 schwarze 92: { "computeroutput", NODE_LITERAL },
1.1 schwarze 93: { "constant", NODE_CONSTANT },
1.7 schwarze 94: { "contrib", NODE_CONTRIB },
1.1 schwarze 95: { "copyright", NODE_COPYRIGHT },
96: { "date", NODE_DATE },
1.23 schwarze 97: { "!doctype", NODE_DOCTYPE },
98: { "!DOCTYPE", NODE_DOCTYPE },
1.1 schwarze 99: { "editor", NODE_EDITOR },
100: { "email", NODE_EMAIL },
101: { "emphasis", NODE_EMPHASIS },
1.23 schwarze 102: { "!ENTITY", NODE_ENTITY },
1.1 schwarze 103: { "entry", NODE_ENTRY },
104: { "envar", NODE_ENVAR },
1.13 schwarze 105: { "errorname", NODE_ERRORNAME },
1.1 schwarze 106: { "fieldsynopsis", NODE_FIELDSYNOPSIS },
107: { "filename", NODE_FILENAME },
1.7 schwarze 108: { "firstname", NODE_PERSONNAME },
1.1 schwarze 109: { "firstterm", NODE_FIRSTTERM },
110: { "footnote", NODE_FOOTNOTE },
111: { "funcdef", NODE_FUNCDEF },
112: { "funcprototype", NODE_FUNCPROTOTYPE },
113: { "funcsynopsis", NODE_FUNCSYNOPSIS },
114: { "funcsynopsisinfo", NODE_FUNCSYNOPSISINFO },
115: { "function", NODE_FUNCTION },
1.21 schwarze 116: { "glossary", NODE_VARIABLELIST },
117: { "glossdef", NODE_IGNORE },
118: { "glossdiv", NODE_IGNORE },
119: { "glossentry", NODE_VARLISTENTRY },
120: { "glosslist", NODE_VARIABLELIST },
1.1 schwarze 121: { "glossterm", NODE_GLOSSTERM },
122: { "group", NODE_GROUP },
123: { "holder", NODE_HOLDER },
124: { "index", NODE_INDEX },
1.4 schwarze 125: { "indexterm", NODE_DELETE },
1.1 schwarze 126: { "info", NODE_INFO },
127: { "informalequation", NODE_INFORMALEQUATION },
1.11 schwarze 128: { "informaltable", NODE_TABLE },
1.1 schwarze 129: { "inlineequation", NODE_INLINEEQUATION },
130: { "itemizedlist", NODE_ITEMIZEDLIST },
131: { "keysym", NODE_KEYSYM },
132: { "legalnotice", NODE_LEGALNOTICE },
133: { "link", NODE_LINK },
134: { "listitem", NODE_LISTITEM },
135: { "literal", NODE_LITERAL },
136: { "literallayout", NODE_LITERALLAYOUT },
137: { "manvolnum", NODE_MANVOLNUM },
1.36 schwarze 138: { "markup", NODE_MARKUP },
1.1 schwarze 139: { "member", NODE_MEMBER },
140: { "mml:math", NODE_MML_MATH },
141: { "mml:mfenced", NODE_MML_MFENCED },
142: { "mml:mfrac", NODE_MML_MFRAC },
143: { "mml:mi", NODE_MML_MI },
144: { "mml:mn", NODE_MML_MN },
145: { "mml:mo", NODE_MML_MO },
146: { "mml:mrow", NODE_MML_MROW },
147: { "mml:msub", NODE_MML_MSUB },
148: { "mml:msup", NODE_MML_MSUP },
149: { "modifier", NODE_MODIFIER },
150: { "note", NODE_NOTE },
151: { "option", NODE_OPTION },
152: { "orderedlist", NODE_ORDEREDLIST },
153: { "orgname", NODE_ORGNAME },
1.7 schwarze 154: { "othername", NODE_PERSONNAME },
1.1 schwarze 155: { "para", NODE_PARA },
156: { "paramdef", NODE_PARAMDEF },
157: { "parameter", NODE_PARAMETER },
158: { "part", NODE_SECTION },
159: { "personname", NODE_PERSONNAME },
1.3 schwarze 160: { "phrase", NODE_IGNORE },
1.1 schwarze 161: { "preface", NODE_PREFACE },
1.4 schwarze 162: { "primary", NODE_DELETE },
1.1 schwarze 163: { "programlisting", NODE_PROGRAMLISTING },
164: { "prompt", NODE_PROMPT },
165: { "quote", NODE_QUOTE },
166: { "refclass", NODE_REFCLASS },
167: { "refdescriptor", NODE_REFDESCRIPTOR },
168: { "refentry", NODE_REFENTRY },
169: { "refentryinfo", NODE_REFENTRYINFO },
170: { "refentrytitle", NODE_REFENTRYTITLE },
171: { "refmeta", NODE_REFMETA },
172: { "refmetainfo", NODE_REFMETAINFO },
173: { "refmiscinfo", NODE_REFMISCINFO },
174: { "refname", NODE_REFNAME },
175: { "refnamediv", NODE_REFNAMEDIV },
176: { "refpurpose", NODE_REFPURPOSE },
177: { "refsect1", NODE_SECTION },
178: { "refsect2", NODE_SECTION },
179: { "refsect3", NODE_SECTION },
180: { "refsection", NODE_SECTION },
181: { "refsynopsisdiv", NODE_REFSYNOPSISDIV },
182: { "releaseinfo", NODE_RELEASEINFO },
183: { "replaceable", NODE_REPLACEABLE },
184: { "row", NODE_ROW },
185: { "sbr", NODE_SBR },
186: { "screen", NODE_SCREEN },
1.4 schwarze 187: { "secondary", NODE_DELETE },
1.1 schwarze 188: { "sect1", NODE_SECTION },
189: { "sect2", NODE_SECTION },
190: { "section", NODE_SECTION },
1.36 schwarze 191: { "sgmltag", NODE_MARKUP },
1.15 schwarze 192: { "simpara", NODE_PARA },
1.1 schwarze 193: { "simplelist", NODE_SIMPLELIST },
194: { "spanspec", NODE_SPANSPEC },
1.13 schwarze 195: { "structfield", NODE_PARAMETER },
196: { "structname", NODE_TYPE },
1.1 schwarze 197: { "subtitle", NODE_SUBTITLE },
1.7 schwarze 198: { "surname", NODE_PERSONNAME },
1.12 schwarze 199: { "symbol", NODE_CONSTANT },
1.1 schwarze 200: { "synopsis", NODE_SYNOPSIS },
201: { "table", NODE_TABLE },
202: { "tbody", NODE_TBODY },
203: { "term", NODE_TERM },
204: { "tfoot", NODE_TFOOT },
205: { "tgroup", NODE_TGROUP },
206: { "thead", NODE_THEAD },
207: { "tip", NODE_TIP },
208: { "title", NODE_TITLE },
1.3 schwarze 209: { "trademark", NODE_IGNORE },
1.1 schwarze 210: { "type", NODE_TYPE },
1.18 schwarze 211: { "ulink", NODE_LINK },
1.13 schwarze 212: { "userinput", NODE_LITERAL },
1.1 schwarze 213: { "variablelist", NODE_VARIABLELIST },
214: { "varlistentry", NODE_VARLISTENTRY },
215: { "varname", NODE_VARNAME },
216: { "warning", NODE_WARNING },
217: { "wordasword", NODE_WORDASWORD },
1.26 schwarze 218: { "xi:include", NODE_INCLUDE },
1.1 schwarze 219: { "year", NODE_YEAR },
1.5 schwarze 220: { NULL, NODE_IGNORE } /* Sentinel, also used for unknown names. */
1.1 schwarze 221: };
222:
/* One row of the entities[] lookup table. */
1.9 schwarze 223: struct entity {
224: const char *name; /* Entity name, without the '&' and ';'. */
225: const char *roff; /* Replacement text stored in the escape node. */
226: };
227:
228: /*
229: * XML character entity references found in the wild.
230: * Those that don't have an exact mandoc_char(7) representation
231: * are approximated, and the desired codepoint is given as a comment.
232: * Encoding them as \\[u...] would leave -Tascii out in the cold.
233: */
/*
 * Searched linearly and case-sensitively by xml_entity().
 * The final entry with NULL members terminates the table.
 */
234: static const struct entity entities[] = {
235: { "alpha", "\\(*a" },
236: { "amp", "&" },
237: { "apos", "'" },
238: { "auml", "\\(:a" },
239: { "beta", "\\(*b" },
240: { "circ", "^" }, /* U+02C6 */
241: { "copy", "\\(co" },
242: { "dagger", "\\(dg" },
243: { "Delta", "\\(*D" },
244: { "eacute", "\\('e" },
245: { "emsp", "\\ " }, /* U+2003 */
246: { "gt", ">" },
247: { "hairsp", "\\^" },
248: { "kappa", "\\(*k" },
249: { "larr", "\\(<-" },
250: { "ldquo", "\\(lq" },
251: { "le", "\\(<=" },
252: { "lowbar", "_" },
253: { "lsqb", "[" },
254: { "lt", "<" },
255: { "mdash", "\\(em" },
256: { "minus", "\\-" },
257: { "ndash", "\\(en" },
258: { "nbsp", "\\ " },
259: { "num", "#" },
260: { "oslash", "\\(/o" },
261: { "ouml", "\\(:o" },
262: { "percnt", "%" },
263: { "quot", "\\(dq" },
264: { "rarr", "\\(->" },
265: { "rArr", "\\(rA" },
266: { "rdquo", "\\(rq" },
267: { "reg", "\\(rg" },
268: { "rho", "\\(*r" },
269: { "rsqb", "]" },
270: { "sigma", "\\(*s" },
271: { "shy", "\\&" }, /* U+00AD */
272: { "tau", "\\(*t" },
273: { "tilde", "\\[u02DC]" },
274: { "times", "\\[tmu]" },
275: { "uuml", "\\(:u" },
276: { NULL, NULL } /* Sentinel. */
277: };
278:
/*
 * Forward declarations for functions defined further below;
 * xml_entity() calls parse_string() before its definition.
 */
1.23 schwarze 279: static size_t parse_string(struct parse *, char *, size_t,
280: enum pstate *, int);
1.24 schwarze 281: static void parse_fd(struct parse *, int);
1.23 schwarze 282:
283:
1.6 schwarze 284: static void
1.29 schwarze 285: fatal(struct parse *p)
286: {
287: fprintf(stderr, "%s:%d:%d: FATAL: ", p->fname, p->line, p->col);
288: perror(NULL);
289: exit(6);
290: }
291:
292: static void
1.6 schwarze 293: error_msg(struct parse *p, const char *fmt, ...)
294: {
295: va_list ap;
296:
1.29 schwarze 297: fprintf(stderr, "%s:%d:%d: ERROR: ", p->fname, p->line, p->col);
1.6 schwarze 298: va_start(ap, fmt);
299: vfprintf(stderr, fmt, ap);
300: va_end(ap);
301: fputc('\n', stderr);
1.29 schwarze 302: p->tree->flags |= TREE_ERROR;
1.6 schwarze 303: }
304:
305: static void
306: warn_msg(struct parse *p, const char *fmt, ...)
307: {
308: va_list ap;
309:
1.23 schwarze 310: if ((p->flags & PFLAG_WARN) == 0)
1.6 schwarze 311: return;
312:
1.29 schwarze 313: fprintf(stderr, "%s:%d:%d: WARNING: ", p->fname, p->line, p->col);
1.6 schwarze 314: va_start(ap, fmt);
315: vfprintf(stderr, fmt, ap);
316: va_end(ap);
317: fputc('\n', stderr);
1.29 schwarze 318: p->tree->flags |= TREE_WARN;
1.6 schwarze 319: }
320:
1.1 schwarze 321: /*
322: * Process a string of characters.
323: * If a text node is already open, append to it.
324: * Otherwise, create a new one as a child of the current node.
325: */
326: static void
1.35 schwarze 327: xml_text(struct parse *p, const char *word, int sz)
1.1 schwarze 328: {
1.35 schwarze 329: struct pnode *n, *np;
1.32 schwarze 330: size_t oldsz, newsz;
1.35 schwarze 331: int i;
1.1 schwarze 332:
1.32 schwarze 333: assert(sz > 0);
1.30 schwarze 334: if (p->del > 0)
1.1 schwarze 335: return;
336:
1.32 schwarze 337: if ((n = p->cur) == NULL) {
1.35 schwarze 338: error_msg(p, "discarding text before document: %.*s",
339: sz, word);
1.5 schwarze 340: return;
341: }
342:
1.35 schwarze 343: /* Append to the current text node, if one is open. */
344:
345: if (n->node == NODE_TEXT) {
346: oldsz = strlen(n->b);
347: newsz = oldsz + sz;
348: if (oldsz && (p->flags & PFLAG_SPC))
349: newsz++;
350: if ((n->b = realloc(n->b, newsz + 1)) == NULL)
1.30 schwarze 351: fatal(p);
1.35 schwarze 352: if (oldsz && (p->flags & PFLAG_SPC))
353: n->b[oldsz++] = ' ';
354: memcpy(n->b + oldsz, word, sz);
355: n->b[newsz] = '\0';
356: p->flags &= ~PFLAG_SPC;
357: return;
1.1 schwarze 358: }
359:
1.35 schwarze 360: if (p->tree->flags & TREE_CLOSED && n == p->tree->root)
1.30 schwarze 361: warn_msg(p, "text after end of document: %.*s", sz, word);
1.5 schwarze 362:
1.35 schwarze 363: /* Create a new text node. */
1.1 schwarze 364:
1.35 schwarze 365: if ((n = pnode_alloc(p->cur)) == NULL)
1.30 schwarze 366: fatal(p);
1.35 schwarze 367: n->node = NODE_TEXT;
368: n->spc = (p->flags & PFLAG_SPC) != 0;
1.30 schwarze 369: p->flags &= ~PFLAG_SPC;
1.35 schwarze 370:
371: /*
372: * If this node follows a non-text node without intervening
373: * whitespace, keep the text in it as short as possible,
374: * and do not keep it open.
375: */
376:
377: if (n->spc == 0 &&
378: (np = TAILQ_PREV(n, pnodeq, child)) != NULL &&
379: np->node != NODE_TEXT && np->node != NODE_ESCAPE) {
380: i = 0;
381: while (i < sz && !isspace((unsigned char)word[i]))
382: i++;
383: if ((n->b = strndup(word, i)) == NULL)
384: fatal(p);
385: if (i == sz)
386: return;
387: while (i < sz && isspace((unsigned char)word[i]))
388: i++;
389: if (i == sz) {
390: p->flags |= PFLAG_SPC;
391: return;
392: }
393:
394: /* Put any remaining text into a second node. */
395:
396: if ((n = pnode_alloc(p->cur)) == NULL)
397: fatal(p);
398: n->node = NODE_TEXT;
399: n->spc = 1;
400: word += i;
401: sz -= i;
402: }
403: if ((n->b = strndup(word, sz)) == NULL)
404: fatal(p);
405:
406: /* The new node remains open for later pnode_closetext(). */
407:
408: p->cur = n;
1.1 schwarze 409: }
410:
1.16 schwarze 411: /*
412: * Close out the text node and strip trailing whitespace, if one is open.
413: */
1.1 schwarze 414: static void
1.37 ! schwarze 415: pnode_closetext(struct parse *p, int check_last_word)
1.1 schwarze 416: {
1.16 schwarze 417: struct pnode *n;
1.37 ! schwarze 418: char *cp, *last_word;
1.16 schwarze 419:
420: if ((n = p->cur) == NULL || n->node != NODE_TEXT)
421: return;
422: p->cur = n->parent;
1.32 schwarze 423: for (cp = strchr(n->b, '\0');
424: cp > n->b && isspace((unsigned char)cp[-1]);
425: *--cp = '\0')
1.23 schwarze 426: p->flags |= PFLAG_SPC;
1.37 ! schwarze 427:
! 428: if (p->flags & PFLAG_SPC || !check_last_word)
! 429: return;
! 430:
! 431: /*
! 432: * Find the beginning of the last word
! 433: * and delete whitespace before it.
! 434: */
! 435:
! 436: while (cp > n->b && !isspace((unsigned char)cp[-1]))
! 437: cp--;
! 438: if (cp == n->b)
! 439: return;
! 440:
! 441: last_word = cp;
! 442: while (cp > n->b && isspace((unsigned char)cp[-1]))
! 443: *--cp = '\0';
! 444:
! 445: /* Move the last word into its own node, for use with .Pf. */
! 446:
! 447: if ((n = pnode_alloc(p->cur)) == NULL)
! 448: fatal(p);
! 449: n->node = NODE_TEXT;
! 450: n->spc = 1;
! 451: if ((n->b = strdup(last_word)) == NULL)
! 452: fatal(p);
1.1 schwarze 453: }
454:
1.9 schwarze 455: static void
456: xml_entity(struct parse *p, const char *name)
457: {
458: const struct entity *entity;
1.30 schwarze 459: struct pnode *n;
1.23 schwarze 460: const char *ccp;
461: char *cp;
462: enum pstate pstate;
1.9 schwarze 463:
464: if (p->del > 0)
465: return;
466:
467: if (p->cur == NULL) {
468: error_msg(p, "discarding entity before document: &%s;", name);
469: return;
470: }
471:
1.37 ! schwarze 472: pnode_closetext(p, 0);
1.9 schwarze 473:
474: if (p->tree->flags & TREE_CLOSED && p->cur == p->tree->root)
475: warn_msg(p, "entity after end of document: &%s;", name);
476:
477: for (entity = entities; entity->name != NULL; entity++)
478: if (strcmp(name, entity->name) == 0)
479: break;
480:
481: if (entity->roff == NULL) {
1.23 schwarze 482: if (p->doctype != NULL) {
1.30 schwarze 483: TAILQ_FOREACH(n, &p->doctype->childq, child) {
484: if ((ccp = pnode_getattr_raw(n,
1.23 schwarze 485: ATTRKEY_NAME, NULL)) == NULL ||
1.25 schwarze 486: strcmp(ccp, name) != 0)
487: continue;
1.30 schwarze 488: if ((ccp = pnode_getattr_raw(n,
1.25 schwarze 489: ATTRKEY_SYSTEM, NULL)) != NULL) {
490: parse_file(p, -1, ccp);
491: p->flags &= ~PFLAG_SPC;
492: return;
493: }
1.30 schwarze 494: if ((ccp = pnode_getattr_raw(n,
1.23 schwarze 495: ATTRKEY_DEFINITION, NULL)) == NULL)
496: continue;
1.29 schwarze 497: if ((cp = strdup(ccp)) == NULL)
498: fatal(p);
1.23 schwarze 499: pstate = PARSE_ELEM;
500: parse_string(p, cp, strlen(cp), &pstate, 0);
501: p->flags &= ~PFLAG_SPC;
502: free(cp);
503: return;
504: }
505: }
1.9 schwarze 506: error_msg(p, "unknown entity &%s;", name);
507: return;
508: }
509:
510: /* Create, append, and close out an entity node. */
1.34 schwarze 511: if ((n = pnode_alloc(p->cur)) == NULL ||
1.32 schwarze 512: (n->b = strdup(entity->roff)) == NULL)
1.29 schwarze 513: fatal(p);
1.30 schwarze 514: n->node = NODE_ESCAPE;
515: n->spc = (p->flags & PFLAG_SPC) != 0;
1.23 schwarze 516: p->flags &= ~PFLAG_SPC;
1.9 schwarze 517: }
518:
1.1 schwarze 519: /*
520: * Begin an element.
521: */
522: static void
1.30 schwarze 523: xml_elem_start(struct parse *p, const char *name)
1.1 schwarze 524: {
1.5 schwarze 525: const struct element *elem;
1.30 schwarze 526: struct pnode *n;
1.1 schwarze 527:
1.4 schwarze 528: /*
529: * An ancestor is excluded from the tree;
530: * keep track of the number of levels excluded.
531: */
1.30 schwarze 532: if (p->del > 0) {
1.23 schwarze 533: if (*name != '!' && *name != '?')
1.30 schwarze 534: p->del++;
1.4 schwarze 535: return;
536: }
537:
1.37 ! schwarze 538: pnode_closetext(p, 1);
1.1 schwarze 539:
540: for (elem = elements; elem->name != NULL; elem++)
541: if (strcmp(elem->name, name) == 0)
542: break;
543:
1.23 schwarze 544: if (elem->name == NULL) {
545: if (*name == '!' || *name == '?')
546: return;
1.30 schwarze 547: error_msg(p, "unknown element <%s>", name);
1.23 schwarze 548: }
1.6 schwarze 549:
1.30 schwarze 550: p->ncur = elem->node;
1.1 schwarze 551:
1.30 schwarze 552: switch (p->ncur) {
1.4 schwarze 553: case NODE_DELETE_WARN:
1.30 schwarze 554: warn_msg(p, "skipping element <%s>", name);
1.2 schwarze 555: /* FALLTHROUGH */
1.4 schwarze 556: case NODE_DELETE:
1.30 schwarze 557: p->del = 1;
1.4 schwarze 558: /* FALLTHROUGH */
1.2 schwarze 559: case NODE_IGNORE:
560: return;
561: case NODE_INLINEEQUATION:
1.30 schwarze 562: p->tree->flags |= TREE_EQN;
1.2 schwarze 563: break;
564: default:
565: break;
566: }
1.1 schwarze 567:
1.30 schwarze 568: if (p->tree->flags & TREE_CLOSED && p->cur->parent == NULL)
569: warn_msg(p, "element after end of document: <%s>", name);
1.5 schwarze 570:
1.34 schwarze 571: if ((n = pnode_alloc(p->cur)) == NULL)
1.30 schwarze 572: fatal(p);
1.17 schwarze 573:
574: /*
575: * Nodes that begin a new macro or request line or start by
576: * printing text always want whitespace before themselves.
577: */
578:
1.30 schwarze 579: switch (n->node = elem->node) {
1.23 schwarze 580: case NODE_DOCTYPE:
581: case NODE_ENTITY:
582: case NODE_SBR:
1.30 schwarze 583: p->flags |= PFLAG_EEND;
1.23 schwarze 584: /* FALLTHROUGH */
1.22 schwarze 585: case NODE_APPENDIX:
1.17 schwarze 586: case NODE_AUTHORGROUP:
1.20 schwarze 587: case NODE_BLOCKQUOTE:
1.17 schwarze 588: case NODE_BOOKINFO:
589: case NODE_CAUTION:
590: case NODE_EDITOR:
591: case NODE_ENTRY:
592: case NODE_FUNCDEF:
593: case NODE_FUNCPROTOTYPE:
594: case NODE_INFORMALEQUATION:
595: case NODE_INLINEEQUATION:
596: case NODE_ITEMIZEDLIST:
597: case NODE_LEGALNOTICE:
598: case NODE_LITERALLAYOUT:
599: case NODE_NOTE:
600: case NODE_ORDEREDLIST:
601: case NODE_PARA:
602: case NODE_PREFACE:
603: case NODE_PROGRAMLISTING:
604: case NODE_REFMETA:
605: case NODE_REFNAMEDIV:
606: case NODE_REFSYNOPSISDIV:
607: case NODE_ROW:
608: case NODE_SCREEN:
609: case NODE_SECTION:
610: case NODE_SYNOPSIS:
611: case NODE_TGROUP:
612: case NODE_TIP:
613: case NODE_TITLE:
614: case NODE_VARIABLELIST:
615: case NODE_VARLISTENTRY:
616: case NODE_WARNING:
1.30 schwarze 617: n->spc = 1;
1.17 schwarze 618: break;
619: default:
1.30 schwarze 620: n->spc = (p->flags & PFLAG_SPC) != 0;
1.17 schwarze 621: break;
622: }
1.30 schwarze 623: p->cur = n;
624: if (n->node == NODE_DOCTYPE) {
625: if (p->doctype == NULL)
626: p->doctype = n;
1.23 schwarze 627: else
1.30 schwarze 628: error_msg(p, "duplicate doctype");
629: } else if (n->parent == NULL && p->tree->root == NULL)
630: p->tree->root = n;
1.5 schwarze 631: }
632:
633: static void
1.30 schwarze 634: xml_attrkey(struct parse *p, const char *name)
1.5 schwarze 635: {
1.30 schwarze 636: struct pattr *a;
1.23 schwarze 637: const char *value;
1.5 schwarze 638: enum attrkey key;
1.1 schwarze 639:
1.30 schwarze 640: if (p->del > 0 || p->ncur == NODE_IGNORE || *name == '\0')
1.5 schwarze 641: return;
1.23 schwarze 642:
1.30 schwarze 643: if ((p->ncur == NODE_DOCTYPE || p->ncur == NODE_ENTITY) &&
644: TAILQ_FIRST(&p->cur->attrq) == NULL) {
1.23 schwarze 645: value = name;
646: name = "NAME";
647: } else
648: value = NULL;
649:
1.5 schwarze 650: if ((key = attrkey_parse(name)) == ATTRKEY__MAX) {
1.30 schwarze 651: p->flags &= ~PFLAG_ATTR;
1.5 schwarze 652: return;
653: }
1.30 schwarze 654: if ((a = calloc(1, sizeof(*a))) == NULL)
655: fatal(p);
1.29 schwarze 656:
1.30 schwarze 657: a->key = key;
658: a->val = ATTRVAL__MAX;
1.23 schwarze 659: if (value == NULL) {
1.30 schwarze 660: a->rawval = NULL;
661: p->flags |= PFLAG_ATTR;
1.23 schwarze 662: } else {
1.30 schwarze 663: if ((a->rawval = strdup(value)) == NULL)
664: fatal(p);
665: p->flags &= ~PFLAG_ATTR;
666: }
667: TAILQ_INSERT_TAIL(&p->cur->attrq, a, child);
668: if (p->ncur == NODE_ENTITY && key == ATTRKEY_NAME)
669: xml_attrkey(p, "DEFINITION");
1.5 schwarze 670: }
671:
672: static void
1.30 schwarze 673: xml_attrval(struct parse *p, const char *name)
1.5 schwarze 674: {
1.30 schwarze 675: struct pattr *a;
1.5 schwarze 676:
1.30 schwarze 677: if (p->del > 0 || p->ncur == NODE_IGNORE ||
678: (p->flags & PFLAG_ATTR) == 0)
1.5 schwarze 679: return;
1.30 schwarze 680: if ((a = TAILQ_LAST(&p->cur->attrq, pattrq)) == NULL)
1.5 schwarze 681: return;
1.30 schwarze 682: if ((a->val = attrval_parse(name)) == ATTRVAL__MAX &&
683: (a->rawval = strdup(name)) == NULL)
684: fatal(p);
685: p->flags &= ~PFLAG_ATTR;
1.1 schwarze 686: }
687:
688: /*
689: * Roll up the parse tree.
690: * If we're at a text node, roll that one up first.
691: */
692: static void
1.31 schwarze 693: xml_elem_end(struct parse *p, const char *name)
1.1 schwarze 694: {
1.5 schwarze 695: const struct element *elem;
1.26 schwarze 696: struct pnode *n;
697: const char *cp;
1.5 schwarze 698: enum nodeid node;
1.1 schwarze 699:
1.4 schwarze 700: /*
701: * An ancestor is excluded from the tree;
702: * keep track of the number of levels excluded.
703: */
1.31 schwarze 704: if (p->del > 1) {
705: p->del--;
1.4 schwarze 706: return;
707: }
708:
1.31 schwarze 709: if (p->del == 0)
1.37 ! schwarze 710: pnode_closetext(p, 0);
1.2 schwarze 711:
1.5 schwarze 712: if (name != NULL) {
713: for (elem = elements; elem->name != NULL; elem++)
714: if (strcmp(elem->name, name) == 0)
715: break;
716: node = elem->node;
717: } else
1.31 schwarze 718: node = p->ncur;
1.2 schwarze 719:
1.5 schwarze 720: switch (node) {
1.4 schwarze 721: case NODE_DELETE_WARN:
722: case NODE_DELETE:
1.31 schwarze 723: if (p->del > 0)
724: p->del--;
1.4 schwarze 725: break;
1.2 schwarze 726: case NODE_IGNORE:
1.26 schwarze 727: break;
728: case NODE_INCLUDE:
1.31 schwarze 729: n = p->cur;
730: p->cur = p->cur->parent;
1.26 schwarze 731: cp = pnode_getattr_raw(n, ATTRKEY_HREF, NULL);
732: if (cp == NULL)
1.31 schwarze 733: error_msg(p, "<xi:include> element "
1.26 schwarze 734: "without href attribute");
735: else
1.31 schwarze 736: parse_file(p, -1, cp);
1.26 schwarze 737: pnode_unlink(n);
1.31 schwarze 738: p->flags &= ~PFLAG_SPC;
1.2 schwarze 739: break;
1.23 schwarze 740: case NODE_DOCTYPE:
1.32 schwarze 741: case NODE_SBR:
1.31 schwarze 742: p->flags &= ~PFLAG_EEND;
1.23 schwarze 743: /* FALLTHROUGH */
1.2 schwarze 744: default:
1.31 schwarze 745: if (p->cur == NULL || node != p->cur->node) {
746: warn_msg(p, "element not open: </%s>", name);
1.5 schwarze 747: break;
748: }
749:
750: /*
751: * Refrain from actually closing the document element.
752: * If no more content follows, no harm is done, but if
753: * some content still follows, simply processing it is
754: * obviously better than discarding it or crashing.
755: */
756:
1.31 schwarze 757: if (p->cur->parent != NULL || node == NODE_DOCTYPE) {
758: p->cur = p->cur->parent;
759: if (p->cur != NULL)
760: p->ncur = p->cur->node;
1.23 schwarze 761: } else
1.31 schwarze 762: p->tree->flags |= TREE_CLOSED;
763: p->flags &= ~PFLAG_SPC;
1.4 schwarze 764: break;
1.2 schwarze 765: }
1.31 schwarze 766: assert(p->del == 0);
1.1 schwarze 767: }
768:
769: struct parse *
770: parse_alloc(int warn)
771: {
772: struct parse *p;
773:
774: if ((p = calloc(1, sizeof(*p))) == NULL)
775: return NULL;
776:
777: if ((p->tree = calloc(1, sizeof(*p->tree))) == NULL) {
778: free(p);
779: return NULL;
780: }
1.23 schwarze 781: if (warn)
782: p->flags |= PFLAG_WARN;
783: else
784: p->flags &= ~PFLAG_WARN;
1.1 schwarze 785: return p;
786: }
787:
788: void
789: parse_free(struct parse *p)
790: {
791: if (p == NULL)
792: return;
793: if (p->tree != NULL) {
794: pnode_unlink(p->tree->root);
795: free(p->tree);
796: }
797: free(p);
798: }
799:
1.14 schwarze 800: static void
801: increment(struct parse *p, char *b, size_t *pend, int refill)
802: {
803: if (refill) {
804: if (b[*pend] == '\n') {
805: p->nline++;
806: p->ncol = 1;
807: } else
808: p->ncol++;
809: }
810: ++*pend;
811: }
812:
1.5 schwarze 813: /*
814: * Advance the pend pointer to the next character in the charset.
815: * If the charset starts with a space, it stands for any whitespace.
816: * Update the new input file position, used for messages.
817: * Do not overrun the buffer b of length rlen.
818: * When reaching the end, NUL-terminate the buffer and return 1;
819: * otherwise, return 0.
820: */
821: static int
822: advance(struct parse *p, char *b, size_t rlen, size_t *pend,
1.14 schwarze 823: const char *charset, int refill)
1.5 schwarze 824: {
825: int space;
826:
827: if (*charset == ' ') {
828: space = 1;
829: charset++;
830: } else
831: space = 0;
832:
1.14 schwarze 833: if (refill) {
834: p->nline = p->line;
835: p->ncol = p->col;
836: }
1.5 schwarze 837: while (*pend < rlen) {
838: if (space && isspace((unsigned char)b[*pend]))
839: break;
840: if (strchr(charset, b[*pend]) != NULL)
841: break;
1.14 schwarze 842: increment(p, b, pend, refill);
1.5 schwarze 843: }
844: if (*pend == rlen) {
845: b[rlen] = '\0';
1.14 schwarze 846: return refill;
1.5 schwarze 847: } else
848: return 0;
849: }
850:
1.14 schwarze 851: size_t
852: parse_string(struct parse *p, char *b, size_t rlen,
853: enum pstate *pstate, int refill)
854: {
855: char *cp;
856: size_t poff; /* Parse offset in b[]. */
857: size_t pend; /* Offset of the end of the current word. */
858: int elem_end;
859:
860: pend = 0;
861: for (;;) {
862:
863: /* Proceed to the next token, skipping whitespace. */
864:
865: if (refill) {
866: p->line = p->nline;
867: p->col = p->ncol;
868: }
869: if ((poff = pend) == rlen)
870: break;
871: if (isspace((unsigned char)b[pend])) {
1.23 schwarze 872: p->flags |= PFLAG_SPC;
1.14 schwarze 873: increment(p, b, &pend, refill);
874: continue;
875: }
876:
877: /*
878: * The following four cases (ARG, TAG, and starting an
879: * entity or a tag) all parse a word or quoted string.
880: * If that extends beyond the read buffer and the last
881: * read(2) still got data, they all break out of the
882: * token loop to request more data from the read loop.
883: *
884: * Also, three of them detect self-closing tags, those
885: * ending with "/>", setting the flag elem_end and
886: * calling xml_elem_end() at the very end, after
887: * handling the attribute value, attribute name, or
888: * tag name, respectively.
889: */
890:
891: /* Parse an attribute value. */
892:
893: if (*pstate >= PARSE_ARG) {
894: if (*pstate == PARSE_ARG &&
895: (b[pend] == '\'' || b[pend] == '"')) {
896: *pstate = b[pend] == '"' ?
897: PARSE_DQ : PARSE_SQ;
898: increment(p, b, &pend, refill);
899: continue;
900: }
901: if (advance(p, b, rlen, &pend,
902: *pstate == PARSE_DQ ? "\"" :
903: *pstate == PARSE_SQ ? "'" : " >", refill))
904: break;
905: *pstate = PARSE_TAG;
906: elem_end = 0;
907: if (b[pend] == '>') {
908: *pstate = PARSE_ELEM;
909: if (pend > 0 && b[pend - 1] == '/') {
910: b[pend - 1] = '\0';
911: elem_end = 1;
912: }
1.23 schwarze 913: if (p->flags & PFLAG_EEND)
914: elem_end = 1;
1.14 schwarze 915: }
916: b[pend] = '\0';
917: if (pend < rlen)
918: increment(p, b, &pend, refill);
919: xml_attrval(p, b + poff);
920: if (elem_end)
921: xml_elem_end(p, NULL);
922:
923: /* Look for an attribute name. */
924:
925: } else if (*pstate == PARSE_TAG) {
1.23 schwarze 926: switch (p->ncur) {
927: case NODE_DOCTYPE:
928: if (b[pend] == '[') {
929: *pstate = PARSE_ELEM;
930: increment(p, b, &pend, refill);
931: continue;
932: }
933: /* FALLTHROUGH */
934: case NODE_ENTITY:
935: if (b[pend] == '"' || b[pend] == '\'') {
936: *pstate = PARSE_ARG;
937: continue;
938: }
939: break;
940: default:
941: break;
942: }
1.14 schwarze 943: if (advance(p, b, rlen, &pend, " =>", refill))
944: break;
945: elem_end = 0;
946: switch (b[pend]) {
947: case '>':
948: *pstate = PARSE_ELEM;
949: if (pend > 0 && b[pend - 1] == '/') {
950: b[pend - 1] = '\0';
951: elem_end = 1;
952: }
1.23 schwarze 953: if (p->flags & PFLAG_EEND)
954: elem_end = 1;
1.14 schwarze 955: break;
956: case '=':
957: *pstate = PARSE_ARG;
958: break;
959: default:
960: break;
961: }
962: b[pend] = '\0';
963: if (pend < rlen)
964: increment(p, b, &pend, refill);
965: xml_attrkey(p, b + poff);
966: if (elem_end)
967: xml_elem_end(p, NULL);
968:
969: /* Begin an opening or closing tag. */
970:
971: } else if (b[poff] == '<') {
972: if (advance(p, b, rlen, &pend, " >", refill))
973: break;
974: if (pend > poff + 3 &&
975: strncmp(b + poff, "<!--", 4) == 0) {
976:
977: /* Skip a comment. */
978:
979: cp = strstr(b + pend - 2, "-->");
980: if (cp == NULL) {
981: if (refill)
982: break;
983: cp = b + rlen;
984: } else
985: cp += 3;
986: while (b + pend < cp)
987: increment(p, b, &pend, refill);
988: continue;
989: }
990: elem_end = 0;
991: if (b[pend] != '>')
992: *pstate = PARSE_TAG;
993: else if (pend > 0 && b[pend - 1] == '/') {
994: b[pend - 1] = '\0';
995: elem_end = 1;
996: }
997: b[pend] = '\0';
998: if (pend < rlen)
999: increment(p, b, &pend, refill);
1000: if (b[++poff] == '/') {
1001: elem_end = 1;
1002: poff++;
1.23 schwarze 1003: } else {
1.14 schwarze 1004: xml_elem_start(p, b + poff);
1.23 schwarze 1005: if (*pstate == PARSE_ELEM &&
1006: p->flags & PFLAG_EEND)
1007: elem_end = 1;
1008: }
1.14 schwarze 1009: if (elem_end)
1010: xml_elem_end(p, b + poff);
1011:
1.23 schwarze 1012: /* Close a doctype. */
1013:
1014: } else if (p->ncur == NODE_DOCTYPE && b[poff] == ']') {
1015: *pstate = PARSE_TAG;
1016: increment(p, b, &pend, refill);
1017:
1.14 schwarze 1018: /* Process an entity. */
1019:
1020: } else if (b[poff] == '&') {
1021: if (advance(p, b, rlen, &pend, ";", refill))
1022: break;
1023: b[pend] = '\0';
1024: if (pend < rlen)
1025: increment(p, b, &pend, refill);
1026: xml_entity(p, b + poff + 1);
1027:
1028: /* Process text up to the next tag, entity, or EOL. */
1029:
1030: } else {
1.28 schwarze 1031: advance(p, b, rlen, &pend,
1.33 schwarze 1032: p->ncur == NODE_DOCTYPE ? "<&]\n" : "<&\n",
1.28 schwarze 1033: refill);
1.35 schwarze 1034: xml_text(p, b + poff, pend - poff);
1.33 schwarze 1035: if (b[pend] == '\n')
1.37 ! schwarze 1036: pnode_closetext(p, 0);
1.14 schwarze 1037: }
1038: }
1039: return poff;
1040: }
1041:
1.24 schwarze 1042:
1043: /*
1044: * The read loop.
1045: * If the previous token was incomplete and asked for more input,
1046: * we have to enter the read loop once more even on EOF.
1047: * Once rsz is 0, incomplete tokens will no longer ask for more input
1048: * but instead use whatever there is, and then exit the read loop.
1049: * The minus one on the size limit for read(2) is needed such that
1050: * advance() can set b[rlen] to NUL when needed.
1051: */
1052: static void
1053: parse_fd(struct parse *p, int fd)
1.1 schwarze 1054: {
1055: char b[4096];
1.5 schwarze 1056: ssize_t rsz; /* Return value from read(2). */
1.14 schwarze 1057: size_t rlen; /* Number of bytes in b[]. */
1.5 schwarze 1058: size_t poff; /* Parse offset in b[]. */
1.14 schwarze 1059: enum pstate pstate;
1.1 schwarze 1060:
1.24 schwarze 1061: rlen = 0;
1.14 schwarze 1062: pstate = PARSE_ELEM;
1063: while ((rsz = read(fd, b + rlen, sizeof(b) - rlen - 1)) >= 0 &&
1064: (rlen += rsz) > 0) {
1065: poff = parse_string(p, b, rlen, &pstate, rsz > 0);
1.5 schwarze 1066: /* Buffer exhausted; shift left and re-fill. */
1067: assert(poff > 0);
1068: rlen -= poff;
1.14 schwarze 1069: memmove(b, b + poff, rlen);
1.5 schwarze 1070: }
1.24 schwarze 1071: if (rsz < 0)
1072: error_msg(p, "read: %s", strerror(errno));
1073: }
1074:
1075: /*
1076: * Open and parse a file.
1077: */
1078: struct ptree *
1079: parse_file(struct parse *p, int fd, const char *fname)
1080: {
1081: const char *save_fname;
1082: int save_line, save_col;
1083:
1084: /* Save and initialize reporting data. */
1085:
1086: save_fname = p->fname;
1087: save_line = p->nline;
1088: save_col = p->ncol;
1089: p->fname = fname;
1090: p->line = 0;
1091: p->col = 0;
1092:
1093: /* Open the file, unless it is already open. */
1094:
1095: if (fd == -1 && (fd = open(fname, O_RDONLY, 0)) == -1) {
1096: error_msg(p, "open: %s", strerror(errno));
1097: p->fname = save_fname;
1098: return p->tree;
1.5 schwarze 1099: }
1.24 schwarze 1100:
1101: /*
1102: * After opening the starting file, change to the directory it
1103: * is located in, in case it wants to include any further files,
1104: * which are typically given with relative paths in DocBook.
1105: * Do this on a best-effort basis; don't complain about failure.
1106: */
1107:
1108: if (save_fname == NULL && (fname = dirname(fname)) != NULL &&
1109: strcmp(fname, ".") != 0)
1110: (void)chdir(fname);
1111:
1112: /* Run the read loop. */
1113:
1114: p->nline = 1;
1115: p->ncol = 1;
1116: parse_fd(p, fd);
1117:
1118: /* On the top level, finalize the parse tree. */
1119:
1120: if (save_fname == NULL) {
1.37 ! schwarze 1121: pnode_closetext(p, 0);
1.24 schwarze 1122: if (p->tree->root == NULL)
1123: error_msg(p, "empty document");
1124: else if ((p->tree->flags & TREE_CLOSED) == 0)
1125: warn_msg(p, "document not closed");
1126: pnode_unlink(p->doctype);
1127: }
1128:
1129: /* Clean up. */
1130:
1131: if (fd != STDIN_FILENO)
1132: close(fd);
1133: p->fname = save_fname;
1134: p->nline = save_line;
1135: p->ncol = save_col;
1.1 schwarze 1136: return p->tree;
1137: }
CVSweb