Annotation of docbook2mdoc/parse.c, Revision 1.35
1.35 ! schwarze 1: /* $Id: parse.c,v 1.34 2019/04/12 04:39:24 schwarze Exp $ */
1.1 schwarze 2: /*
3: * Copyright (c) 2014 Kristaps Dzonsons <kristaps@bsd.lv>
4: * Copyright (c) 2019 Ingo Schwarze <schwarze@openbsd.org>
5: *
6: * Permission to use, copy, modify, and distribute this software for any
7: * purpose with or without fee is hereby granted, provided that the above
8: * copyright notice and this permission notice appear in all copies.
9: *
10: * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
11: * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12: * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
13: * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14: * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15: * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16: * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17: */
18: #include <assert.h>
19: #include <ctype.h>
1.24 schwarze 20: #include <errno.h>
21: #include <fcntl.h>
22: #include <libgen.h>
1.6 schwarze 23: #include <stdarg.h>
1.1 schwarze 24: #include <stdio.h>
1.5 schwarze 25: #include <stdlib.h>
1.1 schwarze 26: #include <string.h>
27: #include <unistd.h>
28:
29: #include "node.h"
30: #include "parse.h"
31:
32: /*
33: * The implementation of the DocBook parser.
34: */
35:
/*
 * Lexer states used by parse_string().
 * The order matters: every state from PARSE_ARG onwards
 * means that an attribute value is being parsed.
 */
enum pstate {
	PARSE_ELEM = 0,	/* Outside of tags: text, entities, tag starts. */
	PARSE_TAG,	/* Inside a tag, looking for an attribute name. */
	PARSE_ARG,	/* Expecting an attribute value. */
	PARSE_SQ,	/* Inside a single-quoted attribute value. */
	PARSE_DQ	/* Inside a double-quoted attribute value. */
};
43:
1.1 schwarze 44: /*
45: * Global parse state.
46: * Keep this as simple and small as possible.
47: */
48: struct parse {
49: const char *fname; /* Name of the input file. */
50: struct ptree *tree; /* Complete parse result. */
1.23 schwarze 51: struct pnode *doctype;
1.1 schwarze 52: struct pnode *cur; /* Current node in the tree. */
1.5 schwarze 53: enum nodeid ncur; /* Type of the current node. */
54: int line; /* Line number in the input file. */
55: int col; /* Column number in the input file. */
56: int nline; /* Line number of next token. */
57: int ncol; /* Column number of next token. */
1.4 schwarze 58: int del; /* Levels of nested nodes being deleted. */
1.23 schwarze 59: int flags;
60: #define PFLAG_WARN (1 << 0) /* Print warning messages. */
61: #define PFLAG_SPC (1 << 1) /* Whitespace before the next element. */
62: #define PFLAG_ATTR (1 << 2) /* The most recent attribute is valid. */
63: #define PFLAG_EEND (1 << 3) /* This element is self-closing. */
1.1 schwarze 64: };
65:
66: struct element {
67: const char *name; /* DocBook element name. */
68: enum nodeid node; /* Node type to generate. */
69: };
70:
71: static const struct element elements[] = {
1.3 schwarze 72: { "acronym", NODE_IGNORE },
1.1 schwarze 73: { "affiliation", NODE_AFFILIATION },
1.4 schwarze 74: { "anchor", NODE_DELETE },
1.22 schwarze 75: { "appendix", NODE_APPENDIX },
1.1 schwarze 76: { "application", NODE_APPLICATION },
77: { "arg", NODE_ARG },
1.22 schwarze 78: { "article", NODE_SECTION },
1.1 schwarze 79: { "author", NODE_AUTHOR },
80: { "authorgroup", NODE_AUTHORGROUP },
81: { "blockquote", NODE_BLOCKQUOTE },
1.22 schwarze 82: { "book", NODE_SECTION },
1.1 schwarze 83: { "bookinfo", NODE_BOOKINFO },
84: { "caution", NODE_CAUTION },
85: { "chapter", NODE_SECTION },
86: { "citerefentry", NODE_CITEREFENTRY },
87: { "citetitle", NODE_CITETITLE },
88: { "cmdsynopsis", NODE_CMDSYNOPSIS },
1.13 schwarze 89: { "code", NODE_LITERAL },
1.1 schwarze 90: { "colspec", NODE_COLSPEC },
91: { "command", NODE_COMMAND },
92: { "constant", NODE_CONSTANT },
1.7 schwarze 93: { "contrib", NODE_CONTRIB },
1.1 schwarze 94: { "copyright", NODE_COPYRIGHT },
95: { "date", NODE_DATE },
1.23 schwarze 96: { "!doctype", NODE_DOCTYPE },
97: { "!DOCTYPE", NODE_DOCTYPE },
1.1 schwarze 98: { "editor", NODE_EDITOR },
99: { "email", NODE_EMAIL },
100: { "emphasis", NODE_EMPHASIS },
1.23 schwarze 101: { "!ENTITY", NODE_ENTITY },
1.1 schwarze 102: { "entry", NODE_ENTRY },
103: { "envar", NODE_ENVAR },
1.13 schwarze 104: { "errorname", NODE_ERRORNAME },
1.1 schwarze 105: { "fieldsynopsis", NODE_FIELDSYNOPSIS },
106: { "filename", NODE_FILENAME },
1.7 schwarze 107: { "firstname", NODE_PERSONNAME },
1.1 schwarze 108: { "firstterm", NODE_FIRSTTERM },
109: { "footnote", NODE_FOOTNOTE },
110: { "funcdef", NODE_FUNCDEF },
111: { "funcprototype", NODE_FUNCPROTOTYPE },
112: { "funcsynopsis", NODE_FUNCSYNOPSIS },
113: { "funcsynopsisinfo", NODE_FUNCSYNOPSISINFO },
114: { "function", NODE_FUNCTION },
1.21 schwarze 115: { "glossary", NODE_VARIABLELIST },
116: { "glossdef", NODE_IGNORE },
117: { "glossdiv", NODE_IGNORE },
118: { "glossentry", NODE_VARLISTENTRY },
119: { "glosslist", NODE_VARIABLELIST },
1.1 schwarze 120: { "glossterm", NODE_GLOSSTERM },
121: { "group", NODE_GROUP },
122: { "holder", NODE_HOLDER },
123: { "index", NODE_INDEX },
1.4 schwarze 124: { "indexterm", NODE_DELETE },
1.1 schwarze 125: { "info", NODE_INFO },
126: { "informalequation", NODE_INFORMALEQUATION },
1.11 schwarze 127: { "informaltable", NODE_TABLE },
1.1 schwarze 128: { "inlineequation", NODE_INLINEEQUATION },
129: { "itemizedlist", NODE_ITEMIZEDLIST },
130: { "keysym", NODE_KEYSYM },
131: { "legalnotice", NODE_LEGALNOTICE },
132: { "link", NODE_LINK },
133: { "listitem", NODE_LISTITEM },
134: { "literal", NODE_LITERAL },
135: { "literallayout", NODE_LITERALLAYOUT },
136: { "manvolnum", NODE_MANVOLNUM },
137: { "member", NODE_MEMBER },
138: { "mml:math", NODE_MML_MATH },
139: { "mml:mfenced", NODE_MML_MFENCED },
140: { "mml:mfrac", NODE_MML_MFRAC },
141: { "mml:mi", NODE_MML_MI },
142: { "mml:mn", NODE_MML_MN },
143: { "mml:mo", NODE_MML_MO },
144: { "mml:mrow", NODE_MML_MROW },
145: { "mml:msub", NODE_MML_MSUB },
146: { "mml:msup", NODE_MML_MSUP },
147: { "modifier", NODE_MODIFIER },
148: { "note", NODE_NOTE },
149: { "option", NODE_OPTION },
150: { "orderedlist", NODE_ORDEREDLIST },
151: { "orgname", NODE_ORGNAME },
1.7 schwarze 152: { "othername", NODE_PERSONNAME },
1.1 schwarze 153: { "para", NODE_PARA },
154: { "paramdef", NODE_PARAMDEF },
155: { "parameter", NODE_PARAMETER },
156: { "part", NODE_SECTION },
157: { "personname", NODE_PERSONNAME },
1.3 schwarze 158: { "phrase", NODE_IGNORE },
1.1 schwarze 159: { "preface", NODE_PREFACE },
1.4 schwarze 160: { "primary", NODE_DELETE },
1.1 schwarze 161: { "programlisting", NODE_PROGRAMLISTING },
162: { "prompt", NODE_PROMPT },
163: { "quote", NODE_QUOTE },
164: { "refclass", NODE_REFCLASS },
165: { "refdescriptor", NODE_REFDESCRIPTOR },
166: { "refentry", NODE_REFENTRY },
167: { "refentryinfo", NODE_REFENTRYINFO },
168: { "refentrytitle", NODE_REFENTRYTITLE },
169: { "refmeta", NODE_REFMETA },
170: { "refmetainfo", NODE_REFMETAINFO },
171: { "refmiscinfo", NODE_REFMISCINFO },
172: { "refname", NODE_REFNAME },
173: { "refnamediv", NODE_REFNAMEDIV },
174: { "refpurpose", NODE_REFPURPOSE },
175: { "refsect1", NODE_SECTION },
176: { "refsect2", NODE_SECTION },
177: { "refsect3", NODE_SECTION },
178: { "refsection", NODE_SECTION },
179: { "refsynopsisdiv", NODE_REFSYNOPSISDIV },
180: { "releaseinfo", NODE_RELEASEINFO },
181: { "replaceable", NODE_REPLACEABLE },
182: { "row", NODE_ROW },
183: { "sbr", NODE_SBR },
184: { "screen", NODE_SCREEN },
1.4 schwarze 185: { "secondary", NODE_DELETE },
1.1 schwarze 186: { "sect1", NODE_SECTION },
187: { "sect2", NODE_SECTION },
188: { "section", NODE_SECTION },
189: { "sgmltag", NODE_SGMLTAG },
1.15 schwarze 190: { "simpara", NODE_PARA },
1.1 schwarze 191: { "simplelist", NODE_SIMPLELIST },
192: { "spanspec", NODE_SPANSPEC },
1.13 schwarze 193: { "structfield", NODE_PARAMETER },
194: { "structname", NODE_TYPE },
1.1 schwarze 195: { "subtitle", NODE_SUBTITLE },
1.7 schwarze 196: { "surname", NODE_PERSONNAME },
1.12 schwarze 197: { "symbol", NODE_CONSTANT },
1.1 schwarze 198: { "synopsis", NODE_SYNOPSIS },
199: { "table", NODE_TABLE },
200: { "tbody", NODE_TBODY },
201: { "term", NODE_TERM },
202: { "tfoot", NODE_TFOOT },
203: { "tgroup", NODE_TGROUP },
204: { "thead", NODE_THEAD },
205: { "tip", NODE_TIP },
206: { "title", NODE_TITLE },
1.3 schwarze 207: { "trademark", NODE_IGNORE },
1.1 schwarze 208: { "type", NODE_TYPE },
1.18 schwarze 209: { "ulink", NODE_LINK },
1.13 schwarze 210: { "userinput", NODE_LITERAL },
1.1 schwarze 211: { "variablelist", NODE_VARIABLELIST },
212: { "varlistentry", NODE_VARLISTENTRY },
213: { "varname", NODE_VARNAME },
214: { "warning", NODE_WARNING },
215: { "wordasword", NODE_WORDASWORD },
1.26 schwarze 216: { "xi:include", NODE_INCLUDE },
1.1 schwarze 217: { "year", NODE_YEAR },
1.5 schwarze 218: { NULL, NODE_IGNORE }
1.1 schwarze 219: };
220:
/*
 * Mapping from an XML character entity name (without the
 * leading '&' and the trailing ';') to its roff(7) replacement.
 */
struct entity {
	const char	*name;   /* Entity name. */
	const char	*roff;   /* Replacement text for the output. */
};

/*
 * XML character entity references found in the wild.
 * Those that don't have an exact mandoc_char(7) representation
 * are approximated, and the desired codepoint is given as a comment.
 * Encoding them as \\[u...] would leave -Tascii out in the cold.
 * The table ends with an entry where both members are NULL.
 */
static const struct entity entities[] = {
	{ "alpha",	"\\(*a" },
	{ "amp",	"&" },
	{ "apos",	"'" },
	{ "auml",	"\\(:a" },
	{ "beta",	"\\(*b" },
	{ "circ",	"^" },		/* U+02C6 */
	{ "copy",	"\\(co" },
	{ "dagger",	"\\(dg" },
	{ "Delta",	"\\(*D" },
	{ "eacute",	"\\('e" },
	{ "emsp",	"\\ " },	/* U+2003 */
	{ "gt",		">" },
	{ "hairsp",	"\\^" },
	{ "kappa",	"\\(*k" },
	{ "larr",	"\\(<-" },
	{ "ldquo",	"\\(lq" },
	{ "le",		"\\(<=" },
	{ "lowbar",	"_" },
	{ "lsqb",	"[" },
	{ "lt",		"<" },
	{ "mdash",	"\\(em" },
	{ "minus",	"\\-" },
	{ "ndash",	"\\(en" },
	{ "nbsp",	"\\ " },
	{ "num",	"#" },
	{ "oslash",	"\\(/o" },
	{ "ouml",	"\\(:o" },
	{ "percnt",	"%" },
	{ "quot",	"\\(dq" },
	{ "rarr",	"\\(->" },
	{ "rArr",	"\\(rA" },
	{ "rdquo",	"\\(rq" },
	{ "reg",	"\\(rg" },
	{ "rho",	"\\(*r" },
	{ "rsqb",	"]" },
	{ "sigma",	"\\(*s" },
	{ "shy",	"\\&" },	/* U+00AD */
	{ "tau",	"\\(*t" },
	{ "tilde",	"\\[u02DC]" },
	{ "times",	"\\[tmu]" },
	{ "uuml",	"\\(:u" },
	{ NULL,		NULL }
};
276:
/* Forward declarations of the mutually recursive parsing functions. */
static size_t	 parse_string(struct parse *p, char *b, size_t rlen,
			enum pstate *pstate, int refill);
static void	 parse_fd(struct parse *p, int fd);
1.23 schwarze 280:
281:
1.6 schwarze 282: static void
1.29 schwarze 283: fatal(struct parse *p)
284: {
285: fprintf(stderr, "%s:%d:%d: FATAL: ", p->fname, p->line, p->col);
286: perror(NULL);
287: exit(6);
288: }
289:
290: static void
1.6 schwarze 291: error_msg(struct parse *p, const char *fmt, ...)
292: {
293: va_list ap;
294:
1.29 schwarze 295: fprintf(stderr, "%s:%d:%d: ERROR: ", p->fname, p->line, p->col);
1.6 schwarze 296: va_start(ap, fmt);
297: vfprintf(stderr, fmt, ap);
298: va_end(ap);
299: fputc('\n', stderr);
1.29 schwarze 300: p->tree->flags |= TREE_ERROR;
1.6 schwarze 301: }
302:
303: static void
304: warn_msg(struct parse *p, const char *fmt, ...)
305: {
306: va_list ap;
307:
1.23 schwarze 308: if ((p->flags & PFLAG_WARN) == 0)
1.6 schwarze 309: return;
310:
1.29 schwarze 311: fprintf(stderr, "%s:%d:%d: WARNING: ", p->fname, p->line, p->col);
1.6 schwarze 312: va_start(ap, fmt);
313: vfprintf(stderr, fmt, ap);
314: va_end(ap);
315: fputc('\n', stderr);
1.29 schwarze 316: p->tree->flags |= TREE_WARN;
1.6 schwarze 317: }
318:
1.1 schwarze 319: /*
320: * Process a string of characters.
321: * If a text node is already open, append to it.
322: * Otherwise, create a new one as a child of the current node.
323: */
324: static void
1.35 ! schwarze 325: xml_text(struct parse *p, const char *word, int sz)
1.1 schwarze 326: {
1.35 ! schwarze 327: struct pnode *n, *np;
1.32 schwarze 328: size_t oldsz, newsz;
1.35 ! schwarze 329: int i;
1.1 schwarze 330:
1.32 schwarze 331: assert(sz > 0);
1.30 schwarze 332: if (p->del > 0)
1.1 schwarze 333: return;
334:
1.32 schwarze 335: if ((n = p->cur) == NULL) {
1.35 ! schwarze 336: error_msg(p, "discarding text before document: %.*s",
! 337: sz, word);
1.5 schwarze 338: return;
339: }
340:
1.35 ! schwarze 341: /* Append to the current text node, if one is open. */
! 342:
! 343: if (n->node == NODE_TEXT) {
! 344: oldsz = strlen(n->b);
! 345: newsz = oldsz + sz;
! 346: if (oldsz && (p->flags & PFLAG_SPC))
! 347: newsz++;
! 348: if ((n->b = realloc(n->b, newsz + 1)) == NULL)
1.30 schwarze 349: fatal(p);
1.35 ! schwarze 350: if (oldsz && (p->flags & PFLAG_SPC))
! 351: n->b[oldsz++] = ' ';
! 352: memcpy(n->b + oldsz, word, sz);
! 353: n->b[newsz] = '\0';
! 354: p->flags &= ~PFLAG_SPC;
! 355: return;
1.1 schwarze 356: }
357:
1.35 ! schwarze 358: if (p->tree->flags & TREE_CLOSED && n == p->tree->root)
1.30 schwarze 359: warn_msg(p, "text after end of document: %.*s", sz, word);
1.5 schwarze 360:
1.35 ! schwarze 361: /* Create a new text node. */
1.1 schwarze 362:
1.35 ! schwarze 363: if ((n = pnode_alloc(p->cur)) == NULL)
1.30 schwarze 364: fatal(p);
1.35 ! schwarze 365: n->node = NODE_TEXT;
! 366: n->spc = (p->flags & PFLAG_SPC) != 0;
1.30 schwarze 367: p->flags &= ~PFLAG_SPC;
1.35 ! schwarze 368:
! 369: /*
! 370: * If this node follows a non-text node without intervening
! 371: * whitespace, keep the text in it as short as possible,
! 372: * and do not keep it open.
! 373: */
! 374:
! 375: if (n->spc == 0 &&
! 376: (np = TAILQ_PREV(n, pnodeq, child)) != NULL &&
! 377: np->node != NODE_TEXT && np->node != NODE_ESCAPE) {
! 378: i = 0;
! 379: while (i < sz && !isspace((unsigned char)word[i]))
! 380: i++;
! 381: if ((n->b = strndup(word, i)) == NULL)
! 382: fatal(p);
! 383: if (i == sz)
! 384: return;
! 385: while (i < sz && isspace((unsigned char)word[i]))
! 386: i++;
! 387: if (i == sz) {
! 388: p->flags |= PFLAG_SPC;
! 389: return;
! 390: }
! 391:
! 392: /* Put any remaining text into a second node. */
! 393:
! 394: if ((n = pnode_alloc(p->cur)) == NULL)
! 395: fatal(p);
! 396: n->node = NODE_TEXT;
! 397: n->spc = 1;
! 398: word += i;
! 399: sz -= i;
! 400: }
! 401: if ((n->b = strndup(word, sz)) == NULL)
! 402: fatal(p);
! 403:
! 404: /* The new node remains open for later pnode_closetext(). */
! 405:
! 406: p->cur = n;
1.1 schwarze 407: }
408:
1.16 schwarze 409: /*
410: * Close out the text node and strip trailing whitespace, if one is open.
411: */
1.1 schwarze 412: static void
1.16 schwarze 413: pnode_closetext(struct parse *p)
1.1 schwarze 414: {
1.16 schwarze 415: struct pnode *n;
1.32 schwarze 416: char *cp;
1.16 schwarze 417:
418: if ((n = p->cur) == NULL || n->node != NODE_TEXT)
419: return;
420: p->cur = n->parent;
1.32 schwarze 421: for (cp = strchr(n->b, '\0');
422: cp > n->b && isspace((unsigned char)cp[-1]);
423: *--cp = '\0')
1.23 schwarze 424: p->flags |= PFLAG_SPC;
1.1 schwarze 425: }
426:
1.9 schwarze 427: static void
428: xml_entity(struct parse *p, const char *name)
429: {
430: const struct entity *entity;
1.30 schwarze 431: struct pnode *n;
1.23 schwarze 432: const char *ccp;
433: char *cp;
434: enum pstate pstate;
1.9 schwarze 435:
436: if (p->del > 0)
437: return;
438:
439: if (p->cur == NULL) {
440: error_msg(p, "discarding entity before document: &%s;", name);
441: return;
442: }
443:
1.16 schwarze 444: pnode_closetext(p);
1.9 schwarze 445:
446: if (p->tree->flags & TREE_CLOSED && p->cur == p->tree->root)
447: warn_msg(p, "entity after end of document: &%s;", name);
448:
449: for (entity = entities; entity->name != NULL; entity++)
450: if (strcmp(name, entity->name) == 0)
451: break;
452:
453: if (entity->roff == NULL) {
1.23 schwarze 454: if (p->doctype != NULL) {
1.30 schwarze 455: TAILQ_FOREACH(n, &p->doctype->childq, child) {
456: if ((ccp = pnode_getattr_raw(n,
1.23 schwarze 457: ATTRKEY_NAME, NULL)) == NULL ||
1.25 schwarze 458: strcmp(ccp, name) != 0)
459: continue;
1.30 schwarze 460: if ((ccp = pnode_getattr_raw(n,
1.25 schwarze 461: ATTRKEY_SYSTEM, NULL)) != NULL) {
462: parse_file(p, -1, ccp);
463: p->flags &= ~PFLAG_SPC;
464: return;
465: }
1.30 schwarze 466: if ((ccp = pnode_getattr_raw(n,
1.23 schwarze 467: ATTRKEY_DEFINITION, NULL)) == NULL)
468: continue;
1.29 schwarze 469: if ((cp = strdup(ccp)) == NULL)
470: fatal(p);
1.23 schwarze 471: pstate = PARSE_ELEM;
472: parse_string(p, cp, strlen(cp), &pstate, 0);
473: p->flags &= ~PFLAG_SPC;
474: free(cp);
475: return;
476: }
477: }
1.9 schwarze 478: error_msg(p, "unknown entity &%s;", name);
479: return;
480: }
481:
482: /* Create, append, and close out an entity node. */
1.34 schwarze 483: if ((n = pnode_alloc(p->cur)) == NULL ||
1.32 schwarze 484: (n->b = strdup(entity->roff)) == NULL)
1.29 schwarze 485: fatal(p);
1.30 schwarze 486: n->node = NODE_ESCAPE;
487: n->spc = (p->flags & PFLAG_SPC) != 0;
1.23 schwarze 488: p->flags &= ~PFLAG_SPC;
1.9 schwarze 489: }
490:
1.1 schwarze 491: /*
492: * Begin an element.
493: */
494: static void
1.30 schwarze 495: xml_elem_start(struct parse *p, const char *name)
1.1 schwarze 496: {
1.5 schwarze 497: const struct element *elem;
1.30 schwarze 498: struct pnode *n;
1.1 schwarze 499:
1.4 schwarze 500: /*
501: * An ancestor is excluded from the tree;
502: * keep track of the number of levels excluded.
503: */
1.30 schwarze 504: if (p->del > 0) {
1.23 schwarze 505: if (*name != '!' && *name != '?')
1.30 schwarze 506: p->del++;
1.4 schwarze 507: return;
508: }
509:
1.30 schwarze 510: pnode_closetext(p);
1.1 schwarze 511:
512: for (elem = elements; elem->name != NULL; elem++)
513: if (strcmp(elem->name, name) == 0)
514: break;
515:
1.23 schwarze 516: if (elem->name == NULL) {
517: if (*name == '!' || *name == '?')
518: return;
1.30 schwarze 519: error_msg(p, "unknown element <%s>", name);
1.23 schwarze 520: }
1.6 schwarze 521:
1.30 schwarze 522: p->ncur = elem->node;
1.1 schwarze 523:
1.30 schwarze 524: switch (p->ncur) {
1.4 schwarze 525: case NODE_DELETE_WARN:
1.30 schwarze 526: warn_msg(p, "skipping element <%s>", name);
1.2 schwarze 527: /* FALLTHROUGH */
1.4 schwarze 528: case NODE_DELETE:
1.30 schwarze 529: p->del = 1;
1.4 schwarze 530: /* FALLTHROUGH */
1.2 schwarze 531: case NODE_IGNORE:
532: return;
533: case NODE_INLINEEQUATION:
1.30 schwarze 534: p->tree->flags |= TREE_EQN;
1.2 schwarze 535: break;
536: default:
537: break;
538: }
1.1 schwarze 539:
1.30 schwarze 540: if (p->tree->flags & TREE_CLOSED && p->cur->parent == NULL)
541: warn_msg(p, "element after end of document: <%s>", name);
1.5 schwarze 542:
1.34 schwarze 543: if ((n = pnode_alloc(p->cur)) == NULL)
1.30 schwarze 544: fatal(p);
1.17 schwarze 545:
546: /*
547: * Nodes that begin a new macro or request line or start by
548: * printing text always want whitespace before themselves.
549: */
550:
1.30 schwarze 551: switch (n->node = elem->node) {
1.23 schwarze 552: case NODE_DOCTYPE:
553: case NODE_ENTITY:
554: case NODE_SBR:
1.30 schwarze 555: p->flags |= PFLAG_EEND;
1.23 schwarze 556: /* FALLTHROUGH */
1.22 schwarze 557: case NODE_APPENDIX:
1.17 schwarze 558: case NODE_AUTHORGROUP:
1.20 schwarze 559: case NODE_BLOCKQUOTE:
1.17 schwarze 560: case NODE_BOOKINFO:
561: case NODE_CAUTION:
562: case NODE_EDITOR:
563: case NODE_ENTRY:
564: case NODE_FUNCDEF:
565: case NODE_FUNCPROTOTYPE:
566: case NODE_INFORMALEQUATION:
567: case NODE_INLINEEQUATION:
568: case NODE_ITEMIZEDLIST:
569: case NODE_LEGALNOTICE:
570: case NODE_LITERALLAYOUT:
571: case NODE_NOTE:
572: case NODE_ORDEREDLIST:
573: case NODE_PARA:
574: case NODE_PREFACE:
575: case NODE_PROGRAMLISTING:
576: case NODE_REFMETA:
577: case NODE_REFNAMEDIV:
578: case NODE_REFSYNOPSISDIV:
579: case NODE_ROW:
580: case NODE_SCREEN:
581: case NODE_SECTION:
582: case NODE_SYNOPSIS:
583: case NODE_TGROUP:
584: case NODE_TIP:
585: case NODE_TITLE:
586: case NODE_VARIABLELIST:
587: case NODE_VARLISTENTRY:
588: case NODE_WARNING:
1.30 schwarze 589: n->spc = 1;
1.17 schwarze 590: break;
591: default:
1.30 schwarze 592: n->spc = (p->flags & PFLAG_SPC) != 0;
1.17 schwarze 593: break;
594: }
1.30 schwarze 595: p->cur = n;
596: if (n->node == NODE_DOCTYPE) {
597: if (p->doctype == NULL)
598: p->doctype = n;
1.23 schwarze 599: else
1.30 schwarze 600: error_msg(p, "duplicate doctype");
601: } else if (n->parent == NULL && p->tree->root == NULL)
602: p->tree->root = n;
1.5 schwarze 603: }
604:
605: static void
1.30 schwarze 606: xml_attrkey(struct parse *p, const char *name)
1.5 schwarze 607: {
1.30 schwarze 608: struct pattr *a;
1.23 schwarze 609: const char *value;
1.5 schwarze 610: enum attrkey key;
1.1 schwarze 611:
1.30 schwarze 612: if (p->del > 0 || p->ncur == NODE_IGNORE || *name == '\0')
1.5 schwarze 613: return;
1.23 schwarze 614:
1.30 schwarze 615: if ((p->ncur == NODE_DOCTYPE || p->ncur == NODE_ENTITY) &&
616: TAILQ_FIRST(&p->cur->attrq) == NULL) {
1.23 schwarze 617: value = name;
618: name = "NAME";
619: } else
620: value = NULL;
621:
1.5 schwarze 622: if ((key = attrkey_parse(name)) == ATTRKEY__MAX) {
1.30 schwarze 623: p->flags &= ~PFLAG_ATTR;
1.5 schwarze 624: return;
625: }
1.30 schwarze 626: if ((a = calloc(1, sizeof(*a))) == NULL)
627: fatal(p);
1.29 schwarze 628:
1.30 schwarze 629: a->key = key;
630: a->val = ATTRVAL__MAX;
1.23 schwarze 631: if (value == NULL) {
1.30 schwarze 632: a->rawval = NULL;
633: p->flags |= PFLAG_ATTR;
1.23 schwarze 634: } else {
1.30 schwarze 635: if ((a->rawval = strdup(value)) == NULL)
636: fatal(p);
637: p->flags &= ~PFLAG_ATTR;
638: }
639: TAILQ_INSERT_TAIL(&p->cur->attrq, a, child);
640: if (p->ncur == NODE_ENTITY && key == ATTRKEY_NAME)
641: xml_attrkey(p, "DEFINITION");
1.5 schwarze 642: }
643:
644: static void
1.30 schwarze 645: xml_attrval(struct parse *p, const char *name)
1.5 schwarze 646: {
1.30 schwarze 647: struct pattr *a;
1.5 schwarze 648:
1.30 schwarze 649: if (p->del > 0 || p->ncur == NODE_IGNORE ||
650: (p->flags & PFLAG_ATTR) == 0)
1.5 schwarze 651: return;
1.30 schwarze 652: if ((a = TAILQ_LAST(&p->cur->attrq, pattrq)) == NULL)
1.5 schwarze 653: return;
1.30 schwarze 654: if ((a->val = attrval_parse(name)) == ATTRVAL__MAX &&
655: (a->rawval = strdup(name)) == NULL)
656: fatal(p);
657: p->flags &= ~PFLAG_ATTR;
1.1 schwarze 658: }
659:
660: /*
661: * Roll up the parse tree.
662: * If we're at a text node, roll that one up first.
663: */
664: static void
1.31 schwarze 665: xml_elem_end(struct parse *p, const char *name)
1.1 schwarze 666: {
1.5 schwarze 667: const struct element *elem;
1.26 schwarze 668: struct pnode *n;
669: const char *cp;
1.5 schwarze 670: enum nodeid node;
1.1 schwarze 671:
1.4 schwarze 672: /*
673: * An ancestor is excluded from the tree;
674: * keep track of the number of levels excluded.
675: */
1.31 schwarze 676: if (p->del > 1) {
677: p->del--;
1.4 schwarze 678: return;
679: }
680:
1.31 schwarze 681: if (p->del == 0)
682: pnode_closetext(p);
1.2 schwarze 683:
1.5 schwarze 684: if (name != NULL) {
685: for (elem = elements; elem->name != NULL; elem++)
686: if (strcmp(elem->name, name) == 0)
687: break;
688: node = elem->node;
689: } else
1.31 schwarze 690: node = p->ncur;
1.2 schwarze 691:
1.5 schwarze 692: switch (node) {
1.4 schwarze 693: case NODE_DELETE_WARN:
694: case NODE_DELETE:
1.31 schwarze 695: if (p->del > 0)
696: p->del--;
1.4 schwarze 697: break;
1.2 schwarze 698: case NODE_IGNORE:
1.26 schwarze 699: break;
700: case NODE_INCLUDE:
1.31 schwarze 701: n = p->cur;
702: p->cur = p->cur->parent;
1.26 schwarze 703: cp = pnode_getattr_raw(n, ATTRKEY_HREF, NULL);
704: if (cp == NULL)
1.31 schwarze 705: error_msg(p, "<xi:include> element "
1.26 schwarze 706: "without href attribute");
707: else
1.31 schwarze 708: parse_file(p, -1, cp);
1.26 schwarze 709: pnode_unlink(n);
1.31 schwarze 710: p->flags &= ~PFLAG_SPC;
1.2 schwarze 711: break;
1.23 schwarze 712: case NODE_DOCTYPE:
1.32 schwarze 713: case NODE_SBR:
1.31 schwarze 714: p->flags &= ~PFLAG_EEND;
1.23 schwarze 715: /* FALLTHROUGH */
1.2 schwarze 716: default:
1.31 schwarze 717: if (p->cur == NULL || node != p->cur->node) {
718: warn_msg(p, "element not open: </%s>", name);
1.5 schwarze 719: break;
720: }
721:
722: /*
723: * Refrain from actually closing the document element.
724: * If no more content follows, no harm is done, but if
725: * some content still follows, simply processing it is
726: * obviously better than discarding it or crashing.
727: */
728:
1.31 schwarze 729: if (p->cur->parent != NULL || node == NODE_DOCTYPE) {
730: p->cur = p->cur->parent;
731: if (p->cur != NULL)
732: p->ncur = p->cur->node;
1.23 schwarze 733: } else
1.31 schwarze 734: p->tree->flags |= TREE_CLOSED;
735: p->flags &= ~PFLAG_SPC;
1.4 schwarze 736: break;
1.2 schwarze 737: }
1.31 schwarze 738: assert(p->del == 0);
1.1 schwarze 739: }
740:
741: struct parse *
742: parse_alloc(int warn)
743: {
744: struct parse *p;
745:
746: if ((p = calloc(1, sizeof(*p))) == NULL)
747: return NULL;
748:
749: if ((p->tree = calloc(1, sizeof(*p->tree))) == NULL) {
750: free(p);
751: return NULL;
752: }
1.23 schwarze 753: if (warn)
754: p->flags |= PFLAG_WARN;
755: else
756: p->flags &= ~PFLAG_WARN;
1.1 schwarze 757: return p;
758: }
759:
760: void
761: parse_free(struct parse *p)
762: {
763: if (p == NULL)
764: return;
765: if (p->tree != NULL) {
766: pnode_unlink(p->tree->root);
767: free(p->tree);
768: }
769: free(p);
770: }
771:
1.14 schwarze 772: static void
773: increment(struct parse *p, char *b, size_t *pend, int refill)
774: {
775: if (refill) {
776: if (b[*pend] == '\n') {
777: p->nline++;
778: p->ncol = 1;
779: } else
780: p->ncol++;
781: }
782: ++*pend;
783: }
784:
1.5 schwarze 785: /*
786: * Advance the pend pointer to the next character in the charset.
787: * If the charset starts with a space, it stands for any whitespace.
788: * Update the new input file position, used for messages.
789: * Do not overrun the buffer b of length rlen.
790: * When reaching the end, NUL-terminate the buffer and return 1;
791: * otherwise, return 0.
792: */
793: static int
794: advance(struct parse *p, char *b, size_t rlen, size_t *pend,
1.14 schwarze 795: const char *charset, int refill)
1.5 schwarze 796: {
797: int space;
798:
799: if (*charset == ' ') {
800: space = 1;
801: charset++;
802: } else
803: space = 0;
804:
1.14 schwarze 805: if (refill) {
806: p->nline = p->line;
807: p->ncol = p->col;
808: }
1.5 schwarze 809: while (*pend < rlen) {
810: if (space && isspace((unsigned char)b[*pend]))
811: break;
812: if (strchr(charset, b[*pend]) != NULL)
813: break;
1.14 schwarze 814: increment(p, b, pend, refill);
1.5 schwarze 815: }
816: if (*pend == rlen) {
817: b[rlen] = '\0';
1.14 schwarze 818: return refill;
1.5 schwarze 819: } else
820: return 0;
821: }
822:
1.14 schwarze 823: size_t
824: parse_string(struct parse *p, char *b, size_t rlen,
825: enum pstate *pstate, int refill)
826: {
827: char *cp;
828: size_t poff; /* Parse offset in b[]. */
829: size_t pend; /* Offset of the end of the current word. */
830: int elem_end;
831:
832: pend = 0;
833: for (;;) {
834:
835: /* Proceed to the next token, skipping whitespace. */
836:
837: if (refill) {
838: p->line = p->nline;
839: p->col = p->ncol;
840: }
841: if ((poff = pend) == rlen)
842: break;
843: if (isspace((unsigned char)b[pend])) {
1.23 schwarze 844: p->flags |= PFLAG_SPC;
1.14 schwarze 845: increment(p, b, &pend, refill);
846: continue;
847: }
848:
849: /*
850: * The following four cases (ARG, TAG, and starting an
851: * entity or a tag) all parse a word or quoted string.
852: * If that extends beyond the read buffer and the last
853: * read(2) still got data, they all break out of the
854: * token loop to request more data from the read loop.
855: *
856: * Also, three of them detect self-closing tags, those
857: * ending with "/>", setting the flag elem_end and
858: * calling xml_elem_end() at the very end, after
859: * handling the attribute value, attribute name, or
860: * tag name, respectively.
861: */
862:
863: /* Parse an attribute value. */
864:
865: if (*pstate >= PARSE_ARG) {
866: if (*pstate == PARSE_ARG &&
867: (b[pend] == '\'' || b[pend] == '"')) {
868: *pstate = b[pend] == '"' ?
869: PARSE_DQ : PARSE_SQ;
870: increment(p, b, &pend, refill);
871: continue;
872: }
873: if (advance(p, b, rlen, &pend,
874: *pstate == PARSE_DQ ? "\"" :
875: *pstate == PARSE_SQ ? "'" : " >", refill))
876: break;
877: *pstate = PARSE_TAG;
878: elem_end = 0;
879: if (b[pend] == '>') {
880: *pstate = PARSE_ELEM;
881: if (pend > 0 && b[pend - 1] == '/') {
882: b[pend - 1] = '\0';
883: elem_end = 1;
884: }
1.23 schwarze 885: if (p->flags & PFLAG_EEND)
886: elem_end = 1;
1.14 schwarze 887: }
888: b[pend] = '\0';
889: if (pend < rlen)
890: increment(p, b, &pend, refill);
891: xml_attrval(p, b + poff);
892: if (elem_end)
893: xml_elem_end(p, NULL);
894:
895: /* Look for an attribute name. */
896:
897: } else if (*pstate == PARSE_TAG) {
1.23 schwarze 898: switch (p->ncur) {
899: case NODE_DOCTYPE:
900: if (b[pend] == '[') {
901: *pstate = PARSE_ELEM;
902: increment(p, b, &pend, refill);
903: continue;
904: }
905: /* FALLTHROUGH */
906: case NODE_ENTITY:
907: if (b[pend] == '"' || b[pend] == '\'') {
908: *pstate = PARSE_ARG;
909: continue;
910: }
911: break;
912: default:
913: break;
914: }
1.14 schwarze 915: if (advance(p, b, rlen, &pend, " =>", refill))
916: break;
917: elem_end = 0;
918: switch (b[pend]) {
919: case '>':
920: *pstate = PARSE_ELEM;
921: if (pend > 0 && b[pend - 1] == '/') {
922: b[pend - 1] = '\0';
923: elem_end = 1;
924: }
1.23 schwarze 925: if (p->flags & PFLAG_EEND)
926: elem_end = 1;
1.14 schwarze 927: break;
928: case '=':
929: *pstate = PARSE_ARG;
930: break;
931: default:
932: break;
933: }
934: b[pend] = '\0';
935: if (pend < rlen)
936: increment(p, b, &pend, refill);
937: xml_attrkey(p, b + poff);
938: if (elem_end)
939: xml_elem_end(p, NULL);
940:
941: /* Begin an opening or closing tag. */
942:
943: } else if (b[poff] == '<') {
944: if (advance(p, b, rlen, &pend, " >", refill))
945: break;
946: if (pend > poff + 3 &&
947: strncmp(b + poff, "<!--", 4) == 0) {
948:
949: /* Skip a comment. */
950:
951: cp = strstr(b + pend - 2, "-->");
952: if (cp == NULL) {
953: if (refill)
954: break;
955: cp = b + rlen;
956: } else
957: cp += 3;
958: while (b + pend < cp)
959: increment(p, b, &pend, refill);
960: continue;
961: }
962: elem_end = 0;
963: if (b[pend] != '>')
964: *pstate = PARSE_TAG;
965: else if (pend > 0 && b[pend - 1] == '/') {
966: b[pend - 1] = '\0';
967: elem_end = 1;
968: }
969: b[pend] = '\0';
970: if (pend < rlen)
971: increment(p, b, &pend, refill);
972: if (b[++poff] == '/') {
973: elem_end = 1;
974: poff++;
1.23 schwarze 975: } else {
1.14 schwarze 976: xml_elem_start(p, b + poff);
1.23 schwarze 977: if (*pstate == PARSE_ELEM &&
978: p->flags & PFLAG_EEND)
979: elem_end = 1;
980: }
1.14 schwarze 981: if (elem_end)
982: xml_elem_end(p, b + poff);
983:
1.23 schwarze 984: /* Close a doctype. */
985:
986: } else if (p->ncur == NODE_DOCTYPE && b[poff] == ']') {
987: *pstate = PARSE_TAG;
988: increment(p, b, &pend, refill);
989:
1.14 schwarze 990: /* Process an entity. */
991:
992: } else if (b[poff] == '&') {
993: if (advance(p, b, rlen, &pend, ";", refill))
994: break;
995: b[pend] = '\0';
996: if (pend < rlen)
997: increment(p, b, &pend, refill);
998: xml_entity(p, b + poff + 1);
999:
1000: /* Process text up to the next tag, entity, or EOL. */
1001:
1002: } else {
1.28 schwarze 1003: advance(p, b, rlen, &pend,
1.33 schwarze 1004: p->ncur == NODE_DOCTYPE ? "<&]\n" : "<&\n",
1.28 schwarze 1005: refill);
1.35 ! schwarze 1006: xml_text(p, b + poff, pend - poff);
1.33 schwarze 1007: if (b[pend] == '\n')
1008: pnode_closetext(p);
1.14 schwarze 1009: }
1010: }
1011: return poff;
1012: }
1013:
1.24 schwarze 1014:
1015: /*
1016: * The read loop.
1017: * If the previous token was incomplete and asked for more input,
1018: * we have to enter the read loop once more even on EOF.
1019: * Once rsz is 0, incomplete tokens will no longer ask for more input
1020: * but instead use whatever there is, and then exit the read loop.
1021: * The minus one on the size limit for read(2) is needed such that
1022: * advance() can set b[rlen] to NUL when needed.
1023: */
1024: static void
1025: parse_fd(struct parse *p, int fd)
1.1 schwarze 1026: {
1027: char b[4096];
1.5 schwarze 1028: ssize_t rsz; /* Return value from read(2). */
1.14 schwarze 1029: size_t rlen; /* Number of bytes in b[]. */
1.5 schwarze 1030: size_t poff; /* Parse offset in b[]. */
1.14 schwarze 1031: enum pstate pstate;
1.1 schwarze 1032:
1.24 schwarze 1033: rlen = 0;
1.14 schwarze 1034: pstate = PARSE_ELEM;
1035: while ((rsz = read(fd, b + rlen, sizeof(b) - rlen - 1)) >= 0 &&
1036: (rlen += rsz) > 0) {
1037: poff = parse_string(p, b, rlen, &pstate, rsz > 0);
1.5 schwarze 1038: /* Buffer exhausted; shift left and re-fill. */
1039: assert(poff > 0);
1040: rlen -= poff;
1.14 schwarze 1041: memmove(b, b + poff, rlen);
1.5 schwarze 1042: }
1.24 schwarze 1043: if (rsz < 0)
1044: error_msg(p, "read: %s", strerror(errno));
1045: }
1046:
1047: /*
1048: * Open and parse a file.
1049: */
1050: struct ptree *
1051: parse_file(struct parse *p, int fd, const char *fname)
1052: {
1053: const char *save_fname;
1054: int save_line, save_col;
1055:
1056: /* Save and initialize reporting data. */
1057:
1058: save_fname = p->fname;
1059: save_line = p->nline;
1060: save_col = p->ncol;
1061: p->fname = fname;
1062: p->line = 0;
1063: p->col = 0;
1064:
1065: /* Open the file, unless it is already open. */
1066:
1067: if (fd == -1 && (fd = open(fname, O_RDONLY, 0)) == -1) {
1068: error_msg(p, "open: %s", strerror(errno));
1069: p->fname = save_fname;
1070: return p->tree;
1.5 schwarze 1071: }
1.24 schwarze 1072:
1073: /*
1074: * After opening the starting file, change to the directory it
1075: * is located in, in case it wants to include any further files,
1076: * which are typically given with relative paths in DocBook.
1077: * Do this on a best-effort basis; don't complain about failure.
1078: */
1079:
1080: if (save_fname == NULL && (fname = dirname(fname)) != NULL &&
1081: strcmp(fname, ".") != 0)
1082: (void)chdir(fname);
1083:
1084: /* Run the read loop. */
1085:
1086: p->nline = 1;
1087: p->ncol = 1;
1088: parse_fd(p, fd);
1089:
1090: /* On the top level, finalize the parse tree. */
1091:
1092: if (save_fname == NULL) {
1093: pnode_closetext(p);
1094: if (p->tree->root == NULL)
1095: error_msg(p, "empty document");
1096: else if ((p->tree->flags & TREE_CLOSED) == 0)
1097: warn_msg(p, "document not closed");
1098: pnode_unlink(p->doctype);
1099: }
1100:
1101: /* Clean up. */
1102:
1103: if (fd != STDIN_FILENO)
1104: close(fd);
1105: p->fname = save_fname;
1106: p->nline = save_line;
1107: p->ncol = save_col;
1.1 schwarze 1108: return p->tree;
1109: }
CVSweb