/* Annotation of docbook2mdoc/parse.c, Revision 1.23 */
/* $Id: parse.c,v 1.22 2019/04/07 19:33:27 schwarze Exp $ */
/*
 * Copyright (c) 2014 Kristaps Dzonsons <kristaps@bsd.lv>
 * Copyright (c) 2019 Ingo Schwarze <schwarze@openbsd.org>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */
18: #include <assert.h>
19: #include <ctype.h>
1.6 schwarze 20: #include <stdarg.h>
1.1 schwarze 21: #include <stdio.h>
1.5 schwarze 22: #include <stdlib.h>
1.1 schwarze 23: #include <string.h>
24: #include <unistd.h>
25:
26: #include "node.h"
27: #include "parse.h"
28:
/*
 * The implementation of the DocBook parser.
 */
32:
/*
 * Tokenizer state, carried across calls to parse_string().
 * The order of the constants matters: parse_string() treats every
 * state comparing >= PARSE_ARG as "parsing an attribute value".
 */
enum	pstate {
	PARSE_ELEM,	/* Outside of tags: text, entities, or a new tag. */
	PARSE_TAG,	/* Inside a tag, looking for an attribute name. */
	PARSE_ARG,	/* Expecting an attribute value, possibly quoted. */
	PARSE_SQ,	/* Inside a single-quoted attribute value. */
	PARSE_DQ	/* Inside a double-quoted attribute value. */
};
40:
1.1 schwarze 41: /*
42: * Global parse state.
43: * Keep this as simple and small as possible.
44: */
45: struct parse {
46: const char *fname; /* Name of the input file. */
47: struct ptree *tree; /* Complete parse result. */
1.23 ! schwarze 48: struct pnode *doctype;
1.1 schwarze 49: struct pnode *cur; /* Current node in the tree. */
1.5 schwarze 50: enum nodeid ncur; /* Type of the current node. */
51: int line; /* Line number in the input file. */
52: int col; /* Column number in the input file. */
53: int nline; /* Line number of next token. */
54: int ncol; /* Column number of next token. */
1.4 schwarze 55: int del; /* Levels of nested nodes being deleted. */
1.23 ! schwarze 56: int flags;
! 57: #define PFLAG_WARN (1 << 0) /* Print warning messages. */
! 58: #define PFLAG_SPC (1 << 1) /* Whitespace before the next element. */
! 59: #define PFLAG_ATTR (1 << 2) /* The most recent attribute is valid. */
! 60: #define PFLAG_EEND (1 << 3) /* This element is self-closing. */
1.1 schwarze 61: };
62:
63: struct element {
64: const char *name; /* DocBook element name. */
65: enum nodeid node; /* Node type to generate. */
66: };
67:
68: static const struct element elements[] = {
1.3 schwarze 69: { "acronym", NODE_IGNORE },
1.1 schwarze 70: { "affiliation", NODE_AFFILIATION },
1.4 schwarze 71: { "anchor", NODE_DELETE },
1.22 schwarze 72: { "appendix", NODE_APPENDIX },
1.1 schwarze 73: { "application", NODE_APPLICATION },
74: { "arg", NODE_ARG },
1.22 schwarze 75: { "article", NODE_SECTION },
1.1 schwarze 76: { "author", NODE_AUTHOR },
77: { "authorgroup", NODE_AUTHORGROUP },
78: { "blockquote", NODE_BLOCKQUOTE },
1.22 schwarze 79: { "book", NODE_SECTION },
1.1 schwarze 80: { "bookinfo", NODE_BOOKINFO },
81: { "caution", NODE_CAUTION },
82: { "chapter", NODE_SECTION },
83: { "citerefentry", NODE_CITEREFENTRY },
84: { "citetitle", NODE_CITETITLE },
85: { "cmdsynopsis", NODE_CMDSYNOPSIS },
1.13 schwarze 86: { "code", NODE_LITERAL },
1.1 schwarze 87: { "colspec", NODE_COLSPEC },
88: { "command", NODE_COMMAND },
89: { "constant", NODE_CONSTANT },
1.7 schwarze 90: { "contrib", NODE_CONTRIB },
1.1 schwarze 91: { "copyright", NODE_COPYRIGHT },
92: { "date", NODE_DATE },
1.23 ! schwarze 93: { "!doctype", NODE_DOCTYPE },
! 94: { "!DOCTYPE", NODE_DOCTYPE },
1.1 schwarze 95: { "editor", NODE_EDITOR },
96: { "email", NODE_EMAIL },
97: { "emphasis", NODE_EMPHASIS },
1.23 ! schwarze 98: { "!ENTITY", NODE_ENTITY },
1.1 schwarze 99: { "entry", NODE_ENTRY },
100: { "envar", NODE_ENVAR },
1.13 schwarze 101: { "errorname", NODE_ERRORNAME },
1.1 schwarze 102: { "fieldsynopsis", NODE_FIELDSYNOPSIS },
103: { "filename", NODE_FILENAME },
1.7 schwarze 104: { "firstname", NODE_PERSONNAME },
1.1 schwarze 105: { "firstterm", NODE_FIRSTTERM },
106: { "footnote", NODE_FOOTNOTE },
107: { "funcdef", NODE_FUNCDEF },
108: { "funcprototype", NODE_FUNCPROTOTYPE },
109: { "funcsynopsis", NODE_FUNCSYNOPSIS },
110: { "funcsynopsisinfo", NODE_FUNCSYNOPSISINFO },
111: { "function", NODE_FUNCTION },
1.21 schwarze 112: { "glossary", NODE_VARIABLELIST },
113: { "glossdef", NODE_IGNORE },
114: { "glossdiv", NODE_IGNORE },
115: { "glossentry", NODE_VARLISTENTRY },
116: { "glosslist", NODE_VARIABLELIST },
1.1 schwarze 117: { "glossterm", NODE_GLOSSTERM },
118: { "group", NODE_GROUP },
119: { "holder", NODE_HOLDER },
120: { "index", NODE_INDEX },
1.4 schwarze 121: { "indexterm", NODE_DELETE },
1.1 schwarze 122: { "info", NODE_INFO },
123: { "informalequation", NODE_INFORMALEQUATION },
1.11 schwarze 124: { "informaltable", NODE_TABLE },
1.1 schwarze 125: { "inlineequation", NODE_INLINEEQUATION },
126: { "itemizedlist", NODE_ITEMIZEDLIST },
127: { "keysym", NODE_KEYSYM },
128: { "legalnotice", NODE_LEGALNOTICE },
129: { "link", NODE_LINK },
130: { "listitem", NODE_LISTITEM },
131: { "literal", NODE_LITERAL },
132: { "literallayout", NODE_LITERALLAYOUT },
133: { "manvolnum", NODE_MANVOLNUM },
134: { "member", NODE_MEMBER },
135: { "mml:math", NODE_MML_MATH },
136: { "mml:mfenced", NODE_MML_MFENCED },
137: { "mml:mfrac", NODE_MML_MFRAC },
138: { "mml:mi", NODE_MML_MI },
139: { "mml:mn", NODE_MML_MN },
140: { "mml:mo", NODE_MML_MO },
141: { "mml:mrow", NODE_MML_MROW },
142: { "mml:msub", NODE_MML_MSUB },
143: { "mml:msup", NODE_MML_MSUP },
144: { "modifier", NODE_MODIFIER },
145: { "note", NODE_NOTE },
146: { "option", NODE_OPTION },
147: { "orderedlist", NODE_ORDEREDLIST },
148: { "orgname", NODE_ORGNAME },
1.7 schwarze 149: { "othername", NODE_PERSONNAME },
1.1 schwarze 150: { "para", NODE_PARA },
151: { "paramdef", NODE_PARAMDEF },
152: { "parameter", NODE_PARAMETER },
153: { "part", NODE_SECTION },
154: { "personname", NODE_PERSONNAME },
1.3 schwarze 155: { "phrase", NODE_IGNORE },
1.1 schwarze 156: { "preface", NODE_PREFACE },
1.4 schwarze 157: { "primary", NODE_DELETE },
1.1 schwarze 158: { "programlisting", NODE_PROGRAMLISTING },
159: { "prompt", NODE_PROMPT },
160: { "quote", NODE_QUOTE },
161: { "refclass", NODE_REFCLASS },
162: { "refdescriptor", NODE_REFDESCRIPTOR },
163: { "refentry", NODE_REFENTRY },
164: { "refentryinfo", NODE_REFENTRYINFO },
165: { "refentrytitle", NODE_REFENTRYTITLE },
166: { "refmeta", NODE_REFMETA },
167: { "refmetainfo", NODE_REFMETAINFO },
168: { "refmiscinfo", NODE_REFMISCINFO },
169: { "refname", NODE_REFNAME },
170: { "refnamediv", NODE_REFNAMEDIV },
171: { "refpurpose", NODE_REFPURPOSE },
172: { "refsect1", NODE_SECTION },
173: { "refsect2", NODE_SECTION },
174: { "refsect3", NODE_SECTION },
175: { "refsection", NODE_SECTION },
176: { "refsynopsisdiv", NODE_REFSYNOPSISDIV },
177: { "releaseinfo", NODE_RELEASEINFO },
178: { "replaceable", NODE_REPLACEABLE },
179: { "row", NODE_ROW },
180: { "sbr", NODE_SBR },
181: { "screen", NODE_SCREEN },
1.4 schwarze 182: { "secondary", NODE_DELETE },
1.1 schwarze 183: { "sect1", NODE_SECTION },
184: { "sect2", NODE_SECTION },
185: { "section", NODE_SECTION },
186: { "sgmltag", NODE_SGMLTAG },
1.15 schwarze 187: { "simpara", NODE_PARA },
1.1 schwarze 188: { "simplelist", NODE_SIMPLELIST },
189: { "spanspec", NODE_SPANSPEC },
1.13 schwarze 190: { "structfield", NODE_PARAMETER },
191: { "structname", NODE_TYPE },
1.1 schwarze 192: { "subtitle", NODE_SUBTITLE },
1.7 schwarze 193: { "surname", NODE_PERSONNAME },
1.12 schwarze 194: { "symbol", NODE_CONSTANT },
1.1 schwarze 195: { "synopsis", NODE_SYNOPSIS },
196: { "table", NODE_TABLE },
197: { "tbody", NODE_TBODY },
198: { "term", NODE_TERM },
199: { "tfoot", NODE_TFOOT },
200: { "tgroup", NODE_TGROUP },
201: { "thead", NODE_THEAD },
202: { "tip", NODE_TIP },
203: { "title", NODE_TITLE },
1.3 schwarze 204: { "trademark", NODE_IGNORE },
1.1 schwarze 205: { "type", NODE_TYPE },
1.18 schwarze 206: { "ulink", NODE_LINK },
1.13 schwarze 207: { "userinput", NODE_LITERAL },
1.1 schwarze 208: { "variablelist", NODE_VARIABLELIST },
209: { "varlistentry", NODE_VARLISTENTRY },
210: { "varname", NODE_VARNAME },
211: { "warning", NODE_WARNING },
212: { "wordasword", NODE_WORDASWORD },
1.4 schwarze 213: { "xi:include", NODE_DELETE_WARN },
1.1 schwarze 214: { "year", NODE_YEAR },
1.5 schwarze 215: { NULL, NODE_IGNORE }
1.1 schwarze 216: };
217:
/*
 * One row of the table mapping XML character entity names
 * to the roff(7) text that represents them.
 */
struct entity {
	const char	*name;	/* Entity name, without '&' and ';'. */
	const char	*roff;	/* Replacement text in the output. */
};

/*
 * XML character entity references found in the wild.
 * Those that don't have an exact mandoc_char(7) representation
 * are approximated, and the desired codepoint is given as a comment.
 * Encoding them as \\[u...] would leave -Tascii out in the cold.
 * The table is searched linearly and terminated by a NULL entry.
 */
static	const struct entity entities[] = {
	{ "alpha",	"\\(*a" },
	{ "amp",	"&" },
	{ "apos",	"'" },
	{ "auml",	"\\(:a" },
	{ "beta",	"\\(*b" },
	{ "circ",	"^" },      /* U+02C6 */
	{ "copy",	"\\(co" },
	{ "dagger",	"\\(dg" },
	{ "Delta",	"\\(*D" },
	{ "eacute",	"\\('e" },
	{ "emsp",	"\\ " },    /* U+2003 */
	{ "gt",		">" },
	{ "hairsp",	"\\^" },
	{ "kappa",	"\\(*k" },
	{ "larr",	"\\(<-" },
	{ "ldquo",	"\\(lq" },
	{ "le",		"\\(<=" },
	{ "lowbar",	"_" },
	{ "lsqb",	"[" },
	{ "lt",		"<" },
	{ "mdash",	"\\(em" },
	{ "minus",	"\\-" },
	{ "ndash",	"\\(en" },
	{ "nbsp",	"\\ " },
	{ "num",	"#" },
	{ "oslash",	"\\(/o" },
	{ "ouml",	"\\(:o" },
	{ "percnt",	"%" },
	{ "quot",	"\\(dq" },
	{ "rarr",	"\\(->" },
	{ "rArr",	"\\(rA" },
	{ "rdquo",	"\\(rq" },
	{ "reg",	"\\(rg" },
	{ "rho",	"\\(*r" },
	{ "rsqb",	"]" },
	{ "sigma",	"\\(*s" },
	{ "shy",	"\\&" },    /* U+00AD */
	{ "tau",	"\\(*t" },
	{ "tilde",	"\\[u02DC]" },
	{ "times",	"\\[tmu]" },
	{ "uuml",	"\\(:u" },
	{ NULL,		NULL }
};
273:
/*
 * Forward declaration: xml_entity() calls parse_string() to expand
 * entities defined in the doctype, before parse_string() is defined.
 */
static	size_t	 parse_string(struct parse *, char *, size_t,
			enum pstate *, int);
! 276:
! 277:
1.6 schwarze 278: static void
279: error_msg(struct parse *p, const char *fmt, ...)
280: {
281: va_list ap;
282:
283: fprintf(stderr, "%s:%d:%d: ", p->fname, p->line, p->col);
284: va_start(ap, fmt);
285: vfprintf(stderr, fmt, ap);
286: va_end(ap);
287: fputc('\n', stderr);
288: p->tree->flags |= TREE_FAIL;
289: }
290:
291: static void
292: warn_msg(struct parse *p, const char *fmt, ...)
293: {
294: va_list ap;
295:
1.23 ! schwarze 296: if ((p->flags & PFLAG_WARN) == 0)
1.6 schwarze 297: return;
298:
299: fprintf(stderr, "%s:%d:%d: warning: ", p->fname, p->line, p->col);
300: va_start(ap, fmt);
301: vfprintf(stderr, fmt, ap);
302: va_end(ap);
303: fputc('\n', stderr);
304: }
305:
1.1 schwarze 306: /*
307: * Process a string of characters.
308: * If a text node is already open, append to it.
309: * Otherwise, create a new one as a child of the current node.
310: */
311: static void
1.5 schwarze 312: xml_char(struct parse *ps, const char *p, int sz)
1.1 schwarze 313: {
314: struct pnode *dat;
1.16 schwarze 315: size_t newsz;
1.1 schwarze 316:
1.5 schwarze 317: if (ps->del > 0)
1.1 schwarze 318: return;
319:
1.5 schwarze 320: if (ps->cur == NULL) {
1.6 schwarze 321: error_msg(ps, "discarding text before document: %.*s", sz, p);
1.5 schwarze 322: return;
323: }
324:
1.1 schwarze 325: if (ps->cur->node != NODE_TEXT) {
326: if ((dat = calloc(1, sizeof(*dat))) == NULL) {
327: perror(NULL);
328: exit(1);
329: }
330: dat->node = NODE_TEXT;
1.23 ! schwarze 331: dat->spc = (ps->flags & PFLAG_SPC) != 0;
1.1 schwarze 332: dat->parent = ps->cur;
333: TAILQ_INIT(&dat->childq);
334: TAILQ_INIT(&dat->attrq);
335: TAILQ_INSERT_TAIL(&ps->cur->childq, dat, child);
336: ps->cur = dat;
337: }
338:
1.5 schwarze 339: if (ps->tree->flags & TREE_CLOSED &&
1.6 schwarze 340: ps->cur->parent == ps->tree->root)
341: warn_msg(ps, "text after end of document: %.*s", sz, p);
1.5 schwarze 342:
1.1 schwarze 343: /* Append to the current text node. */
344:
345: assert(sz >= 0);
1.23 ! schwarze 346: newsz = ps->cur->bsz + (ps->cur->bsz && (ps->flags & PFLAG_SPC)) + sz;
1.16 schwarze 347: ps->cur->b = realloc(ps->cur->b, newsz + 1);
1.1 schwarze 348: if (ps->cur->b == NULL) {
349: perror(NULL);
350: exit(1);
351: }
1.23 ! schwarze 352: if (ps->cur->bsz && (ps->flags & PFLAG_SPC))
1.16 schwarze 353: ps->cur->b[ps->cur->bsz++] = ' ';
1.1 schwarze 354: memcpy(ps->cur->b + ps->cur->bsz, p, sz);
1.16 schwarze 355: ps->cur->b[ps->cur->bsz = newsz] = '\0';
1.1 schwarze 356: ps->cur->real = ps->cur->b;
1.23 ! schwarze 357: ps->flags &= ~PFLAG_SPC;
1.1 schwarze 358: }
359:
1.16 schwarze 360: /*
361: * Close out the text node and strip trailing whitespace, if one is open.
362: */
1.1 schwarze 363: static void
1.16 schwarze 364: pnode_closetext(struct parse *p)
1.1 schwarze 365: {
1.16 schwarze 366: struct pnode *n;
367:
368: if ((n = p->cur) == NULL || n->node != NODE_TEXT)
369: return;
370: p->cur = n->parent;
371: while (n->bsz > 0 && isspace((unsigned char)n->b[n->bsz - 1])) {
372: n->b[--n->bsz] = '\0';
1.23 ! schwarze 373: p->flags |= PFLAG_SPC;
1.16 schwarze 374: }
1.1 schwarze 375: }
376:
1.9 schwarze 377: static void
378: xml_entity(struct parse *p, const char *name)
379: {
380: const struct entity *entity;
381: struct pnode *dat;
1.23 ! schwarze 382: const char *ccp;
! 383: char *cp;
! 384: enum pstate pstate;
1.9 schwarze 385:
386: if (p->del > 0)
387: return;
388:
389: if (p->cur == NULL) {
390: error_msg(p, "discarding entity before document: &%s;", name);
391: return;
392: }
393:
1.16 schwarze 394: pnode_closetext(p);
1.9 schwarze 395:
396: if (p->tree->flags & TREE_CLOSED && p->cur == p->tree->root)
397: warn_msg(p, "entity after end of document: &%s;", name);
398:
399: for (entity = entities; entity->name != NULL; entity++)
400: if (strcmp(name, entity->name) == 0)
401: break;
402:
403: if (entity->roff == NULL) {
1.23 ! schwarze 404: if (p->doctype != NULL) {
! 405: TAILQ_FOREACH(dat, &p->doctype->childq, child) {
! 406: if ((ccp = pnode_getattr_raw(dat,
! 407: ATTRKEY_NAME, NULL)) == NULL ||
! 408: strcmp(ccp, name) != 0 ||
! 409: (ccp = pnode_getattr_raw(dat,
! 410: ATTRKEY_DEFINITION, NULL)) == NULL)
! 411: continue;
! 412: if ((cp = strdup(ccp)) == NULL) {
! 413: perror(NULL);
! 414: exit(1);
! 415: }
! 416: pstate = PARSE_ELEM;
! 417: parse_string(p, cp, strlen(cp), &pstate, 0);
! 418: p->flags &= ~PFLAG_SPC;
! 419: free(cp);
! 420: return;
! 421: }
! 422: }
1.9 schwarze 423: error_msg(p, "unknown entity &%s;", name);
424: return;
425: }
426:
427: /* Create, append, and close out an entity node. */
428: if ((dat = calloc(1, sizeof(*dat))) == NULL ||
429: (dat->b = dat->real = strdup(entity->roff)) == NULL) {
430: perror(NULL);
431: exit(1);
432: }
433: dat->node = NODE_ESCAPE;
434: dat->bsz = strlen(dat->b);
1.23 ! schwarze 435: dat->spc = (p->flags & PFLAG_SPC) != 0;
1.9 schwarze 436: dat->parent = p->cur;
437: TAILQ_INIT(&dat->childq);
438: TAILQ_INIT(&dat->attrq);
439: TAILQ_INSERT_TAIL(&p->cur->childq, dat, child);
1.23 ! schwarze 440: p->flags &= ~PFLAG_SPC;
1.9 schwarze 441: }
442:
1.1 schwarze 443: /*
444: * Begin an element.
445: */
446: static void
1.5 schwarze 447: xml_elem_start(struct parse *ps, const char *name)
1.1 schwarze 448: {
1.5 schwarze 449: const struct element *elem;
450: struct pnode *dat;
1.1 schwarze 451:
1.4 schwarze 452: /*
453: * An ancestor is excluded from the tree;
454: * keep track of the number of levels excluded.
455: */
456: if (ps->del > 0) {
1.23 ! schwarze 457: if (*name != '!' && *name != '?')
! 458: ps->del++;
1.4 schwarze 459: return;
460: }
461:
1.16 schwarze 462: pnode_closetext(ps);
1.1 schwarze 463:
464: for (elem = elements; elem->name != NULL; elem++)
465: if (strcmp(elem->name, name) == 0)
466: break;
467:
1.23 ! schwarze 468: if (elem->name == NULL) {
! 469: if (*name == '!' || *name == '?')
! 470: return;
1.6 schwarze 471: error_msg(ps, "unknown element <%s>", name);
1.23 ! schwarze 472: }
1.6 schwarze 473:
1.5 schwarze 474: ps->ncur = elem->node;
1.1 schwarze 475:
1.5 schwarze 476: switch (ps->ncur) {
1.4 schwarze 477: case NODE_DELETE_WARN:
1.6 schwarze 478: warn_msg(ps, "skipping element <%s>", name);
1.2 schwarze 479: /* FALLTHROUGH */
1.4 schwarze 480: case NODE_DELETE:
481: ps->del = 1;
482: /* FALLTHROUGH */
1.2 schwarze 483: case NODE_IGNORE:
484: return;
485: case NODE_INLINEEQUATION:
1.1 schwarze 486: ps->tree->flags |= TREE_EQN;
1.2 schwarze 487: break;
488: default:
489: break;
490: }
1.1 schwarze 491:
1.6 schwarze 492: if (ps->tree->flags & TREE_CLOSED && ps->cur->parent == NULL)
493: warn_msg(ps, "element after end of document: <%s>", name);
1.5 schwarze 494:
1.1 schwarze 495: if ((dat = calloc(1, sizeof(*dat))) == NULL) {
496: perror(NULL);
497: exit(1);
498: }
1.17 schwarze 499:
500: /*
501: * Nodes that begin a new macro or request line or start by
502: * printing text always want whitespace before themselves.
503: */
504:
505: switch (dat->node = elem->node) {
1.23 ! schwarze 506: case NODE_DOCTYPE:
! 507: case NODE_ENTITY:
! 508: case NODE_SBR:
! 509: ps->flags |= PFLAG_EEND;
! 510: /* FALLTHROUGH */
1.22 schwarze 511: case NODE_APPENDIX:
1.17 schwarze 512: case NODE_AUTHORGROUP:
1.20 schwarze 513: case NODE_BLOCKQUOTE:
1.17 schwarze 514: case NODE_BOOKINFO:
515: case NODE_CAUTION:
516: case NODE_EDITOR:
517: case NODE_ENTRY:
518: case NODE_FUNCDEF:
519: case NODE_FUNCPROTOTYPE:
520: case NODE_INFORMALEQUATION:
521: case NODE_INLINEEQUATION:
522: case NODE_ITEMIZEDLIST:
523: case NODE_LEGALNOTICE:
524: case NODE_LITERALLAYOUT:
525: case NODE_NOTE:
526: case NODE_ORDEREDLIST:
527: case NODE_PARA:
528: case NODE_PREFACE:
529: case NODE_PROGRAMLISTING:
530: case NODE_REFMETA:
531: case NODE_REFNAMEDIV:
532: case NODE_REFSYNOPSISDIV:
533: case NODE_ROW:
534: case NODE_SCREEN:
535: case NODE_SECTION:
536: case NODE_SYNOPSIS:
537: case NODE_TGROUP:
538: case NODE_TIP:
539: case NODE_TITLE:
540: case NODE_VARIABLELIST:
541: case NODE_VARLISTENTRY:
542: case NODE_WARNING:
543: dat->spc = 1;
544: break;
545: default:
1.23 ! schwarze 546: dat->spc = (ps->flags & PFLAG_SPC) != 0;
1.17 schwarze 547: break;
548: }
1.1 schwarze 549: dat->parent = ps->cur;
550: TAILQ_INIT(&dat->childq);
551: TAILQ_INIT(&dat->attrq);
552:
553: if (ps->cur != NULL)
554: TAILQ_INSERT_TAIL(&ps->cur->childq, dat, child);
555:
556: ps->cur = dat;
1.23 ! schwarze 557: if (dat->node == NODE_DOCTYPE) {
! 558: if (ps->doctype == NULL)
! 559: ps->doctype = dat;
! 560: else
! 561: error_msg(ps, "duplicate doctype");
! 562: } else if (dat->parent == NULL && ps->tree->root == NULL)
1.1 schwarze 563: ps->tree->root = dat;
1.5 schwarze 564: }
565:
566: static void
567: xml_attrkey(struct parse *ps, const char *name)
568: {
569: struct pattr *attr;
1.23 ! schwarze 570: const char *value;
1.5 schwarze 571: enum attrkey key;
1.1 schwarze 572:
1.19 schwarze 573: if (ps->del > 0 || ps->ncur == NODE_IGNORE || *name == '\0')
1.5 schwarze 574: return;
1.23 ! schwarze 575:
! 576: if ((ps->ncur == NODE_DOCTYPE || ps->ncur == NODE_ENTITY) &&
! 577: TAILQ_FIRST(&ps->cur->attrq) == NULL) {
! 578: value = name;
! 579: name = "NAME";
! 580: } else
! 581: value = NULL;
! 582:
1.5 schwarze 583: if ((key = attrkey_parse(name)) == ATTRKEY__MAX) {
1.23 ! schwarze 584: ps->flags &= ~PFLAG_ATTR;
1.5 schwarze 585: return;
586: }
587: if ((attr = calloc(1, sizeof(*attr))) == NULL) {
588: perror(NULL);
589: exit(1);
590: }
591: attr->key = key;
592: attr->val = ATTRVAL__MAX;
1.23 ! schwarze 593: if (value == NULL) {
! 594: attr->rawval = NULL;
! 595: ps->flags |= PFLAG_ATTR;
! 596: } else {
! 597: if ((attr->rawval = strdup(value)) == NULL) {
! 598: perror(NULL);
! 599: exit(1);
! 600: }
! 601: ps->flags &= ~PFLAG_ATTR;
! 602: }
1.5 schwarze 603: TAILQ_INSERT_TAIL(&ps->cur->attrq, attr, child);
1.23 ! schwarze 604: if (ps->ncur == NODE_ENTITY && key == ATTRKEY_NAME)
! 605: xml_attrkey(ps, "DEFINITION");
1.5 schwarze 606: }
607:
608: static void
609: xml_attrval(struct parse *ps, const char *name)
610: {
611: struct pattr *attr;
612:
1.23 ! schwarze 613: if (ps->del > 0 || ps->ncur == NODE_IGNORE ||
! 614: (ps->flags & PFLAG_ATTR) == 0)
1.5 schwarze 615: return;
616: if ((attr = TAILQ_LAST(&ps->cur->attrq, pattrq)) == NULL)
617: return;
618: if ((attr->val = attrval_parse(name)) == ATTRVAL__MAX &&
619: (attr->rawval = strdup(name)) == NULL) {
620: perror(NULL);
621: exit(1);
1.1 schwarze 622: }
623: }
624:
625: /*
626: * Roll up the parse tree.
627: * If we're at a text node, roll that one up first.
628: */
629: static void
1.5 schwarze 630: xml_elem_end(struct parse *ps, const char *name)
1.1 schwarze 631: {
1.5 schwarze 632: const struct element *elem;
633: enum nodeid node;
1.1 schwarze 634:
1.4 schwarze 635: /*
636: * An ancestor is excluded from the tree;
637: * keep track of the number of levels excluded.
638: */
639: if (ps->del > 1) {
640: ps->del--;
641: return;
642: }
643:
1.16 schwarze 644: if (ps->del == 0)
645: pnode_closetext(ps);
1.2 schwarze 646:
1.5 schwarze 647: if (name != NULL) {
648: for (elem = elements; elem->name != NULL; elem++)
649: if (strcmp(elem->name, name) == 0)
650: break;
651: node = elem->node;
652: } else
653: node = ps->ncur;
1.2 schwarze 654:
1.5 schwarze 655: switch (node) {
1.4 schwarze 656: case NODE_DELETE_WARN:
657: case NODE_DELETE:
1.5 schwarze 658: if (ps->del > 0)
659: ps->del--;
1.4 schwarze 660: break;
1.2 schwarze 661: case NODE_IGNORE:
662: break;
1.23 ! schwarze 663: case NODE_DOCTYPE:
! 664: ps->flags &= ~PFLAG_EEND;
! 665: /* FALLTHROUGH */
1.2 schwarze 666: default:
1.5 schwarze 667: if (ps->cur == NULL || node != ps->cur->node) {
1.6 schwarze 668: warn_msg(ps, "element not open: </%s>", name);
1.5 schwarze 669: break;
670: }
671:
672: /*
673: * Refrain from actually closing the document element.
674: * If no more content follows, no harm is done, but if
675: * some content still follows, simply processing it is
676: * obviously better than discarding it or crashing.
677: */
678:
1.23 ! schwarze 679: if (ps->cur->parent != NULL || node == NODE_DOCTYPE) {
! 680: ps->cur = ps->cur->parent;
! 681: if (ps->cur != NULL)
! 682: ps->ncur = ps->cur->node;
! 683: } else
1.5 schwarze 684: ps->tree->flags |= TREE_CLOSED;
1.23 ! schwarze 685: ps->flags &= ~PFLAG_SPC;
1.4 schwarze 686: break;
1.2 schwarze 687: }
1.4 schwarze 688: assert(ps->del == 0);
1.1 schwarze 689: }
690:
691: struct parse *
692: parse_alloc(int warn)
693: {
694: struct parse *p;
695:
696: if ((p = calloc(1, sizeof(*p))) == NULL)
697: return NULL;
698:
699: if ((p->tree = calloc(1, sizeof(*p->tree))) == NULL) {
700: free(p);
701: return NULL;
702: }
1.23 ! schwarze 703: if (warn)
! 704: p->flags |= PFLAG_WARN;
! 705: else
! 706: p->flags &= ~PFLAG_WARN;
1.1 schwarze 707: return p;
708: }
709:
710: void
711: parse_free(struct parse *p)
712: {
713: if (p == NULL)
714: return;
715: if (p->tree != NULL) {
716: pnode_unlink(p->tree->root);
717: free(p->tree);
718: }
719: free(p);
720: }
721:
1.14 schwarze 722: static void
723: increment(struct parse *p, char *b, size_t *pend, int refill)
724: {
725: if (refill) {
726: if (b[*pend] == '\n') {
727: p->nline++;
728: p->ncol = 1;
729: } else
730: p->ncol++;
731: }
732: ++*pend;
733: }
734:
1.5 schwarze 735: /*
736: * Advance the pend pointer to the next character in the charset.
737: * If the charset starts with a space, it stands for any whitespace.
738: * Update the new input file position, used for messages.
739: * Do not overrun the buffer b of length rlen.
740: * When reaching the end, NUL-terminate the buffer and return 1;
741: * otherwise, return 0.
742: */
743: static int
744: advance(struct parse *p, char *b, size_t rlen, size_t *pend,
1.14 schwarze 745: const char *charset, int refill)
1.5 schwarze 746: {
747: int space;
748:
749: if (*charset == ' ') {
750: space = 1;
751: charset++;
752: } else
753: space = 0;
754:
1.14 schwarze 755: if (refill) {
756: p->nline = p->line;
757: p->ncol = p->col;
758: }
1.5 schwarze 759: while (*pend < rlen) {
760: if (space && isspace((unsigned char)b[*pend]))
761: break;
762: if (strchr(charset, b[*pend]) != NULL)
763: break;
1.14 schwarze 764: increment(p, b, pend, refill);
1.5 schwarze 765: }
766: if (*pend == rlen) {
767: b[rlen] = '\0';
1.14 schwarze 768: return refill;
1.5 schwarze 769: } else
770: return 0;
771: }
772:
1.14 schwarze 773: size_t
774: parse_string(struct parse *p, char *b, size_t rlen,
775: enum pstate *pstate, int refill)
776: {
777: char *cp;
778: size_t poff; /* Parse offset in b[]. */
779: size_t pend; /* Offset of the end of the current word. */
780: int elem_end;
781:
782: pend = 0;
783: for (;;) {
784:
785: /* Proceed to the next token, skipping whitespace. */
786:
787: if (refill) {
788: p->line = p->nline;
789: p->col = p->ncol;
790: }
791: if ((poff = pend) == rlen)
792: break;
793: if (isspace((unsigned char)b[pend])) {
1.23 ! schwarze 794: p->flags |= PFLAG_SPC;
1.14 schwarze 795: increment(p, b, &pend, refill);
796: continue;
797: }
798:
799: /*
800: * The following four cases (ARG, TAG, and starting an
801: * entity or a tag) all parse a word or quoted string.
802: * If that extends beyond the read buffer and the last
803: * read(2) still got data, they all break out of the
804: * token loop to request more data from the read loop.
805: *
806: * Also, three of them detect self-closing tags, those
807: * ending with "/>", setting the flag elem_end and
808: * calling xml_elem_end() at the very end, after
809: * handling the attribute value, attribute name, or
810: * tag name, respectively.
811: */
812:
813: /* Parse an attribute value. */
814:
815: if (*pstate >= PARSE_ARG) {
816: if (*pstate == PARSE_ARG &&
817: (b[pend] == '\'' || b[pend] == '"')) {
818: *pstate = b[pend] == '"' ?
819: PARSE_DQ : PARSE_SQ;
820: increment(p, b, &pend, refill);
821: continue;
822: }
823: if (advance(p, b, rlen, &pend,
824: *pstate == PARSE_DQ ? "\"" :
825: *pstate == PARSE_SQ ? "'" : " >", refill))
826: break;
827: *pstate = PARSE_TAG;
828: elem_end = 0;
829: if (b[pend] == '>') {
830: *pstate = PARSE_ELEM;
831: if (pend > 0 && b[pend - 1] == '/') {
832: b[pend - 1] = '\0';
833: elem_end = 1;
834: }
1.23 ! schwarze 835: if (p->flags & PFLAG_EEND)
! 836: elem_end = 1;
1.14 schwarze 837: }
838: b[pend] = '\0';
839: if (pend < rlen)
840: increment(p, b, &pend, refill);
841: xml_attrval(p, b + poff);
842: if (elem_end)
843: xml_elem_end(p, NULL);
844:
845: /* Look for an attribute name. */
846:
847: } else if (*pstate == PARSE_TAG) {
1.23 ! schwarze 848: switch (p->ncur) {
! 849: case NODE_DOCTYPE:
! 850: if (b[pend] == '[') {
! 851: *pstate = PARSE_ELEM;
! 852: increment(p, b, &pend, refill);
! 853: continue;
! 854: }
! 855: /* FALLTHROUGH */
! 856: case NODE_ENTITY:
! 857: if (b[pend] == '"' || b[pend] == '\'') {
! 858: *pstate = PARSE_ARG;
! 859: continue;
! 860: }
! 861: break;
! 862: default:
! 863: break;
! 864: }
1.14 schwarze 865: if (advance(p, b, rlen, &pend, " =>", refill))
866: break;
867: elem_end = 0;
868: switch (b[pend]) {
869: case '>':
870: *pstate = PARSE_ELEM;
871: if (pend > 0 && b[pend - 1] == '/') {
872: b[pend - 1] = '\0';
873: elem_end = 1;
874: }
1.23 ! schwarze 875: if (p->flags & PFLAG_EEND)
! 876: elem_end = 1;
1.14 schwarze 877: break;
878: case '=':
879: *pstate = PARSE_ARG;
880: break;
881: default:
882: break;
883: }
884: b[pend] = '\0';
885: if (pend < rlen)
886: increment(p, b, &pend, refill);
887: xml_attrkey(p, b + poff);
888: if (elem_end)
889: xml_elem_end(p, NULL);
890:
891: /* Begin an opening or closing tag. */
892:
893: } else if (b[poff] == '<') {
894: if (advance(p, b, rlen, &pend, " >", refill))
895: break;
896: if (pend > poff + 3 &&
897: strncmp(b + poff, "<!--", 4) == 0) {
898:
899: /* Skip a comment. */
900:
901: cp = strstr(b + pend - 2, "-->");
902: if (cp == NULL) {
903: if (refill)
904: break;
905: cp = b + rlen;
906: } else
907: cp += 3;
908: while (b + pend < cp)
909: increment(p, b, &pend, refill);
910: continue;
911: }
912: elem_end = 0;
913: if (b[pend] != '>')
914: *pstate = PARSE_TAG;
915: else if (pend > 0 && b[pend - 1] == '/') {
916: b[pend - 1] = '\0';
917: elem_end = 1;
918: }
919: b[pend] = '\0';
920: if (pend < rlen)
921: increment(p, b, &pend, refill);
922: if (b[++poff] == '/') {
923: elem_end = 1;
924: poff++;
1.23 ! schwarze 925: } else {
1.14 schwarze 926: xml_elem_start(p, b + poff);
1.23 ! schwarze 927: if (*pstate == PARSE_ELEM &&
! 928: p->flags & PFLAG_EEND)
! 929: elem_end = 1;
! 930: }
1.14 schwarze 931: if (elem_end)
932: xml_elem_end(p, b + poff);
933:
1.23 ! schwarze 934: /* Close a doctype. */
! 935:
! 936: } else if (p->ncur == NODE_DOCTYPE && b[poff] == ']') {
! 937: *pstate = PARSE_TAG;
! 938: increment(p, b, &pend, refill);
! 939:
1.14 schwarze 940: /* Process an entity. */
941:
942: } else if (b[poff] == '&') {
943: if (advance(p, b, rlen, &pend, ";", refill))
944: break;
945: b[pend] = '\0';
946: if (pend < rlen)
947: increment(p, b, &pend, refill);
948: xml_entity(p, b + poff + 1);
949:
950: /* Process text up to the next tag, entity, or EOL. */
951:
952: } else {
953: advance(p, b, rlen, &pend, "<&", refill);
954: xml_char(p, b + poff, pend - poff);
955: }
956: }
957: return poff;
958: }
959:
1.1 schwarze 960: struct ptree *
961: parse_file(struct parse *p, int fd, const char *fname)
962: {
963: char b[4096];
1.5 schwarze 964: ssize_t rsz; /* Return value from read(2). */
1.14 schwarze 965: size_t rlen; /* Number of bytes in b[]. */
1.5 schwarze 966: size_t poff; /* Parse offset in b[]. */
1.14 schwarze 967: enum pstate pstate;
1.1 schwarze 968:
969: p->fname = fname;
1.5 schwarze 970: p->nline = 1;
971: p->ncol = 1;
1.14 schwarze 972: pstate = PARSE_ELEM;
1.5 schwarze 973: rlen = 0;
974:
975: /*
976: * Read loop.
977: *
1.14 schwarze 978: * If the previous token was incomplete and asked for more
979: * input, we have to enter the read loop once more even on EOF.
1.5 schwarze 980: * Once rsz is 0, incomplete tokens will no longer ask
981: * for more input but instead use whatever there is,
982: * and then exit the read loop.
983: * The minus one on the size limit for read(2) is needed
984: * such that advance() can set b[rlen] to NUL when needed.
985: */
986:
1.14 schwarze 987: while ((rsz = read(fd, b + rlen, sizeof(b) - rlen - 1)) >= 0 &&
988: (rlen += rsz) > 0) {
989: poff = parse_string(p, b, rlen, &pstate, rsz > 0);
1.5 schwarze 990: /* Buffer exhausted; shift left and re-fill. */
991: assert(poff > 0);
992: rlen -= poff;
1.14 schwarze 993: memmove(b, b + poff, rlen);
1.5 schwarze 994: }
995: if (rsz < 0) {
996: perror(fname);
997: p->tree->flags |= TREE_FAIL;
998: }
1.16 schwarze 999: pnode_closetext(p);
1.6 schwarze 1000: if ((p->tree->flags & TREE_CLOSED) == 0)
1001: warn_msg(p, "document not closed");
1.23 ! schwarze 1002: pnode_unlink(p->doctype);
1.1 schwarze 1003: return p->tree;
1004: }
/* CVSweb */