Annotation of docbook2mdoc/parse.c, Revision 1.21
1.21 ! schwarze 1: /* $Id: parse.c,v 1.20 2019/04/07 17:55:18 schwarze Exp $ */
1.1 schwarze 2: /*
3: * Copyright (c) 2014 Kristaps Dzonsons <kristaps@bsd.lv>
4: * Copyright (c) 2019 Ingo Schwarze <schwarze@openbsd.org>
5: *
6: * Permission to use, copy, modify, and distribute this software for any
7: * purpose with or without fee is hereby granted, provided that the above
8: * copyright notice and this permission notice appear in all copies.
9: *
10: * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
11: * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12: * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
13: * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14: * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15: * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16: * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17: */
18: #include <assert.h>
19: #include <ctype.h>
1.6 schwarze 20: #include <stdarg.h>
1.1 schwarze 21: #include <stdio.h>
1.5 schwarze 22: #include <stdlib.h>
1.1 schwarze 23: #include <string.h>
24: #include <unistd.h>
25:
26: #include "node.h"
27: #include "parse.h"
28:
29: /*
30: * The implementation of the DocBook parser.
31: */
32:
/*
 * Tokenizer state carried across calls to parse_string().
 * The numeric order is significant: the tokenizer treats every
 * state that compares >= PARSE_ARG as "inside an attribute value".
 */
enum pstate {
	PARSE_ELEM = 0,	/* Outside of any tag: text, entities, tag starts. */
	PARSE_TAG = 1,	/* Inside a tag: looking for an attribute name. */
	PARSE_ARG = 2,	/* After '=': an attribute value comes next. */
	PARSE_SQ = 3,	/* Inside a single-quoted attribute value. */
	PARSE_DQ = 4	/* Inside a double-quoted attribute value. */
};
40:
1.1 schwarze 41: /*
42: * Global parse state.
43: * Keep this as simple and small as possible.
44: */
45: struct parse {
46: const char *fname; /* Name of the input file. */
47: struct ptree *tree; /* Complete parse result. */
48: struct pnode *cur; /* Current node in the tree. */
1.5 schwarze 49: enum nodeid ncur; /* Type of the current node. */
50: int line; /* Line number in the input file. */
51: int col; /* Column number in the input file. */
52: int nline; /* Line number of next token. */
53: int ncol; /* Column number of next token. */
1.4 schwarze 54: int del; /* Levels of nested nodes being deleted. */
1.16 schwarze 55: int spc; /* Whitespace before the next element. */
1.5 schwarze 56: int attr; /* The most recent attribute is valid. */
1.1 schwarze 57: int warn;
58: };
59:
60: struct element {
61: const char *name; /* DocBook element name. */
62: enum nodeid node; /* Node type to generate. */
63: };
64:
65: static const struct element elements[] = {
1.3 schwarze 66: { "acronym", NODE_IGNORE },
1.1 schwarze 67: { "affiliation", NODE_AFFILIATION },
1.4 schwarze 68: { "anchor", NODE_DELETE },
1.1 schwarze 69: { "application", NODE_APPLICATION },
70: { "arg", NODE_ARG },
71: { "author", NODE_AUTHOR },
72: { "authorgroup", NODE_AUTHORGROUP },
73: { "blockquote", NODE_BLOCKQUOTE },
74: { "book", NODE_BOOK },
75: { "bookinfo", NODE_BOOKINFO },
76: { "caution", NODE_CAUTION },
77: { "chapter", NODE_SECTION },
78: { "citerefentry", NODE_CITEREFENTRY },
79: { "citetitle", NODE_CITETITLE },
80: { "cmdsynopsis", NODE_CMDSYNOPSIS },
1.13 schwarze 81: { "code", NODE_LITERAL },
1.1 schwarze 82: { "colspec", NODE_COLSPEC },
83: { "command", NODE_COMMAND },
84: { "constant", NODE_CONSTANT },
1.7 schwarze 85: { "contrib", NODE_CONTRIB },
1.1 schwarze 86: { "copyright", NODE_COPYRIGHT },
87: { "date", NODE_DATE },
88: { "editor", NODE_EDITOR },
89: { "email", NODE_EMAIL },
90: { "emphasis", NODE_EMPHASIS },
91: { "entry", NODE_ENTRY },
92: { "envar", NODE_ENVAR },
1.13 schwarze 93: { "errorname", NODE_ERRORNAME },
1.1 schwarze 94: { "fieldsynopsis", NODE_FIELDSYNOPSIS },
95: { "filename", NODE_FILENAME },
1.7 schwarze 96: { "firstname", NODE_PERSONNAME },
1.1 schwarze 97: { "firstterm", NODE_FIRSTTERM },
98: { "footnote", NODE_FOOTNOTE },
99: { "funcdef", NODE_FUNCDEF },
100: { "funcprototype", NODE_FUNCPROTOTYPE },
101: { "funcsynopsis", NODE_FUNCSYNOPSIS },
102: { "funcsynopsisinfo", NODE_FUNCSYNOPSISINFO },
103: { "function", NODE_FUNCTION },
1.21 ! schwarze 104: { "glossary", NODE_VARIABLELIST },
! 105: { "glossdef", NODE_IGNORE },
! 106: { "glossdiv", NODE_IGNORE },
! 107: { "glossentry", NODE_VARLISTENTRY },
! 108: { "glosslist", NODE_VARIABLELIST },
1.1 schwarze 109: { "glossterm", NODE_GLOSSTERM },
110: { "group", NODE_GROUP },
111: { "holder", NODE_HOLDER },
112: { "index", NODE_INDEX },
1.4 schwarze 113: { "indexterm", NODE_DELETE },
1.1 schwarze 114: { "info", NODE_INFO },
115: { "informalequation", NODE_INFORMALEQUATION },
1.11 schwarze 116: { "informaltable", NODE_TABLE },
1.1 schwarze 117: { "inlineequation", NODE_INLINEEQUATION },
118: { "itemizedlist", NODE_ITEMIZEDLIST },
119: { "keysym", NODE_KEYSYM },
120: { "legalnotice", NODE_LEGALNOTICE },
121: { "link", NODE_LINK },
122: { "listitem", NODE_LISTITEM },
123: { "literal", NODE_LITERAL },
124: { "literallayout", NODE_LITERALLAYOUT },
125: { "manvolnum", NODE_MANVOLNUM },
126: { "member", NODE_MEMBER },
127: { "mml:math", NODE_MML_MATH },
128: { "mml:mfenced", NODE_MML_MFENCED },
129: { "mml:mfrac", NODE_MML_MFRAC },
130: { "mml:mi", NODE_MML_MI },
131: { "mml:mn", NODE_MML_MN },
132: { "mml:mo", NODE_MML_MO },
133: { "mml:mrow", NODE_MML_MROW },
134: { "mml:msub", NODE_MML_MSUB },
135: { "mml:msup", NODE_MML_MSUP },
136: { "modifier", NODE_MODIFIER },
137: { "note", NODE_NOTE },
138: { "option", NODE_OPTION },
139: { "orderedlist", NODE_ORDEREDLIST },
140: { "orgname", NODE_ORGNAME },
1.7 schwarze 141: { "othername", NODE_PERSONNAME },
1.1 schwarze 142: { "para", NODE_PARA },
143: { "paramdef", NODE_PARAMDEF },
144: { "parameter", NODE_PARAMETER },
145: { "part", NODE_SECTION },
146: { "personname", NODE_PERSONNAME },
1.3 schwarze 147: { "phrase", NODE_IGNORE },
1.1 schwarze 148: { "preface", NODE_PREFACE },
1.4 schwarze 149: { "primary", NODE_DELETE },
1.1 schwarze 150: { "programlisting", NODE_PROGRAMLISTING },
151: { "prompt", NODE_PROMPT },
152: { "quote", NODE_QUOTE },
153: { "refclass", NODE_REFCLASS },
154: { "refdescriptor", NODE_REFDESCRIPTOR },
155: { "refentry", NODE_REFENTRY },
156: { "refentryinfo", NODE_REFENTRYINFO },
157: { "refentrytitle", NODE_REFENTRYTITLE },
158: { "refmeta", NODE_REFMETA },
159: { "refmetainfo", NODE_REFMETAINFO },
160: { "refmiscinfo", NODE_REFMISCINFO },
161: { "refname", NODE_REFNAME },
162: { "refnamediv", NODE_REFNAMEDIV },
163: { "refpurpose", NODE_REFPURPOSE },
164: { "refsect1", NODE_SECTION },
165: { "refsect2", NODE_SECTION },
166: { "refsect3", NODE_SECTION },
167: { "refsection", NODE_SECTION },
168: { "refsynopsisdiv", NODE_REFSYNOPSISDIV },
169: { "releaseinfo", NODE_RELEASEINFO },
170: { "replaceable", NODE_REPLACEABLE },
171: { "row", NODE_ROW },
172: { "sbr", NODE_SBR },
173: { "screen", NODE_SCREEN },
1.4 schwarze 174: { "secondary", NODE_DELETE },
1.1 schwarze 175: { "sect1", NODE_SECTION },
176: { "sect2", NODE_SECTION },
177: { "section", NODE_SECTION },
178: { "sgmltag", NODE_SGMLTAG },
1.15 schwarze 179: { "simpara", NODE_PARA },
1.1 schwarze 180: { "simplelist", NODE_SIMPLELIST },
181: { "spanspec", NODE_SPANSPEC },
1.13 schwarze 182: { "structfield", NODE_PARAMETER },
183: { "structname", NODE_TYPE },
1.1 schwarze 184: { "subtitle", NODE_SUBTITLE },
1.7 schwarze 185: { "surname", NODE_PERSONNAME },
1.12 schwarze 186: { "symbol", NODE_CONSTANT },
1.1 schwarze 187: { "synopsis", NODE_SYNOPSIS },
188: { "table", NODE_TABLE },
189: { "tbody", NODE_TBODY },
190: { "term", NODE_TERM },
191: { "tfoot", NODE_TFOOT },
192: { "tgroup", NODE_TGROUP },
193: { "thead", NODE_THEAD },
194: { "tip", NODE_TIP },
195: { "title", NODE_TITLE },
1.3 schwarze 196: { "trademark", NODE_IGNORE },
1.1 schwarze 197: { "type", NODE_TYPE },
1.18 schwarze 198: { "ulink", NODE_LINK },
1.13 schwarze 199: { "userinput", NODE_LITERAL },
1.1 schwarze 200: { "variablelist", NODE_VARIABLELIST },
201: { "varlistentry", NODE_VARLISTENTRY },
202: { "varname", NODE_VARNAME },
203: { "warning", NODE_WARNING },
204: { "wordasword", NODE_WORDASWORD },
1.4 schwarze 205: { "xi:include", NODE_DELETE_WARN },
1.1 schwarze 206: { "year", NODE_YEAR },
1.5 schwarze 207: { NULL, NODE_IGNORE }
1.1 schwarze 208: };
209:
/*
 * One row of the entity translation table.
 */
struct entity {
	const char	*name;	/* Entity name without the '&' and ';'. */
	const char	*roff;	/* Replacement text for the output. */
};

/*
 * XML character entity references that occur in real documents.
 * Where mandoc_char(7) has no exact representation, an approximation
 * is used and the intended codepoint is noted in a comment.
 * Writing those as \\[u...] would not help -Tascii output.
 * The entry with NULL members terminates the table.
 */
static const struct entity entities[] = {
	{ .name = "alpha",	.roff = "\\(*a" },
	{ .name = "amp",	.roff = "&" },
	{ .name = "apos",	.roff = "'" },
	{ .name = "auml",	.roff = "\\(:a" },
	{ .name = "beta",	.roff = "\\(*b" },
	{ .name = "circ",	.roff = "^" },		/* U+02C6 */
	{ .name = "copy",	.roff = "\\(co" },
	{ .name = "dagger",	.roff = "\\(dg" },
	{ .name = "Delta",	.roff = "\\(*D" },
	{ .name = "eacute",	.roff = "\\('e" },
	{ .name = "emsp",	.roff = "\\ " },	/* U+2003 */
	{ .name = "gt",		.roff = ">" },
	{ .name = "hairsp",	.roff = "\\^" },
	{ .name = "kappa",	.roff = "\\(*k" },
	{ .name = "larr",	.roff = "\\(<-" },
	{ .name = "ldquo",	.roff = "\\(lq" },
	{ .name = "le",		.roff = "\\(<=" },
	{ .name = "lowbar",	.roff = "_" },
	{ .name = "lsqb",	.roff = "[" },
	{ .name = "lt",		.roff = "<" },
	{ .name = "mdash",	.roff = "\\(em" },
	{ .name = "minus",	.roff = "\\-" },
	{ .name = "ndash",	.roff = "\\(en" },
	{ .name = "nbsp",	.roff = "\\ " },
	{ .name = "num",	.roff = "#" },
	{ .name = "oslash",	.roff = "\\(/o" },
	{ .name = "ouml",	.roff = "\\(:o" },
	{ .name = "percnt",	.roff = "%" },
	{ .name = "quot",	.roff = "\\(dq" },
	{ .name = "rarr",	.roff = "\\(->" },
	{ .name = "rArr",	.roff = "\\(rA" },
	{ .name = "rdquo",	.roff = "\\(rq" },
	{ .name = "reg",	.roff = "\\(rg" },
	{ .name = "rho",	.roff = "\\(*r" },
	{ .name = "rsqb",	.roff = "]" },
	{ .name = "sigma",	.roff = "\\(*s" },
	{ .name = "shy",	.roff = "\\&" },	/* U+00AD */
	{ .name = "tau",	.roff = "\\(*t" },
	{ .name = "tilde",	.roff = "\\[u02DC]" },
	{ .name = "times",	.roff = "\\[tmu]" },
	{ .name = "uuml",	.roff = "\\(:u" },
	{ .name = NULL,		.roff = NULL }
};
265:
1.6 schwarze 266: static void
267: error_msg(struct parse *p, const char *fmt, ...)
268: {
269: va_list ap;
270:
271: fprintf(stderr, "%s:%d:%d: ", p->fname, p->line, p->col);
272: va_start(ap, fmt);
273: vfprintf(stderr, fmt, ap);
274: va_end(ap);
275: fputc('\n', stderr);
276: p->tree->flags |= TREE_FAIL;
277: }
278:
279: static void
280: warn_msg(struct parse *p, const char *fmt, ...)
281: {
282: va_list ap;
283:
284: if (p->warn == 0)
285: return;
286:
287: fprintf(stderr, "%s:%d:%d: warning: ", p->fname, p->line, p->col);
288: va_start(ap, fmt);
289: vfprintf(stderr, fmt, ap);
290: va_end(ap);
291: fputc('\n', stderr);
292: }
293:
1.1 schwarze 294: /*
295: * Process a string of characters.
296: * If a text node is already open, append to it.
297: * Otherwise, create a new one as a child of the current node.
298: */
299: static void
1.5 schwarze 300: xml_char(struct parse *ps, const char *p, int sz)
1.1 schwarze 301: {
302: struct pnode *dat;
1.16 schwarze 303: size_t newsz;
1.1 schwarze 304:
1.5 schwarze 305: if (ps->del > 0)
1.1 schwarze 306: return;
307:
1.5 schwarze 308: if (ps->cur == NULL) {
1.6 schwarze 309: error_msg(ps, "discarding text before document: %.*s", sz, p);
1.5 schwarze 310: return;
311: }
312:
1.1 schwarze 313: if (ps->cur->node != NODE_TEXT) {
314: if ((dat = calloc(1, sizeof(*dat))) == NULL) {
315: perror(NULL);
316: exit(1);
317: }
318: dat->node = NODE_TEXT;
1.16 schwarze 319: dat->spc = ps->spc;
1.1 schwarze 320: dat->parent = ps->cur;
321: TAILQ_INIT(&dat->childq);
322: TAILQ_INIT(&dat->attrq);
323: TAILQ_INSERT_TAIL(&ps->cur->childq, dat, child);
324: ps->cur = dat;
325: }
326:
1.5 schwarze 327: if (ps->tree->flags & TREE_CLOSED &&
1.6 schwarze 328: ps->cur->parent == ps->tree->root)
329: warn_msg(ps, "text after end of document: %.*s", sz, p);
1.5 schwarze 330:
1.1 schwarze 331: /* Append to the current text node. */
332:
333: assert(sz >= 0);
1.16 schwarze 334: newsz = ps->cur->bsz + (ps->cur->bsz && ps->spc) + sz;
335: ps->cur->b = realloc(ps->cur->b, newsz + 1);
1.1 schwarze 336: if (ps->cur->b == NULL) {
337: perror(NULL);
338: exit(1);
339: }
1.16 schwarze 340: if (ps->cur->bsz && ps->spc)
341: ps->cur->b[ps->cur->bsz++] = ' ';
1.1 schwarze 342: memcpy(ps->cur->b + ps->cur->bsz, p, sz);
1.16 schwarze 343: ps->cur->b[ps->cur->bsz = newsz] = '\0';
1.1 schwarze 344: ps->cur->real = ps->cur->b;
1.16 schwarze 345: ps->spc = 0;
1.1 schwarze 346: }
347:
1.16 schwarze 348: /*
349: * Close out the text node and strip trailing whitespace, if one is open.
350: */
1.1 schwarze 351: static void
1.16 schwarze 352: pnode_closetext(struct parse *p)
1.1 schwarze 353: {
1.16 schwarze 354: struct pnode *n;
355:
356: if ((n = p->cur) == NULL || n->node != NODE_TEXT)
357: return;
358: p->cur = n->parent;
359: while (n->bsz > 0 && isspace((unsigned char)n->b[n->bsz - 1])) {
360: n->b[--n->bsz] = '\0';
361: p->spc = 1;
362: }
1.1 schwarze 363: }
364:
1.9 schwarze 365: static void
366: xml_entity(struct parse *p, const char *name)
367: {
368: const struct entity *entity;
369: struct pnode *dat;
370:
371: if (p->del > 0)
372: return;
373:
374: if (p->cur == NULL) {
375: error_msg(p, "discarding entity before document: &%s;", name);
376: return;
377: }
378:
1.16 schwarze 379: pnode_closetext(p);
1.9 schwarze 380:
381: if (p->tree->flags & TREE_CLOSED && p->cur == p->tree->root)
382: warn_msg(p, "entity after end of document: &%s;", name);
383:
384: for (entity = entities; entity->name != NULL; entity++)
385: if (strcmp(name, entity->name) == 0)
386: break;
387:
388: if (entity->roff == NULL) {
389: error_msg(p, "unknown entity &%s;", name);
390: return;
391: }
392:
393: /* Create, append, and close out an entity node. */
394: if ((dat = calloc(1, sizeof(*dat))) == NULL ||
395: (dat->b = dat->real = strdup(entity->roff)) == NULL) {
396: perror(NULL);
397: exit(1);
398: }
399: dat->node = NODE_ESCAPE;
400: dat->bsz = strlen(dat->b);
1.16 schwarze 401: dat->spc = p->spc;
1.9 schwarze 402: dat->parent = p->cur;
403: TAILQ_INIT(&dat->childq);
404: TAILQ_INIT(&dat->attrq);
405: TAILQ_INSERT_TAIL(&p->cur->childq, dat, child);
1.16 schwarze 406: p->spc = 0;
1.9 schwarze 407: }
408:
1.1 schwarze 409: /*
410: * Begin an element.
411: */
412: static void
1.5 schwarze 413: xml_elem_start(struct parse *ps, const char *name)
1.1 schwarze 414: {
1.5 schwarze 415: const struct element *elem;
416: struct pnode *dat;
1.1 schwarze 417:
1.5 schwarze 418: if (*name == '!' || *name == '?')
1.1 schwarze 419: return;
420:
1.4 schwarze 421: /*
422: * An ancestor is excluded from the tree;
423: * keep track of the number of levels excluded.
424: */
425: if (ps->del > 0) {
426: ps->del++;
427: return;
428: }
429:
1.16 schwarze 430: pnode_closetext(ps);
1.1 schwarze 431:
432: for (elem = elements; elem->name != NULL; elem++)
433: if (strcmp(elem->name, name) == 0)
434: break;
435:
1.6 schwarze 436: if (elem->name == NULL)
437: error_msg(ps, "unknown element <%s>", name);
438:
1.5 schwarze 439: ps->ncur = elem->node;
1.1 schwarze 440:
1.5 schwarze 441: switch (ps->ncur) {
1.4 schwarze 442: case NODE_DELETE_WARN:
1.6 schwarze 443: warn_msg(ps, "skipping element <%s>", name);
1.2 schwarze 444: /* FALLTHROUGH */
1.4 schwarze 445: case NODE_DELETE:
446: ps->del = 1;
447: /* FALLTHROUGH */
1.2 schwarze 448: case NODE_IGNORE:
449: return;
450: case NODE_INLINEEQUATION:
1.1 schwarze 451: ps->tree->flags |= TREE_EQN;
1.2 schwarze 452: break;
453: default:
454: break;
455: }
1.1 schwarze 456:
1.6 schwarze 457: if (ps->tree->flags & TREE_CLOSED && ps->cur->parent == NULL)
458: warn_msg(ps, "element after end of document: <%s>", name);
1.5 schwarze 459:
1.1 schwarze 460: if ((dat = calloc(1, sizeof(*dat))) == NULL) {
461: perror(NULL);
462: exit(1);
463: }
1.17 schwarze 464:
465: /*
466: * Nodes that begin a new macro or request line or start by
467: * printing text always want whitespace before themselves.
468: */
469:
470: switch (dat->node = elem->node) {
471: case NODE_AUTHORGROUP:
1.20 schwarze 472: case NODE_BLOCKQUOTE:
1.17 schwarze 473: case NODE_BOOKINFO:
474: case NODE_CAUTION:
475: case NODE_EDITOR:
476: case NODE_ENTRY:
477: case NODE_FUNCDEF:
478: case NODE_FUNCPROTOTYPE:
479: case NODE_INFORMALEQUATION:
480: case NODE_INLINEEQUATION:
481: case NODE_ITEMIZEDLIST:
482: case NODE_LEGALNOTICE:
483: case NODE_LITERALLAYOUT:
484: case NODE_NOTE:
485: case NODE_ORDEREDLIST:
486: case NODE_PARA:
487: case NODE_PREFACE:
488: case NODE_PROGRAMLISTING:
489: case NODE_REFMETA:
490: case NODE_REFNAMEDIV:
491: case NODE_REFSYNOPSISDIV:
492: case NODE_ROW:
493: case NODE_SBR:
494: case NODE_SCREEN:
495: case NODE_SECTION:
496: case NODE_SYNOPSIS:
497: case NODE_TGROUP:
498: case NODE_TIP:
499: case NODE_TITLE:
500: case NODE_VARIABLELIST:
501: case NODE_VARLISTENTRY:
502: case NODE_WARNING:
503: dat->spc = 1;
504: break;
505: default:
506: dat->spc = ps->spc;
507: break;
508: }
1.1 schwarze 509: dat->parent = ps->cur;
510: TAILQ_INIT(&dat->childq);
511: TAILQ_INIT(&dat->attrq);
512:
513: if (ps->cur != NULL)
514: TAILQ_INSERT_TAIL(&ps->cur->childq, dat, child);
515:
516: ps->cur = dat;
517: if (ps->tree->root == NULL)
518: ps->tree->root = dat;
1.5 schwarze 519: }
520:
521: static void
522: xml_attrkey(struct parse *ps, const char *name)
523: {
524: struct pattr *attr;
525: enum attrkey key;
1.1 schwarze 526:
1.19 schwarze 527: if (ps->del > 0 || ps->ncur == NODE_IGNORE || *name == '\0')
1.5 schwarze 528: return;
529: if ((key = attrkey_parse(name)) == ATTRKEY__MAX) {
530: ps->attr = 0;
531: return;
532: }
533: if ((attr = calloc(1, sizeof(*attr))) == NULL) {
534: perror(NULL);
535: exit(1);
536: }
537: attr->key = key;
538: attr->val = ATTRVAL__MAX;
539: attr->rawval = NULL;
540: TAILQ_INSERT_TAIL(&ps->cur->attrq, attr, child);
541: ps->attr = 1;
542: }
543:
544: static void
545: xml_attrval(struct parse *ps, const char *name)
546: {
547: struct pattr *attr;
548:
1.19 schwarze 549: if (ps->del > 0 || ps->ncur == NODE_IGNORE || ps->attr == 0)
1.5 schwarze 550: return;
551: if ((attr = TAILQ_LAST(&ps->cur->attrq, pattrq)) == NULL)
552: return;
553: if ((attr->val = attrval_parse(name)) == ATTRVAL__MAX &&
554: (attr->rawval = strdup(name)) == NULL) {
555: perror(NULL);
556: exit(1);
1.1 schwarze 557: }
558: }
559:
560: /*
561: * Roll up the parse tree.
562: * If we're at a text node, roll that one up first.
563: */
564: static void
1.5 schwarze 565: xml_elem_end(struct parse *ps, const char *name)
1.1 schwarze 566: {
1.5 schwarze 567: const struct element *elem;
568: enum nodeid node;
1.1 schwarze 569:
1.4 schwarze 570: /*
571: * An ancestor is excluded from the tree;
572: * keep track of the number of levels excluded.
573: */
574: if (ps->del > 1) {
575: ps->del--;
576: return;
577: }
578:
1.16 schwarze 579: if (ps->del == 0)
580: pnode_closetext(ps);
1.2 schwarze 581:
1.5 schwarze 582: if (name != NULL) {
583: for (elem = elements; elem->name != NULL; elem++)
584: if (strcmp(elem->name, name) == 0)
585: break;
586: node = elem->node;
587: } else
588: node = ps->ncur;
1.2 schwarze 589:
1.5 schwarze 590: switch (node) {
1.4 schwarze 591: case NODE_DELETE_WARN:
592: case NODE_DELETE:
1.5 schwarze 593: if (ps->del > 0)
594: ps->del--;
1.4 schwarze 595: break;
1.2 schwarze 596: case NODE_IGNORE:
597: break;
598: default:
1.5 schwarze 599: if (ps->cur == NULL || node != ps->cur->node) {
1.6 schwarze 600: warn_msg(ps, "element not open: </%s>", name);
1.5 schwarze 601: break;
602: }
603:
604: /*
605: * Refrain from actually closing the document element.
606: * If no more content follows, no harm is done, but if
607: * some content still follows, simply processing it is
608: * obviously better than discarding it or crashing.
609: */
610:
611: if (ps->cur->parent == NULL)
612: ps->tree->flags |= TREE_CLOSED;
613: else
614: ps->cur = ps->cur->parent;
1.16 schwarze 615: ps->spc = 0;
1.4 schwarze 616: break;
1.2 schwarze 617: }
1.4 schwarze 618: assert(ps->del == 0);
1.1 schwarze 619: }
620:
621: struct parse *
622: parse_alloc(int warn)
623: {
624: struct parse *p;
625:
626: if ((p = calloc(1, sizeof(*p))) == NULL)
627: return NULL;
628:
629: if ((p->tree = calloc(1, sizeof(*p->tree))) == NULL) {
630: free(p);
631: return NULL;
632: }
633: p->warn = warn;
634: return p;
635: }
636:
637: void
638: parse_free(struct parse *p)
639: {
640: if (p == NULL)
641: return;
642: if (p->tree != NULL) {
643: pnode_unlink(p->tree->root);
644: free(p->tree);
645: }
646: free(p);
647: }
648:
1.14 schwarze 649: static void
650: increment(struct parse *p, char *b, size_t *pend, int refill)
651: {
652: if (refill) {
653: if (b[*pend] == '\n') {
654: p->nline++;
655: p->ncol = 1;
656: } else
657: p->ncol++;
658: }
659: ++*pend;
660: }
661:
1.5 schwarze 662: /*
663: * Advance the pend pointer to the next character in the charset.
664: * If the charset starts with a space, it stands for any whitespace.
665: * Update the new input file position, used for messages.
666: * Do not overrun the buffer b of length rlen.
667: * When reaching the end, NUL-terminate the buffer and return 1;
668: * otherwise, return 0.
669: */
670: static int
671: advance(struct parse *p, char *b, size_t rlen, size_t *pend,
1.14 schwarze 672: const char *charset, int refill)
1.5 schwarze 673: {
674: int space;
675:
676: if (*charset == ' ') {
677: space = 1;
678: charset++;
679: } else
680: space = 0;
681:
1.14 schwarze 682: if (refill) {
683: p->nline = p->line;
684: p->ncol = p->col;
685: }
1.5 schwarze 686: while (*pend < rlen) {
687: if (space && isspace((unsigned char)b[*pend]))
688: break;
689: if (strchr(charset, b[*pend]) != NULL)
690: break;
1.14 schwarze 691: increment(p, b, pend, refill);
1.5 schwarze 692: }
693: if (*pend == rlen) {
694: b[rlen] = '\0';
1.14 schwarze 695: return refill;
1.5 schwarze 696: } else
697: return 0;
698: }
699:
1.14 schwarze 700: size_t
701: parse_string(struct parse *p, char *b, size_t rlen,
702: enum pstate *pstate, int refill)
703: {
704: char *cp;
705: size_t poff; /* Parse offset in b[]. */
706: size_t pend; /* Offset of the end of the current word. */
707: int elem_end;
708:
1.16 schwarze 709: p->spc = 0;
1.14 schwarze 710: pend = 0;
711: for (;;) {
712:
713: /* Proceed to the next token, skipping whitespace. */
714:
715: if (refill) {
716: p->line = p->nline;
717: p->col = p->ncol;
718: }
719: if ((poff = pend) == rlen)
720: break;
721: if (isspace((unsigned char)b[pend])) {
1.16 schwarze 722: p->spc = 1;
1.14 schwarze 723: increment(p, b, &pend, refill);
724: continue;
725: }
726:
727: /*
728: * The following four cases (ARG, TAG, and starting an
729: * entity or a tag) all parse a word or quoted string.
730: * If that extends beyond the read buffer and the last
731: * read(2) still got data, they all break out of the
732: * token loop to request more data from the read loop.
733: *
734: * Also, three of them detect self-closing tags, those
735: * ending with "/>", setting the flag elem_end and
736: * calling xml_elem_end() at the very end, after
737: * handling the attribute value, attribute name, or
738: * tag name, respectively.
739: */
740:
741: /* Parse an attribute value. */
742:
743: if (*pstate >= PARSE_ARG) {
744: if (*pstate == PARSE_ARG &&
745: (b[pend] == '\'' || b[pend] == '"')) {
746: *pstate = b[pend] == '"' ?
747: PARSE_DQ : PARSE_SQ;
748: increment(p, b, &pend, refill);
749: continue;
750: }
751: if (advance(p, b, rlen, &pend,
752: *pstate == PARSE_DQ ? "\"" :
753: *pstate == PARSE_SQ ? "'" : " >", refill))
754: break;
755: *pstate = PARSE_TAG;
756: elem_end = 0;
757: if (b[pend] == '>') {
758: *pstate = PARSE_ELEM;
759: if (pend > 0 && b[pend - 1] == '/') {
760: b[pend - 1] = '\0';
761: elem_end = 1;
762: }
763: }
764: b[pend] = '\0';
765: if (pend < rlen)
766: increment(p, b, &pend, refill);
767: xml_attrval(p, b + poff);
768: if (elem_end)
769: xml_elem_end(p, NULL);
770:
771: /* Look for an attribute name. */
772:
773: } else if (*pstate == PARSE_TAG) {
774: if (advance(p, b, rlen, &pend, " =>", refill))
775: break;
776: elem_end = 0;
777: switch (b[pend]) {
778: case '>':
779: *pstate = PARSE_ELEM;
780: if (pend > 0 && b[pend - 1] == '/') {
781: b[pend - 1] = '\0';
782: elem_end = 1;
783: }
784: break;
785: case '=':
786: *pstate = PARSE_ARG;
787: break;
788: default:
789: break;
790: }
791: b[pend] = '\0';
792: if (pend < rlen)
793: increment(p, b, &pend, refill);
794: xml_attrkey(p, b + poff);
795: if (elem_end)
796: xml_elem_end(p, NULL);
797:
798: /* Begin an opening or closing tag. */
799:
800: } else if (b[poff] == '<') {
801: if (advance(p, b, rlen, &pend, " >", refill))
802: break;
803: if (pend > poff + 3 &&
804: strncmp(b + poff, "<!--", 4) == 0) {
805:
806: /* Skip a comment. */
807:
808: cp = strstr(b + pend - 2, "-->");
809: if (cp == NULL) {
810: if (refill)
811: break;
812: cp = b + rlen;
813: } else
814: cp += 3;
815: while (b + pend < cp)
816: increment(p, b, &pend, refill);
817: continue;
818: }
819: elem_end = 0;
820: if (b[pend] != '>')
821: *pstate = PARSE_TAG;
822: else if (pend > 0 && b[pend - 1] == '/') {
823: b[pend - 1] = '\0';
824: elem_end = 1;
825: }
826: b[pend] = '\0';
827: if (pend < rlen)
828: increment(p, b, &pend, refill);
829: if (b[++poff] == '/') {
830: elem_end = 1;
831: poff++;
832: } else
833: xml_elem_start(p, b + poff);
834: if (elem_end)
835: xml_elem_end(p, b + poff);
836:
837: /* Process an entity. */
838:
839: } else if (b[poff] == '&') {
840: if (advance(p, b, rlen, &pend, ";", refill))
841: break;
842: b[pend] = '\0';
843: if (pend < rlen)
844: increment(p, b, &pend, refill);
845: xml_entity(p, b + poff + 1);
846:
847: /* Process text up to the next tag, entity, or EOL. */
848:
849: } else {
850: advance(p, b, rlen, &pend, "<&", refill);
851: xml_char(p, b + poff, pend - poff);
852: }
853: }
854: return poff;
855: }
856:
1.1 schwarze 857: struct ptree *
858: parse_file(struct parse *p, int fd, const char *fname)
859: {
860: char b[4096];
1.5 schwarze 861: ssize_t rsz; /* Return value from read(2). */
1.14 schwarze 862: size_t rlen; /* Number of bytes in b[]. */
1.5 schwarze 863: size_t poff; /* Parse offset in b[]. */
1.14 schwarze 864: enum pstate pstate;
1.1 schwarze 865:
866: p->fname = fname;
1.5 schwarze 867: p->nline = 1;
868: p->ncol = 1;
1.14 schwarze 869: pstate = PARSE_ELEM;
1.5 schwarze 870: rlen = 0;
871:
872: /*
873: * Read loop.
874: *
1.14 schwarze 875: * If the previous token was incomplete and asked for more
876: * input, we have to enter the read loop once more even on EOF.
1.5 schwarze 877: * Once rsz is 0, incomplete tokens will no longer ask
878: * for more input but instead use whatever there is,
879: * and then exit the read loop.
880: * The minus one on the size limit for read(2) is needed
881: * such that advance() can set b[rlen] to NUL when needed.
882: */
883:
1.14 schwarze 884: while ((rsz = read(fd, b + rlen, sizeof(b) - rlen - 1)) >= 0 &&
885: (rlen += rsz) > 0) {
886: poff = parse_string(p, b, rlen, &pstate, rsz > 0);
1.5 schwarze 887: /* Buffer exhausted; shift left and re-fill. */
888: assert(poff > 0);
889: rlen -= poff;
1.14 schwarze 890: memmove(b, b + poff, rlen);
1.5 schwarze 891: }
892: if (rsz < 0) {
893: perror(fname);
894: p->tree->flags |= TREE_FAIL;
895: }
1.16 schwarze 896: pnode_closetext(p);
1.6 schwarze 897: if ((p->tree->flags & TREE_CLOSED) == 0)
898: warn_msg(p, "document not closed");
1.1 schwarze 899: return p->tree;
900: }
CVSweb