Annotation of docbook2mdoc/parse.c, Revision 1.22
1.22 ! schwarze 1: /* $Id: parse.c,v 1.21 2019/04/07 18:51:53 schwarze Exp $ */
1.1 schwarze 2: /*
3: * Copyright (c) 2014 Kristaps Dzonsons <kristaps@bsd.lv>
4: * Copyright (c) 2019 Ingo Schwarze <schwarze@openbsd.org>
5: *
6: * Permission to use, copy, modify, and distribute this software for any
7: * purpose with or without fee is hereby granted, provided that the above
8: * copyright notice and this permission notice appear in all copies.
9: *
10: * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
11: * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12: * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
13: * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14: * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15: * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16: * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17: */
18: #include <assert.h>
19: #include <ctype.h>
1.6 schwarze 20: #include <stdarg.h>
1.1 schwarze 21: #include <stdio.h>
1.5 schwarze 22: #include <stdlib.h>
1.1 schwarze 23: #include <string.h>
24: #include <unistd.h>
25:
26: #include "node.h"
27: #include "parse.h"
28:
29: /*
30: * The implementation of the DocBook parser.
31: */
32:
/*
 * Tokenizer state that parse_string() keeps across calls.
 * The order of the constants matters: parse_string() tests
 * "*pstate >= PARSE_ARG" to recognize all attribute value states.
 */
enum pstate {
	PARSE_ELEM,	/* Outside of tags: text, an entity, or a tag follows. */
	PARSE_TAG,	/* Inside a tag: looking for an attribute name. */
	PARSE_ARG,	/* Expecting an attribute value, not yet quoted. */
	PARSE_SQ,	/* Inside a single-quoted attribute value. */
	PARSE_DQ	/* Inside a double-quoted attribute value. */
};
40:
/*
 * Global parse state.
 * Keep this as simple and small as possible.
 */
struct parse {
	const char	*fname;  /* Name of the input file. */
	struct ptree	*tree;   /* Complete parse result. */
	struct pnode	*cur;    /* Current node in the tree. */
	enum nodeid	 ncur;   /* Type of the element started most recently. */
	int		 line;   /* Line number in the input file. */
	int		 col;    /* Column number in the input file. */
	int		 nline;  /* Line number of next token. */
	int		 ncol;   /* Column number of next token. */
	int		 del;    /* Levels of nested nodes being deleted. */
	int		 spc;    /* Whitespace before the next element. */
	int		 attr;   /* The most recent attribute is valid. */
	int		 warn;   /* Nonzero: warn_msg() prints warnings. */
};
59:
/*
 * One entry of the table translating DocBook element names
 * to the node types used in the parse tree.
 */
struct element {
	const char	*name;   /* DocBook element name. */
	enum nodeid	 node;   /* Node type to generate. */
};
64:
/*
 * Translation table from DocBook element names to node types.
 * It is searched linearly with strcmp(3) by xml_elem_start() and
 * xml_elem_end().  The final entry has a NULL name and terminates
 * the search; its NODE_IGNORE type is what unknown elements get.
 */
static const struct element elements[] = {
	{ "acronym", NODE_IGNORE },
	{ "affiliation", NODE_AFFILIATION },
	{ "anchor", NODE_DELETE },
	{ "appendix", NODE_APPENDIX },
	{ "application", NODE_APPLICATION },
	{ "arg", NODE_ARG },
	{ "article", NODE_SECTION },
	{ "author", NODE_AUTHOR },
	{ "authorgroup", NODE_AUTHORGROUP },
	{ "blockquote", NODE_BLOCKQUOTE },
	{ "book", NODE_SECTION },
	{ "bookinfo", NODE_BOOKINFO },
	{ "caution", NODE_CAUTION },
	{ "chapter", NODE_SECTION },
	{ "citerefentry", NODE_CITEREFENTRY },
	{ "citetitle", NODE_CITETITLE },
	{ "cmdsynopsis", NODE_CMDSYNOPSIS },
	{ "code", NODE_LITERAL },
	{ "colspec", NODE_COLSPEC },
	{ "command", NODE_COMMAND },
	{ "constant", NODE_CONSTANT },
	{ "contrib", NODE_CONTRIB },
	{ "copyright", NODE_COPYRIGHT },
	{ "date", NODE_DATE },
	{ "editor", NODE_EDITOR },
	{ "email", NODE_EMAIL },
	{ "emphasis", NODE_EMPHASIS },
	{ "entry", NODE_ENTRY },
	{ "envar", NODE_ENVAR },
	{ "errorname", NODE_ERRORNAME },
	{ "fieldsynopsis", NODE_FIELDSYNOPSIS },
	{ "filename", NODE_FILENAME },
	{ "firstname", NODE_PERSONNAME },
	{ "firstterm", NODE_FIRSTTERM },
	{ "footnote", NODE_FOOTNOTE },
	{ "funcdef", NODE_FUNCDEF },
	{ "funcprototype", NODE_FUNCPROTOTYPE },
	{ "funcsynopsis", NODE_FUNCSYNOPSIS },
	{ "funcsynopsisinfo", NODE_FUNCSYNOPSISINFO },
	{ "function", NODE_FUNCTION },
	{ "glossary", NODE_VARIABLELIST },
	{ "glossdef", NODE_IGNORE },
	{ "glossdiv", NODE_IGNORE },
	{ "glossentry", NODE_VARLISTENTRY },
	{ "glosslist", NODE_VARIABLELIST },
	{ "glossterm", NODE_GLOSSTERM },
	{ "group", NODE_GROUP },
	{ "holder", NODE_HOLDER },
	{ "index", NODE_INDEX },
	{ "indexterm", NODE_DELETE },
	{ "info", NODE_INFO },
	{ "informalequation", NODE_INFORMALEQUATION },
	{ "informaltable", NODE_TABLE },
	{ "inlineequation", NODE_INLINEEQUATION },
	{ "itemizedlist", NODE_ITEMIZEDLIST },
	{ "keysym", NODE_KEYSYM },
	{ "legalnotice", NODE_LEGALNOTICE },
	{ "link", NODE_LINK },
	{ "listitem", NODE_LISTITEM },
	{ "literal", NODE_LITERAL },
	{ "literallayout", NODE_LITERALLAYOUT },
	{ "manvolnum", NODE_MANVOLNUM },
	{ "member", NODE_MEMBER },
	{ "mml:math", NODE_MML_MATH },
	{ "mml:mfenced", NODE_MML_MFENCED },
	{ "mml:mfrac", NODE_MML_MFRAC },
	{ "mml:mi", NODE_MML_MI },
	{ "mml:mn", NODE_MML_MN },
	{ "mml:mo", NODE_MML_MO },
	{ "mml:mrow", NODE_MML_MROW },
	{ "mml:msub", NODE_MML_MSUB },
	{ "mml:msup", NODE_MML_MSUP },
	{ "modifier", NODE_MODIFIER },
	{ "note", NODE_NOTE },
	{ "option", NODE_OPTION },
	{ "orderedlist", NODE_ORDEREDLIST },
	{ "orgname", NODE_ORGNAME },
	{ "othername", NODE_PERSONNAME },
	{ "para", NODE_PARA },
	{ "paramdef", NODE_PARAMDEF },
	{ "parameter", NODE_PARAMETER },
	{ "part", NODE_SECTION },
	{ "personname", NODE_PERSONNAME },
	{ "phrase", NODE_IGNORE },
	{ "preface", NODE_PREFACE },
	{ "primary", NODE_DELETE },
	{ "programlisting", NODE_PROGRAMLISTING },
	{ "prompt", NODE_PROMPT },
	{ "quote", NODE_QUOTE },
	{ "refclass", NODE_REFCLASS },
	{ "refdescriptor", NODE_REFDESCRIPTOR },
	{ "refentry", NODE_REFENTRY },
	{ "refentryinfo", NODE_REFENTRYINFO },
	{ "refentrytitle", NODE_REFENTRYTITLE },
	{ "refmeta", NODE_REFMETA },
	{ "refmetainfo", NODE_REFMETAINFO },
	{ "refmiscinfo", NODE_REFMISCINFO },
	{ "refname", NODE_REFNAME },
	{ "refnamediv", NODE_REFNAMEDIV },
	{ "refpurpose", NODE_REFPURPOSE },
	{ "refsect1", NODE_SECTION },
	{ "refsect2", NODE_SECTION },
	{ "refsect3", NODE_SECTION },
	{ "refsection", NODE_SECTION },
	{ "refsynopsisdiv", NODE_REFSYNOPSISDIV },
	{ "releaseinfo", NODE_RELEASEINFO },
	{ "replaceable", NODE_REPLACEABLE },
	{ "row", NODE_ROW },
	{ "sbr", NODE_SBR },
	{ "screen", NODE_SCREEN },
	{ "secondary", NODE_DELETE },
	{ "sect1", NODE_SECTION },
	{ "sect2", NODE_SECTION },
	{ "section", NODE_SECTION },
	{ "sgmltag", NODE_SGMLTAG },
	{ "simpara", NODE_PARA },
	{ "simplelist", NODE_SIMPLELIST },
	{ "spanspec", NODE_SPANSPEC },
	{ "structfield", NODE_PARAMETER },
	{ "structname", NODE_TYPE },
	{ "subtitle", NODE_SUBTITLE },
	{ "surname", NODE_PERSONNAME },
	{ "symbol", NODE_CONSTANT },
	{ "synopsis", NODE_SYNOPSIS },
	{ "table", NODE_TABLE },
	{ "tbody", NODE_TBODY },
	{ "term", NODE_TERM },
	{ "tfoot", NODE_TFOOT },
	{ "tgroup", NODE_TGROUP },
	{ "thead", NODE_THEAD },
	{ "tip", NODE_TIP },
	{ "title", NODE_TITLE },
	{ "trademark", NODE_IGNORE },
	{ "type", NODE_TYPE },
	{ "ulink", NODE_LINK },
	{ "userinput", NODE_LITERAL },
	{ "variablelist", NODE_VARIABLELIST },
	{ "varlistentry", NODE_VARLISTENTRY },
	{ "varname", NODE_VARNAME },
	{ "warning", NODE_WARNING },
	{ "wordasword", NODE_WORDASWORD },
	{ "xi:include", NODE_DELETE_WARN },
	{ "year", NODE_YEAR },
	{ NULL, NODE_IGNORE }
};
211:
/*
 * One entry of the table translating XML character entity names
 * to the roff text emitted in their place.
 */
struct entity {
	const char	*name;   /* Entity name, without '&' and ';'. */
	const char	*roff;   /* Replacement text copied into the node. */
};
216:
/*
 * XML character entity references found in the wild.
 * Those that don't have an exact mandoc_char(7) representation
 * are approximated, and the desired codepoint is given as a comment.
 * Encoding them as \\[u...] would leave -Tascii out in the cold.
 *
 * xml_entity() searches this table linearly with strcmp(3), so the
 * names are case sensitive.  The final entry with NULL members ends
 * the search and marks an unknown entity.
 */
static const struct entity entities[] = {
	{ "alpha", "\\(*a" },
	{ "amp", "&" },
	{ "apos", "'" },
	{ "auml", "\\(:a" },
	{ "beta", "\\(*b" },
	{ "circ", "^" }, /* U+02C6 */
	{ "copy", "\\(co" },
	{ "dagger", "\\(dg" },
	{ "Delta", "\\(*D" },
	{ "eacute", "\\('e" },
	{ "emsp", "\\ " }, /* U+2003 */
	{ "gt", ">" },
	{ "hairsp", "\\^" },
	{ "kappa", "\\(*k" },
	{ "larr", "\\(<-" },
	{ "ldquo", "\\(lq" },
	{ "le", "\\(<=" },
	{ "lowbar", "_" },
	{ "lsqb", "[" },
	{ "lt", "<" },
	{ "mdash", "\\(em" },
	{ "minus", "\\-" },
	{ "ndash", "\\(en" },
	{ "nbsp", "\\ " },
	{ "num", "#" },
	{ "oslash", "\\(/o" },
	{ "ouml", "\\(:o" },
	{ "percnt", "%" },
	{ "quot", "\\(dq" },
	{ "rarr", "\\(->" },
	{ "rArr", "\\(rA" },
	{ "rdquo", "\\(rq" },
	{ "reg", "\\(rg" },
	{ "rho", "\\(*r" },
	{ "rsqb", "]" },
	{ "sigma", "\\(*s" },
	{ "shy", "\\&" }, /* U+00AD */
	{ "tau", "\\(*t" },
	{ "tilde", "\\[u02DC]" },
	{ "times", "\\[tmu]" },
	{ "uuml", "\\(:u" },
	{ NULL, NULL }
};
267:
1.6 schwarze 268: static void
269: error_msg(struct parse *p, const char *fmt, ...)
270: {
271: va_list ap;
272:
273: fprintf(stderr, "%s:%d:%d: ", p->fname, p->line, p->col);
274: va_start(ap, fmt);
275: vfprintf(stderr, fmt, ap);
276: va_end(ap);
277: fputc('\n', stderr);
278: p->tree->flags |= TREE_FAIL;
279: }
280:
281: static void
282: warn_msg(struct parse *p, const char *fmt, ...)
283: {
284: va_list ap;
285:
286: if (p->warn == 0)
287: return;
288:
289: fprintf(stderr, "%s:%d:%d: warning: ", p->fname, p->line, p->col);
290: va_start(ap, fmt);
291: vfprintf(stderr, fmt, ap);
292: va_end(ap);
293: fputc('\n', stderr);
294: }
295:
1.1 schwarze 296: /*
297: * Process a string of characters.
298: * If a text node is already open, append to it.
299: * Otherwise, create a new one as a child of the current node.
300: */
301: static void
1.5 schwarze 302: xml_char(struct parse *ps, const char *p, int sz)
1.1 schwarze 303: {
304: struct pnode *dat;
1.16 schwarze 305: size_t newsz;
1.1 schwarze 306:
1.5 schwarze 307: if (ps->del > 0)
1.1 schwarze 308: return;
309:
1.5 schwarze 310: if (ps->cur == NULL) {
1.6 schwarze 311: error_msg(ps, "discarding text before document: %.*s", sz, p);
1.5 schwarze 312: return;
313: }
314:
1.1 schwarze 315: if (ps->cur->node != NODE_TEXT) {
316: if ((dat = calloc(1, sizeof(*dat))) == NULL) {
317: perror(NULL);
318: exit(1);
319: }
320: dat->node = NODE_TEXT;
1.16 schwarze 321: dat->spc = ps->spc;
1.1 schwarze 322: dat->parent = ps->cur;
323: TAILQ_INIT(&dat->childq);
324: TAILQ_INIT(&dat->attrq);
325: TAILQ_INSERT_TAIL(&ps->cur->childq, dat, child);
326: ps->cur = dat;
327: }
328:
1.5 schwarze 329: if (ps->tree->flags & TREE_CLOSED &&
1.6 schwarze 330: ps->cur->parent == ps->tree->root)
331: warn_msg(ps, "text after end of document: %.*s", sz, p);
1.5 schwarze 332:
1.1 schwarze 333: /* Append to the current text node. */
334:
335: assert(sz >= 0);
1.16 schwarze 336: newsz = ps->cur->bsz + (ps->cur->bsz && ps->spc) + sz;
337: ps->cur->b = realloc(ps->cur->b, newsz + 1);
1.1 schwarze 338: if (ps->cur->b == NULL) {
339: perror(NULL);
340: exit(1);
341: }
1.16 schwarze 342: if (ps->cur->bsz && ps->spc)
343: ps->cur->b[ps->cur->bsz++] = ' ';
1.1 schwarze 344: memcpy(ps->cur->b + ps->cur->bsz, p, sz);
1.16 schwarze 345: ps->cur->b[ps->cur->bsz = newsz] = '\0';
1.1 schwarze 346: ps->cur->real = ps->cur->b;
1.16 schwarze 347: ps->spc = 0;
1.1 schwarze 348: }
349:
1.16 schwarze 350: /*
351: * Close out the text node and strip trailing whitespace, if one is open.
352: */
1.1 schwarze 353: static void
1.16 schwarze 354: pnode_closetext(struct parse *p)
1.1 schwarze 355: {
1.16 schwarze 356: struct pnode *n;
357:
358: if ((n = p->cur) == NULL || n->node != NODE_TEXT)
359: return;
360: p->cur = n->parent;
361: while (n->bsz > 0 && isspace((unsigned char)n->b[n->bsz - 1])) {
362: n->b[--n->bsz] = '\0';
363: p->spc = 1;
364: }
1.1 schwarze 365: }
366:
1.9 schwarze 367: static void
368: xml_entity(struct parse *p, const char *name)
369: {
370: const struct entity *entity;
371: struct pnode *dat;
372:
373: if (p->del > 0)
374: return;
375:
376: if (p->cur == NULL) {
377: error_msg(p, "discarding entity before document: &%s;", name);
378: return;
379: }
380:
1.16 schwarze 381: pnode_closetext(p);
1.9 schwarze 382:
383: if (p->tree->flags & TREE_CLOSED && p->cur == p->tree->root)
384: warn_msg(p, "entity after end of document: &%s;", name);
385:
386: for (entity = entities; entity->name != NULL; entity++)
387: if (strcmp(name, entity->name) == 0)
388: break;
389:
390: if (entity->roff == NULL) {
391: error_msg(p, "unknown entity &%s;", name);
392: return;
393: }
394:
395: /* Create, append, and close out an entity node. */
396: if ((dat = calloc(1, sizeof(*dat))) == NULL ||
397: (dat->b = dat->real = strdup(entity->roff)) == NULL) {
398: perror(NULL);
399: exit(1);
400: }
401: dat->node = NODE_ESCAPE;
402: dat->bsz = strlen(dat->b);
1.16 schwarze 403: dat->spc = p->spc;
1.9 schwarze 404: dat->parent = p->cur;
405: TAILQ_INIT(&dat->childq);
406: TAILQ_INIT(&dat->attrq);
407: TAILQ_INSERT_TAIL(&p->cur->childq, dat, child);
1.16 schwarze 408: p->spc = 0;
1.9 schwarze 409: }
410:
1.1 schwarze 411: /*
412: * Begin an element.
413: */
414: static void
1.5 schwarze 415: xml_elem_start(struct parse *ps, const char *name)
1.1 schwarze 416: {
1.5 schwarze 417: const struct element *elem;
418: struct pnode *dat;
1.1 schwarze 419:
1.5 schwarze 420: if (*name == '!' || *name == '?')
1.1 schwarze 421: return;
422:
1.4 schwarze 423: /*
424: * An ancestor is excluded from the tree;
425: * keep track of the number of levels excluded.
426: */
427: if (ps->del > 0) {
428: ps->del++;
429: return;
430: }
431:
1.16 schwarze 432: pnode_closetext(ps);
1.1 schwarze 433:
434: for (elem = elements; elem->name != NULL; elem++)
435: if (strcmp(elem->name, name) == 0)
436: break;
437:
1.6 schwarze 438: if (elem->name == NULL)
439: error_msg(ps, "unknown element <%s>", name);
440:
1.5 schwarze 441: ps->ncur = elem->node;
1.1 schwarze 442:
1.5 schwarze 443: switch (ps->ncur) {
1.4 schwarze 444: case NODE_DELETE_WARN:
1.6 schwarze 445: warn_msg(ps, "skipping element <%s>", name);
1.2 schwarze 446: /* FALLTHROUGH */
1.4 schwarze 447: case NODE_DELETE:
448: ps->del = 1;
449: /* FALLTHROUGH */
1.2 schwarze 450: case NODE_IGNORE:
451: return;
452: case NODE_INLINEEQUATION:
1.1 schwarze 453: ps->tree->flags |= TREE_EQN;
1.2 schwarze 454: break;
455: default:
456: break;
457: }
1.1 schwarze 458:
1.6 schwarze 459: if (ps->tree->flags & TREE_CLOSED && ps->cur->parent == NULL)
460: warn_msg(ps, "element after end of document: <%s>", name);
1.5 schwarze 461:
1.1 schwarze 462: if ((dat = calloc(1, sizeof(*dat))) == NULL) {
463: perror(NULL);
464: exit(1);
465: }
1.17 schwarze 466:
467: /*
468: * Nodes that begin a new macro or request line or start by
469: * printing text always want whitespace before themselves.
470: */
471:
472: switch (dat->node = elem->node) {
1.22 ! schwarze 473: case NODE_APPENDIX:
1.17 schwarze 474: case NODE_AUTHORGROUP:
1.20 schwarze 475: case NODE_BLOCKQUOTE:
1.17 schwarze 476: case NODE_BOOKINFO:
477: case NODE_CAUTION:
478: case NODE_EDITOR:
479: case NODE_ENTRY:
480: case NODE_FUNCDEF:
481: case NODE_FUNCPROTOTYPE:
482: case NODE_INFORMALEQUATION:
483: case NODE_INLINEEQUATION:
484: case NODE_ITEMIZEDLIST:
485: case NODE_LEGALNOTICE:
486: case NODE_LITERALLAYOUT:
487: case NODE_NOTE:
488: case NODE_ORDEREDLIST:
489: case NODE_PARA:
490: case NODE_PREFACE:
491: case NODE_PROGRAMLISTING:
492: case NODE_REFMETA:
493: case NODE_REFNAMEDIV:
494: case NODE_REFSYNOPSISDIV:
495: case NODE_ROW:
496: case NODE_SBR:
497: case NODE_SCREEN:
498: case NODE_SECTION:
499: case NODE_SYNOPSIS:
500: case NODE_TGROUP:
501: case NODE_TIP:
502: case NODE_TITLE:
503: case NODE_VARIABLELIST:
504: case NODE_VARLISTENTRY:
505: case NODE_WARNING:
506: dat->spc = 1;
507: break;
508: default:
509: dat->spc = ps->spc;
510: break;
511: }
1.1 schwarze 512: dat->parent = ps->cur;
513: TAILQ_INIT(&dat->childq);
514: TAILQ_INIT(&dat->attrq);
515:
516: if (ps->cur != NULL)
517: TAILQ_INSERT_TAIL(&ps->cur->childq, dat, child);
518:
519: ps->cur = dat;
520: if (ps->tree->root == NULL)
521: ps->tree->root = dat;
1.5 schwarze 522: }
523:
524: static void
525: xml_attrkey(struct parse *ps, const char *name)
526: {
527: struct pattr *attr;
528: enum attrkey key;
1.1 schwarze 529:
1.19 schwarze 530: if (ps->del > 0 || ps->ncur == NODE_IGNORE || *name == '\0')
1.5 schwarze 531: return;
532: if ((key = attrkey_parse(name)) == ATTRKEY__MAX) {
533: ps->attr = 0;
534: return;
535: }
536: if ((attr = calloc(1, sizeof(*attr))) == NULL) {
537: perror(NULL);
538: exit(1);
539: }
540: attr->key = key;
541: attr->val = ATTRVAL__MAX;
542: attr->rawval = NULL;
543: TAILQ_INSERT_TAIL(&ps->cur->attrq, attr, child);
544: ps->attr = 1;
545: }
546:
547: static void
548: xml_attrval(struct parse *ps, const char *name)
549: {
550: struct pattr *attr;
551:
1.19 schwarze 552: if (ps->del > 0 || ps->ncur == NODE_IGNORE || ps->attr == 0)
1.5 schwarze 553: return;
554: if ((attr = TAILQ_LAST(&ps->cur->attrq, pattrq)) == NULL)
555: return;
556: if ((attr->val = attrval_parse(name)) == ATTRVAL__MAX &&
557: (attr->rawval = strdup(name)) == NULL) {
558: perror(NULL);
559: exit(1);
1.1 schwarze 560: }
561: }
562:
563: /*
564: * Roll up the parse tree.
565: * If we're at a text node, roll that one up first.
566: */
567: static void
1.5 schwarze 568: xml_elem_end(struct parse *ps, const char *name)
1.1 schwarze 569: {
1.5 schwarze 570: const struct element *elem;
571: enum nodeid node;
1.1 schwarze 572:
1.4 schwarze 573: /*
574: * An ancestor is excluded from the tree;
575: * keep track of the number of levels excluded.
576: */
577: if (ps->del > 1) {
578: ps->del--;
579: return;
580: }
581:
1.16 schwarze 582: if (ps->del == 0)
583: pnode_closetext(ps);
1.2 schwarze 584:
1.5 schwarze 585: if (name != NULL) {
586: for (elem = elements; elem->name != NULL; elem++)
587: if (strcmp(elem->name, name) == 0)
588: break;
589: node = elem->node;
590: } else
591: node = ps->ncur;
1.2 schwarze 592:
1.5 schwarze 593: switch (node) {
1.4 schwarze 594: case NODE_DELETE_WARN:
595: case NODE_DELETE:
1.5 schwarze 596: if (ps->del > 0)
597: ps->del--;
1.4 schwarze 598: break;
1.2 schwarze 599: case NODE_IGNORE:
600: break;
601: default:
1.5 schwarze 602: if (ps->cur == NULL || node != ps->cur->node) {
1.6 schwarze 603: warn_msg(ps, "element not open: </%s>", name);
1.5 schwarze 604: break;
605: }
606:
607: /*
608: * Refrain from actually closing the document element.
609: * If no more content follows, no harm is done, but if
610: * some content still follows, simply processing it is
611: * obviously better than discarding it or crashing.
612: */
613:
614: if (ps->cur->parent == NULL)
615: ps->tree->flags |= TREE_CLOSED;
616: else
617: ps->cur = ps->cur->parent;
1.16 schwarze 618: ps->spc = 0;
1.4 schwarze 619: break;
1.2 schwarze 620: }
1.4 schwarze 621: assert(ps->del == 0);
1.1 schwarze 622: }
623:
624: struct parse *
625: parse_alloc(int warn)
626: {
627: struct parse *p;
628:
629: if ((p = calloc(1, sizeof(*p))) == NULL)
630: return NULL;
631:
632: if ((p->tree = calloc(1, sizeof(*p->tree))) == NULL) {
633: free(p);
634: return NULL;
635: }
636: p->warn = warn;
637: return p;
638: }
639:
640: void
641: parse_free(struct parse *p)
642: {
643: if (p == NULL)
644: return;
645: if (p->tree != NULL) {
646: pnode_unlink(p->tree->root);
647: free(p->tree);
648: }
649: free(p);
650: }
651:
1.14 schwarze 652: static void
653: increment(struct parse *p, char *b, size_t *pend, int refill)
654: {
655: if (refill) {
656: if (b[*pend] == '\n') {
657: p->nline++;
658: p->ncol = 1;
659: } else
660: p->ncol++;
661: }
662: ++*pend;
663: }
664:
1.5 schwarze 665: /*
666: * Advance the pend pointer to the next character in the charset.
667: * If the charset starts with a space, it stands for any whitespace.
668: * Update the new input file position, used for messages.
669: * Do not overrun the buffer b of length rlen.
670: * When reaching the end, NUL-terminate the buffer and return 1;
671: * otherwise, return 0.
672: */
673: static int
674: advance(struct parse *p, char *b, size_t rlen, size_t *pend,
1.14 schwarze 675: const char *charset, int refill)
1.5 schwarze 676: {
677: int space;
678:
679: if (*charset == ' ') {
680: space = 1;
681: charset++;
682: } else
683: space = 0;
684:
1.14 schwarze 685: if (refill) {
686: p->nline = p->line;
687: p->ncol = p->col;
688: }
1.5 schwarze 689: while (*pend < rlen) {
690: if (space && isspace((unsigned char)b[*pend]))
691: break;
692: if (strchr(charset, b[*pend]) != NULL)
693: break;
1.14 schwarze 694: increment(p, b, pend, refill);
1.5 schwarze 695: }
696: if (*pend == rlen) {
697: b[rlen] = '\0';
1.14 schwarze 698: return refill;
1.5 schwarze 699: } else
700: return 0;
701: }
702:
1.14 schwarze 703: size_t
704: parse_string(struct parse *p, char *b, size_t rlen,
705: enum pstate *pstate, int refill)
706: {
707: char *cp;
708: size_t poff; /* Parse offset in b[]. */
709: size_t pend; /* Offset of the end of the current word. */
710: int elem_end;
711:
1.16 schwarze 712: p->spc = 0;
1.14 schwarze 713: pend = 0;
714: for (;;) {
715:
716: /* Proceed to the next token, skipping whitespace. */
717:
718: if (refill) {
719: p->line = p->nline;
720: p->col = p->ncol;
721: }
722: if ((poff = pend) == rlen)
723: break;
724: if (isspace((unsigned char)b[pend])) {
1.16 schwarze 725: p->spc = 1;
1.14 schwarze 726: increment(p, b, &pend, refill);
727: continue;
728: }
729:
730: /*
731: * The following four cases (ARG, TAG, and starting an
732: * entity or a tag) all parse a word or quoted string.
733: * If that extends beyond the read buffer and the last
734: * read(2) still got data, they all break out of the
735: * token loop to request more data from the read loop.
736: *
737: * Also, three of them detect self-closing tags, those
738: * ending with "/>", setting the flag elem_end and
739: * calling xml_elem_end() at the very end, after
740: * handling the attribute value, attribute name, or
741: * tag name, respectively.
742: */
743:
744: /* Parse an attribute value. */
745:
746: if (*pstate >= PARSE_ARG) {
747: if (*pstate == PARSE_ARG &&
748: (b[pend] == '\'' || b[pend] == '"')) {
749: *pstate = b[pend] == '"' ?
750: PARSE_DQ : PARSE_SQ;
751: increment(p, b, &pend, refill);
752: continue;
753: }
754: if (advance(p, b, rlen, &pend,
755: *pstate == PARSE_DQ ? "\"" :
756: *pstate == PARSE_SQ ? "'" : " >", refill))
757: break;
758: *pstate = PARSE_TAG;
759: elem_end = 0;
760: if (b[pend] == '>') {
761: *pstate = PARSE_ELEM;
762: if (pend > 0 && b[pend - 1] == '/') {
763: b[pend - 1] = '\0';
764: elem_end = 1;
765: }
766: }
767: b[pend] = '\0';
768: if (pend < rlen)
769: increment(p, b, &pend, refill);
770: xml_attrval(p, b + poff);
771: if (elem_end)
772: xml_elem_end(p, NULL);
773:
774: /* Look for an attribute name. */
775:
776: } else if (*pstate == PARSE_TAG) {
777: if (advance(p, b, rlen, &pend, " =>", refill))
778: break;
779: elem_end = 0;
780: switch (b[pend]) {
781: case '>':
782: *pstate = PARSE_ELEM;
783: if (pend > 0 && b[pend - 1] == '/') {
784: b[pend - 1] = '\0';
785: elem_end = 1;
786: }
787: break;
788: case '=':
789: *pstate = PARSE_ARG;
790: break;
791: default:
792: break;
793: }
794: b[pend] = '\0';
795: if (pend < rlen)
796: increment(p, b, &pend, refill);
797: xml_attrkey(p, b + poff);
798: if (elem_end)
799: xml_elem_end(p, NULL);
800:
801: /* Begin an opening or closing tag. */
802:
803: } else if (b[poff] == '<') {
804: if (advance(p, b, rlen, &pend, " >", refill))
805: break;
806: if (pend > poff + 3 &&
807: strncmp(b + poff, "<!--", 4) == 0) {
808:
809: /* Skip a comment. */
810:
811: cp = strstr(b + pend - 2, "-->");
812: if (cp == NULL) {
813: if (refill)
814: break;
815: cp = b + rlen;
816: } else
817: cp += 3;
818: while (b + pend < cp)
819: increment(p, b, &pend, refill);
820: continue;
821: }
822: elem_end = 0;
823: if (b[pend] != '>')
824: *pstate = PARSE_TAG;
825: else if (pend > 0 && b[pend - 1] == '/') {
826: b[pend - 1] = '\0';
827: elem_end = 1;
828: }
829: b[pend] = '\0';
830: if (pend < rlen)
831: increment(p, b, &pend, refill);
832: if (b[++poff] == '/') {
833: elem_end = 1;
834: poff++;
835: } else
836: xml_elem_start(p, b + poff);
837: if (elem_end)
838: xml_elem_end(p, b + poff);
839:
840: /* Process an entity. */
841:
842: } else if (b[poff] == '&') {
843: if (advance(p, b, rlen, &pend, ";", refill))
844: break;
845: b[pend] = '\0';
846: if (pend < rlen)
847: increment(p, b, &pend, refill);
848: xml_entity(p, b + poff + 1);
849:
850: /* Process text up to the next tag, entity, or EOL. */
851:
852: } else {
853: advance(p, b, rlen, &pend, "<&", refill);
854: xml_char(p, b + poff, pend - poff);
855: }
856: }
857: return poff;
858: }
859:
1.1 schwarze 860: struct ptree *
861: parse_file(struct parse *p, int fd, const char *fname)
862: {
863: char b[4096];
1.5 schwarze 864: ssize_t rsz; /* Return value from read(2). */
1.14 schwarze 865: size_t rlen; /* Number of bytes in b[]. */
1.5 schwarze 866: size_t poff; /* Parse offset in b[]. */
1.14 schwarze 867: enum pstate pstate;
1.1 schwarze 868:
869: p->fname = fname;
1.5 schwarze 870: p->nline = 1;
871: p->ncol = 1;
1.14 schwarze 872: pstate = PARSE_ELEM;
1.5 schwarze 873: rlen = 0;
874:
875: /*
876: * Read loop.
877: *
1.14 schwarze 878: * If the previous token was incomplete and asked for more
879: * input, we have to enter the read loop once more even on EOF.
1.5 schwarze 880: * Once rsz is 0, incomplete tokens will no longer ask
881: * for more input but instead use whatever there is,
882: * and then exit the read loop.
883: * The minus one on the size limit for read(2) is needed
884: * such that advance() can set b[rlen] to NUL when needed.
885: */
886:
1.14 schwarze 887: while ((rsz = read(fd, b + rlen, sizeof(b) - rlen - 1)) >= 0 &&
888: (rlen += rsz) > 0) {
889: poff = parse_string(p, b, rlen, &pstate, rsz > 0);
1.5 schwarze 890: /* Buffer exhausted; shift left and re-fill. */
891: assert(poff > 0);
892: rlen -= poff;
1.14 schwarze 893: memmove(b, b + poff, rlen);
1.5 schwarze 894: }
895: if (rsz < 0) {
896: perror(fname);
897: p->tree->flags |= TREE_FAIL;
898: }
1.16 schwarze 899: pnode_closetext(p);
1.6 schwarze 900: if ((p->tree->flags & TREE_CLOSED) == 0)
901: warn_msg(p, "document not closed");
1.1 schwarze 902: return p->tree;
903: }
CVSweb