Annotation of docbook2mdoc/parse.c, Revision 1.14
1.14 ! schwarze 1: /* $Id: parse.c,v 1.13 2019/04/03 17:53:02 schwarze Exp $ */
1.1 schwarze 2: /*
3: * Copyright (c) 2014 Kristaps Dzonsons <kristaps@bsd.lv>
4: * Copyright (c) 2019 Ingo Schwarze <schwarze@openbsd.org>
5: *
6: * Permission to use, copy, modify, and distribute this software for any
7: * purpose with or without fee is hereby granted, provided that the above
8: * copyright notice and this permission notice appear in all copies.
9: *
10: * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
11: * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12: * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
13: * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14: * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15: * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16: * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17: */
18: #include <assert.h>
19: #include <ctype.h>
1.6 schwarze 20: #include <stdarg.h>
1.1 schwarze 21: #include <stdio.h>
1.5 schwarze 22: #include <stdlib.h>
1.1 schwarze 23: #include <string.h>
24: #include <unistd.h>
25:
26: #include "node.h"
27: #include "parse.h"
28:
29: /*
30: * The implementation of the DocBook parser.
31: */
32:
/*
 * Lexer state that parse_string() carries from one token to the next
 * and across buffer refills.
 * The order is significant: parse_string() treats every state that
 * compares greater than or equal to PARSE_ARG as "attribute value".
 */
enum pstate {
	PARSE_ELEM = 0,	/* Outside of any tag. */
	PARSE_TAG,	/* Inside a tag, looking for an attribute name. */
	PARSE_ARG,	/* After '=', looking for an attribute value. */
	PARSE_SQ,	/* Inside a single-quoted attribute value. */
	PARSE_DQ	/* Inside a double-quoted attribute value. */
};
! 40:
1.1 schwarze 41: /*
42: * Global parse state.
43: * Keep this as simple and small as possible.
44: */
45: struct parse {
46: const char *fname; /* Name of the input file. */
47: struct ptree *tree; /* Complete parse result. */
48: struct pnode *cur; /* Current node in the tree. */
1.5 schwarze 49: enum nodeid ncur; /* Type of the current node. */
50: int line; /* Line number in the input file. */
51: int col; /* Column number in the input file. */
52: int nline; /* Line number of next token. */
53: int ncol; /* Column number of next token. */
1.4 schwarze 54: int del; /* Levels of nested nodes being deleted. */
1.5 schwarze 55: int attr; /* The most recent attribute is valid. */
1.1 schwarze 56: int warn;
57: };
58:
59: struct element {
60: const char *name; /* DocBook element name. */
61: enum nodeid node; /* Node type to generate. */
62: };
63:
64: static const struct element elements[] = {
1.3 schwarze 65: { "acronym", NODE_IGNORE },
1.1 schwarze 66: { "affiliation", NODE_AFFILIATION },
1.4 schwarze 67: { "anchor", NODE_DELETE },
1.1 schwarze 68: { "application", NODE_APPLICATION },
69: { "arg", NODE_ARG },
70: { "author", NODE_AUTHOR },
71: { "authorgroup", NODE_AUTHORGROUP },
72: { "blockquote", NODE_BLOCKQUOTE },
73: { "book", NODE_BOOK },
74: { "bookinfo", NODE_BOOKINFO },
75: { "caution", NODE_CAUTION },
76: { "chapter", NODE_SECTION },
77: { "citerefentry", NODE_CITEREFENTRY },
78: { "citetitle", NODE_CITETITLE },
79: { "cmdsynopsis", NODE_CMDSYNOPSIS },
1.13 schwarze 80: { "code", NODE_LITERAL },
1.1 schwarze 81: { "colspec", NODE_COLSPEC },
82: { "command", NODE_COMMAND },
83: { "constant", NODE_CONSTANT },
1.7 schwarze 84: { "contrib", NODE_CONTRIB },
1.1 schwarze 85: { "copyright", NODE_COPYRIGHT },
86: { "date", NODE_DATE },
87: { "editor", NODE_EDITOR },
88: { "email", NODE_EMAIL },
89: { "emphasis", NODE_EMPHASIS },
90: { "entry", NODE_ENTRY },
91: { "envar", NODE_ENVAR },
1.13 schwarze 92: { "errorname", NODE_ERRORNAME },
1.1 schwarze 93: { "fieldsynopsis", NODE_FIELDSYNOPSIS },
94: { "filename", NODE_FILENAME },
1.7 schwarze 95: { "firstname", NODE_PERSONNAME },
1.1 schwarze 96: { "firstterm", NODE_FIRSTTERM },
97: { "footnote", NODE_FOOTNOTE },
98: { "funcdef", NODE_FUNCDEF },
99: { "funcprototype", NODE_FUNCPROTOTYPE },
100: { "funcsynopsis", NODE_FUNCSYNOPSIS },
101: { "funcsynopsisinfo", NODE_FUNCSYNOPSISINFO },
102: { "function", NODE_FUNCTION },
103: { "glossterm", NODE_GLOSSTERM },
104: { "group", NODE_GROUP },
105: { "holder", NODE_HOLDER },
106: { "index", NODE_INDEX },
1.4 schwarze 107: { "indexterm", NODE_DELETE },
1.1 schwarze 108: { "info", NODE_INFO },
109: { "informalequation", NODE_INFORMALEQUATION },
1.11 schwarze 110: { "informaltable", NODE_TABLE },
1.1 schwarze 111: { "inlineequation", NODE_INLINEEQUATION },
112: { "itemizedlist", NODE_ITEMIZEDLIST },
113: { "keysym", NODE_KEYSYM },
114: { "legalnotice", NODE_LEGALNOTICE },
115: { "link", NODE_LINK },
116: { "listitem", NODE_LISTITEM },
117: { "literal", NODE_LITERAL },
118: { "literallayout", NODE_LITERALLAYOUT },
119: { "manvolnum", NODE_MANVOLNUM },
120: { "member", NODE_MEMBER },
121: { "mml:math", NODE_MML_MATH },
122: { "mml:mfenced", NODE_MML_MFENCED },
123: { "mml:mfrac", NODE_MML_MFRAC },
124: { "mml:mi", NODE_MML_MI },
125: { "mml:mn", NODE_MML_MN },
126: { "mml:mo", NODE_MML_MO },
127: { "mml:mrow", NODE_MML_MROW },
128: { "mml:msub", NODE_MML_MSUB },
129: { "mml:msup", NODE_MML_MSUP },
130: { "modifier", NODE_MODIFIER },
131: { "note", NODE_NOTE },
132: { "option", NODE_OPTION },
133: { "orderedlist", NODE_ORDEREDLIST },
134: { "orgname", NODE_ORGNAME },
1.7 schwarze 135: { "othername", NODE_PERSONNAME },
1.1 schwarze 136: { "para", NODE_PARA },
137: { "paramdef", NODE_PARAMDEF },
138: { "parameter", NODE_PARAMETER },
139: { "part", NODE_SECTION },
140: { "personname", NODE_PERSONNAME },
1.3 schwarze 141: { "phrase", NODE_IGNORE },
1.1 schwarze 142: { "preface", NODE_PREFACE },
1.4 schwarze 143: { "primary", NODE_DELETE },
1.1 schwarze 144: { "programlisting", NODE_PROGRAMLISTING },
145: { "prompt", NODE_PROMPT },
146: { "quote", NODE_QUOTE },
147: { "refclass", NODE_REFCLASS },
148: { "refdescriptor", NODE_REFDESCRIPTOR },
149: { "refentry", NODE_REFENTRY },
150: { "refentryinfo", NODE_REFENTRYINFO },
151: { "refentrytitle", NODE_REFENTRYTITLE },
152: { "refmeta", NODE_REFMETA },
153: { "refmetainfo", NODE_REFMETAINFO },
154: { "refmiscinfo", NODE_REFMISCINFO },
155: { "refname", NODE_REFNAME },
156: { "refnamediv", NODE_REFNAMEDIV },
157: { "refpurpose", NODE_REFPURPOSE },
158: { "refsect1", NODE_SECTION },
159: { "refsect2", NODE_SECTION },
160: { "refsect3", NODE_SECTION },
161: { "refsection", NODE_SECTION },
162: { "refsynopsisdiv", NODE_REFSYNOPSISDIV },
163: { "releaseinfo", NODE_RELEASEINFO },
164: { "replaceable", NODE_REPLACEABLE },
165: { "row", NODE_ROW },
166: { "sbr", NODE_SBR },
167: { "screen", NODE_SCREEN },
1.4 schwarze 168: { "secondary", NODE_DELETE },
1.1 schwarze 169: { "sect1", NODE_SECTION },
170: { "sect2", NODE_SECTION },
171: { "section", NODE_SECTION },
172: { "sgmltag", NODE_SGMLTAG },
173: { "simplelist", NODE_SIMPLELIST },
174: { "spanspec", NODE_SPANSPEC },
1.13 schwarze 175: { "structfield", NODE_PARAMETER },
176: { "structname", NODE_TYPE },
1.1 schwarze 177: { "subtitle", NODE_SUBTITLE },
1.7 schwarze 178: { "surname", NODE_PERSONNAME },
1.12 schwarze 179: { "symbol", NODE_CONSTANT },
1.1 schwarze 180: { "synopsis", NODE_SYNOPSIS },
181: { "table", NODE_TABLE },
182: { "tbody", NODE_TBODY },
183: { "term", NODE_TERM },
184: { "tfoot", NODE_TFOOT },
185: { "tgroup", NODE_TGROUP },
186: { "thead", NODE_THEAD },
187: { "tip", NODE_TIP },
188: { "title", NODE_TITLE },
1.3 schwarze 189: { "trademark", NODE_IGNORE },
1.1 schwarze 190: { "type", NODE_TYPE },
191: { "ulink", NODE_ULINK },
1.13 schwarze 192: { "userinput", NODE_LITERAL },
1.1 schwarze 193: { "variablelist", NODE_VARIABLELIST },
194: { "varlistentry", NODE_VARLISTENTRY },
195: { "varname", NODE_VARNAME },
196: { "warning", NODE_WARNING },
197: { "wordasword", NODE_WORDASWORD },
1.4 schwarze 198: { "xi:include", NODE_DELETE_WARN },
1.1 schwarze 199: { "year", NODE_YEAR },
1.5 schwarze 200: { NULL, NODE_IGNORE }
1.1 schwarze 201: };
202:
/*
 * One row of the entity table: maps an XML character entity name
 * to the roff text that is emitted in its place.
 */
struct entity {
	const char	*name;	/* Entity name, without '&' and ';'. */
	const char	*roff;	/* Replacement text for the output. */
};
207:
208: /*
209: * XML character entity references found in the wild.
210: * Those that don't have an exact mandoc_char(7) representation
211: * are approximated, and the desired codepoint is given as a comment.
212: * Encoding them as \\[u...] would leave -Tascii out in the cold.
213: */
214: static const struct entity entities[] = {
215: { "alpha", "\\(*a" },
216: { "amp", "&" },
217: { "apos", "'" },
218: { "auml", "\\(:a" },
219: { "beta", "\\(*b" },
220: { "circ", "^" }, /* U+02C6 */
221: { "copy", "\\(co" },
222: { "dagger", "\\(dg" },
223: { "Delta", "\\(*D" },
224: { "eacute", "\\('e" },
225: { "emsp", "\\ " }, /* U+2003 */
226: { "gt", ">" },
227: { "hairsp", "\\^" },
228: { "kappa", "\\(*k" },
229: { "larr", "\\(<-" },
230: { "ldquo", "\\(lq" },
231: { "le", "\\(<=" },
232: { "lowbar", "_" },
233: { "lsqb", "[" },
234: { "lt", "<" },
235: { "mdash", "\\(em" },
236: { "minus", "\\-" },
237: { "ndash", "\\(en" },
238: { "nbsp", "\\ " },
239: { "num", "#" },
240: { "oslash", "\\(/o" },
241: { "ouml", "\\(:o" },
242: { "percnt", "%" },
243: { "quot", "\\(dq" },
244: { "rarr", "\\(->" },
245: { "rArr", "\\(rA" },
246: { "rdquo", "\\(rq" },
247: { "reg", "\\(rg" },
248: { "rho", "\\(*r" },
249: { "rsqb", "]" },
250: { "sigma", "\\(*s" },
251: { "shy", "\\&" }, /* U+00AD */
252: { "tau", "\\(*t" },
253: { "tilde", "\\[u02DC]" },
254: { "times", "\\[tmu]" },
255: { "uuml", "\\(:u" },
256: { NULL, NULL }
257: };
258:
1.6 schwarze 259: static void
260: error_msg(struct parse *p, const char *fmt, ...)
261: {
262: va_list ap;
263:
264: fprintf(stderr, "%s:%d:%d: ", p->fname, p->line, p->col);
265: va_start(ap, fmt);
266: vfprintf(stderr, fmt, ap);
267: va_end(ap);
268: fputc('\n', stderr);
269: p->tree->flags |= TREE_FAIL;
270: }
271:
272: static void
273: warn_msg(struct parse *p, const char *fmt, ...)
274: {
275: va_list ap;
276:
277: if (p->warn == 0)
278: return;
279:
280: fprintf(stderr, "%s:%d:%d: warning: ", p->fname, p->line, p->col);
281: va_start(ap, fmt);
282: vfprintf(stderr, fmt, ap);
283: va_end(ap);
284: fputc('\n', stderr);
285: }
286:
1.1 schwarze 287: /*
288: * Process a string of characters.
289: * If a text node is already open, append to it.
290: * Otherwise, create a new one as a child of the current node.
291: */
292: static void
1.5 schwarze 293: xml_char(struct parse *ps, const char *p, int sz)
1.1 schwarze 294: {
295: struct pnode *dat;
296:
1.5 schwarze 297: if (ps->del > 0)
1.1 schwarze 298: return;
299:
1.5 schwarze 300: if (ps->cur == NULL) {
1.6 schwarze 301: error_msg(ps, "discarding text before document: %.*s", sz, p);
1.5 schwarze 302: return;
303: }
304:
1.1 schwarze 305: if (ps->cur->node != NODE_TEXT) {
306: if ((dat = calloc(1, sizeof(*dat))) == NULL) {
307: perror(NULL);
308: exit(1);
309: }
310: dat->node = NODE_TEXT;
311: dat->parent = ps->cur;
312: TAILQ_INIT(&dat->childq);
313: TAILQ_INIT(&dat->attrq);
314: TAILQ_INSERT_TAIL(&ps->cur->childq, dat, child);
315: ps->cur = dat;
316: }
317:
1.5 schwarze 318: if (ps->tree->flags & TREE_CLOSED &&
1.6 schwarze 319: ps->cur->parent == ps->tree->root)
320: warn_msg(ps, "text after end of document: %.*s", sz, p);
1.5 schwarze 321:
1.1 schwarze 322: /* Append to the current text node. */
323:
324: assert(sz >= 0);
325: ps->cur->b = realloc(ps->cur->b, ps->cur->bsz + sz + 1);
326: if (ps->cur->b == NULL) {
327: perror(NULL);
328: exit(1);
329: }
330: memcpy(ps->cur->b + ps->cur->bsz, p, sz);
331: ps->cur->bsz += sz;
332: ps->cur->b[ps->cur->bsz] = '\0';
333: ps->cur->real = ps->cur->b;
334: }
335:
336: static void
337: pnode_trim(struct pnode *pn)
338: {
339: assert(pn->node == NODE_TEXT);
340: for (; pn->bsz > 0; pn->b[--pn->bsz] = '\0')
341: if (isspace((unsigned char)pn->b[pn->bsz - 1]) == 0)
342: break;
343: }
344:
1.9 schwarze 345: static void
346: xml_entity(struct parse *p, const char *name)
347: {
348: const struct entity *entity;
349: struct pnode *dat;
350:
351: if (p->del > 0)
352: return;
353:
354: if (p->cur == NULL) {
355: error_msg(p, "discarding entity before document: &%s;", name);
356: return;
357: }
358:
359: /* Close out the text node, if there is one. */
360: if (p->cur->node == NODE_TEXT) {
361: pnode_trim(p->cur);
362: p->cur = p->cur->parent;
363: }
364:
365: if (p->tree->flags & TREE_CLOSED && p->cur == p->tree->root)
366: warn_msg(p, "entity after end of document: &%s;", name);
367:
368: for (entity = entities; entity->name != NULL; entity++)
369: if (strcmp(name, entity->name) == 0)
370: break;
371:
372: if (entity->roff == NULL) {
373: error_msg(p, "unknown entity &%s;", name);
374: return;
375: }
376:
377: /* Create, append, and close out an entity node. */
378: if ((dat = calloc(1, sizeof(*dat))) == NULL ||
379: (dat->b = dat->real = strdup(entity->roff)) == NULL) {
380: perror(NULL);
381: exit(1);
382: }
383: dat->node = NODE_ESCAPE;
384: dat->bsz = strlen(dat->b);
385: dat->parent = p->cur;
386: TAILQ_INIT(&dat->childq);
387: TAILQ_INIT(&dat->attrq);
388: TAILQ_INSERT_TAIL(&p->cur->childq, dat, child);
389: }
390:
1.1 schwarze 391: /*
392: * Begin an element.
393: */
394: static void
1.5 schwarze 395: xml_elem_start(struct parse *ps, const char *name)
1.1 schwarze 396: {
1.5 schwarze 397: const struct element *elem;
398: struct pnode *dat;
1.1 schwarze 399:
1.5 schwarze 400: if (*name == '!' || *name == '?')
1.1 schwarze 401: return;
402:
1.4 schwarze 403: /*
404: * An ancestor is excluded from the tree;
405: * keep track of the number of levels excluded.
406: */
407: if (ps->del > 0) {
408: ps->del++;
409: return;
410: }
411:
1.1 schwarze 412: /* Close out the text node, if there is one. */
413: if (ps->cur != NULL && ps->cur->node == NODE_TEXT) {
414: pnode_trim(ps->cur);
415: ps->cur = ps->cur->parent;
416: }
417:
418: for (elem = elements; elem->name != NULL; elem++)
419: if (strcmp(elem->name, name) == 0)
420: break;
421:
1.6 schwarze 422: if (elem->name == NULL)
423: error_msg(ps, "unknown element <%s>", name);
424:
1.5 schwarze 425: ps->ncur = elem->node;
1.1 schwarze 426:
1.5 schwarze 427: switch (ps->ncur) {
1.4 schwarze 428: case NODE_DELETE_WARN:
1.6 schwarze 429: warn_msg(ps, "skipping element <%s>", name);
1.2 schwarze 430: /* FALLTHROUGH */
1.4 schwarze 431: case NODE_DELETE:
432: ps->del = 1;
433: /* FALLTHROUGH */
1.2 schwarze 434: case NODE_IGNORE:
435: return;
436: case NODE_INLINEEQUATION:
1.1 schwarze 437: ps->tree->flags |= TREE_EQN;
1.2 schwarze 438: break;
439: default:
440: break;
441: }
1.1 schwarze 442:
1.6 schwarze 443: if (ps->tree->flags & TREE_CLOSED && ps->cur->parent == NULL)
444: warn_msg(ps, "element after end of document: <%s>", name);
1.5 schwarze 445:
1.1 schwarze 446: if ((dat = calloc(1, sizeof(*dat))) == NULL) {
447: perror(NULL);
448: exit(1);
449: }
450: dat->node = elem->node;
451: dat->parent = ps->cur;
452: TAILQ_INIT(&dat->childq);
453: TAILQ_INIT(&dat->attrq);
454:
455: if (ps->cur != NULL)
456: TAILQ_INSERT_TAIL(&ps->cur->childq, dat, child);
457:
458: ps->cur = dat;
459: if (ps->tree->root == NULL)
460: ps->tree->root = dat;
1.5 schwarze 461: }
462:
463: static void
464: xml_attrkey(struct parse *ps, const char *name)
465: {
466: struct pattr *attr;
467: enum attrkey key;
1.1 schwarze 468:
1.5 schwarze 469: if (ps->del > 0 || *name == '\0')
470: return;
471: if ((key = attrkey_parse(name)) == ATTRKEY__MAX) {
472: ps->attr = 0;
473: return;
474: }
475: if ((attr = calloc(1, sizeof(*attr))) == NULL) {
476: perror(NULL);
477: exit(1);
478: }
479: attr->key = key;
480: attr->val = ATTRVAL__MAX;
481: attr->rawval = NULL;
482: TAILQ_INSERT_TAIL(&ps->cur->attrq, attr, child);
483: ps->attr = 1;
484: }
485:
486: static void
487: xml_attrval(struct parse *ps, const char *name)
488: {
489: struct pattr *attr;
490:
491: if (ps->del > 0 || ps->attr == 0)
492: return;
493: if ((attr = TAILQ_LAST(&ps->cur->attrq, pattrq)) == NULL)
494: return;
495: if ((attr->val = attrval_parse(name)) == ATTRVAL__MAX &&
496: (attr->rawval = strdup(name)) == NULL) {
497: perror(NULL);
498: exit(1);
1.1 schwarze 499: }
500: }
501:
502: /*
503: * Roll up the parse tree.
504: * If we're at a text node, roll that one up first.
505: */
506: static void
1.5 schwarze 507: xml_elem_end(struct parse *ps, const char *name)
1.1 schwarze 508: {
1.5 schwarze 509: const struct element *elem;
510: enum nodeid node;
1.1 schwarze 511:
1.4 schwarze 512: /*
513: * An ancestor is excluded from the tree;
514: * keep track of the number of levels excluded.
515: */
516: if (ps->del > 1) {
517: ps->del--;
518: return;
519: }
520:
1.1 schwarze 521: /* Close out the text node, if there is one. */
1.5 schwarze 522: if (ps->del == 0 && ps->cur != NULL && ps->cur->node == NODE_TEXT) {
1.1 schwarze 523: pnode_trim(ps->cur);
524: ps->cur = ps->cur->parent;
525: }
1.2 schwarze 526:
1.5 schwarze 527: if (name != NULL) {
528: for (elem = elements; elem->name != NULL; elem++)
529: if (strcmp(elem->name, name) == 0)
530: break;
531: node = elem->node;
532: } else
533: node = ps->ncur;
1.2 schwarze 534:
1.5 schwarze 535: switch (node) {
1.4 schwarze 536: case NODE_DELETE_WARN:
537: case NODE_DELETE:
1.5 schwarze 538: if (ps->del > 0)
539: ps->del--;
1.4 schwarze 540: break;
1.2 schwarze 541: case NODE_IGNORE:
542: break;
543: default:
1.5 schwarze 544: if (ps->cur == NULL || node != ps->cur->node) {
1.6 schwarze 545: warn_msg(ps, "element not open: </%s>", name);
1.5 schwarze 546: break;
547: }
548:
549: /*
550: * Refrain from actually closing the document element.
551: * If no more content follows, no harm is done, but if
552: * some content still follows, simply processing it is
553: * obviously better than discarding it or crashing.
554: */
555:
556: if (ps->cur->parent == NULL)
557: ps->tree->flags |= TREE_CLOSED;
558: else
559: ps->cur = ps->cur->parent;
1.4 schwarze 560: break;
1.2 schwarze 561: }
1.4 schwarze 562: assert(ps->del == 0);
1.1 schwarze 563: }
564:
565: struct parse *
566: parse_alloc(int warn)
567: {
568: struct parse *p;
569:
570: if ((p = calloc(1, sizeof(*p))) == NULL)
571: return NULL;
572:
573: if ((p->tree = calloc(1, sizeof(*p->tree))) == NULL) {
574: free(p);
575: return NULL;
576: }
577: p->warn = warn;
578: return p;
579: }
580:
581: void
582: parse_free(struct parse *p)
583: {
584: if (p == NULL)
585: return;
586: if (p->tree != NULL) {
587: pnode_unlink(p->tree->root);
588: free(p->tree);
589: }
590: free(p);
591: }
592:
1.14 ! schwarze 593: static void
! 594: increment(struct parse *p, char *b, size_t *pend, int refill)
! 595: {
! 596: if (refill) {
! 597: if (b[*pend] == '\n') {
! 598: p->nline++;
! 599: p->ncol = 1;
! 600: } else
! 601: p->ncol++;
! 602: }
! 603: ++*pend;
! 604: }
! 605:
1.5 schwarze 606: /*
607: * Advance the pend pointer to the next character in the charset.
608: * If the charset starts with a space, it stands for any whitespace.
609: * Update the new input file position, used for messages.
610: * Do not overrun the buffer b of length rlen.
611: * When reaching the end, NUL-terminate the buffer and return 1;
612: * otherwise, return 0.
613: */
614: static int
615: advance(struct parse *p, char *b, size_t rlen, size_t *pend,
1.14 ! schwarze 616: const char *charset, int refill)
1.5 schwarze 617: {
618: int space;
619:
620: if (*charset == ' ') {
621: space = 1;
622: charset++;
623: } else
624: space = 0;
625:
1.14 ! schwarze 626: if (refill) {
! 627: p->nline = p->line;
! 628: p->ncol = p->col;
! 629: }
1.5 schwarze 630: while (*pend < rlen) {
631: if (space && isspace((unsigned char)b[*pend]))
632: break;
633: if (strchr(charset, b[*pend]) != NULL)
634: break;
1.14 ! schwarze 635: increment(p, b, pend, refill);
1.5 schwarze 636: }
637: if (*pend == rlen) {
638: b[rlen] = '\0';
1.14 ! schwarze 639: return refill;
1.5 schwarze 640: } else
641: return 0;
642: }
643:
1.14 ! schwarze 644: size_t
! 645: parse_string(struct parse *p, char *b, size_t rlen,
! 646: enum pstate *pstate, int refill)
! 647: {
! 648: char *cp;
! 649: size_t poff; /* Parse offset in b[]. */
! 650: size_t pend; /* Offset of the end of the current word. */
! 651: int elem_end;
! 652:
! 653: pend = 0;
! 654: for (;;) {
! 655:
! 656: /* Proceed to the next token, skipping whitespace. */
! 657:
! 658: if (refill) {
! 659: p->line = p->nline;
! 660: p->col = p->ncol;
! 661: }
! 662: if ((poff = pend) == rlen)
! 663: break;
! 664: if (isspace((unsigned char)b[pend])) {
! 665: increment(p, b, &pend, refill);
! 666: continue;
! 667: }
! 668:
! 669: /*
! 670: * The following four cases (ARG, TAG, and starting an
! 671: * entity or a tag) all parse a word or quoted string.
! 672: * If that extends beyond the read buffer and the last
! 673: * read(2) still got data, they all break out of the
! 674: * token loop to request more data from the read loop.
! 675: *
! 676: * Also, three of them detect self-closing tags, those
! 677: * ending with "/>", setting the flag elem_end and
! 678: * calling xml_elem_end() at the very end, after
! 679: * handling the attribute value, attribute name, or
! 680: * tag name, respectively.
! 681: */
! 682:
! 683: /* Parse an attribute value. */
! 684:
! 685: if (*pstate >= PARSE_ARG) {
! 686: if (*pstate == PARSE_ARG &&
! 687: (b[pend] == '\'' || b[pend] == '"')) {
! 688: *pstate = b[pend] == '"' ?
! 689: PARSE_DQ : PARSE_SQ;
! 690: increment(p, b, &pend, refill);
! 691: continue;
! 692: }
! 693: if (advance(p, b, rlen, &pend,
! 694: *pstate == PARSE_DQ ? "\"" :
! 695: *pstate == PARSE_SQ ? "'" : " >", refill))
! 696: break;
! 697: *pstate = PARSE_TAG;
! 698: elem_end = 0;
! 699: if (b[pend] == '>') {
! 700: *pstate = PARSE_ELEM;
! 701: if (pend > 0 && b[pend - 1] == '/') {
! 702: b[pend - 1] = '\0';
! 703: elem_end = 1;
! 704: }
! 705: }
! 706: b[pend] = '\0';
! 707: if (pend < rlen)
! 708: increment(p, b, &pend, refill);
! 709: xml_attrval(p, b + poff);
! 710: if (elem_end)
! 711: xml_elem_end(p, NULL);
! 712:
! 713: /* Look for an attribute name. */
! 714:
! 715: } else if (*pstate == PARSE_TAG) {
! 716: if (advance(p, b, rlen, &pend, " =>", refill))
! 717: break;
! 718: elem_end = 0;
! 719: switch (b[pend]) {
! 720: case '>':
! 721: *pstate = PARSE_ELEM;
! 722: if (pend > 0 && b[pend - 1] == '/') {
! 723: b[pend - 1] = '\0';
! 724: elem_end = 1;
! 725: }
! 726: break;
! 727: case '=':
! 728: *pstate = PARSE_ARG;
! 729: break;
! 730: default:
! 731: break;
! 732: }
! 733: b[pend] = '\0';
! 734: if (pend < rlen)
! 735: increment(p, b, &pend, refill);
! 736: xml_attrkey(p, b + poff);
! 737: if (elem_end)
! 738: xml_elem_end(p, NULL);
! 739:
! 740: /* Begin an opening or closing tag. */
! 741:
! 742: } else if (b[poff] == '<') {
! 743: if (advance(p, b, rlen, &pend, " >", refill))
! 744: break;
! 745: if (pend > poff + 3 &&
! 746: strncmp(b + poff, "<!--", 4) == 0) {
! 747:
! 748: /* Skip a comment. */
! 749:
! 750: cp = strstr(b + pend - 2, "-->");
! 751: if (cp == NULL) {
! 752: if (refill)
! 753: break;
! 754: cp = b + rlen;
! 755: } else
! 756: cp += 3;
! 757: while (b + pend < cp)
! 758: increment(p, b, &pend, refill);
! 759: continue;
! 760: }
! 761: elem_end = 0;
! 762: if (b[pend] != '>')
! 763: *pstate = PARSE_TAG;
! 764: else if (pend > 0 && b[pend - 1] == '/') {
! 765: b[pend - 1] = '\0';
! 766: elem_end = 1;
! 767: }
! 768: b[pend] = '\0';
! 769: if (pend < rlen)
! 770: increment(p, b, &pend, refill);
! 771: if (b[++poff] == '/') {
! 772: elem_end = 1;
! 773: poff++;
! 774: } else
! 775: xml_elem_start(p, b + poff);
! 776: if (elem_end)
! 777: xml_elem_end(p, b + poff);
! 778:
! 779: /* Process an entity. */
! 780:
! 781: } else if (b[poff] == '&') {
! 782: if (advance(p, b, rlen, &pend, ";", refill))
! 783: break;
! 784: b[pend] = '\0';
! 785: if (pend < rlen)
! 786: increment(p, b, &pend, refill);
! 787: xml_entity(p, b + poff + 1);
! 788:
! 789: /* Process text up to the next tag, entity, or EOL. */
! 790:
! 791: } else {
! 792: advance(p, b, rlen, &pend, "<&", refill);
! 793: xml_char(p, b + poff, pend - poff);
! 794: }
! 795: }
! 796: return poff;
! 797: }
! 798:
1.1 schwarze 799: struct ptree *
800: parse_file(struct parse *p, int fd, const char *fname)
801: {
802: char b[4096];
1.5 schwarze 803: ssize_t rsz; /* Return value from read(2). */
1.14 ! schwarze 804: size_t rlen; /* Number of bytes in b[]. */
1.5 schwarze 805: size_t poff; /* Parse offset in b[]. */
1.14 ! schwarze 806: enum pstate pstate;
1.1 schwarze 807:
808: p->fname = fname;
1.5 schwarze 809: p->nline = 1;
810: p->ncol = 1;
1.14 ! schwarze 811: pstate = PARSE_ELEM;
1.5 schwarze 812: rlen = 0;
813:
814: /*
815: * Read loop.
816: *
1.14 ! schwarze 817: * If the previous token was incomplete and asked for more
! 818: * input, we have to enter the read loop once more even on EOF.
1.5 schwarze 819: * Once rsz is 0, incomplete tokens will no longer ask
820: * for more input but instead use whatever there is,
821: * and then exit the read loop.
822: * The minus one on the size limit for read(2) is needed
823: * such that advance() can set b[rlen] to NUL when needed.
824: */
825:
1.14 ! schwarze 826: while ((rsz = read(fd, b + rlen, sizeof(b) - rlen - 1)) >= 0 &&
! 827: (rlen += rsz) > 0) {
! 828: poff = parse_string(p, b, rlen, &pstate, rsz > 0);
1.5 schwarze 829: /* Buffer exhausted; shift left and re-fill. */
830: assert(poff > 0);
831: rlen -= poff;
1.14 ! schwarze 832: memmove(b, b + poff, rlen);
1.5 schwarze 833: }
834: if (rsz < 0) {
835: perror(fname);
836: p->tree->flags |= TREE_FAIL;
837: }
838: if (p->cur != NULL && p->cur->node == NODE_TEXT) {
839: pnode_trim(p->cur);
840: p->cur = p->cur->parent;
841: }
1.6 schwarze 842: if ((p->tree->flags & TREE_CLOSED) == 0)
843: warn_msg(p, "document not closed");
1.1 schwarze 844: return p->tree;
845: }
CVSweb