Annotation of docbook2mdoc/parse.c, Revision 1.13
1.13 ! schwarze 1: /* $Id: parse.c,v 1.12 2019/04/03 16:08:57 schwarze Exp $ */
1.1 schwarze 2: /*
3: * Copyright (c) 2014 Kristaps Dzonsons <kristaps@bsd.lv>
4: * Copyright (c) 2019 Ingo Schwarze <schwarze@openbsd.org>
5: *
6: * Permission to use, copy, modify, and distribute this software for any
7: * purpose with or without fee is hereby granted, provided that the above
8: * copyright notice and this permission notice appear in all copies.
9: *
10: * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
11: * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12: * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
13: * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14: * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15: * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16: * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17: */
18: #include <assert.h>
19: #include <ctype.h>
1.6 schwarze 20: #include <stdarg.h>
1.1 schwarze 21: #include <stdio.h>
1.5 schwarze 22: #include <stdlib.h>
1.1 schwarze 23: #include <string.h>
24: #include <unistd.h>
25:
26: #include "node.h"
27: #include "parse.h"
28:
29: /*
30: * The implementation of the DocBook parser.
31: */
32:
33: /*
34: * Global parse state.
35: * Keep this as simple and small as possible.
36: */
37: struct parse {
38: const char *fname; /* Name of the input file. */
39: struct ptree *tree; /* Complete parse result. */
40: struct pnode *cur; /* Current node in the tree. */
1.5 schwarze 41: enum nodeid ncur; /* Type of the current node. */
42: int line; /* Line number in the input file. */
43: int col; /* Column number in the input file. */
44: int nline; /* Line number of next token. */
45: int ncol; /* Column number of next token. */
1.4 schwarze 46: int del; /* Levels of nested nodes being deleted. */
1.5 schwarze 47: int attr; /* The most recent attribute is valid. */
1.1 schwarze 48: int warn;
49: };
50:
51: struct element {
52: const char *name; /* DocBook element name. */
53: enum nodeid node; /* Node type to generate. */
54: };
55:
56: static const struct element elements[] = {
1.3 schwarze 57: { "acronym", NODE_IGNORE },
1.1 schwarze 58: { "affiliation", NODE_AFFILIATION },
1.4 schwarze 59: { "anchor", NODE_DELETE },
1.1 schwarze 60: { "application", NODE_APPLICATION },
61: { "arg", NODE_ARG },
62: { "author", NODE_AUTHOR },
63: { "authorgroup", NODE_AUTHORGROUP },
64: { "blockquote", NODE_BLOCKQUOTE },
65: { "book", NODE_BOOK },
66: { "bookinfo", NODE_BOOKINFO },
67: { "caution", NODE_CAUTION },
68: { "chapter", NODE_SECTION },
69: { "citerefentry", NODE_CITEREFENTRY },
70: { "citetitle", NODE_CITETITLE },
71: { "cmdsynopsis", NODE_CMDSYNOPSIS },
1.13 ! schwarze 72: { "code", NODE_LITERAL },
1.1 schwarze 73: { "colspec", NODE_COLSPEC },
74: { "command", NODE_COMMAND },
75: { "constant", NODE_CONSTANT },
1.7 schwarze 76: { "contrib", NODE_CONTRIB },
1.1 schwarze 77: { "copyright", NODE_COPYRIGHT },
78: { "date", NODE_DATE },
79: { "editor", NODE_EDITOR },
80: { "email", NODE_EMAIL },
81: { "emphasis", NODE_EMPHASIS },
82: { "entry", NODE_ENTRY },
83: { "envar", NODE_ENVAR },
1.13 ! schwarze 84: { "errorname", NODE_ERRORNAME },
1.1 schwarze 85: { "fieldsynopsis", NODE_FIELDSYNOPSIS },
86: { "filename", NODE_FILENAME },
1.7 schwarze 87: { "firstname", NODE_PERSONNAME },
1.1 schwarze 88: { "firstterm", NODE_FIRSTTERM },
89: { "footnote", NODE_FOOTNOTE },
90: { "funcdef", NODE_FUNCDEF },
91: { "funcprototype", NODE_FUNCPROTOTYPE },
92: { "funcsynopsis", NODE_FUNCSYNOPSIS },
93: { "funcsynopsisinfo", NODE_FUNCSYNOPSISINFO },
94: { "function", NODE_FUNCTION },
95: { "glossterm", NODE_GLOSSTERM },
96: { "group", NODE_GROUP },
97: { "holder", NODE_HOLDER },
98: { "index", NODE_INDEX },
1.4 schwarze 99: { "indexterm", NODE_DELETE },
1.1 schwarze 100: { "info", NODE_INFO },
101: { "informalequation", NODE_INFORMALEQUATION },
1.11 schwarze 102: { "informaltable", NODE_TABLE },
1.1 schwarze 103: { "inlineequation", NODE_INLINEEQUATION },
104: { "itemizedlist", NODE_ITEMIZEDLIST },
105: { "keysym", NODE_KEYSYM },
106: { "legalnotice", NODE_LEGALNOTICE },
107: { "link", NODE_LINK },
108: { "listitem", NODE_LISTITEM },
109: { "literal", NODE_LITERAL },
110: { "literallayout", NODE_LITERALLAYOUT },
111: { "manvolnum", NODE_MANVOLNUM },
112: { "member", NODE_MEMBER },
113: { "mml:math", NODE_MML_MATH },
114: { "mml:mfenced", NODE_MML_MFENCED },
115: { "mml:mfrac", NODE_MML_MFRAC },
116: { "mml:mi", NODE_MML_MI },
117: { "mml:mn", NODE_MML_MN },
118: { "mml:mo", NODE_MML_MO },
119: { "mml:mrow", NODE_MML_MROW },
120: { "mml:msub", NODE_MML_MSUB },
121: { "mml:msup", NODE_MML_MSUP },
122: { "modifier", NODE_MODIFIER },
123: { "note", NODE_NOTE },
124: { "option", NODE_OPTION },
125: { "orderedlist", NODE_ORDEREDLIST },
126: { "orgname", NODE_ORGNAME },
1.7 schwarze 127: { "othername", NODE_PERSONNAME },
1.1 schwarze 128: { "para", NODE_PARA },
129: { "paramdef", NODE_PARAMDEF },
130: { "parameter", NODE_PARAMETER },
131: { "part", NODE_SECTION },
132: { "personname", NODE_PERSONNAME },
1.3 schwarze 133: { "phrase", NODE_IGNORE },
1.1 schwarze 134: { "preface", NODE_PREFACE },
1.4 schwarze 135: { "primary", NODE_DELETE },
1.1 schwarze 136: { "programlisting", NODE_PROGRAMLISTING },
137: { "prompt", NODE_PROMPT },
138: { "quote", NODE_QUOTE },
139: { "refclass", NODE_REFCLASS },
140: { "refdescriptor", NODE_REFDESCRIPTOR },
141: { "refentry", NODE_REFENTRY },
142: { "refentryinfo", NODE_REFENTRYINFO },
143: { "refentrytitle", NODE_REFENTRYTITLE },
144: { "refmeta", NODE_REFMETA },
145: { "refmetainfo", NODE_REFMETAINFO },
146: { "refmiscinfo", NODE_REFMISCINFO },
147: { "refname", NODE_REFNAME },
148: { "refnamediv", NODE_REFNAMEDIV },
149: { "refpurpose", NODE_REFPURPOSE },
150: { "refsect1", NODE_SECTION },
151: { "refsect2", NODE_SECTION },
152: { "refsect3", NODE_SECTION },
153: { "refsection", NODE_SECTION },
154: { "refsynopsisdiv", NODE_REFSYNOPSISDIV },
155: { "releaseinfo", NODE_RELEASEINFO },
156: { "replaceable", NODE_REPLACEABLE },
157: { "row", NODE_ROW },
158: { "sbr", NODE_SBR },
159: { "screen", NODE_SCREEN },
1.4 schwarze 160: { "secondary", NODE_DELETE },
1.1 schwarze 161: { "sect1", NODE_SECTION },
162: { "sect2", NODE_SECTION },
163: { "section", NODE_SECTION },
164: { "sgmltag", NODE_SGMLTAG },
165: { "simplelist", NODE_SIMPLELIST },
166: { "spanspec", NODE_SPANSPEC },
1.13 ! schwarze 167: { "structfield", NODE_PARAMETER },
! 168: { "structname", NODE_TYPE },
1.1 schwarze 169: { "subtitle", NODE_SUBTITLE },
1.7 schwarze 170: { "surname", NODE_PERSONNAME },
1.12 schwarze 171: { "symbol", NODE_CONSTANT },
1.1 schwarze 172: { "synopsis", NODE_SYNOPSIS },
173: { "table", NODE_TABLE },
174: { "tbody", NODE_TBODY },
175: { "term", NODE_TERM },
176: { "tfoot", NODE_TFOOT },
177: { "tgroup", NODE_TGROUP },
178: { "thead", NODE_THEAD },
179: { "tip", NODE_TIP },
180: { "title", NODE_TITLE },
1.3 schwarze 181: { "trademark", NODE_IGNORE },
1.1 schwarze 182: { "type", NODE_TYPE },
183: { "ulink", NODE_ULINK },
1.13 ! schwarze 184: { "userinput", NODE_LITERAL },
1.1 schwarze 185: { "variablelist", NODE_VARIABLELIST },
186: { "varlistentry", NODE_VARLISTENTRY },
187: { "varname", NODE_VARNAME },
188: { "warning", NODE_WARNING },
189: { "wordasword", NODE_WORDASWORD },
1.4 schwarze 190: { "xi:include", NODE_DELETE_WARN },
1.1 schwarze 191: { "year", NODE_YEAR },
1.5 schwarze 192: { NULL, NODE_IGNORE }
1.1 schwarze 193: };
194:
/*
 * One row of the entity lookup table:
 * maps an XML entity name (without the '&' and ';') to roff output.
 */
struct entity {
	const char	*name;	/* Entity name as written in the input. */
	const char	*roff;	/* Replacement text emitted for it. */
};

/*
 * XML character entity references found in the wild.
 * Those that don't have an exact mandoc_char(7) representation
 * are approximated, and the desired codepoint is given as a comment.
 * Encoding them as \\[u...] would leave -Tascii out in the cold.
 * The table is searched linearly and ends with an all-NULL entry.
 */
static const struct entity entities[] = {
	{ .name = "alpha",	.roff = "\\(*a" },
	{ .name = "amp",	.roff = "&" },
	{ .name = "apos",	.roff = "'" },
	{ .name = "auml",	.roff = "\\(:a" },
	{ .name = "beta",	.roff = "\\(*b" },
	{ .name = "circ",	.roff = "^" },		/* U+02C6 */
	{ .name = "copy",	.roff = "\\(co" },
	{ .name = "dagger",	.roff = "\\(dg" },
	{ .name = "Delta",	.roff = "\\(*D" },
	{ .name = "eacute",	.roff = "\\('e" },
	{ .name = "emsp",	.roff = "\\ " },	/* U+2003 */
	{ .name = "gt",		.roff = ">" },
	{ .name = "hairsp",	.roff = "\\^" },
	{ .name = "kappa",	.roff = "\\(*k" },
	{ .name = "larr",	.roff = "\\(<-" },
	{ .name = "ldquo",	.roff = "\\(lq" },
	{ .name = "le",		.roff = "\\(<=" },
	{ .name = "lowbar",	.roff = "_" },
	{ .name = "lsqb",	.roff = "[" },
	{ .name = "lt",		.roff = "<" },
	{ .name = "mdash",	.roff = "\\(em" },
	{ .name = "minus",	.roff = "\\-" },
	{ .name = "ndash",	.roff = "\\(en" },
	{ .name = "nbsp",	.roff = "\\ " },
	{ .name = "num",	.roff = "#" },
	{ .name = "oslash",	.roff = "\\(/o" },
	{ .name = "ouml",	.roff = "\\(:o" },
	{ .name = "percnt",	.roff = "%" },
	{ .name = "quot",	.roff = "\\(dq" },
	{ .name = "rarr",	.roff = "\\(->" },
	{ .name = "rArr",	.roff = "\\(rA" },
	{ .name = "rdquo",	.roff = "\\(rq" },
	{ .name = "reg",	.roff = "\\(rg" },
	{ .name = "rho",	.roff = "\\(*r" },
	{ .name = "rsqb",	.roff = "]" },
	{ .name = "sigma",	.roff = "\\(*s" },
	{ .name = "shy",	.roff = "\\&" },	/* U+00AD */
	{ .name = "tau",	.roff = "\\(*t" },
	{ .name = "tilde",	.roff = "\\[u02DC]" },
	{ .name = "times",	.roff = "\\[tmu]" },
	{ .name = "uuml",	.roff = "\\(:u" },
	{ .name = NULL,		.roff = NULL }
};
250:
1.6 schwarze 251: static void
252: error_msg(struct parse *p, const char *fmt, ...)
253: {
254: va_list ap;
255:
256: fprintf(stderr, "%s:%d:%d: ", p->fname, p->line, p->col);
257: va_start(ap, fmt);
258: vfprintf(stderr, fmt, ap);
259: va_end(ap);
260: fputc('\n', stderr);
261: p->tree->flags |= TREE_FAIL;
262: }
263:
264: static void
265: warn_msg(struct parse *p, const char *fmt, ...)
266: {
267: va_list ap;
268:
269: if (p->warn == 0)
270: return;
271:
272: fprintf(stderr, "%s:%d:%d: warning: ", p->fname, p->line, p->col);
273: va_start(ap, fmt);
274: vfprintf(stderr, fmt, ap);
275: va_end(ap);
276: fputc('\n', stderr);
277: }
278:
1.1 schwarze 279: /*
280: * Process a string of characters.
281: * If a text node is already open, append to it.
282: * Otherwise, create a new one as a child of the current node.
283: */
284: static void
1.5 schwarze 285: xml_char(struct parse *ps, const char *p, int sz)
1.1 schwarze 286: {
287: struct pnode *dat;
288:
1.5 schwarze 289: if (ps->del > 0)
1.1 schwarze 290: return;
291:
1.5 schwarze 292: if (ps->cur == NULL) {
1.6 schwarze 293: error_msg(ps, "discarding text before document: %.*s", sz, p);
1.5 schwarze 294: return;
295: }
296:
1.1 schwarze 297: if (ps->cur->node != NODE_TEXT) {
298: if ((dat = calloc(1, sizeof(*dat))) == NULL) {
299: perror(NULL);
300: exit(1);
301: }
302: dat->node = NODE_TEXT;
303: dat->parent = ps->cur;
304: TAILQ_INIT(&dat->childq);
305: TAILQ_INIT(&dat->attrq);
306: TAILQ_INSERT_TAIL(&ps->cur->childq, dat, child);
307: ps->cur = dat;
308: }
309:
1.5 schwarze 310: if (ps->tree->flags & TREE_CLOSED &&
1.6 schwarze 311: ps->cur->parent == ps->tree->root)
312: warn_msg(ps, "text after end of document: %.*s", sz, p);
1.5 schwarze 313:
1.1 schwarze 314: /* Append to the current text node. */
315:
316: assert(sz >= 0);
317: ps->cur->b = realloc(ps->cur->b, ps->cur->bsz + sz + 1);
318: if (ps->cur->b == NULL) {
319: perror(NULL);
320: exit(1);
321: }
322: memcpy(ps->cur->b + ps->cur->bsz, p, sz);
323: ps->cur->bsz += sz;
324: ps->cur->b[ps->cur->bsz] = '\0';
325: ps->cur->real = ps->cur->b;
326: }
327:
328: static void
329: pnode_trim(struct pnode *pn)
330: {
331: assert(pn->node == NODE_TEXT);
332: for (; pn->bsz > 0; pn->b[--pn->bsz] = '\0')
333: if (isspace((unsigned char)pn->b[pn->bsz - 1]) == 0)
334: break;
335: }
336:
1.9 schwarze 337: static void
338: xml_entity(struct parse *p, const char *name)
339: {
340: const struct entity *entity;
341: struct pnode *dat;
342:
343: if (p->del > 0)
344: return;
345:
346: if (p->cur == NULL) {
347: error_msg(p, "discarding entity before document: &%s;", name);
348: return;
349: }
350:
351: /* Close out the text node, if there is one. */
352: if (p->cur->node == NODE_TEXT) {
353: pnode_trim(p->cur);
354: p->cur = p->cur->parent;
355: }
356:
357: if (p->tree->flags & TREE_CLOSED && p->cur == p->tree->root)
358: warn_msg(p, "entity after end of document: &%s;", name);
359:
360: for (entity = entities; entity->name != NULL; entity++)
361: if (strcmp(name, entity->name) == 0)
362: break;
363:
364: if (entity->roff == NULL) {
365: error_msg(p, "unknown entity &%s;", name);
366: return;
367: }
368:
369: /* Create, append, and close out an entity node. */
370: if ((dat = calloc(1, sizeof(*dat))) == NULL ||
371: (dat->b = dat->real = strdup(entity->roff)) == NULL) {
372: perror(NULL);
373: exit(1);
374: }
375: dat->node = NODE_ESCAPE;
376: dat->bsz = strlen(dat->b);
377: dat->parent = p->cur;
378: TAILQ_INIT(&dat->childq);
379: TAILQ_INIT(&dat->attrq);
380: TAILQ_INSERT_TAIL(&p->cur->childq, dat, child);
381: }
382:
1.1 schwarze 383: /*
384: * Begin an element.
385: */
386: static void
1.5 schwarze 387: xml_elem_start(struct parse *ps, const char *name)
1.1 schwarze 388: {
1.5 schwarze 389: const struct element *elem;
390: struct pnode *dat;
1.1 schwarze 391:
1.5 schwarze 392: if (*name == '!' || *name == '?')
1.1 schwarze 393: return;
394:
1.4 schwarze 395: /*
396: * An ancestor is excluded from the tree;
397: * keep track of the number of levels excluded.
398: */
399: if (ps->del > 0) {
400: ps->del++;
401: return;
402: }
403:
1.1 schwarze 404: /* Close out the text node, if there is one. */
405: if (ps->cur != NULL && ps->cur->node == NODE_TEXT) {
406: pnode_trim(ps->cur);
407: ps->cur = ps->cur->parent;
408: }
409:
410: for (elem = elements; elem->name != NULL; elem++)
411: if (strcmp(elem->name, name) == 0)
412: break;
413:
1.6 schwarze 414: if (elem->name == NULL)
415: error_msg(ps, "unknown element <%s>", name);
416:
1.5 schwarze 417: ps->ncur = elem->node;
1.1 schwarze 418:
1.5 schwarze 419: switch (ps->ncur) {
1.4 schwarze 420: case NODE_DELETE_WARN:
1.6 schwarze 421: warn_msg(ps, "skipping element <%s>", name);
1.2 schwarze 422: /* FALLTHROUGH */
1.4 schwarze 423: case NODE_DELETE:
424: ps->del = 1;
425: /* FALLTHROUGH */
1.2 schwarze 426: case NODE_IGNORE:
427: return;
428: case NODE_INLINEEQUATION:
1.1 schwarze 429: ps->tree->flags |= TREE_EQN;
1.2 schwarze 430: break;
431: default:
432: break;
433: }
1.1 schwarze 434:
1.6 schwarze 435: if (ps->tree->flags & TREE_CLOSED && ps->cur->parent == NULL)
436: warn_msg(ps, "element after end of document: <%s>", name);
1.5 schwarze 437:
1.1 schwarze 438: if ((dat = calloc(1, sizeof(*dat))) == NULL) {
439: perror(NULL);
440: exit(1);
441: }
442: dat->node = elem->node;
443: dat->parent = ps->cur;
444: TAILQ_INIT(&dat->childq);
445: TAILQ_INIT(&dat->attrq);
446:
447: if (ps->cur != NULL)
448: TAILQ_INSERT_TAIL(&ps->cur->childq, dat, child);
449:
450: ps->cur = dat;
451: if (ps->tree->root == NULL)
452: ps->tree->root = dat;
1.5 schwarze 453: }
454:
455: static void
456: xml_attrkey(struct parse *ps, const char *name)
457: {
458: struct pattr *attr;
459: enum attrkey key;
1.1 schwarze 460:
1.5 schwarze 461: if (ps->del > 0 || *name == '\0')
462: return;
463: if ((key = attrkey_parse(name)) == ATTRKEY__MAX) {
464: ps->attr = 0;
465: return;
466: }
467: if ((attr = calloc(1, sizeof(*attr))) == NULL) {
468: perror(NULL);
469: exit(1);
470: }
471: attr->key = key;
472: attr->val = ATTRVAL__MAX;
473: attr->rawval = NULL;
474: TAILQ_INSERT_TAIL(&ps->cur->attrq, attr, child);
475: ps->attr = 1;
476: }
477:
478: static void
479: xml_attrval(struct parse *ps, const char *name)
480: {
481: struct pattr *attr;
482:
483: if (ps->del > 0 || ps->attr == 0)
484: return;
485: if ((attr = TAILQ_LAST(&ps->cur->attrq, pattrq)) == NULL)
486: return;
487: if ((attr->val = attrval_parse(name)) == ATTRVAL__MAX &&
488: (attr->rawval = strdup(name)) == NULL) {
489: perror(NULL);
490: exit(1);
1.1 schwarze 491: }
492: }
493:
494: /*
495: * Roll up the parse tree.
496: * If we're at a text node, roll that one up first.
497: */
498: static void
1.5 schwarze 499: xml_elem_end(struct parse *ps, const char *name)
1.1 schwarze 500: {
1.5 schwarze 501: const struct element *elem;
502: enum nodeid node;
1.1 schwarze 503:
1.4 schwarze 504: /*
505: * An ancestor is excluded from the tree;
506: * keep track of the number of levels excluded.
507: */
508: if (ps->del > 1) {
509: ps->del--;
510: return;
511: }
512:
1.1 schwarze 513: /* Close out the text node, if there is one. */
1.5 schwarze 514: if (ps->del == 0 && ps->cur != NULL && ps->cur->node == NODE_TEXT) {
1.1 schwarze 515: pnode_trim(ps->cur);
516: ps->cur = ps->cur->parent;
517: }
1.2 schwarze 518:
1.5 schwarze 519: if (name != NULL) {
520: for (elem = elements; elem->name != NULL; elem++)
521: if (strcmp(elem->name, name) == 0)
522: break;
523: node = elem->node;
524: } else
525: node = ps->ncur;
1.2 schwarze 526:
1.5 schwarze 527: switch (node) {
1.4 schwarze 528: case NODE_DELETE_WARN:
529: case NODE_DELETE:
1.5 schwarze 530: if (ps->del > 0)
531: ps->del--;
1.4 schwarze 532: break;
1.2 schwarze 533: case NODE_IGNORE:
534: break;
535: default:
1.5 schwarze 536: if (ps->cur == NULL || node != ps->cur->node) {
1.6 schwarze 537: warn_msg(ps, "element not open: </%s>", name);
1.5 schwarze 538: break;
539: }
540:
541: /*
542: * Refrain from actually closing the document element.
543: * If no more content follows, no harm is done, but if
544: * some content still follows, simply processing it is
545: * obviously better than discarding it or crashing.
546: */
547:
548: if (ps->cur->parent == NULL)
549: ps->tree->flags |= TREE_CLOSED;
550: else
551: ps->cur = ps->cur->parent;
1.4 schwarze 552: break;
1.2 schwarze 553: }
1.4 schwarze 554: assert(ps->del == 0);
1.1 schwarze 555: }
556:
557: struct parse *
558: parse_alloc(int warn)
559: {
560: struct parse *p;
561:
562: if ((p = calloc(1, sizeof(*p))) == NULL)
563: return NULL;
564:
565: if ((p->tree = calloc(1, sizeof(*p->tree))) == NULL) {
566: free(p);
567: return NULL;
568: }
569: p->warn = warn;
570: return p;
571: }
572:
573: void
574: parse_free(struct parse *p)
575: {
576: if (p == NULL)
577: return;
578: if (p->tree != NULL) {
579: pnode_unlink(p->tree->root);
580: free(p->tree);
581: }
582: free(p);
583: }
584:
1.5 schwarze 585: /*
586: * Advance the pend pointer to the next character in the charset.
587: * If the charset starts with a space, it stands for any whitespace.
588: * Update the new input file position, used for messages.
589: * Do not overrun the buffer b of length rlen.
590: * When reaching the end, NUL-terminate the buffer and return 1;
591: * otherwise, return 0.
592: */
593: static int
594: advance(struct parse *p, char *b, size_t rlen, size_t *pend,
595: const char *charset)
596: {
597: int space;
598:
599: if (*charset == ' ') {
600: space = 1;
601: charset++;
602: } else
603: space = 0;
604:
605: p->nline = p->line;
606: p->ncol = p->col;
607: while (*pend < rlen) {
608: if (b[*pend] == '\n') {
609: p->nline++;
610: p->ncol = 1;
611: } else
612: p->ncol++;
613: if (space && isspace((unsigned char)b[*pend]))
614: break;
615: if (strchr(charset, b[*pend]) != NULL)
616: break;
617: ++*pend;
618: }
619: if (*pend == rlen) {
620: b[rlen] = '\0';
621: return 1;
622: } else
623: return 0;
624: }
625:
1.1 schwarze 626: struct ptree *
627: parse_file(struct parse *p, int fd, const char *fname)
628: {
629: char b[4096];
1.8 schwarze 630: char *cp;
1.5 schwarze 631: ssize_t rsz; /* Return value from read(2). */
632: size_t rlen; /* Number of bytes in b[]. */
633: size_t poff; /* Parse offset in b[]. */
634: size_t pend; /* Offset of the end of the current word. */
635: int in_tag, in_arg, in_quotes, elem_end;
1.1 schwarze 636:
637: p->fname = fname;
1.5 schwarze 638: p->nline = 1;
639: p->ncol = 1;
640: rlen = 0;
641: in_tag = in_arg = in_quotes = 0;
642:
643: /*
644: * Read loop.
645: *
646: * We have to enter the read loop once more even on EOF
647: * because the previous token may have been incomplete,
648: * such that it asked for more input.
649: * Once rsz is 0, incomplete tokens will no longer ask
650: * for more input but instead use whatever there is,
651: * and then exit the read loop.
652: * The minus one on the size limit for read(2) is needed
653: * such that advance() can set b[rlen] to NUL when needed.
654: */
655:
656: while ((rsz = read(fd, b + rlen, sizeof(b) - rlen - 1)) >= 0) {
657: if ((rlen += rsz) == 0)
658: break;
659:
660: /* Token loop. */
661:
662: pend = 0;
663: for (;;) {
664:
665: /* Proceed to the next token, skipping whitespace. */
666:
667: p->line = p->nline;
668: p->col = p->ncol;
669: if ((poff = pend) == rlen)
670: break;
671: if (isspace((unsigned char)b[pend])) {
672: if (b[pend++] == '\n') {
673: p->nline++;
674: p->ncol = 1;
675: } else
676: p->ncol++;
677: continue;
678: }
679:
680: /*
1.9 schwarze 681: * The following four cases (in_arg, in_tag, and
682: * starting an entity or a tag) all parse a word
683: * or quoted string. If that extends beyond the
1.5 schwarze 684: * read buffer and the last read(2) still got
685: * data, they all break out of the token loop
686: * to request more data from the read loop.
687: *
1.9 schwarze 688: * Also, three of them detect self-closing tags,
1.5 schwarze 689: * those ending with "/>", setting the flag
690: * elem_end and calling xml_elem_end() at the
691: * very end, after handling the attribute value,
692: * attribute name, or tag name, respectively.
693: */
694:
695: /* Parse an attribute value. */
696:
697: if (in_arg) {
1.10 schwarze 698: if (in_quotes == 0 &&
699: (b[pend] == '\'' || b[pend] == '"')) {
700: in_quotes = b[pend] == '"' ? 2 : 1;
1.5 schwarze 701: p->ncol++;
702: pend++;
703: continue;
704: }
705: if (advance(p, b, rlen, &pend,
1.10 schwarze 706: in_quotes == 2 ? "\"" :
707: in_quotes == 1 ? "'" : " >") && rsz > 0)
1.5 schwarze 708: break;
709: in_arg = in_quotes = elem_end = 0;
710: if (b[pend] == '>') {
711: in_tag = 0;
712: if (pend > 0 && b[pend - 1] == '/') {
713: b[pend - 1] = '\0';
714: elem_end = 1;
715: }
716: }
717: b[pend] = '\0';
718: if (pend < rlen)
719: pend++;
720: xml_attrval(p, b + poff);
721: if (elem_end)
722: xml_elem_end(p, NULL);
723:
724: /* Look for an attribute name. */
725:
726: } else if (in_tag) {
727: if (advance(p, b, rlen, &pend, " =>") &&
728: rsz > 0)
729: break;
730: elem_end = 0;
731: switch (b[pend]) {
732: case '>':
733: in_tag = 0;
734: if (pend > 0 && b[pend - 1] == '/') {
735: b[pend - 1] = '\0';
736: elem_end = 1;
737: }
738: break;
739: case '=':
740: in_arg = 1;
741: break;
742: default:
743: break;
744: }
745: b[pend] = '\0';
746: if (pend < rlen)
747: pend++;
748: xml_attrkey(p, b + poff);
749: if (elem_end)
750: xml_elem_end(p, NULL);
751:
752: /* Begin an opening or closing tag. */
753:
754: } else if (b[poff] == '<') {
755: if (advance(p, b, rlen, &pend, " >") &&
756: rsz > 0)
757: break;
1.8 schwarze 758: if (pend > poff + 3 &&
759: strncmp(b + poff, "<!--", 4) == 0) {
760:
761: /* Skip a comment. */
762:
763: cp = strstr(b + pend - 2, "-->");
764: if (cp == NULL) {
765: if (rsz > 0) {
766: pend = rlen;
767: break;
768: }
769: cp = b + rlen;
770: } else
771: cp += 3;
772: while (b + pend < cp) {
773: if (b[++pend] == '\n') {
774: p->nline++;
775: p->ncol = 1;
776: } else
777: p->ncol++;
778: }
779: continue;
780: }
1.5 schwarze 781: elem_end = 0;
782: if (b[pend] != '>')
783: in_tag = 1;
784: else if (pend > 0 && b[pend - 1] == '/') {
785: b[pend - 1] = '\0';
786: elem_end = 1;
787: }
788: b[pend] = '\0';
789: if (pend < rlen)
790: pend++;
791: if (b[++poff] == '/') {
792: elem_end = 1;
793: poff++;
794: } else
795: xml_elem_start(p, b + poff);
796: if (elem_end)
797: xml_elem_end(p, b + poff);
798:
1.9 schwarze 799: /* Process an entity. */
800:
801: } else if (b[poff] == '&') {
802: if (advance(p, b, rlen, &pend, ";") &&
803: rsz > 0)
804: break;
805: b[pend] = '\0';
806: if (pend < rlen)
807: pend++;
808: xml_entity(p, b + poff + 1);
809:
810: /* Process text up to the next tag or entity. */
1.5 schwarze 811:
812: } else {
1.9 schwarze 813: if (advance(p, b, rlen, &pend, "<&") == 0)
1.5 schwarze 814: p->ncol--;
815: xml_char(p, b + poff, pend - poff);
816: }
1.1 schwarze 817: }
1.5 schwarze 818:
819: /* Buffer exhausted; shift left and re-fill. */
820:
821: assert(poff > 0);
822: memmove(b, b + poff, rlen - poff);
823: rlen -= poff;
824: }
825: if (rsz < 0) {
826: perror(fname);
827: p->tree->flags |= TREE_FAIL;
828: }
829: if (p->cur != NULL && p->cur->node == NODE_TEXT) {
830: pnode_trim(p->cur);
831: p->cur = p->cur->parent;
832: }
1.6 schwarze 833: if ((p->tree->flags & TREE_CLOSED) == 0)
834: warn_msg(p, "document not closed");
1.1 schwarze 835: return p->tree;
836: }
CVSweb