Annotation of docbook2mdoc/parse.c, Revision 1.12
1.12 ! schwarze 1: /* $Id: parse.c,v 1.11 2019/04/03 11:46:09 schwarze Exp $ */
1.1 schwarze 2: /*
3: * Copyright (c) 2014 Kristaps Dzonsons <kristaps@bsd.lv>
4: * Copyright (c) 2019 Ingo Schwarze <schwarze@openbsd.org>
5: *
6: * Permission to use, copy, modify, and distribute this software for any
7: * purpose with or without fee is hereby granted, provided that the above
8: * copyright notice and this permission notice appear in all copies.
9: *
10: * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
11: * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12: * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
13: * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14: * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15: * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16: * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17: */
18: #include <assert.h>
19: #include <ctype.h>
1.6 schwarze 20: #include <stdarg.h>
1.1 schwarze 21: #include <stdio.h>
1.5 schwarze 22: #include <stdlib.h>
1.1 schwarze 23: #include <string.h>
24: #include <unistd.h>
25:
26: #include "node.h"
27: #include "parse.h"
28:
29: /*
30: * The implementation of the DocBook parser.
31: */
32:
33: /*
34: * Global parse state.
35: * Keep this as simple and small as possible.
36: */
37: struct parse {
38: const char *fname; /* Name of the input file. */
39: struct ptree *tree; /* Complete parse result. */
40: struct pnode *cur; /* Current node in the tree. */
1.5 schwarze 41: enum nodeid ncur; /* Type of the current node. */
42: int line; /* Line number in the input file. */
43: int col; /* Column number in the input file. */
44: int nline; /* Line number of next token. */
45: int ncol; /* Column number of next token. */
1.4 schwarze 46: int del; /* Levels of nested nodes being deleted. */
1.5 schwarze 47: int attr; /* The most recent attribute is valid. */
1.1 schwarze 48: int warn;
49: };
50:
51: struct element {
52: const char *name; /* DocBook element name. */
53: enum nodeid node; /* Node type to generate. */
54: };
55:
56: static const struct element elements[] = {
1.3 schwarze 57: { "acronym", NODE_IGNORE },
1.1 schwarze 58: { "affiliation", NODE_AFFILIATION },
1.4 schwarze 59: { "anchor", NODE_DELETE },
1.1 schwarze 60: { "application", NODE_APPLICATION },
61: { "arg", NODE_ARG },
62: { "author", NODE_AUTHOR },
63: { "authorgroup", NODE_AUTHORGROUP },
64: { "blockquote", NODE_BLOCKQUOTE },
65: { "book", NODE_BOOK },
66: { "bookinfo", NODE_BOOKINFO },
67: { "caution", NODE_CAUTION },
68: { "chapter", NODE_SECTION },
69: { "citerefentry", NODE_CITEREFENTRY },
70: { "citetitle", NODE_CITETITLE },
71: { "cmdsynopsis", NODE_CMDSYNOPSIS },
72: { "code", NODE_CODE },
73: { "colspec", NODE_COLSPEC },
74: { "command", NODE_COMMAND },
75: { "constant", NODE_CONSTANT },
1.7 schwarze 76: { "contrib", NODE_CONTRIB },
1.1 schwarze 77: { "copyright", NODE_COPYRIGHT },
78: { "date", NODE_DATE },
79: { "editor", NODE_EDITOR },
80: { "email", NODE_EMAIL },
81: { "emphasis", NODE_EMPHASIS },
82: { "entry", NODE_ENTRY },
83: { "envar", NODE_ENVAR },
84: { "fieldsynopsis", NODE_FIELDSYNOPSIS },
85: { "filename", NODE_FILENAME },
1.7 schwarze 86: { "firstname", NODE_PERSONNAME },
1.1 schwarze 87: { "firstterm", NODE_FIRSTTERM },
88: { "footnote", NODE_FOOTNOTE },
89: { "funcdef", NODE_FUNCDEF },
90: { "funcprototype", NODE_FUNCPROTOTYPE },
91: { "funcsynopsis", NODE_FUNCSYNOPSIS },
92: { "funcsynopsisinfo", NODE_FUNCSYNOPSISINFO },
93: { "function", NODE_FUNCTION },
94: { "glossterm", NODE_GLOSSTERM },
95: { "group", NODE_GROUP },
96: { "holder", NODE_HOLDER },
97: { "index", NODE_INDEX },
1.4 schwarze 98: { "indexterm", NODE_DELETE },
1.1 schwarze 99: { "info", NODE_INFO },
100: { "informalequation", NODE_INFORMALEQUATION },
1.11 schwarze 101: { "informaltable", NODE_TABLE },
1.1 schwarze 102: { "inlineequation", NODE_INLINEEQUATION },
103: { "itemizedlist", NODE_ITEMIZEDLIST },
104: { "keysym", NODE_KEYSYM },
105: { "legalnotice", NODE_LEGALNOTICE },
106: { "link", NODE_LINK },
107: { "listitem", NODE_LISTITEM },
108: { "literal", NODE_LITERAL },
109: { "literallayout", NODE_LITERALLAYOUT },
110: { "manvolnum", NODE_MANVOLNUM },
111: { "member", NODE_MEMBER },
112: { "mml:math", NODE_MML_MATH },
113: { "mml:mfenced", NODE_MML_MFENCED },
114: { "mml:mfrac", NODE_MML_MFRAC },
115: { "mml:mi", NODE_MML_MI },
116: { "mml:mn", NODE_MML_MN },
117: { "mml:mo", NODE_MML_MO },
118: { "mml:mrow", NODE_MML_MROW },
119: { "mml:msub", NODE_MML_MSUB },
120: { "mml:msup", NODE_MML_MSUP },
121: { "modifier", NODE_MODIFIER },
122: { "note", NODE_NOTE },
123: { "option", NODE_OPTION },
124: { "orderedlist", NODE_ORDEREDLIST },
125: { "orgname", NODE_ORGNAME },
1.7 schwarze 126: { "othername", NODE_PERSONNAME },
1.1 schwarze 127: { "para", NODE_PARA },
128: { "paramdef", NODE_PARAMDEF },
129: { "parameter", NODE_PARAMETER },
130: { "part", NODE_SECTION },
131: { "personname", NODE_PERSONNAME },
1.3 schwarze 132: { "phrase", NODE_IGNORE },
1.1 schwarze 133: { "preface", NODE_PREFACE },
1.4 schwarze 134: { "primary", NODE_DELETE },
1.1 schwarze 135: { "programlisting", NODE_PROGRAMLISTING },
136: { "prompt", NODE_PROMPT },
137: { "quote", NODE_QUOTE },
138: { "refclass", NODE_REFCLASS },
139: { "refdescriptor", NODE_REFDESCRIPTOR },
140: { "refentry", NODE_REFENTRY },
141: { "refentryinfo", NODE_REFENTRYINFO },
142: { "refentrytitle", NODE_REFENTRYTITLE },
143: { "refmeta", NODE_REFMETA },
144: { "refmetainfo", NODE_REFMETAINFO },
145: { "refmiscinfo", NODE_REFMISCINFO },
146: { "refname", NODE_REFNAME },
147: { "refnamediv", NODE_REFNAMEDIV },
148: { "refpurpose", NODE_REFPURPOSE },
149: { "refsect1", NODE_SECTION },
150: { "refsect2", NODE_SECTION },
151: { "refsect3", NODE_SECTION },
152: { "refsection", NODE_SECTION },
153: { "refsynopsisdiv", NODE_REFSYNOPSISDIV },
154: { "releaseinfo", NODE_RELEASEINFO },
155: { "replaceable", NODE_REPLACEABLE },
156: { "row", NODE_ROW },
157: { "sbr", NODE_SBR },
158: { "screen", NODE_SCREEN },
1.4 schwarze 159: { "secondary", NODE_DELETE },
1.1 schwarze 160: { "sect1", NODE_SECTION },
161: { "sect2", NODE_SECTION },
162: { "section", NODE_SECTION },
163: { "sgmltag", NODE_SGMLTAG },
164: { "simplelist", NODE_SIMPLELIST },
165: { "spanspec", NODE_SPANSPEC },
166: { "structname", NODE_STRUCTNAME },
167: { "subtitle", NODE_SUBTITLE },
1.7 schwarze 168: { "surname", NODE_PERSONNAME },
1.12 ! schwarze 169: { "symbol", NODE_CONSTANT },
1.1 schwarze 170: { "synopsis", NODE_SYNOPSIS },
171: { "table", NODE_TABLE },
172: { "tbody", NODE_TBODY },
173: { "term", NODE_TERM },
174: { "tfoot", NODE_TFOOT },
175: { "tgroup", NODE_TGROUP },
176: { "thead", NODE_THEAD },
177: { "tip", NODE_TIP },
178: { "title", NODE_TITLE },
1.3 schwarze 179: { "trademark", NODE_IGNORE },
1.1 schwarze 180: { "type", NODE_TYPE },
181: { "ulink", NODE_ULINK },
182: { "userinput", NODE_USERINPUT },
183: { "variablelist", NODE_VARIABLELIST },
184: { "varlistentry", NODE_VARLISTENTRY },
185: { "varname", NODE_VARNAME },
186: { "warning", NODE_WARNING },
187: { "wordasword", NODE_WORDASWORD },
1.4 schwarze 188: { "xi:include", NODE_DELETE_WARN },
1.1 schwarze 189: { "year", NODE_YEAR },
1.5 schwarze 190: { NULL, NODE_IGNORE }
1.1 schwarze 191: };
192:
/*
 * One row of the entity lookup table:
 * maps an XML entity name to the string stored in the escape node.
 */
struct entity {
	const char	*name;	/* Entity name, without '&' and ';'. */
	const char	*roff;	/* Replacement text copied into the node. */
};
197:
198: /*
199: * XML character entity references found in the wild.
200: * Those that don't have an exact mandoc_char(7) representation
201: * are approximated, and the desired codepoint is given as a comment.
202: * Encoding them as \\[u...] would leave -Tascii out in the cold.
203: */
204: static const struct entity entities[] = {
205: { "alpha", "\\(*a" },
206: { "amp", "&" },
207: { "apos", "'" },
208: { "auml", "\\(:a" },
209: { "beta", "\\(*b" },
210: { "circ", "^" }, /* U+02C6 */
211: { "copy", "\\(co" },
212: { "dagger", "\\(dg" },
213: { "Delta", "\\(*D" },
214: { "eacute", "\\('e" },
215: { "emsp", "\\ " }, /* U+2003 */
216: { "gt", ">" },
217: { "hairsp", "\\^" },
218: { "kappa", "\\(*k" },
219: { "larr", "\\(<-" },
220: { "ldquo", "\\(lq" },
221: { "le", "\\(<=" },
222: { "lowbar", "_" },
223: { "lsqb", "[" },
224: { "lt", "<" },
225: { "mdash", "\\(em" },
226: { "minus", "\\-" },
227: { "ndash", "\\(en" },
228: { "nbsp", "\\ " },
229: { "num", "#" },
230: { "oslash", "\\(/o" },
231: { "ouml", "\\(:o" },
232: { "percnt", "%" },
233: { "quot", "\\(dq" },
234: { "rarr", "\\(->" },
235: { "rArr", "\\(rA" },
236: { "rdquo", "\\(rq" },
237: { "reg", "\\(rg" },
238: { "rho", "\\(*r" },
239: { "rsqb", "]" },
240: { "sigma", "\\(*s" },
241: { "shy", "\\&" }, /* U+00AD */
242: { "tau", "\\(*t" },
243: { "tilde", "\\[u02DC]" },
244: { "times", "\\[tmu]" },
245: { "uuml", "\\(:u" },
246: { NULL, NULL }
247: };
248:
1.6 schwarze 249: static void
250: error_msg(struct parse *p, const char *fmt, ...)
251: {
252: va_list ap;
253:
254: fprintf(stderr, "%s:%d:%d: ", p->fname, p->line, p->col);
255: va_start(ap, fmt);
256: vfprintf(stderr, fmt, ap);
257: va_end(ap);
258: fputc('\n', stderr);
259: p->tree->flags |= TREE_FAIL;
260: }
261:
262: static void
263: warn_msg(struct parse *p, const char *fmt, ...)
264: {
265: va_list ap;
266:
267: if (p->warn == 0)
268: return;
269:
270: fprintf(stderr, "%s:%d:%d: warning: ", p->fname, p->line, p->col);
271: va_start(ap, fmt);
272: vfprintf(stderr, fmt, ap);
273: va_end(ap);
274: fputc('\n', stderr);
275: }
276:
1.1 schwarze 277: /*
278: * Process a string of characters.
279: * If a text node is already open, append to it.
280: * Otherwise, create a new one as a child of the current node.
281: */
282: static void
1.5 schwarze 283: xml_char(struct parse *ps, const char *p, int sz)
1.1 schwarze 284: {
285: struct pnode *dat;
286:
1.5 schwarze 287: if (ps->del > 0)
1.1 schwarze 288: return;
289:
1.5 schwarze 290: if (ps->cur == NULL) {
1.6 schwarze 291: error_msg(ps, "discarding text before document: %.*s", sz, p);
1.5 schwarze 292: return;
293: }
294:
1.1 schwarze 295: if (ps->cur->node != NODE_TEXT) {
296: if ((dat = calloc(1, sizeof(*dat))) == NULL) {
297: perror(NULL);
298: exit(1);
299: }
300: dat->node = NODE_TEXT;
301: dat->parent = ps->cur;
302: TAILQ_INIT(&dat->childq);
303: TAILQ_INIT(&dat->attrq);
304: TAILQ_INSERT_TAIL(&ps->cur->childq, dat, child);
305: ps->cur = dat;
306: }
307:
1.5 schwarze 308: if (ps->tree->flags & TREE_CLOSED &&
1.6 schwarze 309: ps->cur->parent == ps->tree->root)
310: warn_msg(ps, "text after end of document: %.*s", sz, p);
1.5 schwarze 311:
1.1 schwarze 312: /* Append to the current text node. */
313:
314: assert(sz >= 0);
315: ps->cur->b = realloc(ps->cur->b, ps->cur->bsz + sz + 1);
316: if (ps->cur->b == NULL) {
317: perror(NULL);
318: exit(1);
319: }
320: memcpy(ps->cur->b + ps->cur->bsz, p, sz);
321: ps->cur->bsz += sz;
322: ps->cur->b[ps->cur->bsz] = '\0';
323: ps->cur->real = ps->cur->b;
324: }
325:
326: static void
327: pnode_trim(struct pnode *pn)
328: {
329: assert(pn->node == NODE_TEXT);
330: for (; pn->bsz > 0; pn->b[--pn->bsz] = '\0')
331: if (isspace((unsigned char)pn->b[pn->bsz - 1]) == 0)
332: break;
333: }
334:
1.9 schwarze 335: static void
336: xml_entity(struct parse *p, const char *name)
337: {
338: const struct entity *entity;
339: struct pnode *dat;
340:
341: if (p->del > 0)
342: return;
343:
344: if (p->cur == NULL) {
345: error_msg(p, "discarding entity before document: &%s;", name);
346: return;
347: }
348:
349: /* Close out the text node, if there is one. */
350: if (p->cur->node == NODE_TEXT) {
351: pnode_trim(p->cur);
352: p->cur = p->cur->parent;
353: }
354:
355: if (p->tree->flags & TREE_CLOSED && p->cur == p->tree->root)
356: warn_msg(p, "entity after end of document: &%s;", name);
357:
358: for (entity = entities; entity->name != NULL; entity++)
359: if (strcmp(name, entity->name) == 0)
360: break;
361:
362: if (entity->roff == NULL) {
363: error_msg(p, "unknown entity &%s;", name);
364: return;
365: }
366:
367: /* Create, append, and close out an entity node. */
368: if ((dat = calloc(1, sizeof(*dat))) == NULL ||
369: (dat->b = dat->real = strdup(entity->roff)) == NULL) {
370: perror(NULL);
371: exit(1);
372: }
373: dat->node = NODE_ESCAPE;
374: dat->bsz = strlen(dat->b);
375: dat->parent = p->cur;
376: TAILQ_INIT(&dat->childq);
377: TAILQ_INIT(&dat->attrq);
378: TAILQ_INSERT_TAIL(&p->cur->childq, dat, child);
379: }
380:
1.1 schwarze 381: /*
382: * Begin an element.
383: */
384: static void
1.5 schwarze 385: xml_elem_start(struct parse *ps, const char *name)
1.1 schwarze 386: {
1.5 schwarze 387: const struct element *elem;
388: struct pnode *dat;
1.1 schwarze 389:
1.5 schwarze 390: if (*name == '!' || *name == '?')
1.1 schwarze 391: return;
392:
1.4 schwarze 393: /*
394: * An ancestor is excluded from the tree;
395: * keep track of the number of levels excluded.
396: */
397: if (ps->del > 0) {
398: ps->del++;
399: return;
400: }
401:
1.1 schwarze 402: /* Close out the text node, if there is one. */
403: if (ps->cur != NULL && ps->cur->node == NODE_TEXT) {
404: pnode_trim(ps->cur);
405: ps->cur = ps->cur->parent;
406: }
407:
408: for (elem = elements; elem->name != NULL; elem++)
409: if (strcmp(elem->name, name) == 0)
410: break;
411:
1.6 schwarze 412: if (elem->name == NULL)
413: error_msg(ps, "unknown element <%s>", name);
414:
1.5 schwarze 415: ps->ncur = elem->node;
1.1 schwarze 416:
1.5 schwarze 417: switch (ps->ncur) {
1.4 schwarze 418: case NODE_DELETE_WARN:
1.6 schwarze 419: warn_msg(ps, "skipping element <%s>", name);
1.2 schwarze 420: /* FALLTHROUGH */
1.4 schwarze 421: case NODE_DELETE:
422: ps->del = 1;
423: /* FALLTHROUGH */
1.2 schwarze 424: case NODE_IGNORE:
425: return;
426: case NODE_INLINEEQUATION:
1.1 schwarze 427: ps->tree->flags |= TREE_EQN;
1.2 schwarze 428: break;
429: default:
430: break;
431: }
1.1 schwarze 432:
1.6 schwarze 433: if (ps->tree->flags & TREE_CLOSED && ps->cur->parent == NULL)
434: warn_msg(ps, "element after end of document: <%s>", name);
1.5 schwarze 435:
1.1 schwarze 436: if ((dat = calloc(1, sizeof(*dat))) == NULL) {
437: perror(NULL);
438: exit(1);
439: }
440: dat->node = elem->node;
441: dat->parent = ps->cur;
442: TAILQ_INIT(&dat->childq);
443: TAILQ_INIT(&dat->attrq);
444:
445: if (ps->cur != NULL)
446: TAILQ_INSERT_TAIL(&ps->cur->childq, dat, child);
447:
448: ps->cur = dat;
449: if (ps->tree->root == NULL)
450: ps->tree->root = dat;
1.5 schwarze 451: }
452:
453: static void
454: xml_attrkey(struct parse *ps, const char *name)
455: {
456: struct pattr *attr;
457: enum attrkey key;
1.1 schwarze 458:
1.5 schwarze 459: if (ps->del > 0 || *name == '\0')
460: return;
461: if ((key = attrkey_parse(name)) == ATTRKEY__MAX) {
462: ps->attr = 0;
463: return;
464: }
465: if ((attr = calloc(1, sizeof(*attr))) == NULL) {
466: perror(NULL);
467: exit(1);
468: }
469: attr->key = key;
470: attr->val = ATTRVAL__MAX;
471: attr->rawval = NULL;
472: TAILQ_INSERT_TAIL(&ps->cur->attrq, attr, child);
473: ps->attr = 1;
474: }
475:
476: static void
477: xml_attrval(struct parse *ps, const char *name)
478: {
479: struct pattr *attr;
480:
481: if (ps->del > 0 || ps->attr == 0)
482: return;
483: if ((attr = TAILQ_LAST(&ps->cur->attrq, pattrq)) == NULL)
484: return;
485: if ((attr->val = attrval_parse(name)) == ATTRVAL__MAX &&
486: (attr->rawval = strdup(name)) == NULL) {
487: perror(NULL);
488: exit(1);
1.1 schwarze 489: }
490: }
491:
492: /*
493: * Roll up the parse tree.
494: * If we're at a text node, roll that one up first.
495: */
496: static void
1.5 schwarze 497: xml_elem_end(struct parse *ps, const char *name)
1.1 schwarze 498: {
1.5 schwarze 499: const struct element *elem;
500: enum nodeid node;
1.1 schwarze 501:
1.4 schwarze 502: /*
503: * An ancestor is excluded from the tree;
504: * keep track of the number of levels excluded.
505: */
506: if (ps->del > 1) {
507: ps->del--;
508: return;
509: }
510:
1.1 schwarze 511: /* Close out the text node, if there is one. */
1.5 schwarze 512: if (ps->del == 0 && ps->cur != NULL && ps->cur->node == NODE_TEXT) {
1.1 schwarze 513: pnode_trim(ps->cur);
514: ps->cur = ps->cur->parent;
515: }
1.2 schwarze 516:
1.5 schwarze 517: if (name != NULL) {
518: for (elem = elements; elem->name != NULL; elem++)
519: if (strcmp(elem->name, name) == 0)
520: break;
521: node = elem->node;
522: } else
523: node = ps->ncur;
1.2 schwarze 524:
1.5 schwarze 525: switch (node) {
1.4 schwarze 526: case NODE_DELETE_WARN:
527: case NODE_DELETE:
1.5 schwarze 528: if (ps->del > 0)
529: ps->del--;
1.4 schwarze 530: break;
1.2 schwarze 531: case NODE_IGNORE:
532: break;
533: default:
1.5 schwarze 534: if (ps->cur == NULL || node != ps->cur->node) {
1.6 schwarze 535: warn_msg(ps, "element not open: </%s>", name);
1.5 schwarze 536: break;
537: }
538:
539: /*
540: * Refrain from actually closing the document element.
541: * If no more content follows, no harm is done, but if
542: * some content still follows, simply processing it is
543: * obviously better than discarding it or crashing.
544: */
545:
546: if (ps->cur->parent == NULL)
547: ps->tree->flags |= TREE_CLOSED;
548: else
549: ps->cur = ps->cur->parent;
1.4 schwarze 550: break;
1.2 schwarze 551: }
1.4 schwarze 552: assert(ps->del == 0);
1.1 schwarze 553: }
554:
555: struct parse *
556: parse_alloc(int warn)
557: {
558: struct parse *p;
559:
560: if ((p = calloc(1, sizeof(*p))) == NULL)
561: return NULL;
562:
563: if ((p->tree = calloc(1, sizeof(*p->tree))) == NULL) {
564: free(p);
565: return NULL;
566: }
567: p->warn = warn;
568: return p;
569: }
570:
571: void
572: parse_free(struct parse *p)
573: {
574: if (p == NULL)
575: return;
576: if (p->tree != NULL) {
577: pnode_unlink(p->tree->root);
578: free(p->tree);
579: }
580: free(p);
581: }
582:
1.5 schwarze 583: /*
584: * Advance the pend pointer to the next character in the charset.
585: * If the charset starts with a space, it stands for any whitespace.
586: * Update the new input file position, used for messages.
587: * Do not overrun the buffer b of length rlen.
588: * When reaching the end, NUL-terminate the buffer and return 1;
589: * otherwise, return 0.
590: */
591: static int
592: advance(struct parse *p, char *b, size_t rlen, size_t *pend,
593: const char *charset)
594: {
595: int space;
596:
597: if (*charset == ' ') {
598: space = 1;
599: charset++;
600: } else
601: space = 0;
602:
603: p->nline = p->line;
604: p->ncol = p->col;
605: while (*pend < rlen) {
606: if (b[*pend] == '\n') {
607: p->nline++;
608: p->ncol = 1;
609: } else
610: p->ncol++;
611: if (space && isspace((unsigned char)b[*pend]))
612: break;
613: if (strchr(charset, b[*pend]) != NULL)
614: break;
615: ++*pend;
616: }
617: if (*pend == rlen) {
618: b[rlen] = '\0';
619: return 1;
620: } else
621: return 0;
622: }
623:
1.1 schwarze 624: struct ptree *
625: parse_file(struct parse *p, int fd, const char *fname)
626: {
627: char b[4096];
1.8 schwarze 628: char *cp;
1.5 schwarze 629: ssize_t rsz; /* Return value from read(2). */
630: size_t rlen; /* Number of bytes in b[]. */
631: size_t poff; /* Parse offset in b[]. */
632: size_t pend; /* Offset of the end of the current word. */
633: int in_tag, in_arg, in_quotes, elem_end;
1.1 schwarze 634:
635: p->fname = fname;
1.5 schwarze 636: p->nline = 1;
637: p->ncol = 1;
638: rlen = 0;
639: in_tag = in_arg = in_quotes = 0;
640:
641: /*
642: * Read loop.
643: *
644: * We have to enter the read loop once more even on EOF
645: * because the previous token may have been incomplete,
646: * such that it asked for more input.
647: * Once rsz is 0, incomplete tokens will no longer ask
648: * for more input but instead use whatever there is,
649: * and then exit the read loop.
650: * The minus one on the size limit for read(2) is needed
651: * such that advance() can set b[rlen] to NUL when needed.
652: */
653:
654: while ((rsz = read(fd, b + rlen, sizeof(b) - rlen - 1)) >= 0) {
655: if ((rlen += rsz) == 0)
656: break;
657:
658: /* Token loop. */
659:
660: pend = 0;
661: for (;;) {
662:
663: /* Proceed to the next token, skipping whitespace. */
664:
665: p->line = p->nline;
666: p->col = p->ncol;
667: if ((poff = pend) == rlen)
668: break;
669: if (isspace((unsigned char)b[pend])) {
670: if (b[pend++] == '\n') {
671: p->nline++;
672: p->ncol = 1;
673: } else
674: p->ncol++;
675: continue;
676: }
677:
678: /*
1.9 schwarze 679: * The following four cases (in_arg, in_tag, and
680: * starting an entity or a tag) all parse a word
681: * or quoted string. If that extends beyond the
1.5 schwarze 682: * read buffer and the last read(2) still got
683: * data, they all break out of the token loop
684: * to request more data from the read loop.
685: *
1.9 schwarze 686: * Also, three of them detect self-closing tags,
1.5 schwarze 687: * those ending with "/>", setting the flag
688: * elem_end and calling xml_elem_end() at the
689: * very end, after handling the attribute value,
690: * attribute name, or tag name, respectively.
691: */
692:
693: /* Parse an attribute value. */
694:
695: if (in_arg) {
1.10 schwarze 696: if (in_quotes == 0 &&
697: (b[pend] == '\'' || b[pend] == '"')) {
698: in_quotes = b[pend] == '"' ? 2 : 1;
1.5 schwarze 699: p->ncol++;
700: pend++;
701: continue;
702: }
703: if (advance(p, b, rlen, &pend,
1.10 schwarze 704: in_quotes == 2 ? "\"" :
705: in_quotes == 1 ? "'" : " >") && rsz > 0)
1.5 schwarze 706: break;
707: in_arg = in_quotes = elem_end = 0;
708: if (b[pend] == '>') {
709: in_tag = 0;
710: if (pend > 0 && b[pend - 1] == '/') {
711: b[pend - 1] = '\0';
712: elem_end = 1;
713: }
714: }
715: b[pend] = '\0';
716: if (pend < rlen)
717: pend++;
718: xml_attrval(p, b + poff);
719: if (elem_end)
720: xml_elem_end(p, NULL);
721:
722: /* Look for an attribute name. */
723:
724: } else if (in_tag) {
725: if (advance(p, b, rlen, &pend, " =>") &&
726: rsz > 0)
727: break;
728: elem_end = 0;
729: switch (b[pend]) {
730: case '>':
731: in_tag = 0;
732: if (pend > 0 && b[pend - 1] == '/') {
733: b[pend - 1] = '\0';
734: elem_end = 1;
735: }
736: break;
737: case '=':
738: in_arg = 1;
739: break;
740: default:
741: break;
742: }
743: b[pend] = '\0';
744: if (pend < rlen)
745: pend++;
746: xml_attrkey(p, b + poff);
747: if (elem_end)
748: xml_elem_end(p, NULL);
749:
750: /* Begin an opening or closing tag. */
751:
752: } else if (b[poff] == '<') {
753: if (advance(p, b, rlen, &pend, " >") &&
754: rsz > 0)
755: break;
1.8 schwarze 756: if (pend > poff + 3 &&
757: strncmp(b + poff, "<!--", 4) == 0) {
758:
759: /* Skip a comment. */
760:
761: cp = strstr(b + pend - 2, "-->");
762: if (cp == NULL) {
763: if (rsz > 0) {
764: pend = rlen;
765: break;
766: }
767: cp = b + rlen;
768: } else
769: cp += 3;
770: while (b + pend < cp) {
771: if (b[++pend] == '\n') {
772: p->nline++;
773: p->ncol = 1;
774: } else
775: p->ncol++;
776: }
777: continue;
778: }
1.5 schwarze 779: elem_end = 0;
780: if (b[pend] != '>')
781: in_tag = 1;
782: else if (pend > 0 && b[pend - 1] == '/') {
783: b[pend - 1] = '\0';
784: elem_end = 1;
785: }
786: b[pend] = '\0';
787: if (pend < rlen)
788: pend++;
789: if (b[++poff] == '/') {
790: elem_end = 1;
791: poff++;
792: } else
793: xml_elem_start(p, b + poff);
794: if (elem_end)
795: xml_elem_end(p, b + poff);
796:
1.9 schwarze 797: /* Process an entity. */
798:
799: } else if (b[poff] == '&') {
800: if (advance(p, b, rlen, &pend, ";") &&
801: rsz > 0)
802: break;
803: b[pend] = '\0';
804: if (pend < rlen)
805: pend++;
806: xml_entity(p, b + poff + 1);
807:
808: /* Process text up to the next tag or entity. */
1.5 schwarze 809:
810: } else {
1.9 schwarze 811: if (advance(p, b, rlen, &pend, "<&") == 0)
1.5 schwarze 812: p->ncol--;
813: xml_char(p, b + poff, pend - poff);
814: }
1.1 schwarze 815: }
1.5 schwarze 816:
817: /* Buffer exhausted; shift left and re-fill. */
818:
819: assert(poff > 0);
820: memmove(b, b + poff, rlen - poff);
821: rlen -= poff;
822: }
823: if (rsz < 0) {
824: perror(fname);
825: p->tree->flags |= TREE_FAIL;
826: }
827: if (p->cur != NULL && p->cur->node == NODE_TEXT) {
828: pnode_trim(p->cur);
829: p->cur = p->cur->parent;
830: }
1.6 schwarze 831: if ((p->tree->flags & TREE_CLOSED) == 0)
832: warn_msg(p, "document not closed");
1.1 schwarze 833: return p->tree;
834: }
CVSweb