Annotation of docbook2mdoc/parse.c, Revision 1.10
1.10 ! schwarze 1: /* $Id: parse.c,v 1.9 2019/04/02 15:53:02 schwarze Exp $ */
1.1 schwarze 2: /*
3: * Copyright (c) 2014 Kristaps Dzonsons <kristaps@bsd.lv>
4: * Copyright (c) 2019 Ingo Schwarze <schwarze@openbsd.org>
5: *
6: * Permission to use, copy, modify, and distribute this software for any
7: * purpose with or without fee is hereby granted, provided that the above
8: * copyright notice and this permission notice appear in all copies.
9: *
10: * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
11: * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12: * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
13: * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14: * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15: * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16: * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17: */
18: #include <assert.h>
19: #include <ctype.h>
1.6 schwarze 20: #include <stdarg.h>
1.1 schwarze 21: #include <stdio.h>
1.5 schwarze 22: #include <stdlib.h>
1.1 schwarze 23: #include <string.h>
24: #include <unistd.h>
25:
26: #include "node.h"
27: #include "parse.h"
28:
29: /*
30: * The implementation of the DocBook parser.
31: */
32:
33: /*
34: * Global parse state.
35: * Keep this as simple and small as possible.
36: */
37: struct parse {
38: const char *fname; /* Name of the input file. */
39: struct ptree *tree; /* Complete parse result. */
40: struct pnode *cur; /* Current node in the tree. */
1.5 schwarze 41: enum nodeid ncur; /* Type of the current node. */
42: int line; /* Line number in the input file. */
43: int col; /* Column number in the input file. */
44: int nline; /* Line number of next token. */
45: int ncol; /* Column number of next token. */
1.4 schwarze 46: int del; /* Levels of nested nodes being deleted. */
1.5 schwarze 47: int attr; /* The most recent attribute is valid. */
1.1 schwarze 48: int warn;
49: };
50:
51: struct element {
52: const char *name; /* DocBook element name. */
53: enum nodeid node; /* Node type to generate. */
54: };
55:
56: static const struct element elements[] = {
1.3 schwarze 57: { "acronym", NODE_IGNORE },
1.1 schwarze 58: { "affiliation", NODE_AFFILIATION },
1.4 schwarze 59: { "anchor", NODE_DELETE },
1.1 schwarze 60: { "application", NODE_APPLICATION },
61: { "arg", NODE_ARG },
62: { "author", NODE_AUTHOR },
63: { "authorgroup", NODE_AUTHORGROUP },
64: { "blockquote", NODE_BLOCKQUOTE },
65: { "book", NODE_BOOK },
66: { "bookinfo", NODE_BOOKINFO },
67: { "caution", NODE_CAUTION },
68: { "chapter", NODE_SECTION },
69: { "citerefentry", NODE_CITEREFENTRY },
70: { "citetitle", NODE_CITETITLE },
71: { "cmdsynopsis", NODE_CMDSYNOPSIS },
72: { "code", NODE_CODE },
73: { "colspec", NODE_COLSPEC },
74: { "command", NODE_COMMAND },
75: { "constant", NODE_CONSTANT },
1.7 schwarze 76: { "contrib", NODE_CONTRIB },
1.1 schwarze 77: { "copyright", NODE_COPYRIGHT },
78: { "date", NODE_DATE },
79: { "editor", NODE_EDITOR },
80: { "email", NODE_EMAIL },
81: { "emphasis", NODE_EMPHASIS },
82: { "entry", NODE_ENTRY },
83: { "envar", NODE_ENVAR },
84: { "fieldsynopsis", NODE_FIELDSYNOPSIS },
85: { "filename", NODE_FILENAME },
1.7 schwarze 86: { "firstname", NODE_PERSONNAME },
1.1 schwarze 87: { "firstterm", NODE_FIRSTTERM },
88: { "footnote", NODE_FOOTNOTE },
89: { "funcdef", NODE_FUNCDEF },
90: { "funcprototype", NODE_FUNCPROTOTYPE },
91: { "funcsynopsis", NODE_FUNCSYNOPSIS },
92: { "funcsynopsisinfo", NODE_FUNCSYNOPSISINFO },
93: { "function", NODE_FUNCTION },
94: { "glossterm", NODE_GLOSSTERM },
95: { "group", NODE_GROUP },
96: { "holder", NODE_HOLDER },
97: { "index", NODE_INDEX },
1.4 schwarze 98: { "indexterm", NODE_DELETE },
1.1 schwarze 99: { "info", NODE_INFO },
100: { "informalequation", NODE_INFORMALEQUATION },
101: { "informaltable", NODE_INFORMALTABLE },
102: { "inlineequation", NODE_INLINEEQUATION },
103: { "itemizedlist", NODE_ITEMIZEDLIST },
104: { "keysym", NODE_KEYSYM },
105: { "legalnotice", NODE_LEGALNOTICE },
106: { "link", NODE_LINK },
107: { "listitem", NODE_LISTITEM },
108: { "literal", NODE_LITERAL },
109: { "literallayout", NODE_LITERALLAYOUT },
110: { "manvolnum", NODE_MANVOLNUM },
111: { "member", NODE_MEMBER },
112: { "mml:math", NODE_MML_MATH },
113: { "mml:mfenced", NODE_MML_MFENCED },
114: { "mml:mfrac", NODE_MML_MFRAC },
115: { "mml:mi", NODE_MML_MI },
116: { "mml:mn", NODE_MML_MN },
117: { "mml:mo", NODE_MML_MO },
118: { "mml:mrow", NODE_MML_MROW },
119: { "mml:msub", NODE_MML_MSUB },
120: { "mml:msup", NODE_MML_MSUP },
121: { "modifier", NODE_MODIFIER },
122: { "note", NODE_NOTE },
123: { "option", NODE_OPTION },
124: { "orderedlist", NODE_ORDEREDLIST },
125: { "orgname", NODE_ORGNAME },
1.7 schwarze 126: { "othername", NODE_PERSONNAME },
1.1 schwarze 127: { "para", NODE_PARA },
128: { "paramdef", NODE_PARAMDEF },
129: { "parameter", NODE_PARAMETER },
130: { "part", NODE_SECTION },
131: { "personname", NODE_PERSONNAME },
1.3 schwarze 132: { "phrase", NODE_IGNORE },
1.1 schwarze 133: { "preface", NODE_PREFACE },
1.4 schwarze 134: { "primary", NODE_DELETE },
1.1 schwarze 135: { "programlisting", NODE_PROGRAMLISTING },
136: { "prompt", NODE_PROMPT },
137: { "quote", NODE_QUOTE },
138: { "refclass", NODE_REFCLASS },
139: { "refdescriptor", NODE_REFDESCRIPTOR },
140: { "refentry", NODE_REFENTRY },
141: { "refentryinfo", NODE_REFENTRYINFO },
142: { "refentrytitle", NODE_REFENTRYTITLE },
143: { "refmeta", NODE_REFMETA },
144: { "refmetainfo", NODE_REFMETAINFO },
145: { "refmiscinfo", NODE_REFMISCINFO },
146: { "refname", NODE_REFNAME },
147: { "refnamediv", NODE_REFNAMEDIV },
148: { "refpurpose", NODE_REFPURPOSE },
149: { "refsect1", NODE_SECTION },
150: { "refsect2", NODE_SECTION },
151: { "refsect3", NODE_SECTION },
152: { "refsection", NODE_SECTION },
153: { "refsynopsisdiv", NODE_REFSYNOPSISDIV },
154: { "releaseinfo", NODE_RELEASEINFO },
155: { "replaceable", NODE_REPLACEABLE },
156: { "row", NODE_ROW },
157: { "sbr", NODE_SBR },
158: { "screen", NODE_SCREEN },
1.4 schwarze 159: { "secondary", NODE_DELETE },
1.1 schwarze 160: { "sect1", NODE_SECTION },
161: { "sect2", NODE_SECTION },
162: { "section", NODE_SECTION },
163: { "sgmltag", NODE_SGMLTAG },
164: { "simplelist", NODE_SIMPLELIST },
165: { "spanspec", NODE_SPANSPEC },
166: { "structname", NODE_STRUCTNAME },
167: { "subtitle", NODE_SUBTITLE },
1.7 schwarze 168: { "surname", NODE_PERSONNAME },
1.1 schwarze 169: { "synopsis", NODE_SYNOPSIS },
170: { "table", NODE_TABLE },
171: { "tbody", NODE_TBODY },
172: { "term", NODE_TERM },
173: { "tfoot", NODE_TFOOT },
174: { "tgroup", NODE_TGROUP },
175: { "thead", NODE_THEAD },
176: { "tip", NODE_TIP },
177: { "title", NODE_TITLE },
1.3 schwarze 178: { "trademark", NODE_IGNORE },
1.1 schwarze 179: { "type", NODE_TYPE },
180: { "ulink", NODE_ULINK },
181: { "userinput", NODE_USERINPUT },
182: { "variablelist", NODE_VARIABLELIST },
183: { "varlistentry", NODE_VARLISTENTRY },
184: { "varname", NODE_VARNAME },
185: { "warning", NODE_WARNING },
186: { "wordasword", NODE_WORDASWORD },
1.4 schwarze 187: { "xi:include", NODE_DELETE_WARN },
1.1 schwarze 188: { "year", NODE_YEAR },
1.5 schwarze 189: { NULL, NODE_IGNORE }
1.1 schwarze 190: };
191:
/* Associates one XML entity name with the roff text that replaces it. */
struct entity {
	const char	*name;	/* Entity name without "&" and ";". */
	const char	*roff;	/* Replacement text copied into the node. */
};
196:
197: /*
198: * XML character entity references found in the wild.
199: * Those that don't have an exact mandoc_char(7) representation
200: * are approximated, and the desired codepoint is given as a comment.
201: * Encoding them as \\[u...] would leave -Tascii out in the cold.
202: */
203: static const struct entity entities[] = {
204: { "alpha", "\\(*a" },
205: { "amp", "&" },
206: { "apos", "'" },
207: { "auml", "\\(:a" },
208: { "beta", "\\(*b" },
209: { "circ", "^" }, /* U+02C6 */
210: { "copy", "\\(co" },
211: { "dagger", "\\(dg" },
212: { "Delta", "\\(*D" },
213: { "eacute", "\\('e" },
214: { "emsp", "\\ " }, /* U+2003 */
215: { "gt", ">" },
216: { "hairsp", "\\^" },
217: { "kappa", "\\(*k" },
218: { "larr", "\\(<-" },
219: { "ldquo", "\\(lq" },
220: { "le", "\\(<=" },
221: { "lowbar", "_" },
222: { "lsqb", "[" },
223: { "lt", "<" },
224: { "mdash", "\\(em" },
225: { "minus", "\\-" },
226: { "ndash", "\\(en" },
227: { "nbsp", "\\ " },
228: { "num", "#" },
229: { "oslash", "\\(/o" },
230: { "ouml", "\\(:o" },
231: { "percnt", "%" },
232: { "quot", "\\(dq" },
233: { "rarr", "\\(->" },
234: { "rArr", "\\(rA" },
235: { "rdquo", "\\(rq" },
236: { "reg", "\\(rg" },
237: { "rho", "\\(*r" },
238: { "rsqb", "]" },
239: { "sigma", "\\(*s" },
240: { "shy", "\\&" }, /* U+00AD */
241: { "tau", "\\(*t" },
242: { "tilde", "\\[u02DC]" },
243: { "times", "\\[tmu]" },
244: { "uuml", "\\(:u" },
245: { NULL, NULL }
246: };
247:
1.6 schwarze 248: static void
249: error_msg(struct parse *p, const char *fmt, ...)
250: {
251: va_list ap;
252:
253: fprintf(stderr, "%s:%d:%d: ", p->fname, p->line, p->col);
254: va_start(ap, fmt);
255: vfprintf(stderr, fmt, ap);
256: va_end(ap);
257: fputc('\n', stderr);
258: p->tree->flags |= TREE_FAIL;
259: }
260:
261: static void
262: warn_msg(struct parse *p, const char *fmt, ...)
263: {
264: va_list ap;
265:
266: if (p->warn == 0)
267: return;
268:
269: fprintf(stderr, "%s:%d:%d: warning: ", p->fname, p->line, p->col);
270: va_start(ap, fmt);
271: vfprintf(stderr, fmt, ap);
272: va_end(ap);
273: fputc('\n', stderr);
274: }
275:
1.1 schwarze 276: /*
277: * Process a string of characters.
278: * If a text node is already open, append to it.
279: * Otherwise, create a new one as a child of the current node.
280: */
281: static void
1.5 schwarze 282: xml_char(struct parse *ps, const char *p, int sz)
1.1 schwarze 283: {
284: struct pnode *dat;
285:
1.5 schwarze 286: if (ps->del > 0)
1.1 schwarze 287: return;
288:
1.5 schwarze 289: if (ps->cur == NULL) {
1.6 schwarze 290: error_msg(ps, "discarding text before document: %.*s", sz, p);
1.5 schwarze 291: return;
292: }
293:
1.1 schwarze 294: if (ps->cur->node != NODE_TEXT) {
295: if ((dat = calloc(1, sizeof(*dat))) == NULL) {
296: perror(NULL);
297: exit(1);
298: }
299: dat->node = NODE_TEXT;
300: dat->parent = ps->cur;
301: TAILQ_INIT(&dat->childq);
302: TAILQ_INIT(&dat->attrq);
303: TAILQ_INSERT_TAIL(&ps->cur->childq, dat, child);
304: ps->cur = dat;
305: }
306:
1.5 schwarze 307: if (ps->tree->flags & TREE_CLOSED &&
1.6 schwarze 308: ps->cur->parent == ps->tree->root)
309: warn_msg(ps, "text after end of document: %.*s", sz, p);
1.5 schwarze 310:
1.1 schwarze 311: /* Append to the current text node. */
312:
313: assert(sz >= 0);
314: ps->cur->b = realloc(ps->cur->b, ps->cur->bsz + sz + 1);
315: if (ps->cur->b == NULL) {
316: perror(NULL);
317: exit(1);
318: }
319: memcpy(ps->cur->b + ps->cur->bsz, p, sz);
320: ps->cur->bsz += sz;
321: ps->cur->b[ps->cur->bsz] = '\0';
322: ps->cur->real = ps->cur->b;
323: }
324:
325: static void
326: pnode_trim(struct pnode *pn)
327: {
328: assert(pn->node == NODE_TEXT);
329: for (; pn->bsz > 0; pn->b[--pn->bsz] = '\0')
330: if (isspace((unsigned char)pn->b[pn->bsz - 1]) == 0)
331: break;
332: }
333:
1.9 schwarze 334: static void
335: xml_entity(struct parse *p, const char *name)
336: {
337: const struct entity *entity;
338: struct pnode *dat;
339:
340: if (p->del > 0)
341: return;
342:
343: if (p->cur == NULL) {
344: error_msg(p, "discarding entity before document: &%s;", name);
345: return;
346: }
347:
348: /* Close out the text node, if there is one. */
349: if (p->cur->node == NODE_TEXT) {
350: pnode_trim(p->cur);
351: p->cur = p->cur->parent;
352: }
353:
354: if (p->tree->flags & TREE_CLOSED && p->cur == p->tree->root)
355: warn_msg(p, "entity after end of document: &%s;", name);
356:
357: for (entity = entities; entity->name != NULL; entity++)
358: if (strcmp(name, entity->name) == 0)
359: break;
360:
361: if (entity->roff == NULL) {
362: error_msg(p, "unknown entity &%s;", name);
363: return;
364: }
365:
366: /* Create, append, and close out an entity node. */
367: if ((dat = calloc(1, sizeof(*dat))) == NULL ||
368: (dat->b = dat->real = strdup(entity->roff)) == NULL) {
369: perror(NULL);
370: exit(1);
371: }
372: dat->node = NODE_ESCAPE;
373: dat->bsz = strlen(dat->b);
374: dat->parent = p->cur;
375: TAILQ_INIT(&dat->childq);
376: TAILQ_INIT(&dat->attrq);
377: TAILQ_INSERT_TAIL(&p->cur->childq, dat, child);
378: }
379:
1.1 schwarze 380: /*
381: * Begin an element.
382: */
383: static void
1.5 schwarze 384: xml_elem_start(struct parse *ps, const char *name)
1.1 schwarze 385: {
1.5 schwarze 386: const struct element *elem;
387: struct pnode *dat;
1.1 schwarze 388:
1.5 schwarze 389: if (*name == '!' || *name == '?')
1.1 schwarze 390: return;
391:
1.4 schwarze 392: /*
393: * An ancestor is excluded from the tree;
394: * keep track of the number of levels excluded.
395: */
396: if (ps->del > 0) {
397: ps->del++;
398: return;
399: }
400:
1.1 schwarze 401: /* Close out the text node, if there is one. */
402: if (ps->cur != NULL && ps->cur->node == NODE_TEXT) {
403: pnode_trim(ps->cur);
404: ps->cur = ps->cur->parent;
405: }
406:
407: for (elem = elements; elem->name != NULL; elem++)
408: if (strcmp(elem->name, name) == 0)
409: break;
410:
1.6 schwarze 411: if (elem->name == NULL)
412: error_msg(ps, "unknown element <%s>", name);
413:
1.5 schwarze 414: ps->ncur = elem->node;
1.1 schwarze 415:
1.5 schwarze 416: switch (ps->ncur) {
1.4 schwarze 417: case NODE_DELETE_WARN:
1.6 schwarze 418: warn_msg(ps, "skipping element <%s>", name);
1.2 schwarze 419: /* FALLTHROUGH */
1.4 schwarze 420: case NODE_DELETE:
421: ps->del = 1;
422: /* FALLTHROUGH */
1.2 schwarze 423: case NODE_IGNORE:
424: return;
425: case NODE_INLINEEQUATION:
1.1 schwarze 426: ps->tree->flags |= TREE_EQN;
1.2 schwarze 427: break;
428: default:
429: break;
430: }
1.1 schwarze 431:
1.6 schwarze 432: if (ps->tree->flags & TREE_CLOSED && ps->cur->parent == NULL)
433: warn_msg(ps, "element after end of document: <%s>", name);
1.5 schwarze 434:
1.1 schwarze 435: if ((dat = calloc(1, sizeof(*dat))) == NULL) {
436: perror(NULL);
437: exit(1);
438: }
439: dat->node = elem->node;
440: dat->parent = ps->cur;
441: TAILQ_INIT(&dat->childq);
442: TAILQ_INIT(&dat->attrq);
443:
444: if (ps->cur != NULL)
445: TAILQ_INSERT_TAIL(&ps->cur->childq, dat, child);
446:
447: ps->cur = dat;
448: if (ps->tree->root == NULL)
449: ps->tree->root = dat;
1.5 schwarze 450: }
451:
452: static void
453: xml_attrkey(struct parse *ps, const char *name)
454: {
455: struct pattr *attr;
456: enum attrkey key;
1.1 schwarze 457:
1.5 schwarze 458: if (ps->del > 0 || *name == '\0')
459: return;
460: if ((key = attrkey_parse(name)) == ATTRKEY__MAX) {
461: ps->attr = 0;
462: return;
463: }
464: if ((attr = calloc(1, sizeof(*attr))) == NULL) {
465: perror(NULL);
466: exit(1);
467: }
468: attr->key = key;
469: attr->val = ATTRVAL__MAX;
470: attr->rawval = NULL;
471: TAILQ_INSERT_TAIL(&ps->cur->attrq, attr, child);
472: ps->attr = 1;
473: }
474:
475: static void
476: xml_attrval(struct parse *ps, const char *name)
477: {
478: struct pattr *attr;
479:
480: if (ps->del > 0 || ps->attr == 0)
481: return;
482: if ((attr = TAILQ_LAST(&ps->cur->attrq, pattrq)) == NULL)
483: return;
484: if ((attr->val = attrval_parse(name)) == ATTRVAL__MAX &&
485: (attr->rawval = strdup(name)) == NULL) {
486: perror(NULL);
487: exit(1);
1.1 schwarze 488: }
489: }
490:
491: /*
492: * Roll up the parse tree.
493: * If we're at a text node, roll that one up first.
494: */
495: static void
1.5 schwarze 496: xml_elem_end(struct parse *ps, const char *name)
1.1 schwarze 497: {
1.5 schwarze 498: const struct element *elem;
499: enum nodeid node;
1.1 schwarze 500:
1.4 schwarze 501: /*
502: * An ancestor is excluded from the tree;
503: * keep track of the number of levels excluded.
504: */
505: if (ps->del > 1) {
506: ps->del--;
507: return;
508: }
509:
1.1 schwarze 510: /* Close out the text node, if there is one. */
1.5 schwarze 511: if (ps->del == 0 && ps->cur != NULL && ps->cur->node == NODE_TEXT) {
1.1 schwarze 512: pnode_trim(ps->cur);
513: ps->cur = ps->cur->parent;
514: }
1.2 schwarze 515:
1.5 schwarze 516: if (name != NULL) {
517: for (elem = elements; elem->name != NULL; elem++)
518: if (strcmp(elem->name, name) == 0)
519: break;
520: node = elem->node;
521: } else
522: node = ps->ncur;
1.2 schwarze 523:
1.5 schwarze 524: switch (node) {
1.4 schwarze 525: case NODE_DELETE_WARN:
526: case NODE_DELETE:
1.5 schwarze 527: if (ps->del > 0)
528: ps->del--;
1.4 schwarze 529: break;
1.2 schwarze 530: case NODE_IGNORE:
531: break;
532: default:
1.5 schwarze 533: if (ps->cur == NULL || node != ps->cur->node) {
1.6 schwarze 534: warn_msg(ps, "element not open: </%s>", name);
1.5 schwarze 535: break;
536: }
537:
538: /*
539: * Refrain from actually closing the document element.
540: * If no more content follows, no harm is done, but if
541: * some content still follows, simply processing it is
542: * obviously better than discarding it or crashing.
543: */
544:
545: if (ps->cur->parent == NULL)
546: ps->tree->flags |= TREE_CLOSED;
547: else
548: ps->cur = ps->cur->parent;
1.4 schwarze 549: break;
1.2 schwarze 550: }
1.4 schwarze 551: assert(ps->del == 0);
1.1 schwarze 552: }
553:
554: struct parse *
555: parse_alloc(int warn)
556: {
557: struct parse *p;
558:
559: if ((p = calloc(1, sizeof(*p))) == NULL)
560: return NULL;
561:
562: if ((p->tree = calloc(1, sizeof(*p->tree))) == NULL) {
563: free(p);
564: return NULL;
565: }
566: p->warn = warn;
567: return p;
568: }
569:
570: void
571: parse_free(struct parse *p)
572: {
573: if (p == NULL)
574: return;
575: if (p->tree != NULL) {
576: pnode_unlink(p->tree->root);
577: free(p->tree);
578: }
579: free(p);
580: }
581:
1.5 schwarze 582: /*
583: * Advance the pend pointer to the next character in the charset.
584: * If the charset starts with a space, it stands for any whitespace.
585: * Update the new input file position, used for messages.
586: * Do not overrun the buffer b of length rlen.
587: * When reaching the end, NUL-terminate the buffer and return 1;
588: * otherwise, return 0.
589: */
590: static int
591: advance(struct parse *p, char *b, size_t rlen, size_t *pend,
592: const char *charset)
593: {
594: int space;
595:
596: if (*charset == ' ') {
597: space = 1;
598: charset++;
599: } else
600: space = 0;
601:
602: p->nline = p->line;
603: p->ncol = p->col;
604: while (*pend < rlen) {
605: if (b[*pend] == '\n') {
606: p->nline++;
607: p->ncol = 1;
608: } else
609: p->ncol++;
610: if (space && isspace((unsigned char)b[*pend]))
611: break;
612: if (strchr(charset, b[*pend]) != NULL)
613: break;
614: ++*pend;
615: }
616: if (*pend == rlen) {
617: b[rlen] = '\0';
618: return 1;
619: } else
620: return 0;
621: }
622:
1.1 schwarze 623: struct ptree *
624: parse_file(struct parse *p, int fd, const char *fname)
625: {
626: char b[4096];
1.8 schwarze 627: char *cp;
1.5 schwarze 628: ssize_t rsz; /* Return value from read(2). */
629: size_t rlen; /* Number of bytes in b[]. */
630: size_t poff; /* Parse offset in b[]. */
631: size_t pend; /* Offset of the end of the current word. */
632: int in_tag, in_arg, in_quotes, elem_end;
1.1 schwarze 633:
634: p->fname = fname;
1.5 schwarze 635: p->nline = 1;
636: p->ncol = 1;
637: rlen = 0;
638: in_tag = in_arg = in_quotes = 0;
639:
640: /*
641: * Read loop.
642: *
643: * We have to enter the read loop once more even on EOF
644: * because the previous token may have been incomplete,
645: * such that it asked for more input.
646: * Once rsz is 0, incomplete tokens will no longer ask
647: * for more input but instead use whatever there is,
648: * and then exit the read loop.
649: * The minus one on the size limit for read(2) is needed
650: * such that advance() can set b[rlen] to NUL when needed.
651: */
652:
653: while ((rsz = read(fd, b + rlen, sizeof(b) - rlen - 1)) >= 0) {
654: if ((rlen += rsz) == 0)
655: break;
656:
657: /* Token loop. */
658:
659: pend = 0;
660: for (;;) {
661:
662: /* Proceed to the next token, skipping whitespace. */
663:
664: p->line = p->nline;
665: p->col = p->ncol;
666: if ((poff = pend) == rlen)
667: break;
668: if (isspace((unsigned char)b[pend])) {
669: if (b[pend++] == '\n') {
670: p->nline++;
671: p->ncol = 1;
672: } else
673: p->ncol++;
674: continue;
675: }
676:
677: /*
1.9 schwarze 678: * The following four cases (in_arg, in_tag, and
679: * starting an entity or a tag) all parse a word
680: * or quoted string. If that extends beyond the
1.5 schwarze 681: * read buffer and the last read(2) still got
682: * data, they all break out of the token loop
683: * to request more data from the read loop.
684: *
1.9 schwarze 685: * Also, three of them detect self-closing tags,
1.5 schwarze 686: * those ending with "/>", setting the flag
687: * elem_end and calling xml_elem_end() at the
688: * very end, after handling the attribute value,
689: * attribute name, or tag name, respectively.
690: */
691:
692: /* Parse an attribute value. */
693:
694: if (in_arg) {
1.10 ! schwarze 695: if (in_quotes == 0 &&
! 696: (b[pend] == '\'' || b[pend] == '"')) {
! 697: in_quotes = b[pend] == '"' ? 2 : 1;
1.5 schwarze 698: p->ncol++;
699: pend++;
700: continue;
701: }
702: if (advance(p, b, rlen, &pend,
1.10 ! schwarze 703: in_quotes == 2 ? "\"" :
! 704: in_quotes == 1 ? "'" : " >") && rsz > 0)
1.5 schwarze 705: break;
706: in_arg = in_quotes = elem_end = 0;
707: if (b[pend] == '>') {
708: in_tag = 0;
709: if (pend > 0 && b[pend - 1] == '/') {
710: b[pend - 1] = '\0';
711: elem_end = 1;
712: }
713: }
714: b[pend] = '\0';
715: if (pend < rlen)
716: pend++;
717: xml_attrval(p, b + poff);
718: if (elem_end)
719: xml_elem_end(p, NULL);
720:
721: /* Look for an attribute name. */
722:
723: } else if (in_tag) {
724: if (advance(p, b, rlen, &pend, " =>") &&
725: rsz > 0)
726: break;
727: elem_end = 0;
728: switch (b[pend]) {
729: case '>':
730: in_tag = 0;
731: if (pend > 0 && b[pend - 1] == '/') {
732: b[pend - 1] = '\0';
733: elem_end = 1;
734: }
735: break;
736: case '=':
737: in_arg = 1;
738: break;
739: default:
740: break;
741: }
742: b[pend] = '\0';
743: if (pend < rlen)
744: pend++;
745: xml_attrkey(p, b + poff);
746: if (elem_end)
747: xml_elem_end(p, NULL);
748:
749: /* Begin an opening or closing tag. */
750:
751: } else if (b[poff] == '<') {
752: if (advance(p, b, rlen, &pend, " >") &&
753: rsz > 0)
754: break;
1.8 schwarze 755: if (pend > poff + 3 &&
756: strncmp(b + poff, "<!--", 4) == 0) {
757:
758: /* Skip a comment. */
759:
760: cp = strstr(b + pend - 2, "-->");
761: if (cp == NULL) {
762: if (rsz > 0) {
763: pend = rlen;
764: break;
765: }
766: cp = b + rlen;
767: } else
768: cp += 3;
769: while (b + pend < cp) {
770: if (b[++pend] == '\n') {
771: p->nline++;
772: p->ncol = 1;
773: } else
774: p->ncol++;
775: }
776: continue;
777: }
1.5 schwarze 778: elem_end = 0;
779: if (b[pend] != '>')
780: in_tag = 1;
781: else if (pend > 0 && b[pend - 1] == '/') {
782: b[pend - 1] = '\0';
783: elem_end = 1;
784: }
785: b[pend] = '\0';
786: if (pend < rlen)
787: pend++;
788: if (b[++poff] == '/') {
789: elem_end = 1;
790: poff++;
791: } else
792: xml_elem_start(p, b + poff);
793: if (elem_end)
794: xml_elem_end(p, b + poff);
795:
1.9 schwarze 796: /* Process an entity. */
797:
798: } else if (b[poff] == '&') {
799: if (advance(p, b, rlen, &pend, ";") &&
800: rsz > 0)
801: break;
802: b[pend] = '\0';
803: if (pend < rlen)
804: pend++;
805: xml_entity(p, b + poff + 1);
806:
807: /* Process text up to the next tag or entity. */
1.5 schwarze 808:
809: } else {
1.9 schwarze 810: if (advance(p, b, rlen, &pend, "<&") == 0)
1.5 schwarze 811: p->ncol--;
812: xml_char(p, b + poff, pend - poff);
813: }
1.1 schwarze 814: }
1.5 schwarze 815:
816: /* Buffer exhausted; shift left and re-fill. */
817:
818: assert(poff > 0);
819: memmove(b, b + poff, rlen - poff);
820: rlen -= poff;
821: }
822: if (rsz < 0) {
823: perror(fname);
824: p->tree->flags |= TREE_FAIL;
825: }
826: if (p->cur != NULL && p->cur->node == NODE_TEXT) {
827: pnode_trim(p->cur);
828: p->cur = p->cur->parent;
829: }
1.6 schwarze 830: if ((p->tree->flags & TREE_CLOSED) == 0)
831: warn_msg(p, "document not closed");
1.1 schwarze 832: return p->tree;
833: }
CVSweb