/* $Id: parse.c,v 1.23 2019/04/08 14:37:31 schwarze Exp $ */ /* * Copyright (c) 2014 Kristaps Dzonsons * Copyright (c) 2019 Ingo Schwarze * * Permission to use, copy, modify, and distribute this software for any * purpose with or without fee is hereby granted, provided that the above * copyright notice and this permission notice appear in all copies. * * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ #include #include #include #include #include #include #include #include "node.h" #include "parse.h" /* * The implementation of the DocBook parser. */ enum pstate { PARSE_ELEM, PARSE_TAG, PARSE_ARG, PARSE_SQ, PARSE_DQ }; /* * Global parse state. * Keep this as simple and small as possible. */ struct parse { const char *fname; /* Name of the input file. */ struct ptree *tree; /* Complete parse result. */ struct pnode *doctype; struct pnode *cur; /* Current node in the tree. */ enum nodeid ncur; /* Type of the current node. */ int line; /* Line number in the input file. */ int col; /* Column number in the input file. */ int nline; /* Line number of next token. */ int ncol; /* Column number of next token. */ int del; /* Levels of nested nodes being deleted. */ int flags; #define PFLAG_WARN (1 << 0) /* Print warning messages. */ #define PFLAG_SPC (1 << 1) /* Whitespace before the next element. */ #define PFLAG_ATTR (1 << 2) /* The most recent attribute is valid. */ #define PFLAG_EEND (1 << 3) /* This element is self-closing. */ }; struct element { const char *name; /* DocBook element name. */ enum nodeid node; /* Node type to generate. */ }; static const struct element elements[] = { { "acronym", NODE_IGNORE }, { "affiliation", NODE_AFFILIATION }, { "anchor", NODE_DELETE }, { "appendix", NODE_APPENDIX }, { "application", NODE_APPLICATION }, { "arg", NODE_ARG }, { "article", NODE_SECTION }, { "author", NODE_AUTHOR }, { "authorgroup", NODE_AUTHORGROUP }, { "blockquote", NODE_BLOCKQUOTE }, { "book", NODE_SECTION }, { "bookinfo", NODE_BOOKINFO }, { "caution", NODE_CAUTION }, { "chapter", NODE_SECTION }, { "citerefentry", NODE_CITEREFENTRY }, { "citetitle", NODE_CITETITLE }, { "cmdsynopsis", NODE_CMDSYNOPSIS }, { "code", NODE_LITERAL }, { "colspec", NODE_COLSPEC }, { "command", NODE_COMMAND }, { "constant", NODE_CONSTANT }, { "contrib", NODE_CONTRIB }, { "copyright", NODE_COPYRIGHT }, { "date", NODE_DATE }, { "!doctype", NODE_DOCTYPE }, { "!DOCTYPE", NODE_DOCTYPE }, { "editor", NODE_EDITOR }, { "email", NODE_EMAIL }, { "emphasis", NODE_EMPHASIS }, { "!ENTITY", NODE_ENTITY }, { "entry", NODE_ENTRY }, { "envar", NODE_ENVAR }, { "errorname", NODE_ERRORNAME }, { "fieldsynopsis", NODE_FIELDSYNOPSIS }, { "filename", NODE_FILENAME }, { "firstname", NODE_PERSONNAME }, { "firstterm", NODE_FIRSTTERM }, { "footnote", NODE_FOOTNOTE }, { "funcdef", NODE_FUNCDEF }, { "funcprototype", NODE_FUNCPROTOTYPE }, { "funcsynopsis", NODE_FUNCSYNOPSIS }, { "funcsynopsisinfo", NODE_FUNCSYNOPSISINFO }, { "function", NODE_FUNCTION }, { "glossary", NODE_VARIABLELIST }, { "glossdef", NODE_IGNORE }, { "glossdiv", NODE_IGNORE }, { "glossentry", NODE_VARLISTENTRY }, { "glosslist", NODE_VARIABLELIST }, { "glossterm", NODE_GLOSSTERM }, { "group", NODE_GROUP }, { "holder", NODE_HOLDER }, { "index", NODE_INDEX }, { "indexterm", NODE_DELETE }, { "info", NODE_INFO }, { "informalequation", NODE_INFORMALEQUATION }, { "informaltable", NODE_TABLE }, { "inlineequation", NODE_INLINEEQUATION }, { "itemizedlist", NODE_ITEMIZEDLIST }, { "keysym", NODE_KEYSYM }, { "legalnotice", NODE_LEGALNOTICE }, { "link", NODE_LINK }, { "listitem", NODE_LISTITEM }, { "literal", NODE_LITERAL }, { "literallayout", NODE_LITERALLAYOUT }, { "manvolnum", NODE_MANVOLNUM }, { "member", NODE_MEMBER }, { "mml:math", NODE_MML_MATH }, { "mml:mfenced", NODE_MML_MFENCED }, { "mml:mfrac", NODE_MML_MFRAC }, { "mml:mi", NODE_MML_MI }, { "mml:mn", NODE_MML_MN }, { "mml:mo", NODE_MML_MO }, { "mml:mrow", NODE_MML_MROW }, { "mml:msub", NODE_MML_MSUB }, { "mml:msup", NODE_MML_MSUP }, { "modifier", NODE_MODIFIER }, { "note", NODE_NOTE }, { "option", NODE_OPTION }, { "orderedlist", NODE_ORDEREDLIST }, { "orgname", NODE_ORGNAME }, { "othername", NODE_PERSONNAME }, { "para", NODE_PARA }, { "paramdef", NODE_PARAMDEF }, { "parameter", NODE_PARAMETER }, { "part", NODE_SECTION }, { "personname", NODE_PERSONNAME }, { "phrase", NODE_IGNORE }, { "preface", NODE_PREFACE }, { "primary", NODE_DELETE }, { "programlisting", NODE_PROGRAMLISTING }, { "prompt", NODE_PROMPT }, { "quote", NODE_QUOTE }, { "refclass", NODE_REFCLASS }, { "refdescriptor", NODE_REFDESCRIPTOR }, { "refentry", NODE_REFENTRY }, { "refentryinfo", NODE_REFENTRYINFO }, { "refentrytitle", NODE_REFENTRYTITLE }, { "refmeta", NODE_REFMETA }, { "refmetainfo", NODE_REFMETAINFO }, { "refmiscinfo", NODE_REFMISCINFO }, { "refname", NODE_REFNAME }, { "refnamediv", NODE_REFNAMEDIV }, { "refpurpose", NODE_REFPURPOSE }, { "refsect1", NODE_SECTION }, { "refsect2", NODE_SECTION }, { "refsect3", NODE_SECTION }, { "refsection", NODE_SECTION }, { "refsynopsisdiv", NODE_REFSYNOPSISDIV }, { "releaseinfo", NODE_RELEASEINFO }, { "replaceable", NODE_REPLACEABLE }, { "row", NODE_ROW }, { "sbr", NODE_SBR }, { "screen", NODE_SCREEN }, { "secondary", NODE_DELETE }, { "sect1", NODE_SECTION }, { "sect2", NODE_SECTION }, { "section", NODE_SECTION }, { "sgmltag", NODE_SGMLTAG }, { "simpara", NODE_PARA }, { "simplelist", NODE_SIMPLELIST }, { "spanspec", NODE_SPANSPEC }, { "structfield", NODE_PARAMETER }, { "structname", NODE_TYPE }, { "subtitle", NODE_SUBTITLE }, { "surname", NODE_PERSONNAME }, { "symbol", NODE_CONSTANT }, { "synopsis", NODE_SYNOPSIS }, { "table", NODE_TABLE }, { "tbody", NODE_TBODY }, { "term", NODE_TERM }, { "tfoot", NODE_TFOOT }, { "tgroup", NODE_TGROUP }, { "thead", NODE_THEAD }, { "tip", NODE_TIP }, { "title", NODE_TITLE }, { "trademark", NODE_IGNORE }, { "type", NODE_TYPE }, { "ulink", NODE_LINK }, { "userinput", NODE_LITERAL }, { "variablelist", NODE_VARIABLELIST }, { "varlistentry", NODE_VARLISTENTRY }, { "varname", NODE_VARNAME }, { "warning", NODE_WARNING }, { "wordasword", NODE_WORDASWORD }, { "xi:include", NODE_DELETE_WARN }, { "year", NODE_YEAR }, { NULL, NODE_IGNORE } }; struct entity { const char *name; const char *roff; }; /* * XML character entity references found in the wild. * Those that don't have an exact mandoc_char(7) representation * are approximated, and the desired codepoint is given as a comment. * Encoding them as \\[u...] would leave -Tascii out in the cold. */ static const struct entity entities[] = { { "alpha", "\\(*a" }, { "amp", "&" }, { "apos", "'" }, { "auml", "\\(:a" }, { "beta", "\\(*b" }, { "circ", "^" }, /* U+02C6 */ { "copy", "\\(co" }, { "dagger", "\\(dg" }, { "Delta", "\\(*D" }, { "eacute", "\\('e" }, { "emsp", "\\ " }, /* U+2003 */ { "gt", ">" }, { "hairsp", "\\^" }, { "kappa", "\\(*k" }, { "larr", "\\(<-" }, { "ldquo", "\\(lq" }, { "le", "\\(<=" }, { "lowbar", "_" }, { "lsqb", "[" }, { "lt", "<" }, { "mdash", "\\(em" }, { "minus", "\\-" }, { "ndash", "\\(en" }, { "nbsp", "\\ " }, { "num", "#" }, { "oslash", "\\(/o" }, { "ouml", "\\(:o" }, { "percnt", "%" }, { "quot", "\\(dq" }, { "rarr", "\\(->" }, { "rArr", "\\(rA" }, { "rdquo", "\\(rq" }, { "reg", "\\(rg" }, { "rho", "\\(*r" }, { "rsqb", "]" }, { "sigma", "\\(*s" }, { "shy", "\\&" }, /* U+00AD */ { "tau", "\\(*t" }, { "tilde", "\\[u02DC]" }, { "times", "\\[tmu]" }, { "uuml", "\\(:u" }, { NULL, NULL } }; static size_t parse_string(struct parse *, char *, size_t, enum pstate *, int); static void error_msg(struct parse *p, const char *fmt, ...) { va_list ap; fprintf(stderr, "%s:%d:%d: ", p->fname, p->line, p->col); va_start(ap, fmt); vfprintf(stderr, fmt, ap); va_end(ap); fputc('\n', stderr); p->tree->flags |= TREE_FAIL; } static void warn_msg(struct parse *p, const char *fmt, ...) { va_list ap; if ((p->flags & PFLAG_WARN) == 0) return; fprintf(stderr, "%s:%d:%d: warning: ", p->fname, p->line, p->col); va_start(ap, fmt); vfprintf(stderr, fmt, ap); va_end(ap); fputc('\n', stderr); } /* * Process a string of characters. * If a text node is already open, append to it. * Otherwise, create a new one as a child of the current node. */ static void xml_char(struct parse *ps, const char *p, int sz) { struct pnode *dat; size_t newsz; if (ps->del > 0) return; if (ps->cur == NULL) { error_msg(ps, "discarding text before document: %.*s", sz, p); return; } if (ps->cur->node != NODE_TEXT) { if ((dat = calloc(1, sizeof(*dat))) == NULL) { perror(NULL); exit(1); } dat->node = NODE_TEXT; dat->spc = (ps->flags & PFLAG_SPC) != 0; dat->parent = ps->cur; TAILQ_INIT(&dat->childq); TAILQ_INIT(&dat->attrq); TAILQ_INSERT_TAIL(&ps->cur->childq, dat, child); ps->cur = dat; } if (ps->tree->flags & TREE_CLOSED && ps->cur->parent == ps->tree->root) warn_msg(ps, "text after end of document: %.*s", sz, p); /* Append to the current text node. */ assert(sz >= 0); newsz = ps->cur->bsz + (ps->cur->bsz && (ps->flags & PFLAG_SPC)) + sz; ps->cur->b = realloc(ps->cur->b, newsz + 1); if (ps->cur->b == NULL) { perror(NULL); exit(1); } if (ps->cur->bsz && (ps->flags & PFLAG_SPC)) ps->cur->b[ps->cur->bsz++] = ' '; memcpy(ps->cur->b + ps->cur->bsz, p, sz); ps->cur->b[ps->cur->bsz = newsz] = '\0'; ps->cur->real = ps->cur->b; ps->flags &= ~PFLAG_SPC; } /* * Close out the text node and strip trailing whitespace, if one is open. */ static void pnode_closetext(struct parse *p) { struct pnode *n; if ((n = p->cur) == NULL || n->node != NODE_TEXT) return; p->cur = n->parent; while (n->bsz > 0 && isspace((unsigned char)n->b[n->bsz - 1])) { n->b[--n->bsz] = '\0'; p->flags |= PFLAG_SPC; } } static void xml_entity(struct parse *p, const char *name) { const struct entity *entity; struct pnode *dat; const char *ccp; char *cp; enum pstate pstate; if (p->del > 0) return; if (p->cur == NULL) { error_msg(p, "discarding entity before document: &%s;", name); return; } pnode_closetext(p); if (p->tree->flags & TREE_CLOSED && p->cur == p->tree->root) warn_msg(p, "entity after end of document: &%s;", name); for (entity = entities; entity->name != NULL; entity++) if (strcmp(name, entity->name) == 0) break; if (entity->roff == NULL) { if (p->doctype != NULL) { TAILQ_FOREACH(dat, &p->doctype->childq, child) { if ((ccp = pnode_getattr_raw(dat, ATTRKEY_NAME, NULL)) == NULL || strcmp(ccp, name) != 0 || (ccp = pnode_getattr_raw(dat, ATTRKEY_DEFINITION, NULL)) == NULL) continue; if ((cp = strdup(ccp)) == NULL) { perror(NULL); exit(1); } pstate = PARSE_ELEM; parse_string(p, cp, strlen(cp), &pstate, 0); p->flags &= ~PFLAG_SPC; free(cp); return; } } error_msg(p, "unknown entity &%s;", name); return; } /* Create, append, and close out an entity node. */ if ((dat = calloc(1, sizeof(*dat))) == NULL || (dat->b = dat->real = strdup(entity->roff)) == NULL) { perror(NULL); exit(1); } dat->node = NODE_ESCAPE; dat->bsz = strlen(dat->b); dat->spc = (p->flags & PFLAG_SPC) != 0; dat->parent = p->cur; TAILQ_INIT(&dat->childq); TAILQ_INIT(&dat->attrq); TAILQ_INSERT_TAIL(&p->cur->childq, dat, child); p->flags &= ~PFLAG_SPC; } /* * Begin an element. */ static void xml_elem_start(struct parse *ps, const char *name) { const struct element *elem; struct pnode *dat; /* * An ancestor is excluded from the tree; * keep track of the number of levels excluded. */ if (ps->del > 0) { if (*name != '!' && *name != '?') ps->del++; return; } pnode_closetext(ps); for (elem = elements; elem->name != NULL; elem++) if (strcmp(elem->name, name) == 0) break; if (elem->name == NULL) { if (*name == '!' || *name == '?') return; error_msg(ps, "unknown element <%s>", name); } ps->ncur = elem->node; switch (ps->ncur) { case NODE_DELETE_WARN: warn_msg(ps, "skipping element <%s>", name); /* FALLTHROUGH */ case NODE_DELETE: ps->del = 1; /* FALLTHROUGH */ case NODE_IGNORE: return; case NODE_INLINEEQUATION: ps->tree->flags |= TREE_EQN; break; default: break; } if (ps->tree->flags & TREE_CLOSED && ps->cur->parent == NULL) warn_msg(ps, "element after end of document: <%s>", name); if ((dat = calloc(1, sizeof(*dat))) == NULL) { perror(NULL); exit(1); } /* * Nodes that begin a new macro or request line or start by * printing text always want whitespace before themselves. */ switch (dat->node = elem->node) { case NODE_DOCTYPE: case NODE_ENTITY: case NODE_SBR: ps->flags |= PFLAG_EEND; /* FALLTHROUGH */ case NODE_APPENDIX: case NODE_AUTHORGROUP: case NODE_BLOCKQUOTE: case NODE_BOOKINFO: case NODE_CAUTION: case NODE_EDITOR: case NODE_ENTRY: case NODE_FUNCDEF: case NODE_FUNCPROTOTYPE: case NODE_INFORMALEQUATION: case NODE_INLINEEQUATION: case NODE_ITEMIZEDLIST: case NODE_LEGALNOTICE: case NODE_LITERALLAYOUT: case NODE_NOTE: case NODE_ORDEREDLIST: case NODE_PARA: case NODE_PREFACE: case NODE_PROGRAMLISTING: case NODE_REFMETA: case NODE_REFNAMEDIV: case NODE_REFSYNOPSISDIV: case NODE_ROW: case NODE_SCREEN: case NODE_SECTION: case NODE_SYNOPSIS: case NODE_TGROUP: case NODE_TIP: case NODE_TITLE: case NODE_VARIABLELIST: case NODE_VARLISTENTRY: case NODE_WARNING: dat->spc = 1; break; default: dat->spc = (ps->flags & PFLAG_SPC) != 0; break; } dat->parent = ps->cur; TAILQ_INIT(&dat->childq); TAILQ_INIT(&dat->attrq); if (ps->cur != NULL) TAILQ_INSERT_TAIL(&ps->cur->childq, dat, child); ps->cur = dat; if (dat->node == NODE_DOCTYPE) { if (ps->doctype == NULL) ps->doctype = dat; else error_msg(ps, "duplicate doctype"); } else if (dat->parent == NULL && ps->tree->root == NULL) ps->tree->root = dat; } static void xml_attrkey(struct parse *ps, const char *name) { struct pattr *attr; const char *value; enum attrkey key; if (ps->del > 0 || ps->ncur == NODE_IGNORE || *name == '\0') return; if ((ps->ncur == NODE_DOCTYPE || ps->ncur == NODE_ENTITY) && TAILQ_FIRST(&ps->cur->attrq) == NULL) { value = name; name = "NAME"; } else value = NULL; if ((key = attrkey_parse(name)) == ATTRKEY__MAX) { ps->flags &= ~PFLAG_ATTR; return; } if ((attr = calloc(1, sizeof(*attr))) == NULL) { perror(NULL); exit(1); } attr->key = key; attr->val = ATTRVAL__MAX; if (value == NULL) { attr->rawval = NULL; ps->flags |= PFLAG_ATTR; } else { if ((attr->rawval = strdup(value)) == NULL) { perror(NULL); exit(1); } ps->flags &= ~PFLAG_ATTR; } TAILQ_INSERT_TAIL(&ps->cur->attrq, attr, child); if (ps->ncur == NODE_ENTITY && key == ATTRKEY_NAME) xml_attrkey(ps, "DEFINITION"); } static void xml_attrval(struct parse *ps, const char *name) { struct pattr *attr; if (ps->del > 0 || ps->ncur == NODE_IGNORE || (ps->flags & PFLAG_ATTR) == 0) return; if ((attr = TAILQ_LAST(&ps->cur->attrq, pattrq)) == NULL) return; if ((attr->val = attrval_parse(name)) == ATTRVAL__MAX && (attr->rawval = strdup(name)) == NULL) { perror(NULL); exit(1); } } /* * Roll up the parse tree. * If we're at a text node, roll that one up first. */ static void xml_elem_end(struct parse *ps, const char *name) { const struct element *elem; enum nodeid node; /* * An ancestor is excluded from the tree; * keep track of the number of levels excluded. */ if (ps->del > 1) { ps->del--; return; } if (ps->del == 0) pnode_closetext(ps); if (name != NULL) { for (elem = elements; elem->name != NULL; elem++) if (strcmp(elem->name, name) == 0) break; node = elem->node; } else node = ps->ncur; switch (node) { case NODE_DELETE_WARN: case NODE_DELETE: if (ps->del > 0) ps->del--; break; case NODE_IGNORE: break; case NODE_DOCTYPE: ps->flags &= ~PFLAG_EEND; /* FALLTHROUGH */ default: if (ps->cur == NULL || node != ps->cur->node) { warn_msg(ps, "element not open: ", name); break; } /* * Refrain from actually closing the document element. * If no more content follows, no harm is done, but if * some content still follows, simply processing it is * obviously better than discarding it or crashing. */ if (ps->cur->parent != NULL || node == NODE_DOCTYPE) { ps->cur = ps->cur->parent; if (ps->cur != NULL) ps->ncur = ps->cur->node; } else ps->tree->flags |= TREE_CLOSED; ps->flags &= ~PFLAG_SPC; break; } assert(ps->del == 0); } struct parse * parse_alloc(int warn) { struct parse *p; if ((p = calloc(1, sizeof(*p))) == NULL) return NULL; if ((p->tree = calloc(1, sizeof(*p->tree))) == NULL) { free(p); return NULL; } if (warn) p->flags |= PFLAG_WARN; else p->flags &= ~PFLAG_WARN; return p; } void parse_free(struct parse *p) { if (p == NULL) return; if (p->tree != NULL) { pnode_unlink(p->tree->root); free(p->tree); } free(p); } static void increment(struct parse *p, char *b, size_t *pend, int refill) { if (refill) { if (b[*pend] == '\n') { p->nline++; p->ncol = 1; } else p->ncol++; } ++*pend; } /* * Advance the pend pointer to the next character in the charset. * If the charset starts with a space, it stands for any whitespace. * Update the new input file position, used for messages. * Do not overrun the buffer b of length rlen. * When reaching the end, NUL-terminate the buffer and return 1; * otherwise, return 0. */ static int advance(struct parse *p, char *b, size_t rlen, size_t *pend, const char *charset, int refill) { int space; if (*charset == ' ') { space = 1; charset++; } else space = 0; if (refill) { p->nline = p->line; p->ncol = p->col; } while (*pend < rlen) { if (space && isspace((unsigned char)b[*pend])) break; if (strchr(charset, b[*pend]) != NULL) break; increment(p, b, pend, refill); } if (*pend == rlen) { b[rlen] = '\0'; return refill; } else return 0; } size_t parse_string(struct parse *p, char *b, size_t rlen, enum pstate *pstate, int refill) { char *cp; size_t poff; /* Parse offset in b[]. */ size_t pend; /* Offset of the end of the current word. */ int elem_end; pend = 0; for (;;) { /* Proceed to the next token, skipping whitespace. */ if (refill) { p->line = p->nline; p->col = p->ncol; } if ((poff = pend) == rlen) break; if (isspace((unsigned char)b[pend])) { p->flags |= PFLAG_SPC; increment(p, b, &pend, refill); continue; } /* * The following four cases (ARG, TAG, and starting an * entity or a tag) all parse a word or quoted string. * If that extends beyond the read buffer and the last * read(2) still got data, they all break out of the * token loop to request more data from the read loop. * * Also, three of them detect self-closing tags, those * ending with "/>", setting the flag elem_end and * calling xml_elem_end() at the very end, after * handling the attribute value, attribute name, or * tag name, respectively. */ /* Parse an attribute value. */ if (*pstate >= PARSE_ARG) { if (*pstate == PARSE_ARG && (b[pend] == '\'' || b[pend] == '"')) { *pstate = b[pend] == '"' ? PARSE_DQ : PARSE_SQ; increment(p, b, &pend, refill); continue; } if (advance(p, b, rlen, &pend, *pstate == PARSE_DQ ? "\"" : *pstate == PARSE_SQ ? "'" : " >", refill)) break; *pstate = PARSE_TAG; elem_end = 0; if (b[pend] == '>') { *pstate = PARSE_ELEM; if (pend > 0 && b[pend - 1] == '/') { b[pend - 1] = '\0'; elem_end = 1; } if (p->flags & PFLAG_EEND) elem_end = 1; } b[pend] = '\0'; if (pend < rlen) increment(p, b, &pend, refill); xml_attrval(p, b + poff); if (elem_end) xml_elem_end(p, NULL); /* Look for an attribute name. */ } else if (*pstate == PARSE_TAG) { switch (p->ncur) { case NODE_DOCTYPE: if (b[pend] == '[') { *pstate = PARSE_ELEM; increment(p, b, &pend, refill); continue; } /* FALLTHROUGH */ case NODE_ENTITY: if (b[pend] == '"' || b[pend] == '\'') { *pstate = PARSE_ARG; continue; } break; default: break; } if (advance(p, b, rlen, &pend, " =>", refill)) break; elem_end = 0; switch (b[pend]) { case '>': *pstate = PARSE_ELEM; if (pend > 0 && b[pend - 1] == '/') { b[pend - 1] = '\0'; elem_end = 1; } if (p->flags & PFLAG_EEND) elem_end = 1; break; case '=': *pstate = PARSE_ARG; break; default: break; } b[pend] = '\0'; if (pend < rlen) increment(p, b, &pend, refill); xml_attrkey(p, b + poff); if (elem_end) xml_elem_end(p, NULL); /* Begin an opening or closing tag. */ } else if (b[poff] == '<') { if (advance(p, b, rlen, &pend, " >", refill)) break; if (pend > poff + 3 && strncmp(b + poff, ""); if (cp == NULL) { if (refill) break; cp = b + rlen; } else cp += 3; while (b + pend < cp) increment(p, b, &pend, refill); continue; } elem_end = 0; if (b[pend] != '>') *pstate = PARSE_TAG; else if (pend > 0 && b[pend - 1] == '/') { b[pend - 1] = '\0'; elem_end = 1; } b[pend] = '\0'; if (pend < rlen) increment(p, b, &pend, refill); if (b[++poff] == '/') { elem_end = 1; poff++; } else { xml_elem_start(p, b + poff); if (*pstate == PARSE_ELEM && p->flags & PFLAG_EEND) elem_end = 1; } if (elem_end) xml_elem_end(p, b + poff); /* Close a doctype. */ } else if (p->ncur == NODE_DOCTYPE && b[poff] == ']') { *pstate = PARSE_TAG; increment(p, b, &pend, refill); /* Process an entity. */ } else if (b[poff] == '&') { if (advance(p, b, rlen, &pend, ";", refill)) break; b[pend] = '\0'; if (pend < rlen) increment(p, b, &pend, refill); xml_entity(p, b + poff + 1); /* Process text up to the next tag, entity, or EOL. */ } else { advance(p, b, rlen, &pend, "<&", refill); xml_char(p, b + poff, pend - poff); } } return poff; } struct ptree * parse_file(struct parse *p, int fd, const char *fname) { char b[4096]; ssize_t rsz; /* Return value from read(2). */ size_t rlen; /* Number of bytes in b[]. */ size_t poff; /* Parse offset in b[]. */ enum pstate pstate; p->fname = fname; p->nline = 1; p->ncol = 1; pstate = PARSE_ELEM; rlen = 0; /* * Read loop. * * If the previous token was incomplete and asked for more * input, we have to enter the read loop once more even on EOF. * Once rsz is 0, incomplete tokens will no longer ask * for more input but instead use whatever there is, * and then exit the read loop. * The minus one on the size limit for read(2) is needed * such that advance() can set b[rlen] to NUL when needed. */ while ((rsz = read(fd, b + rlen, sizeof(b) - rlen - 1)) >= 0 && (rlen += rsz) > 0) { poff = parse_string(p, b, rlen, &pstate, rsz > 0); /* Buffer exhausted; shift left and re-fill. */ assert(poff > 0); rlen -= poff; memmove(b, b + poff, rlen); } if (rsz < 0) { perror(fname); p->tree->flags |= TREE_FAIL; } pnode_closetext(p); if ((p->tree->flags & TREE_CLOSED) == 0) warn_msg(p, "document not closed"); pnode_unlink(p->doctype); return p->tree; }