docbook2mdoc/parse.c - annotate

Return to parse.c CVS log
Up to [cvsweb.bsd.lv] / docbook2mdoc
Annotation of docbook2mdoc/parse.c, Revision 1.14

1.14    ! schwarze    1: /* $Id: parse.c,v 1.13 2019/04/03 17:53:02 schwarze Exp $ */
1.1       schwarze    2: /*
                      3:  * Copyright (c) 2014 Kristaps Dzonsons <kristaps@bsd.lv>
                      4:  * Copyright (c) 2019 Ingo Schwarze <schwarze@openbsd.org>
                      5:  *
                      6:  * Permission to use, copy, modify, and distribute this software for any
                      7:  * purpose with or without fee is hereby granted, provided that the above
                      8:  * copyright notice and this permission notice appear in all copies.
                      9:  *
                     10:  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
                     11:  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
                     12:  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
                     13:  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
                     14:  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
                     15:  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
                     16:  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
                     17:  */
                     18: #include <assert.h>
                     19: #include <ctype.h>
1.6       schwarze   20: #include <stdarg.h>
1.1       schwarze   21: #include <stdio.h>
1.5       schwarze   22: #include <stdlib.h>
1.1       schwarze   23: #include <string.h>
                     24: #include <unistd.h>
                     25:
                     26: #include "node.h"
                     27: #include "parse.h"
                     28:
                     29: /*
                     30:  * The implementation of the DocBook parser.
                     31:  */
                     32:
/*
 * States of the low-level parser.
 * NOTE(review): the code switching between these states is not part
 * of the visible section; judging by the names alone, they presumably
 * stand for element content, a tag name, an attribute name, and a
 * single- or double-quoted attribute value - confirm in the parser loop.
 */
enum	pstate {
	PARSE_ELEM = 0,
	PARSE_TAG,
	PARSE_ARG,
	PARSE_SQ,
	PARSE_DQ
};
        !            40:
1.1       schwarze   41: /*
                     42:  * Global parse state.
                     43:  * Keep this as simple and small as possible.
                     44:  */
                     45: struct parse {
                     46:        const char      *fname;  /* Name of the input file. */
                     47:        struct ptree    *tree;   /* Complete parse result. */
                     48:        struct pnode    *cur;    /* Current node in the tree. */
1.5       schwarze   49:        enum nodeid      ncur;   /* Type of the current node. */
                     50:        int              line;   /* Line number in the input file. */
                     51:        int              col;    /* Column number in the input file. */
                     52:        int              nline;  /* Line number of next token. */
                     53:        int              ncol;   /* Column number of next token. */
1.4       schwarze   54:        int              del;    /* Levels of nested nodes being deleted. */
1.5       schwarze   55:        int              attr;   /* The most recent attribute is valid. */
1.1       schwarze   56:        int              warn;
                     57: };
                     58:
                     59: struct element {
                     60:        const char      *name;   /* DocBook element name. */
                     61:        enum nodeid      node;   /* Node type to generate. */
                     62: };
                     63:
                     64: static const struct element elements[] = {
1.3       schwarze   65:        { "acronym",            NODE_IGNORE },
1.1       schwarze   66:        { "affiliation",        NODE_AFFILIATION },
1.4       schwarze   67:        { "anchor",             NODE_DELETE },
1.1       schwarze   68:        { "application",        NODE_APPLICATION },
                     69:        { "arg",                NODE_ARG },
                     70:        { "author",             NODE_AUTHOR },
                     71:        { "authorgroup",        NODE_AUTHORGROUP },
                     72:        { "blockquote",         NODE_BLOCKQUOTE },
                     73:        { "book",               NODE_BOOK },
                     74:        { "bookinfo",           NODE_BOOKINFO },
                     75:        { "caution",            NODE_CAUTION },
                     76:        { "chapter",            NODE_SECTION },
                     77:        { "citerefentry",       NODE_CITEREFENTRY },
                     78:        { "citetitle",          NODE_CITETITLE },
                     79:        { "cmdsynopsis",        NODE_CMDSYNOPSIS },
1.13      schwarze   80:        { "code",               NODE_LITERAL },
1.1       schwarze   81:        { "colspec",            NODE_COLSPEC },
                     82:        { "command",            NODE_COMMAND },
                     83:        { "constant",           NODE_CONSTANT },
1.7       schwarze   84:        { "contrib",            NODE_CONTRIB },
1.1       schwarze   85:        { "copyright",          NODE_COPYRIGHT },
                     86:        { "date",               NODE_DATE },
                     87:        { "editor",             NODE_EDITOR },
                     88:        { "email",              NODE_EMAIL },
                     89:        { "emphasis",           NODE_EMPHASIS },
                     90:        { "entry",              NODE_ENTRY },
                     91:        { "envar",              NODE_ENVAR },
1.13      schwarze   92:        { "errorname",          NODE_ERRORNAME },
1.1       schwarze   93:        { "fieldsynopsis",      NODE_FIELDSYNOPSIS },
                     94:        { "filename",           NODE_FILENAME },
1.7       schwarze   95:        { "firstname",          NODE_PERSONNAME },
1.1       schwarze   96:        { "firstterm",          NODE_FIRSTTERM },
                     97:        { "footnote",           NODE_FOOTNOTE },
                     98:        { "funcdef",            NODE_FUNCDEF },
                     99:        { "funcprototype",      NODE_FUNCPROTOTYPE },
                    100:        { "funcsynopsis",       NODE_FUNCSYNOPSIS },
                    101:        { "funcsynopsisinfo",   NODE_FUNCSYNOPSISINFO },
                    102:        { "function",           NODE_FUNCTION },
                    103:        { "glossterm",          NODE_GLOSSTERM },
                    104:        { "group",              NODE_GROUP },
                    105:        { "holder",             NODE_HOLDER },
                    106:        { "index",              NODE_INDEX },
1.4       schwarze  107:        { "indexterm",          NODE_DELETE },
1.1       schwarze  108:        { "info",               NODE_INFO },
                    109:        { "informalequation",   NODE_INFORMALEQUATION },
1.11      schwarze  110:        { "informaltable",      NODE_TABLE },
1.1       schwarze  111:        { "inlineequation",     NODE_INLINEEQUATION },
                    112:        { "itemizedlist",       NODE_ITEMIZEDLIST },
                    113:        { "keysym",             NODE_KEYSYM },
                    114:        { "legalnotice",        NODE_LEGALNOTICE },
                    115:        { "link",               NODE_LINK },
                    116:        { "listitem",           NODE_LISTITEM },
                    117:        { "literal",            NODE_LITERAL },
                    118:        { "literallayout",      NODE_LITERALLAYOUT },
                    119:        { "manvolnum",          NODE_MANVOLNUM },
                    120:        { "member",             NODE_MEMBER },
                    121:        { "mml:math",           NODE_MML_MATH },
                    122:        { "mml:mfenced",        NODE_MML_MFENCED },
                    123:        { "mml:mfrac",          NODE_MML_MFRAC },
                    124:        { "mml:mi",             NODE_MML_MI },
                    125:        { "mml:mn",             NODE_MML_MN },
                    126:        { "mml:mo",             NODE_MML_MO },
                    127:        { "mml:mrow",           NODE_MML_MROW },
                    128:        { "mml:msub",           NODE_MML_MSUB },
                    129:        { "mml:msup",           NODE_MML_MSUP },
                    130:        { "modifier",           NODE_MODIFIER },
                    131:        { "note",               NODE_NOTE },
                    132:        { "option",             NODE_OPTION },
                    133:        { "orderedlist",        NODE_ORDEREDLIST },
                    134:        { "orgname",            NODE_ORGNAME },
1.7       schwarze  135:        { "othername",          NODE_PERSONNAME },
1.1       schwarze  136:        { "para",               NODE_PARA },
                    137:        { "paramdef",           NODE_PARAMDEF },
                    138:        { "parameter",          NODE_PARAMETER },
                    139:        { "part",               NODE_SECTION },
                    140:        { "personname",         NODE_PERSONNAME },
1.3       schwarze  141:        { "phrase",             NODE_IGNORE },
1.1       schwarze  142:        { "preface",            NODE_PREFACE },
1.4       schwarze  143:        { "primary",            NODE_DELETE },
1.1       schwarze  144:        { "programlisting",     NODE_PROGRAMLISTING },
                    145:        { "prompt",             NODE_PROMPT },
                    146:        { "quote",              NODE_QUOTE },
                    147:        { "refclass",           NODE_REFCLASS },
                    148:        { "refdescriptor",      NODE_REFDESCRIPTOR },
                    149:        { "refentry",           NODE_REFENTRY },
                    150:        { "refentryinfo",       NODE_REFENTRYINFO },
                    151:        { "refentrytitle",      NODE_REFENTRYTITLE },
                    152:        { "refmeta",            NODE_REFMETA },
                    153:        { "refmetainfo",        NODE_REFMETAINFO },
                    154:        { "refmiscinfo",        NODE_REFMISCINFO },
                    155:        { "refname",            NODE_REFNAME },
                    156:        { "refnamediv",         NODE_REFNAMEDIV },
                    157:        { "refpurpose",         NODE_REFPURPOSE },
                    158:        { "refsect1",           NODE_SECTION },
                    159:        { "refsect2",           NODE_SECTION },
                    160:        { "refsect3",           NODE_SECTION },
                    161:        { "refsection",         NODE_SECTION },
                    162:        { "refsynopsisdiv",     NODE_REFSYNOPSISDIV },
                    163:        { "releaseinfo",        NODE_RELEASEINFO },
                    164:        { "replaceable",        NODE_REPLACEABLE },
                    165:        { "row",                NODE_ROW },
                    166:        { "sbr",                NODE_SBR },
                    167:        { "screen",             NODE_SCREEN },
1.4       schwarze  168:        { "secondary",          NODE_DELETE },
1.1       schwarze  169:        { "sect1",              NODE_SECTION },
                    170:        { "sect2",              NODE_SECTION },
                    171:        { "section",            NODE_SECTION },
                    172:        { "sgmltag",            NODE_SGMLTAG },
                    173:        { "simplelist",         NODE_SIMPLELIST },
                    174:        { "spanspec",           NODE_SPANSPEC },
1.13      schwarze  175:        { "structfield",        NODE_PARAMETER },
                    176:        { "structname",         NODE_TYPE },
1.1       schwarze  177:        { "subtitle",           NODE_SUBTITLE },
1.7       schwarze  178:        { "surname",            NODE_PERSONNAME },
1.12      schwarze  179:        { "symbol",             NODE_CONSTANT },
1.1       schwarze  180:        { "synopsis",           NODE_SYNOPSIS },
                    181:        { "table",              NODE_TABLE },
                    182:        { "tbody",              NODE_TBODY },
                    183:        { "term",               NODE_TERM },
                    184:        { "tfoot",              NODE_TFOOT },
                    185:        { "tgroup",             NODE_TGROUP },
                    186:        { "thead",              NODE_THEAD },
                    187:        { "tip",                NODE_TIP },
                    188:        { "title",              NODE_TITLE },
1.3       schwarze  189:        { "trademark",          NODE_IGNORE },
1.1       schwarze  190:        { "type",               NODE_TYPE },
                    191:        { "ulink",              NODE_ULINK },
1.13      schwarze  192:        { "userinput",          NODE_LITERAL },
1.1       schwarze  193:        { "variablelist",       NODE_VARIABLELIST },
                    194:        { "varlistentry",       NODE_VARLISTENTRY },
                    195:        { "varname",            NODE_VARNAME },
                    196:        { "warning",            NODE_WARNING },
                    197:        { "wordasword",         NODE_WORDASWORD },
1.4       schwarze  198:        { "xi:include",         NODE_DELETE_WARN },
1.1       schwarze  199:        { "year",               NODE_YEAR },
1.5       schwarze  200:        { NULL,                 NODE_IGNORE }
1.1       schwarze  201: };
                    202:
/*
 * One entry of the table translating XML character entities.
 */
struct entity {
	/* Entity name without the surrounding '&' and ';'. */
	const char	*name;
	/* Replacement text copied into the escape node. */
	const char	*roff;
};
                    207:
                    208: /*
                    209:  * XML character entity references found in the wild.
                    210:  * Those that don't have an exact mandoc_char(7) representation
                    211:  * are approximated, and the desired codepoint is given as a comment.
                    212:  * Encoding them as \\[u...] would leave -Tascii out in the cold.
                    213:  */
                    214: static const struct entity entities[] = {
                    215:        { "alpha",      "\\(*a" },
                    216:        { "amp",        "&" },
                    217:        { "apos",       "'" },
                    218:        { "auml",       "\\(:a" },
                    219:        { "beta",       "\\(*b" },
                    220:        { "circ",       "^" },      /* U+02C6 */
                    221:        { "copy",       "\\(co" },
                    222:        { "dagger",     "\\(dg" },
                    223:        { "Delta",      "\\(*D" },
                    224:        { "eacute",     "\\('e" },
                    225:        { "emsp",       "\\ " },    /* U+2003 */
                    226:        { "gt",         ">" },
                    227:        { "hairsp",     "\\^" },
                    228:        { "kappa",      "\\(*k" },
                    229:        { "larr",       "\\(<-" },
                    230:        { "ldquo",      "\\(lq" },
                    231:        { "le",         "\\(<=" },
                    232:        { "lowbar",     "_" },
                    233:        { "lsqb",       "[" },
                    234:        { "lt",         "<" },
                    235:        { "mdash",      "\\(em" },
                    236:        { "minus",      "\\-" },
                    237:        { "ndash",      "\\(en" },
                    238:        { "nbsp",       "\\ " },
                    239:        { "num",        "#" },
                    240:        { "oslash",     "\\(/o" },
                    241:        { "ouml",       "\\(:o" },
                    242:        { "percnt",     "%" },
                    243:        { "quot",       "\\(dq" },
                    244:        { "rarr",       "\\(->" },
                    245:        { "rArr",       "\\(rA" },
                    246:        { "rdquo",      "\\(rq" },
                    247:        { "reg",        "\\(rg" },
                    248:        { "rho",        "\\(*r" },
                    249:        { "rsqb",       "]" },
                    250:        { "sigma",      "\\(*s" },
                    251:        { "shy",        "\\&" },     /* U+00AD */
                    252:        { "tau",        "\\(*t" },
                    253:        { "tilde",      "\\[u02DC]" },
                    254:        { "times",      "\\[tmu]" },
                    255:        { "uuml",       "\\(:u" },
                    256:        { NULL,         NULL }
                    257: };
                    258:
1.6       schwarze  259: static void
                    260: error_msg(struct parse *p, const char *fmt, ...)
                    261: {
                    262:        va_list          ap;
                    263:
                    264:        fprintf(stderr, "%s:%d:%d: ", p->fname, p->line, p->col);
                    265:        va_start(ap, fmt);
                    266:        vfprintf(stderr, fmt, ap);
                    267:        va_end(ap);
                    268:        fputc('\n', stderr);
                    269:        p->tree->flags |= TREE_FAIL;
                    270: }
                    271:
                    272: static void
                    273: warn_msg(struct parse *p, const char *fmt, ...)
                    274: {
                    275:        va_list          ap;
                    276:
                    277:        if (p->warn == 0)
                    278:                return;
                    279:
                    280:        fprintf(stderr, "%s:%d:%d: warning: ", p->fname, p->line, p->col);
                    281:        va_start(ap, fmt);
                    282:        vfprintf(stderr, fmt, ap);
                    283:        va_end(ap);
                    284:        fputc('\n', stderr);
                    285: }
                    286:
1.1       schwarze  287: /*
                    288:  * Process a string of characters.
                    289:  * If a text node is already open, append to it.
                    290:  * Otherwise, create a new one as a child of the current node.
                    291:  */
                    292: static void
1.5       schwarze  293: xml_char(struct parse *ps, const char *p, int sz)
1.1       schwarze  294: {
                    295:        struct pnode    *dat;
                    296:
1.5       schwarze  297:        if (ps->del > 0)
1.1       schwarze  298:                return;
                    299:
1.5       schwarze  300:        if (ps->cur == NULL) {
1.6       schwarze  301:                error_msg(ps, "discarding text before document: %.*s", sz, p);
1.5       schwarze  302:                return;
                    303:        }
                    304:
1.1       schwarze  305:        if (ps->cur->node != NODE_TEXT) {
                    306:                if ((dat = calloc(1, sizeof(*dat))) == NULL) {
                    307:                        perror(NULL);
                    308:                        exit(1);
                    309:                }
                    310:                dat->node = NODE_TEXT;
                    311:                dat->parent = ps->cur;
                    312:                TAILQ_INIT(&dat->childq);
                    313:                TAILQ_INIT(&dat->attrq);
                    314:                TAILQ_INSERT_TAIL(&ps->cur->childq, dat, child);
                    315:                ps->cur = dat;
                    316:        }
                    317:
1.5       schwarze  318:        if (ps->tree->flags & TREE_CLOSED &&
1.6       schwarze  319:            ps->cur->parent == ps->tree->root)
                    320:                warn_msg(ps, "text after end of document: %.*s", sz, p);
1.5       schwarze  321:
1.1       schwarze  322:        /* Append to the current text node. */
                    323:
                    324:        assert(sz >= 0);
                    325:        ps->cur->b = realloc(ps->cur->b, ps->cur->bsz + sz + 1);
                    326:        if (ps->cur->b == NULL) {
                    327:                perror(NULL);
                    328:                exit(1);
                    329:        }
                    330:        memcpy(ps->cur->b + ps->cur->bsz, p, sz);
                    331:        ps->cur->bsz += sz;
                    332:        ps->cur->b[ps->cur->bsz] = '\0';
                    333:        ps->cur->real = ps->cur->b;
                    334: }
                    335:
                    336: static void
                    337: pnode_trim(struct pnode *pn)
                    338: {
                    339:        assert(pn->node == NODE_TEXT);
                    340:        for (; pn->bsz > 0; pn->b[--pn->bsz] = '\0')
                    341:                if (isspace((unsigned char)pn->b[pn->bsz - 1]) == 0)
                    342:                        break;
                    343: }
                    344:
1.9       schwarze  345: static void
                    346: xml_entity(struct parse *p, const char *name)
                    347: {
                    348:        const struct entity     *entity;
                    349:        struct pnode            *dat;
                    350:
                    351:        if (p->del > 0)
                    352:                return;
                    353:
                    354:        if (p->cur == NULL) {
                    355:                error_msg(p, "discarding entity before document: &%s;", name);
                    356:                return;
                    357:        }
                    358:
                    359:        /* Close out the text node, if there is one. */
                    360:        if (p->cur->node == NODE_TEXT) {
                    361:                pnode_trim(p->cur);
                    362:                p->cur = p->cur->parent;
                    363:        }
                    364:
                    365:        if (p->tree->flags & TREE_CLOSED && p->cur == p->tree->root)
                    366:                warn_msg(p, "entity after end of document: &%s;", name);
                    367:
                    368:        for (entity = entities; entity->name != NULL; entity++)
                    369:                if (strcmp(name, entity->name) == 0)
                    370:                        break;
                    371:
                    372:        if (entity->roff == NULL) {
                    373:                error_msg(p, "unknown entity &%s;", name);
                    374:                return;
                    375:        }
                    376:
                    377:        /* Create, append, and close out an entity node. */
                    378:        if ((dat = calloc(1, sizeof(*dat))) == NULL ||
                    379:            (dat->b = dat->real = strdup(entity->roff)) == NULL) {
                    380:                perror(NULL);
                    381:                exit(1);
                    382:        }
                    383:        dat->node = NODE_ESCAPE;
                    384:        dat->bsz = strlen(dat->b);
                    385:        dat->parent = p->cur;
                    386:        TAILQ_INIT(&dat->childq);
                    387:        TAILQ_INIT(&dat->attrq);
                    388:        TAILQ_INSERT_TAIL(&p->cur->childq, dat, child);
                    389: }
                    390:
1.1       schwarze  391: /*
                    392:  * Begin an element.
                    393:  */
                    394: static void
1.5       schwarze  395: xml_elem_start(struct parse *ps, const char *name)
1.1       schwarze  396: {
1.5       schwarze  397:        const struct element    *elem;
                    398:        struct pnode            *dat;
1.1       schwarze  399:
1.5       schwarze  400:        if (*name == '!' || *name == '?')
1.1       schwarze  401:                return;
                    402:
1.4       schwarze  403:        /*
                    404:         * An ancestor is excluded from the tree;
                    405:         * keep track of the number of levels excluded.
                    406:         */
                    407:        if (ps->del > 0) {
                    408:                ps->del++;
                    409:                return;
                    410:        }
                    411:
1.1       schwarze  412:        /* Close out the text node, if there is one. */
                    413:        if (ps->cur != NULL && ps->cur->node == NODE_TEXT) {
                    414:                pnode_trim(ps->cur);
                    415:                ps->cur = ps->cur->parent;
                    416:        }
                    417:
                    418:        for (elem = elements; elem->name != NULL; elem++)
                    419:                if (strcmp(elem->name, name) == 0)
                    420:                        break;
                    421:
1.6       schwarze  422:        if (elem->name == NULL)
                    423:                error_msg(ps, "unknown element <%s>", name);
                    424:
1.5       schwarze  425:        ps->ncur = elem->node;
1.1       schwarze  426:
1.5       schwarze  427:        switch (ps->ncur) {
1.4       schwarze  428:        case NODE_DELETE_WARN:
1.6       schwarze  429:                warn_msg(ps, "skipping element <%s>", name);
1.2       schwarze  430:                /* FALLTHROUGH */
1.4       schwarze  431:        case NODE_DELETE:
                    432:                ps->del = 1;
                    433:                /* FALLTHROUGH */
1.2       schwarze  434:        case NODE_IGNORE:
                    435:                return;
                    436:        case NODE_INLINEEQUATION:
1.1       schwarze  437:                ps->tree->flags |= TREE_EQN;
1.2       schwarze  438:                break;
                    439:        default:
                    440:                break;
                    441:        }
1.1       schwarze  442:
1.6       schwarze  443:        if (ps->tree->flags & TREE_CLOSED && ps->cur->parent == NULL)
                    444:                warn_msg(ps, "element after end of document: <%s>", name);
1.5       schwarze  445:
1.1       schwarze  446:        if ((dat = calloc(1, sizeof(*dat))) == NULL) {
                    447:                perror(NULL);
                    448:                exit(1);
                    449:        }
                    450:        dat->node = elem->node;
                    451:        dat->parent = ps->cur;
                    452:        TAILQ_INIT(&dat->childq);
                    453:        TAILQ_INIT(&dat->attrq);
                    454:
                    455:        if (ps->cur != NULL)
                    456:                TAILQ_INSERT_TAIL(&ps->cur->childq, dat, child);
                    457:
                    458:        ps->cur = dat;
                    459:        if (ps->tree->root == NULL)
                    460:                ps->tree->root = dat;
1.5       schwarze  461: }
                    462:
                    463: static void
                    464: xml_attrkey(struct parse *ps, const char *name)
                    465: {
                    466:        struct pattr    *attr;
                    467:        enum attrkey     key;
1.1       schwarze  468:
1.5       schwarze  469:        if (ps->del > 0 || *name == '\0')
                    470:                return;
                    471:        if ((key = attrkey_parse(name)) == ATTRKEY__MAX) {
                    472:                ps->attr = 0;
                    473:                return;
                    474:        }
                    475:        if ((attr = calloc(1, sizeof(*attr))) == NULL) {
                    476:                perror(NULL);
                    477:                exit(1);
                    478:        }
                    479:        attr->key = key;
                    480:        attr->val = ATTRVAL__MAX;
                    481:        attr->rawval = NULL;
                    482:        TAILQ_INSERT_TAIL(&ps->cur->attrq, attr, child);
                    483:        ps->attr = 1;
                    484: }
                    485:
                    486: static void
                    487: xml_attrval(struct parse *ps, const char *name)
                    488: {
                    489:        struct pattr    *attr;
                    490:
                    491:        if (ps->del > 0 || ps->attr == 0)
                    492:                return;
                    493:        if ((attr = TAILQ_LAST(&ps->cur->attrq, pattrq)) == NULL)
                    494:                return;
                    495:        if ((attr->val = attrval_parse(name)) == ATTRVAL__MAX &&
                    496:            (attr->rawval = strdup(name)) == NULL) {
                    497:                perror(NULL);
                    498:                exit(1);
1.1       schwarze  499:        }
                    500: }
                    501:
                    502: /*
                    503:  * Roll up the parse tree.
                    504:  * If we're at a text node, roll that one up first.
                    505:  */
                    506: static void
1.5       schwarze  507: xml_elem_end(struct parse *ps, const char *name)
1.1       schwarze  508: {
1.5       schwarze  509:        const struct element    *elem;
                    510:        enum nodeid              node;
1.1       schwarze  511:
1.4       schwarze  512:        /*
                    513:         * An ancestor is excluded from the tree;
                    514:         * keep track of the number of levels excluded.
                    515:         */
                    516:        if (ps->del > 1) {
                    517:                ps->del--;
                    518:                return;
                    519:        }
                    520:
1.1       schwarze  521:        /* Close out the text node, if there is one. */
1.5       schwarze  522:        if (ps->del == 0 && ps->cur != NULL && ps->cur->node == NODE_TEXT) {
1.1       schwarze  523:                pnode_trim(ps->cur);
                    524:                ps->cur = ps->cur->parent;
                    525:        }
1.2       schwarze  526:
1.5       schwarze  527:        if (name != NULL) {
                    528:                for (elem = elements; elem->name != NULL; elem++)
                    529:                        if (strcmp(elem->name, name) == 0)
                    530:                                break;
                    531:                node = elem->node;
                    532:        } else
                    533:                node = ps->ncur;
1.2       schwarze  534:
1.5       schwarze  535:        switch (node) {
1.4       schwarze  536:        case NODE_DELETE_WARN:
                    537:        case NODE_DELETE:
1.5       schwarze  538:                if (ps->del > 0)
                    539:                        ps->del--;
1.4       schwarze  540:                break;
1.2       schwarze  541:        case NODE_IGNORE:
                    542:                break;
                    543:        default:
1.5       schwarze  544:                if (ps->cur == NULL || node != ps->cur->node) {
1.6       schwarze  545:                        warn_msg(ps, "element not open: </%s>", name);
1.5       schwarze  546:                        break;
                    547:                }
                    548:
                    549:                /*
                    550:                 * Refrain from actually closing the document element.
                    551:                 * If no more content follows, no harm is done, but if
                    552:                 * some content still follows, simply processing it is
                    553:                 * obviously better than discarding it or crashing.
                    554:                 */
                    555:
                    556:                if (ps->cur->parent == NULL)
                    557:                        ps->tree->flags |= TREE_CLOSED;
                    558:                else
                    559:                        ps->cur = ps->cur->parent;
1.4       schwarze  560:                break;
1.2       schwarze  561:        }
1.4       schwarze  562:        assert(ps->del == 0);
1.1       schwarze  563: }
                    564:
                    565: struct parse *
                    566: parse_alloc(int warn)
                    567: {
                    568:        struct parse    *p;
                    569:
                    570:        if ((p = calloc(1, sizeof(*p))) == NULL)
                    571:                return NULL;
                    572:
                    573:        if ((p->tree = calloc(1, sizeof(*p->tree))) == NULL) {
                    574:                free(p);
                    575:                return NULL;
                    576:        }
                    577:        p->warn = warn;
                    578:        return p;
                    579: }
                    580:
                    581: void
                    582: parse_free(struct parse *p)
                    583: {
                    584:        if (p == NULL)
                    585:                return;
                    586:        if (p->tree != NULL) {
                    587:                pnode_unlink(p->tree->root);
                    588:                free(p->tree);
                    589:        }
                    590:        free(p);
                    591: }
                    592:
1.14    ! schwarze  593: static void
        !           594: increment(struct parse *p, char *b, size_t *pend, int refill)
        !           595: {
        !           596:        if (refill) {
        !           597:                if (b[*pend] == '\n') {
        !           598:                        p->nline++;
        !           599:                        p->ncol = 1;
        !           600:                } else
        !           601:                        p->ncol++;
        !           602:        }
        !           603:        ++*pend;
        !           604: }
        !           605:
1.5       schwarze  606: /*
                    607:  * Advance the pend pointer to the next character in the charset.
                    608:  * If the charset starts with a space, it stands for any whitespace.
                    609:  * Update the new input file position, used for messages.
                    610:  * Do not overrun the buffer b of length rlen.
                    611:  * When reaching the end, NUL-terminate the buffer and return 1;
                    612:  * otherwise, return 0.
                    613:  */
                    614: static int
                    615: advance(struct parse *p, char *b, size_t rlen, size_t *pend,
1.14    ! schwarze  616:     const char *charset, int refill)
1.5       schwarze  617: {
                    618:        int              space;
                    619:
                    620:        if (*charset == ' ') {
                    621:                space = 1;
                    622:                charset++;
                    623:        } else
                    624:                space = 0;
                    625:
1.14    ! schwarze  626:        if (refill) {
        !           627:                p->nline = p->line;
        !           628:                p->ncol = p->col;
        !           629:        }
1.5       schwarze  630:        while (*pend < rlen) {
                    631:                if (space && isspace((unsigned char)b[*pend]))
                    632:                        break;
                    633:                if (strchr(charset, b[*pend]) != NULL)
                    634:                        break;
1.14    ! schwarze  635:                increment(p, b, pend, refill);
1.5       schwarze  636:        }
                    637:        if (*pend == rlen) {
                    638:                b[rlen] = '\0';
1.14    ! schwarze  639:                return refill;
1.5       schwarze  640:        } else
                    641:                return 0;
                    642: }
                    643:
1.14    ! schwarze  644: size_t
        !           645: parse_string(struct parse *p, char *b, size_t rlen,
        !           646:     enum pstate *pstate, int refill)
        !           647: {
        !           648:        char            *cp;
        !           649:        size_t           poff;  /* Parse offset in b[]. */
        !           650:        size_t           pend;  /* Offset of the end of the current word. */
        !           651:        int              elem_end;
        !           652:
        !           653:        pend = 0;
        !           654:        for (;;) {
        !           655:
        !           656:                /* Proceed to the next token, skipping whitespace. */
        !           657:
        !           658:                if (refill) {
        !           659:                        p->line = p->nline;
        !           660:                        p->col = p->ncol;
        !           661:                }
        !           662:                if ((poff = pend) == rlen)
        !           663:                        break;
        !           664:                if (isspace((unsigned char)b[pend])) {
        !           665:                        increment(p, b, &pend, refill);
        !           666:                        continue;
        !           667:                }
        !           668:
        !           669:                /*
        !           670:                 * The following four cases (ARG, TAG, and starting an
        !           671:                 * entity or a tag) all parse a word or quoted string.
        !           672:                 * If that extends beyond the read buffer and the last
        !           673:                 * read(2) still got data, they all break out of the
        !           674:                 * token loop to request more data from the read loop.
        !           675:                 *
        !           676:                 * Also, three of them detect self-closing tags, those
        !           677:                 * ending with "/>", setting the flag elem_end and
        !           678:                 * calling xml_elem_end() at the very end, after
        !           679:                 * handling the attribute value, attribute name, or
        !           680:                 * tag name, respectively.
        !           681:                 */
        !           682:
        !           683:                /* Parse an attribute value. */
        !           684:
        !           685:                if (*pstate >= PARSE_ARG) {
        !           686:                        if (*pstate == PARSE_ARG &&
        !           687:                            (b[pend] == '\'' || b[pend] == '"')) {
        !           688:                                *pstate = b[pend] == '"' ?
        !           689:                                    PARSE_DQ : PARSE_SQ;
        !           690:                                increment(p, b, &pend, refill);
        !           691:                                continue;
        !           692:                        }
        !           693:                        if (advance(p, b, rlen, &pend,
        !           694:                            *pstate == PARSE_DQ ? "\"" :
        !           695:                            *pstate == PARSE_SQ ? "'" : " >", refill))
        !           696:                                break;
        !           697:                        *pstate = PARSE_TAG;
        !           698:                        elem_end = 0;
        !           699:                        if (b[pend] == '>') {
        !           700:                                *pstate = PARSE_ELEM;
        !           701:                                if (pend > 0 && b[pend - 1] == '/') {
        !           702:                                        b[pend - 1] = '\0';
        !           703:                                        elem_end = 1;
        !           704:                                }
        !           705:                        }
        !           706:                        b[pend] = '\0';
        !           707:                        if (pend < rlen)
        !           708:                                increment(p, b, &pend, refill);
        !           709:                        xml_attrval(p, b + poff);
        !           710:                        if (elem_end)
        !           711:                                xml_elem_end(p, NULL);
        !           712:
        !           713:                /* Look for an attribute name. */
        !           714:
        !           715:                } else if (*pstate == PARSE_TAG) {
        !           716:                        if (advance(p, b, rlen, &pend, " =>", refill))
        !           717:                                break;
        !           718:                        elem_end = 0;
        !           719:                        switch (b[pend]) {
        !           720:                        case '>':
        !           721:                                *pstate = PARSE_ELEM;
        !           722:                                if (pend > 0 && b[pend - 1] == '/') {
        !           723:                                        b[pend - 1] = '\0';
        !           724:                                        elem_end = 1;
        !           725:                                }
        !           726:                                break;
        !           727:                        case '=':
        !           728:                                *pstate = PARSE_ARG;
        !           729:                                break;
        !           730:                        default:
        !           731:                                break;
        !           732:                        }
        !           733:                        b[pend] = '\0';
        !           734:                        if (pend < rlen)
        !           735:                                increment(p, b, &pend, refill);
        !           736:                        xml_attrkey(p, b + poff);
        !           737:                        if (elem_end)
        !           738:                                xml_elem_end(p, NULL);
        !           739:
        !           740:                /* Begin an opening or closing tag. */
        !           741:
        !           742:                } else if (b[poff] == '<') {
        !           743:                        if (advance(p, b, rlen, &pend, " >", refill))
        !           744:                                break;
        !           745:                        if (pend > poff + 3 &&
        !           746:                            strncmp(b + poff, "<!--", 4) == 0) {
        !           747:
        !           748:                                /* Skip a comment. */
        !           749:
        !           750:                                cp = strstr(b + pend - 2, "-->");
        !           751:                                if (cp == NULL) {
        !           752:                                        if (refill)
        !           753:                                                break;
        !           754:                                        cp = b + rlen;
        !           755:                                } else
        !           756:                                        cp += 3;
        !           757:                                while (b + pend < cp)
        !           758:                                        increment(p, b, &pend, refill);
        !           759:                                continue;
        !           760:                        }
        !           761:                        elem_end = 0;
        !           762:                        if (b[pend] != '>')
        !           763:                                *pstate = PARSE_TAG;
        !           764:                        else if (pend > 0 && b[pend - 1] == '/') {
        !           765:                                b[pend - 1] = '\0';
        !           766:                                elem_end = 1;
        !           767:                        }
        !           768:                        b[pend] = '\0';
        !           769:                        if (pend < rlen)
        !           770:                                increment(p, b, &pend, refill);
        !           771:                        if (b[++poff] == '/') {
        !           772:                                elem_end = 1;
        !           773:                                poff++;
        !           774:                        } else
        !           775:                                xml_elem_start(p, b + poff);
        !           776:                        if (elem_end)
        !           777:                                xml_elem_end(p, b + poff);
        !           778:
        !           779:                /* Process an entity. */
        !           780:
        !           781:                } else if (b[poff] == '&') {
        !           782:                        if (advance(p, b, rlen, &pend, ";", refill))
        !           783:                                break;
        !           784:                        b[pend] = '\0';
        !           785:                        if (pend < rlen)
        !           786:                                increment(p, b, &pend, refill);
        !           787:                        xml_entity(p, b + poff + 1);
        !           788:
        !           789:                /* Process text up to the next tag, entity, or EOL. */
        !           790:
        !           791:                } else {
        !           792:                        advance(p, b, rlen, &pend, "<&", refill);
        !           793:                        xml_char(p, b + poff, pend - poff);
        !           794:                }
        !           795:        }
        !           796:        return poff;
        !           797: }
        !           798:
1.1       schwarze  799: struct ptree *
                    800: parse_file(struct parse *p, int fd, const char *fname)
                    801: {
                    802:        char             b[4096];
1.5       schwarze  803:        ssize_t          rsz;   /* Return value from read(2). */
1.14    ! schwarze  804:        size_t           rlen;  /* Number of bytes in b[]. */
1.5       schwarze  805:        size_t           poff;  /* Parse offset in b[]. */
1.14    ! schwarze  806:        enum pstate      pstate;
1.1       schwarze  807:
                    808:        p->fname = fname;
1.5       schwarze  809:        p->nline = 1;
                    810:        p->ncol = 1;
1.14    ! schwarze  811:        pstate = PARSE_ELEM;
1.5       schwarze  812:        rlen = 0;
                    813:
                    814:        /*
                    815:         * Read loop.
                    816:         *
1.14    ! schwarze  817:          * If the previous token was incomplete and asked for more
        !           818:          * input, we have to enter the read loop once more even on EOF.
1.5       schwarze  819:         * Once rsz is 0, incomplete tokens will no longer ask
                    820:         * for more input but instead use whatever there is,
                    821:         * and then exit the read loop.
                    822:         * The minus one on the size limit for read(2) is needed
                    823:         * such that advance() can set b[rlen] to NUL when needed.
                    824:         */
                    825:
1.14    ! schwarze  826:        while ((rsz = read(fd, b + rlen, sizeof(b) - rlen - 1)) >= 0 &&
        !           827:            (rlen += rsz) > 0) {
        !           828:                poff = parse_string(p, b, rlen, &pstate, rsz > 0);
1.5       schwarze  829:                /* Buffer exhausted; shift left and re-fill. */
                    830:                assert(poff > 0);
                    831:                rlen -= poff;
1.14    ! schwarze  832:                memmove(b, b + poff, rlen);
1.5       schwarze  833:        }
                    834:        if (rsz < 0) {
                    835:                perror(fname);
                    836:                p->tree->flags |= TREE_FAIL;
                    837:        }
                    838:        if (p->cur != NULL && p->cur->node == NODE_TEXT) {
                    839:                pnode_trim(p->cur);
                    840:                p->cur = p->cur->parent;
                    841:        }
1.6       schwarze  842:        if ((p->tree->flags & TREE_CLOSED) == 0)
                    843:                warn_msg(p, "document not closed");
1.1       schwarze  844:        return p->tree;
                    845: }
CVSweb