docbook2mdoc/parse.c - annotate

Return to parse.c CVS log
Up to [cvsweb.bsd.lv] / docbook2mdoc
Annotation of docbook2mdoc/parse.c, Revision 1.5

1.5     ! schwarze    1: /* $Id: parse.c,v 1.4 2019/03/26 22:39:33 schwarze Exp $ */
1.1       schwarze    2: /*
                      3:  * Copyright (c) 2014 Kristaps Dzonsons <kristaps@bsd.lv>
                      4:  * Copyright (c) 2019 Ingo Schwarze <schwarze@openbsd.org>
                      5:  *
                      6:  * Permission to use, copy, modify, and distribute this software for any
                      7:  * purpose with or without fee is hereby granted, provided that the above
                      8:  * copyright notice and this permission notice appear in all copies.
                      9:  *
                     10:  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
                     11:  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
                     12:  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
                     13:  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
                     14:  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
                     15:  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
                     16:  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
                     17:  */
                     18: #include <assert.h>
                     19: #include <ctype.h>
                     20: #include <stdio.h>
1.5     ! schwarze   21: #include <stdlib.h>
1.1       schwarze   22: #include <string.h>
                     23: #include <unistd.h>
                     24:
                     25: #include "node.h"
                     26: #include "parse.h"
                     27:
                     28: /*
                     29:  * The implementation of the DocBook parser.
                     30:  */
                     31:
                     32: /*
                     33:  * Global parse state.
                     34:  * Keep this as simple and small as possible.
                     35:  */
                     36: struct parse {
                     37:        const char      *fname;  /* Name of the input file. */
                     38:        struct ptree    *tree;   /* Complete parse result. */
                     39:        struct pnode    *cur;    /* Current node in the tree. */
1.5     ! schwarze   40:        enum nodeid      ncur;   /* Type of the current node. */
        !            41:        int              line;   /* Line number in the input file. */
        !            42:        int              col;    /* Column number in the input file. */
        !            43:        int              nline;  /* Line number of next token. */
        !            44:        int              ncol;   /* Column number of next token. */
1.4       schwarze   45:        int              del;    /* Levels of nested nodes being deleted. */
1.5     ! schwarze   46:        int              attr;   /* The most recent attribute is valid. */
1.1       schwarze   47:        int              warn;
                     48: };
                     49:
                     50: struct element {
                     51:        const char      *name;   /* DocBook element name. */
                     52:        enum nodeid      node;   /* Node type to generate. */
                     53: };
                     54:
                     55: static const struct element elements[] = {
1.3       schwarze   56:        { "acronym",            NODE_IGNORE },
1.1       schwarze   57:        { "affiliation",        NODE_AFFILIATION },
1.4       schwarze   58:        { "anchor",             NODE_DELETE },
1.1       schwarze   59:        { "application",        NODE_APPLICATION },
                     60:        { "arg",                NODE_ARG },
                     61:        { "author",             NODE_AUTHOR },
                     62:        { "authorgroup",        NODE_AUTHORGROUP },
                     63:        { "blockquote",         NODE_BLOCKQUOTE },
                     64:        { "book",               NODE_BOOK },
                     65:        { "bookinfo",           NODE_BOOKINFO },
                     66:        { "caution",            NODE_CAUTION },
                     67:        { "chapter",            NODE_SECTION },
                     68:        { "citerefentry",       NODE_CITEREFENTRY },
                     69:        { "citetitle",          NODE_CITETITLE },
                     70:        { "cmdsynopsis",        NODE_CMDSYNOPSIS },
                     71:        { "code",               NODE_CODE },
                     72:        { "colspec",            NODE_COLSPEC },
                     73:        { "command",            NODE_COMMAND },
                     74:        { "constant",           NODE_CONSTANT },
                     75:        { "copyright",          NODE_COPYRIGHT },
                     76:        { "date",               NODE_DATE },
                     77:        { "editor",             NODE_EDITOR },
                     78:        { "email",              NODE_EMAIL },
                     79:        { "emphasis",           NODE_EMPHASIS },
                     80:        { "entry",              NODE_ENTRY },
                     81:        { "envar",              NODE_ENVAR },
                     82:        { "fieldsynopsis",      NODE_FIELDSYNOPSIS },
                     83:        { "filename",           NODE_FILENAME },
1.3       schwarze   84:        { "firstname",          NODE_IGNORE },
1.1       schwarze   85:        { "firstterm",          NODE_FIRSTTERM },
                     86:        { "footnote",           NODE_FOOTNOTE },
                     87:        { "funcdef",            NODE_FUNCDEF },
                     88:        { "funcprototype",      NODE_FUNCPROTOTYPE },
                     89:        { "funcsynopsis",       NODE_FUNCSYNOPSIS },
                     90:        { "funcsynopsisinfo",   NODE_FUNCSYNOPSISINFO },
                     91:        { "function",           NODE_FUNCTION },
                     92:        { "glossterm",          NODE_GLOSSTERM },
                     93:        { "group",              NODE_GROUP },
                     94:        { "holder",             NODE_HOLDER },
                     95:        { "index",              NODE_INDEX },
1.4       schwarze   96:        { "indexterm",          NODE_DELETE },
1.1       schwarze   97:        { "info",               NODE_INFO },
                     98:        { "informalequation",   NODE_INFORMALEQUATION },
                     99:        { "informaltable",      NODE_INFORMALTABLE },
                    100:        { "inlineequation",     NODE_INLINEEQUATION },
                    101:        { "itemizedlist",       NODE_ITEMIZEDLIST },
                    102:        { "keysym",             NODE_KEYSYM },
                    103:        { "legalnotice",        NODE_LEGALNOTICE },
                    104:        { "link",               NODE_LINK },
                    105:        { "listitem",           NODE_LISTITEM },
                    106:        { "literal",            NODE_LITERAL },
                    107:        { "literallayout",      NODE_LITERALLAYOUT },
                    108:        { "manvolnum",          NODE_MANVOLNUM },
                    109:        { "member",             NODE_MEMBER },
                    110:        { "mml:math",           NODE_MML_MATH },
                    111:        { "mml:mfenced",        NODE_MML_MFENCED },
                    112:        { "mml:mfrac",          NODE_MML_MFRAC },
                    113:        { "mml:mi",             NODE_MML_MI },
                    114:        { "mml:mn",             NODE_MML_MN },
                    115:        { "mml:mo",             NODE_MML_MO },
                    116:        { "mml:mrow",           NODE_MML_MROW },
                    117:        { "mml:msub",           NODE_MML_MSUB },
                    118:        { "mml:msup",           NODE_MML_MSUP },
                    119:        { "modifier",           NODE_MODIFIER },
                    120:        { "note",               NODE_NOTE },
                    121:        { "option",             NODE_OPTION },
                    122:        { "orderedlist",        NODE_ORDEREDLIST },
                    123:        { "orgname",            NODE_ORGNAME },
1.3       schwarze  124:        { "othername",          NODE_IGNORE },
1.1       schwarze  125:        { "para",               NODE_PARA },
                    126:        { "paramdef",           NODE_PARAMDEF },
                    127:        { "parameter",          NODE_PARAMETER },
                    128:        { "part",               NODE_SECTION },
                    129:        { "personname",         NODE_PERSONNAME },
1.3       schwarze  130:        { "phrase",             NODE_IGNORE },
1.1       schwarze  131:        { "preface",            NODE_PREFACE },
1.4       schwarze  132:        { "primary",            NODE_DELETE },
1.1       schwarze  133:        { "programlisting",     NODE_PROGRAMLISTING },
                    134:        { "prompt",             NODE_PROMPT },
                    135:        { "quote",              NODE_QUOTE },
                    136:        { "refclass",           NODE_REFCLASS },
                    137:        { "refdescriptor",      NODE_REFDESCRIPTOR },
                    138:        { "refentry",           NODE_REFENTRY },
                    139:        { "refentryinfo",       NODE_REFENTRYINFO },
                    140:        { "refentrytitle",      NODE_REFENTRYTITLE },
                    141:        { "refmeta",            NODE_REFMETA },
                    142:        { "refmetainfo",        NODE_REFMETAINFO },
                    143:        { "refmiscinfo",        NODE_REFMISCINFO },
                    144:        { "refname",            NODE_REFNAME },
                    145:        { "refnamediv",         NODE_REFNAMEDIV },
                    146:        { "refpurpose",         NODE_REFPURPOSE },
                    147:        { "refsect1",           NODE_SECTION },
                    148:        { "refsect2",           NODE_SECTION },
                    149:        { "refsect3",           NODE_SECTION },
                    150:        { "refsection",         NODE_SECTION },
                    151:        { "refsynopsisdiv",     NODE_REFSYNOPSISDIV },
                    152:        { "releaseinfo",        NODE_RELEASEINFO },
                    153:        { "replaceable",        NODE_REPLACEABLE },
                    154:        { "row",                NODE_ROW },
                    155:        { "sbr",                NODE_SBR },
                    156:        { "screen",             NODE_SCREEN },
1.4       schwarze  157:        { "secondary",          NODE_DELETE },
1.1       schwarze  158:        { "sect1",              NODE_SECTION },
                    159:        { "sect2",              NODE_SECTION },
                    160:        { "section",            NODE_SECTION },
                    161:        { "sgmltag",            NODE_SGMLTAG },
                    162:        { "simplelist",         NODE_SIMPLELIST },
                    163:        { "spanspec",           NODE_SPANSPEC },
                    164:        { "structname",         NODE_STRUCTNAME },
                    165:        { "subtitle",           NODE_SUBTITLE },
1.3       schwarze  166:        { "surname",            NODE_IGNORE },
1.1       schwarze  167:        { "synopsis",           NODE_SYNOPSIS },
                    168:        { "table",              NODE_TABLE },
                    169:        { "tbody",              NODE_TBODY },
                    170:        { "term",               NODE_TERM },
                    171:        { "tfoot",              NODE_TFOOT },
                    172:        { "tgroup",             NODE_TGROUP },
                    173:        { "thead",              NODE_THEAD },
                    174:        { "tip",                NODE_TIP },
                    175:        { "title",              NODE_TITLE },
1.3       schwarze  176:        { "trademark",          NODE_IGNORE },
1.1       schwarze  177:        { "type",               NODE_TYPE },
                    178:        { "ulink",              NODE_ULINK },
                    179:        { "userinput",          NODE_USERINPUT },
                    180:        { "variablelist",       NODE_VARIABLELIST },
                    181:        { "varlistentry",       NODE_VARLISTENTRY },
                    182:        { "varname",            NODE_VARNAME },
                    183:        { "warning",            NODE_WARNING },
                    184:        { "wordasword",         NODE_WORDASWORD },
1.4       schwarze  185:        { "xi:include",         NODE_DELETE_WARN },
1.1       schwarze  186:        { "year",               NODE_YEAR },
1.5     ! schwarze  187:        { NULL,                 NODE_IGNORE }
1.1       schwarze  188: };
                    189:
                    190: /*
                    191:  * Process a string of characters.
                    192:  * If a text node is already open, append to it.
                    193:  * Otherwise, create a new one as a child of the current node.
                    194:  */
                    195: static void
1.5     ! schwarze  196: xml_char(struct parse *ps, const char *p, int sz)
1.1       schwarze  197: {
                    198:        struct pnode    *dat;
                    199:
1.5     ! schwarze  200:        if (ps->del > 0)
1.1       schwarze  201:                return;
                    202:
1.5     ! schwarze  203:        if (ps->cur == NULL) {
        !           204:                fprintf(stderr, "%s:%d:%d: discarding text before docum"
        !           205:                    "ent: %.*s\n", ps->fname, ps->line, ps->col, sz, p);
        !           206:                ps->tree->flags |= TREE_FAIL;
        !           207:                return;
        !           208:        }
        !           209:
1.1       schwarze  210:        if (ps->cur->node != NODE_TEXT) {
                    211:                if ((dat = calloc(1, sizeof(*dat))) == NULL) {
                    212:                        perror(NULL);
                    213:                        exit(1);
                    214:                }
                    215:                dat->node = NODE_TEXT;
                    216:                dat->parent = ps->cur;
                    217:                TAILQ_INIT(&dat->childq);
                    218:                TAILQ_INIT(&dat->attrq);
                    219:                TAILQ_INSERT_TAIL(&ps->cur->childq, dat, child);
                    220:                ps->cur = dat;
                    221:        }
                    222:
1.5     ! schwarze  223:        if (ps->tree->flags & TREE_CLOSED &&
        !           224:            ps->cur->parent == ps->tree->root && ps->warn)
        !           225:                fprintf(stderr, "%s:%d:%d: warning: "
        !           226:                    "text after end of document: %.*s\n",
        !           227:                    ps->fname, ps->line, ps->col, sz, p);
        !           228:
1.1       schwarze  229:        /* Append to the current text node. */
                    230:
                    231:        assert(sz >= 0);
                    232:        ps->cur->b = realloc(ps->cur->b, ps->cur->bsz + sz + 1);
                    233:        if (ps->cur->b == NULL) {
                    234:                perror(NULL);
                    235:                exit(1);
                    236:        }
                    237:        memcpy(ps->cur->b + ps->cur->bsz, p, sz);
                    238:        ps->cur->bsz += sz;
                    239:        ps->cur->b[ps->cur->bsz] = '\0';
                    240:        ps->cur->real = ps->cur->b;
                    241: }
                    242:
                    243: static void
                    244: pnode_trim(struct pnode *pn)
                    245: {
                    246:        assert(pn->node == NODE_TEXT);
                    247:        for (; pn->bsz > 0; pn->b[--pn->bsz] = '\0')
                    248:                if (isspace((unsigned char)pn->b[pn->bsz - 1]) == 0)
                    249:                        break;
                    250: }
                    251:
                    252: /*
                    253:  * Begin an element.
                    254:  */
                    255: static void
1.5     ! schwarze  256: xml_elem_start(struct parse *ps, const char *name)
1.1       schwarze  257: {
1.5     ! schwarze  258:        const struct element    *elem;
        !           259:        struct pnode            *dat;
1.1       schwarze  260:
1.5     ! schwarze  261:        if (*name == '!' || *name == '?')
1.1       schwarze  262:                return;
                    263:
1.4       schwarze  264:        /*
                    265:         * An ancestor is excluded from the tree;
                    266:         * keep track of the number of levels excluded.
                    267:         */
                    268:        if (ps->del > 0) {
                    269:                ps->del++;
                    270:                return;
                    271:        }
                    272:
1.1       schwarze  273:        /* Close out the text node, if there is one. */
                    274:        if (ps->cur != NULL && ps->cur->node == NODE_TEXT) {
                    275:                pnode_trim(ps->cur);
                    276:                ps->cur = ps->cur->parent;
                    277:        }
                    278:
                    279:        for (elem = elements; elem->name != NULL; elem++)
                    280:                if (strcmp(elem->name, name) == 0)
                    281:                        break;
                    282:
                    283:        if (elem->name == NULL) {
1.5     ! schwarze  284:                fprintf(stderr, "%s:%d:%d: unknown element <%s>\n",
        !           285:                        ps->fname, ps->line, ps->col, name);
1.1       schwarze  286:                ps->tree->flags |= TREE_FAIL;
                    287:        }
1.5     ! schwarze  288:        ps->ncur = elem->node;
1.1       schwarze  289:
1.5     ! schwarze  290:        switch (ps->ncur) {
1.4       schwarze  291:        case NODE_DELETE_WARN:
1.2       schwarze  292:                if (ps->warn)
1.5     ! schwarze  293:                        fprintf(stderr, "%s:%d:%d: warning: "
        !           294:                            "skipping element <%s>\n",
        !           295:                            ps->fname, ps->line, ps->col, name);
1.2       schwarze  296:                /* FALLTHROUGH */
1.4       schwarze  297:        case NODE_DELETE:
                    298:                ps->del = 1;
                    299:                /* FALLTHROUGH */
1.2       schwarze  300:        case NODE_IGNORE:
                    301:                return;
                    302:        case NODE_INLINEEQUATION:
1.1       schwarze  303:                ps->tree->flags |= TREE_EQN;
1.2       schwarze  304:                break;
                    305:        default:
                    306:                break;
                    307:        }
1.1       schwarze  308:
1.5     ! schwarze  309:        if (ps->tree->flags & TREE_CLOSED &&
        !           310:            ps->cur->parent == NULL && ps->warn)
        !           311:                fprintf(stderr, "%s:%d:%d: warning: "
        !           312:                    "element after end of document: %s\n",
        !           313:                    ps->fname, ps->line, ps->col, name);
        !           314:
1.1       schwarze  315:        if ((dat = calloc(1, sizeof(*dat))) == NULL) {
                    316:                perror(NULL);
                    317:                exit(1);
                    318:        }
                    319:        dat->node = elem->node;
                    320:        dat->parent = ps->cur;
                    321:        TAILQ_INIT(&dat->childq);
                    322:        TAILQ_INIT(&dat->attrq);
                    323:
                    324:        if (ps->cur != NULL)
                    325:                TAILQ_INSERT_TAIL(&ps->cur->childq, dat, child);
                    326:
                    327:        ps->cur = dat;
                    328:        if (ps->tree->root == NULL)
                    329:                ps->tree->root = dat;
1.5     ! schwarze  330: }
        !           331:
        !           332: static void
        !           333: xml_attrkey(struct parse *ps, const char *name)
        !           334: {
        !           335:        struct pattr    *attr;
        !           336:        enum attrkey     key;
1.1       schwarze  337:
1.5     ! schwarze  338:        if (ps->del > 0 || *name == '\0')
        !           339:                return;
        !           340:        if ((key = attrkey_parse(name)) == ATTRKEY__MAX) {
        !           341:                if (ps->warn)
        !           342:                        fprintf(stderr, "%s:%d:%d: warning: "
        !           343:                            "unknown attribute \"%s\"\n",
        !           344:                            ps->fname, ps->line, ps->col, name);
        !           345:                ps->attr = 0;
        !           346:                return;
        !           347:        }
        !           348:        if ((attr = calloc(1, sizeof(*attr))) == NULL) {
        !           349:                perror(NULL);
        !           350:                exit(1);
        !           351:        }
        !           352:        attr->key = key;
        !           353:        attr->val = ATTRVAL__MAX;
        !           354:        attr->rawval = NULL;
        !           355:        TAILQ_INSERT_TAIL(&ps->cur->attrq, attr, child);
        !           356:        ps->attr = 1;
        !           357: }
        !           358:
        !           359: static void
        !           360: xml_attrval(struct parse *ps, const char *name)
        !           361: {
        !           362:        struct pattr    *attr;
        !           363:
        !           364:        if (ps->del > 0 || ps->attr == 0)
        !           365:                return;
        !           366:        if ((attr = TAILQ_LAST(&ps->cur->attrq, pattrq)) == NULL)
        !           367:                return;
        !           368:        if ((attr->val = attrval_parse(name)) == ATTRVAL__MAX &&
        !           369:            (attr->rawval = strdup(name)) == NULL) {
        !           370:                perror(NULL);
        !           371:                exit(1);
1.1       schwarze  372:        }
                    373: }
                    374:
                    375: /*
                    376:  * Roll up the parse tree.
                    377:  * If we're at a text node, roll that one up first.
                    378:  */
                    379: static void
1.5     ! schwarze  380: xml_elem_end(struct parse *ps, const char *name)
1.1       schwarze  381: {
1.5     ! schwarze  382:        const struct element    *elem;
        !           383:        enum nodeid              node;
1.1       schwarze  384:
1.4       schwarze  385:        /*
                    386:         * An ancestor is excluded from the tree;
                    387:         * keep track of the number of levels excluded.
                    388:         */
                    389:        if (ps->del > 1) {
                    390:                ps->del--;
                    391:                return;
                    392:        }
                    393:
1.1       schwarze  394:        /* Close out the text node, if there is one. */
1.5     ! schwarze  395:        if (ps->del == 0 && ps->cur != NULL && ps->cur->node == NODE_TEXT) {
1.1       schwarze  396:                pnode_trim(ps->cur);
                    397:                ps->cur = ps->cur->parent;
                    398:        }
1.2       schwarze  399:
1.5     ! schwarze  400:        if (name != NULL) {
        !           401:                for (elem = elements; elem->name != NULL; elem++)
        !           402:                        if (strcmp(elem->name, name) == 0)
        !           403:                                break;
        !           404:                node = elem->node;
        !           405:        } else
        !           406:                node = ps->ncur;
1.2       schwarze  407:
1.5     ! schwarze  408:        switch (node) {
1.4       schwarze  409:        case NODE_DELETE_WARN:
                    410:        case NODE_DELETE:
1.5     ! schwarze  411:                if (ps->del > 0)
        !           412:                        ps->del--;
1.4       schwarze  413:                break;
1.2       schwarze  414:        case NODE_IGNORE:
                    415:                break;
                    416:        default:
1.5     ! schwarze  417:                if (ps->cur == NULL || node != ps->cur->node) {
        !           418:                        if (ps->warn)
        !           419:                                fprintf(stderr, "%s:%d:%d: warning: "
        !           420:                                    "element not open: </%s>\n",
        !           421:                                    ps->fname, ps->line, ps->col, name);
        !           422:                        break;
        !           423:                }
        !           424:
        !           425:                /*
        !           426:                 * Refrain from actually closing the document element.
        !           427:                 * If no more content follows, no harm is done, but if
        !           428:                 * some content still follows, simply processing it is
        !           429:                 * obviously better than discarding it or crashing.
        !           430:                 */
        !           431:
        !           432:                if (ps->cur->parent == NULL)
        !           433:                        ps->tree->flags |= TREE_CLOSED;
        !           434:                else
        !           435:                        ps->cur = ps->cur->parent;
1.4       schwarze  436:                break;
1.2       schwarze  437:        }
1.4       schwarze  438:        assert(ps->del == 0);
1.1       schwarze  439: }
                    440:
                    441: struct parse *
                    442: parse_alloc(int warn)
                    443: {
                    444:        struct parse    *p;
                    445:
                    446:        if ((p = calloc(1, sizeof(*p))) == NULL)
                    447:                return NULL;
                    448:
                    449:        if ((p->tree = calloc(1, sizeof(*p->tree))) == NULL) {
                    450:                free(p);
                    451:                return NULL;
                    452:        }
                    453:        p->warn = warn;
                    454:        return p;
                    455: }
                    456:
                    457: void
                    458: parse_free(struct parse *p)
                    459: {
                    460:        if (p == NULL)
                    461:                return;
                    462:        if (p->tree != NULL) {
                    463:                pnode_unlink(p->tree->root);
                    464:                free(p->tree);
                    465:        }
                    466:        free(p);
                    467: }
                    468:
1.5     ! schwarze  469: /*
        !           470:  * Advance the pend pointer to the next character in the charset.
        !           471:  * If the charset starts with a space, it stands for any whitespace.
        !           472:  * Update the new input file position, used for messages.
        !           473:  * Do not overrun the buffer b of length rlen.
        !           474:  * When reaching the end, NUL-terminate the buffer and return 1;
        !           475:  * otherwise, return 0.
        !           476:  */
        !           477: static int
        !           478: advance(struct parse *p, char *b, size_t rlen, size_t *pend,
        !           479:     const char *charset)
        !           480: {
        !           481:        int              space;
        !           482:
        !           483:        if (*charset == ' ') {
        !           484:                space = 1;
        !           485:                charset++;
        !           486:        } else
        !           487:                space = 0;
        !           488:
        !           489:        p->nline = p->line;
        !           490:        p->ncol = p->col;
        !           491:        while (*pend < rlen) {
        !           492:                if (b[*pend] == '\n') {
        !           493:                        p->nline++;
        !           494:                        p->ncol = 1;
        !           495:                } else
        !           496:                        p->ncol++;
        !           497:                if (space && isspace((unsigned char)b[*pend]))
        !           498:                        break;
        !           499:                if (strchr(charset, b[*pend]) != NULL)
        !           500:                        break;
        !           501:                ++*pend;
        !           502:        }
        !           503:        if (*pend == rlen) {
        !           504:                b[rlen] = '\0';
        !           505:                return 1;
        !           506:        } else
        !           507:                return 0;
        !           508: }
        !           509:
1.1       schwarze  510: struct ptree *
                    511: parse_file(struct parse *p, int fd, const char *fname)
                    512: {
                    513:        char             b[4096];
1.5     ! schwarze  514:        ssize_t          rsz;   /* Return value from read(2). */
        !           515:        size_t           rlen;  /* Number of bytes in b[]. */
        !           516:        size_t           poff;  /* Parse offset in b[]. */
        !           517:        size_t           pend;  /* Offset of the end of the current word. */
        !           518:        int              in_tag, in_arg, in_quotes, elem_end;
1.1       schwarze  519:
                    520:        p->fname = fname;
1.5     ! schwarze  521:        p->nline = 1;
        !           522:        p->ncol = 1;
        !           523:        rlen = 0;
        !           524:        in_tag = in_arg = in_quotes = 0;
        !           525:
        !           526:        /*
        !           527:         * Read loop.
        !           528:         *
        !           529:         * We have to enter the read loop once more even on EOF
        !           530:         * because the previous token may have been incomplete,
        !           531:         * such that it asked for more input.
        !           532:         * Once rsz is 0, incomplete tokens will no longer ask
        !           533:         * for more input but instead use whatever there is,
        !           534:         * and then exit the read loop.
        !           535:         * The minus one on the size limit for read(2) is needed
        !           536:         * such that advance() can set b[rlen] to NUL when needed.
        !           537:         */
        !           538:
        !           539:        while ((rsz = read(fd, b + rlen, sizeof(b) - rlen - 1)) >= 0) {
        !           540:                if ((rlen += rsz) == 0)
        !           541:                        break;
        !           542:
        !           543:                /* Token loop. */
        !           544:
        !           545:                pend = 0;
        !           546:                for (;;) {
        !           547:
        !           548:                        /* Proceed to the next token, skipping whitespace. */
        !           549:
        !           550:                        p->line = p->nline;
        !           551:                        p->col = p->ncol;
        !           552:                        if ((poff = pend) == rlen)
        !           553:                                break;
        !           554:                        if (isspace((unsigned char)b[pend])) {
        !           555:                                if (b[pend++] == '\n') {
        !           556:                                        p->nline++;
        !           557:                                        p->ncol = 1;
        !           558:                                } else
        !           559:                                        p->ncol++;
        !           560:                                continue;
        !           561:                        }
        !           562:
        !           563:                        /*
        !           564:                         * The following three cases (in_arg, in_tag,
        !           565:                         * and starting a tag) all parse a word or
        !           566:                         * quoted string.  If that extends beyond the
        !           567:                         * read buffer and the last read(2) still got
        !           568:                         * data, they all break out of the token loop
        !           569:                         * to request more data from the read loop.
        !           570:                         *
        !           571:                         * Also, they all detect self-closing tags,
        !           572:                         * those ending with "/>", setting the flag
        !           573:                         * elem_end and calling xml_elem_end() at the
        !           574:                         * very end, after handling the attribute value,
        !           575:                         * attribute name, or tag name, respectively.
        !           576:                         */
        !           577:
        !           578:                        /* Parse an attribute value. */
        !           579:
        !           580:                        if (in_arg) {
        !           581:                                if (in_quotes == 0 && b[pend] == '"') {
        !           582:                                        in_quotes = 1;
        !           583:                                        p->ncol++;
        !           584:                                        pend++;
        !           585:                                        continue;
        !           586:                                }
        !           587:                                if (advance(p, b, rlen, &pend,
        !           588:                                    in_quotes ? "\"" : " >") && rsz > 0)
        !           589:                                        break;
        !           590:                                in_arg = in_quotes = elem_end = 0;
        !           591:                                if (b[pend] == '>') {
        !           592:                                        in_tag = 0;
        !           593:                                        if (pend > 0 && b[pend - 1] == '/') {
        !           594:                                                b[pend - 1] = '\0';
        !           595:                                                elem_end = 1;
        !           596:                                        }
        !           597:                                }
        !           598:                                b[pend] = '\0';
        !           599:                                if (pend < rlen)
        !           600:                                        pend++;
        !           601:                                xml_attrval(p, b + poff);
        !           602:                                if (elem_end)
        !           603:                                        xml_elem_end(p, NULL);
        !           604:
        !           605:                        /* Look for an attribute name. */
        !           606:
        !           607:                        } else if (in_tag) {
        !           608:                                if (advance(p, b, rlen, &pend, " =>") &&
        !           609:                                    rsz > 0)
        !           610:                                        break;
        !           611:                                elem_end = 0;
        !           612:                                switch (b[pend]) {
        !           613:                                case '>':
        !           614:                                        in_tag = 0;
        !           615:                                        if (pend > 0 && b[pend - 1] == '/') {
        !           616:                                                b[pend - 1] = '\0';
        !           617:                                                elem_end = 1;
        !           618:                                        }
        !           619:                                        break;
        !           620:                                case '=':
        !           621:                                        in_arg = 1;
        !           622:                                        break;
        !           623:                                default:
        !           624:                                        break;
        !           625:                                }
        !           626:                                b[pend] = '\0';
        !           627:                                if (pend < rlen)
        !           628:                                        pend++;
        !           629:                                xml_attrkey(p, b + poff);
        !           630:                                if (elem_end)
        !           631:                                        xml_elem_end(p, NULL);
        !           632:
        !           633:                        /* Begin an opening or closing tag. */
        !           634:
        !           635:                        } else if (b[poff] == '<') {
        !           636:                                if (advance(p, b, rlen, &pend, " >") &&
        !           637:                                    rsz > 0)
        !           638:                                        break;
        !           639:                                elem_end = 0;
        !           640:                                if (b[pend] != '>')
        !           641:                                        in_tag = 1;
        !           642:                                else if (pend > 0 && b[pend - 1] == '/') {
        !           643:                                        b[pend - 1] = '\0';
        !           644:                                        elem_end = 1;
        !           645:                                }
        !           646:                                b[pend] = '\0';
        !           647:                                if (pend < rlen)
        !           648:                                        pend++;
        !           649:                                if (b[++poff] == '/') {
        !           650:                                        elem_end = 1;
        !           651:                                        poff++;
        !           652:                                } else
        !           653:                                        xml_elem_start(p, b + poff);
        !           654:                                if (elem_end)
        !           655:                                        xml_elem_end(p, b + poff);
        !           656:
        !           657:                        /* Process text up to the next tag. */
        !           658:
        !           659:                        } else {
        !           660:                                if (advance(p, b, rlen, &pend, "<") == 0)
        !           661:                                        p->ncol--;
        !           662:                                xml_char(p, b + poff, pend - poff);
        !           663:                        }
1.1       schwarze  664:                }
1.5     ! schwarze  665:
        !           666:                /* Buffer exhausted; shift left and re-fill. */
        !           667:
        !           668:                assert(poff > 0);
        !           669:                memmove(b, b + poff, rlen - poff);
        !           670:                rlen -= poff;
        !           671:        }
        !           672:        if (rsz < 0) {
        !           673:                perror(fname);
        !           674:                p->tree->flags |= TREE_FAIL;
        !           675:        }
        !           676:        if (p->cur != NULL && p->cur->node == NODE_TEXT) {
        !           677:                pnode_trim(p->cur);
        !           678:                p->cur = p->cur->parent;
        !           679:        }
        !           680:        if ((p->tree->flags & TREE_CLOSED) == 0 && p->warn)
        !           681:                fprintf(stderr, "%s:%d:%d: warning: document not closed\n",
        !           682:                    p->fname, p->line, p->col);
1.1       schwarze  683:        return p->tree;
                    684: }
CVSweb