docbook2mdoc/parse.c - annotate

Return to parse.c CVS log
Up to [cvsweb.bsd.lv] / docbook2mdoc
Annotation of docbook2mdoc/parse.c, Revision 1.6

1.6     ! schwarze    1: /* $Id: parse.c,v 1.5 2019/03/28 12:21:10 schwarze Exp $ */
1.1       schwarze    2: /*
                      3:  * Copyright (c) 2014 Kristaps Dzonsons <kristaps@bsd.lv>
                      4:  * Copyright (c) 2019 Ingo Schwarze <schwarze@openbsd.org>
                      5:  *
                      6:  * Permission to use, copy, modify, and distribute this software for any
                      7:  * purpose with or without fee is hereby granted, provided that the above
                      8:  * copyright notice and this permission notice appear in all copies.
                      9:  *
                     10:  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
                     11:  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
                     12:  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
                     13:  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
                     14:  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
                     15:  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
                     16:  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
                     17:  */
                     18: #include <assert.h>
                     19: #include <ctype.h>
1.6     ! schwarze   20: #include <stdarg.h>
1.1       schwarze   21: #include <stdio.h>
1.5       schwarze   22: #include <stdlib.h>
1.1       schwarze   23: #include <string.h>
                     24: #include <unistd.h>
                     25:
                     26: #include "node.h"
                     27: #include "parse.h"
                     28:
                     29: /*
                     30:  * The implementation of the DocBook parser.
                     31:  */
                     32:
                     33: /*
                     34:  * Global parse state.
                     35:  * Keep this as simple and small as possible.
                     36:  */
                     37: struct parse {
                     38:        const char      *fname;  /* Name of the input file. */
                     39:        struct ptree    *tree;   /* Complete parse result. */
                     40:        struct pnode    *cur;    /* Current node in the tree. */
1.5       schwarze   41:        enum nodeid      ncur;   /* Type of the current node. */
                     42:        int              line;   /* Line number in the input file. */
                     43:        int              col;    /* Column number in the input file. */
                     44:        int              nline;  /* Line number of next token. */
                     45:        int              ncol;   /* Column number of next token. */
1.4       schwarze   46:        int              del;    /* Levels of nested nodes being deleted. */
1.5       schwarze   47:        int              attr;   /* The most recent attribute is valid. */
1.1       schwarze   48:        int              warn;
                     49: };
                     50:
                     51: struct element {
                     52:        const char      *name;   /* DocBook element name. */
                     53:        enum nodeid      node;   /* Node type to generate. */
                     54: };
                     55:
                     56: static const struct element elements[] = {
1.3       schwarze   57:        { "acronym",            NODE_IGNORE },
1.1       schwarze   58:        { "affiliation",        NODE_AFFILIATION },
1.4       schwarze   59:        { "anchor",             NODE_DELETE },
1.1       schwarze   60:        { "application",        NODE_APPLICATION },
                     61:        { "arg",                NODE_ARG },
                     62:        { "author",             NODE_AUTHOR },
                     63:        { "authorgroup",        NODE_AUTHORGROUP },
                     64:        { "blockquote",         NODE_BLOCKQUOTE },
                     65:        { "book",               NODE_BOOK },
                     66:        { "bookinfo",           NODE_BOOKINFO },
                     67:        { "caution",            NODE_CAUTION },
                     68:        { "chapter",            NODE_SECTION },
                     69:        { "citerefentry",       NODE_CITEREFENTRY },
                     70:        { "citetitle",          NODE_CITETITLE },
                     71:        { "cmdsynopsis",        NODE_CMDSYNOPSIS },
                     72:        { "code",               NODE_CODE },
                     73:        { "colspec",            NODE_COLSPEC },
                     74:        { "command",            NODE_COMMAND },
                     75:        { "constant",           NODE_CONSTANT },
                     76:        { "copyright",          NODE_COPYRIGHT },
                     77:        { "date",               NODE_DATE },
                     78:        { "editor",             NODE_EDITOR },
                     79:        { "email",              NODE_EMAIL },
                     80:        { "emphasis",           NODE_EMPHASIS },
                     81:        { "entry",              NODE_ENTRY },
                     82:        { "envar",              NODE_ENVAR },
                     83:        { "fieldsynopsis",      NODE_FIELDSYNOPSIS },
                     84:        { "filename",           NODE_FILENAME },
1.3       schwarze   85:        { "firstname",          NODE_IGNORE },
1.1       schwarze   86:        { "firstterm",          NODE_FIRSTTERM },
                     87:        { "footnote",           NODE_FOOTNOTE },
                     88:        { "funcdef",            NODE_FUNCDEF },
                     89:        { "funcprototype",      NODE_FUNCPROTOTYPE },
                     90:        { "funcsynopsis",       NODE_FUNCSYNOPSIS },
                     91:        { "funcsynopsisinfo",   NODE_FUNCSYNOPSISINFO },
                     92:        { "function",           NODE_FUNCTION },
                     93:        { "glossterm",          NODE_GLOSSTERM },
                     94:        { "group",              NODE_GROUP },
                     95:        { "holder",             NODE_HOLDER },
                     96:        { "index",              NODE_INDEX },
1.4       schwarze   97:        { "indexterm",          NODE_DELETE },
1.1       schwarze   98:        { "info",               NODE_INFO },
                     99:        { "informalequation",   NODE_INFORMALEQUATION },
                    100:        { "informaltable",      NODE_INFORMALTABLE },
                    101:        { "inlineequation",     NODE_INLINEEQUATION },
                    102:        { "itemizedlist",       NODE_ITEMIZEDLIST },
                    103:        { "keysym",             NODE_KEYSYM },
                    104:        { "legalnotice",        NODE_LEGALNOTICE },
                    105:        { "link",               NODE_LINK },
                    106:        { "listitem",           NODE_LISTITEM },
                    107:        { "literal",            NODE_LITERAL },
                    108:        { "literallayout",      NODE_LITERALLAYOUT },
                    109:        { "manvolnum",          NODE_MANVOLNUM },
                    110:        { "member",             NODE_MEMBER },
                    111:        { "mml:math",           NODE_MML_MATH },
                    112:        { "mml:mfenced",        NODE_MML_MFENCED },
                    113:        { "mml:mfrac",          NODE_MML_MFRAC },
                    114:        { "mml:mi",             NODE_MML_MI },
                    115:        { "mml:mn",             NODE_MML_MN },
                    116:        { "mml:mo",             NODE_MML_MO },
                    117:        { "mml:mrow",           NODE_MML_MROW },
                    118:        { "mml:msub",           NODE_MML_MSUB },
                    119:        { "mml:msup",           NODE_MML_MSUP },
                    120:        { "modifier",           NODE_MODIFIER },
                    121:        { "note",               NODE_NOTE },
                    122:        { "option",             NODE_OPTION },
                    123:        { "orderedlist",        NODE_ORDEREDLIST },
                    124:        { "orgname",            NODE_ORGNAME },
1.3       schwarze  125:        { "othername",          NODE_IGNORE },
1.1       schwarze  126:        { "para",               NODE_PARA },
                    127:        { "paramdef",           NODE_PARAMDEF },
                    128:        { "parameter",          NODE_PARAMETER },
                    129:        { "part",               NODE_SECTION },
                    130:        { "personname",         NODE_PERSONNAME },
1.3       schwarze  131:        { "phrase",             NODE_IGNORE },
1.1       schwarze  132:        { "preface",            NODE_PREFACE },
1.4       schwarze  133:        { "primary",            NODE_DELETE },
1.1       schwarze  134:        { "programlisting",     NODE_PROGRAMLISTING },
                    135:        { "prompt",             NODE_PROMPT },
                    136:        { "quote",              NODE_QUOTE },
                    137:        { "refclass",           NODE_REFCLASS },
                    138:        { "refdescriptor",      NODE_REFDESCRIPTOR },
                    139:        { "refentry",           NODE_REFENTRY },
                    140:        { "refentryinfo",       NODE_REFENTRYINFO },
                    141:        { "refentrytitle",      NODE_REFENTRYTITLE },
                    142:        { "refmeta",            NODE_REFMETA },
                    143:        { "refmetainfo",        NODE_REFMETAINFO },
                    144:        { "refmiscinfo",        NODE_REFMISCINFO },
                    145:        { "refname",            NODE_REFNAME },
                    146:        { "refnamediv",         NODE_REFNAMEDIV },
                    147:        { "refpurpose",         NODE_REFPURPOSE },
                    148:        { "refsect1",           NODE_SECTION },
                    149:        { "refsect2",           NODE_SECTION },
                    150:        { "refsect3",           NODE_SECTION },
                    151:        { "refsection",         NODE_SECTION },
                    152:        { "refsynopsisdiv",     NODE_REFSYNOPSISDIV },
                    153:        { "releaseinfo",        NODE_RELEASEINFO },
                    154:        { "replaceable",        NODE_REPLACEABLE },
                    155:        { "row",                NODE_ROW },
                    156:        { "sbr",                NODE_SBR },
                    157:        { "screen",             NODE_SCREEN },
1.4       schwarze  158:        { "secondary",          NODE_DELETE },
1.1       schwarze  159:        { "sect1",              NODE_SECTION },
                    160:        { "sect2",              NODE_SECTION },
                    161:        { "section",            NODE_SECTION },
                    162:        { "sgmltag",            NODE_SGMLTAG },
                    163:        { "simplelist",         NODE_SIMPLELIST },
                    164:        { "spanspec",           NODE_SPANSPEC },
                    165:        { "structname",         NODE_STRUCTNAME },
                    166:        { "subtitle",           NODE_SUBTITLE },
1.3       schwarze  167:        { "surname",            NODE_IGNORE },
1.1       schwarze  168:        { "synopsis",           NODE_SYNOPSIS },
                    169:        { "table",              NODE_TABLE },
                    170:        { "tbody",              NODE_TBODY },
                    171:        { "term",               NODE_TERM },
                    172:        { "tfoot",              NODE_TFOOT },
                    173:        { "tgroup",             NODE_TGROUP },
                    174:        { "thead",              NODE_THEAD },
                    175:        { "tip",                NODE_TIP },
                    176:        { "title",              NODE_TITLE },
1.3       schwarze  177:        { "trademark",          NODE_IGNORE },
1.1       schwarze  178:        { "type",               NODE_TYPE },
                    179:        { "ulink",              NODE_ULINK },
                    180:        { "userinput",          NODE_USERINPUT },
                    181:        { "variablelist",       NODE_VARIABLELIST },
                    182:        { "varlistentry",       NODE_VARLISTENTRY },
                    183:        { "varname",            NODE_VARNAME },
                    184:        { "warning",            NODE_WARNING },
                    185:        { "wordasword",         NODE_WORDASWORD },
1.4       schwarze  186:        { "xi:include",         NODE_DELETE_WARN },
1.1       schwarze  187:        { "year",               NODE_YEAR },
1.5       schwarze  188:        { NULL,                 NODE_IGNORE }
1.1       schwarze  189: };
                    190:
1.6     ! schwarze  191: static void
        !           192: error_msg(struct parse *p, const char *fmt, ...)
        !           193: {
        !           194:        va_list          ap;
        !           195:
        !           196:        fprintf(stderr, "%s:%d:%d: ", p->fname, p->line, p->col);
        !           197:        va_start(ap, fmt);
        !           198:        vfprintf(stderr, fmt, ap);
        !           199:        va_end(ap);
        !           200:        fputc('\n', stderr);
        !           201:        p->tree->flags |= TREE_FAIL;
        !           202: }
        !           203:
        !           204: static void
        !           205: warn_msg(struct parse *p, const char *fmt, ...)
        !           206: {
        !           207:        va_list          ap;
        !           208:
        !           209:        if (p->warn == 0)
        !           210:                return;
        !           211:
        !           212:        fprintf(stderr, "%s:%d:%d: warning: ", p->fname, p->line, p->col);
        !           213:        va_start(ap, fmt);
        !           214:        vfprintf(stderr, fmt, ap);
        !           215:        va_end(ap);
        !           216:        fputc('\n', stderr);
        !           217: }
        !           218:
1.1       schwarze  219: /*
                    220:  * Process a string of characters.
                    221:  * If a text node is already open, append to it.
                    222:  * Otherwise, create a new one as a child of the current node.
                    223:  */
                    224: static void
1.5       schwarze  225: xml_char(struct parse *ps, const char *p, int sz)
1.1       schwarze  226: {
                    227:        struct pnode    *dat;
                    228:
1.5       schwarze  229:        if (ps->del > 0)
1.1       schwarze  230:                return;
                    231:
1.5       schwarze  232:        if (ps->cur == NULL) {
1.6     ! schwarze  233:                error_msg(ps, "discarding text before document: %.*s", sz, p);
1.5       schwarze  234:                return;
                    235:        }
                    236:
1.1       schwarze  237:        if (ps->cur->node != NODE_TEXT) {
                    238:                if ((dat = calloc(1, sizeof(*dat))) == NULL) {
                    239:                        perror(NULL);
                    240:                        exit(1);
                    241:                }
                    242:                dat->node = NODE_TEXT;
                    243:                dat->parent = ps->cur;
                    244:                TAILQ_INIT(&dat->childq);
                    245:                TAILQ_INIT(&dat->attrq);
                    246:                TAILQ_INSERT_TAIL(&ps->cur->childq, dat, child);
                    247:                ps->cur = dat;
                    248:        }
                    249:
1.5       schwarze  250:        if (ps->tree->flags & TREE_CLOSED &&
1.6     ! schwarze  251:            ps->cur->parent == ps->tree->root)
        !           252:                warn_msg(ps, "text after end of document: %.*s", sz, p);
1.5       schwarze  253:
1.1       schwarze  254:        /* Append to the current text node. */
                    255:
                    256:        assert(sz >= 0);
                    257:        ps->cur->b = realloc(ps->cur->b, ps->cur->bsz + sz + 1);
                    258:        if (ps->cur->b == NULL) {
                    259:                perror(NULL);
                    260:                exit(1);
                    261:        }
                    262:        memcpy(ps->cur->b + ps->cur->bsz, p, sz);
                    263:        ps->cur->bsz += sz;
                    264:        ps->cur->b[ps->cur->bsz] = '\0';
                    265:        ps->cur->real = ps->cur->b;
                    266: }
                    267:
                    268: static void
                    269: pnode_trim(struct pnode *pn)
                    270: {
                    271:        assert(pn->node == NODE_TEXT);
                    272:        for (; pn->bsz > 0; pn->b[--pn->bsz] = '\0')
                    273:                if (isspace((unsigned char)pn->b[pn->bsz - 1]) == 0)
                    274:                        break;
                    275: }
                    276:
                    277: /*
                    278:  * Begin an element.
                    279:  */
                    280: static void
1.5       schwarze  281: xml_elem_start(struct parse *ps, const char *name)
1.1       schwarze  282: {
1.5       schwarze  283:        const struct element    *elem;
                    284:        struct pnode            *dat;
1.1       schwarze  285:
1.5       schwarze  286:        if (*name == '!' || *name == '?')
1.1       schwarze  287:                return;
                    288:
1.4       schwarze  289:        /*
                    290:         * An ancestor is excluded from the tree;
                    291:         * keep track of the number of levels excluded.
                    292:         */
                    293:        if (ps->del > 0) {
                    294:                ps->del++;
                    295:                return;
                    296:        }
                    297:
1.1       schwarze  298:        /* Close out the text node, if there is one. */
                    299:        if (ps->cur != NULL && ps->cur->node == NODE_TEXT) {
                    300:                pnode_trim(ps->cur);
                    301:                ps->cur = ps->cur->parent;
                    302:        }
                    303:
                    304:        for (elem = elements; elem->name != NULL; elem++)
                    305:                if (strcmp(elem->name, name) == 0)
                    306:                        break;
                    307:
1.6     ! schwarze  308:        if (elem->name == NULL)
        !           309:                error_msg(ps, "unknown element <%s>", name);
        !           310:
1.5       schwarze  311:        ps->ncur = elem->node;
1.1       schwarze  312:
1.5       schwarze  313:        switch (ps->ncur) {
1.4       schwarze  314:        case NODE_DELETE_WARN:
1.6     ! schwarze  315:                warn_msg(ps, "skipping element <%s>", name);
1.2       schwarze  316:                /* FALLTHROUGH */
1.4       schwarze  317:        case NODE_DELETE:
                    318:                ps->del = 1;
                    319:                /* FALLTHROUGH */
1.2       schwarze  320:        case NODE_IGNORE:
                    321:                return;
                    322:        case NODE_INLINEEQUATION:
1.1       schwarze  323:                ps->tree->flags |= TREE_EQN;
1.2       schwarze  324:                break;
                    325:        default:
                    326:                break;
                    327:        }
1.1       schwarze  328:
1.6     ! schwarze  329:        if (ps->tree->flags & TREE_CLOSED && ps->cur->parent == NULL)
        !           330:                warn_msg(ps, "element after end of document: <%s>", name);
1.5       schwarze  331:
1.1       schwarze  332:        if ((dat = calloc(1, sizeof(*dat))) == NULL) {
                    333:                perror(NULL);
                    334:                exit(1);
                    335:        }
                    336:        dat->node = elem->node;
                    337:        dat->parent = ps->cur;
                    338:        TAILQ_INIT(&dat->childq);
                    339:        TAILQ_INIT(&dat->attrq);
                    340:
                    341:        if (ps->cur != NULL)
                    342:                TAILQ_INSERT_TAIL(&ps->cur->childq, dat, child);
                    343:
                    344:        ps->cur = dat;
                    345:        if (ps->tree->root == NULL)
                    346:                ps->tree->root = dat;
1.5       schwarze  347: }
                    348:
                    349: static void
                    350: xml_attrkey(struct parse *ps, const char *name)
                    351: {
                    352:        struct pattr    *attr;
                    353:        enum attrkey     key;
1.1       schwarze  354:
1.5       schwarze  355:        if (ps->del > 0 || *name == '\0')
                    356:                return;
                    357:        if ((key = attrkey_parse(name)) == ATTRKEY__MAX) {
                    358:                ps->attr = 0;
                    359:                return;
                    360:        }
                    361:        if ((attr = calloc(1, sizeof(*attr))) == NULL) {
                    362:                perror(NULL);
                    363:                exit(1);
                    364:        }
                    365:        attr->key = key;
                    366:        attr->val = ATTRVAL__MAX;
                    367:        attr->rawval = NULL;
                    368:        TAILQ_INSERT_TAIL(&ps->cur->attrq, attr, child);
                    369:        ps->attr = 1;
                    370: }
                    371:
                    372: static void
                    373: xml_attrval(struct parse *ps, const char *name)
                    374: {
                    375:        struct pattr    *attr;
                    376:
                    377:        if (ps->del > 0 || ps->attr == 0)
                    378:                return;
                    379:        if ((attr = TAILQ_LAST(&ps->cur->attrq, pattrq)) == NULL)
                    380:                return;
                    381:        if ((attr->val = attrval_parse(name)) == ATTRVAL__MAX &&
                    382:            (attr->rawval = strdup(name)) == NULL) {
                    383:                perror(NULL);
                    384:                exit(1);
1.1       schwarze  385:        }
                    386: }
                    387:
                    388: /*
                    389:  * Roll up the parse tree.
                    390:  * If we're at a text node, roll that one up first.
                    391:  */
                    392: static void
1.5       schwarze  393: xml_elem_end(struct parse *ps, const char *name)
1.1       schwarze  394: {
1.5       schwarze  395:        const struct element    *elem;
                    396:        enum nodeid              node;
1.1       schwarze  397:
1.4       schwarze  398:        /*
                    399:         * An ancestor is excluded from the tree;
                    400:         * keep track of the number of levels excluded.
                    401:         */
                    402:        if (ps->del > 1) {
                    403:                ps->del--;
                    404:                return;
                    405:        }
                    406:
1.1       schwarze  407:        /* Close out the text node, if there is one. */
1.5       schwarze  408:        if (ps->del == 0 && ps->cur != NULL && ps->cur->node == NODE_TEXT) {
1.1       schwarze  409:                pnode_trim(ps->cur);
                    410:                ps->cur = ps->cur->parent;
                    411:        }
1.2       schwarze  412:
1.5       schwarze  413:        if (name != NULL) {
                    414:                for (elem = elements; elem->name != NULL; elem++)
                    415:                        if (strcmp(elem->name, name) == 0)
                    416:                                break;
                    417:                node = elem->node;
                    418:        } else
                    419:                node = ps->ncur;
1.2       schwarze  420:
1.5       schwarze  421:        switch (node) {
1.4       schwarze  422:        case NODE_DELETE_WARN:
                    423:        case NODE_DELETE:
1.5       schwarze  424:                if (ps->del > 0)
                    425:                        ps->del--;
1.4       schwarze  426:                break;
1.2       schwarze  427:        case NODE_IGNORE:
                    428:                break;
                    429:        default:
1.5       schwarze  430:                if (ps->cur == NULL || node != ps->cur->node) {
1.6     ! schwarze  431:                        warn_msg(ps, "element not open: </%s>", name);
1.5       schwarze  432:                        break;
                    433:                }
                    434:
                    435:                /*
                    436:                 * Refrain from actually closing the document element.
                    437:                 * If no more content follows, no harm is done, but if
                    438:                 * some content still follows, simply processing it is
                    439:                 * obviously better than discarding it or crashing.
                    440:                 */
                    441:
                    442:                if (ps->cur->parent == NULL)
                    443:                        ps->tree->flags |= TREE_CLOSED;
                    444:                else
                    445:                        ps->cur = ps->cur->parent;
1.4       schwarze  446:                break;
1.2       schwarze  447:        }
1.4       schwarze  448:        assert(ps->del == 0);
1.1       schwarze  449: }
                    450:
                    451: struct parse *
                    452: parse_alloc(int warn)
                    453: {
                    454:        struct parse    *p;
                    455:
                    456:        if ((p = calloc(1, sizeof(*p))) == NULL)
                    457:                return NULL;
                    458:
                    459:        if ((p->tree = calloc(1, sizeof(*p->tree))) == NULL) {
                    460:                free(p);
                    461:                return NULL;
                    462:        }
                    463:        p->warn = warn;
                    464:        return p;
                    465: }
                    466:
                    467: void
                    468: parse_free(struct parse *p)
                    469: {
                    470:        if (p == NULL)
                    471:                return;
                    472:        if (p->tree != NULL) {
                    473:                pnode_unlink(p->tree->root);
                    474:                free(p->tree);
                    475:        }
                    476:        free(p);
                    477: }
                    478:
1.5       schwarze  479: /*
                    480:  * Advance the pend pointer to the next character in the charset.
                    481:  * If the charset starts with a space, it stands for any whitespace.
                    482:  * Update the new input file position, used for messages.
                    483:  * Do not overrun the buffer b of length rlen.
                    484:  * When reaching the end, NUL-terminate the buffer and return 1;
                    485:  * otherwise, return 0.
                    486:  */
                    487: static int
                    488: advance(struct parse *p, char *b, size_t rlen, size_t *pend,
                    489:     const char *charset)
                    490: {
                    491:        int              space;
                    492:
                    493:        if (*charset == ' ') {
                    494:                space = 1;
                    495:                charset++;
                    496:        } else
                    497:                space = 0;
                    498:
                    499:        p->nline = p->line;
                    500:        p->ncol = p->col;
                    501:        while (*pend < rlen) {
                    502:                if (b[*pend] == '\n') {
                    503:                        p->nline++;
                    504:                        p->ncol = 1;
                    505:                } else
                    506:                        p->ncol++;
                    507:                if (space && isspace((unsigned char)b[*pend]))
                    508:                        break;
                    509:                if (strchr(charset, b[*pend]) != NULL)
                    510:                        break;
                    511:                ++*pend;
                    512:        }
                    513:        if (*pend == rlen) {
                    514:                b[rlen] = '\0';
                    515:                return 1;
                    516:        } else
                    517:                return 0;
                    518: }
                    519:
1.1       schwarze  520: struct ptree *
                    521: parse_file(struct parse *p, int fd, const char *fname)
                    522: {
                    523:        char             b[4096];
1.5       schwarze  524:        ssize_t          rsz;   /* Return value from read(2). */
                    525:        size_t           rlen;  /* Number of bytes in b[]. */
                    526:        size_t           poff;  /* Parse offset in b[]. */
                    527:        size_t           pend;  /* Offset of the end of the current word. */
                    528:        int              in_tag, in_arg, in_quotes, elem_end;
1.1       schwarze  529:
                    530:        p->fname = fname;
1.5       schwarze  531:        p->nline = 1;
                    532:        p->ncol = 1;
                    533:        rlen = 0;
                    534:        in_tag = in_arg = in_quotes = 0;
                    535:
                    536:        /*
                    537:         * Read loop.
                    538:         *
                    539:         * We have to enter the read loop once more even on EOF
                    540:         * because the previous token may have been incomplete,
                    541:         * such that it asked for more input.
                    542:         * Once rsz is 0, incomplete tokens will no longer ask
                    543:         * for more input but instead use whatever there is,
                    544:         * and then exit the read loop.
                    545:         * The minus one on the size limit for read(2) is needed
                    546:         * such that advance() can set b[rlen] to NUL when needed.
                    547:         */
                    548:
                    549:        while ((rsz = read(fd, b + rlen, sizeof(b) - rlen - 1)) >= 0) {
                    550:                if ((rlen += rsz) == 0)
                    551:                        break;
                    552:
                    553:                /* Token loop. */
                    554:
                    555:                pend = 0;
                    556:                for (;;) {
                    557:
                    558:                        /* Proceed to the next token, skipping whitespace. */
                    559:
                    560:                        p->line = p->nline;
                    561:                        p->col = p->ncol;
                    562:                        if ((poff = pend) == rlen)
                    563:                                break;
                    564:                        if (isspace((unsigned char)b[pend])) {
                    565:                                if (b[pend++] == '\n') {
                    566:                                        p->nline++;
                    567:                                        p->ncol = 1;
                    568:                                } else
                    569:                                        p->ncol++;
                    570:                                continue;
                    571:                        }
                    572:
                    573:                        /*
                    574:                         * The following three cases (in_arg, in_tag,
                    575:                         * and starting a tag) all parse a word or
                    576:                         * quoted string.  If that extends beyond the
                    577:                         * read buffer and the last read(2) still got
                    578:                         * data, they all break out of the token loop
                    579:                         * to request more data from the read loop.
                    580:                         *
                    581:                         * Also, they all detect self-closing tags,
                    582:                         * those ending with "/>", setting the flag
                    583:                         * elem_end and calling xml_elem_end() at the
                    584:                         * very end, after handling the attribute value,
                    585:                         * attribute name, or tag name, respectively.
                    586:                         */
                    587:
                    588:                        /* Parse an attribute value. */
                    589:
                    590:                        if (in_arg) {
                    591:                                if (in_quotes == 0 && b[pend] == '"') {
                    592:                                        in_quotes = 1;
                    593:                                        p->ncol++;
                    594:                                        pend++;
                    595:                                        continue;
                    596:                                }
                    597:                                if (advance(p, b, rlen, &pend,
                    598:                                    in_quotes ? "\"" : " >") && rsz > 0)
                    599:                                        break;
                    600:                                in_arg = in_quotes = elem_end = 0;
                    601:                                if (b[pend] == '>') {
                    602:                                        in_tag = 0;
                    603:                                        if (pend > 0 && b[pend - 1] == '/') {
                    604:                                                b[pend - 1] = '\0';
                    605:                                                elem_end = 1;
                    606:                                        }
                    607:                                }
                    608:                                b[pend] = '\0';
                    609:                                if (pend < rlen)
                    610:                                        pend++;
                    611:                                xml_attrval(p, b + poff);
                    612:                                if (elem_end)
                    613:                                        xml_elem_end(p, NULL);
                    614:
                    615:                        /* Look for an attribute name. */
                    616:
                    617:                        } else if (in_tag) {
                    618:                                if (advance(p, b, rlen, &pend, " =>") &&
                    619:                                    rsz > 0)
                    620:                                        break;
                    621:                                elem_end = 0;
                    622:                                switch (b[pend]) {
                    623:                                case '>':
                    624:                                        in_tag = 0;
                    625:                                        if (pend > 0 && b[pend - 1] == '/') {
                    626:                                                b[pend - 1] = '\0';
                    627:                                                elem_end = 1;
                    628:                                        }
                    629:                                        break;
                    630:                                case '=':
                    631:                                        in_arg = 1;
                    632:                                        break;
                    633:                                default:
                    634:                                        break;
                    635:                                }
                    636:                                b[pend] = '\0';
                    637:                                if (pend < rlen)
                    638:                                        pend++;
                    639:                                xml_attrkey(p, b + poff);
                    640:                                if (elem_end)
                    641:                                        xml_elem_end(p, NULL);
                    642:
                    643:                        /* Begin an opening or closing tag. */
                    644:
                    645:                        } else if (b[poff] == '<') {
                    646:                                if (advance(p, b, rlen, &pend, " >") &&
                    647:                                    rsz > 0)
                    648:                                        break;
                    649:                                elem_end = 0;
                    650:                                if (b[pend] != '>')
                    651:                                        in_tag = 1;
                    652:                                else if (pend > 0 && b[pend - 1] == '/') {
                    653:                                        b[pend - 1] = '\0';
                    654:                                        elem_end = 1;
                    655:                                }
                    656:                                b[pend] = '\0';
                    657:                                if (pend < rlen)
                    658:                                        pend++;
                    659:                                if (b[++poff] == '/') {
                    660:                                        elem_end = 1;
                    661:                                        poff++;
                    662:                                } else
                    663:                                        xml_elem_start(p, b + poff);
                    664:                                if (elem_end)
                    665:                                        xml_elem_end(p, b + poff);
                    666:
                    667:                        /* Process text up to the next tag. */
                    668:
                    669:                        } else {
                    670:                                if (advance(p, b, rlen, &pend, "<") == 0)
                    671:                                        p->ncol--;
                    672:                                xml_char(p, b + poff, pend - poff);
                    673:                        }
1.1       schwarze  674:                }
1.5       schwarze  675:
                    676:                /* Buffer exhausted; shift left and re-fill. */
                    677:
                    678:                assert(poff > 0);
                    679:                memmove(b, b + poff, rlen - poff);
                    680:                rlen -= poff;
                    681:        }
                    682:        if (rsz < 0) {
                    683:                perror(fname);
                    684:                p->tree->flags |= TREE_FAIL;
                    685:        }
                    686:        if (p->cur != NULL && p->cur->node == NODE_TEXT) {
                    687:                pnode_trim(p->cur);
                    688:                p->cur = p->cur->parent;
                    689:        }
1.6     ! schwarze  690:        if ((p->tree->flags & TREE_CLOSED) == 0)
        !           691:                warn_msg(p, "document not closed");
1.1       schwarze  692:        return p->tree;
                    693: }
CVSweb