docbook2mdoc/parse.c - annotate

Return to parse.c CVS log
Up to [cvsweb.bsd.lv] / docbook2mdoc
Annotation of docbook2mdoc/parse.c, Revision 1.39

1.39    ! schwarze    1: /* $Id: parse.c,v 1.38 2019/04/12 11:37:09 schwarze Exp $ */
1.1       schwarze    2: /*
                      3:  * Copyright (c) 2014 Kristaps Dzonsons <kristaps@bsd.lv>
                      4:  * Copyright (c) 2019 Ingo Schwarze <schwarze@openbsd.org>
                      5:  *
                      6:  * Permission to use, copy, modify, and distribute this software for any
                      7:  * purpose with or without fee is hereby granted, provided that the above
                      8:  * copyright notice and this permission notice appear in all copies.
                      9:  *
                     10:  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
                     11:  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
                     12:  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
                     13:  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
                     14:  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
                     15:  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
                     16:  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
                     17:  */
                     18: #include <assert.h>
                     19: #include <ctype.h>
1.24      schwarze   20: #include <errno.h>
                     21: #include <fcntl.h>
                     22: #include <libgen.h>
1.6       schwarze   23: #include <stdarg.h>
1.1       schwarze   24: #include <stdio.h>
1.5       schwarze   25: #include <stdlib.h>
1.1       schwarze   26: #include <string.h>
                     27: #include <unistd.h>
                     28:
                     29: #include "node.h"
                     30: #include "parse.h"
                     31:
                     32: /*
                     33:  * The implementation of the DocBook parser.
                     34:  */
                     35:
1.14      schwarze   36: enum   pstate {
                                /*
                                 * Lexer state that parse_string() receives by pointer;
                                 * xml_entity() starts parsing an entity definition in
                                 * PARSE_ELEM.
                                 * NOTE(review): the names suggest element content, tag
                                 * name, attribute, single-quoted and double-quoted value;
                                 * confirm against parse_string(), not shown here.
                                 */
                     37:        PARSE_ELEM,
                     38:        PARSE_TAG,
                     39:        PARSE_ARG,
                     40:        PARSE_SQ,
                     41:        PARSE_DQ
                     42: };
                     43:
1.1       schwarze   44: /*
                     45:  * Global parse state.
                     46:  * Keep this as simple and small as possible.
                     47:  */
                     48: struct parse {
                     49:        const char      *fname;  /* Name of the input file. */
                     50:        struct ptree    *tree;   /* Complete parse result. */
1.23      schwarze   51:        struct pnode    *doctype; /* First DOCTYPE node seen; searched for entities. */
1.1       schwarze   52:        struct pnode    *cur;    /* Current node in the tree. */
1.5       schwarze   53:        enum nodeid      ncur;   /* Type of the current node. */
                     54:        int              line;   /* Line number in the input file. */
                     55:        int              col;    /* Column number in the input file. */
                     56:        int              nline;  /* Line number of next token. */
                     57:        int              ncol;   /* Column number of next token. */
1.4       schwarze   58:        int              del;    /* Levels of nested nodes being deleted. */
1.23      schwarze   59:        int              flags;  /* Bitwise OR of the PFLAG_* values below. */
                     60: #define        PFLAG_WARN       (1 << 0)  /* Print warning messages. */
                     61: #define        PFLAG_SPC        (1 << 1)  /* Whitespace before the next element. */
                     62: #define        PFLAG_ATTR       (1 << 2)  /* The most recent attribute is valid. */
                     63: #define        PFLAG_EEND       (1 << 3)  /* This element is self-closing. */
1.1       schwarze   64: };
                     65:
1.39    ! schwarze   66: struct alias {
                                /*
                                 * One additional element name that maps onto an
                                 * existing node type; see xml_name2node().
                                 */
1.1       schwarze   67:        const char      *name;   /* DocBook element name. */
                     68:        enum nodeid      node;   /* Node type to generate. */
                     69: };
                     70:
1.39    ! schwarze   71: static const struct alias aliases[] = {
                                /*
                                 * Searched linearly by xml_name2node(), but only for
                                 * names that pnode_parse() did not resolve.
                                 * The entry with a NULL name terminates the table.
                                 */
1.3       schwarze   72:        { "acronym",            NODE_IGNORE },
1.4       schwarze   73:        { "anchor",             NODE_DELETE },
1.22      schwarze   74:        { "article",            NODE_SECTION },
                     75:        { "book",               NODE_SECTION },
1.1       schwarze   76:        { "chapter",            NODE_SECTION },
1.13      schwarze   77:        { "code",               NODE_LITERAL },
1.36      schwarze   78:        { "computeroutput",     NODE_LITERAL },
1.23      schwarze   79:        { "!doctype",           NODE_DOCTYPE },
1.7       schwarze   80:        { "firstname",          NODE_PERSONNAME },
1.21      schwarze   81:        { "glossary",           NODE_VARIABLELIST },
                     82:        { "glossdef",           NODE_IGNORE },
                     83:        { "glossdiv",           NODE_IGNORE },
                     84:        { "glossentry",         NODE_VARLISTENTRY },
                     85:        { "glosslist",          NODE_VARIABLELIST },
1.4       schwarze   86:        { "indexterm",          NODE_DELETE },
1.11      schwarze   87:        { "informaltable",      NODE_TABLE },
1.7       schwarze   88:        { "othername",          NODE_PERSONNAME },
1.1       schwarze   89:        { "part",               NODE_SECTION },
1.3       schwarze   90:        { "phrase",             NODE_IGNORE },
1.4       schwarze   91:        { "primary",            NODE_DELETE },
1.1       schwarze   92:        { "refsect1",           NODE_SECTION },
                     93:        { "refsect2",           NODE_SECTION },
                     94:        { "refsect3",           NODE_SECTION },
                     95:        { "refsection",         NODE_SECTION },
1.4       schwarze   96:        { "secondary",          NODE_DELETE },
1.1       schwarze   97:        { "sect1",              NODE_SECTION },
                     98:        { "sect2",              NODE_SECTION },
1.36      schwarze   99:        { "sgmltag",            NODE_MARKUP },
1.15      schwarze  100:        { "simpara",            NODE_PARA },
1.13      schwarze  101:        { "structfield",        NODE_PARAMETER },
                    102:        { "structname",         NODE_TYPE },
1.7       schwarze  103:        { "surname",            NODE_PERSONNAME },
1.12      schwarze  104:        { "symbol",             NODE_CONSTANT },
1.3       schwarze  105:        { "trademark",          NODE_IGNORE },
1.18      schwarze  106:        { "ulink",              NODE_LINK },
1.13      schwarze  107:        { "userinput",          NODE_LITERAL },
1.5       schwarze  108:        { NULL,                 NODE_IGNORE }
1.1       schwarze  109: };
                    110:
1.9       schwarze  111: struct entity {
                    112:        const char      *name;   /* Entity name without the "&" and ";". */
                    113:        const char      *roff;   /* Text copied into the NODE_ESCAPE node. */
                    114: };
                    115:
                    116: /*
                    117:  * XML character entity references found in the wild.
                    118:  * Those that don't have an exact mandoc_char(7) representation
                    119:  * are approximated, and the desired codepoint is given as a comment.
                    120:  * Encoding them as \\[u...] would leave -Tascii out in the cold.
                          *
                          * xml_entity() searches this table linearly with strcmp(3),
                          * so names are case-sensitive ("rarr" and "rArr" differ).
                          * The entry with NULL members terminates the table.
                    121:  */
                    122: static const struct entity entities[] = {
                    123:        { "alpha",      "\\(*a" },
                    124:        { "amp",        "&" },
                    125:        { "apos",       "'" },
                    126:        { "auml",       "\\(:a" },
                    127:        { "beta",       "\\(*b" },
                    128:        { "circ",       "^" },      /* U+02C6 */
                    129:        { "copy",       "\\(co" },
                    130:        { "dagger",     "\\(dg" },
                    131:        { "Delta",      "\\(*D" },
                    132:        { "eacute",     "\\('e" },
                    133:        { "emsp",       "\\ " },    /* U+2003 */
                    134:        { "gt",         ">" },
                    135:        { "hairsp",     "\\^" },
                    136:        { "kappa",      "\\(*k" },
                    137:        { "larr",       "\\(<-" },
                    138:        { "ldquo",      "\\(lq" },
                    139:        { "le",         "\\(<=" },
                    140:        { "lowbar",     "_" },
                    141:        { "lsqb",       "[" },
                    142:        { "lt",         "<" },
                    143:        { "mdash",      "\\(em" },
                    144:        { "minus",      "\\-" },
                    145:        { "ndash",      "\\(en" },
                    146:        { "nbsp",       "\\ " },
                    147:        { "num",        "#" },
                    148:        { "oslash",     "\\(/o" },
                    149:        { "ouml",       "\\(:o" },
                    150:        { "percnt",     "%" },
                    151:        { "quot",       "\\(dq" },
                    152:        { "rarr",       "\\(->" },
                    153:        { "rArr",       "\\(rA" },
                    154:        { "rdquo",      "\\(rq" },
                    155:        { "reg",        "\\(rg" },
                    156:        { "rho",        "\\(*r" },
                    157:        { "rsqb",       "]" },
                    158:        { "sigma",      "\\(*s" },
                    159:        { "shy",        "\\&" },     /* U+00AD */
                    160:        { "tau",        "\\(*t" },
                    161:        { "tilde",      "\\[u02DC]" },
                    162:        { "times",      "\\[tmu]" },
                    163:        { "uuml",       "\\(:u" },
                    164:        { NULL,         NULL }
                    165: };
                    166:
1.23      schwarze  167: static size_t   parse_string(struct parse *, char *, size_t,
                    168:                         enum pstate *, int);
1.24      schwarze  169: static void     parse_fd(struct parse *, int);
                         /* Forward declarations; xml_entity() below calls parse_string(). */
1.23      schwarze  170:
                    171:
1.6       schwarze  172: static void
1.29      schwarze  173: fatal(struct parse *p)
                    174: {
                                /*
                                 * Print the current input position followed by the
                                 * errno message from perror(3), then exit with status 6.
                                 * This function does not return.
                                 */
                    175:        fprintf(stderr, "%s:%d:%d: FATAL: ", p->fname, p->line, p->col);
                    176:        perror(NULL);
                    177:        exit(6);
                    178: }
                    179:
                    180: static void
1.6       schwarze  181: error_msg(struct parse *p, const char *fmt, ...)
                    182: {
                    183:        va_list          ap;
                    184:
                                /*
                                 * Print "file:line:col: ERROR: ", the printf-style
                                 * message and a newline to stderr, and record the
                                 * error in the tree flags.  Parsing continues.
                                 */
1.29      schwarze  185:        fprintf(stderr, "%s:%d:%d: ERROR: ", p->fname, p->line, p->col);
1.6       schwarze  186:        va_start(ap, fmt);
                    187:        vfprintf(stderr, fmt, ap);
                    188:        va_end(ap);
                    189:        fputc('\n', stderr);
1.29      schwarze  190:        p->tree->flags |= TREE_ERROR;
1.6       schwarze  191: }
                    192:
                    193: static void
                    194: warn_msg(struct parse *p, const char *fmt, ...)
                    195: {
                    196:        va_list          ap;
                    197:
                                /*
                                 * Without PFLAG_WARN, a warning is neither printed
                                 * nor recorded in the tree flags.
                                 */
1.23      schwarze  198:        if ((p->flags & PFLAG_WARN) == 0)
1.6       schwarze  199:                return;
                    200:
                                /* Same output format as error_msg(), but tagged WARNING. */
1.29      schwarze  201:        fprintf(stderr, "%s:%d:%d: WARNING: ", p->fname, p->line, p->col);
1.6       schwarze  202:        va_start(ap, fmt);
                    203:        vfprintf(stderr, fmt, ap);
                    204:        va_end(ap);
                    205:        fputc('\n', stderr);
1.29      schwarze  206:        p->tree->flags |= TREE_WARN;
1.6       schwarze  207: }
                    208:
1.1       schwarze  209: /*
                    210:  * Process a string of characters.
                    211:  * If a text node is already open, append to it.
                    212:  * Otherwise, create a new one as a child of the current node.
                          *
                          * Only the first sz bytes of word are used; sz must be positive.
                          * The text is silently dropped while nodes are being deleted
                          * (p->del > 0) and reported as an error if there is no current
                          * node yet.
                    213:  */
                    214: static void
1.35      schwarze  215: xml_text(struct parse *p, const char *word, int sz)
1.1       schwarze  216: {
1.35      schwarze  217:        struct pnode    *n, *np;
1.32      schwarze  218:        size_t           oldsz, newsz;
1.35      schwarze  219:        int              i;
1.1       schwarze  220:
1.32      schwarze  221:        assert(sz > 0);
1.30      schwarze  222:        if (p->del > 0)
1.1       schwarze  223:                return;
                    224:
1.32      schwarze  225:        if ((n = p->cur) == NULL) {
1.35      schwarze  226:                error_msg(p, "discarding text before document: %.*s",
                    227:                    sz, word);
1.5       schwarze  228:                return;
                    229:        }
                    230:
1.35      schwarze  231:        /* Append to the current text node, if one is open. */
                    232:
                    233:        if (n->node == NODE_TEXT) {
                    234:                oldsz = strlen(n->b);
                    235:                newsz = oldsz + sz;
                                        /*
                                         * A pending PFLAG_SPC becomes one blank between the
                                         * old and the new text, unless the old text is empty.
                                         */
                    236:                if (oldsz && (p->flags & PFLAG_SPC))
                    237:                        newsz++;
                    238:                if ((n->b = realloc(n->b, newsz + 1)) == NULL)
1.30      schwarze  239:                        fatal(p);
1.35      schwarze  240:                if (oldsz && (p->flags & PFLAG_SPC))
                    241:                        n->b[oldsz++] = ' ';
                    242:                memcpy(n->b + oldsz, word, sz);
                    243:                n->b[newsz] = '\0';
                    244:                p->flags &= ~PFLAG_SPC;
                    245:                return;
1.1       schwarze  246:        }
                    247:
1.35      schwarze  248:        if (p->tree->flags & TREE_CLOSED && n == p->tree->root)
1.30      schwarze  249:                warn_msg(p, "text after end of document: %.*s", sz, word);
1.5       schwarze  250:
1.35      schwarze  251:        /* Create a new text node. */
1.1       schwarze  252:
1.35      schwarze  253:        if ((n = pnode_alloc(p->cur)) == NULL)
1.30      schwarze  254:                fatal(p);
1.35      schwarze  255:        n->node = NODE_TEXT;
                    256:        n->spc = (p->flags & PFLAG_SPC) != 0;
1.30      schwarze  257:        p->flags &= ~PFLAG_SPC;
1.35      schwarze  258:
                    259:        /*
1.39    ! schwarze  260:         * If this node follows an in-line macro without intervening
1.35      schwarze  261:         * whitespace, keep the text in it as short as possible,
                    262:         * and do not keep it open.
                    263:         */
                    264:
                                /*
                                 * Start from the node preceding n, unless whitespace
                                 * precedes n.  For a CLASS_TRANS node, continue with its
                                 * last child.  np is left non-NULL only if the search
                                 * stops at a CLASS_LINE or CLASS_ENCL node.
                                 */
1.39    ! schwarze  265:        np = n->spc ? NULL : TAILQ_PREV(n, pnodeq, child);
        !           266:        while (np != NULL) {
        !           267:                switch (pnode_class(np->node)) {
        !           268:                case CLASS_VOID:
        !           269:                case CLASS_TEXT:
        !           270:                case CLASS_BLOCK:
        !           271:                        np = NULL;
        !           272:                        break;
        !           273:                case CLASS_TRANS:
        !           274:                        np = TAILQ_LAST(&np->childq, pnodeq);
        !           275:                        continue;
        !           276:                case CLASS_LINE:
        !           277:                case CLASS_ENCL:
        !           278:                        break;
        !           279:                }
        !           280:                break;
        !           281:        }
        !           282:        if (np != NULL) {
                                        /*
                                         * The first node gets only the bytes up to the first
                                         * whitespace.  Both returns below leave p->cur
                                         * unchanged, so this node is not kept open.
                                         */
1.35      schwarze  283:                i = 0;
                    284:                while (i < sz && !isspace((unsigned char)word[i]))
                    285:                        i++;
                    286:                if ((n->b = strndup(word, i)) == NULL)
                    287:                        fatal(p);
                    288:                if (i == sz)
                    289:                        return;
                                        /* Skip the whitespace; if nothing follows, remember it. */
                    290:                while (i < sz && isspace((unsigned char)word[i]))
                    291:                        i++;
                    292:                if (i == sz) {
                    293:                        p->flags |= PFLAG_SPC;
                    294:                        return;
                    295:                }
                    296:
                    297:                /* Put any remaining text into a second node. */
                    298:
                    299:                if ((n = pnode_alloc(p->cur)) == NULL)
                    300:                        fatal(p);
                    301:                n->node = NODE_TEXT;
                    302:                n->spc = 1;
                    303:                word += i;
                    304:                sz -= i;
                    305:        }
                    306:        if ((n->b = strndup(word, sz)) == NULL)
                    307:                fatal(p);
                    308:
                    309:        /* The new node remains open for later pnode_closetext(). */
                    310:
                    311:        p->cur = n;
1.1       schwarze  312: }
                    313:
1.16      schwarze  314: /*
                    315:  * Close out the text node and strip trailing whitespace, if one is open.
                          *
                          * Nothing happens unless the current node is a text node;
                          * otherwise the current node becomes its parent.
                          * PFLAG_SPC is set if any trailing whitespace was stripped.
                          * If check_last_word is non-zero and PFLAG_SPC is not set,
                          * the last word is split off into a new text node, provided
                          * that other text precedes it.
                    316:  */
1.1       schwarze  317: static void
1.37      schwarze  318: pnode_closetext(struct parse *p, int check_last_word)
1.1       schwarze  319: {
1.16      schwarze  320:        struct pnode    *n;
1.37      schwarze  321:        char            *cp, *last_word;
1.16      schwarze  322:
                    323:        if ((n = p->cur) == NULL || n->node != NODE_TEXT)
                    324:                return;
                    325:        p->cur = n->parent;
                                /*
                                 * Walk back from the terminating NUL: the loop body
                                 * records the whitespace in PFLAG_SPC and the increment
                                 * expression overwrites one trailing byte with NUL.
                                 */
1.32      schwarze  326:        for (cp = strchr(n->b, '\0');
                    327:            cp > n->b && isspace((unsigned char)cp[-1]);
                    328:            *--cp = '\0')
1.23      schwarze  329:                p->flags |= PFLAG_SPC;
1.37      schwarze  330:
                                /* PFLAG_SPC may also have been set before this call. */
                    331:        if (p->flags & PFLAG_SPC || !check_last_word)
                    332:                return;
                    333:
                    334:        /*
                    335:         * Find the beginning of the last word
                    336:         * and delete whitespace before it.
                    337:         */
                    338:
                    339:        while (cp > n->b && !isspace((unsigned char)cp[-1]))
                    340:                cp--;
                                /* The text consists of a single word: nothing to split. */
                    341:        if (cp == n->b)
                    342:                return;
                    343:
                                /*
                                 * Writing NUL over the separating whitespace truncates
                                 * n->b; last_word still points at the word behind it.
                                 */
                    344:        last_word = cp;
                    345:        while (cp > n->b && isspace((unsigned char)cp[-1]))
                    346:            *--cp = '\0';
                    347:
                    348:        /* Move the last word into its own node, for use with .Pf. */
                    349:
                    350:        if ((n = pnode_alloc(p->cur)) == NULL)
                    351:                fatal(p);
                    352:        n->node = NODE_TEXT;
                    353:        n->spc = 1;
                    354:        if ((n->b = strdup(last_word)) == NULL)
                    355:                fatal(p);
1.1       schwarze  356: }
                    357:
1.9       schwarze  358: static void
                    359: xml_entity(struct parse *p, const char *name)
                    360: {
                                /*
                                 * Handle the entity reference &name;.
                                 * Names found in entities[] become a NODE_ESCAPE node.
                                 * Other names are looked up among the children of the
                                 * DOCTYPE node: a SYSTEM attribute causes the named file
                                 * to be parsed, a DEFINITION attribute causes its value
                                 * to be parsed as input.  Anything else is an error.
                                 */
                    361:        const struct entity     *entity;
1.30      schwarze  362:        struct pnode            *n;
1.23      schwarze  363:        const char              *ccp;
                    364:        char                    *cp;
                    365:        enum pstate              pstate;
1.9       schwarze  366:
                    367:        if (p->del > 0)
                    368:                return;
                    369:
                    370:        if (p->cur == NULL) {
                    371:                error_msg(p, "discarding entity before document: &%s;", name);
                    372:                return;
                    373:        }
                    374:
1.37      schwarze  375:        pnode_closetext(p, 0);
1.9       schwarze  376:
                    377:        if (p->tree->flags & TREE_CLOSED && p->cur == p->tree->root)
                    378:                warn_msg(p, "entity after end of document: &%s;", name);
                    379:
                                /* If nothing matches, entity ends up at the NULL sentinel. */
                    380:        for (entity = entities; entity->name != NULL; entity++)
                    381:                if (strcmp(name, entity->name) == 0)
                    382:                        break;
                    383:
                    384:        if (entity->roff == NULL) {
1.23      schwarze  385:                if (p->doctype != NULL) {
1.30      schwarze  386:                        TAILQ_FOREACH(n, &p->doctype->childq, child) {
                                                        /* Skip children declaring another name. */
                    387:                                if ((ccp = pnode_getattr_raw(n,
1.23      schwarze  388:                                     ATTRKEY_NAME, NULL)) == NULL ||
1.25      schwarze  389:                                    strcmp(ccp, name) != 0)
                    390:                                        continue;
                                                        /* SYSTEM takes precedence over DEFINITION. */
1.30      schwarze  391:                                if ((ccp = pnode_getattr_raw(n,
1.25      schwarze  392:                                    ATTRKEY_SYSTEM, NULL)) != NULL) {
                    393:                                        parse_file(p, -1, ccp);
                    394:                                        p->flags &= ~PFLAG_SPC;
                    395:                                        return;
                    396:                                }
1.30      schwarze  397:                                if ((ccp = pnode_getattr_raw(n,
1.23      schwarze  398:                                     ATTRKEY_DEFINITION, NULL)) == NULL)
                    399:                                        continue;
                                                        /*
                                                         * parse_string() takes a non-const buffer,
                                                         * so hand it a private copy.
                                                         */
1.29      schwarze  400:                                if ((cp = strdup(ccp)) == NULL)
                    401:                                        fatal(p);
1.23      schwarze  402:                                pstate = PARSE_ELEM;
                    403:                                parse_string(p, cp, strlen(cp), &pstate, 0);
                    404:                                p->flags &= ~PFLAG_SPC;
                    405:                                free(cp);
                    406:                                return;
                    407:                        }
                    408:                }
1.9       schwarze  409:                error_msg(p, "unknown entity &%s;", name);
                    410:                return;
                    411:        }
                    412:
                    413:        /* Create, append, and close out an entity node. */
1.34      schwarze  414:        if ((n = pnode_alloc(p->cur)) == NULL ||
1.32      schwarze  415:            (n->b = strdup(entity->roff)) == NULL)
1.29      schwarze  416:                fatal(p);
1.30      schwarze  417:        n->node = NODE_ESCAPE;
                    418:        n->spc = (p->flags & PFLAG_SPC) != 0;
1.23      schwarze  419:        p->flags &= ~PFLAG_SPC;
1.9       schwarze  420: }
                    421:
1.1       schwarze  422: /*
1.39    ! schwarze  423:  * Parse an element name.
                          *
                          * A result of pnode_parse() below NODE_UNKNOWN is returned as is.
                          * Otherwise the aliases[] table is searched for an exact match,
                          * and NODE_UNKNOWN is returned if that fails, too.
                          * The argument p is not used.
        !           424:  */
        !           425: static enum nodeid
        !           426: xml_name2node(struct parse *p, const char *name)
        !           427: {
        !           428:        const struct alias      *alias;
        !           429:        enum nodeid              node;
        !           430:
        !           431:        if ((node = pnode_parse(name)) < NODE_UNKNOWN)
        !           432:                return node;
        !           433:
        !           434:        for (alias = aliases; alias->name != NULL; alias++)
        !           435:                if (strcmp(alias->name, name) == 0)
        !           436:                        return alias->node;
        !           437:
        !           438:        return NODE_UNKNOWN;
        !           439: }
        !           440:
        !           441: /*
1.1       schwarze  442:  * Begin an element.
                          *
                          * The name is resolved with xml_name2node() and stored in p->ncur.
                          * Deleted, ignored and unknown elements return before a node is
                          * allocated.  Otherwise any open text node is closed and a new
                          * node becomes the current node.
                    443:  */
                    444: static void
1.30      schwarze  445: xml_elem_start(struct parse *p, const char *name)
1.1       schwarze  446: {
1.30      schwarze  447:        struct pnode            *n;
1.1       schwarze  448:
1.4       schwarze  449:        /*
                    450:         * An ancestor is excluded from the tree;
                    451:         * keep track of the number of levels excluded.
                    452:         */
1.30      schwarze  453:        if (p->del > 0) {
                                        /* Names starting with '!' or '?' do not add a level. */
1.23      schwarze  454:                if (*name != '!' && *name != '?')
1.30      schwarze  455:                        p->del++;
1.4       schwarze  456:                return;
                    457:        }
                    458:
1.39    ! schwarze  459:        switch (p->ncur = xml_name2node(p, name)) {
1.4       schwarze  460:        case NODE_DELETE_WARN:
1.30      schwarze  461:                warn_msg(p, "skipping element <%s>", name);
1.2       schwarze  462:                /* FALLTHROUGH */
1.4       schwarze  463:        case NODE_DELETE:
1.30      schwarze  464:                p->del = 1;
1.4       schwarze  465:                /* FALLTHROUGH */
1.2       schwarze  466:        case NODE_IGNORE:
                    467:                return;
1.39    ! schwarze  468:        case NODE_UNKNOWN:
                                        /* Unknown names starting with '!' or '?' are skipped silently. */
        !           469:                if (*name != '!' && *name != '?')
        !           470:                        error_msg(p, "unknown element <%s>", name);
        !           471:                return;
1.2       schwarze  472:        default:
                    473:                break;
                    474:        }
1.1       schwarze  475:
1.30      schwarze  476:        if (p->tree->flags & TREE_CLOSED && p->cur->parent == NULL)
                    477:                warn_msg(p, "element after end of document: <%s>", name);
1.5       schwarze  478:
                                /*
                                 * Only before CLASS_LINE and CLASS_ENCL nodes is
                                 * pnode_closetext() asked to split off the last word.
                                 */
1.39    ! schwarze  479:        switch (pnode_class(p->ncur)) {
        !           480:        case CLASS_LINE:
        !           481:        case CLASS_ENCL:
        !           482:                pnode_closetext(p, 1);
        !           483:                break;
        !           484:        default:
        !           485:                pnode_closetext(p, 0);
        !           486:                break;
        !           487:        }
        !           488:
1.34      schwarze  489:        if ((n = pnode_alloc(p->cur)) == NULL)
1.30      schwarze  490:                fatal(p);
1.17      schwarze  491:
                    492:        /*
1.39    ! schwarze  493:         * Some elements are self-closing.
1.17      schwarze  494:         * Nodes that begin a new macro or request line or start by
                    495:         * printing text always want whitespace before themselves.
                    496:         */
                    497:
1.39    ! schwarze  498:        switch (n->node = p->ncur) {
1.23      schwarze  499:        case NODE_DOCTYPE:
                    500:        case NODE_ENTITY:
                    501:        case NODE_SBR:
1.30      schwarze  502:                p->flags |= PFLAG_EEND;
1.17      schwarze  503:                break;
                    504:        default:
1.39    ! schwarze  505:                break;
        !           506:        }
                                /*
                                 * CLASS_LINE and CLASS_ENCL nodes record whether whitespace
                                 * actually preceded them; all other classes get spc = 1.
                                 */
        !           507:        switch (pnode_class(p->ncur)) {
        !           508:        case CLASS_LINE:
        !           509:        case CLASS_ENCL:
1.30      schwarze  510:                n->spc = (p->flags & PFLAG_SPC) != 0;
1.17      schwarze  511:                break;
1.39    ! schwarze  512:        default:
        !           513:                n->spc = 1;
        !           514:                break;
1.17      schwarze  515:        }
1.30      schwarze  516:        p->cur = n;
                                /*
                                 * Remember the first DOCTYPE node.  The first other node
                                 * without a parent becomes the root of the tree.
                                 */
                    517:        if (n->node == NODE_DOCTYPE) {
                    518:                if (p->doctype == NULL)
                    519:                        p->doctype = n;
1.23      schwarze  520:                else
1.30      schwarze  521:                        error_msg(p, "duplicate doctype");
                    522:        } else if (n->parent == NULL && p->tree->root == NULL)
                    523:                p->tree->root = n;
1.5       schwarze  524: }
                    525:
                    526: static void
1.30      schwarze  527: xml_attrkey(struct parse *p, const char *name)
1.5       schwarze  528: {
                                /*
                                 * Handle an attribute name of the current element.
                                 * A recognized key is appended to the attribute queue
                                 * of the current node.  PFLAG_ATTR then tells
                                 * xml_attrval() whether to store the value that follows.
                                 */
1.30      schwarze  529:        struct pattr    *a;
1.23      schwarze  530:        const char      *value;
1.5       schwarze  531:        enum attrkey     key;
1.1       schwarze  532:
1.30      schwarze  533:        if (p->del > 0 || p->ncur == NODE_IGNORE || *name == '\0')
1.5       schwarze  534:                return;
1.23      schwarze  535:
                                /*
                                 * The first word in a DOCTYPE or ENTITY node is not a
                                 * key: store it as the value of a "NAME" attribute.
                                 */
1.30      schwarze  536:        if ((p->ncur == NODE_DOCTYPE || p->ncur == NODE_ENTITY) &&
                    537:            TAILQ_FIRST(&p->cur->attrq) == NULL) {
1.23      schwarze  538:                value = name;
                    539:                name = "NAME";
                    540:        } else
                    541:                value = NULL;
                    542:
                                /* Unknown key: make xml_attrval() skip the value. */
1.5       schwarze  543:        if ((key = attrkey_parse(name)) == ATTRKEY__MAX) {
1.30      schwarze  544:                p->flags &= ~PFLAG_ATTR;
1.5       schwarze  545:                return;
                    546:        }
1.30      schwarze  547:        if ((a = calloc(1, sizeof(*a))) == NULL)
                    548:                fatal(p);
1.29      schwarze  549:
1.30      schwarze  550:        a->key = key;
                    551:        a->val = ATTRVAL__MAX;
1.23      schwarze  552:        if (value == NULL) {
1.30      schwarze  553:                a->rawval = NULL;
                    554:                p->flags |= PFLAG_ATTR;
1.23      schwarze  555:        } else {
1.30      schwarze  556:                if ((a->rawval = strdup(value)) == NULL)
                    557:                        fatal(p);
                    558:                p->flags &= ~PFLAG_ATTR;
                    559:        }
                    560:        TAILQ_INSERT_TAIL(&p->cur->attrq, a, child);
                                /*
                                 * After the name of an ENTITY, add a "DEFINITION"
                                 * attribute right away.  The recursive call sets
                                 * PFLAG_ATTR, so the next value is stored in it.
                                 */
                    561:        if (p->ncur == NODE_ENTITY && key == ATTRKEY_NAME)
                    562:                xml_attrkey(p, "DEFINITION");
1.5       schwarze  563: }
                    564:
                    565: static void
1.30      schwarze  566: xml_attrval(struct parse *p, const char *name)
1.5       schwarze  567: {
1.30      schwarze  568:        struct pattr    *a;
1.5       schwarze  569:
1.30      schwarze  570:        if (p->del > 0 || p->ncur == NODE_IGNORE ||
                    571:            (p->flags & PFLAG_ATTR) == 0)
1.5       schwarze  572:                return;
1.30      schwarze  573:        if ((a = TAILQ_LAST(&p->cur->attrq, pattrq)) == NULL)
1.5       schwarze  574:                return;
1.30      schwarze  575:        if ((a->val = attrval_parse(name)) == ATTRVAL__MAX &&
                    576:            (a->rawval = strdup(name)) == NULL)
                    577:                fatal(p);
                    578:        p->flags &= ~PFLAG_ATTR;
1.1       schwarze  579: }
                    580:
                    581: /*
                    582:  * Roll up the parse tree.
                    583:  * If we're at a text node, roll that one up first.
                    584:  */
                    585: static void
1.31      schwarze  586: xml_elem_end(struct parse *p, const char *name)
1.1       schwarze  587: {
1.26      schwarze  588:        struct pnode            *n;
                    589:        const char              *cp;
1.5       schwarze  590:        enum nodeid              node;
1.1       schwarze  591:
1.4       schwarze  592:        /*
                    593:         * An ancestor is excluded from the tree;
                    594:         * keep track of the number of levels excluded.
                    595:         */
1.31      schwarze  596:        if (p->del > 1) {
                    597:                p->del--;
1.4       schwarze  598:                return;
                    599:        }
                    600:
1.31      schwarze  601:        if (p->del == 0)
1.37      schwarze  602:                pnode_closetext(p, 0);
1.2       schwarze  603:
1.39    ! schwarze  604:        node = name == NULL ? p->ncur : xml_name2node(p, name);
1.2       schwarze  605:
1.5       schwarze  606:        switch (node) {
1.4       schwarze  607:        case NODE_DELETE_WARN:
                    608:        case NODE_DELETE:
1.31      schwarze  609:                if (p->del > 0)
                    610:                        p->del--;
1.4       schwarze  611:                break;
1.2       schwarze  612:        case NODE_IGNORE:
1.39    ! schwarze  613:        case NODE_UNKNOWN:
1.26      schwarze  614:                break;
                    615:        case NODE_INCLUDE:
1.31      schwarze  616:                n = p->cur;
                    617:                p->cur = p->cur->parent;
1.26      schwarze  618:                cp = pnode_getattr_raw(n, ATTRKEY_HREF, NULL);
                    619:                if (cp == NULL)
1.31      schwarze  620:                        error_msg(p, "<xi:include> element "
1.26      schwarze  621:                            "without href attribute");
                    622:                else
1.31      schwarze  623:                        parse_file(p, -1, cp);
1.26      schwarze  624:                pnode_unlink(n);
1.31      schwarze  625:                p->flags &= ~PFLAG_SPC;
1.2       schwarze  626:                break;
1.23      schwarze  627:        case NODE_DOCTYPE:
1.32      schwarze  628:        case NODE_SBR:
1.31      schwarze  629:                p->flags &= ~PFLAG_EEND;
1.23      schwarze  630:                /* FALLTHROUGH */
1.2       schwarze  631:        default:
1.31      schwarze  632:                if (p->cur == NULL || node != p->cur->node) {
                    633:                        warn_msg(p, "element not open: </%s>", name);
1.5       schwarze  634:                        break;
                    635:                }
                    636:
                    637:                /*
                    638:                 * Refrain from actually closing the document element.
                    639:                 * If no more content follows, no harm is done, but if
                    640:                 * some content still follows, simply processing it is
                    641:                 * obviously better than discarding it or crashing.
                    642:                 */
                    643:
1.31      schwarze  644:                if (p->cur->parent != NULL || node == NODE_DOCTYPE) {
                    645:                        p->cur = p->cur->parent;
                    646:                        if (p->cur != NULL)
                    647:                                p->ncur = p->cur->node;
1.23      schwarze  648:                } else
1.31      schwarze  649:                        p->tree->flags |= TREE_CLOSED;
                    650:                p->flags &= ~PFLAG_SPC;
1.4       schwarze  651:                break;
1.2       schwarze  652:        }
1.31      schwarze  653:        assert(p->del == 0);
1.1       schwarze  654: }
                    655:
                    656: struct parse *
                    657: parse_alloc(int warn)
                    658: {
                    659:        struct parse    *p;
                    660:
                    661:        if ((p = calloc(1, sizeof(*p))) == NULL)
                    662:                return NULL;
                    663:
                    664:        if ((p->tree = calloc(1, sizeof(*p->tree))) == NULL) {
                    665:                free(p);
                    666:                return NULL;
                    667:        }
1.23      schwarze  668:        if (warn)
                    669:                p->flags |= PFLAG_WARN;
                    670:        else
                    671:                p->flags &= ~PFLAG_WARN;
1.1       schwarze  672:        return p;
                    673: }
                    674:
                    675: void
                    676: parse_free(struct parse *p)
                    677: {
                    678:        if (p == NULL)
                    679:                return;
                    680:        if (p->tree != NULL) {
                    681:                pnode_unlink(p->tree->root);
                    682:                free(p->tree);
                    683:        }
                    684:        free(p);
                    685: }
                    686:
1.14      schwarze  687: static void
                    688: increment(struct parse *p, char *b, size_t *pend, int refill)
                    689: {
                    690:        if (refill) {
                    691:                if (b[*pend] == '\n') {
                    692:                        p->nline++;
                    693:                        p->ncol = 1;
                    694:                } else
                    695:                        p->ncol++;
                    696:        }
                    697:        ++*pend;
                    698: }
                    699:
1.5       schwarze  700: /*
                    701:  * Advance the pend pointer to the next character in the charset.
                    702:  * If the charset starts with a space, it stands for any whitespace.
                    703:  * Update the new input file position, used for messages.
                    704:  * Do not overrun the buffer b of length rlen.
                    705:  * When reaching the end, NUL-terminate the buffer and return 1;
                    706:  * otherwise, return 0.
                    707:  */
                    708: static int
                    709: advance(struct parse *p, char *b, size_t rlen, size_t *pend,
1.14      schwarze  710:     const char *charset, int refill)
1.5       schwarze  711: {
                    712:        int              space;
                    713:
                    714:        if (*charset == ' ') {
                    715:                space = 1;
                    716:                charset++;
                    717:        } else
                    718:                space = 0;
                    719:
1.14      schwarze  720:        if (refill) {
                    721:                p->nline = p->line;
                    722:                p->ncol = p->col;
                    723:        }
1.5       schwarze  724:        while (*pend < rlen) {
                    725:                if (space && isspace((unsigned char)b[*pend]))
                    726:                        break;
                    727:                if (strchr(charset, b[*pend]) != NULL)
                    728:                        break;
1.14      schwarze  729:                increment(p, b, pend, refill);
1.5       schwarze  730:        }
                    731:        if (*pend == rlen) {
                    732:                b[rlen] = '\0';
1.14      schwarze  733:                return refill;
1.5       schwarze  734:        } else
                    735:                return 0;
                    736: }
                    737:
1.14      schwarze  738: size_t
                    739: parse_string(struct parse *p, char *b, size_t rlen,
                    740:     enum pstate *pstate, int refill)
                    741: {
                    742:        char            *cp;
                    743:        size_t           poff;  /* Parse offset in b[]. */
                    744:        size_t           pend;  /* Offset of the end of the current word. */
                    745:        int              elem_end;
                    746:
                    747:        pend = 0;
                    748:        for (;;) {
                    749:
                    750:                /* Proceed to the next token, skipping whitespace. */
                    751:
                    752:                if (refill) {
                    753:                        p->line = p->nline;
                    754:                        p->col = p->ncol;
                    755:                }
                    756:                if ((poff = pend) == rlen)
                    757:                        break;
                    758:                if (isspace((unsigned char)b[pend])) {
1.23      schwarze  759:                        p->flags |= PFLAG_SPC;
1.14      schwarze  760:                        increment(p, b, &pend, refill);
                    761:                        continue;
                    762:                }
                    763:
                    764:                /*
                    765:                 * The following four cases (ARG, TAG, and starting an
                    766:                 * entity or a tag) all parse a word or quoted string.
                    767:                 * If that extends beyond the read buffer and the last
                    768:                 * read(2) still got data, they all break out of the
                    769:                 * token loop to request more data from the read loop.
                    770:                 *
                    771:                 * Also, three of them detect self-closing tags, those
                    772:                 * ending with "/>", setting the flag elem_end and
                    773:                 * calling xml_elem_end() at the very end, after
                    774:                 * handling the attribute value, attribute name, or
                    775:                 * tag name, respectively.
                    776:                 */
                    777:
                    778:                /* Parse an attribute value. */
                    779:
                    780:                if (*pstate >= PARSE_ARG) {
                    781:                        if (*pstate == PARSE_ARG &&
                    782:                            (b[pend] == '\'' || b[pend] == '"')) {
                    783:                                *pstate = b[pend] == '"' ?
                    784:                                    PARSE_DQ : PARSE_SQ;
                    785:                                increment(p, b, &pend, refill);
                    786:                                continue;
                    787:                        }
                    788:                        if (advance(p, b, rlen, &pend,
                    789:                            *pstate == PARSE_DQ ? "\"" :
                    790:                            *pstate == PARSE_SQ ? "'" : " >", refill))
                    791:                                break;
                    792:                        *pstate = PARSE_TAG;
                    793:                        elem_end = 0;
                    794:                        if (b[pend] == '>') {
                    795:                                *pstate = PARSE_ELEM;
                    796:                                if (pend > 0 && b[pend - 1] == '/') {
                    797:                                        b[pend - 1] = '\0';
                    798:                                        elem_end = 1;
                    799:                                }
1.23      schwarze  800:                                if (p->flags & PFLAG_EEND)
                    801:                                        elem_end = 1;
1.14      schwarze  802:                        }
                    803:                        b[pend] = '\0';
                    804:                        if (pend < rlen)
                    805:                                increment(p, b, &pend, refill);
                    806:                        xml_attrval(p, b + poff);
                    807:                        if (elem_end)
                    808:                                xml_elem_end(p, NULL);
                    809:
                    810:                /* Look for an attribute name. */
                    811:
                    812:                } else if (*pstate == PARSE_TAG) {
1.23      schwarze  813:                        switch (p->ncur) {
                    814:                        case NODE_DOCTYPE:
                    815:                                if (b[pend] == '[') {
                    816:                                        *pstate = PARSE_ELEM;
                    817:                                        increment(p, b, &pend, refill);
                    818:                                        continue;
                    819:                                }
                    820:                                /* FALLTHROUGH */
                    821:                        case NODE_ENTITY:
                    822:                                if (b[pend] == '"' || b[pend] == '\'') {
                    823:                                        *pstate = PARSE_ARG;
                    824:                                        continue;
                    825:                                }
                    826:                                break;
                    827:                        default:
                    828:                                break;
                    829:                        }
1.14      schwarze  830:                        if (advance(p, b, rlen, &pend, " =>", refill))
                    831:                                break;
                    832:                        elem_end = 0;
                    833:                        switch (b[pend]) {
                    834:                        case '>':
                    835:                                *pstate = PARSE_ELEM;
                    836:                                if (pend > 0 && b[pend - 1] == '/') {
                    837:                                        b[pend - 1] = '\0';
                    838:                                        elem_end = 1;
                    839:                                }
1.23      schwarze  840:                                if (p->flags & PFLAG_EEND)
                    841:                                        elem_end = 1;
1.14      schwarze  842:                                break;
                    843:                        case '=':
                    844:                                *pstate = PARSE_ARG;
                    845:                                break;
                    846:                        default:
                    847:                                break;
                    848:                        }
                    849:                        b[pend] = '\0';
                    850:                        if (pend < rlen)
                    851:                                increment(p, b, &pend, refill);
                    852:                        xml_attrkey(p, b + poff);
                    853:                        if (elem_end)
                    854:                                xml_elem_end(p, NULL);
                    855:
                    856:                /* Begin an opening or closing tag. */
                    857:
                    858:                } else if (b[poff] == '<') {
                    859:                        if (advance(p, b, rlen, &pend, " >", refill))
                    860:                                break;
                    861:                        if (pend > poff + 3 &&
                    862:                            strncmp(b + poff, "<!--", 4) == 0) {
                    863:
                    864:                                /* Skip a comment. */
                    865:
                    866:                                cp = strstr(b + pend - 2, "-->");
                    867:                                if (cp == NULL) {
                    868:                                        if (refill)
                    869:                                                break;
                    870:                                        cp = b + rlen;
                    871:                                } else
                    872:                                        cp += 3;
                    873:                                while (b + pend < cp)
                    874:                                        increment(p, b, &pend, refill);
                    875:                                continue;
                    876:                        }
                    877:                        elem_end = 0;
                    878:                        if (b[pend] != '>')
                    879:                                *pstate = PARSE_TAG;
                    880:                        else if (pend > 0 && b[pend - 1] == '/') {
                    881:                                b[pend - 1] = '\0';
                    882:                                elem_end = 1;
                    883:                        }
                    884:                        b[pend] = '\0';
                    885:                        if (pend < rlen)
                    886:                                increment(p, b, &pend, refill);
                    887:                        if (b[++poff] == '/') {
                    888:                                elem_end = 1;
                    889:                                poff++;
1.23      schwarze  890:                        } else {
1.14      schwarze  891:                                xml_elem_start(p, b + poff);
1.23      schwarze  892:                                if (*pstate == PARSE_ELEM &&
                    893:                                    p->flags & PFLAG_EEND)
                    894:                                        elem_end = 1;
                    895:                        }
1.14      schwarze  896:                        if (elem_end)
                    897:                                xml_elem_end(p, b + poff);
                    898:
1.23      schwarze  899:                /* Close a doctype. */
                    900:
                    901:                } else if (p->ncur == NODE_DOCTYPE && b[poff] == ']') {
                    902:                        *pstate = PARSE_TAG;
                    903:                        increment(p, b, &pend, refill);
                    904:
1.14      schwarze  905:                /* Process an entity. */
                    906:
                    907:                } else if (b[poff] == '&') {
                    908:                        if (advance(p, b, rlen, &pend, ";", refill))
                    909:                                break;
                    910:                        b[pend] = '\0';
                    911:                        if (pend < rlen)
                    912:                                increment(p, b, &pend, refill);
                    913:                        xml_entity(p, b + poff + 1);
                    914:
                    915:                /* Process text up to the next tag, entity, or EOL. */
                    916:
                    917:                } else {
1.28      schwarze  918:                        advance(p, b, rlen, &pend,
1.33      schwarze  919:                            p->ncur == NODE_DOCTYPE ? "<&]\n" : "<&\n",
1.28      schwarze  920:                            refill);
1.35      schwarze  921:                        xml_text(p, b + poff, pend - poff);
1.33      schwarze  922:                        if (b[pend] == '\n')
1.37      schwarze  923:                                pnode_closetext(p, 0);
1.14      schwarze  924:                }
                    925:        }
                    926:        return poff;
                    927: }
                    928:
1.24      schwarze  929:
                    930: /*
                    931:  * The read loop.
                    932:  * If the previous token was incomplete and asked for more input,
                    933:  * we have to enter the read loop once more even on EOF.
                    934:  * Once rsz is 0, incomplete tokens will no longer ask for more input
                    935:  * but instead use whatever there is, and then exit the read loop.
                    936:  * The minus one on the size limit for read(2) is needed such that
                    937:  * advance() can set b[rlen] to NUL when needed.
                    938:  */
                    939: static void
                    940: parse_fd(struct parse *p, int fd)
1.1       schwarze  941: {
                    942:        char             b[4096];
1.5       schwarze  943:        ssize_t          rsz;   /* Return value from read(2). */
1.14      schwarze  944:        size_t           rlen;  /* Number of bytes in b[]. */
1.5       schwarze  945:        size_t           poff;  /* Parse offset in b[]. */
1.14      schwarze  946:        enum pstate      pstate;
1.1       schwarze  947:
1.24      schwarze  948:        rlen = 0;
1.14      schwarze  949:        pstate = PARSE_ELEM;
                    950:        while ((rsz = read(fd, b + rlen, sizeof(b) - rlen - 1)) >= 0 &&
                    951:            (rlen += rsz) > 0) {
                    952:                poff = parse_string(p, b, rlen, &pstate, rsz > 0);
1.5       schwarze  953:                /* Buffer exhausted; shift left and re-fill. */
                    954:                assert(poff > 0);
                    955:                rlen -= poff;
1.14      schwarze  956:                memmove(b, b + poff, rlen);
1.5       schwarze  957:        }
1.24      schwarze  958:        if (rsz < 0)
                    959:                error_msg(p, "read: %s", strerror(errno));
                    960: }
                    961:
                    962: /*
                    963:  * Open and parse a file.
                    964:  */
                    965: struct ptree *
                    966: parse_file(struct parse *p, int fd, const char *fname)
                    967: {
                    968:        const char      *save_fname;
                    969:        int              save_line, save_col;
                    970:
                    971:        /* Save and initialize reporting data. */
                    972:
                    973:        save_fname = p->fname;
                    974:        save_line = p->nline;
                    975:        save_col = p->ncol;
                    976:        p->fname = fname;
                    977:        p->line = 0;
                    978:        p->col = 0;
                    979:
                    980:        /* Open the file, unless it is already open. */
                    981:
                    982:        if (fd == -1 && (fd = open(fname, O_RDONLY, 0)) == -1) {
                    983:                error_msg(p, "open: %s", strerror(errno));
                    984:                p->fname = save_fname;
                    985:                return p->tree;
1.5       schwarze  986:        }
1.24      schwarze  987:
                    988:        /*
                    989:         * After opening the starting file, change to the directory it
                    990:         * is located in, in case it wants to include any further files,
                    991:         * which are typically given with relative paths in DocBook.
                    992:         * Do this on a best-effort basis; don't complain about failure.
                    993:         */
                    994:
                    995:        if (save_fname == NULL && (fname = dirname(fname)) != NULL &&
                    996:            strcmp(fname, ".") != 0)
                    997:                (void)chdir(fname);
                    998:
                    999:        /* Run the read loop. */
                   1000:
                   1001:        p->nline = 1;
                   1002:        p->ncol = 1;
                   1003:        parse_fd(p, fd);
                   1004:
                   1005:        /* On the top level, finalize the parse tree. */
                   1006:
                   1007:        if (save_fname == NULL) {
1.37      schwarze 1008:                pnode_closetext(p, 0);
1.24      schwarze 1009:                if (p->tree->root == NULL)
                   1010:                        error_msg(p, "empty document");
                   1011:                else if ((p->tree->flags & TREE_CLOSED) == 0)
                   1012:                        warn_msg(p, "document not closed");
                   1013:                pnode_unlink(p->doctype);
                   1014:        }
                   1015:
                   1016:        /* Clean up. */
                   1017:
                   1018:        if (fd != STDIN_FILENO)
                   1019:                close(fd);
                   1020:        p->fname = save_fname;
                   1021:        p->nline = save_line;
                   1022:        p->ncol = save_col;
1.1       schwarze 1023:        return p->tree;
                   1024: }
CVSweb