=================================================================== RCS file: /cvs/docbook2mdoc/parse.c,v retrieving revision 1.41 retrieving revision 1.51 diff -u -p -r1.41 -r1.51 --- docbook2mdoc/parse.c 2019/04/13 13:06:35 1.41 +++ docbook2mdoc/parse.c 2019/04/24 18:38:02 1.51 @@ -1,4 +1,4 @@ -/* $Id: parse.c,v 1.41 2019/04/13 13:06:35 schwarze Exp $ */ +/* $Id: parse.c,v 1.51 2019/04/24 18:38:02 schwarze Exp $ */ /* * Copyright (c) 2014 Kristaps Dzonsons * Copyright (c) 2019 Ingo Schwarze @@ -56,11 +56,13 @@ struct parse { int nline; /* Line number of next token. */ int ncol; /* Column number of next token. */ int del; /* Levels of nested nodes being deleted. */ + int nofill; /* Levels of open no-fill displays. */ int flags; #define PFLAG_WARN (1 << 0) /* Print warning messages. */ -#define PFLAG_SPC (1 << 1) /* Whitespace before the next element. */ -#define PFLAG_ATTR (1 << 2) /* The most recent attribute is valid. */ -#define PFLAG_EEND (1 << 3) /* This element is self-closing. */ +#define PFLAG_LINE (1 << 1) /* New line before the next element. */ +#define PFLAG_SPC (1 << 2) /* Whitespace before the next element. */ +#define PFLAG_ATTR (1 << 3) /* The most recent attribute is valid. */ +#define PFLAG_EEND (1 << 4) /* This element is self-closing. 
*/ }; struct alias { @@ -70,43 +72,60 @@ struct alias { static const struct alias aliases[] = { { "acronym", NODE_IGNORE }, + { "affiliation", NODE_IGNORE }, { "anchor", NODE_DELETE }, + { "application", NODE_COMMAND }, { "article", NODE_SECTION }, { "articleinfo", NODE_BOOKINFO }, { "book", NODE_SECTION }, { "chapter", NODE_SECTION }, + { "caption", NODE_IGNORE }, { "code", NODE_LITERAL }, { "computeroutput", NODE_LITERAL }, { "!doctype", NODE_DOCTYPE }, + { "figure", NODE_IGNORE }, { "firstname", NODE_PERSONNAME }, { "glossary", NODE_VARIABLELIST }, { "glossdef", NODE_IGNORE }, { "glossdiv", NODE_IGNORE }, { "glossentry", NODE_VARLISTENTRY }, { "glosslist", NODE_VARIABLELIST }, + { "holder", NODE_IGNORE }, + { "imageobject", NODE_IGNORE }, { "indexterm", NODE_DELETE }, { "informaltable", NODE_TABLE }, + { "keycap", NODE_KEYSYM }, + { "keycode", NODE_IGNORE }, + { "mediaobject", NODE_BLOCKQUOTE }, + { "orgname", NODE_IGNORE }, { "othercredit", NODE_AUTHOR }, { "othername", NODE_PERSONNAME }, { "part", NODE_SECTION }, { "phrase", NODE_IGNORE }, { "primary", NODE_DELETE }, + { "property", NODE_PARAMETER }, { "refsect1", NODE_SECTION }, { "refsect2", NODE_SECTION }, { "refsect3", NODE_SECTION }, { "refsection", NODE_SECTION }, + { "releaseinfo", NODE_IGNORE }, + { "returnvalue", NODE_IGNORE }, { "secondary", NODE_DELETE }, { "sect1", NODE_SECTION }, { "sect2", NODE_SECTION }, + { "sect3", NODE_SECTION }, + { "sect4", NODE_SECTION }, { "sgmltag", NODE_MARKUP }, { "simpara", NODE_PARA }, { "structfield", NODE_PARAMETER }, { "structname", NODE_TYPE }, { "surname", NODE_PERSONNAME }, { "symbol", NODE_CONSTANT }, + { "tag", NODE_MARKUP }, { "trademark", NODE_IGNORE }, { "ulink", NODE_LINK }, { "userinput", NODE_LITERAL }, + { "year", NODE_IGNORE }, { NULL, NODE_IGNORE } }; @@ -243,7 +262,7 @@ xml_text(struct parse *p, const char *word, int sz) n->b[oldsz++] = ' '; memcpy(n->b + oldsz, word, sz); n->b[newsz] = '\0'; - p->flags &= ~PFLAG_SPC; + p->flags &= ~(PFLAG_LINE | 
PFLAG_SPC); return; } @@ -255,8 +274,9 @@ xml_text(struct parse *p, const char *word, int sz) if ((n = pnode_alloc(p->cur)) == NULL) fatal(p); n->node = NODE_TEXT; - n->spc = (p->flags & PFLAG_SPC) != 0; - p->flags &= ~PFLAG_SPC; + n->flags = ((p->flags & PFLAG_LINE) ? NFLAG_LINE : 0) | + ((p->flags & PFLAG_SPC) ? NFLAG_SPC : 0); + p->flags &= ~(PFLAG_LINE | PFLAG_SPC); /* * If this node follows an in-line macro without intervening @@ -264,12 +284,13 @@ xml_text(struct parse *p, const char *word, int sz) * and do not keep it open. */ - np = n->spc ? NULL : TAILQ_PREV(n, pnodeq, child); + np = n->flags & NFLAG_SPC ? NULL : TAILQ_PREV(n, pnodeq, child); while (np != NULL) { switch (pnode_class(np->node)) { case CLASS_VOID: case CLASS_TEXT: case CLASS_BLOCK: + case CLASS_NOFILL: np = NULL; break; case CLASS_TRANS: @@ -301,7 +322,7 @@ xml_text(struct parse *p, const char *word, int sz) if ((n = pnode_alloc(p->cur)) == NULL) fatal(p); n->node = NODE_TEXT; - n->spc = 1; + n->flags |= NFLAG_SPC; word += i; sz -= i; } @@ -352,7 +373,7 @@ pnode_closetext(struct parse *p, int check_last_word) if ((n = pnode_alloc(p->cur)) == NULL) fatal(p); n->node = NODE_TEXT; - n->spc = 1; + n->flags |= NFLAG_SPC; if ((n->b = strdup(last_word)) == NULL) fatal(p); } @@ -364,6 +385,7 @@ xml_entity(struct parse *p, const char *name) struct pnode *n; const char *ccp; char *cp; + unsigned int codepoint; enum pstate pstate; if (p->del > 0) @@ -393,7 +415,7 @@ xml_entity(struct parse *p, const char *name) if ((ccp = pnode_getattr_raw(n, ATTRKEY_SYSTEM, NULL)) != NULL) { parse_file(p, -1, ccp); - p->flags &= ~PFLAG_SPC; + p->flags &= ~(PFLAG_LINE | PFLAG_SPC); return; } if ((ccp = pnode_getattr_raw(n, @@ -403,11 +425,21 @@ xml_entity(struct parse *p, const char *name) fatal(p); pstate = PARSE_ELEM; parse_string(p, cp, strlen(cp), &pstate, 0); - p->flags &= ~PFLAG_SPC; + p->flags &= ~(PFLAG_LINE | PFLAG_SPC); free(cp); return; } } + if (*name == '#') { + codepoint = strtonum(name + 1, 0, 0x10ffff, 
&ccp); + if (ccp == NULL) { + if ((n = pnode_alloc(p->cur)) == NULL || + asprintf(&n->b, "\\[u%4.4X]", + codepoint) < 0) + fatal(p); + goto done; + } + } error_msg(p, "unknown entity &%s;", name); return; } @@ -416,9 +448,11 @@ xml_entity(struct parse *p, const char *name) if ((n = pnode_alloc(p->cur)) == NULL || (n->b = strdup(entity->roff)) == NULL) fatal(p); +done: n->node = NODE_ESCAPE; - n->spc = (p->flags & PFLAG_SPC) != 0; - p->flags &= ~PFLAG_SPC; + n->flags = ((p->flags & PFLAG_LINE) ? NFLAG_LINE : 0) | + ((p->flags & PFLAG_SPC) ? NFLAG_SPC : 0); + p->flags &= ~(PFLAG_LINE | PFLAG_SPC); } /* @@ -501,6 +535,7 @@ xml_elem_start(struct parse *p, const char *name) case NODE_DOCTYPE: case NODE_ENTITY: case NODE_SBR: + case NODE_VOID: p->flags |= PFLAG_EEND; break; default: @@ -509,10 +544,14 @@ xml_elem_start(struct parse *p, const char *name) switch (pnode_class(p->ncur)) { case CLASS_LINE: case CLASS_ENCL: - n->spc = (p->flags & PFLAG_SPC) != 0; + n->flags = ((p->flags & PFLAG_LINE) ? NFLAG_LINE : 0) | + ((p->flags & PFLAG_SPC) ? NFLAG_SPC : 0); break; + case CLASS_NOFILL: + p->nofill++; + /* FALLTHROUGH */ default: - n->spc = 1; + n->flags |= NFLAG_SPC; break; } p->cur = n; @@ -532,7 +571,7 @@ xml_attrkey(struct parse *p, const char *name) const char *value; enum attrkey key; - if (p->del > 0 || p->ncur == NODE_IGNORE || *name == '\0') + if (p->del > 0 || p->ncur >= NODE_UNKNOWN || *name == '\0') return; if ((p->ncur == NODE_DOCTYPE || p->ncur == NODE_ENTITY) && @@ -569,7 +608,7 @@ xml_attrval(struct parse *p, const char *name) { struct pattr *a; - if (p->del > 0 || p->ncur == NODE_IGNORE || + if (p->del > 0 || p->ncur >= NODE_UNKNOWN || (p->flags & PFLAG_ATTR) == 0) return; if ((a = TAILQ_LAST(&p->cur->attrq, pattrq)) == NULL) @@ -603,6 +642,7 @@ xml_elem_end(struct parse *p, const char *name) if (p->del == 0) pnode_closetext(p, 0); + n = p->cur; node = name == NULL ? 
p->ncur : xml_name2node(p, name); switch (node) { @@ -615,8 +655,7 @@ xml_elem_end(struct parse *p, const char *name) case NODE_UNKNOWN: break; case NODE_INCLUDE: - n = p->cur; - p->cur = p->cur->parent; + p->cur = n->parent; cp = pnode_getattr_raw(n, ATTRKEY_HREF, NULL); if (cp == NULL) error_msg(p, "<xi:include> element " @@ -624,17 +663,20 @@ xml_elem_end(struct parse *p, const char *name) else parse_file(p, -1, cp); pnode_unlink(n); - p->flags &= ~PFLAG_SPC; + p->flags &= ~(PFLAG_LINE | PFLAG_SPC); break; case NODE_DOCTYPE: case NODE_SBR: + case NODE_VOID: p->flags &= ~PFLAG_EEND; /* FALLTHROUGH */ default: - if (p->cur == NULL || node != p->cur->node) { + if (n == NULL || node != n->node) { warn_msg(p, "element not open: </%s>", name); break; } + if (pnode_class(node) == CLASS_NOFILL) + p->nofill--; /* * Refrain from actually closing the document element. @@ -643,13 +685,21 @@ xml_elem_end(struct parse *p, const char *name) * obviously better than discarding it or crashing. */ - if (p->cur->parent != NULL || node == NODE_DOCTYPE) { - p->cur = p->cur->parent; + if (n->parent != NULL || node == NODE_DOCTYPE) { + p->cur = n->parent; if (p->cur != NULL) p->ncur = p->cur->node; } else p->tree->flags |= TREE_CLOSED; - p->flags &= ~PFLAG_SPC; + p->flags &= ~(PFLAG_LINE | PFLAG_SPC); + + /* Include a file containing entity declarations. */ + + if (node == NODE_ENTITY && strcmp("%", + pnode_getattr_raw(n, ATTRKEY_NAME, "")) == 0 && + (cp = pnode_getattr_raw(n, ATTRKEY_SYSTEM, NULL)) != NULL) + parse_file(p, -1, cp); + break; } assert(p->del == 0); @@ -742,11 +792,12 @@ parse_string(struct parse *p, char *b, size_t rlen, enum pstate *pstate, int refill) { char *cp; + size_t pws; /* Parse offset including whitespace. */ size_t poff; /* Parse offset in b[]. */ size_t pend; /* Offset of the end of the current word. */ int elem_end; - pend = 0; + pend = pws = 0; for (;;) { /* Proceed to the next token, skipping whitespace. 
*/ @@ -759,6 +810,10 @@ parse_string(struct parse *p, char *b, size_t rlen, break; if (isspace((unsigned char)b[pend])) { p->flags |= PFLAG_SPC; + if (b[pend] == '\n') { + p->flags |= PFLAG_LINE; + pws = pend + 1; + } increment(p, b, &pend, refill); continue; } @@ -920,10 +975,13 @@ parse_string(struct parse *p, char *b, size_t rlen, advance(p, b, rlen, &pend, p->ncur == NODE_DOCTYPE ? "<&]\n" : "<&\n", refill); + if (p->nofill) + poff = pws; xml_text(p, b + poff, pend - poff); if (b[pend] == '\n') pnode_closetext(p, 0); } + pws = pend; } return poff; }