docbook2mdoc/parse.c - diff

Return to parse.c CVS log

Up to [cvsweb.bsd.lv] / docbook2mdoc

Diff for /docbook2mdoc/parse.c between version 1.2 and 1.25

-version 1.2, 2019/03/26 20:54:43
+version 1.25, 2019/04/08 23:40:17
 Line 17
 Line 17
 Line 17
   */
  #include <assert.h>
  #include <ctype.h>
- #include <expat.h>
+ #include <errno.h>
+ #include <fcntl.h>
+ #include <libgen.h>
+ #include <stdarg.h>
  #include <stdio.h>
+ #include <stdlib.h>
  #include <string.h>
  #include <unistd.h>
-Line 29
+Line 33
 Line 29
 Line 33
   * The implementation of the DocBook parser.
   */
+ enum    pstate {     /* Parser state; parse_string() receives a pointer to it. */
+         PARSE_ELEM,  /* State in which xml_entity() starts a nested parse. */
+         PARSE_TAG,
+         PARSE_ARG,   /* parse_string() tests ">= PARSE_ARG" to detect an */
+         PARSE_SQ,    /* attribute value, so ARG, SQ, and DQ must stay last; */
+         PARSE_DQ     /* SQ/DQ presumably mean single/double quoted - confirm. */
+ };
  /*
   * Global parse state.
   * Keep this as simple and small as possible.
   */
  struct  parse {
-         XML_Parser       xml;
          const char      *fname;  /* Name of the input file. */
          struct ptree    *tree;   /* Complete parse result. */
+         struct pnode    *doctype;  /* First !DOCTYPE node seen, or NULL. */
          struct pnode    *cur;    /* Current node in the tree. */
-         int              warn;
+         enum nodeid      ncur;   /* Type of the current node. */
+         int              line;   /* Line number in the input file. */
+         int              col;    /* Column number in the input file. */
+         int              nline;  /* Line number of next token. */
+         int              ncol;   /* Column number of next token. */
+         int              del;    /* Levels of nested nodes being deleted. */
+         int              flags;  /* Bitwise OR of the PFLAG_* values below. */
+ #define PFLAG_WARN       (1 << 0)  /* Print warning messages. */
+ #define PFLAG_SPC        (1 << 1)  /* Whitespace before the next element. */
+ #define PFLAG_ATTR       (1 << 2)  /* The most recent attribute is valid. */
+ #define PFLAG_EEND       (1 << 3)  /* This element is self-closing. */
  };
  struct  element {
-Line 47  struct element {
+Line 69  struct element {
 Line 47  struct element {
 Line 69  struct element {
  };
  static  const struct element elements[] = {
-         { "acronym",            NODE_ACRONYM },
+         { "acronym",            NODE_IGNORE },
          { "affiliation",        NODE_AFFILIATION },
-         { "anchor",             NODE_ANCHOR },
+         { "anchor",             NODE_DELETE },
+         { "appendix",           NODE_APPENDIX },
          { "application",        NODE_APPLICATION },
          { "arg",                NODE_ARG },
+         { "article",            NODE_SECTION },
          { "author",             NODE_AUTHOR },
          { "authorgroup",        NODE_AUTHORGROUP },
          { "blockquote",         NODE_BLOCKQUOTE },
-         { "book",               NODE_BOOK },
+         { "book",               NODE_SECTION },
          { "bookinfo",           NODE_BOOKINFO },
          { "caution",            NODE_CAUTION },
          { "chapter",            NODE_SECTION },
          { "citerefentry",       NODE_CITEREFENTRY },
          { "citetitle",          NODE_CITETITLE },
          { "cmdsynopsis",        NODE_CMDSYNOPSIS },
-         { "code",               NODE_CODE },
+         { "code",               NODE_LITERAL },
          { "colspec",            NODE_COLSPEC },
          { "command",            NODE_COMMAND },
          { "constant",           NODE_CONSTANT },
+         { "contrib",            NODE_CONTRIB },
          { "copyright",          NODE_COPYRIGHT },
          { "date",               NODE_DATE },
+         { "!doctype",           NODE_DOCTYPE },
+         { "!DOCTYPE",           NODE_DOCTYPE },
          { "editor",             NODE_EDITOR },
          { "email",              NODE_EMAIL },
          { "emphasis",           NODE_EMPHASIS },
+         { "!ENTITY",            NODE_ENTITY },
          { "entry",              NODE_ENTRY },
          { "envar",              NODE_ENVAR },
+         { "errorname",          NODE_ERRORNAME },
          { "fieldsynopsis",      NODE_FIELDSYNOPSIS },
          { "filename",           NODE_FILENAME },
-         { "firstname",          NODE_FIRSTNAME },
+         { "firstname",          NODE_PERSONNAME },
          { "firstterm",          NODE_FIRSTTERM },
          { "footnote",           NODE_FOOTNOTE },
          { "funcdef",            NODE_FUNCDEF },
-Line 83  static const struct element elements[] = {
+Line 112  static const struct element elements[] = {
 Line 83  static const struct element elements[] = {
 Line 112  static const struct element elements[] = {
          { "funcsynopsis",       NODE_FUNCSYNOPSIS },
          { "funcsynopsisinfo",   NODE_FUNCSYNOPSISINFO },
          { "function",           NODE_FUNCTION },
+         { "glossary",           NODE_VARIABLELIST },
+         { "glossdef",           NODE_IGNORE },
+         { "glossdiv",           NODE_IGNORE },
+         { "glossentry",         NODE_VARLISTENTRY },
+         { "glosslist",          NODE_VARIABLELIST },
          { "glossterm",          NODE_GLOSSTERM },
          { "group",              NODE_GROUP },
          { "holder",             NODE_HOLDER },
          { "index",              NODE_INDEX },
-         { "indexterm",          NODE_INDEXTERM },
+         { "indexterm",          NODE_DELETE },
          { "info",               NODE_INFO },
          { "informalequation",   NODE_INFORMALEQUATION },
-         { "informaltable",      NODE_INFORMALTABLE },
+         { "informaltable",      NODE_TABLE },
          { "inlineequation",     NODE_INLINEEQUATION },
          { "itemizedlist",       NODE_ITEMIZEDLIST },
          { "keysym",             NODE_KEYSYM },
-Line 115  static const struct element elements[] = {
+Line 149  static const struct element elements[] = {
 Line 115  static const struct element elements[] = {
 Line 149  static const struct element elements[] = {
          { "option",             NODE_OPTION },
          { "orderedlist",        NODE_ORDEREDLIST },
          { "orgname",            NODE_ORGNAME },
-         { "othername",          NODE_OTHERNAME },
+         { "othername",          NODE_PERSONNAME },
          { "para",               NODE_PARA },
          { "paramdef",           NODE_PARAMDEF },
          { "parameter",          NODE_PARAMETER },
          { "part",               NODE_SECTION },
          { "personname",         NODE_PERSONNAME },
-         { "phrase",             NODE_PHRASE },
+         { "phrase",             NODE_IGNORE },
          { "preface",            NODE_PREFACE },
-         { "primary",            NODE_PRIMARY },
+         { "primary",            NODE_DELETE },
          { "programlisting",     NODE_PROGRAMLISTING },
          { "prompt",             NODE_PROMPT },
          { "quote",              NODE_QUOTE },
-Line 148  static const struct element elements[] = {
+Line 182  static const struct element elements[] = {
 Line 148  static const struct element elements[] = {
 Line 182  static const struct element elements[] = {
          { "row",                NODE_ROW },
          { "sbr",                NODE_SBR },
          { "screen",             NODE_SCREEN },
-         { "secondary",          NODE_SECONDARY },
+         { "secondary",          NODE_DELETE },
          { "sect1",              NODE_SECTION },
          { "sect2",              NODE_SECTION },
          { "section",            NODE_SECTION },
          { "sgmltag",            NODE_SGMLTAG },
+         { "simpara",            NODE_PARA },
          { "simplelist",         NODE_SIMPLELIST },
          { "spanspec",           NODE_SPANSPEC },
-         { "structname",         NODE_STRUCTNAME },
+         { "structfield",        NODE_PARAMETER },
+         { "structname",         NODE_TYPE },
          { "subtitle",           NODE_SUBTITLE },
-         { "surname",            NODE_SURNAME },
+         { "surname",            NODE_PERSONNAME },
+         { "symbol",             NODE_CONSTANT },
          { "synopsis",           NODE_SYNOPSIS },
          { "table",              NODE_TABLE },
          { "tbody",              NODE_TBODY },
-Line 167  static const struct element elements[] = {
+Line 204  static const struct element elements[] = {
 Line 167  static const struct element elements[] = {
 Line 204  static const struct element elements[] = {
          { "thead",              NODE_THEAD },
          { "tip",                NODE_TIP },
          { "title",              NODE_TITLE },
-         { "trademark",          NODE_TRADEMARK },
+         { "trademark",          NODE_IGNORE },
          { "type",               NODE_TYPE },
-         { "ulink",              NODE_ULINK },
+         { "ulink",              NODE_LINK },
-         { "userinput",          NODE_USERINPUT },
+         { "userinput",          NODE_LITERAL },
          { "variablelist",       NODE_VARIABLELIST },
          { "varlistentry",       NODE_VARLISTENTRY },
          { "varname",            NODE_VARNAME },
          { "warning",            NODE_WARNING },
          { "wordasword",         NODE_WORDASWORD },
-         { "xi:include",         NODE_WARN },
+         { "xi:include",         NODE_DELETE_WARN },
          { "year",               NODE_YEAR },
-         { NULL,                 NODE__MAX }
+         { NULL,                 NODE_IGNORE }
  };
+ struct  entity {
+         const char      *name;  /* Entity name as written between '&' and ';'. */
+         const char      *roff;  /* Replacement text copied into a NODE_ESCAPE node. */
+ };
  /*
+  * XML character entity references found in the wild.
+  * Those that don't have an exact mandoc_char(7) representation
+  * are approximated, and the desired codepoint is given as a comment.
+  * Encoding them as \\[u...] would leave -Tascii out in the cold.
+  */
+ static  const struct entity entities[] = {
+         { "alpha",      "\\(*a" },
+         { "amp",        "&" },
+         { "apos",       "'" },
+         { "auml",       "\\(:a" },
+         { "beta",       "\\(*b" },
+         { "circ",       "^" },      /* U+02C6 */
+         { "copy",       "\\(co" },
+         { "dagger",     "\\(dg" },
+         { "Delta",      "\\(*D" },
+         { "eacute",     "\\('e" },
+         { "emsp",       "\\ " },    /* U+2003 */
+         { "gt",         ">" },
+         { "hairsp",     "\\^" },
+         { "kappa",      "\\(*k" },
+         { "larr",       "\\(<-" },
+         { "ldquo",      "\\(lq" },
+         { "le",         "\\(<=" },
+         { "lowbar",     "_" },
+         { "lsqb",       "[" },
+         { "lt",         "<" },
+         { "mdash",      "\\(em" },
+         { "minus",      "\\-" },
+         { "ndash",      "\\(en" },
+         { "nbsp",       "\\ " },
+         { "num",        "#" },
+         { "oslash",     "\\(/o" },
+         { "ouml",       "\\(:o" },
+         { "percnt",     "%" },
+         { "quot",       "\\(dq" },
+         { "rarr",       "\\(->" },
+         { "rArr",       "\\(rA" },
+         { "rdquo",      "\\(rq" },
+         { "reg",        "\\(rg" },
+         { "rho",        "\\(*r" },
+         { "rsqb",       "]" },
+         { "sigma",      "\\(*s" },
+         { "shy",        "\\&" },     /* U+00AD */
+         { "tau",        "\\(*t" },
+         { "tilde",      "\\[u02DC]" },  /* NOTE(review): \[u...] form, unlike the rest - OK for -Tascii? */
+         { "times",      "\\[tmu]" },
+         { "uuml",       "\\(:u" },
+         { NULL,         NULL }      /* sentinel: xml_entity() stops at the NULL name */
+ };
+ static size_t    parse_string(struct parse *, char *, size_t,
+                          enum pstate *, int);
+ static void      parse_fd(struct parse *, int);
+ static void      /* Print "file:line:col: message" to stderr and flag the tree. */
+ error_msg(struct parse *p, const char *fmt, ...)
+ {
+         va_list          ap;
+         fprintf(stderr, "%s:%d:%d: ", p->fname, p->line, p->col);
+         va_start(ap, fmt);
+         vfprintf(stderr, fmt, ap);      /* fmt and its arguments are printf(3)-style */
+         va_end(ap);
+         fputc('\n', stderr);            /* callers pass fmt without a trailing newline */
+         p->tree->flags |= TREE_FAIL;    /* unlike warn_msg(), record the failure */
+ }
+ static void      /* Like error_msg(), but tagged "warning:" and without TREE_FAIL. */
+ warn_msg(struct parse *p, const char *fmt, ...)
+ {
+         va_list          ap;
+         if ((p->flags & PFLAG_WARN) == 0)       /* warnings were not requested */
+                 return;
+         fprintf(stderr, "%s:%d:%d: warning: ", p->fname, p->line, p->col);
+         va_start(ap, fmt);
+         vfprintf(stderr, fmt, ap);      /* fmt and its arguments are printf(3)-style */
+         va_end(ap);
+         fputc('\n', stderr);
+ }
+ /*
   * Process a string of characters.
   * If a text node is already open, append to it.
   * Otherwise, create a new one as a child of the current node.
   */
  static void
- xml_char(void *arg, const XML_Char *p, int sz)
+ xml_char(struct parse *ps, const char *p, int sz)
  {
-         struct parse    *ps;
          struct pnode    *dat;
-         int              i;
+         size_t           newsz;
-         ps = arg;
+         if (ps->del > 0)
-         if (ps->tree->flags && TREE_FAIL)
                  return;
-         /*
+         if (ps->cur == NULL) {
-          * Only create a new node if there is non-whitespace text.
+                 error_msg(ps, "discarding text before document: %.*s", sz, p);
-          * Strip all leading whitespace.
+                 return;
-          */
+         }
-         if (ps->cur->node != NODE_TEXT) {
-                 for (i = 0; i < sz; i++)
-                         if (isspace((unsigned char)p[i]) == 0)
-                                 break;
-                 if (i == sz)
-                         return;
-                 p += i;
-                 sz -= i;
+         if (ps->cur->node != NODE_TEXT) {
                  if ((dat = calloc(1, sizeof(*dat))) == NULL) {
                          perror(NULL);
                          exit(1);
                  }
                  dat->node = NODE_TEXT;
+                 dat->spc = (ps->flags & PFLAG_SPC) != 0;
                  dat->parent = ps->cur;
                  TAILQ_INIT(&dat->childq);
                  TAILQ_INIT(&dat->attrq);
-Line 222  xml_char(void *arg, const XML_Char *p, int sz)
+Line 340  xml_char(void *arg, const XML_Char *p, int sz)
 Line 222  xml_char(void *arg, const XML_Char *p, int sz)
 Line 340  xml_char(void *arg, const XML_Char *p, int sz)
                  ps->cur = dat;
          }
+         if (ps->tree->flags & TREE_CLOSED &&
+             ps->cur->parent == ps->tree->root)
+                 warn_msg(ps, "text after end of document: %.*s", sz, p);
          /* Append to the current text node. */
          assert(sz >= 0);
-         ps->cur->b = realloc(ps->cur->b, ps->cur->bsz + sz + 1);
+         newsz = ps->cur->bsz + (ps->cur->bsz && (ps->flags & PFLAG_SPC)) + sz;
+         ps->cur->b = realloc(ps->cur->b, newsz + 1);
          if (ps->cur->b == NULL) {
                  perror(NULL);
                  exit(1);
          }
+         if (ps->cur->bsz && (ps->flags & PFLAG_SPC))
+                 ps->cur->b[ps->cur->bsz++] = ' ';
          memcpy(ps->cur->b + ps->cur->bsz, p, sz);
-         ps->cur->bsz += sz;
+         ps->cur->b[ps->cur->bsz = newsz] = '\0';
-         ps->cur->b[ps->cur->bsz] = '\0';
          ps->cur->real = ps->cur->b;
+         ps->flags &= ~PFLAG_SPC;
  }
+ /*
+  * If a text node is open, close it out and strip its trailing whitespace,
+  * setting PFLAG_SPC if any whitespace was removed.
+  */
  static void
- pnode_trim(struct pnode *pn)
+ pnode_closetext(struct parse *p)        /* Leave the open text node, if any. */
  {
-         assert(pn->node == NODE_TEXT);
+         struct pnode    *n;
-         for (; pn->bsz > 0; pn->b[--pn->bsz] = '\0')
-                 if (isspace((unsigned char)pn->b[pn->bsz - 1]) == 0)
+         if ((n = p->cur) == NULL || n->node != NODE_TEXT)
+                 return;                 /* no text node is open */
+         p->cur = n->parent;             /* the parent becomes current again */
+         while (n->bsz > 0 && isspace((unsigned char)n->b[n->bsz - 1])) {
+                 n->b[--n->bsz] = '\0';
+                 p->flags |= PFLAG_SPC;  /* record the whitespace for the next node */
+         }
+ }
+ static void      /* Handle the entity reference &name; at the current position. */
+ xml_entity(struct parse *p, const char *name)
+ {
+         const struct entity     *entity;
+         struct pnode            *dat;
+         const char              *ccp;
+         char                    *cp;
+         enum pstate              pstate;
+         if (p->del > 0)                 /* inside a deleted subtree: drop it */
+                 return;
+         if (p->cur == NULL) {
+                 error_msg(p, "discarding entity before document: &%s;", name);
+                 return;
+         }
+         pnode_closetext(p);             /* the entity never joins an open text node */
+         if (p->tree->flags & TREE_CLOSED && p->cur == p->tree->root)
+                 warn_msg(p, "entity after end of document: &%s;", name);
+         for (entity = entities; entity->name != NULL; entity++)  /* built-in table first */
+                 if (strcmp(name, entity->name) == 0)
                          break;
+         if (entity->roff == NULL) {     /* hit the { NULL, NULL } sentinel: not built in */
+                 if (p->doctype != NULL) {
+                         TAILQ_FOREACH(dat, &p->doctype->childq, child) {  /* children of the doctype node */
+                                 if ((ccp = pnode_getattr_raw(dat,
+                                      ATTRKEY_NAME, NULL)) == NULL ||
+                                     strcmp(ccp, name) != 0)
+                                         continue;
+                                 if ((ccp = pnode_getattr_raw(dat,
+                                     ATTRKEY_SYSTEM, NULL)) != NULL) {
+                                         parse_file(p, -1, ccp);  /* presumably parses that file in place; -1 fd meaning to confirm */
+                                         p->flags &= ~PFLAG_SPC;
+                                         return;
+                                 }
+                                 if ((ccp = pnode_getattr_raw(dat,
+                                      ATTRKEY_DEFINITION, NULL)) == NULL)
+                                         continue;
+                                 if ((cp = strdup(ccp)) == NULL) {  /* parse_string() needs a writable buffer */
+                                         perror(NULL);
+                                         exit(1);
+                                 }
+                                 pstate = PARSE_ELEM;
+                                 parse_string(p, cp, strlen(cp), &pstate, 0);  /* refill = 0: line/col stay untouched */
+                                 p->flags &= ~PFLAG_SPC;
+                                 free(cp);
+                                 return;
+                         }
+                 }
+                 error_msg(p, "unknown entity &%s;", name);
+                 return;
+         }
+         /* Create, append, and close out an entity node. */
+         if ((dat = calloc(1, sizeof(*dat))) == NULL ||
+             (dat->b = dat->real = strdup(entity->roff)) == NULL) {
+                 perror(NULL);
+                 exit(1);
+         }
+         dat->node = NODE_ESCAPE;
+         dat->bsz = strlen(dat->b);
+         dat->spc = (p->flags & PFLAG_SPC) != 0;  /* did whitespace precede the entity? */
+         dat->parent = p->cur;
+         TAILQ_INIT(&dat->childq);
+         TAILQ_INIT(&dat->attrq);
+         TAILQ_INSERT_TAIL(&p->cur->childq, dat, child);  /* p->cur is unchanged: the node is a closed leaf */
+         p->flags &= ~PFLAG_SPC;
  }
  /*
   * Begin an element.
-  * If the name is unknown, abort parsing.
   */
  static void
- xml_elem_start(void *arg, const XML_Char *name, const XML_Char **atts)
+ xml_elem_start(struct parse *ps, const char *name)
  {
-         struct parse     *ps;
+         const struct element    *elem;
-         const struct element *elem;
+         struct pnode            *dat;
-         enum attrkey      key;
-         struct pnode     *dat;
-         struct pattr     *pattr;
-         const XML_Char  **att;
-         ps = arg;
+         /*
-         if (ps->tree->flags && TREE_FAIL)
+          * An ancestor is excluded from the tree;
+          * keep track of the number of levels excluded.
+          */
+         if (ps->del > 0) {
+                 if (*name != '!' && *name != '?')
+                         ps->del++;
                  return;
-         /* Close out the text node, if there is one. */
-         if (ps->cur != NULL && ps->cur->node == NODE_TEXT) {
-                 pnode_trim(ps->cur);
-                 ps->cur = ps->cur->parent;
          }
+         pnode_closetext(ps);
          for (elem = elements; elem->name != NULL; elem++)
                  if (strcmp(elem->name, name) == 0)
                          break;
          if (elem->name == NULL) {
-                 fprintf(stderr, "%s:%zu:%zu: unknown element \"%s\"\n",
+                 if (*name == '!' || *name == '?')
-                         ps->fname, XML_GetCurrentLineNumber(ps->xml),
+                         return;
-                         XML_GetCurrentColumnNumber(ps->xml), name);
+                 error_msg(ps, "unknown element <%s>", name);
-                 ps->tree->flags |= TREE_FAIL;
-                 return;
          }
-         switch (elem->node) {
+         ps->ncur = elem->node;
-         case NODE_WARN:
-                 if (ps->warn)
+         switch (ps->ncur) {
-                         fprintf(stderr, "%s:%zu:%zu: warning: "
+         case NODE_DELETE_WARN:
-                             "ignoring element <%s>\n", ps->fname,
+                 warn_msg(ps, "skipping element <%s>", name);
-                             XML_GetCurrentLineNumber(ps->xml),
-                             XML_GetCurrentColumnNumber(ps->xml), name);
                  /* FALLTHROUGH */
+         case NODE_DELETE:
+                 ps->del = 1;
+                 /* FALLTHROUGH */
          case NODE_IGNORE:
                  return;
          case NODE_INLINEEQUATION:
-Line 298  xml_elem_start(void *arg, const XML_Char *name, const
+Line 500  xml_elem_start(void *arg, const XML_Char *name, const
 Line 298  xml_elem_start(void *arg, const XML_Char *name, const
 Line 500  xml_elem_start(void *arg, const XML_Char *name, const
                  break;
          }
+         if (ps->tree->flags & TREE_CLOSED && ps->cur->parent == NULL)
+                 warn_msg(ps, "element after end of document: <%s>", name);
          if ((dat = calloc(1, sizeof(*dat))) == NULL) {
                  perror(NULL);
                  exit(1);
          }
-         dat->node = elem->node;
+         /*
+          * Nodes that begin a new macro or request line or start by
+          * printing text always want whitespace before themselves.
+          */
+         switch (dat->node = elem->node) {
+         case NODE_DOCTYPE:
+         case NODE_ENTITY:
+         case NODE_SBR:
+                 ps->flags |= PFLAG_EEND;
+                 /* FALLTHROUGH */
+         case NODE_APPENDIX:
+         case NODE_AUTHORGROUP:
+         case NODE_BLOCKQUOTE:
+         case NODE_BOOKINFO:
+         case NODE_CAUTION:
+         case NODE_EDITOR:
+         case NODE_ENTRY:
+         case NODE_FUNCDEF:
+         case NODE_FUNCPROTOTYPE:
+         case NODE_INFORMALEQUATION:
+         case NODE_INLINEEQUATION:
+         case NODE_ITEMIZEDLIST:
+         case NODE_LEGALNOTICE:
+         case NODE_LITERALLAYOUT:
+         case NODE_NOTE:
+         case NODE_ORDEREDLIST:
+         case NODE_PARA:
+         case NODE_PREFACE:
+         case NODE_PROGRAMLISTING:
+         case NODE_REFMETA:
+         case NODE_REFNAMEDIV:
+         case NODE_REFSYNOPSISDIV:
+         case NODE_ROW:
+         case NODE_SCREEN:
+         case NODE_SECTION:
+         case NODE_SYNOPSIS:
+         case NODE_TGROUP:
+         case NODE_TIP:
+         case NODE_TITLE:
+         case NODE_VARIABLELIST:
+         case NODE_VARLISTENTRY:
+         case NODE_WARNING:
+                 dat->spc = 1;
+                 break;
+         default:
+                 dat->spc = (ps->flags & PFLAG_SPC) != 0;
+                 break;
+         }
          dat->parent = ps->cur;
          TAILQ_INIT(&dat->childq);
          TAILQ_INIT(&dat->attrq);
-Line 311  xml_elem_start(void *arg, const XML_Char *name, const
+Line 565  xml_elem_start(void *arg, const XML_Char *name, const
 Line 311  xml_elem_start(void *arg, const XML_Char *name, const
 Line 565  xml_elem_start(void *arg, const XML_Char *name, const
                  TAILQ_INSERT_TAIL(&ps->cur->childq, dat, child);
          ps->cur = dat;
-         if (ps->tree->root == NULL)
+         if (dat->node == NODE_DOCTYPE) {
+                 if (ps->doctype == NULL)
+                         ps->doctype = dat;
+                 else
+                         error_msg(ps, "duplicate doctype");
+         } else if (dat->parent == NULL && ps->tree->root == NULL)
                  ps->tree->root = dat;
+ }
-         /*
+ static void      /* Append a new attribute with key "name" to the current node. */
-          * Process attributes.
+ xml_attrkey(struct parse *ps, const char *name)
-          */
+ {
-         for (att = atts; *att != NULL; att += 2) {
+         struct pattr    *attr;
-                 if ((key = attrkey_parse(*att)) == ATTRKEY__MAX) {
+         const char      *value;
-                         if (ps->warn)
+         enum attrkey     key;
-                                 fprintf(stderr, "%s:%zu:%zu: warning: "
-                                     "unknown attribute \"%s\"\n",
+         if (ps->del > 0 || ps->ncur == NODE_IGNORE || *name == '\0')
-                                     ps->fname,
+                 return;
-                                     XML_GetCurrentLineNumber(ps->xml),
-                                     XML_GetCurrentColumnNumber(ps->xml),
+         if ((ps->ncur == NODE_DOCTYPE || ps->ncur == NODE_ENTITY) &&
-                                     *att);
+             TAILQ_FIRST(&ps->cur->attrq) == NULL) {     /* first word after !DOCTYPE or !ENTITY */
-                         continue;
+                 value = name;           /* that word becomes the value ... */
+                 name = "NAME";          /* ... of a synthetic NAME attribute */
+         } else
+                 value = NULL;
+         if ((key = attrkey_parse(name)) == ATTRKEY__MAX) {
+                 ps->flags &= ~PFLAG_ATTR;       /* unknown key: xml_attrval() will drop its value */
+                 return;
+         }
+         if ((attr = calloc(1, sizeof(*attr))) == NULL) {
+                 perror(NULL);
+                 exit(1);
+         }
+         attr->key = key;
+         attr->val = ATTRVAL__MAX;
+         if (value == NULL) {
+                 attr->rawval = NULL;
+                 ps->flags |= PFLAG_ATTR;        /* xml_attrval() may fill in this attribute */
+         } else {
+                 if ((attr->rawval = strdup(value)) == NULL) {
+                         perror(NULL);
+                         exit(1);
                  }
-                 pattr = calloc(1, sizeof(*pattr));
+                 ps->flags &= ~PFLAG_ATTR;       /* the value is already stored */
-                 pattr->key = key;
-                 if ((pattr->val = attrval_parse(att[1])) == ATTRVAL__MAX)
-                         pattr->rawval = strdup(att[1]);
-                 TAILQ_INSERT_TAIL(&dat->attrq, pattr, child);
          }
+         TAILQ_INSERT_TAIL(&ps->cur->attrq, attr, child);
+         if (ps->ncur == NODE_ENTITY && key == ATTRKEY_NAME)
+                 xml_attrkey(ps, "DEFINITION");  /* recurse once: open the slot for the entity's replacement text */
  }
+ static void      /* Store a value in the attribute most recently added by xml_attrkey(). */
+ xml_attrval(struct parse *ps, const char *name)  /* NB: "name" holds the value text */
+ {
+         struct pattr    *attr;
+         if (ps->del > 0 || ps->ncur == NODE_IGNORE ||
+             (ps->flags & PFLAG_ATTR) == 0)      /* no attribute is waiting for a value */
+                 return;
+         if ((attr = TAILQ_LAST(&ps->cur->attrq, pattrq)) == NULL)
+                 return;
+         if ((attr->val = attrval_parse(name)) == ATTRVAL__MAX &&  /* not a known value: keep the raw text */
+             (attr->rawval = strdup(name)) == NULL) {
+                 perror(NULL);
+                 exit(1);
+         }
+ }
  /*
   * Roll up the parse tree.
   * If we're at a text node, roll that one up first.
   */
  static void
- xml_elem_end(void *arg, const XML_Char *name)
+ xml_elem_end(struct parse *ps, const char *name)
  {
-         struct parse    *ps;
+         const struct element    *elem;
-         const struct element *elem;
+         enum nodeid              node;
-         ps = arg;
+         /*
-         if (ps->tree->flags && TREE_FAIL)
+          * An ancestor is excluded from the tree;
+          * keep track of the number of levels excluded.
+          */
+         if (ps->del > 1) {
+                 ps->del--;
                  return;
-         /* Close out the text node, if there is one. */
-         if (ps->cur->node == NODE_TEXT) {
-                 pnode_trim(ps->cur);
-                 ps->cur = ps->cur->parent;
          }
-         for (elem = elements; elem->name != NULL; elem++)
+         if (ps->del == 0)
-                 if (strcmp(elem->name, name) == 0)
+                 pnode_closetext(ps);
-                         break;
-         switch (elem->node) {
+         if (name != NULL) {
+                 for (elem = elements; elem->name != NULL; elem++)
+                         if (strcmp(elem->name, name) == 0)
+                                 break;
+                 node = elem->node;
+         } else
+                 node = ps->ncur;
+         switch (node) {
+         case NODE_DELETE_WARN:
+         case NODE_DELETE:
+                 if (ps->del > 0)
+                         ps->del--;
+                 break;
          case NODE_IGNORE:
-         case NODE_WARN:
                  break;
+         case NODE_DOCTYPE:
+                 ps->flags &= ~PFLAG_EEND;
+                 /* FALLTHROUGH */
          default:
-                 assert(elem->node == ps->cur->node);
+                 if (ps->cur == NULL || node != ps->cur->node) {
-                 ps->cur = ps->cur->parent;
+                         warn_msg(ps, "element not open: </%s>", name);
+                         break;
+                 }
+                 /*
+                  * Refrain from actually closing the document element.
+                  * If no more content follows, no harm is done, but if
+                  * some content still follows, simply processing it is
+                  * obviously better than discarding it or crashing.
+                  */
+                 if (ps->cur->parent != NULL || node == NODE_DOCTYPE) {
+                         ps->cur = ps->cur->parent;
+                         if (ps->cur != NULL)
+                                 ps->ncur = ps->cur->node;
+                 } else
+                         ps->tree->flags |= TREE_CLOSED;
+                 ps->flags &= ~PFLAG_SPC;
+                 break;
          }
+         assert(ps->del == 0);
  }
  struct parse *
-Line 382  parse_alloc(int warn)
+Line 711  parse_alloc(int warn)
 Line 382  parse_alloc(int warn)
 Line 711  parse_alloc(int warn)
                  free(p);
                  return NULL;
          }
+         if (warn)
-         if ((p->xml = XML_ParserCreate(NULL)) == NULL) {
+                 p->flags |= PFLAG_WARN;
-                 free(p->tree);
+         else
-                 free(p);
+                 p->flags &= ~PFLAG_WARN;
-                 return NULL;
-         }
-         p->warn = warn;
-         XML_SetCharacterDataHandler(p->xml, xml_char);
-         XML_SetElementHandler(p->xml, xml_elem_start, xml_elem_end);
-         XML_SetUserData(p->xml, p);
          return p;
  }
-Line 400  parse_free(struct parse *p)
+Line 723  parse_free(struct parse *p)
 Line 400  parse_free(struct parse *p)
 Line 723  parse_free(struct parse *p)
  {
          if (p == NULL)
                  return;
-         XML_ParserFree(p->xml);
          if (p->tree != NULL) {
                  pnode_unlink(p->tree->root);
                  free(p->tree);
-Line 408  parse_free(struct parse *p)
+Line 730  parse_free(struct parse *p)
 Line 408  parse_free(struct parse *p)
 Line 730  parse_free(struct parse *p)
          free(p);
  }
+ static void      /* Step *pend past b[*pend], tracking the next-token position. */
+ increment(struct parse *p, char *b, size_t *pend, int refill)
+ {
+         if (refill) {                   /* the position is tracked only when refill is set */
+                 if (b[*pend] == '\n') {
+                         p->nline++;
+                         p->ncol = 1;    /* columns are counted from 1 */
+                 } else
+                         p->ncol++;
+         }
+         ++*pend;                        /* no bounds check here: callers test *pend first */
+ }
+ /*
+  * Advance the pend pointer to the next character in the charset.
+  * If the charset starts with a space, it stands for any whitespace.
+  * Update the new input file position, used for messages.
+  * Do not overrun the buffer b of length rlen.
+  * When reaching the end, NUL-terminate the buffer (at b[rlen], so b
+  * needs room for rlen + 1 bytes) and return the refill argument;
+  * otherwise, return 0.
+  */
+ static int
+ advance(struct parse *p, char *b, size_t rlen, size_t *pend,
+     const char *charset, int refill)
+ {
+         int              space;
+         if (*charset == ' ') {          /* leading blank: also stop at any isspace(3) char */
+                 space = 1;
+                 charset++;
+         } else
+                 space = 0;
+         if (refill) {                   /* restart next-token tracking at the current token */
+                 p->nline = p->line;
+                 p->ncol = p->col;
+         }
+         while (*pend < rlen) {
+                 if (space && isspace((unsigned char)b[*pend]))
+                         break;
+                 if (strchr(charset, b[*pend]) != NULL)  /* NB: strchr(3) also matches a NUL byte in b */
+                         break;
+                 increment(p, b, pend, refill);
+         }
+         if (*pend == rlen) {
+                 b[rlen] = '\0';         /* b must have room for rlen + 1 bytes */
+                 return refill;          /* non-zero only if the caller passed refill */
+         } else
+                 return 0;
+ }
+ size_t
+ parse_string(struct parse *p, char *b, size_t rlen,
+     enum pstate *pstate, int refill)
+ {       /*
+          * Tokenize the rlen bytes in b and pass the tokens to the
+          * xml_*() handlers.  The buffer is modified in place: tokens
+          * are NUL-terminated by overwriting their delimiters.
+          * *pstate carries the tag, attribute, and quoting state
+          * across calls.
+          * If refill is non-zero, the input position in p is tracked,
+          * and a tag, attribute, or entity token running into the end
+          * of the buffer is left unprocessed; otherwise, such a token
+          * is used as far as it goes.
+          * Return the offset of the first byte that was not consumed,
+          * which is rlen if nothing is left over.
+          */
+         char            *cp;
+         size_t           poff;  /* Parse offset in b[]. */
+         size_t           pend;  /* Offset of the end of the current word. */
+         int              elem_end;  /* Call xml_elem_end() after this token. */
+         pend = 0;
+         for (;;) {
+                 /* Proceed to the next token, skipping whitespace. */
+                 if (refill) {
+                         p->line = p->nline;
+                         p->col = p->ncol;
+                 }
+                 if ((poff = pend) == rlen)
+                         break;
+                 if (isspace((unsigned char)b[pend])) {
+                         p->flags |= PFLAG_SPC;
+                         increment(p, b, &pend, refill);
+                         continue;
+                 }
+                 /*
+                  * The following four cases (ARG, TAG, and starting an
+                  * entity or a tag) all parse a word or quoted string.
+                  * If that extends beyond the read buffer and the last
+                  * read(2) still got data, they all break out of the
+                  * token loop to request more data from the read loop.
+                  *
+                  * Also, three of them detect self-closing tags, those
+                  * ending with "/>", setting the flag elem_end and
+                  * calling xml_elem_end() at the very end, after
+                  * handling the attribute value, attribute name, or
+                  * tag name, respectively.
+                  */
+                 /* Parse an attribute value. */
+                 if (*pstate >= PARSE_ARG) {
+                         if (*pstate == PARSE_ARG &&
+                             (b[pend] == '\'' || b[pend] == '"')) {
+                                 *pstate = b[pend] == '"' ?
+                                     PARSE_DQ : PARSE_SQ;
+                                 increment(p, b, &pend, refill);
+                                 continue;
+                         }
+                         if (advance(p, b, rlen, &pend,
+                             *pstate == PARSE_DQ ? "\"" :
+                             *pstate == PARSE_SQ ? "'" : " >", refill))
+                                 break;
+                         *pstate = PARSE_TAG;
+                         elem_end = 0;
+                         if (b[pend] == '>') {
+                                 *pstate = PARSE_ELEM;
+                                 if (pend > 0 && b[pend - 1] == '/') {
+                                         b[pend - 1] = '\0';
+                                         elem_end = 1;
+                                 }
+                                 if (p->flags & PFLAG_EEND)
+                                         elem_end = 1;
+                         }
+                         b[pend] = '\0'; /*
+                          * NOTE(review): the delimiter is overwritten
+                          * before increment() looks at it, so a newline
+                          * delimiter is counted as a column rather than
+                          * as a new line; confirm whether that is
+                          * intended.  The same pattern recurs in the
+                          * branches below.
+                          */
+                         if (pend < rlen)
+                                 increment(p, b, &pend, refill);
+                         xml_attrval(p, b + poff);
+                         if (elem_end)
+                                 xml_elem_end(p, NULL);
+                 /* Look for an attribute name. */
+                 } else if (*pstate == PARSE_TAG) {
+                         switch (p->ncur) {
+                         case NODE_DOCTYPE:
+                                 if (b[pend] == '[') {
+                                         *pstate = PARSE_ELEM;
+                                         increment(p, b, &pend, refill);
+                                         continue;
+                                 }
+                                 /* FALLTHROUGH */
+                         case NODE_ENTITY:
+                                 if (b[pend] == '"' || b[pend] == '\'') {
+                                         *pstate = PARSE_ARG;
+                                         continue;
+                                 }
+                                 break;
+                         default:
+                                 break;
+                         }
+                         if (advance(p, b, rlen, &pend, " =>", refill))
+                                 break;
+                         elem_end = 0;
+                         switch (b[pend]) {
+                         case '>':
+                                 *pstate = PARSE_ELEM;
+                                 if (pend > 0 && b[pend - 1] == '/') {
+                                         b[pend - 1] = '\0';
+                                         elem_end = 1;
+                                 }
+                                 if (p->flags & PFLAG_EEND)
+                                         elem_end = 1;
+                                 break;
+                         case '=':
+                                 *pstate = PARSE_ARG;
+                                 break;
+                         default:
+                                 break;
+                         }
+                         b[pend] = '\0';
+                         if (pend < rlen)
+                                 increment(p, b, &pend, refill);
+                         xml_attrkey(p, b + poff);
+                         if (elem_end)
+                                 xml_elem_end(p, NULL);
+                 /* Begin an opening or closing tag. */
+                 } else if (b[poff] == '<') {
+                         if (advance(p, b, rlen, &pend, " >", refill))
+                                 break;
+                         if (pend > poff + 3 &&
+                             strncmp(b + poff, "<!--", 4) == 0) {
+                                 /*
+                                  * Skip a comment.
+                                  * If the end of the comment is not in
+                                  * the buffer, ask for more input when
+                                  * refilling, or else skip the rest of
+                                  * the buffer.
+                                  * NOTE(review): strstr() relies on a
+                                  * NUL terminator, but advance() only
+                                  * writes b[rlen] when it reaches the
+                                  * end; confirm the search cannot run
+                                  * past the valid data.
+                                  */
+                                 cp = strstr(b + pend - 2, "-->");
+                                 if (cp == NULL) {
+                                         if (refill)
+                                                 break;
+                                         cp = b + rlen;
+                                 } else
+                                         cp += 3;
+                                 while (b + pend < cp)
+                                         increment(p, b, &pend, refill);
+                                 continue;
+                         }
+                         elem_end = 0;
+                         if (b[pend] != '>')
+                                 *pstate = PARSE_TAG;
+                         else if (pend > 0 && b[pend - 1] == '/') {
+                                 b[pend - 1] = '\0';
+                                 elem_end = 1;
+                         }
+                         b[pend] = '\0';
+                         if (pend < rlen)
+                                 increment(p, b, &pend, refill);
+                         if (b[++poff] == '/') {  /* Closing tag: skip "</". */
+                                 elem_end = 1;
+                                 poff++;
+                         } else {
+                                 xml_elem_start(p, b + poff);
+                                 if (*pstate == PARSE_ELEM &&
+                                     p->flags & PFLAG_EEND)
+                                         elem_end = 1;
+                         }
+                         if (elem_end)
+                                 xml_elem_end(p, b + poff);
+                 /* Close a doctype. */
+                 } else if (p->ncur == NODE_DOCTYPE && b[poff] == ']') {
+                         *pstate = PARSE_TAG;
+                         increment(p, b, &pend, refill);
+                 /* Process an entity. */
+                 } else if (b[poff] == '&') {
+                         if (advance(p, b, rlen, &pend, ";", refill))
+                                 break;
+                         b[pend] = '\0';
+                         if (pend < rlen)
+                                 increment(p, b, &pend, refill);
+                         xml_entity(p, b + poff + 1);  /* Name without the '&'. */
+                 /*
+                  * Process text up to the next tag, entity, or the end
+                  * of the buffer.  The return value of advance() is
+                  * ignored, so text is never held back for more input.
+                  */
+                 } else {
+                         advance(p, b, rlen, &pend, "<&", refill);
+                         xml_char(p, b + poff, pend - poff);
+                 }
+         }
+         return poff;
+ }
+ /*
+  * The read loop.
+  * Read from fd into a local buffer and hand the data to
+  * parse_string(); after each pass, move the unconsumed tail starting
+  * at poff to the front of the buffer and read more data behind it.
+  * If the previous token was incomplete and asked for more input,
+  * we have to enter the read loop once more even on EOF.
+  * Once rsz is 0, incomplete tokens will no longer ask for more input
+  * but instead use whatever there is, and then exit the read loop.
+  * The minus one on the size limit for read(2) is needed such that
+  * advance() can set b[rlen] to NUL when needed.
+  * A failing read(2) ends the loop and is reported with error_msg().
+  */
+ static void
+ parse_fd(struct parse *p, int fd)
+ {
+         char             b[4096];
+         ssize_t          rsz;   /* Return value from read(2). */
+         size_t           rlen;  /* Number of bytes in b[]. */
+         size_t           poff;  /* Parse offset in b[]. */
+         enum pstate      pstate;
+         rlen = 0;
+         pstate = PARSE_ELEM;
+         while ((rsz = read(fd, b + rlen, sizeof(b) - rlen - 1)) >= 0 &&
+             (rlen += rsz) > 0) {
+                 poff = parse_string(p, b, rlen, &pstate, rsz > 0);
+                 /* Buffer exhausted; shift left and re-fill. */
+                 assert(poff > 0);  /*
+                  * NOTE(review): this requires progress on every pass;
+                  * a single tag, attribute, entity, or comment filling
+                  * the whole buffer appears to leave poff at 0 while
+                  * input is still arriving - confirm.
+                  */
+                 rlen -= poff;
+                 memmove(b, b + poff, rlen);
+         }
+         if (rsz < 0)
+                 error_msg(p, "read: %s", strerror(errno));
+ }
+ /*
+  * Open and parse a file.
+  * If fd is -1, open fname read-only first; otherwise, read from the
+  * descriptor passed in.  A failure to open is reported with
+  * error_msg(), and p->tree is returned as it is.
+  * The call is treated as the top level one if no file name was set
+  * in the parse state on entry; only then, change into the directory
+  * of fname before reading and finalize the tree after reading.
+  * The file name and the next-token position saved on entry are
+  * restored before returning, and fd is closed unless it is the
+  * standard input.
+  * Always return p->tree.
+  */
  struct ptree *
  parse_file(struct parse *p, int fd, const char *fname)
  {
-         char             b[4096];
+         const char      *save_fname;
-         ssize_t          ssz;
+         int              save_line, save_col;
+         /* Save and initialize reporting data. */
+         save_fname = p->fname;
+         save_line = p->nline;
+         save_col = p->ncol;
          p->fname = fname;
-         do {
+         p->line = 0;
-                 if ((ssz = read(fd, b, sizeof(b))) < 0) {
+         p->col = 0;
-                         perror(fname);
-                         pnode_unlink(p->tree->root);
+         /* Open the file, unless it is already open. */
-                         p->tree->root = p->cur = NULL;
-                         p->tree->flags |= TREE_FAIL;
+         if (fd == -1 && (fd = open(fname, O_RDONLY, 0)) == -1) {
-                         return NULL;
+                 error_msg(p, "open: %s", strerror(errno));
-                 }
+                 p->fname = save_fname;
-                 if (XML_Parse(p->xml, b, ssz, ssz == 0) == 0) {
+                 return p->tree;
-                         fprintf(stderr, "%s:%zu:%zu: %s\n", fname,
+         }
-                             XML_GetCurrentLineNumber(p->xml),
-                             XML_GetCurrentColumnNumber(p->xml),
+         /*
-                             XML_ErrorString(XML_GetErrorCode(p->xml)));
+          * After opening the starting file, change to the directory it
-                         p->tree->flags |= TREE_FAIL;
+          * is located in, in case it wants to include any further files,
-                 }
+          * which are typically given with relative paths in DocBook.
-         } while (ssz > 0 && (p->tree->flags & TREE_FAIL) == 0);
+          * Do this on a best-effort basis; don't complain about failure.
+          * NOTE(review): POSIX permits dirname(3) to modify its
+          * argument, and fname is const here - confirm this is safe on
+          * all target platforms.
+          */
+         if (save_fname == NULL && (fname = dirname(fname)) != NULL &&
+             strcmp(fname, ".") != 0)
+                 (void)chdir(fname);
+         /* Run the read loop. */
+         p->nline = 1;
+         p->ncol = 1;
+         parse_fd(p, fd);
+         /* On the top level, finalize the parse tree. */
+         if (save_fname == NULL) {
+                 pnode_closetext(p);
+                 if (p->tree->root == NULL)
+                         error_msg(p, "empty document");
+                 else if ((p->tree->flags & TREE_CLOSED) == 0)
+                         warn_msg(p, "document not closed");
+                 pnode_unlink(p->doctype);
+         }
+         /* Clean up. */
+         if (fd != STDIN_FILENO)
+                 close(fd);
+         p->fname = save_fname;
+         p->nline = save_line;
+         p->ncol = save_col;
          return p->tree;
  }

CVSweb