docbook2mdoc/parse.c - diff

Return to parse.c CVS log

Up to [cvsweb.bsd.lv] / docbook2mdoc

Diff for /docbook2mdoc/parse.c between version 1.12 and 1.18

-version 1.12, 2019/04/03 16:08:57
+version 1.18, 2019/04/07 17:00:56
 Line 30
 Line 30
 Line 30
   * The implementation of the DocBook parser.
   */
+ enum    pstate {
+         PARSE_ELEM,
+         PARSE_TAG,
+         PARSE_ARG,
+         PARSE_SQ,
+         PARSE_DQ
+ };
  /*
   * Global parse state.
   * Keep this as simple and small as possible.
-Line 44  struct parse {
+Line 52  struct parse {
 Line 44  struct parse {
 Line 52  struct parse {
          int              nline;  /* Line number of next token. */
          int              ncol;   /* Column number of next token. */
          int              del;    /* Levels of nested nodes being deleted. */
+         int              spc;    /* Whitespace before the next element. */
          int              attr;   /* The most recent attribute is valid. */
          int              warn;
  };
-Line 69  static const struct element elements[] = {
+Line 78  static const struct element elements[] = {
 Line 69  static const struct element elements[] = {
 Line 78  static const struct element elements[] = {
          { "citerefentry",       NODE_CITEREFENTRY },
          { "citetitle",          NODE_CITETITLE },
          { "cmdsynopsis",        NODE_CMDSYNOPSIS },
-         { "code",               NODE_CODE },
+         { "code",               NODE_LITERAL },
          { "colspec",            NODE_COLSPEC },
          { "command",            NODE_COMMAND },
          { "constant",           NODE_CONSTANT },
-Line 81  static const struct element elements[] = {
+Line 90  static const struct element elements[] = {
 Line 81  static const struct element elements[] = {
 Line 90  static const struct element elements[] = {
          { "emphasis",           NODE_EMPHASIS },
          { "entry",              NODE_ENTRY },
          { "envar",              NODE_ENVAR },
+         { "errorname",          NODE_ERRORNAME },
          { "fieldsynopsis",      NODE_FIELDSYNOPSIS },
          { "filename",           NODE_FILENAME },
          { "firstname",          NODE_PERSONNAME },
-Line 161  static const struct element elements[] = {
+Line 171  static const struct element elements[] = {
 Line 161  static const struct element elements[] = {
 Line 171  static const struct element elements[] = {
          { "sect2",              NODE_SECTION },
          { "section",            NODE_SECTION },
          { "sgmltag",            NODE_SGMLTAG },
+         { "simpara",            NODE_PARA },
          { "simplelist",         NODE_SIMPLELIST },
          { "spanspec",           NODE_SPANSPEC },
-         { "structname",         NODE_STRUCTNAME },
+         { "structfield",        NODE_PARAMETER },
+         { "structname",         NODE_TYPE },
          { "subtitle",           NODE_SUBTITLE },
          { "surname",            NODE_PERSONNAME },
          { "symbol",             NODE_CONSTANT },
-Line 178  static const struct element elements[] = {
+Line 190  static const struct element elements[] = {
 Line 178  static const struct element elements[] = {
 Line 190  static const struct element elements[] = {
          { "title",              NODE_TITLE },
          { "trademark",          NODE_IGNORE },
          { "type",               NODE_TYPE },
-         { "ulink",              NODE_ULINK },
+         { "ulink",              NODE_LINK },
-         { "userinput",          NODE_USERINPUT },
+         { "userinput",          NODE_LITERAL },
          { "variablelist",       NODE_VARIABLELIST },
          { "varlistentry",       NODE_VARLISTENTRY },
          { "varname",            NODE_VARNAME },
-Line 283  static void
+Line 295  static void
 Line 283  static void
 Line 295  static void
  xml_char(struct parse *ps, const char *p, int sz)
  {
          struct pnode    *dat;
+         size_t           newsz;
          if (ps->del > 0)
                  return;
-Line 298  xml_char(struct parse *ps, const char *p, int sz)
+Line 311  xml_char(struct parse *ps, const char *p, int sz)
 Line 298  xml_char(struct parse *ps, const char *p, int sz)
 Line 311  xml_char(struct parse *ps, const char *p, int sz)
                          exit(1);
                  }
                  dat->node = NODE_TEXT;
+                 dat->spc = ps->spc;
                  dat->parent = ps->cur;
                  TAILQ_INIT(&dat->childq);
                  TAILQ_INIT(&dat->attrq);
-Line 312  xml_char(struct parse *ps, const char *p, int sz)
+Line 326  xml_char(struct parse *ps, const char *p, int sz)
 Line 312  xml_char(struct parse *ps, const char *p, int sz)
 Line 326  xml_char(struct parse *ps, const char *p, int sz)
          /* Append to the current text node. */
          assert(sz >= 0);
-         ps->cur->b = realloc(ps->cur->b, ps->cur->bsz + sz + 1);
+         newsz = ps->cur->bsz + (ps->cur->bsz && ps->spc) + sz;
+         ps->cur->b = realloc(ps->cur->b, newsz + 1);
          if (ps->cur->b == NULL) {
                  perror(NULL);
                  exit(1);
          }
+         if (ps->cur->bsz && ps->spc)
+                 ps->cur->b[ps->cur->bsz++] = ' ';
          memcpy(ps->cur->b + ps->cur->bsz, p, sz);
-         ps->cur->bsz += sz;
+         ps->cur->b[ps->cur->bsz = newsz] = '\0';
-         ps->cur->b[ps->cur->bsz] = '\0';
          ps->cur->real = ps->cur->b;
+         ps->spc = 0;
  }
+ /*
+  * Close out the text node and strip trailing whitespace, if one is open.
+  */
  static void
- pnode_trim(struct pnode *pn)
+ pnode_closetext(struct parse *p)
  {
-         assert(pn->node == NODE_TEXT);
+         struct pnode    *n;
-         for (; pn->bsz > 0; pn->b[--pn->bsz] = '\0')
-                 if (isspace((unsigned char)pn->b[pn->bsz - 1]) == 0)
+         if ((n = p->cur) == NULL || n->node != NODE_TEXT)
-                         break;
+                 return;
+         p->cur = n->parent;
+         while (n->bsz > 0 && isspace((unsigned char)n->b[n->bsz - 1])) {
+                 n->b[--n->bsz] = '\0';
+                 p->spc = 1;
+         }
  }
  static void
-Line 346  xml_entity(struct parse *p, const char *name)
+Line 371  xml_entity(struct parse *p, const char *name)
 Line 346  xml_entity(struct parse *p, const char *name)
 Line 371  xml_entity(struct parse *p, const char *name)
                  return;
          }
-         /* Close out the text node, if there is one. */
+         pnode_closetext(p);
-         if (p->cur->node == NODE_TEXT) {
-                 pnode_trim(p->cur);
-                 p->cur = p->cur->parent;
-         }
          if (p->tree->flags & TREE_CLOSED && p->cur == p->tree->root)
                  warn_msg(p, "entity after end of document: &%s;", name);
-Line 372  xml_entity(struct parse *p, const char *name)
+Line 393  xml_entity(struct parse *p, const char *name)
 Line 372  xml_entity(struct parse *p, const char *name)
 Line 393  xml_entity(struct parse *p, const char *name)
          }
          dat->node = NODE_ESCAPE;
          dat->bsz = strlen(dat->b);
+         dat->spc = p->spc;
          dat->parent = p->cur;
          TAILQ_INIT(&dat->childq);
          TAILQ_INIT(&dat->attrq);
          TAILQ_INSERT_TAIL(&p->cur->childq, dat, child);
+         p->spc = 0;
  }
  /*
-Line 399  xml_elem_start(struct parse *ps, const char *name)
+Line 422  xml_elem_start(struct parse *ps, const char *name)
 Line 399  xml_elem_start(struct parse *ps, const char *name)
 Line 422  xml_elem_start(struct parse *ps, const char *name)
                  return;
          }
-         /* Close out the text node, if there is one. */
+         pnode_closetext(ps);
-         if (ps->cur != NULL && ps->cur->node == NODE_TEXT) {
-                 pnode_trim(ps->cur);
-                 ps->cur = ps->cur->parent;
-         }
          for (elem = elements; elem->name != NULL; elem++)
                  if (strcmp(elem->name, name) == 0)
-Line 437  xml_elem_start(struct parse *ps, const char *name)
+Line 456  xml_elem_start(struct parse *ps, const char *name)
 Line 437  xml_elem_start(struct parse *ps, const char *name)
 Line 456  xml_elem_start(struct parse *ps, const char *name)
                  perror(NULL);
                  exit(1);
          }
-         dat->node = elem->node;
+         /*
+          * Nodes that begin a new macro or request line or start by
+          * printing text always want whitespace before themselves.
+          */
+         switch (dat->node = elem->node) {
+         case NODE_AUTHORGROUP:
+         case NODE_BOOKINFO:
+         case NODE_CAUTION:
+         case NODE_EDITOR:
+         case NODE_ENTRY:
+         case NODE_FUNCDEF:
+         case NODE_FUNCPROTOTYPE:
+         case NODE_INFORMALEQUATION:
+         case NODE_INLINEEQUATION:
+         case NODE_ITEMIZEDLIST:
+         case NODE_LEGALNOTICE:
+         case NODE_LITERALLAYOUT:
+         case NODE_NOTE:
+         case NODE_ORDEREDLIST:
+         case NODE_PARA:
+         case NODE_PREFACE:
+         case NODE_PROGRAMLISTING:
+         case NODE_REFMETA:
+         case NODE_REFNAMEDIV:
+         case NODE_REFSYNOPSISDIV:
+         case NODE_ROW:
+         case NODE_SBR:
+         case NODE_SCREEN:
+         case NODE_SECTION:
+         case NODE_SYNOPSIS:
+         case NODE_TGROUP:
+         case NODE_TIP:
+         case NODE_TITLE:
+         case NODE_VARIABLELIST:
+         case NODE_VARLISTENTRY:
+         case NODE_WARNING:
+                 dat->spc = 1;
+                 break;
+         default:
+                 dat->spc = ps->spc;
+                 break;
+         }
          dat->parent = ps->cur;
          TAILQ_INIT(&dat->childq);
          TAILQ_INIT(&dat->attrq);
-Line 508  xml_elem_end(struct parse *ps, const char *name)
+Line 570  xml_elem_end(struct parse *ps, const char *name)
 Line 508  xml_elem_end(struct parse *ps, const char *name)
 Line 570  xml_elem_end(struct parse *ps, const char *name)
                  return;
          }
-         /* Close out the text node, if there is one. */
+         if (ps->del == 0)
-         if (ps->del == 0 && ps->cur != NULL && ps->cur->node == NODE_TEXT) {
+                 pnode_closetext(ps);
-                 pnode_trim(ps->cur);
-                 ps->cur = ps->cur->parent;
-         }
          if (name != NULL) {
                  for (elem = elements; elem->name != NULL; elem++)
-Line 547  xml_elem_end(struct parse *ps, const char *name)
+Line 606  xml_elem_end(struct parse *ps, const char *name)
 Line 547  xml_elem_end(struct parse *ps, const char *name)
 Line 606  xml_elem_end(struct parse *ps, const char *name)
                          ps->tree->flags |= TREE_CLOSED;
                  else
                          ps->cur = ps->cur->parent;
+                 ps->spc = 0;
                  break;
          }
          assert(ps->del == 0);
-Line 580  parse_free(struct parse *p)
+Line 640  parse_free(struct parse *p)
 Line 580  parse_free(struct parse *p)
 Line 640  parse_free(struct parse *p)
          free(p);
  }
+ static void
+ increment(struct parse *p, char *b, size_t *pend, int refill)
+ {
+         if (refill) {
+                 if (b[*pend] == '\n') {
+                         p->nline++;
+                         p->ncol = 1;
+                 } else
+                         p->ncol++;
+         }
+         ++*pend;
+ }
  /*
   * Advance the pend pointer to the next character in the charset.
   * If the charset starts with a space, it stands for any whitespace.
-Line 590  parse_free(struct parse *p)
+Line 663  parse_free(struct parse *p)
 Line 590  parse_free(struct parse *p)
 Line 663  parse_free(struct parse *p)
   */
  static int
  advance(struct parse *p, char *b, size_t rlen, size_t *pend,
-     const char *charset)
+     const char *charset, int refill)
  {
          int              space;
-Line 600  advance(struct parse *p, char *b, size_t rlen, size_t
+Line 673  advance(struct parse *p, char *b, size_t rlen, size_t
 Line 600  advance(struct parse *p, char *b, size_t rlen, size_t
 Line 673  advance(struct parse *p, char *b, size_t rlen, size_t
          } else
                  space = 0;
-         p->nline = p->line;
+         if (refill) {
-         p->ncol = p->col;
+                 p->nline = p->line;
+                 p->ncol = p->col;
+         }
          while (*pend < rlen) {
-                 if (b[*pend] == '\n') {
-                         p->nline++;
-                         p->ncol = 1;
-                 } else
-                         p->ncol++;
                  if (space && isspace((unsigned char)b[*pend]))
                          break;
                  if (strchr(charset, b[*pend]) != NULL)
                          break;
-                 ++*pend;
+                 increment(p, b, pend, refill);
          }
          if (*pend == rlen) {
                  b[rlen] = '\0';
-                 return 1;
+                 return refill;
          } else
                  return 0;
  }
- struct ptree *
+ size_t
- parse_file(struct parse *p, int fd, const char *fname)
+ parse_string(struct parse *p, char *b, size_t rlen,
+     enum pstate *pstate, int refill)
  {
-         char             b[4096];
          char            *cp;
-         ssize_t          rsz;   /* Return value from read(2). */
-         size_t           rlen;  /* Number of bytes in b[]. */
          size_t           poff;  /* Parse offset in b[]. */
          size_t           pend;  /* Offset of the end of the current word. */
-         int              in_tag, in_arg, in_quotes, elem_end;
+         int              elem_end;
-         p->fname = fname;
+         p->spc = 0;
-         p->nline = 1;
+         pend = 0;
-         p->ncol = 1;
+         for (;;) {
-         rlen = 0;
-         in_tag = in_arg = in_quotes = 0;
-         /*
+                 /* Proceed to the next token, skipping whitespace. */
-          * Read loop.
-          *
-          * We have to enter the read loop once more even on EOF
-          * because the previous token may have been incomplete,
-          * such that it asked for more input.
-          * Once rsz is 0, incomplete tokens will no longer ask
-          * for more input but instead use whatever there is,
-          * and then exit the read loop.
-          * The minus one on the size limit for read(2) is needed
-          * such that advance() can set b[rlen] to NUL when needed.
-          */
-         while ((rsz = read(fd, b + rlen, sizeof(b) - rlen - 1)) >= 0) {
+                 if (refill) {
-                 if ((rlen += rsz) == 0)
+                         p->line = p->nline;
+                         p->col = p->ncol;
+                 }
+                 if ((poff = pend) == rlen)
                          break;
+                 if (isspace((unsigned char)b[pend])) {
+                         p->spc = 1;
+                         increment(p, b, &pend, refill);
+                         continue;
+                 }
-                 /* Token loop. */
+                 /*
+                  * The following four cases (ARG, TAG, and starting an
+                  * entity or a tag) all parse a word or quoted string.
+                  * If that extends beyond the read buffer and the last
+                  * read(2) still got data, they all break out of the
+                  * token loop to request more data from the read loop.
+                  *
+                  * Also, three of them detect self-closing tags, those
+                  * ending with "/>", setting the flag elem_end and
+                  * calling xml_elem_end() at the very end, after
+                  * handling the attribute value, attribute name, or
+                  * tag name, respectively.
+                  */
-                 pend = 0;
+                 /* Parse an attribute value. */
-                 for (;;) {
-                         /* Proceed to the next token, skipping whitespace. */
+                 if (*pstate >= PARSE_ARG) {
+                         if (*pstate == PARSE_ARG &&
-                         p->line = p->nline;
+                             (b[pend] == '\'' || b[pend] == '"')) {
-                         p->col = p->ncol;
+                                 *pstate = b[pend] == '"' ?
-                         if ((poff = pend) == rlen)
+                                     PARSE_DQ : PARSE_SQ;
-                                 break;
+                                 increment(p, b, &pend, refill);
-                         if (isspace((unsigned char)b[pend])) {
-                                 if (b[pend++] == '\n') {
-                                         p->nline++;
-                                         p->ncol = 1;
-                                 } else
-                                         p->ncol++;
                                  continue;
                          }
+                         if (advance(p, b, rlen, &pend,
-                         /*
+                             *pstate == PARSE_DQ ? "\"" :
-                          * The following four cases (in_arg, in_tag, and
+                             *pstate == PARSE_SQ ? "'" : " >", refill))
-                          * starting an entity or a tag) all parse a word
+                                 break;
-                          * or quoted string.  If that extends beyond the
+                         *pstate = PARSE_TAG;
-                          * read buffer and the last read(2) still got
+                         elem_end = 0;
-                          * data, they all break out of the token loop
+                         if (b[pend] == '>') {
-                          * to request more data from the read loop.
+                                 *pstate = PARSE_ELEM;
-                          *
+                                 if (pend > 0 && b[pend - 1] == '/') {
-                          * Also, three of them detect self-closing tags,
+                                         b[pend - 1] = '\0';
-                          * those ending with "/>", setting the flag
+                                         elem_end = 1;
-                          * elem_end and calling xml_elem_end() at the
-                          * very end, after handling the attribute value,
-                          * attribute name, or tag name, respectively.
-                          */
-                         /* Parse an attribute value. */
-                         if (in_arg) {
-                                 if (in_quotes == 0 &&
-                                     (b[pend] == '\'' || b[pend] == '"')) {
-                                         in_quotes = b[pend] == '"' ? 2 : 1;
-                                         p->ncol++;
-                                         pend++;
-                                         continue;
                                  }
-                                 if (advance(p, b, rlen, &pend,
+                         }
-                                     in_quotes == 2 ? "\"" :
+                         b[pend] = '\0';
-                                     in_quotes == 1 ? "'" : " >") && rsz > 0)
+                         if (pend < rlen)
-                                         break;
+                                 increment(p, b, &pend, refill);
-                                 in_arg = in_quotes = elem_end = 0;
+                         xml_attrval(p, b + poff);
-                                 if (b[pend] == '>') {
+                         if (elem_end)
-                                         in_tag = 0;
+                                 xml_elem_end(p, NULL);
-                                         if (pend > 0 && b[pend - 1] == '/') {
-                                                 b[pend - 1] = '\0';
-                                                 elem_end = 1;
-                                         }
-                                 }
-                                 b[pend] = '\0';
-                                 if (pend < rlen)
-                                         pend++;
-                                 xml_attrval(p, b + poff);
-                                 if (elem_end)
-                                         xml_elem_end(p, NULL);
-                         /* Look for an attribute name. */
+                 /* Look for an attribute name. */
-                         } else if (in_tag) {
+                 } else if (*pstate == PARSE_TAG) {
-                                 if (advance(p, b, rlen, &pend, " =>") &&
+                         if (advance(p, b, rlen, &pend, " =>", refill))
-                                     rsz > 0)
+                                 break;
-                                         break;
+                         elem_end = 0;
-                                 elem_end = 0;
+                         switch (b[pend]) {
-                                 switch (b[pend]) {
+                         case '>':
-                                 case '>':
+                                 *pstate = PARSE_ELEM;
-                                         in_tag = 0;
+                                 if (pend > 0 && b[pend - 1] == '/') {
-                                         if (pend > 0 && b[pend - 1] == '/') {
+                                         b[pend - 1] = '\0';
-                                                 b[pend - 1] = '\0';
+                                         elem_end = 1;
-                                                 elem_end = 1;
-                                         }
-                                         break;
-                                 case '=':
-                                         in_arg = 1;
-                                         break;
-                                 default:
-                                         break;
                                  }
-                                 b[pend] = '\0';
+                                 break;
-                                 if (pend < rlen)
+                         case '=':
-                                         pend++;
+                                 *pstate = PARSE_ARG;
-                                 xml_attrkey(p, b + poff);
+                                 break;
-                                 if (elem_end)
+                         default:
-                                         xml_elem_end(p, NULL);
+                                 break;
+                         }
+                         b[pend] = '\0';
+                         if (pend < rlen)
+                                 increment(p, b, &pend, refill);
+                         xml_attrkey(p, b + poff);
+                         if (elem_end)
+                                 xml_elem_end(p, NULL);
-                         /* Begin an opening or closing tag. */
+                 /* Begin an opening or closing tag. */
-                         } else if (b[poff] == '<') {
+                 } else if (b[poff] == '<') {
-                                 if (advance(p, b, rlen, &pend, " >") &&
+                         if (advance(p, b, rlen, &pend, " >", refill))
-                                     rsz > 0)
+                                 break;
-                                         break;
+                         if (pend > poff + 3 &&
-                                 if (pend > poff + 3 &&
+                             strncmp(b + poff, "<!--", 4) == 0) {
-                                     strncmp(b + poff, "<!--", 4) == 0) {
-                                         /* Skip a comment. */
+                                 /* Skip a comment. */
-                                         cp = strstr(b + pend - 2, "-->");
+                                 cp = strstr(b + pend - 2, "-->");
-                                         if (cp == NULL) {
+                                 if (cp == NULL) {
-                                                 if (rsz > 0) {
+                                         if (refill)
-                                                         pend = rlen;
+                                                 break;
-                                                         break;
+                                         cp = b + rlen;
-                                                 }
-                                                 cp = b + rlen;
-                                         } else
-                                                 cp += 3;
-                                         while (b + pend < cp) {
-                                                 if (b[++pend] == '\n') {
-                                                         p->nline++;
-                                                         p->ncol = 1;
-                                                 } else
-                                                         p->ncol++;
-                                         }
-                                         continue;
-                                 }
-                                 elem_end = 0;
-                                 if (b[pend] != '>')
-                                         in_tag = 1;
-                                 else if (pend > 0 && b[pend - 1] == '/') {
-                                         b[pend - 1] = '\0';
-                                         elem_end = 1;
-                                 }
-                                 b[pend] = '\0';
-                                 if (pend < rlen)
-                                         pend++;
-                                 if (b[++poff] == '/') {
-                                         elem_end = 1;
-                                         poff++;
                                  } else
-                                         xml_elem_start(p, b + poff);
+                                         cp += 3;
-                                 if (elem_end)
+                                 while (b + pend < cp)
-                                         xml_elem_end(p, b + poff);
+                                         increment(p, b, &pend, refill);
+                                 continue;
+                         }
+                         elem_end = 0;
+                         if (b[pend] != '>')
+                                 *pstate = PARSE_TAG;
+                         else if (pend > 0 && b[pend - 1] == '/') {
+                                 b[pend - 1] = '\0';
+                                 elem_end = 1;
+                         }
+                         b[pend] = '\0';
+                         if (pend < rlen)
+                                 increment(p, b, &pend, refill);
+                         if (b[++poff] == '/') {
+                                 elem_end = 1;
+                                 poff++;
+                         } else
+                                 xml_elem_start(p, b + poff);
+                         if (elem_end)
+                                 xml_elem_end(p, b + poff);
-                         /* Process an entity. */
+                 /* Process an entity. */
-                         } else if (b[poff] == '&') {
+                 } else if (b[poff] == '&') {
-                                 if (advance(p, b, rlen, &pend, ";") &&
+                         if (advance(p, b, rlen, &pend, ";", refill))
-                                     rsz > 0)
+                                 break;
-                                         break;
+                         b[pend] = '\0';
-                                 b[pend] = '\0';
+                         if (pend < rlen)
-                                 if (pend < rlen)
+                                 increment(p, b, &pend, refill);
-                                         pend++;
+                         xml_entity(p, b + poff + 1);
-                                 xml_entity(p, b + poff + 1);
-                         /* Process text up to the next tag or entity. */
+                 /* Process text up to the next tag, entity, or EOL. */
-                         } else {
+                 } else {
-                                 if (advance(p, b, rlen, &pend, "<&") == 0)
+                         advance(p, b, rlen, &pend, "<&", refill);
-                                         p->ncol--;
+                         xml_char(p, b + poff, pend - poff);
-                                 xml_char(p, b + poff, pend - poff);
-                         }
                  }
+         }
+         return poff;
+ }
-                 /* Buffer exhausted; shift left and re-fill. */
+ struct ptree *
+ parse_file(struct parse *p, int fd, const char *fname)
+ {
+         char             b[4096];
+         ssize_t          rsz;   /* Return value from read(2). */
+         size_t           rlen;  /* Number of bytes in b[]. */
+         size_t           poff;  /* Parse offset in b[]. */
+         enum pstate      pstate;
+         p->fname = fname;
+         p->nline = 1;
+         p->ncol = 1;
+         pstate = PARSE_ELEM;
+         rlen = 0;
+         /*
+          * Read loop.
+          *
+          * If the previous token was incomplete and asked for more
+          * input, we have to enter the read loop once more even on EOF.
+          * Once rsz is 0, incomplete tokens will no longer ask
+          * for more input but instead use whatever there is,
+          * and then exit the read loop.
+          * The minus one on the size limit for read(2) is needed
+          * such that advance() can set b[rlen] to NUL when needed.
+          */
+         while ((rsz = read(fd, b + rlen, sizeof(b) - rlen - 1)) >= 0 &&
+             (rlen += rsz) > 0) {
+                 poff = parse_string(p, b, rlen, &pstate, rsz > 0);
+                 /* Buffer exhausted; shift left and re-fill. */
                  assert(poff > 0);
-                 memmove(b, b + poff, rlen - poff);
                  rlen -= poff;
+                 memmove(b, b + poff, rlen);
          }
          if (rsz < 0) {
                  perror(fname);
                  p->tree->flags |= TREE_FAIL;
          }
-         if (p->cur != NULL && p->cur->node == NODE_TEXT) {
+         pnode_closetext(p);
-                 pnode_trim(p->cur);
-                 p->cur = p->cur->parent;
-         }
          if ((p->tree->flags & TREE_CLOSED) == 0)
                  warn_msg(p, "document not closed");
          return p->tree;

CVSweb