===================================================================
RCS file: /cvs/docbook2mdoc/docbook2mdoc.c,v
retrieving revision 1.4
retrieving revision 1.10
diff -u -p -r1.4 -r1.10
--- docbook2mdoc/docbook2mdoc.c	2014/03/28 10:00:40	1.4
+++ docbook2mdoc/docbook2mdoc.c	2014/03/29 10:56:21	1.10
@@ -1,4 +1,4 @@
-/*	$Id: docbook2mdoc.c,v 1.4 2014/03/28 10:00:40 kristaps Exp $ */
+/*	$Id: docbook2mdoc.c,v 1.10 2014/03/29 10:56:21 kristaps Exp $ */
 /*
  * Copyright (c) 2014 Kristaps Dzonsons <kristaps@bsd.lv>
  *
@@ -24,6 +24,7 @@
 #include <stdio.h>
 #include <stdlib.h>
 #include <string.h>
+#include <unistd.h>
 
 /*
  * All recognised node types.
@@ -58,6 +59,7 @@ enum	nodeid {
 	NODE_REFPURPOSE,
 	NODE_REFSECT1,
 	NODE_REFSYNOPSISDIV,
+	NODE_STRUCTNAME,
 	NODE_SYNOPSIS,
 	NODE_TEXT, 
 	NODE_TITLE,
@@ -73,13 +75,14 @@ struct	parse {
 	int		 stop; /* should we stop now? */
 	struct pnode	*root; /* root of parse tree */
 	struct pnode	*cur; /* current node in tree */
-	char		*b;
-	size_t		 bsz;
-	size_t		 mbsz;
+	char		*b; /* nil-terminated buffer for pre-print */
+	size_t		 bsz; /* current length of b */
+	size_t		 mbsz; /* max bsz allocation */
+	int		 newln; /* output: are we on a fresh line */
 };
 
 struct	node {
-	const char	*name;
+	const char	*name; /* docbook element name */
 	unsigned int	 flags;
 #define	NODE_IGNTEXT	 1 /* ignore all contained text */
 };
@@ -124,13 +127,19 @@ static	const struct node nodes[NODE__MAX] = {
 	{ "refpurpose", 0 }, 
 	{ "refsect1", 0 }, 
 	{ "refsynopsisdiv", NODE_IGNTEXT }, 
+	{ "structname", 0 }, 
 	{ "synopsis", 0 }, 
 	{ NULL, 0 }, 
 	{ "title", 0 }, 
 };
 
+static void
+pnode_print(struct parse *p, struct pnode *pn);
+
 /*
  * Look up whether "parent" is a valid parent for "node".
+ * This is taken directly from the DocBook specification: see the
+ * "children" and "parent" sections of each element's reference page.
  */
 static int
 isparent(enum nodeid node, enum nodeid parent)
@@ -334,6 +343,26 @@ isparent(enum nodeid node, enum nodeid parent)
 		return(parent == NODE_REFENTRY);
 	case (NODE_REFSYNOPSISDIV):
 		return(parent == NODE_REFENTRY);
+	case (NODE_STRUCTNAME):
+		switch (parent) {
+		case (NODE_CODE):
+		case (NODE_FUNCSYNOPSISINFO):
+		case (NODE_FUNCTION):
+		case (NODE_OPTION):
+		case (NODE_PARA):
+		case (NODE_PARAMETER):
+		case (NODE_PROGRAMLISTING):
+		case (NODE_REFDESCRIPTOR):
+		case (NODE_REFENTRYTITLE):
+		case (NODE_REFNAME):
+		case (NODE_REFPURPOSE):
+		case (NODE_SYNOPSIS):
+		case (NODE_TITLE):
+			return(1);
+		default:
+			break;
+		}
+		return(0);
 	case (NODE_SYNOPSIS):
 		switch (parent) {
 		case (NODE_REFSYNOPSISDIV):
@@ -362,6 +391,12 @@ isparent(enum nodeid node, enum nodeid parent)
 	return(0);
 }
 
+/*
+ * Process a stream of characters.
+ * We store text as nodes in and of themselves.
+ * If a text node is already open, append to it.
+ * If it's not open, open one under the current context.
+ */
 static void
 xml_char(void *arg, const XML_Char *p, int sz)
 {
@@ -383,7 +418,7 @@ xml_char(void *arg, const XML_Char *p, int sz)
 	 * If we're not processing text right now, then create a text
 	 * node for doing so.
 	 * However, don't do so unless we have some non-whitespace to
-	 * process!
+	 * process: strip out all leading whitespace to be sure.
 	 */
 	if (NODE_TEXT != ps->node) {
 		for (i = 0; i < sz; i++)
@@ -391,6 +426,8 @@ xml_char(void *arg, const XML_Char *p, int sz)
 				break;
 		if (i == sz)
 			return;
+		p += i;
+		sz -= i;
 		dat = calloc(1, sizeof(struct pnode));
 		if (NULL == dat) {
 			perror(NULL);
@@ -403,7 +440,6 @@ xml_char(void *arg, const XML_Char *p, int sz)
 		TAILQ_INSERT_TAIL(&ps->cur->childq, dat, child);
 		ps->cur = dat;
 		assert(NULL != ps->root);
-
 	}
 
 	/* Append to current buffer. */
@@ -418,14 +454,27 @@ xml_char(void *arg, const XML_Char *p, int sz)
 	ps->cur->bsz += (size_t)sz;
 }
 
+static void
+pnode_trim(struct pnode *pn)
+{
+	/* Drop trailing blanks; ctype needs an unsigned char value. */
+	assert(NODE_TEXT == pn->node);
+	for ( ; pn->bsz > 0; pn->bsz--)	/* only the length shrinks */
+		if ( ! isspace((unsigned char)pn->b[pn->bsz - 1]))
+			break;
+}
+
 /*
  * Begin an element.
  * First, look for the element.
  * If we don't find it and we're not parsing, keep going.
- * If we don't find it (and we're parsing), puke and exit.
+ * If we don't find it and we're parsing, puke and exit.
  * If we find it but we're not parsing yet (i.e., it's not a refentry
  * and thus out of context), keep going.
- * If we're at the root and already have a tree, puke and exit.
+ * If we find it and we're at the root and already have a tree, puke and
+ * exit (FIXME: I don't think this is right?).
+ * If we find it but we're parsing a text node, close out the text node,
+ * return to its parent, and keep going.
  * Make sure that the element is in the right context.
  * Lastly, put the node onto our parse tree and continue.
  */
@@ -442,6 +491,7 @@ xml_elem_start(void *arg, const XML_Char *name, const 
 	/* Close out text node, if applicable... */
 	if (NODE_TEXT == ps->node) {
 		assert(NULL != ps->cur);
+		pnode_trim(ps->cur);
 		ps->cur = ps->cur->parent;
 		assert(NULL != ps->cur);
 		ps->node = ps->cur->node;
@@ -453,6 +503,7 @@ xml_elem_start(void *arg, const XML_Char *name, const 
 		else if (0 == strcmp(nodes[node].name, name))
 			break;
 
+	/* FIXME: do more with these error messages... */
 	if (NODE__MAX == node && NODE_ROOT == ps->node) {
 		fprintf(stderr, "%s: ignoring node\n", name);
 		return;
@@ -492,7 +543,7 @@ xml_elem_start(void *arg, const XML_Char *name, const 
 
 /*
  * Roll up the parse tree.
- * Does nothing else special.
+ * If we're at a text node, roll that one up first.
  * If we hit the root, then assign ourselves as the NODE_ROOT.
  */
 static void
@@ -506,6 +557,7 @@ xml_elem_end(void *arg, const XML_Char *name)
 	/* Close out text node, if applicable... */
 	if (NODE_TEXT == ps->node) {
 		assert(NULL != ps->cur);
+		pnode_trim(ps->cur);
 		ps->cur = ps->cur->parent;
 		assert(NULL != ps->cur);
 		ps->node = ps->cur->node;
@@ -517,6 +569,9 @@ xml_elem_end(void *arg, const XML_Char *name)
 		ps->node = ps->cur->node;
 }
 
+/*
+ * Recursively free a node (NULL is ok).
+ */
 static void
 pnode_free(struct pnode *pn)
 {
@@ -534,6 +589,9 @@ pnode_free(struct pnode *pn)
 	free(pn);
 }
 
+/*
+ * Unlink a node from its parent and pnode_free() it.
+ */
 static void
 pnode_unlink(struct pnode *pn)
 {
@@ -543,6 +601,9 @@ pnode_unlink(struct pnode *pn)
 	pnode_free(pn);
 }
 
+/*
+ * Unlink all children of a node and pnode_free() them.
+ */
 static void
 pnode_unlinksub(struct pnode *pn)
 {
@@ -551,6 +612,9 @@ pnode_unlinksub(struct pnode *pn)
 		pnode_unlink(TAILQ_FIRST(&pn->childq));
 }
 
+/*
+ * Reset the lookaside buffer.
+ */
 static void
 bufclear(struct parse *p)
 {
@@ -558,6 +622,11 @@ bufclear(struct parse *p)
 	p->b[p->bsz = 0] = '\0';
 }
 
+/*
+ * Append NODE_TEXT contents to the current buffer, growing the
+ * allocation if necessary.
+ * The buffer is ALWAYS NUL-terminated.
+ */
 static void
 bufappend(struct parse *p, struct pnode *pn)
 {
@@ -575,6 +644,12 @@ bufappend(struct parse *p, struct pnode *pn)
 	p->b[p->bsz] = '\0';
 }
 
+/*
+ * Recursively append all NODE_TEXT nodes to the buffer.
+ * This descends into non-text nodes, but doesn't do anything beyond
+ * them.
+ * In other words, this is a recursive text grok.
+ */
 static void
 bufappend_r(struct parse *p, struct pnode *pn)
 {
@@ -587,8 +662,7 @@ bufappend_r(struct parse *p, struct pnode *pn)
 }
 
 /*
- * Print text presumably on a macro line.
- * Ignore any child macros.
+ * Recursively print text presumably on a macro line.
  * Convert all whitespace to regular spaces.
  */
 static void
@@ -638,6 +712,44 @@ pnode_printmacroline(struct parse *p, struct pnode *pn
 }
 
 static void
+pnode_printmopen(struct parse *p)
+{
+	/*
+	 * Open a macro: '.' on a fresh line, else a separating blank.
+	 */
+	putchar(p->newln ? '.' : ' ');
+	p->newln = 0;
+}
+
+static void
+pnode_printmclose(struct parse *p, int sv)
+{
+	/* End the line only if "sv" is set and we are mid-line. */
+	if ( ! sv || p->newln)
+		return;
+	putchar('\n');
+	p->newln = 1;
+}
+
+/*
+ * If the SYNOPSIS macro has a superfluous title, kill it.
+ */
+static void
+pnode_printrefsynopsisdiv(struct parse *p, struct pnode *pn)
+{
+	struct pnode	*kid;
+
+	TAILQ_FOREACH(kid, &pn->childq, child)
+		if (NODE_TITLE == kid->node)
+			break;
+	if (NULL != kid)	/* only the first title is dropped */
+		pnode_unlink(kid);
+}
+
+/*
+ * Start a hopefully-named `Sh' section.
+ */
+static void
 pnode_printrefsect(struct parse *p, struct pnode *pn)
 {
 	struct pnode	*pp;
@@ -648,12 +760,16 @@ pnode_printrefsect(struct parse *p, struct pnode *pn)
 
 	fputs(".Sh ", stdout);
 
-	if (NULL != pp)
+	if (NULL != pp) {
 		pnode_printmacroline(p, pp);
-	else
+		pnode_unlink(pp);
+	} else
 		puts("UNKNOWN");
 }
 
+/*
+ * Start a reference, extracting the title and volume.
+ */
 static void
 pnode_printciterefentry(struct parse *p, struct pnode *pn)
 {
@@ -696,6 +812,7 @@ pnode_printrefmeta(struct parse *p, struct pnode *pn)
 	fputs(".Dt ", stdout);
 
 	if (NULL != title) {
+		/* FIXME: uppercase. */
 		pnode_printmacrolinepart(p, title);
 		putchar(' ');
 	} else
@@ -780,30 +897,64 @@ pnode_printfuncprototype(struct parse *p, struct pnode
 	puts(".Fc");
 }
 
-/* TODO: handle "optional" values. */
+/* 
+ * The <arg> element is more complicated than it should be because text
+ * nodes are treated like ".Ar foo", but non-text nodes need to be
+ * re-sent into the printer (i.e., without the preceding ".Ar").
+ * TODO: handle "optional" attribute.
+ */
 static void
-pnode_printarg(struct parse *p, struct pnode *pn, int nested)
+pnode_printarg(struct parse *p, struct pnode *pn)
 {
 	struct pnode	*pp;
-	int		 sv = nested;
 
-	if ( ! nested) 
-		fputs(".", stdout);
-	nested = 1;
-	TAILQ_FOREACH(pp, &pn->childq, child)
-		if (NODE_OPTION == pp->node) {
-			fputs("Fl ", stdout);
-			pnode_printmacrolinepart(p, pp);
-		} else if (NODE_TEXT == pp->node) {
+	TAILQ_FOREACH(pp, &pn->childq, child) {
+		if (NODE_TEXT == pp->node) {
+			pnode_printmopen(p);
 			fputs("Ar ", stdout);
-			pnode_printmacrolinepart(p, pp);
-		} else if (NODE_ARG == pp->node)
-			pnode_printarg(p, pp, nested);
+		} 
+		pnode_print(p, pp);
+	}
+}
 
-	if ( ! sv)
-		puts("");
+/* 
+ * Recursively search and return the first instance of "node".
+ */
+static struct pnode *
+pnode_findfirst(struct pnode *pn, enum nodeid node)
+{
+	struct pnode	*kid, *hit;
+
+	/* Pre-order walk: test each child before its descendants. */
+	TAILQ_FOREACH(kid, &pn->childq, child) {
+		if (kid->node == node)
+			return(kid);
+		hit = pnode_findfirst(kid, node);
+		if (NULL != hit)
+			return(hit);
+	}
+	return(NULL);
 }
 
+static void
+pnode_printprologue(struct parse *p, struct pnode *pn)
+{
+	struct pnode	*meta = NULL;
+
+	/* "pn" is unused: the search always starts at p->root. */
+	if (NULL != p->root)
+		meta = pnode_findfirst(p->root, NODE_REFMETA);
+	if (NULL == meta) {
+		puts(".\\\" Supplying bogus prologue...");
+		puts(".Dd $Mdocdate" "$");
+		puts(".Dt UNKNOWN 1");
+		puts(".Os");
+		return;
+	}
+	pnode_printrefmeta(p, meta);
+	pnode_unlink(meta);
+}
+
 /*
  * Print a parsed node (or ignore it--whatever).
  * This is a recursive function.
@@ -814,84 +965,95 @@ pnode_print(struct parse *p, struct pnode *pn)
 {
 	struct pnode	*pp;
 	char		*cp;
-	int		 last;
+	int		 last, sv;
 
 	if (NULL == pn)
 		return;
 
-	if (NODE_TEXT != pn->node && NODE_ROOT != pn->node)
-		printf(".\\\" %s\n", nodes[pn->node].name);
+	sv = p->newln;
 
 	switch (pn->node) {
 	case (NODE_ARG):
-		pnode_printarg(p, pn, 0);
+		pnode_printarg(p, pn);
 		pnode_unlinksub(pn);
 		break;
 	case (NODE_CITEREFENTRY):
+		assert(p->newln);
 		pnode_printciterefentry(p, pn);
 		pnode_unlinksub(pn);
 		break;
 	case (NODE_CODE):
-		fputs(".Li ", stdout);
-		pnode_printmacroline(p, pn);
-		pnode_unlinksub(pn);
+		pnode_printmopen(p);
+		fputs("Li ", stdout);
 		break;
 	case (NODE_COMMAND):
-		fputs(".Nm ", stdout);
-		pnode_printmacroline(p, pn);
-		pnode_unlinksub(pn);
+		pnode_printmopen(p);
+		fputs("Nm ", stdout);
 		break;
 	case (NODE_FUNCTION):
-		fputs(".Fn ", stdout);
-		pnode_printmacroline(p, pn);
-		pnode_unlinksub(pn);
+		pnode_printmopen(p);
+		fputs("Fn ", stdout);
 		break;
 	case (NODE_FUNCPROTOTYPE):
+		assert(p->newln);
 		pnode_printfuncprototype(p, pn);
 		pnode_unlinksub(pn);
 		break;
 	case (NODE_FUNCSYNOPSISINFO):
-		fputs(".Fd ", stdout);
-		pnode_printmacroline(p, pn);
-		pnode_unlinksub(pn);
+		pnode_printmopen(p);
+		fputs("Fd ", stdout);
 		break;
+	case (NODE_OPTION):
+		pnode_printmopen(p);
+		fputs("Fl ", stdout);
+		break;
 	case (NODE_PARA):
-		/* FIXME: not always. */
+		assert(p->newln);
 		puts(".Pp");
 		break;
 	case (NODE_PARAMETER):
-		fputs(".Fa \"", stdout);
+		/* Suppress non-text children... */
+		pnode_printmopen(p);
+		fputs("Fa \"", stdout);
 		pnode_printmacrolinepart(p, pn);
 		puts("\"");
 		pnode_unlinksub(pn);
 		break;
 	case (NODE_PROGRAMLISTING):
+		assert(p->newln);
 		puts(".Bd -literal");
 		break;
 	case (NODE_REFMETA):
-		pnode_printrefmeta(p, pn);
-		pnode_unlinksub(pn);
+		abort();
 		break;
 	case (NODE_REFNAME):
-		fputs(".Nm ", stdout);
-		pnode_printmacroline(p, pn);
+		/* Suppress non-text children... */
+		pnode_printmopen(p);
+		fputs("Nm ", stdout);
+		pnode_printmacrolinepart(p, pn);
 		pnode_unlinksub(pn);
-		return;
+		break;
 	case (NODE_REFNAMEDIV):
+		assert(p->newln);
 		puts(".Sh NAME");
 		break;
 	case (NODE_REFPURPOSE):
+		assert(p->newln);
 		fputs(".Nd ", stdout);
-		pnode_printmacroline(p, pn);
-		pnode_unlinksub(pn);
-		return;
+		break;
 	case (NODE_REFSYNOPSISDIV):
+		assert(p->newln);
+		pnode_printrefsynopsisdiv(p, pn);
 		puts(".Sh SYNOPSIS");
 		break;
 	case (NODE_REFSECT1):
+		assert(p->newln);
 		pnode_printrefsect(p, pn);
-		pnode_unlinksub(pn);
 		break;
+	case (NODE_STRUCTNAME):
+		pnode_printmopen(p);
+		fputs("Vt ", stdout);
+		break;
 	case (NODE_TEXT):
 		bufclear(p);
 		bufappend(p, pn);
@@ -901,6 +1063,7 @@ pnode_print(struct parse *p, struct pnode *pn)
 		 * XXX: all whitespace, including tabs (?).
 		 * Remember to escape control characters and escapes.
 		 */
+		assert(p->bsz);
 		for (last = '\n', cp = p->b; '\0' != *cp; ) {
 			if ('\n' == last) {
 				/* Consume all whitespace. */
@@ -916,8 +1079,7 @@ pnode_print(struct parse *p, struct pnode *pn)
 			if ('\\' == last)
 				putchar('e');
 		}
-		if ('\n' != last)
-			putchar('\n');
+		p->newln = 0;
 		break;
 	default:
 		break;
@@ -927,8 +1089,22 @@ pnode_print(struct parse *p, struct pnode *pn)
 		pnode_print(p, pp);
 
 	switch (pn->node) {
+	case (NODE_ARG):
+	case (NODE_CODE):
+	case (NODE_COMMAND):
+	case (NODE_FUNCTION):
+	case (NODE_FUNCSYNOPSISINFO):
+	case (NODE_OPTION):
+	case (NODE_PARAMETER):
+	case (NODE_REFNAME):
+	case (NODE_STRUCTNAME):
+	case (NODE_TEXT):
+		pnode_printmclose(p, sv);
+		break;
 	case (NODE_PROGRAMLISTING):
+		assert(p->newln);
 		puts(".Ed");
+		p->newln = 1;
 		break;
 	default:
 		break;
@@ -966,6 +1142,8 @@ readfile(XML_Parser xp, int fd, 
 		 * Exit when we've read all or errors have occured
 		 * during the parse sequence.
 		 */
+		p.newln = 1;
+		pnode_printprologue(&p, p.root);
 		pnode_print(&p, p.root);
 		pnode_free(p.root);
 		free(p.b);