/* $Id: docbook2mdoc.c,v 1.3 2014/03/28 02:46:40 kristaps Exp $ */
/*
* Copyright (c) 2014 Kristaps Dzonsons <kristaps@bsd.lv>
*
* Permission to use, copy, modify, and distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
* copyright notice and this permission notice appear in all copies.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/
#include <sys/queue.h>
#include <assert.h>
#include <ctype.h>
#include <expat.h>
#include <fcntl.h>
#include <getopt.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
/*
* All recognised node types.
*/
enum nodeid {
NODE_ROOT = 0, /* Must comes first. */
/* Alpha-ordered hereafter. */
NODE_CITEREFENTRY,
NODE_CODE,
NODE_FUNCDEF,
NODE_FUNCPROTOTYPE,
NODE_FUNCSYNOPSIS,
NODE_FUNCSYNOPSISINFO,
NODE_FUNCTION,
NODE_MANVOLNUM,
NODE_PARA,
NODE_PARAMDEF,
NODE_PARAMETER,
NODE_PROGRAMLISTING,
NODE_REFCLASS,
NODE_REFDESCRIPTOR,
NODE_REFENTRY,
NODE_REFENTRYTITLE,
NODE_REFMETA,
NODE_REFMISCINFO,
NODE_REFNAME,
NODE_REFNAMEDIV,
NODE_REFPURPOSE,
NODE_REFSECT1,
NODE_REFSYNOPSISDIV,
NODE_SYNOPSIS,
NODE_TEXT,
NODE_TITLE,
NODE__MAX
};
/*
 * Global parse state.
 * Keep this as simple and small as possible.
 * The b, bsz and mbsz members form a scratch string buffer that is
 * reset by bufclear() and filled by bufappend().
 */
struct parse {
enum nodeid node; /* current (NODE_ROOT if pre-tree) */
int stop; /* should we stop now? */
struct pnode *root; /* root of parse tree */
struct pnode *cur; /* current node in tree */
char *b; /* scratch buffer; NUL-terminated by bufclear()/bufappend() */
size_t bsz; /* bytes used in b, not counting the terminating NUL */
size_t mbsz; /* bytes allocated for b */
};
/*
 * Static description of one node type.
 */
struct node {
const char *name; /* XML element name, or NULL if there is none */
unsigned int flags; /* bitmask of the NODE_* flags below */
#define NODE_IGNTEXT 1 /* ignore all contained text */
};
/* Queue head type for an ordered list of parse nodes. */
TAILQ_HEAD(pnodeq, pnode);
/*
 * One node of the parse tree.
 * Text is stored in nodes of type NODE_TEXT; xml_char() appends the
 * character data to "b" without a terminating NUL.
 */
struct pnode {
enum nodeid node; /* node type */
char *b; /* binary data buffer */
size_t bsz; /* data buffer size */
struct pnode *parent; /* parent (or NULL if top) */
struct pnodeq childq; /* queue of children */
TAILQ_ENTRY(pnode) child; /* linkage within the parent's childq */
};
/*
 * Element name and flags for each node type, indexed by enum nodeid.
 * The entries are positional, so they must stay in the same order as
 * the enumeration.
 * Entries with a NULL name are skipped when matching element names.
 */
static const struct node nodes[NODE__MAX] = {
{ NULL, 0 }, /* NODE_ROOT */
{ "citerefentry", NODE_IGNTEXT },
{ "code", 0 },
{ "funcdef", 0 },
{ "funcprototype", NODE_IGNTEXT },
{ "funcsynopsis", NODE_IGNTEXT },
{ "funcsynopsisinfo", 0 },
{ "function", 0 },
{ "manvolnum", 0 },
{ "para", 0 },
{ "paramdef", 0 },
{ "parameter", 0 },
{ "programlisting", 0 },
{ "refclass", NODE_IGNTEXT },
{ "refdescriptor", NODE_IGNTEXT },
{ "refentry", NODE_IGNTEXT },
{ "refentrytitle", 0 },
{ "refmeta", NODE_IGNTEXT },
{ "refmiscinfo", NODE_IGNTEXT },
{ "refname", 0 },
{ "refnamediv", NODE_IGNTEXT },
{ "refpurpose", 0 },
{ "refsect1", 0 },
{ "refsynopsisdiv", NODE_IGNTEXT },
{ "synopsis", 0 },
{ NULL, 0 }, /* NODE_TEXT */
{ "title", 0 },
};
/*
 * Look up whether "parent" is a valid parent for "node".
 * Returns 1 if a node of type "node" may appear directly beneath a
 * node of type "parent" and 0 if it may not.
 * NODE_ROOT has no valid parent; NODE_TEXT is accepted under anything.
 * Aborts if "node" is not a real node type (NODE__MAX or out of range).
 */
static int
isparent(enum nodeid node, enum nodeid parent)
{

	switch (node) {
	case (NODE_ROOT):
		return(0);
	case (NODE_CITEREFENTRY):
		switch (parent) {
		case (NODE_FUNCSYNOPSISINFO):
		case (NODE_PARA):
		case (NODE_PROGRAMLISTING):
		case (NODE_REFDESCRIPTOR):
		case (NODE_REFENTRYTITLE):
		case (NODE_REFNAME):
		case (NODE_REFPURPOSE):
		case (NODE_SYNOPSIS):
		case (NODE_TITLE):
			return(1);
		default:
			break;
		}
		return(0);
	case (NODE_CODE):
		switch (parent) {
		case (NODE_FUNCSYNOPSISINFO):
		case (NODE_PARA):
		case (NODE_PROGRAMLISTING):
		case (NODE_REFDESCRIPTOR):
		case (NODE_REFENTRYTITLE):
		case (NODE_REFNAME):
		case (NODE_REFPURPOSE):
		case (NODE_SYNOPSIS):
		case (NODE_TITLE):
			return(1);
		default:
			break;
		}
		return(0);
	case (NODE_FUNCDEF):
		return(NODE_FUNCPROTOTYPE == parent);
	case (NODE_FUNCPROTOTYPE):
		return(NODE_FUNCSYNOPSIS == parent);
	case (NODE_FUNCSYNOPSIS):
		switch (parent) {
		case (NODE_PARA):
		case (NODE_REFSECT1):
		case (NODE_REFSYNOPSISDIV):
			return(1);
		default:
			break;
		}
		return(0);
	case (NODE_FUNCSYNOPSISINFO):
		return(NODE_FUNCSYNOPSIS == parent);
	case (NODE_FUNCTION):
		switch (parent) {
		case (NODE_CODE):
		case (NODE_FUNCDEF):
		case (NODE_FUNCSYNOPSISINFO):
		case (NODE_PARA):
		case (NODE_REFDESCRIPTOR):
		case (NODE_REFENTRYTITLE):
		case (NODE_REFNAME):
		case (NODE_REFPURPOSE):
		case (NODE_SYNOPSIS):
		case (NODE_TITLE):
			return(1);
		default:
			break;
		}
		return(0);
	case (NODE_MANVOLNUM):
		switch (parent) {
		case (NODE_CITEREFENTRY):
		case (NODE_REFMETA):
			return(1);
		default:
			break;
		}
		return(0);
	case (NODE_PARA):
		switch (parent) {
		case (NODE_REFSECT1):
		case (NODE_REFSYNOPSISDIV):
			return(1);
		default:
			break;
		}
		return(0);
	case (NODE_PARAMDEF):
		return(NODE_FUNCPROTOTYPE == parent);
	case (NODE_PARAMETER):
		switch (parent) {
		case (NODE_CODE):
		case (NODE_FUNCSYNOPSISINFO):
		case (NODE_PARA):
		case (NODE_PARAMDEF):
		case (NODE_REFDESCRIPTOR):
		case (NODE_REFENTRYTITLE):
		case (NODE_REFNAME):
		case (NODE_REFPURPOSE):
		case (NODE_SYNOPSIS):
		case (NODE_TITLE):
			return(1);
		default:
			break;
		}
		return(0);
	case (NODE_PROGRAMLISTING):
		switch (parent) {
		case (NODE_PARA):
		case (NODE_REFSECT1):
		case (NODE_REFSYNOPSISDIV):
			return(1);
		default:
			break;
		}
		return(0);
	case (NODE_REFCLASS):
		return(parent == NODE_REFNAMEDIV);
	case (NODE_REFDESCRIPTOR):
		return(parent == NODE_REFNAMEDIV);
	case (NODE_REFENTRY):
		return(parent == NODE_ROOT);
	case (NODE_REFENTRYTITLE):
		switch (parent) {
		case (NODE_CITEREFENTRY):
		case (NODE_REFMETA):
			return(1);
		default:
			break;
		}
		/*
		 * Reject every other parent here.  Without this return
		 * control would fall into the NODE_REFMETA case below and
		 * accept a refentry as the parent of a refentrytitle.
		 */
		return(0);
	case (NODE_REFMETA):
		return(parent == NODE_REFENTRY);
	case (NODE_REFMISCINFO):
		return(parent == NODE_REFMETA);
	case (NODE_REFNAME):
		return(parent == NODE_REFNAMEDIV);
	case (NODE_REFNAMEDIV):
		return(parent == NODE_REFENTRY);
	case (NODE_REFPURPOSE):
		return(parent == NODE_REFNAMEDIV);
	case (NODE_REFSECT1):
		return(parent == NODE_REFENTRY);
	case (NODE_REFSYNOPSISDIV):
		return(parent == NODE_REFENTRY);
	case (NODE_SYNOPSIS):
		switch (parent) {
		case (NODE_REFSYNOPSISDIV):
		case (NODE_REFSECT1):
			return(1);
		default:
			break;
		}
		return(0);
	case (NODE_TITLE):
		switch (parent) {
		case (NODE_REFSECT1):
		case (NODE_REFSYNOPSISDIV):
			return(1);
		default:
			break;
		}
		return(0);
	case (NODE_TEXT):
		return(1);
	case (NODE__MAX):
		break;
	}

	/* Not a node type: this is a programming error. */
	abort();
	return(0);
}
/*
 * Character-data callback.
 * Does nothing if parsing has stopped, if no tree is being built yet,
 * or if the current node type is flagged NODE_IGNTEXT.
 * Otherwise the "sz" bytes at "p" are appended to the current text
 * node; a new text node is first created beneath the current node if
 * we are not already inside one.
 * Exits the program if memory cannot be allocated.
 */
static void
xml_char(void *arg, const XML_Char *p, int sz)
{
	struct parse	*ps = arg;
	struct pnode	*txt;
	char		*grown;
	size_t		 len;

	/* Stopped or no tree yet. */
	if (ps->stop || NODE_ROOT == ps->node)
		return;

	assert(NULL != ps->cur);

	/* Not supposed to be collecting text. */
	if (NODE_IGNTEXT & nodes[ps->node].flags)
		return;

	/* Open a text node unless one is already being filled. */
	if (NODE_TEXT != ps->node) {
		txt = calloc(1, sizeof(struct pnode));
		if (NULL == txt) {
			perror(NULL);
			exit(EXIT_FAILURE);
		}
		txt->node = NODE_TEXT;
		txt->parent = ps->cur;
		TAILQ_INIT(&txt->childq);
		TAILQ_INSERT_TAIL(&ps->cur->childq, txt, child);
		ps->node = NODE_TEXT;
		ps->cur = txt;
		assert(NULL != ps->root);
	}

	/* Grow the text node's buffer and copy the new bytes in. */
	assert(sz >= 0);
	len = (size_t)sz;
	grown = realloc(ps->cur->b, ps->cur->bsz + len);
	if (NULL == grown) {
		perror(NULL);
		exit(EXIT_FAILURE);
	}
	ps->cur->b = grown;
	memcpy(grown + ps->cur->bsz, p, len);
	ps->cur->bsz += len;
}
/*
* Begin an element.
* First, look for the element.
* If we don't find it and we're not parsing, keep going.
* If we don't find it (and we're parsing), puke and exit.
* If we find it but we're not parsing yet (i.e., it's not a refentry
* and thus out of context), keep going.
* If we're at the root and already have a tree, puke and exit.
* Make sure that the element is in the right context.
* Lastly, put the node onto our parse tree and continue.
*/
static void
xml_elem_start(void *arg, const XML_Char *name, const XML_Char **atts)
{
struct parse *ps = arg;
enum nodeid node;
struct pnode *dat;
if (ps->stop)
return;
/* Close out text node, if applicable... */
if (NODE_TEXT == ps->node) {
assert(NULL != ps->cur);
ps->cur = ps->cur->parent;
assert(NULL != ps->cur);
ps->node = ps->cur->node;
}
for (node = 0; node < NODE__MAX; node++)
if (NULL == nodes[node].name)
continue;
else if (0 == strcmp(nodes[node].name, name))
break;
if (NODE__MAX == node && NODE_ROOT == ps->node) {
fprintf(stderr, "%s: ignoring node\n", name);
return;
} else if (NODE__MAX == node) {
fprintf(stderr, "%s: unknown node\n", name);
ps->stop = 1;
return;
} else if (NODE_ROOT == ps->node && NULL != ps->root) {
fprintf(stderr, "%s: reentering?\n", name);
ps->stop = 1;
return;
} else if (NODE_ROOT == ps->node && NODE_REFENTRY != node) {
fprintf(stderr, "%s: known node w/o context\n", name);
return;
} else if ( ! isparent(node, ps->node)) {
fprintf(stderr, "%s: bad parent\n", name);
ps->stop = 1;
return;
}
if (NULL == (dat = calloc(1, sizeof(struct pnode)))) {
perror(NULL);
exit(EXIT_FAILURE);
}
dat->node = ps->node = node;
dat->parent = ps->cur;
TAILQ_INIT(&dat->childq);
if (NULL != ps->cur)
TAILQ_INSERT_TAIL(&ps->cur->childq, dat, child);
ps->cur = dat;
if (NULL == ps->root)
ps->root = dat;
}
/*
* Roll up the parse tree.
* Does nothing else special.
* If we hit the root, then assign ourselves as the NODE_ROOT.
*/
static void
xml_elem_end(void *arg, const XML_Char *name)
{
struct parse *ps = arg;
if (ps->stop || NODE_ROOT == ps->node)
return;
/* Close out text node, if applicable... */
if (NODE_TEXT == ps->node) {
assert(NULL != ps->cur);
ps->cur = ps->cur->parent;
assert(NULL != ps->cur);
ps->node = ps->cur->node;
}
if (NULL == (ps->cur = ps->cur->parent))
ps->node = NODE_ROOT;
else
ps->node = ps->cur->node;
}
/*
 * Recursively free a node, its data buffer and all of its children.
 * The node is not removed from its parent's queue; see pnode_unlink().
 * Passing NULL is allowed and does nothing.
 */
static void
pnode_free(struct pnode *pn)
{
	struct pnode	*kid;

	if (NULL == pn)
		return;

	/* Detach and free the children one at a time, front first. */
	for (;;) {
		kid = TAILQ_FIRST(&pn->childq);
		if (NULL == kid)
			break;
		TAILQ_REMOVE(&pn->childq, kid, child);
		pnode_free(kid);
	}

	free(pn->b);
	free(pn);
}
/*
 * Remove a node from its parent's child queue (if it has a parent)
 * and free it together with everything beneath it.
 */
static void
pnode_unlink(struct pnode *pn)
{
	struct pnode	*up;

	up = pn->parent;
	if (NULL != up)
		TAILQ_REMOVE(&up->childq, pn, child);

	pnode_free(pn);
}
/*
 * Reset the scratch buffer to the empty string.
 * The buffer itself must already be allocated.
 */
static void
bufclear(struct parse *p)
{

	p->bsz = 0;
	p->b[0] = '\0';
}
/*
 * Append the data of text node "pn" to the scratch buffer, growing the
 * buffer when needed and keeping it NUL-terminated.
 * Exits the program if memory cannot be allocated.
 */
static void
bufappend(struct parse *p, struct pnode *pn)
{
	size_t	 need;

	assert(NODE_TEXT == pn->node);

	/* Room for what we have, what we add, and the NUL. */
	need = p->bsz + pn->bsz + 1;
	if (need > p->mbsz) {
		p->mbsz = need;
		p->b = realloc(p->b, need);
		if (NULL == p->b) {
			perror(NULL);
			exit(EXIT_FAILURE);
		}
	}

	memcpy(p->b + p->bsz, pn->b, pn->bsz);
	p->bsz += pn->bsz;
	p->b[p->bsz] = '\0';
}
/*
 * Append to the scratch buffer the data of every text node found at
 * or beneath "pn", visiting the tree depth-first in child order.
 */
static void
bufappend_r(struct parse *p, struct pnode *pn)
{
	struct pnode	*kid;

	if (NODE_TEXT == pn->node)
		bufappend(p, pn);

	for (kid = TAILQ_FIRST(&pn->childq); NULL != kid;
	     kid = TAILQ_NEXT(kid, child))
		bufappend_r(p, kid);
}
/*
 * Print text presumably on a macro line.
 * All text at or beneath "pn" is collected into the scratch buffer, so
 * child elements contribute only their text (child macros are ignored).
 * All whitespace is converted to regular spaces and leading space is
 * skipped; no trailing newline is written.
 * A word consisting of an upper-case letter followed by one or two
 * lower-case letters looks like a macro name and is prefixed by "\&".
 * Every backslash is written as "\e".
 */
static void
pnode_printmacrolinepart(struct parse *p, struct pnode *pn)
{
	char	*cp;

	bufclear(p);
	bufappend_r(p, pn);

	/*
	 * The <ctype.h> classifiers are only defined for values that fit
	 * in an unsigned char (or EOF).  Plain char may be negative for
	 * non-ASCII bytes, so always convert through unsigned char.
	 */

	/* Convert all space to spaces. */
	for (cp = p->b; '\0' != *cp; cp++)
		if (isspace((unsigned char)*cp))
			*cp = ' ';

	/* Skip leading space. */
	for (cp = p->b; isspace((unsigned char)*cp); cp++)
		/* Spin. */ ;

	for ( ; '\0' != *cp; cp++) {
		/* Escape us if we look like a macro. */
		if ((cp == p->b || ' ' == *(cp - 1)) &&
		    isupper((unsigned char)*cp) &&
		    '\0' != *(cp + 1) &&
		    islower((unsigned char)*(cp + 1)) &&
		    ('\0' == *(cp + 2) ||
		     ' ' == *(cp + 2) ||
		     (islower((unsigned char)*(cp + 2)) &&
		      ('\0' == *(cp + 3) ||
		       ' ' == *(cp + 3)))))
			fputs("\\&", stdout);
		putchar(*cp);
		/* If we're a character escape, escape us. */
		if ('\\' == *cp)
			putchar('e');
	}
}
/*
 * Same as pnode_printmacrolinepart(), followed by a newline.
 * If there is no text, only the newline is written.
 */
static void
pnode_printmacroline(struct parse *p, struct pnode *pn)
{

	pnode_printmacrolinepart(p, pn);
	fputc('\n', stdout);
}
/*
 * Print the section header (.Sh) for a section node.
 * The first title child supplies the section name and is then removed
 * from the tree; without a title the section is called UNKNOWN.
 */
static void
pnode_printrefsect(struct parse *p, struct pnode *pn)
{
	struct pnode	*title;

	/* Find the first title among the direct children. */
	for (title = TAILQ_FIRST(&pn->childq); NULL != title;
	     title = TAILQ_NEXT(title, child))
		if (NODE_TITLE == title->node)
			break;

	if (NULL == title) {
		puts(".Sh UNKNOWN");
		return;
	}

	fputs(".Sh ", stdout);
	pnode_printmacroline(p, title);
	pnode_unlink(title);
}
/*
 * Print a cross reference (.Xr) from a citerefentry node.
 * The last refentrytitle child gives the name ("unknown" if absent)
 * and the last manvolnum child gives the section ("1" if absent).
 * The children that were used are removed from the tree.
 */
static void
pnode_printciterefentry(struct parse *p, struct pnode *pn)
{
	struct pnode	*kid, *name, *sect;

	name = NULL;
	sect = NULL;

	TAILQ_FOREACH(kid, &pn->childq, child) {
		switch (kid->node) {
		case (NODE_MANVOLNUM):
			sect = kid;
			break;
		case (NODE_REFENTRYTITLE):
			name = kid;
			break;
		default:
			break;
		}
	}

	fputs(".Xr ", stdout);
	if (NULL == name)
		fputs("unknown", stdout);
	else {
		pnode_printmacrolinepart(p, name);
		pnode_unlink(name);
	}

	putchar(' ');
	if (NULL == sect) {
		puts("1");
		return;
	}
	pnode_printmacroline(p, sect);
	pnode_unlink(sect);
}
/*
 * Print the document prologue (.Dd, .Dt, .Os) from a refmeta node.
 * The last refentrytitle child gives the title ("UNKNOWN" if absent)
 * and the last manvolnum child gives the section ("1" if absent).
 * The children that were used are removed from the tree.
 */
static void
pnode_printrefmeta(struct parse *p, struct pnode *pn)
{
	struct pnode	*kid, *name, *sect;

	name = NULL;
	sect = NULL;

	TAILQ_FOREACH(kid, &pn->childq, child) {
		switch (kid->node) {
		case (NODE_MANVOLNUM):
			sect = kid;
			break;
		case (NODE_REFENTRYTITLE):
			name = kid;
			break;
		default:
			break;
		}
	}

	/* The literal is split in two in the source; keep it that way. */
	puts(".Dd $Mdocdate" "$");

	fputs(".Dt ", stdout);
	if (NULL == name)
		fputs("UNKNOWN", stdout);
	else {
		pnode_printmacrolinepart(p, name);
		pnode_unlink(name);
	}

	putchar(' ');
	if (NULL == sect)
		puts("1");
	else {
		pnode_printmacroline(p, sect);
		pnode_unlink(sect);
	}

	puts(".Os");
}
/*
 * Print the opening of a function block from a funcdef node.
 * The last direct text child is printed as the return type (.Ft), if
 * there is one; the last function child names the function on the
 * .Fo line, or UNKNOWN is used if there is none.
 * Nothing is removed from the tree here.
 */
static void
pnode_printfuncdef(struct parse *p, struct pnode *pn)
{
	struct pnode	*kid, *rettype, *fname;

	rettype = NULL;
	fname = NULL;

	TAILQ_FOREACH(kid, &pn->childq, child) {
		switch (kid->node) {
		case (NODE_TEXT):
			rettype = kid;
			break;
		case (NODE_FUNCTION):
			fname = kid;
			break;
		default:
			break;
		}
	}

	if (NULL != rettype) {
		fputs(".Ft ", stdout);
		pnode_printmacroline(p, rettype);
	}

	if (NULL == fname) {
		puts(".Fo UNKNOWN");
		return;
	}
	fputs(".Fo ", stdout);
	pnode_printmacroline(p, fname);
}
/*
 * Print one function argument (.Fa) from a paramdef node.
 * The argument is quoted and consists of the last direct text child
 * (the type, if present) and the last parameter child (the name, or
 * UNKNOWN if absent), separated by a space.
 * Nothing is removed from the tree here.
 */
static void
pnode_printparamdef(struct parse *p, struct pnode *pn)
{
	struct pnode	*kid, *type, *name;

	type = NULL;
	name = NULL;

	TAILQ_FOREACH(kid, &pn->childq, child) {
		switch (kid->node) {
		case (NODE_TEXT):
			type = kid;
			break;
		case (NODE_PARAMETER):
			name = kid;
			break;
		default:
			break;
		}
	}

	fputs(".Fa \"", stdout);

	if (NULL != type) {
		pnode_printmacrolinepart(p, type);
		putchar(' ');
	}

	if (NULL == name)
		fputs("UNKNOWN", stdout);
	else
		pnode_printmacrolinepart(p, name);

	puts("\"");
}
/*
 * Print a whole function prototype (.Ft/.Fo, .Fa..., .Fc) from a
 * funcprototype node.
 * The first funcdef child opens the block and is removed; without one
 * the block is opened as UNKNOWN.
 * Every remaining child is then consumed in order: paramdef children
 * are printed as arguments, and all children are removed from the tree
 * and freed, so "pn" has no children left on return.
 */
static void
pnode_printfuncprototype(struct parse *p, struct pnode *pn)
{
	struct pnode	*pp, *fdef;

	TAILQ_FOREACH(fdef, &pn->childq, child)
		if (NODE_FUNCDEF == fdef->node)
			break;

	if (NULL != fdef) {
		pnode_printfuncdef(p, fdef);
		pnode_unlink(fdef);
	} else
		puts(".Fo UNKNOWN");

	/*
	 * pnode_unlink() frees the node, so a plain TAILQ_FOREACH would
	 * read the freed node to find its successor.  Since every child
	 * is removed anyway, keep taking the head until the queue is
	 * empty.
	 */
	while (NULL != (pp = TAILQ_FIRST(&pn->childq))) {
		if (NODE_PARAMDEF == pp->node)
			pnode_printparamdef(p, pp);
		pnode_unlink(pp);
	}

	puts(".Fc");
}
/*
 * Print a parsed node (or ignore it--whatever).
 * This is a recursive function: the node's own output is written
 * first, then each child is printed in order, then any closing macro.
 * Every node other than text and root is preceded by a comment line
 * naming the element.
 * Passing NULL is allowed and does nothing.
 * FIXME: macro line continuation?
 */
static void
pnode_print(struct parse *p, struct pnode *pn)
{
	struct pnode	*pp;
	char		*cp;
	int		 last;

	if (NULL == pn)
		return;

	if (NODE_TEXT != pn->node && NODE_ROOT != pn->node)
		printf(".\\\" %s\n", nodes[pn->node].name);

	/*
	 * NOTE(review): the CODE, FUNCTION, FUNCSYNOPSISINFO and
	 * PARAMETER cases print all of their text on the macro line and
	 * then fall out of the switch, so the children are walked below
	 * and their text appears to be printed a second time.  REFNAME
	 * and REFPURPOSE return early instead.  Confirm which behaviour
	 * is intended.
	 */
	switch (pn->node) {
	case (NODE_CITEREFENTRY):
		pnode_printciterefentry(p, pn);
		break;
	case (NODE_CODE):
		fputs(".Li ", stdout);
		pnode_printmacroline(p, pn);
		break;
	case (NODE_FUNCTION):
		fputs(".Fn ", stdout);
		pnode_printmacroline(p, pn);
		break;
	case (NODE_FUNCPROTOTYPE):
		pnode_printfuncprototype(p, pn);
		break;
	case (NODE_FUNCSYNOPSISINFO):
		fputs(".Fd ", stdout);
		pnode_printmacroline(p, pn);
		break;
	case (NODE_PARA):
		/* FIXME: not always. */
		puts(".Pp");
		break;
	case (NODE_PARAMETER):
		fputs(".Fa \"", stdout);
		pnode_printmacrolinepart(p, pn);
		puts("\"");
		break;
	case (NODE_PROGRAMLISTING):
		puts(".Bd -literal");
		break;
	case (NODE_REFMETA):
		pnode_printrefmeta(p, pn);
		break;
	case (NODE_REFNAME):
		fputs(".Nm ", stdout);
		pnode_printmacroline(p, pn);
		/* Text already printed: do not descend. */
		return;
	case (NODE_REFNAMEDIV):
		puts(".Sh NAME");
		break;
	case (NODE_REFPURPOSE):
		fputs(".Nd ", stdout);
		pnode_printmacroline(p, pn);
		/* Text already printed: do not descend. */
		return;
	case (NODE_REFSYNOPSISDIV):
		puts(".Sh SYNOPSIS");
		break;
	case (NODE_REFSECT1):
		pnode_printrefsect(p, pn);
		break;
	case (NODE_TEXT):
		bufclear(p);
		bufappend(p, pn);
		/*
		 * Output all characters, squeezing out whitespace
		 * at the start of each line (so blank lines vanish).
		 * XXX: all whitespace, including tabs (?).
		 * A leading quote or period is protected with "\&" so
		 * that the line is not taken for a macro, and every
		 * backslash is written as "\e".
		 * The <ctype.h> classifiers need an unsigned char
		 * value; plain char may be negative for non-ASCII
		 * bytes.
		 */
		for (last = '\n', cp = p->b; '\0' != *cp; ) {
			if ('\n' == last) {
				/* Consume all whitespace. */
				if (isspace((unsigned char)*cp)) {
					while (isspace((unsigned char)*cp))
						cp++;
					continue;
				} else if ('\'' == *cp || '.' == *cp)
					fputs("\\&", stdout);
			}
			putchar(last = *cp++);
			/* If we're a character escape, escape us. */
			if ('\\' == last)
				putchar('e');
		}
		/* Make sure the text ends its line. */
		if ('\n' != last)
			putchar('\n');
		break;
	default:
		break;
	}

	TAILQ_FOREACH(pp, &pn->childq, child)
		pnode_print(p, pp);

	/* Close whatever was opened before the children. */
	switch (pn->node) {
	case (NODE_PROGRAMLISTING):
		puts(".Ed");
		break;
	default:
		break;
	}
}
/*
 * Loop around the read buffer until we've drained it of all data.
 * Invoke the parser context with each buffer fill.
 * Once the input ends, the parser reports an error, or the handlers
 * ask to stop, whatever tree has been built is printed and freed.
 *
 * "xp" is the parser, "fd" the descriptor to read from, "b" a read
 * buffer of "bsz" bytes and "fn" the file name used in messages.
 * Returns non-zero if all input was parsed without error and the
 * handlers did not stop the parse; returns 0 otherwise, including on
 * read errors and allocation failure.
 */
static int
readfile(XML_Parser xp, int fd,
	char *b, size_t bsz, const char *fn)
{
	struct parse	 p;
	int		 rc;
	ssize_t		 ssz;

	memset(&p, 0, sizeof(struct parse));

	/*
	 * The printing code resets and writes the scratch buffer without
	 * checking it, so it must exist before we go any further.
	 */
	p.mbsz = 1024;
	if (NULL == (p.b = malloc(p.mbsz))) {
		perror(NULL);
		return(0);
	}
	p.b[p.bsz = 0] = '\0';

	XML_SetCharacterDataHandler(xp, xml_char);
	XML_SetElementHandler(xp, xml_elem_start, xml_elem_end);
	XML_SetUserData(xp, &p);

	while ((ssz = read(fd, b, bsz)) >= 0) {
		/* A zero-length read marks the final chunk. */
		if (0 == (rc = XML_Parse(xp, b, (int)ssz, 0 == ssz)))
			fprintf(stderr, "%s: %s\n", fn,
				XML_ErrorString
				(XML_GetErrorCode(xp)));
		else if ( ! p.stop && ssz > 0)
			continue;
		/*
		 * Exit when we've read all or errors have occurred
		 * during the parse sequence.
		 */
		pnode_print(&p, p.root);
		pnode_free(p.root);
		free(p.b);
		return(0 != rc && ! p.stop);
	}

	/* Read error has occurred. */
	perror(fn);
	pnode_free(p.root);
	free(p.b);
	return(0);
}
/*
 * Convert one DocBook document to mdoc on standard output.
 * Takes no options and at most one argument, the input file; with no
 * argument or with "-" the input is read from standard input.
 * Exits with EXIT_SUCCESS only if the input could be opened and
 * readfile() reported a complete, error-free parse.
 */
int
main(int argc, char *argv[])
{
	XML_Parser	 xp;
	const char	*fname;
	char		*buf;
	int		 fd, rc;

	fname = "-";
	xp = NULL;
	buf = NULL;
	rc = 0;		/* non-zero means success */

	/* No options are accepted. */
	if (-1 != getopt(argc, argv, ""))
		return(EXIT_FAILURE);

	argc -= optind;
	argv += optind;

	if (argc > 1)
		return(EXIT_FAILURE);
	else if (argc > 0)
		fname = argv[0];

	/* Read from stdin or a file. */
	fd = 0 == strcmp(fname, "-") ?
		STDIN_FILENO : open(fname, O_RDONLY, 0);

	/*
	 * Open file for reading.
	 * Allocate a read buffer.
	 * Create the parser context.
	 * Dive directly into the parse.
	 * readfile() returns non-zero on success, which is the only way
	 * to reach a successful exit status.
	 */
	if (-1 == fd)
		perror(fname);
	else if (NULL == (buf = malloc(4096)))
		perror(NULL);
	else if (NULL == (xp = XML_ParserCreate(NULL)))
		perror(NULL);
	else if (readfile(xp, fd, buf, 4096, fname))
		rc = 1;

	if (NULL != xp)
		XML_ParserFree(xp);
	free(buf);
	/* Only close what we opened ourselves. */
	if (-1 != fd && STDIN_FILENO != fd)
		close(fd);

	return(rc ? EXIT_SUCCESS : EXIT_FAILURE);
}