pod2mdoc/pod2mdoc.c - view

Return to pod2mdoc.c CVS log

Up to [cvsweb.bsd.lv] / pod2mdoc

File: [cvsweb.bsd.lv] / pod2mdoc / pod2mdoc.c (download)

Revision 1.30, Tue Jul 15 19:00:48 2014 UTC (10 years, 11 months ago) by schwarze
Branch: MAIN
Changes since 1.29: +17 -21 lines

Invalid format codes were skipped including their contents.
Instead, just skip the code, but not the content.
Also improve the comments at the place in ordinary()
where formatcode() is called.

/*	$Id: pod2mdoc.c,v 1.30 2014/07/15 19:00:48 schwarze Exp $ */
/*
 * Copyright (c) 2014 Kristaps Dzonsons <kristaps@bsd.lv>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */
#include <sys/stat.h>
#include <sys/time.h>

#include <assert.h>
#include <ctype.h>
#include <fcntl.h>
#include <getopt.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

/*
 * In what section can we find Perl module manuals?
 * Sometimes (Mac OS X) it's 3pm, sometimes (OpenBSD, etc.) 3p.
 * XXX IF YOU CHANGE THIS, CHANGE POD2MDOC.1 AS WELL.
 */
#define	PERL_SECTION	"3p"

/*
 * Values supplied on the command line that override what would
 * otherwise be derived from the input file.
 * main() zeroes the structure, so a NULL member means "not given".
 */
struct	args {
	const char	*title; /* override "Dt" title (-n) */
	const char	*date; /* override "Dd" date (-d) */
	const char	*section; /* override "Dt" section (-s) */
};

enum	list {
	LIST_BULLET = 0,
	LIST_ENUM,
	LIST_TAG,
	LIST__MAX
};

/*
 * Sections that get special treatment.
 * Set by command() from the text of a =head1 paragraph.
 */
enum	sect {
	SECT_NONE = 0, /* any other section */
	SECT_NAME, /* NAME section */
	SECT_SYNOPSIS, /* SYNOPSIS section */
};

/*
 * Parser state carried from one paragraph to the next.
 * dofile() zeroes it before the first paragraph.
 */
struct	state {
	int		 parsing; /* zero before the first command or after =cut */
	int		 paused; /* in =begin and before =end */
	int		 haspar; /* in paragraph: do we need Pp? */
	enum sect	 sect; /* which section are we in? */
	const char	*fname; /* file being parsed */
#define	LIST_STACKSZ	 128
	enum list	 lstack[LIST_STACKSZ]; /* open lists */
	size_t		 lpos; /* number of open lists (next free slot) */
};

/*
 * POD formatting codes, in the same order as the fmts[] table.
 * FMT__MAX is what formatcode() ends up with for an unknown letter.
 */
enum	fmt {
	FMT_ITALIC, /* I<...> */
	FMT_BOLD, /* B<...> */
	FMT_CODE, /* C<...> */
	FMT_LINK, /* L<...> */
	FMT_ESCAPE, /* E<...> */
	FMT_FILE, /* F<...> */
	FMT_NBSP, /* S<...> */
	FMT_INDEX, /* X<...> */
	FMT_NULL, /* Z<...> */
	FMT__MAX
};

/*
 * POD command paragraphs, in the same order as the cmds[] table.
 * command() leaves its loop at CMD__MAX when no name matches.
 */
enum	cmd {
	CMD_POD = 0,
	CMD_HEAD1,
	CMD_HEAD2,
	CMD_HEAD3,
	CMD_HEAD4,
	CMD_OVER,
	CMD_ITEM,
	CMD_BACK,
	CMD_BEGIN,
	CMD_END,
	CMD_FOR,
	CMD_ENCODING,
	CMD_CUT,
	CMD__MAX
};

/*
 * Command names (without the leading '='), indexed by enum cmd.
 * command() matches these as prefixes of the paragraph text.
 */
static	const char *const cmds[CMD__MAX] = {
	"pod", 		/* CMD_POD */
	"head1",	/* CMD_HEAD1 */
	"head2",	/* CMD_HEAD2 */
	"head3",	/* CMD_HEAD3 */
	"head4",	/* CMD_HEAD4 */
	"over",		/* CMD_OVER */
	"item",		/* CMD_ITEM */
	"back",		/* CMD_BACK */
	"begin",	/* CMD_BEGIN */
	"end",		/* CMD_END */
	"for",		/* CMD_FOR */
	"encoding",	/* CMD_ENCODING */
	"cut"		/* CMD_CUT */
};

/*
 * The letter introducing each formatting code, indexed by enum fmt.
 */
static	const char fmts[FMT__MAX] = {
	'I',		/* FMT_ITALIC */
	'B',		/* FMT_BOLD */
	'C',		/* FMT_CODE */
	'L',		/* FMT_LINK */
	'E',		/* FMT_ESCAPE */
	'F',		/* FMT_FILE */
	'S',		/* FMT_NBSP */
	'X',		/* FMT_INDEX */
	'Z'		/* FMT_NULL */
};

/*
 * The character most recently written by the formatting routines
 * (or a stand-in for it, e.g. ' ' at the start of a macro line).
 * Consulted to decide on newlines, spacing, and "\&" escaping.
 */
static	int 	last;

/*
 * Given buf[*start] is at the start of an escape name (the text just
 * after the opening delimiter of an E<> code), read til the end of the
 * escape ('>') then try to do something with it.
 * Sets start to be one after the '>', or to "end" if the input runs out
 * before a '>' is seen.
 * Names that are too long for the local buffer are skipped as a whole,
 * including their closing '>', so that the '>' cannot be mistaken for
 * the terminator of an enclosing format code.
 */
static void
formatescape(const char *buf, size_t *start, size_t end)
{
	char		 esc[16]; /* no more needed */
	size_t		 i, max;

	max = sizeof(esc) - 1;
	i = 0;
	/* Read til our buffer is full. */
	while (*start < end && '>' != buf[*start] && i < max)
		esc[i++] = buf[(*start)++];
	esc[i] = '\0';

	if (i == max) {
		/* Too long... skip til we end. */
		while (*start < end && '>' != buf[*start])
			(*start)++;
		/* Step over the closing '>' as well, if there is one. */
		if (*start < end)
			(*start)++;
		return;
	} else if (*start >= end)
		return;

	assert('>' == buf[*start]);
	(*start)++;

	/*
	 * TODO: right now, we only recognise the named escapes.
	 * Just let the rest of them go.
	 */
	if (0 == strcmp(esc, "lt"))
		printf("\\(la");
	else if (0 == strcmp(esc, "gt"))
		printf("\\(ra");
	else if (0 == strcmp(esc, "vb"))
		printf("\\(ba");
	else if (0 == strcmp(esc, "sol"))
		printf("\\(sl");
	else
		return;

	/* Something other than whitespace was just written. */
	last = 'a';
}

/*
 * Run some heuristics to intuit a link format.
 * "start" points at the first byte of link content on entry and "dsz"
 * is the number of '<' in the opening delimiter.
 * If a macro is printed, "start" is set to the closing delimiter (the
 * first right-caret of it) so that the caller can safely just continue
 * processing, and 1 is returned.
 * If this is just an empty or unterminated tag, return 0; in that case
 * "start" may or may not have been advanced.
 */
static int
trylink(const char *buf, size_t *start, size_t end, size_t dsz)
{
	size_t		 linkstart, realend, linkend, content,
			 i, j, sep, stack;

	/*
	 * Scan to the start of the terminus.
	 * This function is more or less replicated in the formatcode()
	 * for null or index formatting codes.
	 * However, we're slightly different because we might have
	 * nested escapes we need to ignore.
	 */
	stack = 0;
	for (linkstart = realend = *start; realend < end; realend++) {
		if ('<' == buf[realend])
			stack++;
		if ('>' != buf[realend])
			continue;
		if (stack > 0) {
			/* Closes a nested code, not ours. */
			stack--;
			continue;
		}
		if (dsz == 1)
			break;
		/* Custom delimiter: need " " then dsz carets. */
		assert(realend > 0);
		if (' ' != buf[realend - 1])
			continue;
		for (i = realend, j = 0; i < end && j < dsz; j++)
			if ('>' != buf[i++])
				break;
		if (dsz == j)
			break;
	}

	/* Ignore stubs. */
	if (realend == end || realend == *start)
		return(0);

	/* The content ends before the space of a custom delimiter. */
	content = dsz > 1 ? realend - 1 : realend;
	linkend = content;

	/* Re-scan to see if we have a title or section. */
	for (sep = *start; sep < linkend; sep++)
		if ('|' == buf[sep] || '/' == buf[sep])
			break;

	if (sep < linkend && '|' == buf[sep]) {
		/* With title: set start, then end at section. */
		linkstart = sep + 1;
		for (i = linkstart; i < linkend; i++)
			if ('/' == buf[i])
				break;
		if (i < linkend)
			linkend = i;
	} else if (sep < linkend && '/' == buf[sep]) {
		/* With section: set end at section. */
		linkend = sep;
	}

	*start = realend;
	j = linkend - linkstart;

	/* Do we have only subsection material? */
	if (0 == j && '/' == buf[linkend]) {
		linkstart = linkend + 1;
		linkend = content;
		if (0 == (j = linkend - linkstart))
			return(0);
		printf("Sx %.*s", (int)j, &buf[linkstart]);
		return(1);
	} else if (0 == j)
		return(0);

	/*
	 * See if we qualify as being a link or not.
	 * Only the scheme prefix is compared; "j" is often exactly the
	 * prefix length because the name was cut at the first '/'.
	 */
	if ((j >= 5 && 0 == memcmp("http:", &buf[linkstart], 5)) ||
	    (j >= 6 && 0 == memcmp("https:", &buf[linkstart], 6)) ||
	    (j >= 4 && 0 == memcmp("ftp:", &buf[linkstart], 4)) ||
	    (j >= 5 && 0 == memcmp("sftp:", &buf[linkstart], 5)) ||
	    (j >= 4 && 0 == memcmp("smb:", &buf[linkstart], 4)) ||
	    (j >= 4 && 0 == memcmp("afs:", &buf[linkstart], 4))) {
		/* Gross: print up to the end of content, slashes and all. */
		printf("Lk %.*s", (int)(content - linkstart),
			&buf[linkstart]);
		return(1);
	}

	/* See if we qualify as a mailto. */
	if (j >= 7 && 0 == memcmp("mailto:", &buf[linkstart], 7)) {
		printf("Mt %.*s", (int)j, &buf[linkstart]);
		return(1);
	}

	/* See if we're a foo(5), foo(5x), or foo(5xx) manpage. */
	if ((j > 3 && ')' == buf[linkend - 1]) &&
		('(' == buf[linkend - 3])) {
		printf("Xr %.*s %c", (int)(j - 3),
			&buf[linkstart], buf[linkend - 2]);
		return(1);
	} else if ((j > 4 && ')' == buf[linkend - 1]) &&
		('(' == buf[linkend - 4])) {
		printf("Xr %.*s %.*s", (int)(j - 4),
			&buf[linkstart], 2, &buf[linkend - 3]);
		return(1);
	} else if ((j > 5 && ')' == buf[linkend - 1]) &&
		('(' == buf[linkend - 5])) {
		printf("Xr %.*s %.*s", (int)(j - 5),
			&buf[linkstart], 3, &buf[linkend - 4]);
		return(1);
	}

	/* Last try: do we have a double-colon? */
	for (i = linkstart + 1; i < linkend; i++)
		if (':' == buf[i] && ':' == buf[i - 1])
			break;

	if (i < linkend)
		printf("Xr %.*s " PERL_SECTION,
			(int)j, &buf[linkstart]);
	else
		printf("Xr %.*s 1", (int)j, &buf[linkstart]);

	return(1);
}

/*
 * Doclifting for bold text in the SYNOPSIS that begins with a dash.
 * On entry buf[*start] is the '-'.
 * A dash followed by a flag-like word (letters, digits, '?', '-', '_')
 * that ends at a space or '>' is printed as "Fl word"; if more text
 * follows after spaces, it is either another flag (handled in the same
 * way) or an argument, introduced with "Ar".
 * Anything that does not look like a flag is introduced with "Ar".
 * On return *start is where the caller should continue printing.
 */
static void
dosynopsisfl(const char *buf, size_t *start, size_t end)
{
	size_t	 word, stop;
	int	 next;

	for (;;) {
		assert(*start + 1 < end);
		assert('-' == buf[*start]);

		/* What follows the dash decides flag vs. argument. */
		next = buf[*start + 1];
		if ('?' != next && '-' != next && ! isalnum(next)) {
			(*start)--;
			fputs("Ar ", stdout);
			return;
		}

		/* Find the extent of the flag name. */
		word = ++(*start);
		stop = word;
		while (stop < end &&
		    (isalnum((int)buf[stop]) || '?' == buf[stop] ||
		     '-' == buf[stop] || '_' == buf[stop]))
			stop++;

		assert(stop < end);

		if (' ' != buf[stop] && '>' != buf[stop]) {
			printf("Ar ");
			return;
		}

		printf("Fl ");
		/* Guard names that look like a two-letter macro. */
		if (end - word > 1 &&
		    isupper((int)buf[word]) &&
		    islower((int)buf[word + 1]) &&
		    (2 == end - word || ' ' == buf[word + 2]))
			printf("\\&");
		printf("%.*s ", (int)(stop - word), &buf[word]);
		*start = stop;

		/* Directly closed by '>': nothing more to do. */
		if (' ' != buf[stop])
			return;

		while (stop < end && ' ' == buf[stop])
			stop++;
		assert(stop < end);
		*start = stop;

		/* Another dash starts over; anything else is an Ar. */
		if ('-' != buf[stop]) {
			printf("Ar ");
			return;
		}
	}
}

/*
 * We're at the letter of a format code, which is structured like
 * X<...> and can contain nested format codes: buf[*start] is the
 * letter and buf[*start + 1] is the first '<'.
 * This consumes the whole format code, and any nested format codes, til
 * the end of matched production.
 * If "reentrant", then we're being called after a macro has already
 * been printed to the current line.
 * If "nomacro", then we don't print any macros, just contained data
 * (e.g., following "Sh" or "Nm").
 * "pos" is only significant in SYNOPSIS, and should be 0 when invoked
 * as the first format code on a line (for decoration as an "Nm"),
 * non-zero otherwise.
 * Return whether we've printed a macro or not--in other words, whether
 * this should trigger a subsequent newline (this should be ignored when
 * reentrant).
 * Concretely: 0 is returned for E<>, Z<>, X<>, and for unknown letters;
 * 1 is returned for every other code, even when "nomacro" is set.
 */
static int
formatcode(struct state *st, const char *buf, size_t *start, 
	size_t end, int reentrant, int nomacro, int pos)
{
	enum fmt	 fmt;
	size_t		 i, j, dsz;

	assert(*start + 1 < end);
	assert('<' == buf[*start + 1]);

	/* 
	 * First, look up the format code. 
	 * If it's not valid, treat it as a NOOP: fmt ends up as
	 * FMT__MAX, no macro is printed, but the content still is.
	 */
	for (fmt = 0; fmt < FMT__MAX; fmt++) 
		if (buf[*start] == fmts[fmt])
			break;

	/*
	 * Determine whether we're overriding our delimiter.
	 * According to POD, if we have more than one '<' followed by a
	 * space, then we need a space followed by matching '>' to close
	 * the expression.
	 * Otherwise we use the usual '<' and '>' matched pair.
	 */
	i = *start + 1;
	while (i < end && '<' == buf[i])
		i++;
	assert(i > *start + 1);
	dsz = i - (*start + 1);
	if (dsz > 1 && (i >= end || ' ' != buf[i]))
		dsz = 1;

	/* Remember, if dsz>1, to jump the trailing space. */
	*start += dsz + 1 + (dsz > 1 ? 1 : 0);

	/*
	 * Escapes and ignored codes (NULL and INDEX) don't print macro
	 * sequences, so just output them like normal text before
	 * processing for real macros.
	 */
	if (FMT_ESCAPE == fmt) {
		formatescape(buf, start, end);
		return(0);
	} else if (FMT_NULL == fmt || FMT_INDEX == fmt) {
		/* 
		 * Just consume til the end delimiter, accounting for
		 * whether it's a custom one.
		 */
		for ( ; *start < end; (*start)++) {
			if ('>' != buf[*start])
				continue;
			else if (dsz == 1)
				break;
			assert(*start > 0);
			if (' ' != buf[*start - 1])
				continue;
			i = *start;
			for (j = 0; i < end && j < dsz; j++) 
				if ('>' != buf[i++])
					break;
			if (dsz != j) 
				continue;
			(*start) += dsz;
			break;
		}
		/*
		 * NOTE(review): after a custom delimiter, *start was
		 * already advanced by dsz above, so this assertion
		 * looks at the byte following the delimiter -- confirm
		 * that it cannot fire on valid input.
		 */
		if (*start < end) {
			assert('>' == buf[*start]);
			(*start)++;
		}
		/* Avoid a doubled space where the code was removed. */
		if (isspace(last))
			while (*start < end && isspace((int)buf[*start]))
				(*start)++;
		return(0);
	}

	/*
	 * Check whether we're supposed to print macro stuff (this is
	 * suppressed in, e.g., "Nm" and "Sh" macros).
	 */
	if (FMT__MAX != fmt && !nomacro) {
		/*
		 * Print out the macro describing this format code.
		 * If we're not "reentrant" (not yet on a macro line)
		 * then print a newline, if necessary, and the macro
		 * indicator.
		 * Otherwise, offset us with a space.
		 */
		if ( ! reentrant) {
			if (last != '\n')
				putchar('\n');
			putchar('.');
		} else 
			putchar(' ');
		
		/*
		 * If we don't have whitespace before us (and none after
		 * the opening delimiter), then suppress macro
		 * whitespace with Pf.
		 * NOTE(review): buf[*start] is read without comparing
		 * *start against "end" -- confirm callers guarantee a
		 * readable byte there.
		 */
		if (' ' != last && '\n' != last && ' ' != buf[*start])
			printf("Pf ");

		switch (fmt) {
		case (FMT_ITALIC):
			printf("Em ");
			break;
		case (FMT_BOLD):
			if (SECT_SYNOPSIS == st->sect) { 
				if (1 == dsz && '-' == buf[*start])
					dosynopsisfl(buf, start, end);
				else if (0 == pos)
					printf("Nm ");
				else
					printf("Ar ");
				break;
			} 
			/*
			 * A bold NULL is taken to be a defined constant.
			 * NOTE(review): this looks at up to five bytes
			 * from *start without a bound against "end".
			 */
			if (0 == strncmp(buf + *start, "NULL", 4) &&
			    ('=' == buf[*start + 4] ||
			     '>' == buf[*start + 4]))
				printf("Dv ");
			else
				printf("Sy ");
			break;
		case (FMT_CODE):
			printf("Qo Li ");
			break;
		case (FMT_LINK):
			/* Try to link; use "No" if it's empty. */
			if ( ! trylink(buf, start, end, dsz))
				printf("No ");
			break;
		case (FMT_FILE):
			printf("Pa ");
			break;
		case (FMT_NBSP):
			printf("No ");
			break;
		default:
			abort();
		}
	}

	/*
	 * Process until we reach the end marker (e.g., '>') or until we
	 * find a nested format code.
	 * Don't emit any newlines: since we're on a macro line, we
	 * don't want to break the line.
	 */
	while (*start < end) {
		if ('>' == buf[*start] && 1 == dsz) {
			(*start)++;
			break;
		} else if ('>' == buf[*start] && 
				' ' == buf[*start - 1]) {
			/*
			 * Handle custom delimiters.
			 * These require a certain number of
			 * space-preceded carets before we're really at
			 * the end.
			 */
			i = *start;
			for (j = 0; i < end && j < dsz; j++)
				if ('>' != buf[i++])
					break;
			if (dsz == j) {
				*start += dsz;
				break;
			}
		}
		/* A nested code: recurse, staying on the macro line. */
		if (*start + 1 < end && '<' == buf[*start + 1]) {
			formatcode(st, buf, start, end, 1, nomacro, 1);
			continue;
		}

		/*
		 * Make sure that any macro-like words (or
		 * really any word starting with a capital
		 * letter) is assumed to be a macro that must be
		 * escaped.
		 * This matches "Xx " and "XxEOLN".
		 */
		if ((' ' == last || '\n' == last) && 
				end - *start > 1 &&
				isupper((int)buf[*start]) &&
				islower((int)buf[*start + 1]) &&
				(end - *start == 2 ||
				 ' ' == buf[*start + 2]))
			printf("\\&");

		/* Suppress newline. */
		if ('\n' == buf[*start])
			putchar(last = ' ');
		else
			putchar(last = buf[*start]);

		/* Protect against character escapes. */
		if ('\\' == last)
			putchar('e');

		(*start)++;

		/* Collapse a run of spaces into the one just printed. */
		if (' ' == last)
			while (*start < end && ' ' == buf[*start])
				(*start)++;
	}

	/* Unknown code: content was printed, but no macro. */
	if (FMT__MAX == fmt)
		return(0);

	if ( ! nomacro && FMT_CODE == fmt)
		printf(" Qc ");

	/*
	 * We're now subsequent the format code.
	 * If there isn't a space (or newline) here, and we haven't just
	 * printed a space, then suppress space.
	 * NOTE(review): *start may equal "end" here, so this can read
	 * buf[end] -- confirm that byte is always readable.
	 */
	if ( ! nomacro && ' ' != last)
		if (' ' != buf[*start] && '\n' != buf[*start])
			printf(" Ns ");

	return(1);
}

/*
 * Calls formatcode() til the end of a paragraph.
 */
static void
formatcodeln(struct state *st, const char *buf, 
	size_t *start, size_t end, int nomacro)
{

	last = ' ';
	while (*start < end)  {
		if (*start + 1 < end && '<' == buf[*start + 1]) {
			formatcode(st, buf, start, end, 1, nomacro, 1);
			continue;
		}
		/*
		 * Since we're already on a macro line, we want to make
		 * sure that we don't inadvertently invoke a macro.
		 * We need to do this carefully because section names
		 * are used in troff and we don't want to escape
		 * something that needn't be escaped.
		 */
		if (' ' == last && end - *start > 1 &&
				isupper((int)buf[*start]) &&
				islower((int)buf[*start + 1]) &&
				(end - *start == 2 ||
				 ' ' == buf[*start + 2]))
			printf("\\&");

		if ('\n' == buf[*start])
			putchar(last = ' ');
		else
			putchar(last = buf[*start]);

		/* Protect against character escapes. */
		if ('\\' == last)
			putchar('e');

		(*start)++;
	}
}

/*
 * Guess the list type from the text of its first =item, which is
 * buf[start, end).
 * Following the POD manual: "*" is a bullet list, "1" or "1." is an
 * enumerated list, and anything else is a tagged list.
 */
static enum list
listguess(const char *buf, size_t start, size_t end)
{

	assert(end >= start);

	switch (end - start) {
	case (1):
		if ('*' == buf[start])
			return(LIST_BULLET);
		if ('1' == buf[start])
			return(LIST_ENUM);
		break;
	case (2):
		if ('1' == buf[start] && '.' == buf[start + 1])
			return(LIST_ENUM);
		break;
	default:
		break;
	}

	return(LIST_TAG);
}

/*
 * A command paragraph, as noted in the perlpod manual, just indicates
 * that we should do something, optionally with some text to print as
 * well.
 * buf[start] is the '=' and the paragraph extends to "end".
 * Unknown commands are ignored.
 * While paused (inside =begin), every command is swallowed and only
 * =end lifts the pause.
 * Any recognised command other than =cut switches parsing on.
 */
static void
command(struct state *st, const char *buf, size_t start, size_t end)
{
	size_t		 len, csz;
	enum cmd	 cmd;

	assert('=' == buf[start]);
	start++;
	len = end - start;

	/* The command name is matched as a prefix of the text. */
	for (cmd = 0; cmd < CMD__MAX; cmd++) {
		csz = strlen(cmds[cmd]);
		if (len < csz)
			continue;
		if (0 == memcmp(&buf[start], cmds[cmd], csz))
			break;
	}

	/* Ignore bogus commands. */

	if (CMD__MAX == cmd)
		return;

	/* Skip the name and the blanks separating it from the text. */
	start += csz;
	while (start < end && ' ' == buf[start])
		start++;

	len = end - start;

	if (st->paused) {
		st->paused = CMD_END != cmd;
		return;
	}

	switch (cmd) {
	case (CMD_POD):
		break;
	case (CMD_HEAD1):
		/*
		 * The behaviour of head= follows from a quick glance at
		 * how pod2man handles it.
		 */
		printf(".Sh ");
		st->sect = SECT_NONE;
		if (end - start == 4) {
			if (0 == memcmp(&buf[start], "NAME", 4))
				st->sect = SECT_NAME;
		} else if (end - start == 8) {
			if (0 == memcmp(&buf[start], "SYNOPSIS", 8))
				st->sect = SECT_SYNOPSIS;
		}
		formatcodeln(st, buf, &start, end, 1);
		putchar('\n');
		st->haspar = 1;
		break;
	case (CMD_HEAD2):
		printf(".Ss ");
		formatcodeln(st, buf, &start, end, 1);
		putchar('\n');
		st->haspar = 1;
		break;
	case (CMD_HEAD3):
		puts(".Pp");
		printf(".Em ");
		formatcodeln(st, buf, &start, end, 0);
		putchar('\n');
		puts(".Pp");
		st->haspar = 1;
		break;
	case (CMD_HEAD4):
		puts(".Pp");
		printf(".No ");
		formatcodeln(st, buf, &start, end, 0);
		putchar('\n');
		puts(".Pp");
		st->haspar = 1;
		break;
	case (CMD_OVER):
		/*
		 * If we have an existing list that hasn't had an =item
		 * yet, then make sure that we open it now.
		 * We use the default list type, but that can't be
		 * helped (we haven't seen any items yet).
		 */
		if (st->lpos > 0)
			if (LIST__MAX == st->lstack[st->lpos - 1]) {
				st->lstack[st->lpos - 1] = LIST_TAG;
				puts(".Bl -tag -width Ds");
			}
		st->lpos++;
		assert(st->lpos < LIST_STACKSZ);
		/* Type unknown until the first =item shows up. */
		st->lstack[st->lpos - 1] = LIST__MAX;
		break;
	case (CMD_ITEM):
		if (0 == st->lpos) {
			/*
			 * Bad markup.
			 * Try to compensate.
			 */
			st->lstack[st->lpos] = LIST__MAX;
			st->lpos++;
		}
		assert(st->lpos > 0);
		/*
		 * If we're the first =item, guess at what our content
		 * will be: "*" is a bullet list, "1." is a numbered
		 * list, and everything is tagged.
		 */
		if (LIST__MAX == st->lstack[st->lpos - 1]) {
			st->lstack[st->lpos - 1] =
				listguess(buf, start, end);
			switch (st->lstack[st->lpos - 1]) {
			case (LIST_BULLET):
				puts(".Bl -bullet");
				break;
			case (LIST_ENUM):
				puts(".Bl -enum");
				break;
			default:
				puts(".Bl -tag -width Ds");
				break;
			}
		}
		switch (st->lstack[st->lpos - 1]) {
		case (LIST_TAG):
			printf(".It ");
			formatcodeln(st, buf, &start, end, 0);
			putchar('\n');
			break;
		case (LIST_ENUM):
			/* FALLTHROUGH */
		case (LIST_BULLET):
			/*
			 * Abandon the remainder of the paragraph
			 * because we're going to be a bulletted or
			 * numbered list.
			 */
			puts(".It");
			break;
		default:
			abort();
		}
		st->haspar = 1;
		break;
	case (CMD_BACK):
		/* Make sure we don't back over the stack. */
		if (st->lpos > 0) {
			st->lpos--;
			/*
			 * Only close what was opened: an =over block
			 * that never saw an =item (and never got a
			 * nested =over) has no "Bl" to match.
			 */
			if (LIST__MAX != st->lstack[st->lpos])
				puts(".El");
		}
		break;
	case (CMD_BEGIN):
		/*
		 * We disregard all types for now.
		 * TODO: process at least "text" in a -literal block.
		 */
		st->paused = 1;
		break;
	case (CMD_FOR):
		/*
		 * We ignore all types of encodings and formats
		 * unilaterally.
		 */
		break;
	case (CMD_ENCODING):
		break;
	case (CMD_CUT):
		st->parsing = 0;
		return;
	default:
		abort();
	}

	/* Any command (but =cut) makes us start parsing. */
	st->parsing = 1;
}

/*
 * A verbatim paragraph: pump it out in a literal display.
 * Nothing is printed unless we are parsing and not paused.
 * In the SYNOPSIS, leading "#include <file>" lines are turned into
 * "In" macros first; whatever remains goes into the display.
 */
static void
verbatim(struct state *st, const char *buf, size_t start, size_t end)
{
	size_t		 pos;
	int		 prev;

	if (st->paused || ! st->parsing)
		return;

	/*
	 * Peel off any number of inclusions, but only while they come
	 * before the remaining parts of the paragraph.
	 */
	while (SECT_SYNOPSIS == st->sect) {
		pos = start;
		while (pos < end && ' ' == buf[pos])
			pos++;
		/* Nothing but blanks. */
		if (pos == end)
			return;
		if (end - pos <= 10 ||
		    0 != memcmp(&buf[pos], "#include <", 10))
			break;

		/* We're an include block! */
		start = pos + 10;
		while (start < end && ' ' == buf[start])
			start++;
		fputs(".In ", stdout);
		/* Copy up to the '>' marker or the end of the line. */
		for ( ; start < end; start++) {
			if ('>' == buf[start] || '\n' == buf[start])
				break;
			putchar(buf[start]);
		}
		putchar('\n');
		if (start < end && '>' == buf[start])
			start++;
		if (start < end && '\n' == buf[start])
			start++;
		if (start >= end)
			return;
	}

	if (start == end)
		return;

	puts(".Bd -literal");
	prev = ' ';
	while (start < end) {
		/*
		 * A control character at the start of a line would be
		 * read as a macro or request, so defuse it.
		 */
		if ('\n' == prev &&
		    ('.' == buf[start] || '\'' == buf[start]))
			printf("\\&");
		prev = buf[start];
		putchar(prev);
		/* A backslash must be written as "\e". */
		if ('\\' == prev)
			printf("e");
		start++;
	}
	putchar('\n');
	puts(".Ed");
}

/*
 * Helper for dosynopsisop(): scan buf[start, end) for the ']' that
 * closes a bracket opened just before "start".
 * Bracket pairs that open and close inside the range are skipped.
 * Returns 1 if such a closing bracket exists, 0 otherwise.
 */
static int
hasmatch(const char *buf, size_t start, size_t end)
{
	const char	*cp, *stop;
	size_t		 depth;

	depth = 0;
	stop = buf + end;
	for (cp = buf + start; cp < stop; cp++) {
		switch (*cp) {
		case ('['):
			depth++;
			break;
		case (']'):
			/* Not inside a nested pair: this one is ours. */
			if (0 == depth)
				return(1);
			depth--;
			break;
		default:
			break;
		}
	}

	return(0);
}

/*
 * If we're in the SYNOPSIS section and we've encountered brackets in an
 * ordinary paragraph, then try to see whether we're an [-option].
 * buf[*start] is the bracket in question.
 * An opening bracket is accepted if hasmatch() finds its partner; it
 * prints "Oo" and increments *opstack.
 * A closing bracket is accepted if *opstack is non-zero; it prints "Oc"
 * and decrements *opstack.
 * On acceptance, *start is moved past the bracket and any spaces that
 * follow (never beyond "end"), *last is set to '\n', and 1 is returned.
 * Otherwise nothing is changed and 0 is returned, so that the caller
 * prints the bracket as ordinary text.
 */
static int
dosynopsisop(const char *buf, int *last,
	size_t *start, size_t end, size_t *opstack)
{

	assert('[' == buf[*start] || ']' == buf[*start]);

	if ('[' == buf[*start]) {
		if ( ! hasmatch(buf, *start + 1, end))
			return(0);
		if ('\n' != *last)
			putchar('\n');
		puts(".Oo");
		(*opstack)++;
	} else {
		if (0 == *opstack)
			return(0);
		if ('\n' != *last)
			putchar('\n');
		puts(".Oc");
		(*opstack)--;
	}

	(*start)++;
	*last = '\n';
	/* Stay inside the paragraph while skipping blanks. */
	while (*start < end && ' ' == buf[*start])
		(*start)++;
	return(1);
}

/*
 * Format the manpage names of the NAME section, buf[*start, end), as
 * one "Nm" macro line per comma-separated name.
 * If there is nothing but blanks, print ".Nm unknown" instead.
 */
static void
donamenm(struct state *st, const char *buf, size_t *start, size_t end)
{
	size_t	 comma;

	/* Leading blanks are not part of the first name. */
	for ( ; *start < end; (*start)++)
		if (' ' != buf[*start])
			break;

	if (*start == end) {
		puts(".Nm unknown");
		return;
	}

	while (*start < end) {
		fputs(".Nm ", stdout);

		/* The current name runs up to the next comma, if any. */
		comma = *start;
		while (comma < end && ',' != buf[comma])
			comma++;

		formatcodeln(st, buf, start, comma, 1);

		/* That was the last name. */
		if (*start == end) {
			putchar('\n');
			break;
		}

		assert(',' == buf[*start]);
		puts(" ,");
		(*start)++;
		while (*start < end && ' ' == buf[*start])
			(*start)++;
	}
}

/*
 * Ordinary paragraph.
 * Well, this is really the hardest--POD seems to assume that, for
 * example, a leading space implies a newline, and so on.
 * Lots of other snakes in the grass: escaping a newline followed by a
 * period (accidental mdoc(7) control), double-newlines after macro
 * passages, etc.
 * Nothing is printed unless we are parsing and not paused.
 * In the NAME section, a paragraph of the form "name - description" is
 * printed as "Nm" and "Nd" lines instead of as running text.
 */
static void
ordinary(struct state *st, const char *buf, size_t start, size_t end)
{
	size_t		i, j, opstack;
	int		seq;

	if ( ! st->parsing || st->paused)
		return;

	/*
	 * Special-case: the NAME section.
	 * If we find a "-" when searching from the end, assume that
	 * we're in "name - description" format.
	 * To wit, print out a "Nm" and "Nd" in that format.
	 * NOTE(review): when the search runs down to i == start, the
	 * test below looks at buf[start] alone, without requiring the
	 * following space -- confirm that is intended.
	 */
	if (SECT_NAME == st->sect) {
		for (i = end - 2; i > start; i--)
			if ('-' == buf[i] && ' ' == buf[i + 1])
				break;
		if ('-' == buf[i]) {
			/* j remembers the last dash, i finds the first. */
			j = i;
			/* Roll over multiple "-". */
			for ( ; i > start; i--)
				if ('-' != buf[i])
					break;
			donamenm(st, buf, &start, i + 1);
			start = j + 1;
			while (start < end && ' ' == buf[start])
				start++;
			fputs(".Nd ", stdout);
			formatcodeln(st, buf, &start, end, 1);
			putchar('\n');
			return;
		}
	}

	/* A paragraph break, unless a macro just provided one. */
	if ( ! st->haspar)
		puts(".Pp");

	st->haspar = 0;
	last = '\n';
	opstack = 0;

	/*
	 * "seq" counts the iterations and is handed to formatcode() as
	 * its "pos" argument, so it is zero only on the first pass.
	 */
	for (seq = 0; start < end; seq++) {
		/* 
		 * Loop til we get either to a newline or escape. 
		 * Escape initial control characters.
		 */
		while (start < end) {
			if (start < end - 1 && '<' == buf[start + 1])
				break;
			else if ('\n' == buf[start])
				break;
			else if ('\n' == last && '.' == buf[start])
				printf("\\&");
			else if ('\n' == last && '\'' == buf[start])
				printf("\\&");
			/*
			 * If we're in the SYNOPSIS, have square
			 * brackets indicate that we're opening and
			 * closing an optional context.
			 */
			if (SECT_SYNOPSIS == st->sect &&
				('[' == buf[start] || 
				 ']' == buf[start]) &&
				dosynopsisop(buf, &last, 
					&start, end, &opstack))
				continue;
			putchar(last = buf[start++]);
			/* A backslash must be written as "\e". */
			if ('\\' == last)
				putchar('e');
		}

		if (start < end - 1 && '<' == buf[start + 1]) {
			/* Non-zero means a macro line was begun. */
			if (formatcode(st, buf, &start, end, 0, 0, seq)) {
				/*
				 * Let mdoc(7) handle trailing punctuation.
				 * XXX Some punctuation characters
				 *     are not handled yet.
				 */
				if ((start == end - 1 || 
					(start < end - 1 && 
					 (' ' == buf[start + 1] ||
					  '\n' == buf[start + 1]))) &&
					('.' == buf[start] ||
					 ',' == buf[start])) {
					putchar(' ');
					putchar(buf[start++]);
				}
				/* End the macro line. */
				putchar(last = '\n');
				/*
				 * Consume all whitespace
				 * so we don't accidentally start
				 * an implicit literal line.
				 */
				while (start < end && ' ' == buf[start])
					start++;
			}
		} else if (start < end && '\n' == buf[start]) {
			/*
			 * Print the newline only if we haven't already
			 * printed a newline.
			 */
			if (last != '\n')
				putchar(last = buf[start]);
			if (++start >= end)
				continue;
			/*
			 * If we have whitespace next, eat it to prevent
			 * mdoc(7) from thinking that it's meant for
			 * verbatim text.
			 * It is--but if we start with that, we can't
			 * have a macro subsequent it, which may be
			 * possible if we have an escape next.
			 */
			if (' ' == buf[start] || '\t' == buf[start]) {
				puts(".br");
				last = '\n';
			}
			for ( ; start < end; start++)
				if (' ' != buf[start] && '\t' != buf[start])
					break;
		} 
	}

	/* Make sure the paragraph ends with a newline. */
	if (last != '\n')
		putchar('\n');
}

/*
 * Dispatch one paragraph, buf[start, end), by its first character:
 * a blank or tab makes it verbatim, "=" makes it a command, and
 * anything else makes it an ordinary paragraph.
 * Empty paragraphs are dropped.
 */
static void
dopar(struct state *st, const char *buf, size_t start, size_t end)
{

	if (start == end)
		return;

	switch (buf[start]) {
	case (' '):
		/* FALLTHROUGH */
	case ('\t'):
		verbatim(st, buf, start, end);
		break;
	case ('='):
		command(st, buf, start, end);
		break;
	default:
		ordinary(st, buf, start, end);
		break;
	}
}

/*
 * Print the mdoc(7) preamble, then loop around paragraphs within a
 * document, processing each one in the POD way.
 * "buf" holds "sz" bytes of input read from "fname"; an empty input
 * produces no output at all.
 * Title and section come from "args" if given there and are derived
 * from "fname" otherwise; the date comes from "args" or else from "tm".
 */
static void
dofile(const struct args *args, const char *fname, 
	const struct tm *tm, const char *buf, size_t sz)
{
	char		 datebuf[64];
	struct state	 st;
	const char	*fbase, *fext, *section, *date;
	char		*title, *cp;
	size_t		 sup, end, i, cur = 0;

	if (0 == sz)
		return;

	/*
	 * Parsing the filename is almost always required,
	 * except when both the title and the section
	 * are provided on the command line.
	 */

	fbase = fname;
	fext = NULL;
	if (NULL == args->title || NULL == args->section) {
		if (NULL != (fbase = strrchr(fname, '/')))
			fbase++;
		else
			fbase = fname;
		fext = strrchr(fbase, '.');
	}

	/*
	 * The title will be converted to uppercase,
	 * so it needs to be copied.
	 */

	title = (NULL != args->title) ? strdup(args->title) :
		(NULL != fext) ? strndup(fbase, fext - fbase) :
		strdup(fbase);

	if (NULL == title) {
		perror(NULL);
		exit(EXIT_FAILURE);
	}

	/* Section is 1 unless suffix is "pm". */

	section = (NULL != args->section) ? args->section :
	    (NULL == fext || strcmp(fext + 1, "pm")) ? "1" :
	    PERL_SECTION;

	/* Date.  Or the given "tm" if not supplied. */

	if (NULL == (date = args->date)) {
		strftime(datebuf, sizeof(datebuf), "%B %d, %Y", tm);
		date = datebuf;
	}

	/*
	 * toupper() is only defined for EOF and values that fit in an
	 * unsigned char, so bytes must not be passed as a plain
	 * (possibly negative) char.
	 */
	for (cp = title; '\0' != *cp; cp++)
		*cp = (char)toupper((unsigned char)*cp);

	/* The usual mdoc(7) preamble. */

	printf(".Dd %s\n", date);
	printf(".Dt %s %s\n", title, section);
	puts(".Os");

	free(title);

	memset(&st, 0, sizeof(struct state));
	assert(sz > 0);

	/* Main loop over file contents. */

	while (cur < sz) {
		/* Read until next paragraph. */
		for (i = cur + 1; i < sz; i++)
			if ('\n' == buf[i] && '\n' == buf[i - 1]) {
				/* Consume blank paragraphs. */
				while (i + 1 < sz && '\n' == buf[i + 1])
					i++;
				break;
			}

		/* Adjust end marker for EOF. */
		end = i < sz ? i - 1 :
			('\n' == buf[sz - 1] ? sz - 1 : sz);
		/* The next paragraph starts after the blank line. */
		sup = i < sz ? end + 2 : sz;

		/* Process paragraph and adjust start. */
		dopar(&st, buf, cur, end);
		cur = sup;
	}
}

/*
 * Read a single file fully into memory.
 * If the file is "-", do it from stdin.
 * If successfully read, send the input buffer to dofile() for further
 * processing, along with the file's modification time (or the current
 * time for stdin or when fstat() fails).
 * Returns 1 on success and 0 if the file could not be opened or read;
 * running out of memory terminates the program.
 * A file opened here is closed again on every return path.
 */
static int
readfile(const struct args *args, const char *fname)
{
	int		 fd;
	char		*buf;
	size_t		 bufsz, cur;
	ssize_t		 ssz;
	struct tm	*tm;
	time_t		 ttm;
	struct stat 	 st;

	fd = 0 != strcmp("-", fname) ? 
		open(fname, O_RDONLY, 0) : STDIN_FILENO;

	if (-1 == fd) {
		perror(fname);
		return(0);
	}

	if (STDIN_FILENO == fd || -1 == fstat(fd, &st)) {
		ttm = time(NULL);
		tm = localtime(&ttm);
	} else
		tm = localtime(&st.st_mtime);

	/* 
	 * Arbitrarily-sized initial buffer.
	 * Should be big enough for most files...
	 */
	cur = 0;
	bufsz = 1 << 14;
	if (NULL == (buf = malloc(bufsz))) {
		perror(NULL);
		exit(EXIT_FAILURE);
	}

	while ((ssz = read(fd, buf + cur, bufsz - cur)) > 0) {
		/* Double buffer size on fill. */
		if ((size_t)ssz == bufsz - cur)  {
			bufsz *= 2;
			if (NULL == (buf = realloc(buf, bufsz))) {
				perror(NULL);
				exit(EXIT_FAILURE);
			}
		}
		cur += (size_t)ssz;
	}
	if (ssz < 0) {
		/* Report first: close() could clobber errno. */
		perror(fname);
		free(buf);
		if (STDIN_FILENO != fd)
			close(fd);
		return(0);
	}

	dofile(args, STDIN_FILENO == fd ? 
		"STDIN" : fname, tm, buf, cur);
	free(buf);
	if (STDIN_FILENO != fd)
		close(fd);
	return(1);
}

/*
 * Parse the command line and convert at most one input file, reading
 * standard input if none is named.
 * The options -d, -n, and -s override the date, title, and section;
 * -c, -h, -l, -o, -q, -r, -u, and -v are accepted and ignored.
 * Exits with EXIT_SUCCESS if the file was converted, EXIT_FAILURE on a
 * usage error or if readfile() reports failure.
 */
int
main(int argc, char *argv[])
{
	const char	*fname, *name;
	struct args	 args;
	int		 c;

	/* Program name for the usage message, sans directory. */
	if (NULL != (name = strrchr(argv[0], '/')))
		name++;
	else
		name = argv[0];

	memset(&args, 0, sizeof(struct args));

	while (-1 != (c = getopt(argc, argv, "c:d:hln:oq:rs:uv"))) {
		switch (c) {
		case ('d'):
			args.date = optarg;
			break;
		case ('n'):
			args.title = optarg;
			break;
		case ('s'):
			args.section = optarg;
			break;
		case ('c'):
		case ('h'):
		case ('l'):
		case ('o'):
		case ('q'):
		case ('r'):
		case ('u'):
		case ('v'):
			/* Ignore these. */
			break;
		default:
			goto usage;
		}
	}

	argc -= optind;
	argv += optind;

	/* Accept only a single input file. */

	if (argc > 1)
		goto usage;

	fname = (1 == argc) ? argv[0] : "-";

	if (readfile(&args, fname))
		return(EXIT_SUCCESS);
	return(EXIT_FAILURE);

usage:
	fprintf(stderr, "usage: %s [-d date] " 
	    "[-n title] [-s section] [file]\n", name);

	return(EXIT_FAILURE);
}