/*	$Id: pod2mdoc.c,v 1.2 2014/03/20 15:15:32 schwarze Exp $ */
/*
 * Copyright (c) 2014 Kristaps Dzonsons <kristaps@bsd.lv>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */
#include <sys/stat.h>
#include <sys/time.h>

#include <assert.h>
#include <ctype.h>
#include <fcntl.h>
#include <getopt.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <unistd.h>

/*
 * Command-line overrides for the mdoc(7) preamble.
 * A NULL member means "not given on the command line".
 */
struct	args {
	const char	*title; /* override "Dt" title */
	const char	*date; /* override "Dd" date */
	const char	*section; /* override "Dt" section */
};

/*
 * Parser state carried from one paragraph to the next.
 * Zero-initialised before the first paragraph is processed.
 */
struct	state {
	int		 parsing; /* after =cut or before command */
	int		 paused; /* in =begin and before =end */
	int		 haspar; /* in paragraph: do we need Pp? */
	int		 isname; /* are we the NAME section? */
	/* NOTE(review): fname is never assigned in the visible code. */
	const char	*fname; /* file being parsed */
};

enum	fmt {
	FMT_ITALIC,
	FMT_BOLD,
	FMT_CODE,
	FMT_LINK,
	FMT_ESCAPE,
	FMT_FILE,
	FMT_NBSP,
	FMT_INDEX,
	FMT_NULL,
	FMT__MAX
};

enum	cmd {
	CMD_POD = 0,
	CMD_HEAD1,
	CMD_HEAD2,
	CMD_HEAD3,
	CMD_HEAD4,
	CMD_OVER,
	CMD_ITEM,
	CMD_BACK,
	CMD_BEGIN,
	CMD_END,
	CMD_FOR,
	CMD_ENCODING,
	CMD_CUT,
	CMD__MAX
};

/*
 * Command names (without the leading "="), indexed by enum cmd.
 */
static	const char *const cmds[CMD__MAX] = {
	"pod", 		/* CMD_POD */
	"head1",	/* CMD_HEAD1 */
	"head2",	/* CMD_HEAD2 */
	"head3",	/* CMD_HEAD3 */
	"head4",	/* CMD_HEAD4 */
	"over",		/* CMD_OVER */
	"item",		/* CMD_ITEM */
	"back",		/* CMD_BACK */
	"begin",	/* CMD_BEGIN */
	"end",		/* CMD_END */
	"for",		/* CMD_FOR */
	"encoding",	/* CMD_ENCODING */
	"cut"		/* CMD_CUT */
};

/*
 * The letter introducing each format code (the "X" in "X<...>"),
 * indexed by enum fmt.
 */
static	const char fmts[FMT__MAX] = {
	'I',		/* FMT_ITALIC */
	'B',		/* FMT_BOLD */
	'C',		/* FMT_CODE */
	'L',		/* FMT_LINK */
	'E',		/* FMT_ESCAPE */
	'F',		/* FMT_FILE */
	'S',		/* FMT_NBSP */
	'X',		/* FMT_INDEX */
	'Z'		/* FMT_NULL */
};

/*
 * Given buf[*start] is at the start of an escape name, read til the end
 * of the escape ('>') then try to do something with it.
 * Sets start to be one after the '>', or to "end" if the escape is
 * never closed before "end".
 * Names that are too long for our buffer are consumed (including the
 * closing '>') and otherwise ignored, as are unknown names.
 */
static void
formatescape(const char *buf, size_t *start, size_t end)
{
	char		 esc[16]; /* no more needed */
	size_t		 i, max;
	int		 toolong;

	max = sizeof(esc) - 1;
	i = 0;
	/* Read til our buffer is full. */
	while (*start < end && '>' != buf[*start] && i < max)
		esc[i++] = buf[(*start)++];
	esc[i] = '\0';

	/*
	 * If we stopped because the buffer filled up, skip whatever is
	 * left of the name.
	 * A name that exactly fills the buffer is not too long: we are
	 * already sitting on the '>'.
	 */
	toolong = 0;
	while (*start < end && '>' != buf[*start]) {
		toolong = 1;
		(*start)++;
	}

	/* Unterminated escape: nothing more to consume. */
	if (*start >= end)
		return;

	/* Always step over the '>' so it is not printed as text. */
	assert('>' == buf[*start]);
	(*start)++;

	if (toolong)
		return;

	/*
	 * TODO: right now, we only recognise the named escapes.
	 * Just let the rest of them go. 
	 */
	if (0 == strcmp(esc, "lt"))
		printf("\\(la");
	else if (0 == strcmp(esc, "gt"))
		printf("\\(ra");
	else if (0 == strcmp(esc, "vb"))
		printf("\\(ba");
	else if (0 == strcmp(esc, "sol"))
		printf("\\(sl");
}

/*
 * Advance *start past a run of ' ' characters, never going beyond
 * "end".
 * Only the space character itself is skipped: tabs and newlines stop
 * the scan.
 */
static void
skipspace(const char *buf, size_t *start, size_t end)
{
	size_t		 pos;

	for (pos = *start; pos < end; pos++)
		if (' ' != buf[pos])
			break;

	*start = pos;
}

/*
 * Is "c" one of the closing punctuation characters that we keep on the
 * macro line, smooshed against the preceding word?
 */
static int
isclosepunct(int c)
{

	return(',' == c || '.' == c || '!' == c ||
		'?' == c || ')' == c);
}

/*
 * We're at the character in front of a format code, which is structured
 * like X<...> and can contain nested format codes.
 * This consumes the whole format code, and any nested format codes, til
 * the end of matched production.
 * If "reentrant", then we're being called after a macro has already
 * been printed to the current line.
 * "last" is set to the last read character: this is used to determine
 * whether we should buffer with space or not.
 * If "nomacro", then we don't print any macros, just contained data.
 * The caller must guarantee that buf[*start + 1] is '<' and lies before
 * "end".
 * Returns zero if we did not enter a format code body (unknown code
 * letter, escape, index or null code) and non-zero otherwise.
 */
static int
formatcode(const char *buf, size_t *start, 
	size_t end, int reentrant, int last, int nomacro)
{
	enum fmt	 fmt;

	assert(*start + 1 < end);
	assert('<' == buf[*start + 1]);

	for (fmt = 0; fmt < FMT__MAX; fmt++) 
		if (buf[*start] == fmts[fmt])
			break;

	/* Invalid macros are just regular text. */

	if (FMT__MAX == fmt) {
		putchar(buf[*start]);
		(*start)++;
		return(0);
	}

	/* Step over the code letter and the opening '<'. */
	*start += 2;

	/*
	 * Escapes don't print macro sequences, so just output them like
	 * normal text before processing for macros.
	 */
	if (FMT_ESCAPE == fmt) {
		formatescape(buf, start, end);
		return(0);
	} else if (FMT_NULL == fmt || FMT_INDEX == fmt) {
		/* For indices and nulls, just consume. */
		while (*start < end && '>' != buf[*start])
			(*start)++;
		if (*start < end)
			(*start)++;
		return(0);
	}

	if ( ! nomacro) {
		/*
		 * Print out the macro describing this format code.
		 * If we're not "reentrant" (not yet on a macro line)
		 * then print a newline, if necessary, and the macro
		 * indicator.
		 * Otherwise, offset us with a space.
		 */
		if ( ! reentrant && last != '\n')
			putchar('\n');
		if ( ! reentrant)
			putchar('.');
		else
			putchar(' ');
		
		/*
		 * If we don't have whitespace before us, then suppress
		 * macro whitespace with Ns.
		 */
		if (' ' != last)
			printf("Ns ");
		switch (fmt) {
		case (FMT_ITALIC):
			printf("Em ");
			break;
		case (FMT_BOLD):
			printf("Sy ");
			break;
		case (FMT_CODE):
			printf("Qo Li ");
			break;
		case (FMT_LINK):
			printf("Lk ");
			break;
		case (FMT_FILE):
			printf("Pa ");
			break;
		case (FMT_NBSP):
			/* TODO. */
			printf("No ");
			break;
		default:
			abort();
		}
	}

	/*
	 * Read until we reach the end marker ('>') or until we find a
	 * nested format code.
	 * Don't emit any newlines: since we're on a macro line, we
	 * don't want to break the line.
	 */
	while (*start < end) {
		if ('>' == buf[*start]) {
			(*start)++;
			break;
		}
		if (*start + 1 < end && '<' == buf[*start + 1]) {
			formatcode(buf, start, end, 1, last, nomacro);
			continue;
		}
		if ('\n' != buf[*start]) {
			/*
			 * Make sure that any macro-like words (or
			 * really any word starting with a capital
			 * letter) is assumed to be a macro that must be
			 * escaped.
			 * The cast keeps bytes with the high bit set
			 * from being passed to isupper() as negative
			 * values.
			 * XXX: should this be isalpha()?
			 */
			if ((' ' == last || '\n' == last) && 
				isupper((unsigned char)buf[*start]))
				printf("\\&");
			putchar(last = buf[*start]);
		}
		(*start)++;
	}

	if ( ! nomacro && FMT_CODE == fmt)
		printf(" Qc ");

	if (reentrant)
		return(1);

	/*
	 * If we're not reentrant, we want to put ending punctuation on
	 * the macro line so that it's properly handled by being
	 * smooshed against the terminal word.
	 * Check the bound first: after skipping spaces we may already
	 * be at "end", where there is no character to look at.
	 */
	skipspace(buf, start, end);
	if (*start >= end || ! isclosepunct(buf[*start]))
		return(1);
	while (*start < end && isclosepunct(buf[*start])) {
		putchar(' ');
		putchar(buf[*start]);
		(*start)++;
	}
	skipspace(buf, start, end);
	return(1);
}

/*
 * Print the rest of a paragraph on the current (macro) line.
 * Format codes are handed to formatcode() in reentrant mode; every
 * other character is copied to the output, except that newlines are
 * dropped so that the output stays on one line.
 * "nomacro" is passed straight through to formatcode().
 */
static void
formatcodeln(const char *buf, size_t *start, size_t end, int nomacro)
{
	int		 prev;
	char		 c;

	for (prev = '\n'; *start < end; ) {
		/* Anything followed by '<' is treated as a format code. */
		if (*start + 1 < end && '<' == buf[*start + 1]) {
			formatcode(buf, start, end, 1, prev, nomacro);
			continue;
		}
		c = buf[(*start)++];
		if ('\n' == c)
			continue;
		putchar(prev = c);
	}
}

/*
 * A command paragraph, as noted in the perlpod manual, just indicates
 * that we should do something, optionally with some text to print as
 * well.
 * buf[start] must be the '=' introducing the command.
 * The command name must be followed by whitespace or by the end of the
 * paragraph: "=cutoff" is not "=cut".
 * Unknown commands are ignored and leave the state untouched.
 * While paused (inside =begin), every known command is swallowed and
 * only =end resumes processing.
 */
static void
command(struct state *st, const char *buf, size_t start, size_t end)
{
	size_t		 len, csz;
	enum cmd	 cmd;

	assert('=' == buf[start]);
	start++;
	len = end - start;

	for (cmd = 0; cmd < CMD__MAX; cmd++) {
		csz = strlen(cmds[cmd]);
		if (len < csz)
			continue;
		if (0 != memcmp(&buf[start], cmds[cmd], csz))
			continue;
		/* Only accept a whole word, not a mere prefix. */
		if (len == csz || 
			isspace((unsigned char)buf[start + csz]))
			break;
	}

	/* Ignore bogus commands. */

	if (CMD__MAX == cmd)
		return;

	/* Position "start" at the command's text, if any. */
	start += csz;
	skipspace(buf, &start, end);

	if (st->paused) {
		st->paused = CMD_END != cmd;
		return;
	}

	switch (cmd) {
	case (CMD_POD):
		break;
	case (CMD_HEAD1):
		/*
		 * The behaviour of head= follows from a quick glance at
		 * how pod2man handles it.
		 */
		printf(".Sh ");
		st->isname = 0;
		if (end - start == 4)
			if (0 == memcmp(&buf[start], "NAME", 4))
				st->isname = 1;
		formatcodeln(buf, &start, end, 1);
		putchar('\n');
		st->haspar = 1;
		break;
	case (CMD_HEAD2):
		printf(".Ss ");
		formatcodeln(buf, &start, end, 1);
		putchar('\n');
		st->haspar = 1;
		break;
	case (CMD_HEAD3):
		puts(".Pp");
		printf(".Em ");
		formatcodeln(buf, &start, end, 0);
		putchar('\n');
		puts(".Pp");
		st->haspar = 1;
		break;
	case (CMD_HEAD4):
		puts(".Pp");
		printf(".No ");
		formatcodeln(buf, &start, end, 0);
		putchar('\n');
		puts(".Pp");
		st->haspar = 1;
		break;
	case (CMD_OVER):
		/*
		 * TODO: we should be doing this after we process the
		 * first =item to see whether we'll do an -enum,
		 * -bullet, or something else.
		 */
		puts(".Bl -tag -width Ds");
		break;
	case (CMD_ITEM):
		printf(".It ");
		formatcodeln(buf, &start, end, 0);
		putchar('\n');
		st->haspar = 1;
		break;
	case (CMD_BACK):
		puts(".El");
		break;
	case (CMD_BEGIN):
		/* 
		 * We disregard all types for now.
		 * TODO: process at least "text" in a -literal block.
		 */
		st->paused = 1;
		break;
	case (CMD_FOR):
		/* 
		 * We ignore all types of encodings and formats
		 * unilaterally.
		 */
		break;
	case (CMD_ENCODING):
		break;
	case (CMD_CUT):
		st->parsing = 0;
		return;
	default:
		abort();
	}

	/* Any command (but =cut) makes us start parsing. */
	st->parsing = 1;
}

/*
 * Verbatim paragraph: copy the bytes in [start, end) unchanged into a
 * literal display.
 * Nothing is printed unless we are parsing and not paused.
 */
static void
verbatim(struct state *st, const char *buf, size_t start, size_t end)
{
	int		 len;

	if (st->parsing && ! st->paused) {
		len = (int)(end - start);
		puts(".Bd -literal");
		printf("%.*s\n", len, buf + start);
		puts(".Ed");
	}
}

/*
 * Ordinary paragraph.
 * Well, this is really the hardest--POD seems to assume that, for
 * example, a leading space implies a newline, and so on.
 * Lots of other snakes in the grass: escaping a newline followed by a
 * period (accidental mdoc(7) control), double-newlines after macro
 * passages, etc.
 * The paragraph is the bytes in [start, end) of "buf"; the caller
 * guarantees it is not empty.
 * Nothing is printed unless we are parsing and not paused.
 * Throughout, "last" is the last character written to the output, with
 * '\n' meaning that we are at the beginning of an output line.
 */
static void
ordinary(struct state *st, const char *buf, size_t start, size_t end)
{
	int		last;
	size_t		i, j;

	if ( ! st->parsing || st->paused)
		return;

	/*
	 * Special-case: the NAME section.
	 * If we find a "-" when searching from the end, assume that
	 * we're in "name - description" format.
	 * To wit, print out a "Nm" and "Nd" in that format.
	 * NOTE(review): whitespace around the "-" is not trimmed, so
	 * "foo - bar" looks like it yields a trailing space after the
	 * name and a leading space before the description -- confirm
	 * whether that is intended.
	 */
	if (st->isname) {
		/* The very first byte is never considered a separator. */
		for (i = end - 1; i > start; i--)
			if ('-' == buf[i])
				break;
		if ('-' == buf[i]) {
			/* j stays on the last "-"; i walks back over the run. */
			j = i;
			/* Roll over multiple "-". */
			for ( ; i > start; i--)
				if ('-' != buf[i])
					break;
			printf(".Nm %.*s\n", 
				(int)((i + 1) - start), &buf[start]);
			printf(".Nd %.*s\n", 
				(int)(end - (j + 1)), &buf[j + 1]);
			return;
		}
	}

	/* A heading or item just before us already broke the paragraph. */
	if ( ! st->haspar)
		puts(".Pp");

	st->haspar = 0;
	last = '\n';

	while (start < end) {
		/* 
		 * Loop til we get either to a newline or escape. 
		 * Escape initial control characters.
		 */
		while (start < end) {
			if (start < end - 1 && '<' == buf[start + 1])
				break;
			else if ('\n' == buf[start])
				break;
			else if ('\n' == last && '.' == buf[start])
				printf("\\&");
			else if ('\n' == last && '\'' == buf[start])
				printf("\\&");
			putchar(last = buf[start++]);
		}

		if (start < end - 1 && '<' == buf[start + 1]) {
			/*
			 * We've encountered a format code.
			 * This is going to trigger a macro no matter
			 * what, so print a newline now.
			 * Then print the (possibly nested) macros and
			 * following that, a newline.
			 * A zero return means no macro line was
			 * started, so we stay on the current line.
			 */
			if (formatcode(buf, &start, end, 0, last, 0))
				putchar(last = '\n');
		} else if (start < end && '\n' == buf[start]) {
			/*
			 * Print the newline only if we haven't already
			 * printed a newline.
			 */
			if (last != '\n')
				putchar(last = buf[start]);
			if (++start >= end)
				continue;
			/*
			 * If we have whitespace next, eat it to prevent
			 * mdoc(7) from thinking that it's meant for
			 * verbatim text.
			 * It is--but if we start with that, we can't
			 * have a macro subsequent it, which may be
			 * possible if we have an escape next.
			 */
			if (' ' == buf[start] || '\t' == buf[start]) {
				puts(".br");
				last = '\n';
			}
			for ( ; start < end; start++)
				if (' ' != buf[start] && '\t' != buf[start])
					break;
		} else if (start < end) {
			/* 
			 * Default: print the character. 
			 * Escape initial control characters.
			 */
			if ('\n' == last && '.' == buf[start])
				printf("\\&");
			else if ('\n' == last && '\'' == buf[start])
				printf("\\&");
			putchar(last = buf[start++]);
		}
	}

	/* Always leave the output at the beginning of a line. */
	if (last != '\n')
		putchar('\n');
}

/*
 * Dispatch one paragraph, the bytes in [start, end) of "buf", on its
 * first character: a space or tab makes it verbatim, "=" makes it a
 * command, and anything else is an ordinary paragraph.
 * Empty paragraphs are dropped.
 */
static void
dopar(struct state *st, const char *buf, size_t start, size_t end)
{

	if (start == end)
		return;

	switch (buf[start]) {
	case (' '):
		/* FALLTHROUGH */
	case ('\t'):
		verbatim(st, buf, start, end);
		break;
	case ('='):
		command(st, buf, start, end);
		break;
	default:
		ordinary(st, buf, start, end);
		break;
	}
}

/*
 * Print the mdoc(7) preamble, then loop around paragraphs within a
 * document, processing each one in the POD way.
 * "buf" holds the "sz" bytes of the document; an empty document
 * produces no output at all.
 * "fname" is used for the title and "tm" for the date unless "args"
 * overrides them.
 * Exits the program if memory for the title cannot be allocated.
 */
static void
dofile(const struct args *args, const char *fname, 
	const struct tm *tm, const char *buf, size_t sz)
{
	size_t		 sup, end, i, cur = 0;
	struct state	 st;
	const char	*section, *date;
	char		 datebuf[64];
	char		*title, *cp;

	if (0 == sz)
		return;

	/* Title is last path component of the filename. */

	if (NULL != args->title)
		title = strdup(args->title);
	else if (NULL != (cp = strrchr(fname, '/')))
		title = strdup(cp + 1);
	else
		title = strdup(fname);
	
	if (NULL == title) {
		perror(NULL);
		exit(EXIT_FAILURE);
	}

	/*
	 * Section is 1 unless suffix is "pm".
	 * The suffix, if any, is cut off the title along the way.
	 */

	if (NULL == (section = args->section)) {
		section = "1";
		if (NULL != (cp = strrchr(title, '.'))) {
			*cp++ = '\0';
			if (0 == strcmp(cp, "pm"))
				section = "3p";
		}
	} 

	/* Date.  Or the given "tm" if not supplied. */

	if (NULL == (date = args->date)) {
		strftime(datebuf, sizeof(datebuf), "%B %d, %Y", tm);
		date = datebuf;
	}

	/*
	 * Upper-case the title.
	 * Go through unsigned char: toupper() is undefined for negative
	 * values other than EOF.
	 */
	for (cp = title; '\0' != *cp; cp++)
		*cp = toupper((unsigned char)*cp);

	/* The usual mdoc(7) preamble. */

	printf(".Dd %s\n", date);
	printf(".Dt %s %s\n", title, section);
	puts(".Os");

	free(title);

	memset(&st, 0, sizeof(struct state));

	/* Main loop over file contents. */

	while (cur < sz) {
		/* Read until next paragraph: look for a blank line. */
		for (i = cur + 1; i < sz; i++)
			if ('\n' == buf[i] && '\n' == buf[i - 1])
				break;

		if (i < sz) {
			/*
			 * The paragraph stops before the newline ending
			 * its last line, no matter how many blank lines
			 * follow: those are consumed but are not part
			 * of the paragraph.
			 */
			end = i - 1;
			sup = i + 1;
			while (sup < sz && '\n' == buf[sup])
				sup++;
		} else {
			/* Adjust end marker for EOF. */
			end = '\n' == buf[sz - 1] ? sz - 1 : sz;
			sup = sz;
		}

		/* Process paragraph and adjust start. */
		dopar(&st, buf, cur, end);
		cur = sup;
	}
}

/*
 * Read a single file fully into memory.
 * If the file is "-", do it from stdin.
 * If successfully read, send the input buffer to dofile() for further
 * processing.
 * The date handed to dofile() is the file's modification time, or the
 * current time for stdin or if fstat(2) fails.
 * Returns non-zero on success and zero if the file could not be opened
 * or read; exits the program if memory runs out.
 * Any descriptor opened here is closed again before returning.
 */
static int
readfile(const struct args *args, const char *fname)
{
	int		 fd, rc;
	char		*buf;
	size_t		 bufsz, cur;
	ssize_t		 ssz;
	struct tm	*tm;
	time_t		 ttm;
	struct stat 	 st;

	assert(NULL != fname);

	fd = 0 != strcmp("-", fname) ? 
		open(fname, O_RDONLY, 0) : STDIN_FILENO;

	if (-1 == fd) {
		perror(fname);
		return(0);
	}

	if (STDIN_FILENO == fd || -1 == fstat(fd, &st)) {
		ttm = time(NULL);
		tm = localtime(&ttm);
	} else
		tm = localtime(&st.st_mtime);

	/* 
	 * Arbitrarily-sized initial buffer.
	 * Should be big enough for most files...
	 */
	cur = 0;
	bufsz = 1 << 14;
	if (NULL == (buf = malloc(bufsz))) {
		perror(NULL);
		exit(EXIT_FAILURE);
	}

	while ((ssz = read(fd, buf + cur, bufsz - cur)) > 0) {
		/*
		 * Double buffer size on fill.
		 * This keeps free space after "cur", so the next read
		 * never asks for zero bytes.
		 */
		if ((size_t)ssz == bufsz - cur)  {
			bufsz *= 2;
			if (NULL == (buf = realloc(buf, bufsz))) {
				perror(NULL);
				exit(EXIT_FAILURE);
			}
		}
		cur += (size_t)ssz;
	}

	/*
	 * Report a read error before anything else can clobber errno,
	 * then release the buffer and the descriptor on both paths.
	 */
	rc = 0;
	if (ssz < 0)
		perror(fname);
	else {
		dofile(args, STDIN_FILENO == fd ? 
			"STDIN" : fname, tm, buf, cur);
		rc = 1;
	}

	free(buf);
	if (STDIN_FILENO != fd)
		close(fd);
	return(rc);
}

/*
 * Parse the command line and convert a single POD input to mdoc(7) on
 * standard output.
 * The input is the one file operand, or standard input if there is
 * none; more than one operand is a usage error.
 * Exits with EXIT_SUCCESS if the input was read and converted and with
 * EXIT_FAILURE otherwise.
 */
int
main(int argc, char *argv[])
{
	const char	*fname, *name;
	struct args	 args;
	int		 c;

	/* Program name for the usage message: last path component. */
	name = strrchr(argv[0], '/');
	if (name == NULL)
		name = argv[0];
	else
		++name;

	memset(&args, 0, sizeof(struct args));
	fname = "-";

	/*
	 * Only -d, -n and -s have any effect.
	 * The other options are accepted and ignored -- presumably for
	 * command-line compatibility with pod2man; confirm.
	 */

	while (-1 != (c = getopt(argc, argv, "c:d:hln:oq:rs:uv")))
		switch (c) {
		case ('h'):
			/* FALLTHROUGH */
		case ('l'):
			/* FALLTHROUGH */
		case ('c'):
			/* FALLTHROUGH */
		case ('o'):
			/* FALLTHROUGH */
		case ('q'):
			/* FALLTHROUGH */
		case ('r'):
			/* FALLTHROUGH */
		case ('u'):
			/* FALLTHROUGH */
		case ('v'):
			/* Ignore these. */
			break;
		case ('d'):
			args.date = optarg;
			break;
		case ('n'):
			args.title = optarg;
			break;
		case ('s'):
			args.section = optarg;
			break;
		default:
			goto usage;
		}

	argc -= optind;
	argv += optind;

	/*
	 * Accept only a single input file.
	 * Anything more is reported rather than silently ignored.
	 */

	if (argc > 1)
		goto usage;
	else if (1 == argc)
		fname = *argv;

	return(readfile(&args, fname) ? 
		EXIT_SUCCESS : EXIT_FAILURE);

usage:
	fprintf(stderr, "usage: %s [-d date] " 
		"[-n title] [-s section]\n", name);

	return(EXIT_FAILURE);
}