pod2mdoc/pod2mdoc.c - view

Return to pod2mdoc.c CVS log

Up to [cvsweb.bsd.lv] / pod2mdoc

File: [cvsweb.bsd.lv] / pod2mdoc / pod2mdoc.c (download)

Revision 1.2, Thu Mar 20 15:15:32 2014 UTC (10 years, 1 month ago) by schwarze
Branch: MAIN
Changes since 1.1: +5 -2 lines

Enclose code samples in quotation marks.
Otherwise, they would look just like normal text.
The pod2man(1) utility does the same.

Includes an added blank character after Qc added by kristaps@.

/*	$Id: pod2mdoc.c,v 1.2 2014/03/20 15:15:32 schwarze Exp $ */
/*
 * Copyright (c) 2014 Kristaps Dzonsons <kristaps@bsd.lv>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */
#include <sys/stat.h>
#include <sys/time.h>

#include <assert.h>
#include <ctype.h>
#include <fcntl.h>
#include <getopt.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

/*
 * Values taken from the command line that replace what would otherwise
 * be derived from the input file.
 * A NULL member means that no override was given.
 */
struct	args {
	const char	*title;		/* replaces the "Dt" title */
	const char	*date;		/* replaces the "Dd" date */
	const char	*section;	/* replaces the "Dt" section */
};

/*
 * Parser state carried from one paragraph to the next.
 */
struct	state {
	/* Non-zero once a command other than =cut was seen; =cut clears it. */
	int		 parsing;
	/* Non-zero between a =begin command and the next =end command. */
	int		 paused;
	/* Non-zero when the next ordinary paragraph must not emit a Pp. */
	int		 haspar;
	/* Non-zero while inside the "NAME" =head1 section. */
	int		 isname;
	/* Name of the file being parsed. */
	const char	*fname;
};

/*
 * POD formatting codes, i.e., the letter X in "X<...>".
 * The order must match the fmts[] letter table.
 */
enum	fmt {
	FMT_ITALIC = 0,	/* I<...> */
	FMT_BOLD,	/* B<...> */
	FMT_CODE,	/* C<...> */
	FMT_LINK,	/* L<...> */
	FMT_ESCAPE,	/* E<...> */
	FMT_FILE,	/* F<...> */
	FMT_NBSP,	/* S<...> */
	FMT_INDEX,	/* X<...> */
	FMT_NULL,	/* Z<...> */
	FMT__MAX	/* number of formatting codes */
};

/*
 * POD command paragraphs ("=name ...").
 * The order must match the cmds[] name table.
 */
enum	cmd {
	CMD_POD = 0,	/* =pod */
	CMD_HEAD1,	/* =head1 */
	CMD_HEAD2,	/* =head2 */
	CMD_HEAD3,	/* =head3 */
	CMD_HEAD4,	/* =head4 */
	CMD_OVER,	/* =over */
	CMD_ITEM,	/* =item */
	CMD_BACK,	/* =back */
	CMD_BEGIN,	/* =begin */
	CMD_END,	/* =end */
	CMD_FOR,	/* =for */
	CMD_ENCODING,	/* =encoding */
	CMD_CUT,	/* =cut */
	CMD__MAX	/* number of commands */
};

/*
 * Command names without the leading "=", indexed by enum cmd.
 */
static	const char *const cmds[CMD__MAX] = {
	"pod",		/* index CMD_POD */
	"head1",	/* index CMD_HEAD1 */
	"head2",	/* index CMD_HEAD2 */
	"head3",	/* index CMD_HEAD3 */
	"head4",	/* index CMD_HEAD4 */
	"over",		/* index CMD_OVER */
	"item",		/* index CMD_ITEM */
	"back",		/* index CMD_BACK */
	"begin",	/* index CMD_BEGIN */
	"end",		/* index CMD_END */
	"for",		/* index CMD_FOR */
	"encoding",	/* index CMD_ENCODING */
	"cut"		/* index CMD_CUT */
};

/*
 * Formatting code letters, indexed by enum fmt.
 */
static	const char fmts[FMT__MAX] = {
	'I',		/* index FMT_ITALIC */
	'B',		/* index FMT_BOLD */
	'C',		/* index FMT_CODE */
	'L',		/* index FMT_LINK */
	'E',		/* index FMT_ESCAPE */
	'F',		/* index FMT_FILE */
	'S',		/* index FMT_NBSP */
	'X',		/* index FMT_INDEX */
	'Z'		/* index FMT_NULL */
};

/*
 * Given buf[*start] is at the start of an escape name (the text just
 * after "E<"), read til the end of the escape ('>') then try to do
 * something with it.
 * Sets *start to be one after the '>', or to "end" if the input runs
 * out before a '>' is found.
 * Names that do not fit into the local buffer are skipped, including
 * their terminating '>', without printing anything.
 */
static void
formatescape(const char *buf, size_t *start, size_t end)
{
	char		 esc[16]; /* no more needed */
	size_t		 i, max;

	max = sizeof(esc) - 1;
	i = 0;
	/* Read til our buffer is full. */
	while (*start < end && '>' != buf[*start] && i < max)
		esc[i++] = buf[(*start)++];
	esc[i] = '\0';

	if (i == max) {
		/*
		 * Too long... skip til we end.
		 * Also step over the closing '>' so that the caller
		 * does not print it as ordinary text.
		 */
		while (*start < end && '>' != buf[*start])
			(*start)++;
		if (*start < end)
			(*start)++;
		return;
	} else if (*start >= end)
		return;

	assert('>' == buf[*start]);
	(*start)++;

	/*
	 * TODO: right now, we only recognise the named escapes.
	 * Just let the rest of them go. 
	 */
	if (0 == strcmp(esc, "lt"))
		printf("\\(la");
	else if (0 == strcmp(esc, "gt"))
		printf("\\(ra");
	else if (0 == strcmp(esc, "vb"))
		printf("\\(ba");
	else if (0 == strcmp(esc, "sol"))
		printf("\\(sl");
}

/*
 * Advance *start past a run of blank (' ') characters, never moving
 * beyond "end".
 * Tabs and newlines are not skipped.
 */
static void
skipspace(const char *buf, size_t *start, size_t end)
{
	size_t		 pos;

	for (pos = *start; pos < end; pos++)
		if (' ' != buf[pos])
			break;
	*start = pos;
}

/*
 * We're at the character in front of a format code, which is structured
 * like X<...> and can contain nested format codes.
 * This consumes the whole format code, and any nested format codes, til
 * the end of matched production.
 * If "reentrant", then we're being called after a macro has already
 * been printed to the current line.
 * "last" is set to the last read character: this is used to determine
 * whether we should buffer with space or not.
 * If "nomacro", then we don't print any macros, just contained data.
 * Returns 0 if the code was handled without processing a body (unknown
 * code letter, escape, index or null code) and 1 if the body of the
 * code was processed.
 */
static int
formatcode(const char *buf, size_t *start, 
	size_t end, int reentrant, int last, int nomacro)
{
	enum fmt	 fmt;
	char		 c;

	assert(*start + 1 < end);
	assert('<' == buf[*start + 1]);

	for (fmt = 0; fmt < FMT__MAX; fmt++) 
		if (buf[*start] == fmts[fmt])
			break;

	/* Invalid macros are just regular text. */

	if (FMT__MAX == fmt) {
		putchar(buf[*start]);
		(*start)++;
		return(0);
	}

	/* Step over the code letter and the opening '<'. */

	*start += 2;

	/*
	 * Escapes don't print macro sequences, so just output them like
	 * normal text before processing for macros.
	 */
	if (FMT_ESCAPE == fmt) {
		formatescape(buf, start, end);
		return(0);
	} else if (FMT_NULL == fmt || FMT_INDEX == fmt) {
		/* For indices and nulls, just consume. */
		while (*start < end && '>' != buf[*start])
			(*start)++;
		if (*start < end)
			(*start)++;
		return(0);
	}

	if ( ! nomacro) {
		/*
		 * Print out the macro describing this format code.
		 * If we're not "reentrant" (not yet on a macro line)
		 * then print a newline, if necessary, and the macro
		 * indicator.
		 * Otherwise, offset us with a space.
		 */
		if ( ! reentrant && last != '\n')
			putchar('\n');
		if ( ! reentrant)
			putchar('.');
		else
			putchar(' ');
		
		/*
		 * If we don't have whitespace before us, then suppress
		 * macro whitespace with Ns.
		 */
		if (' ' != last)
			printf("Ns ");
		switch (fmt) {
		case (FMT_ITALIC):
			printf("Em ");
			break;
		case (FMT_BOLD):
			printf("Sy ");
			break;
		case (FMT_CODE):
			printf("Qo Li ");
			break;
		case (FMT_LINK):
			printf("Lk ");
			break;
		case (FMT_FILE):
			printf("Pa ");
			break;
		case (FMT_NBSP):
			/* TODO. */
			printf("No ");
			break;
		default:
			abort();
		}
	}

	/*
	 * Read until we reach the end marker ('>') or until we find a
	 * nested format code.
	 * Don't emit any newlines: since we're on a macro line, we
	 * don't want to break the line.
	 */
	while (*start < end) {
		if ('>' == buf[*start]) {
			(*start)++;
			break;
		}
		if (*start + 1 < end && '<' == buf[*start + 1]) {
			formatcode(buf, start, end, 1, last, nomacro);
			continue;
		}
		if ('\n' != buf[*start]) {
			/*
			 * Make sure that any macro-like words (or
			 * really any word starting with a capital
			 * letter) is assumed to be a macro that must be
			 * escaped.
			 * The cast keeps the isupper() argument within
			 * the range of unsigned char, as required for
			 * bytes with the high bit set.
			 * XXX: should this be isalpha()?
			 */
			if ((' ' == last || '\n' == last) && 
				isupper((unsigned char)buf[*start]))
				printf("\\&");
			putchar(last = buf[*start]);
		}
		(*start)++;
	}

	/* Close the quotation opened by "Qo" for code samples. */

	if ( ! nomacro && FMT_CODE == fmt)
		printf(" Qc ");

	if (reentrant)
		return(1);

	/*
	 * If we're not reentrant, we want to put ending punctuation on
	 * the macro line so that it's properly handled by being
	 * smooshed against the terminal word.
	 * Every read of buf[] is guarded by "*start < end", so we never
	 * look past the end of the paragraph.
	 */
	skipspace(buf, start, end);
	while (*start < end) {
		c = buf[*start];
		if (',' != c && '.' != c && '!' != c && 
			'?' != c && ')' != c)
			break;
		putchar(' ');
		putchar(c);
		(*start)++;
	}
	/* No-op unless punctuation was consumed above. */
	skipspace(buf, start, end);
	return(1);
}

/*
 * Print buf[*start, end) onto the current output line.
 * Newlines in the input are dropped; anything followed by '<' is
 * handed to formatcode() in reentrant mode.
 * On return, *start is at or beyond "end".
 */
static void
formatcodeln(const char *buf, size_t *start, size_t end, int nomacro)
{
	int		 prev;
	char		 ch;

	prev = '\n';
	while (*start < end) {
		if (*start + 1 < end && buf[*start + 1] == '<') {
			/* formatcode() advances *start itself. */
			formatcode(buf, start, end, 1, prev, nomacro);
		} else {
			ch = buf[(*start)++];
			if (ch != '\n') {
				prev = ch;
				putchar(ch);
			}
		}
	}
}

/*
 * A command paragraph, as noted in the perlpod manual, just indicates
 * that we should do something, optionally with some text to print as
 * well.
 * buf[start] must be the '=' introducing the command; the paragraph
 * ends before buf[end].
 * A command name is only recognised if it is followed by whitespace or
 * by the end of the paragraph, so "=items" is not mistaken for "=item".
 * Unknown commands are ignored.
 */
static void
command(struct state *st, const char *buf, size_t start, size_t end)
{
	size_t		 len, csz;
	enum cmd	 cmd;

	assert('=' == buf[start]);
	start++;
	len = end - start;

	csz = 0;
	for (cmd = 0; cmd < CMD__MAX; cmd++) {
		csz = strlen(cmds[cmd]);
		if (len < csz)
			continue;
		if (0 != memcmp(&buf[start], cmds[cmd], csz))
			continue;
		/* Require a whole word, not merely a prefix. */
		if (len == csz || 
			isspace((unsigned char)buf[start + csz]))
			break;
	}

	/* Ignore bogus commands. */

	if (CMD__MAX == cmd)
		return;

	start += csz;
	skipspace(buf, &start, end);

	/*
	 * Inside a =begin block, everything is swallowed; only =end
	 * resumes normal processing.
	 */
	if (st->paused) {
		st->paused = CMD_END != cmd;
		return;
	}

	switch (cmd) {
	case (CMD_POD):
		break;
	case (CMD_HEAD1):
		/*
		 * The behaviour of head= follows from a quick glance at
		 * how pod2man handles it.
		 */
		printf(".Sh ");
		st->isname = 0;
		if (end - start == 4)
			if (0 == memcmp(&buf[start], "NAME", 4))
				st->isname = 1;
		formatcodeln(buf, &start, end, 1);
		putchar('\n');
		st->haspar = 1;
		break;
	case (CMD_HEAD2):
		printf(".Ss ");
		formatcodeln(buf, &start, end, 1);
		putchar('\n');
		st->haspar = 1;
		break;
	case (CMD_HEAD3):
		puts(".Pp");
		printf(".Em ");
		formatcodeln(buf, &start, end, 0);
		putchar('\n');
		puts(".Pp");
		st->haspar = 1;
		break;
	case (CMD_HEAD4):
		puts(".Pp");
		printf(".No ");
		formatcodeln(buf, &start, end, 0);
		putchar('\n');
		puts(".Pp");
		st->haspar = 1;
		break;
	case (CMD_OVER):
		/*
		 * TODO: we should be doing this after we process the
		 * first =item to see whether we'll do an -enum,
		 * -bullet, or something else.
		 */
		puts(".Bl -tag -width Ds");
		break;
	case (CMD_ITEM):
		printf(".It ");
		formatcodeln(buf, &start, end, 0);
		putchar('\n');
		st->haspar = 1;
		break;
	case (CMD_BACK):
		puts(".El");
		break;
	case (CMD_BEGIN):
		/* 
		 * We disregard all types for now.
		 * TODO: process at least "text" in a -literal block.
		 */
		st->paused = 1;
		break;
	case (CMD_END):
		/*
		 * An =end without a preceding =begin: there is nothing
		 * to resume, so ignore it instead of aborting on
		 * malformed input.
		 */
		break;
	case (CMD_FOR):
		/* 
		 * We ignore all types of encodings and formats
		 * unilaterally.
		 */
		break;
	case (CMD_ENCODING):
		break;
	case (CMD_CUT):
		st->parsing = 0;
		return;
	default:
		abort();
	}

	/* Any command (but =cut) makes us start parsing. */
	st->parsing = 1;
}

/*
 * Emit buf[start, end) unchanged inside a literal display (Bd/Ed).
 * Nothing is printed unless we are parsing and not inside a =begin
 * block.
 */
static void
verbatim(struct state *st, const char *buf, size_t start, size_t end)
{
	int		 len;

	if (st->parsing && ! st->paused) {
		len = (int)(end - start);
		puts(".Bd -literal");
		printf("%.*s\n", len, buf + start);
		puts(".Ed");
	}
}

/*
 * Locate the separator of a "name - description" line in
 * buf[start, end), which must not be empty.
 * Preferred is the first run of dashes that has whitespace in front of
 * it and whitespace (or the end of the paragraph) behind it, so that
 * hyphens inside the name or the description are left alone.
 * Failing that, fall back to the run of dashes ending at the last dash
 * of the paragraph.
 * On success, *nmlast is the index of the last byte in front of the
 * dashes, *ndfirst the index of the first byte behind them, and the
 * return value is 1; if there is no dash at all, return 0.
 */
static int
namesep(const char *buf, size_t start, size_t end, 
	size_t *nmlast, size_t *ndfirst)
{
	size_t		 i, j;

	for (i = start + 1; i < end; i++) {
		if ('-' != buf[i] || 
			! isspace((unsigned char)buf[i - 1]))
			continue;
		/* Find the last dash of this run. */
		for (j = i; j + 1 < end && '-' == buf[j + 1]; j++)
			continue;
		if (j + 1 == end || 
			isspace((unsigned char)buf[j + 1])) {
			*nmlast = i - 1;
			*ndfirst = j + 1;
			return(1);
		}
		/* Something like "-v": not a separator, keep going. */
		i = j;
	}

	for (i = end - 1; i > start; i--)
		if ('-' == buf[i])
			break;
	if ('-' != buf[i])
		return(0);

	j = i;
	/* Roll over multiple "-". */
	for ( ; i > start; i--)
		if ('-' != buf[i])
			break;
	*nmlast = i;
	*ndfirst = j + 1;
	return(1);
}

/*
 * Ordinary paragraph.
 * Well, this is really the hardest--POD seems to assume that, for
 * example, a leading space implies a newline, and so on.
 * Lots of other snakes in the grass: escaping a newline followed by a
 * period (accidental mdoc(7) control), double-newlines after macro
 * passages, etc.
 * Nothing is printed unless we are parsing and not inside a =begin
 * block.
 */
static void
ordinary(struct state *st, const char *buf, size_t start, size_t end)
{
	int		last;
	size_t		i, j;

	if ( ! st->parsing || st->paused)
		return;

	/*
	 * Special-case: the NAME section.
	 * If we find a separating "-", assume that we're in
	 * "name - description" format.
	 * To wit, print out a "Nm" and "Nd" in that format.
	 * Whitespace next to the dashes is passed through as is.
	 */
	if (st->isname && namesep(buf, start, end, &i, &j)) {
		printf(".Nm %.*s\n", 
			(int)((i + 1) - start), &buf[start]);
		printf(".Nd %.*s\n", 
			(int)(end - j), &buf[j]);
		return;
	}

	/* Headings and list items already imply a paragraph break. */

	if ( ! st->haspar)
		puts(".Pp");

	st->haspar = 0;
	last = '\n';

	while (start < end) {
		/* 
		 * Loop til we get either to a newline or escape. 
		 * Escape initial control characters.
		 */
		while (start < end) {
			if (start < end - 1 && '<' == buf[start + 1])
				break;
			else if ('\n' == buf[start])
				break;
			else if ('\n' == last && '.' == buf[start])
				printf("\\&");
			else if ('\n' == last && '\'' == buf[start])
				printf("\\&");
			putchar(last = buf[start++]);
		}

		if (start < end - 1 && '<' == buf[start + 1]) {
			/*
			 * We've encountered a format code.
			 * This is going to trigger a macro no matter
			 * what, so print a newline now.
			 * Then print the (possibly nested) macros and
			 * following that, a newline.
			 */
			if (formatcode(buf, &start, end, 0, last, 0))
				putchar(last = '\n');
		} else if (start < end && '\n' == buf[start]) {
			/*
			 * Print the newline only if we haven't already
			 * printed a newline.
			 */
			if (last != '\n')
				putchar(last = buf[start]);
			if (++start >= end)
				continue;
			/*
			 * If we have whitespace next, eat it to prevent
			 * mdoc(7) from thinking that it's meant for
			 * verbatim text.
			 * It is--but if we start with that, we can't
			 * have a macro following it, which may be
			 * possible if we have an escape next.
			 */
			if (' ' == buf[start] || '\t' == buf[start]) {
				puts(".br");
				last = '\n';
			}
			for ( ; start < end; start++)
				if (' ' != buf[start] && '\t' != buf[start])
					break;
		} else if (start < end) {
			/* 
			 * Default: print the character. 
			 * Escape initial control characters.
			 */
			if ('\n' == last && '.' == buf[start])
				printf("\\&");
			else if ('\n' == last && '\'' == buf[start])
				printf("\\&");
			putchar(last = buf[start++]);
		}
	}

	/* Always finish the paragraph at the start of a fresh line. */

	if (last != '\n')
		putchar('\n');
}

/*
 * Dispatch one paragraph, buf[start, end), by looking at its first
 * character: blank or tab means verbatim, "=" means command, anything
 * else is an ordinary paragraph.
 * Empty paragraphs are ignored.
 */
static void
dopar(struct state *st, const char *buf, size_t start, size_t end)
{

	if (end == start)
		return;

	switch (buf[start]) {
	case (' '):
		/* FALLTHROUGH */
	case ('\t'):
		verbatim(st, buf, start, end);
		break;
	case ('='):
		command(st, buf, start, end);
		break;
	default:
		ordinary(st, buf, start, end);
		break;
	}
}

/*
 * Print the mdoc(7) preamble, then loop around paragraphs within a
 * document, processing each one in the POD way.
 * "fname" supplies the default title, "tm" the default date; both can
 * be overridden by "args".
 * The document is buf[0, sz); nothing at all is printed if it is empty.
 * Exits the program if memory runs out.
 */
static void
dofile(const struct args *args, const char *fname, 
	const struct tm *tm, const char *buf, size_t sz)
{
	size_t		 sup, end, i, cur = 0;
	struct state	 st;
	const char	*section, *date;
	char		 datebuf[64];
	char		*title, *cp;

	if (0 == sz)
		return;

	/* Title is last path component of the filename. */

	if (NULL != args->title)
		title = strdup(args->title);
	else if (NULL != (cp = strrchr(fname, '/')))
		title = strdup(cp + 1);
	else
		title = strdup(fname);
	
	if (NULL == title) {
		perror(NULL);
		exit(EXIT_FAILURE);
	}

	/*
	 * Section is 1 unless suffix is "pm".
	 * Either way, the suffix is cut off the title.
	 */

	if (NULL == (section = args->section)) {
		section = "1";
		if (NULL != (cp = strrchr(title, '.'))) {
			*cp++ = '\0';
			if (0 == strcmp(cp, "pm"))
				section = "3p";
		}
	} 

	/*
	 * Date.  Or the given "tm" if not supplied.
	 * Fall back to an empty date rather than handing a NULL "tm"
	 * to strftime() or printing an unspecified buffer.
	 */

	if (NULL == (date = args->date)) {
		if (NULL == tm || 0 == strftime(datebuf, 
			sizeof(datebuf), "%B %d, %Y", tm))
			datebuf[0] = '\0';
		date = datebuf;
	}

	/* The cast keeps high-bit bytes valid toupper() arguments. */

	for (cp = title; '\0' != *cp; cp++)
		*cp = (char)toupper((unsigned char)*cp);

	/* The usual mdoc(7) preamble. */

	printf(".Dd %s\n", date);
	printf(".Dt %s %s\n", title, section);
	puts(".Os");

	free(title);

	memset(&st, 0, sizeof(struct state));
	assert(sz > 0);

	/* Main loop over file contents. */

	while (cur < sz) {
		/* Read until next paragraph. */
		for (i = cur + 1; i < sz; i++)
			if ('\n' == buf[i] && '\n' == buf[i - 1]) {
				/* Consume blank paragraphs. */
				while (i + 1 < sz && '\n' == buf[i + 1])
					i++;
				break;
			}
		
		/*
		 * Adjust end marker for EOF: at the end of the file,
		 * drop a single trailing newline, if there is one.
		 */
		end = i < sz ? i - 1 : 
			('\n' == buf[sz - 1] ? sz - 1 : sz);
		sup = i < sz ? end + 2 : sz;

		/* Process paragraph and adjust start. */
		dopar(&st, buf, cur, end);
		cur = sup;
	}
}

/*
 * Read a single file fully into memory.
 * If the file is "-", do it from stdin.
 * If successfully read, send the input buffer to dofile() for further
 * processing.
 * The date handed to dofile() is the modification time of the file, or
 * the current time for stdin or if fstat(2) fails.
 * Returns 1 on success and 0 if the file cannot be opened or read;
 * exits the program if memory runs out.
 */
static int
readfile(const struct args *args, const char *fname)
{
	int		 fd, rc;
	char		*buf;
	size_t		 bufsz, cur;
	ssize_t		 ssz;
	struct tm	*tm;
	time_t		 ttm;
	struct stat 	 st;

	assert(NULL != fname);

	fd = 0 != strcmp("-", fname) ? 
		open(fname, O_RDONLY, 0) : STDIN_FILENO;

	if (-1 == fd) {
		perror(fname);
		return(0);
	}

	if (STDIN_FILENO == fd || -1 == fstat(fd, &st)) {
		ttm = time(NULL);
		tm = localtime(&ttm);
	} else
		tm = localtime(&st.st_mtime);

	/* 
	 * Arbitrarily-sized initial buffer.
	 * Should be big enough for most files...
	 */
	cur = 0;
	bufsz = 1 << 14;
	if (NULL == (buf = malloc(bufsz))) {
		perror(NULL);
		exit(EXIT_FAILURE);
	}

	while ((ssz = read(fd, buf + cur, bufsz - cur)) > 0) {
		/* Double buffer size on fill. */
		if ((size_t)ssz == bufsz - cur)  {
			bufsz *= 2;
			if (NULL == (buf = realloc(buf, bufsz))) {
				perror(NULL);
				exit(EXIT_FAILURE);
			}
		}
		cur += (size_t)ssz;
	}

	if (ssz < 0) {
		/* Report first: the cleanup below may change errno. */
		perror(fname);
		rc = 0;
	} else {
		dofile(args, STDIN_FILENO == fd ? 
			"STDIN" : fname, tm, buf, cur);
		rc = 1;
	}

	/* Release the buffer and the descriptor on every path. */

	free(buf);
	if (STDIN_FILENO != fd)
		close(fd);
	return(rc);
}

/*
 * Parse the command line, then convert a single POD file (standard
 * input if no file operand is given) to mdoc(7) on standard output.
 * Returns EXIT_SUCCESS if the input was read and EXIT_FAILURE
 * otherwise, including on usage errors.
 */
int
main(int argc, char *argv[])
{
	const char	*fname, *name;
	struct args	 args;
	int		 c;

	/* The program name, for the usage message. */

	name = strrchr(argv[0], '/');
	if (name == NULL)
		name = argv[0];
	else
		++name;

	memset(&args, 0, sizeof(struct args));
	fname = "-";

	/*
	 * Only -d, -n, and -s have an effect.
	 * The other options are accepted and ignored.
	 */

	while (-1 != (c = getopt(argc, argv, "c:d:hln:oq:rs:uv")))
		switch (c) {
		case ('h'):
			/* FALLTHROUGH */
		case ('l'):
			/* FALLTHROUGH */
		case ('c'):
			/* FALLTHROUGH */
		case ('o'):
			/* FALLTHROUGH */
		case ('q'):
			/* FALLTHROUGH */
		case ('r'):
			/* FALLTHROUGH */
		case ('u'):
			/* FALLTHROUGH */
		case ('v'):
			/* Ignore these. */
			break;
		case ('d'):
			args.date = optarg;
			break;
		case ('n'):
			args.title = optarg;
			break;
		case ('s'):
			args.section = optarg;
			break;
		default:
			goto usage;
		}

	argc -= optind;
	argv += optind;

	/*
	 * Accept only a single input file.
	 * Any further operand is an error: report it instead of
	 * silently reading standard input.
	 */

	if (argc > 1)
		goto usage;
	else if (1 == argc)
		fname = *argv;

	return(readfile(&args, fname) ? 
		EXIT_SUCCESS : EXIT_FAILURE);

usage:
	fprintf(stderr, "usage: %s [-d date] " 
		"[-n title] [-s section]\n", name);

	return(EXIT_FAILURE);
}