File: [cvsweb.bsd.lv] / pod2mdoc / pod2mdoc.c (download)
Revision 1.4, Thu Mar 20 15:29:57 2014 UTC (10 years, 6 months ago) by schwarze
Branch: MAIN
Changes since 1.3: +126 -25 lines
Version 0.0.5 from kristaps@.
* elementary list support
* less excessive escaping of possible mdoc macros in formatcode()
* same escaping of possible mdoc macros in formatcodeln()
|
/* $Id: pod2mdoc.c,v 1.4 2014/03/20 15:29:57 schwarze Exp $ */
/*
* Copyright (c) 2014 Kristaps Dzonsons <kristaps@bsd.lv>
*
* Permission to use, copy, modify, and distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
* copyright notice and this permission notice appear in all copies.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/
#include <sys/stat.h>
#include <sys/time.h>

#include <assert.h>
#include <ctype.h>
#include <fcntl.h>
#include <getopt.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <unistd.h>
/*
 * Values taken from the command line that replace what would otherwise
 * be derived for the mdoc(7) prologue.
 * A NULL member means the corresponding option was not given.
 */
struct args {
	const char	*title;		/* replaces the "Dt" title */
	const char	*date;		/* replaces the "Dd" date */
	const char	*section;	/* replaces the "Dt" section */
};
/*
 * The kinds of list we can emit.
 * LIST__MAX doubles as the "opened with =over, type not decided yet"
 * marker on the list stack.
 */
enum list {
	LIST_BULLET,	/* .Bl -bullet */
	LIST_ENUM,	/* .Bl -enum */
	LIST_TAG,	/* .Bl -tag */
	LIST__MAX
};
/* Maximum depth of nested =over lists we keep track of. */
#define	LIST_STACKSZ	128

/*
 * Parser state carried from one paragraph to the next.
 */
struct state {
	int		 parsing;	/* zero after =cut or before the first command */
	int		 paused;	/* between =begin and =end */
	int		 haspar;	/* if set, next ordinary paragraph skips its Pp */
	int		 isname;	/* most recent =head1 was "NAME" */
	const char	*fname;		/* file being parsed (not assigned in the code visible here) */
	enum list	 lstack[LIST_STACKSZ];	/* open lists, innermost last */
	size_t		 lpos;		/* number of entries used in lstack */
};
/*
 * POD formatting codes (the "X" in "X<...>").
 * The order must match the fmts[] letter table.
 */
enum fmt {
	FMT_ITALIC,	/* I<> */
	FMT_BOLD,	/* B<> */
	FMT_CODE,	/* C<> */
	FMT_LINK,	/* L<> */
	FMT_ESCAPE,	/* E<> */
	FMT_FILE,	/* F<> */
	FMT_NBSP,	/* S<> */
	FMT_INDEX,	/* X<> */
	FMT_NULL,	/* Z<> */
	FMT__MAX
};
/*
 * POD command paragraphs ("=name ...") that we recognise.
 */
enum cmd {
	CMD_POD,
	CMD_HEAD1,
	CMD_HEAD2,
	CMD_HEAD3,
	CMD_HEAD4,
	CMD_OVER,
	CMD_ITEM,
	CMD_BACK,
	CMD_BEGIN,
	CMD_END,
	CMD_FOR,
	CMD_ENCODING,
	CMD_CUT,
	CMD__MAX
};

/*
 * Command names as they appear after the leading "=", indexed by
 * enum cmd.
 */
static const char *const cmds[CMD__MAX] = {
	[CMD_POD]	= "pod",
	[CMD_HEAD1]	= "head1",
	[CMD_HEAD2]	= "head2",
	[CMD_HEAD3]	= "head3",
	[CMD_HEAD4]	= "head4",
	[CMD_OVER]	= "over",
	[CMD_ITEM]	= "item",
	[CMD_BACK]	= "back",
	[CMD_BEGIN]	= "begin",
	[CMD_END]	= "end",
	[CMD_FOR]	= "for",
	[CMD_ENCODING]	= "encoding",
	[CMD_CUT]	= "cut"
};
/*
 * The letter introducing each formatting code, indexed by enum fmt.
 */
static const char fmts[FMT__MAX] = {
	'I',	/* FMT_ITALIC: italic text */
	'B',	/* FMT_BOLD: bold text */
	'C',	/* FMT_CODE: code */
	'L',	/* FMT_LINK: hyperlink */
	'E',	/* FMT_ESCAPE: character escape */
	'F',	/* FMT_FILE: file name */
	'S',	/* FMT_NBSP: non-breaking spaces */
	'X',	/* FMT_INDEX: index entry */
	'Z'	/* FMT_NULL: null code */
};
/*
 * Given buf[*start] is at the start of an escape name (just after
 * "E<"), read til the end of the escape ('>') then try to do something
 * with it.
 * On return, *start is one after the '>', or equal to "end" if the
 * escape was never closed.
 * Names that are too long for our buffer, and names we don't know,
 * are consumed but produce no output.
 */
static void
formatescape(const char *buf, size_t *start, size_t end)
{
	char		 esc[16]; /* no more needed */
	size_t		 i, max;
	int		 toolong;

	max = sizeof(esc) - 1;
	i = 0;
	toolong = 0;

	/*
	 * Walk to the closing '>', keeping as much of the name as fits.
	 * Anything beyond that only marks the name as unusable.
	 */
	for ( ; *start < end && '>' != buf[*start]; (*start)++) {
		if (i < max)
			esc[i++] = buf[*start];
		else
			toolong = 1;
	}
	esc[i] = '\0';

	/* Unterminated escape: nothing more to consume. */
	if (*start >= end)
		return;

	/* Always step over the '>' so it isn't printed as text. */
	assert('>' == buf[*start]);
	(*start)++;

	if (toolong)
		return;

	/*
	 * TODO: right now, we only recognise the named escapes.
	 * Just let the rest of them go.
	 * perlpod spells the vertical bar "verbar"; "vb" is kept for
	 * compatibility with what we accepted before.
	 */
	if (0 == strcmp(esc, "lt"))
		printf("\\(la");
	else if (0 == strcmp(esc, "gt"))
		printf("\\(ra");
	else if (0 == strcmp(esc, "verbar") || 0 == strcmp(esc, "vb"))
		printf("\\(ba");
	else if (0 == strcmp(esc, "sol"))
		printf("\\(sl");
}
/*
 * Advance *start past any run of blanks (' ' only), never going
 * beyond "end".
 */
static void
skipspace(const char *buf, size_t *start, size_t end)
{
	size_t		 pos;

	for (pos = *start; pos < end; pos++)
		if (' ' != buf[pos])
			break;

	*start = pos;
}
/*
 * We're at the character in front of a format code, which is structured
 * like X<...> and can contain nested format codes.
 * This consumes the whole format code, and any nested format codes, til
 * the end of matched production.
 * If "reentrant", then we're being called after a macro has already
 * been printed to the current line.
 * "last" is set to the last read character: this is used to determine
 * whether we should buffer with space or not.
 * If "nomacro", then we don't print any macros, just contained data.
 * Returns 0 if the code was not a recognised letter (the letter is
 * printed as plain text), an escape, or a silently consumed index/null
 * code; returns 1 once the body of a code has been written out.
 */
static int
formatcode(const char *buf, size_t *start,
	size_t end, int reentrant, int last, int nomacro)
{
	enum fmt	 fmt;

	assert(*start + 1 < end);
	assert('<' == buf[*start + 1]);

	for (fmt = 0; fmt < FMT__MAX; fmt++)
		if (buf[*start] == fmts[fmt])
			break;

	/* Invalid macros are just regular text. */

	if (FMT__MAX == fmt) {
		putchar(buf[*start]);
		(*start)++;
		return(0);
	}

	/* Step over the code letter and the '<'. */
	*start += 2;

	/*
	 * Escapes don't print macro sequences, so just output them like
	 * normal text before processing for macros.
	 */
	if (FMT_ESCAPE == fmt) {
		formatescape(buf, start, end);
		return(0);
	} else if (FMT_NULL == fmt || FMT_INDEX == fmt) {
		/* For indices and nulls, just consume. */
		while (*start < end && '>' != buf[*start])
			(*start)++;
		if (*start < end)
			(*start)++;
		return(0);
	}

	if ( ! nomacro) {
		/*
		 * Print out the macro describing this format code.
		 * If we're not "reentrant" (not yet on a macro line)
		 * then print a newline, if necessary, and the macro
		 * indicator.
		 * Otherwise, offset us with a space.
		 */
		if ( ! reentrant && last != '\n')
			putchar('\n');
		if ( ! reentrant)
			putchar('.');
		else
			putchar(' ');

		/*
		 * If we don't have whitespace before us, then suppress
		 * macro whitespace with Ns.
		 */
		if (' ' != last)
			printf("Ns ");

		switch (fmt) {
		case (FMT_ITALIC):
			printf("Em ");
			break;
		case (FMT_BOLD):
			printf("Sy ");
			break;
		case (FMT_CODE):
			printf("Qo Li ");
			break;
		case (FMT_LINK):
			printf("Lk ");
			break;
		case (FMT_FILE):
			printf("Pa ");
			break;
		case (FMT_NBSP):
			/* TODO. */
			printf("No ");
			break;
		default:
			abort();
		}
	}

	/*
	 * Read until we reach the end marker ('>') or until we find a
	 * nested format code.
	 * Don't emit any newlines: since we're on a macro line, we
	 * don't want to break the line.
	 */
	while (*start < end) {
		if ('>' == buf[*start]) {
			(*start)++;
			break;
		}
		if (*start + 1 < end && '<' == buf[*start + 1]) {
			formatcode(buf, start, end, 1, last, nomacro);
			continue;
		}

		/*
		 * Make sure that any macro-like words (or
		 * really any word starting with a capital
		 * letter) is assumed to be a macro that must be
		 * escaped.
		 * This matches "Xx " and "XxEOLN".
		 * The <ctype.h> functions need an unsigned char
		 * value: a plain (possibly negative) char is
		 * undefined behaviour for bytes >= 0x80.
		 */
		if ((' ' == last || '\n' == last) &&
		    end - *start > 1 &&
		    isupper((unsigned char)buf[*start]) &&
		    islower((unsigned char)buf[*start + 1]) &&
		    (end - *start == 2 ||
		     ' ' == buf[*start + 2]))
			printf("\\&");

		/* Suppress newline. */
		if ('\n' == (last = buf[(*start)++]))
			last = ' ';

		putchar(last);
	}

	if ( ! nomacro && FMT_CODE == fmt)
		printf(" Qc ");

	if (reentrant)
		return(1);

	/*
	 * If we're not reentrant, we want to put ending punctuation on
	 * the macro line so that it's properly handled by being
	 * smooshed against the terminal word.
	 * Never look at buf[end]: the code may be the last thing in
	 * the paragraph.
	 */
	skipspace(buf, start, end);
	while (*start < end &&
	    (',' == buf[*start] || '.' == buf[*start] ||
	     '!' == buf[*start] || '?' == buf[*start] ||
	     ')' == buf[*start])) {
		putchar(' ');
		putchar(buf[*start]);
		(*start)++;
	}
	/* A no-op unless punctuation was moved onto the macro line. */
	skipspace(buf, start, end);
	return(1);
}
/*
 * Print the text from *start up to "end" onto the current (macro)
 * line, handing every "X<" sequence to formatcode() as a reentrant
 * call.
 * Newlines in the input are printed as blanks so the output stays on
 * one line.
 * "nomacro" is passed through to formatcode().
 */
static void
formatcodeln(const char *buf, size_t *start, size_t end, int nomacro)
{
	int		 last;

	last = ' ';
	while (*start < end) {
		if (*start + 1 < end && '<' == buf[*start + 1]) {
			formatcode(buf, start, end, 1, last, nomacro);
			continue;
		}
		/*
		 * Since we're already on a macro line, we want to make
		 * sure that we don't inadvertently invoke a macro.
		 * We need to do this carefully because section names
		 * are used in troff and we don't want to escape
		 * something that needn't be escaped.
		 * Cast to unsigned char: handing a negative plain char
		 * to the <ctype.h> functions is undefined behaviour.
		 */
		if (' ' == last && end - *start > 1 &&
		    isupper((unsigned char)buf[*start]) &&
		    islower((unsigned char)buf[*start + 1]) &&
		    (end - *start == 2 ||
		     ' ' == buf[*start + 2]))
			printf("\\&");

		if ('\n' != buf[*start])
			putchar(last = buf[*start]);
		else
			putchar(last = ' ');
		(*start)++;
	}
}
/*
 * Decide what kind of list an =item belongs to from the item text in
 * buf[start, end): exactly "*" is a bullet list, exactly "1" or "1."
 * is an enumerated list, anything else is a tagged list.
 * These are taken straight from the POD manual.
 */
static enum list
listguess(const char *buf, size_t start, size_t end)
{
	size_t		 len = end - start;

	assert(end >= start);

	switch (len) {
	case (1):
		if ('*' == buf[start])
			return(LIST_BULLET);
		if ('1' == buf[start])
			return(LIST_ENUM);
		break;
	case (2):
		if ('1' == buf[start] && '.' == buf[start + 1])
			return(LIST_ENUM);
		break;
	default:
		break;
	}

	return(LIST_TAG);
}
/*
 * A command paragraph, as noted in the perlpod manual, just indicates
 * that we should do something, optionally with some text to print as
 * well.
 * buf[start] must be the leading '='; the paragraph ends at "end".
 * Unknown commands are ignored.
 * While paused (inside =begin), every command but =end is ignored.
 * Malformed list markup (=item without =over, =back without any
 * =item, stray =end) is tolerated instead of aborting.
 */
static void
command(struct state *st, const char *buf, size_t start, size_t end)
{
	size_t		 len, csz;
	enum cmd	 cmd;

	assert('=' == buf[start]);
	start++;
	len = end - start;
	csz = 0;

	/*
	 * Look up the command name.
	 * The name must be a whole word: "=format" is not "=for".
	 */
	for (cmd = 0; cmd < CMD__MAX; cmd++) {
		csz = strlen(cmds[cmd]);
		if (len < csz)
			continue;
		if (0 != memcmp(&buf[start], cmds[cmd], csz))
			continue;
		if (len == csz || ' ' == buf[start + csz] ||
		    '\t' == buf[start + csz] ||
		    '\n' == buf[start + csz])
			break;
	}

	/* Ignore bogus commands. */

	if (CMD__MAX == cmd)
		return;

	start += csz;
	skipspace(buf, &start, end);

	/* Inside =begin, only =end has any effect. */
	if (st->paused) {
		st->paused = CMD_END != cmd;
		return;
	}

	switch (cmd) {
	case (CMD_POD):
		break;
	case (CMD_HEAD1):
		/*
		 * The behaviour of head= follows from a quick glance at
		 * how pod2man handles it.
		 */
		printf(".Sh ");
		st->isname = 0;
		if (end - start == 4)
			if (0 == memcmp(&buf[start], "NAME", 4))
				st->isname = 1;
		formatcodeln(buf, &start, end, 1);
		putchar('\n');
		st->haspar = 1;
		break;
	case (CMD_HEAD2):
		printf(".Ss ");
		formatcodeln(buf, &start, end, 1);
		putchar('\n');
		st->haspar = 1;
		break;
	case (CMD_HEAD3):
		puts(".Pp");
		printf(".Em ");
		formatcodeln(buf, &start, end, 0);
		putchar('\n');
		puts(".Pp");
		st->haspar = 1;
		break;
	case (CMD_HEAD4):
		puts(".Pp");
		printf(".No ");
		formatcodeln(buf, &start, end, 0);
		putchar('\n');
		puts(".Pp");
		st->haspar = 1;
		break;
	case (CMD_OVER):
		/*
		 * If we have an existing list that hasn't had an =item
		 * yet, then make sure that we open it now.
		 * We use the default list type, but that can't be
		 * helped (we haven't seen any items yet).
		 */
		if (st->lpos > 0 &&
		    LIST__MAX == st->lstack[st->lpos - 1]) {
			st->lstack[st->lpos - 1] = LIST_TAG;
			puts(".Bl -tag -width Ds");
		}
		/*
		 * The nesting depth comes from the input, so this
		 * must be a real check rather than an assertion.
		 */
		if (LIST_STACKSZ == st->lpos) {
			fputs("pod2mdoc: =over nested too deeply\n",
			    stderr);
			exit(EXIT_FAILURE);
		}
		/* The list type is decided by the first =item. */
		st->lstack[st->lpos++] = LIST__MAX;
		break;
	case (CMD_ITEM):
		/*
		 * Bad markup: =item without =over.
		 * Compensate by opening a list implicitly.
		 */
		if (0 == st->lpos)
			st->lstack[st->lpos++] = LIST__MAX;
		/*
		 * If we're the first =item, guess at what our content
		 * will be: "*" is a bullet list, "1." is a numbered
		 * list, and everything is tagged.
		 */
		if (LIST__MAX == st->lstack[st->lpos - 1]) {
			st->lstack[st->lpos - 1] =
				listguess(buf, start, end);
			switch (st->lstack[st->lpos - 1]) {
			case (LIST_BULLET):
				puts(".Bl -bullet");
				break;
			case (LIST_ENUM):
				puts(".Bl -enum");
				break;
			default:
				puts(".Bl -tag -width Ds");
				break;
			}
		}
		switch (st->lstack[st->lpos - 1]) {
		case (LIST_TAG):
			printf(".It ");
			formatcodeln(buf, &start, end, 0);
			putchar('\n');
			break;
		case (LIST_ENUM):
			/* FALLTHROUGH */
		case (LIST_BULLET):
			/*
			 * Abandon the remainder of the paragraph
			 * because we're going to be a bulleted or
			 * numbered list.
			 */
			puts(".It");
			break;
		default:
			abort();
		}
		st->haspar = 1;
		break;
	case (CMD_BACK):
		/*
		 * Make sure we don't back over the stack.
		 * Only close what was actually opened: an =over that
		 * never saw an =item has not printed a "Bl".
		 */
		if (st->lpos > 0) {
			st->lpos--;
			if (LIST__MAX != st->lstack[st->lpos])
				puts(".El");
		}
		break;
	case (CMD_BEGIN):
		/*
		 * We disregard all types for now.
		 * TODO: process at least "text" in a -literal block.
		 */
		st->paused = 1;
		break;
	case (CMD_END):
		/* Stray =end without a matching =begin: ignore it. */
		break;
	case (CMD_FOR):
		/*
		 * We ignore all types of encodings and formats
		 * unilaterally.
		 */
		break;
	case (CMD_ENCODING):
		break;
	case (CMD_CUT):
		st->parsing = 0;
		return;
	default:
		abort();
	}

	/* Any command (but =cut) makes us start parsing. */
	st->parsing = 1;
}
/*
 * Pump out the paragraph buf[start, end) in a literal display.
 * Nothing is printed while we're not parsing or while paused.
 * Even in a literal display the text is still roff input, so
 * backslashes are written as "\e" and a '.' or '\'' in the first
 * column is protected with "\&" to keep it from being taken as a
 * macro or request.
 */
static void
verbatim(struct state *st, const char *buf, size_t start, size_t end)
{
	int		 last;

	if ( ! st->parsing || st->paused)
		return;

	puts(".Bd -literal");
	for (last = '\n'; start < end; start++) {
		if ('\n' == last &&
		    ('.' == buf[start] || '\'' == buf[start]))
			printf("\\&");
		if ('\\' == buf[start])
			printf("\\e");
		else
			putchar(buf[start]);
		last = buf[start];
	}
	putchar('\n');
	puts(".Ed");
}
/*
 * Ordinary paragraph.
 * Well, this is really the hardest--POD seems to assume that, for
 * example, a leading space implies a newline, and so on.
 * Lots of other snakes in the grass: escaping a newline followed by a
 * period (accidental mdoc(7) control), double-newlines after macro
 * passages, etc.
 * Nothing is printed while we're not parsing or while paused.
 */
static void
ordinary(struct state *st, const char *buf, size_t start, size_t end)
{
	int		 last;
	size_t		 i, j;

	if ( ! st->parsing || st->paused)
		return;

	/*
	 * Special-case: the NAME section.
	 * If we find a "-" followed by whitespace when searching from
	 * the end, assume that we're in "name - description" format.
	 * To wit, print out a "Nm" and "Nd" in that format.
	 * Requiring the whitespace keeps hyphenated words in the
	 * description ("well-known") from being taken as the
	 * separator.
	 */
	if (st->isname) {
		for (i = end - 1; i > start; i--)
			if ('-' == buf[i] && i + 1 < end &&
			    (' ' == buf[i + 1] ||
			     '\t' == buf[i + 1] ||
			     '\n' == buf[i + 1]))
				break;
		if (i > start) {
			j = i;
			/* Roll over multiple "-". */
			for ( ; i > start; i--)
				if ('-' != buf[i])
					break;
			/* FIXME: escape macro-like words etc. */
			printf(".Nm %.*s\n",
				(int)((i + 1) - start), &buf[start]);
			printf(".Nd %.*s\n",
				(int)(end - (j + 1)), &buf[j + 1]);
			return;
		}
	}

	/* Headings and list items already separate us from what came before. */
	if ( ! st->haspar)
		puts(".Pp");

	st->haspar = 0;
	last = '\n';

	while (start < end) {
		/*
		 * Loop til we get either to a newline or escape.
		 * Escape initial control characters.
		 */
		while (start < end) {
			if (start < end - 1 && '<' == buf[start + 1])
				break;
			else if ('\n' == buf[start])
				break;
			else if ('\n' == last &&
			    ('.' == buf[start] || '\'' == buf[start]))
				printf("\\&");
			putchar(last = buf[start++]);
		}

		/*
		 * The scan above stops only at a format code, at a
		 * newline, or at the end of the paragraph, so these
		 * two cases are all that is left to handle.
		 */
		if (start < end - 1 && '<' == buf[start + 1]) {
			/*
			 * We've encountered a format code.
			 * This is going to trigger a macro no matter
			 * what, so print a newline now.
			 * Then print the (possibly nested) macros and
			 * following that, a newline.
			 */
			if (formatcode(buf, &start, end, 0, last, 0))
				putchar(last = '\n');
		} else if (start < end && '\n' == buf[start]) {
			/*
			 * Print the newline only if we haven't already
			 * printed a newline.
			 */
			if (last != '\n')
				putchar(last = buf[start]);
			if (++start >= end)
				continue;
			/*
			 * If we have whitespace next, eat it to prevent
			 * mdoc(7) from thinking that it's meant for
			 * verbatim text.
			 * It is--but if we start with that, we can't
			 * have a macro subsequent it, which may be
			 * possible if we have an escape next.
			 */
			if (' ' == buf[start] || '\t' == buf[start]) {
				puts(".br");
				last = '\n';
			}
			for ( ; start < end; start++)
				if (' ' != buf[start] && '\t' != buf[start])
					break;
		}
	}

	if (last != '\n')
		putchar('\n');
}
/*
 * Dispatch one paragraph on its first character: blank or tab means
 * verbatim, "=" means a command, anything else is an ordinary
 * paragraph.
 * Empty paragraphs are dropped.
 */
static void
dopar(struct state *st, const char *buf, size_t start, size_t end)
{

	if (start == end)
		return;

	switch (buf[start]) {
	case (' '):
		/* FALLTHROUGH */
	case ('\t'):
		verbatim(st, buf, start, end);
		break;
	case ('='):
		command(st, buf, start, end);
		break;
	default:
		ordinary(st, buf, start, end);
		break;
	}
}
/*
 * Print the mdoc(7) prologue, then loop around paragraphs within a
 * document, processing each one in the POD way.
 * "fname" supplies the default title (its last path component), "tm"
 * the default date; both can be overridden through "args".
 * "buf" holds "sz" bytes of input and need not be NUL-terminated.
 */
static void
dofile(const struct args *args, const char *fname,
	const struct tm *tm, const char *buf, size_t sz)
{
	size_t		 sup, end, i, cur = 0;
	struct state	 st;
	const char	*section, *date;
	char		 datebuf[64];
	char		*title, *cp;

	if (0 == sz)
		return;

	/* Title is last path component of the filename. */

	if (NULL != args->title)
		title = strdup(args->title);
	else if (NULL != (cp = strrchr(fname, '/')))
		title = strdup(cp + 1);
	else
		title = strdup(fname);

	if (NULL == title) {
		perror(NULL);
		exit(EXIT_FAILURE);
	}

	/* Section is 1 unless suffix is "pm". */

	if (NULL == (section = args->section)) {
		section = "1";
		if (NULL != (cp = strrchr(title, '.'))) {
			*cp++ = '\0';
			if (0 == strcmp(cp, "pm"))
				section = "3p";
		}
	}

	/*
	 * Date.  Or the given "tm" if not supplied.
	 * Fall back on an empty date rather than handing strftime()
	 * a null pointer or printing an indeterminate buffer.
	 */

	if (NULL == (date = args->date)) {
		if (NULL == tm || 0 == strftime(datebuf,
		    sizeof(datebuf), "%B %d, %Y", tm))
			datebuf[0] = '\0';
		date = datebuf;
	}

	/* toupper() wants an unsigned char value, not a plain char. */
	for (cp = title; '\0' != *cp; cp++)
		*cp = toupper((unsigned char)*cp);

	/* The usual mdoc(7) preamble. */

	printf(".Dd %s\n", date);
	printf(".Dt %s %s\n", title, section);
	puts(".Os");

	free(title);

	memset(&st, 0, sizeof(struct state));
	assert(sz > 0);

	/* Main loop over file contents. */

	while (cur < sz) {
		/* Read until the blank line ending this paragraph. */
		for (i = cur + 1; i < sz; i++)
			if ('\n' == buf[i] && '\n' == buf[i - 1])
				break;

		if (i < sz) {
			/*
			 * The paragraph stops before its own newline.
			 * Any number of further blank lines is
			 * swallowed without becoming part of it.
			 */
			end = i - 1;
			for (sup = i + 1; sup < sz; sup++)
				if ('\n' != buf[sup])
					break;
		} else {
			/* Adjust end marker for EOF. */
			end = '\n' == buf[sz - 1] ? sz - 1 : sz;
			sup = sz;
		}

		/* Process paragraph and adjust start. */
		dopar(&st, buf, cur, end);
		cur = sup;
	}
}
/*
 * Read a single file fully into memory.
 * If the file is "-", do it from stdin.
 * If successfully read, send the input buffer to dofile() for further
 * processing.
 * The date handed to dofile() is the file's modification time, or the
 * current time for stdin or if fstat() fails.
 * Returns 1 on success and 0 on failure (after printing a message);
 * running out of memory exits the program.
 */
static int
readfile(const struct args *args, const char *fname)
{
	int		 fd, rc;
	char		*buf;
	size_t		 bufsz, cur;
	ssize_t		 ssz;
	struct tm	*tm;
	time_t		 ttm;
	struct stat	 st;

	assert(NULL != fname);

	rc = 0;
	buf = NULL;

	fd = 0 != strcmp("-", fname) ?
		open(fname, O_RDONLY, 0) : STDIN_FILENO;

	if (-1 == fd) {
		perror(fname);
		return(0);
	}

	if (STDIN_FILENO == fd || -1 == fstat(fd, &st)) {
		ttm = time(NULL);
		tm = localtime(&ttm);
	} else
		tm = localtime(&st.st_mtime);

	/* localtime() may fail; don't pass a null date along. */
	if (NULL == tm) {
		perror(fname);
		goto out;
	}

	/*
	 * Arbitrarily-sized initial buffer.
	 * Should be big enough for most files...
	 */
	cur = 0;
	bufsz = 1 << 14;
	if (NULL == (buf = malloc(bufsz))) {
		perror(NULL);
		exit(EXIT_FAILURE);
	}

	while ((ssz = read(fd, buf + cur, bufsz - cur)) > 0) {
		/*
		 * Double buffer size on fill, so that there is always
		 * room left for the next read.
		 */
		if ((size_t)ssz == bufsz - cur) {
			bufsz *= 2;
			if (NULL == (buf = realloc(buf, bufsz))) {
				perror(NULL);
				exit(EXIT_FAILURE);
			}
		}
		cur += (size_t)ssz;
	}
	if (ssz < 0) {
		perror(fname);
		goto out;
	}

	dofile(args, STDIN_FILENO == fd ?
		"STDIN" : fname, tm, buf, cur);
	rc = 1;
out:
	/* Release the buffer and the descriptor on every path. */
	free(buf);
	if (STDIN_FILENO != fd)
		close(fd);
	return(rc);
}
/*
 * Parse the command line and convert a single POD file (or standard
 * input if no file is given) to mdoc(7) on standard output.
 * Only -d, -n and -s have an effect; the remaining option letters are
 * accepted and ignored.
 */
int
main(int argc, char *argv[])
{
	const char	*fname, *name;
	struct args	 args;
	int		 c;

	/* Program name for the usage message: last component of argv[0]. */
	name = strrchr(argv[0], '/');
	if (name == NULL)
		name = argv[0];
	else
		++name;

	memset(&args, 0, sizeof(struct args));
	fname = "-";

	while (-1 != (c = getopt(argc, argv, "c:d:hln:oq:rs:uv")))
		switch (c) {
		case ('h'):
			/* FALLTHROUGH */
		case ('l'):
			/* FALLTHROUGH */
		case ('c'):
			/* FALLTHROUGH */
		case ('o'):
			/* FALLTHROUGH */
		case ('q'):
			/* FALLTHROUGH */
		case ('r'):
			/* FALLTHROUGH */
		case ('u'):
			/* FALLTHROUGH */
		case ('v'):
			/* Ignore these. */
			break;
		case ('d'):
			args.date = optarg;
			break;
		case ('n'):
			args.title = optarg;
			break;
		case ('s'):
			args.section = optarg;
			break;
		default:
			goto usage;
		}

	argc -= optind;
	argv += optind;

	/*
	 * Accept only a single input file.
	 * More than one operand is an error: don't silently fall back
	 * to reading standard input.
	 */
	if (argc > 1)
		goto usage;
	else if (1 == argc)
		fname = *argv;

	return(readfile(&args, fname) ?
		EXIT_SUCCESS : EXIT_FAILURE);

usage:
	fprintf(stderr, "usage: %s [-d date] "
		"[-n title] [-s section]\n", name);

	return(EXIT_FAILURE);
}