/* $Id: pod2mdoc.c,v 1.46 2015/02/14 15:34:39 schwarze Exp $ */
/*
* Copyright (c) 2014 Kristaps Dzonsons <kristaps@bsd.lv>
* Copyright (c) 2014, 2015 Ingo Schwarze <schwarze@openbsd.org>
*
* Permission to use, copy, modify, and distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
* copyright notice and this permission notice appear in all copies.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/
#include <sys/stat.h>
#include <sys/time.h>
#include <assert.h>
#include <ctype.h>
#include <fcntl.h>
#include <getopt.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include "dict.h"
/*
* In what section can we find Perl module manuals?
* Sometimes (Mac OS X) it's 3pm, sometimes (OpenBSD, etc.) 3p.
* This is the section printed for "Module::Name" style links and the
* default "Dt" section for input files with a ".pm" suffix.
* XXX IF YOU CHANGE THIS, CHANGE POD2MDOC.1 AS WELL.
*/
#define PERL_SECTION "3p"
/*
* Optional overrides for the mdoc(7) prologue.
* A NULL member means "not given"; dofile() then derives the value
* from the file name or the supplied time.
*/
struct args {
const char *title; /* override "Dt" title */
const char *date; /* override "Dd" date */
const char *section; /* override "Dt" section */
};
/*
* Kinds of lists kept on the list stack.
* LIST__MAX doubles as the marker for a list that was opened by =over
* but whose type is not known yet because no =item has been seen.
*/
enum list {
LIST_BULLET = 0,
LIST_ENUM,
LIST_TAG,
LIST__MAX
};
/*
* Sections that get special treatment; set by =head1 in command().
* Every other section is SECT_NONE.
*/
enum sect {
SECT_NONE = 0,
SECT_NAME, /* NAME section */
SECT_SYNOPSIS, /* SYNOPSIS section */
};
/*
* What kind of line the mdoc(7) output stream is currently on.
*/
enum outstate {
OUST_NL = 0, /* just started a new output line */
OUST_TXT, /* text line output in progress */
OUST_MAC /* macro line output in progress */
};
/*
* Parser and output state carried through the whole conversion.
*/
struct state {
const char *fname; /* file being parsed */
int parsing; /* after =cut or before command */
int paused; /* in =begin and before =end */
enum sect sect; /* which section are we in? */
#define LIST_STACKSZ 128
/* An entry of LIST__MAX is a list whose type is not yet known. */
enum list lstack[LIST_STACKSZ]; /* open lists */
size_t lpos; /* where in list stack */
int haspar; /* in paragraph: do we need Pp? */
enum outstate oust; /* state of the mdoc output stream */
int wantws; /* let mdoc(7) output whitespace here */
char *outbuf; /* text buffered for output */
size_t outbufsz; /* allocated size of outbuf */
size_t outbuflen; /* current length of outbuf */
};
/*
* POD formatting codes, in the same order as the letters in fmts[].
* FMT__MAX is also what a failed lookup yields (unknown code letter).
*/
enum fmt {
FMT_ITALIC,
FMT_BOLD,
FMT_CODE,
FMT_LINK,
FMT_ESCAPE,
FMT_FILE,
FMT_NBSP,
FMT_INDEX,
FMT_NULL,
FMT__MAX
};
/*
* POD command paragraphs, in the same order as the names in cmds[].
* CMD__MAX is also what a failed lookup yields (unknown command).
*/
enum cmd {
CMD_POD = 0,
CMD_HEAD1,
CMD_HEAD2,
CMD_HEAD3,
CMD_HEAD4,
CMD_OVER,
CMD_ITEM,
CMD_BACK,
CMD_BEGIN,
CMD_END,
CMD_FOR,
CMD_ENCODING,
CMD_CUT,
CMD__MAX
};
/*
* Command names without the leading '=', indexed by enum cmd.
* command() matches them as prefixes, in this order.
*/
static const char *const cmds[CMD__MAX] = {
"pod", /* CMD_POD */
"head1", /* CMD_HEAD1 */
"head2", /* CMD_HEAD2 */
"head3", /* CMD_HEAD3 */
"head4", /* CMD_HEAD4 */
"over", /* CMD_OVER */
"item", /* CMD_ITEM */
"back", /* CMD_BACK */
"begin", /* CMD_BEGIN */
"end", /* CMD_END */
"for", /* CMD_FOR */
"encoding", /* CMD_ENCODING */
"cut" /* CMD_CUT */
};
/*
* Letters introducing a formatting code, indexed by enum fmt.
*/
static const char fmts[FMT__MAX] = {
'I', /* FMT_ITALIC */
'B', /* FMT_BOLD */
'C', /* FMT_CODE */
'L', /* FMT_LINK */
'E', /* FMT_ESCAPE */
'F', /* FMT_FILE */
'S', /* FMT_NBSP */
'X', /* FMT_INDEX */
'Z' /* FMT_NULL */
};
/*
* The character most recently consumed or written; shared by the
* output helpers and the parsers to make spacing and escaping decisions.
*/
static unsigned char last;
/*
 * Enlarge the output buffer so that "by" more bytes will fit.
 * The size always advances in whole 128-byte chunks and always by
 * at least one chunk.
 * Exits the program if the memory cannot be obtained.
 */
static void
outbuf_grow(struct state *st, size_t by)
{
	const size_t	 chunks = by / 128 + 1;

	st->outbufsz += chunks * 128;
	if (NULL != (st->outbuf = realloc(st->outbuf, st->outbufsz)))
		return;

	perror(NULL);
	exit(EXIT_FAILURE);
}
static void
outbuf_addchar(struct state *st)
{
if (st->outbuflen + 2 >= st->outbufsz)
outbuf_grow(st, 1);
st->outbuf[st->outbuflen++] = last;
if ('\\' == last)
st->outbuf[st->outbuflen++] = 'e';
st->outbuf[st->outbuflen] = '\0';
}
/*
 * Append the NUL-terminated string "str" to the output buffer without
 * any escaping and remember its final character in "last".
 * An empty string is a no-op: there is no final character to record,
 * and looking at str[-1] would read outside the string.
 */
static void
outbuf_addstr(struct state *st, const char *str)
{
	size_t	 slen;

	slen = strlen(str);
	if (0 == slen)
		return;

	if (st->outbuflen + slen >= st->outbufsz)
		outbuf_grow(st, slen);

	/* Copy the terminator along with the text. */
	memcpy(st->outbuf + st->outbuflen, str, slen + 1);
	st->outbuflen += slen;
	last = (unsigned char)str[slen - 1];
}
/*
 * Write the buffered text to standard output and empty the buffer.
 * On a text line with wantws set, a blank is written first.
 * Writing onto a fresh line turns it into a text line; a macro line
 * stays a macro line.  Does nothing when the buffer is empty.
 */
static void
outbuf_flush(struct state *st)
{
	if (st->outbuflen > 0) {
		if (st->wantws && OUST_TXT == st->oust)
			putchar(' ');
		fputs(st->outbuf, stdout);

		st->outbuf[0] = '\0';
		st->outbuflen = 0;

		if (OUST_NL == st->oust)
			st->oust = OUST_TXT;
	}
}
/*
 * Finish the current output line unless we are already at the start
 * of one, and return to the OUST_NL/wantws state.
 */
static void
mdoc_newln(struct state *st)
{
	if (OUST_NL != st->oust) {
		putchar('\n');
		last = '\n';
		st->oust = OUST_NL;
		st->wantws = 1;
	}
}
/*
 * Given buf[*start] is at the start of an escape name, read til the end
 * of the escape ('>') then try to do something with it.
 * Sets start to be one after the '>', also when the name is too long
 * to be one we know.  If no '>' is found before "end", start is left
 * at "end" and nothing is output.
 *
 * This function does not care about output modes,
 * it merely appends text to the output buffer,
 * which can then be used in any mode.
 */
static void
formatescape(struct state *st, const char *buf, size_t *start, size_t end)
{
	char		 esc[16]; /* no more needed */
	size_t		 i, max;

	max = sizeof(esc) - 1;
	i = 0;
	/* Read til our buffer is full. */
	while (*start < end && '>' != buf[*start] && i < max)
		esc[i++] = buf[(*start)++];
	esc[i] = '\0';

	if (i == max) {
		/*
		 * Too long... skip til we end, and swallow the
		 * closing '>' so that it is not printed as text.
		 */
		while (*start < end && '>' != buf[*start])
			(*start)++;
		if (*start < end)
			(*start)++;
		return;
	} else if (*start >= end)
		return;

	assert('>' == buf[*start]);
	(*start)++;

	/*
	 * TODO: right now, we only recognise the named escapes.
	 * Just let the rest of them go.
	 */
	if (0 == strcmp(esc, "lt"))
		outbuf_addstr(st, "\\(la");
	else if (0 == strcmp(esc, "gt"))
		outbuf_addstr(st, "\\(ra");
	else if (0 == strcmp(esc, "verbar"))
		outbuf_addstr(st, "\\(ba");
	else if (0 == strcmp(esc, "sol"))
		outbuf_addstr(st, "\\(sl");
}
/*
 * Run some heuristics to intuit a link format and print the matching
 * macro ("Sx", "Lk", "Mt" or "Xr") with its arguments.
 * I set "start" to be the end of the sequence (last right-caret) so
 * that the caller can safely just continue processing.
 * If this is just an empty tag, I'll return 0 and print nothing;
 * otherwise 1 is returned.
 * "dsz" is the number of '<' that opened the code; if it is greater
 * than one, the code ends at a blank followed by that many '>'.
 *
 * Always operates in OUST_MAC mode.
 * Mode handling is done by the caller.
 */
static int
trylink(const char *buf, size_t *start, size_t end, size_t dsz)
{
	/*
	 * URL schemes.  The link target is cut at the first '/',
	 * so for a URL the part before it is exactly the scheme.
	 */
	static const char *const schemes[] = {
		"http:", "https:", "ftp:", "sftp:", "smb:", "afs:"
	};
	size_t		 linkstart, realend, linkend,
			 i, j, textsz, stack, slen;

	/*
	 * Scan to the start of the terminus.
	 * This function is more or less replicated in the formatcode()
	 * for null or index formatting codes.
	 * However, we're slightly different because we might have
	 * nested escapes we need to ignore.
	 */
	stack = 0;
	for (linkstart = realend = *start; realend < end; realend++) {
		if ('<' == buf[realend])
			stack++;
		if ('>' != buf[realend])
			continue;
		/*
		 * Only pop when something is open: decrementing an
		 * empty (unsigned) stack would wrap around and hide
		 * the real terminator from us.
		 */
		if (stack > 0) {
			stack--;
			continue;
		}
		if (dsz == 1)
			break;
		assert(realend > 0);
		if (' ' != buf[realend - 1])
			continue;
		for (i = realend, j = 0; i < end && j < dsz; j++)
			if ('>' != buf[i++])
				break;
		if (dsz == j)
			break;
	}

	/* Ignore stubs. */
	if (realend == end || realend == *start)
		return(0);

	/* Set linkend to the end of content. */
	linkend = dsz > 1 ? realend - 1 : realend;

	/* Re-scan to see if we have a title or section. */
	for (textsz = *start; textsz < linkend; textsz++)
		if ('|' == buf[textsz] || '/' == buf[textsz])
			break;

	if (textsz < linkend && '|' == buf[textsz]) {
		/* With title: set start, then end at section. */
		linkstart = textsz + 1;
		textsz = textsz - *start;
		for (i = linkstart; i < linkend; i++)
			if ('/' == buf[i])
				break;
		if (i < linkend)
			linkend = i;
	} else if (textsz < linkend && '/' == buf[textsz]) {
		/* With section: set end at section. */
		linkend = textsz;
		textsz = 0;
	} else
		/* No title, no section. */
		textsz = 0;

	*start = realend;
	j = linkend - linkstart;

	/* Do we have only subsection material? */
	if (0 == j && '/' == buf[linkend]) {
		linkstart = linkend + 1;
		linkend = dsz > 1 ? realend - 1 : realend;
		if (0 == (j = linkend - linkstart))
			return(0);
		printf("Sx %.*s", (int)j, &buf[linkstart]);
		return(1);
	} else if (0 == j)
		return(0);

	/*
	 * See if we qualify as being a link or not.
	 * Never compare more bytes than the scheme has.
	 */
	for (i = 0; i < sizeof(schemes) / sizeof(schemes[0]); i++) {
		slen = strlen(schemes[i]);
		if (j != slen ||
		    0 != memcmp(schemes[i], &buf[linkstart], slen))
			continue;
		/* Gross: print the whole URL, not just the scheme. */
		printf("Lk %.*s", (int)((dsz > 1 ? realend - 1 :
			realend) - linkstart), &buf[linkstart]);
		return(1);
	}

	/* See if we qualify as a mailto: only the prefix must match. */
	if (j > 6 && 0 == memcmp("mailto:", &buf[linkstart], 7)) {
		printf("Mt %.*s", (int)j, &buf[linkstart]);
		return(1);
	}

	/* See if we're a foo(5), foo(5x), or foo(5xx) manpage. */
	if ((j > 3 && ')' == buf[linkend - 1]) &&
	    ('(' == buf[linkend - 3])) {
		printf("Xr %.*s %c", (int)(j - 3),
			&buf[linkstart], buf[linkend - 2]);
		return(1);
	} else if ((j > 4 && ')' == buf[linkend - 1]) &&
	    ('(' == buf[linkend - 4])) {
		printf("Xr %.*s %.*s", (int)(j - 4),
			&buf[linkstart], 2, &buf[linkend - 3]);
		return(1);
	} else if ((j > 5 && ')' == buf[linkend - 1]) &&
	    ('(' == buf[linkend - 5])) {
		printf("Xr %.*s %.*s", (int)(j - 5),
			&buf[linkstart], 3, &buf[linkend - 4]);
		return(1);
	}

	/* Last try: do we have a double-colon? */
	for (i = linkstart + 1; i < linkend; i++)
		if (':' == buf[i] && ':' == buf[i - 1])
			break;

	if (i < linkend)
		printf("Xr %.*s " PERL_SECTION,
			(int)j, &buf[linkstart]);
	else
		printf("Xr %.*s 1", (int)j, &buf[linkstart]);

	return(1);
}
/*
 * Doclifting: if we're a bold "-xx" and we're in the SYNOPSIS section,
 * then it's likely that we're a flag.
 * Our flag might be followed by an argument, so make sure that we're
 * accounting for that, too.
 * If we don't have a flag at all, however, then assume we're an "Ar".
 *
 * On entry buf[*start] must be the '-' and at least one more character
 * must follow before "end".  Prints "Fl name " for every flag found
 * and "Ar " where an argument follows, and advances *start past what
 * was printed.  Input that ends before the closing delimiter is
 * treated as not being a flag instead of aborting.
 *
 * Always operates in OUST_MAC mode.
 * Mode handling is done by the caller.
 */
static void
dosynopsisfl(const char *buf, size_t *start, size_t end)
{
	size_t		 i;
	unsigned char	 c;

again:
	assert(*start + 1 < end);
	assert('-' == buf[*start]);

	/* The <ctype.h> functions need unsigned char values. */
	c = (unsigned char)buf[*start + 1];
	if ( ! isalnum(c) && '?' != c && '-' != c) {
		(*start)--;
		fputs("Ar ", stdout);
		return;
	}
	(*start)++;

	/* Find the end of the flag name. */
	for (i = *start; i < end; i++) {
		c = (unsigned char)buf[i];
		if ( ! isalnum(c) && '?' != c && '-' != c && '_' != c)
			break;
	}

	/* A flag name must be followed by a blank or the closing '>'. */
	if (i >= end || (' ' != buf[i] && '>' != buf[i])) {
		printf("Ar ");
		return;
	}

	printf("Fl ");
	/* Keep a name that looks like a macro from being parsed as one. */
	if (end - *start > 1 &&
	    isupper((unsigned char)buf[*start]) &&
	    islower((unsigned char)buf[*start + 1]) &&
	    (end - *start == 2 ||
	     ' ' == buf[*start + 2]))
		printf("\\&");
	printf("%.*s ", (int)(i - *start), &buf[*start]);
	*start = i;

	if (' ' == buf[i]) {
		while (i < end && ' ' == buf[i])
			i++;
		*start = i;
		/* Nothing but blanks was left. */
		if (i >= end)
			return;
		/* Another flag, provided something follows the '-'. */
		if ('-' == buf[i] && i + 1 < end)
			goto again;
		printf("Ar ");
	}
}
/*
* We're at the character in front of a format code, which is structured
* like X<...> and can contain nested format codes.
* This consumes the whole format code, and any nested format codes, til
* the end of matched production.
* If "nomacro", then we don't print any macros, just contained data
* (e.g., following "Sh" or "Nm").
* "pos" is only significant in SYNOPSIS, and should be 0 when invoked
* as the first format code on a line (for decoration as an "Nm"),
* non-zero otherwise.
*
* Returns 0 for escapes (E), ignored codes (Z and X) and unknown code
* letters, non-zero for all other codes.
*
* Output mode handling is most complicated here.
* We may enter in any mode.
* We usually exit in OUST_MAC mode, except when
* entering without OUST_MAC and the code is invalid.
*/
static int
formatcode(struct state *st, const char *buf, size_t *start,
size_t end, int nomacro, int pos)
{
size_t i, j, dsz;
enum fmt fmt;
int wantws;
unsigned char uc;
assert(*start + 1 < end);
assert('<' == buf[*start + 1]);
/*
* First, look up the format code.
* If it's not valid, treat it as a NOOP.
* An unknown letter leaves fmt at FMT__MAX.
*/
for (fmt = 0; fmt < FMT__MAX; fmt++)
if (buf[*start] == fmts[fmt])
break;
/*
* Determine whether we're overriding our delimiter.
* According to POD, if we have more than one '<' followed by a
* space, then we need a space followed by matching '>' to close
* the expression.
* Otherwise we use the usual '<' and '>' matched pair.
*/
i = *start + 1;
while (i < end && '<' == buf[i])
i++;
assert(i > *start + 1);
/* dsz is the number of opening '<' characters. */
dsz = i - (*start + 1);
if (dsz > 1 && (i >= end || ' ' != buf[i]))
dsz = 1;
/* Remember, if dsz>1, to jump the trailing space. */
*start += dsz + 1 + (dsz > 1 ? 1 : 0);
/*
* Escapes and ignored codes (NULL and INDEX) don't print macro
* sequences, so just output them like normal text before
* processing for real macros.
*/
if (FMT_ESCAPE == fmt) {
formatescape(st, buf, start, end);
return(0);
} else if (FMT_NULL == fmt || FMT_INDEX == fmt) {
/*
* Just consume til the end delimiter, accounting for
* whether it's a custom one.
*/
for ( ; *start < end; (*start)++) {
if ('>' != buf[*start])
continue;
else if (dsz == 1)
break;
assert(*start > 0);
if (' ' != buf[*start - 1])
continue;
i = *start;
for (j = 0; i < end && j < dsz; j++)
if ('>' != buf[i++])
break;
if (dsz != j)
continue;
(*start) += dsz;
break;
}
if (*start < end) {
assert('>' == buf[*start]);
(*start)++;
}
/* Avoid doubled whitespace around the dropped code. */
if (isspace(last))
while (*start < end && isspace((int)buf[*start]))
(*start)++;
return(0);
}
/*
* Check whether we're supposed to print macro stuff (this is
* suppressed in, e.g., "Nm" and "Sh" macros).
*/
if (FMT__MAX != fmt && !nomacro) {
/*
* We may already have wantws if there was whitespace
* before the code ("text B<text"), but initial
* whitespace inside our scope ("textB< text")
* allows to break at this point as well.
*/
wantws = ' ' == buf[*start] ||
(OUST_MAC == st->oust ? st->wantws : ! st->outbuflen);
/*
* If we are on a text line and there is no
* whitespace before our content, we have to make
* the previous word a prefix to the macro line.
* In the following, mdoc_newln() must not be used
* lest we clobber out output state.
*/
if (OUST_MAC != st->oust && ! wantws) {
if (OUST_NL != st->oust)
putchar('\n');
printf(".Pf ");
st->wantws = 0;
}
outbuf_flush(st);
/* Whitespace is easier to suppress on macro lines. */
if (OUST_MAC == st->oust && ! wantws)
printf(" Ns ");
/* Unless we are on a macro line, start one. */
if (OUST_MAC != st->oust && wantws) {
if (OUST_NL != st->oust)
putchar('\n');
putchar('.');
} else
putchar(' ');
/*
* Print the macro corresponding to this format code,
* and update the output state afterwards.
*/
switch (fmt) {
case (FMT_ITALIC):
printf("Em ");
break;
case (FMT_BOLD):
if (SECT_SYNOPSIS == st->sect) {
if (1 == dsz && '-' == buf[*start])
dosynopsisfl(buf, start, end);
else if (0 == pos)
printf("Nm ");
else
printf("Ar ");
break;
}
/*
* Outside the SYNOPSIS: measure the leading run of
* alphanumerics, underscores and blanks.  It only
* counts if it is directly followed by '=' or '>'.
*/
i = 0;
uc = buf[*start];
while (isalnum(uc) || '_' == uc || ' ' == uc)
uc = buf[*start + ++i];
if ('=' != uc && '>' != uc)
i = 0;
if (4 == i && ! strncmp(buf + *start, "NULL", 4)) {
printf("Dv ");
break;
}
/* Names registered in the dictionary get their own macro. */
switch (i ? dict_get(buf + *start, i) : MDOC_MAX) {
case MDOC_Fa:
printf("Fa ");
break;
case MDOC_Vt:
printf("Vt ");
break;
default:
printf("Sy ");
break;
}
break;
case (FMT_CODE):
printf("Qo Li ");
break;
case (FMT_LINK):
/* Try to link; use "No" if it's empty. */
if ( ! trylink(buf, start, end, dsz))
printf("No ");
break;
case (FMT_FILE):
printf("Pa ");
break;
case (FMT_NBSP):
printf("No ");
break;
default:
abort();
}
st->oust = OUST_MAC;
st->wantws = 1;
} else
outbuf_flush(st);
/*
* Process until we reach the end marker (e.g., '>') or until we
* find a nested format code.
* Don't emit any newlines: since we're on a macro line, we
* don't want to break the line.
*/
while (*start < end) {
if ('>' == buf[*start] && 1 == dsz) {
(*start)++;
break;
} else if ('>' == buf[*start] &&
' ' == buf[*start - 1]) {
/*
* Handle custom delimiters.
* These require a certain number of
* space-preceded carrots before we're really at
* the end.
*/
i = *start;
for (j = 0; i < end && j < dsz; j++)
if ('>' != buf[i++])
break;
if (dsz == j) {
*start += dsz;
break;
}
}
/* A capital letter followed by '<' starts a nested code. */
if (*start + 1 < end && '<' == buf[*start + 1] &&
'A' <= buf[*start] && 'Z' >= buf[*start]) {
if ( ! formatcode(st, buf, start, end, nomacro, 1))
st->wantws = 1;
continue;
}
/* Suppress newlines and multiple spaces. */
last = buf[(*start)++];
if (' ' == last || '\n' == last) {
putchar(' ');
while (*start < end && ' ' == buf[*start])
(*start)++;
continue;
}
if (OUST_MAC == st->oust && FMT__MAX != fmt) {
if ( ! st->wantws) {
printf(" Ns ");
st->wantws = 1;
}
/*
* Escape macro-like words.
* This matches "Xx " and "XxEOLN".
*/
if (end - *start > 0 &&
isupper((unsigned char)last) &&
islower((unsigned char)buf[*start]) &&
(end - *start == 1 ||
' ' == buf[*start + 1] ||
'>' == buf[*start + 1]))
printf("\\&");
}
putchar(last);
/* Protect against character escapes. */
if ('\\' == last)
putchar('e');
}
if ( ! nomacro && FMT_CODE == fmt)
printf(" Qc ");
/* Tell the caller whether the content ended in a blank. */
st->wantws = ' ' == last;
return(FMT__MAX != fmt);
}
/*
* Calls formatcode() til the end of a paragraph.
* Prints ".linemac " first, then the text between *start and "end"
* as arguments of that macro; *start is advanced as text is consumed.
* "nomacro" is handed through to formatcode().
* Must be entered in OUST_NL/wantws mode.
* Goes to OUST_MAC mode and stays there when returning,
* such that the caller can add arguments to the macro line
* before closing it out.
*/
static void
formatcodeln(struct state *st, const char *linemac,
const char *buf, size_t *start, size_t end, int nomacro)
{
int gotmacro, wantws;
assert(OUST_NL == st->oust);
assert(st->wantws);
printf(".%s ", linemac);
st->oust = OUST_MAC;
/* Non-zero right after formatcode() handled a code. */
gotmacro = 0;
while (*start < end) {
/* Collapse a blank or newline plus following blanks. */
wantws = ' ' == buf[*start] || '\n' == buf[*start];
if (wantws) {
last = ' ';
do {
(*start)++;
} while (*start < end && ' ' == buf[*start]);
}
/* A capital letter followed by '<' starts a format code. */
if (*start + 1 < end && '<' == buf[*start + 1] &&
'A' <= buf[*start] && 'Z' >= buf[*start]) {
st->wantws |= wantws;
gotmacro = formatcode(st, buf,
start, end, nomacro, 1);
continue;
}
/* Plain text after a code: go back to normal text, with or without a gap. */
if (gotmacro) {
if (*start < end || st->outbuflen) {
if (st->wantws ||
(wantws && !st->outbuflen))
printf(" No ");
else
printf(" Ns ");
}
gotmacro = 0;
}
outbuf_flush(st);
st->wantws = wantws;
if (*start >= end)
break;
if (st->wantws) {
putchar(' ');
st->wantws = 0;
}
/*
* Since we're already on a macro line, we want to make
* sure that we don't inadvertently invoke a macro.
* We need to do this carefully because section names
* are used in troff and we don't want to escape
* something that needn't be escaped.
*/
if (' ' == last && end - *start > 1 &&
isupper((unsigned char)buf[*start]) &&
islower((unsigned char)buf[*start + 1]) &&
(end - *start == 2 || ' ' == buf[*start + 2]))
printf("\\&");
putchar(last = buf[*start]);
/* Protect against character escapes. */
if ('\\' == last)
putchar('e');
(*start)++;
}
}
/*
 * Guess the list type from the text of the first =item, which spans
 * buf[start] up to but excluding buf[end].
 * Following the POD manual, exactly "*" means a bullet list, exactly
 * "1" or "1." means a numbered list, and anything else is a tagged list.
 * I don't know what people do in real life.
 */
static enum list
listguess(const char *buf, size_t start, size_t end)
{
	size_t		 len = end - start;

	assert(end >= start);

	switch (len) {
	case 1:
		if ('*' == buf[start])
			return(LIST_BULLET);
		if ('1' == buf[start])
			return(LIST_ENUM);
		break;
	case 2:
		if ('1' == buf[start] && '.' == buf[start + 1])
			return(LIST_ENUM);
		break;
	default:
		break;
	}

	return(LIST_TAG);
}
/*
* A command paragraph, as noted in the perlpod manual, just indicates
* that we should do something, optionally with some text to print as
* well.
* From the perspective of external callers,
* always stays in OUST_NL/wantws mode,
* but its children do use OUST_MAC.
*/
static void
command(struct state *st, const char *buf, size_t start, size_t end)
{
size_t len, csz;
enum cmd cmd;
assert('=' == buf[start]);
start++;
len = end - start;
for (cmd = 0; cmd < CMD__MAX; cmd++) {
csz = strlen(cmds[cmd]);
if (len < csz)
continue;
if (0 == memcmp(&buf[start], cmd[cmds], csz))
break;
}
/* Ignore bogus commands. */
if (CMD__MAX == cmd)
return;
start += csz;
while (start < end && ' ' == buf[start])
start++;
len = end - start;
if (st->paused) {
st->paused = CMD_END != cmd;
return;
}
switch (cmd) {
case (CMD_POD):
break;
case (CMD_HEAD1):
/*
* The behaviour of head= follows from a quick glance at
* how pod2man handles it.
*/
st->sect = SECT_NONE;
if (end - start == 4) {
if (0 == memcmp(&buf[start], "NAME", 4))
st->sect = SECT_NAME;
} else if (end - start == 8) {
if (0 == memcmp(&buf[start], "SYNOPSIS", 8))
st->sect = SECT_SYNOPSIS;
}
formatcodeln(st, "Sh", buf, &start, end, 1);
mdoc_newln(st);
st->haspar = 1;
break;
case (CMD_HEAD2):
formatcodeln(st, "Ss", buf, &start, end, 1);
mdoc_newln(st);
st->haspar = 1;
break;
case (CMD_HEAD3):
puts(".Pp");
formatcodeln(st, "Em", buf, &start, end, 0);
mdoc_newln(st);
puts(".Pp");
st->haspar = 1;
break;
case (CMD_HEAD4):
puts(".Pp");
formatcodeln(st, "No", buf, &start, end, 0);
mdoc_newln(st);
puts(".Pp");
st->haspar = 1;
break;
case (CMD_OVER):
/*
* If we have an existing list that hasn't had an =item
* yet, then make sure that we open it now.
* We use the default list type, but that can't be
* helped (we haven't seen any items yet).
*/
if (st->lpos > 0)
if (LIST__MAX == st->lstack[st->lpos - 1]) {
st->lstack[st->lpos - 1] = LIST_TAG;
puts(".Bl -tag -width Ds");
}
st->lpos++;
assert(st->lpos < LIST_STACKSZ);
st->lstack[st->lpos - 1] = LIST__MAX;
break;
case (CMD_ITEM):
if (0 == st->lpos) {
/*
* Bad markup.
* Try to compensate.
*/
st->lstack[st->lpos] = LIST__MAX;
st->lpos++;
}
assert(st->lpos > 0);
/*
* If we're the first =item, guess at what our content
* will be: "*" is a bullet list, "1." is a numbered
* list, and everything is tagged.
*/
if (LIST__MAX == st->lstack[st->lpos - 1]) {
st->lstack[st->lpos - 1] =
listguess(buf, start, end);
switch (st->lstack[st->lpos - 1]) {
case (LIST_BULLET):
puts(".Bl -bullet");
break;
case (LIST_ENUM):
puts(".Bl -enum");
break;
default:
puts(".Bl -tag -width Ds");
break;
}
}
switch (st->lstack[st->lpos - 1]) {
case (LIST_TAG):
formatcodeln(st, "It", buf, &start, end, 0);
mdoc_newln(st);
break;
case (LIST_ENUM):
/* FALLTHROUGH */
case (LIST_BULLET):
/*
* Abandon the remainder of the paragraph
* because we're going to be a bulletted or
* numbered list.
*/
puts(".It");
break;
default:
abort();
}
st->haspar = 1;
break;
case (CMD_BACK):
/* Make sure we don't back over the stack. */
if (st->lpos > 0) {
st->lpos--;
puts(".El");
}
break;
case (CMD_BEGIN):
/*
* We disregard all types for now.
* TODO: process at least "text" in a -literal block.
*/
st->paused = 1;
break;
case (CMD_FOR):
/*
* We ignore all types of encodings and formats
* unilaterally.
*/
break;
case (CMD_ENCODING):
break;
case (CMD_CUT):
st->parsing = 0;
return;
default:
abort();
}
/* Any command (but =cut) makes us start parsing. */
st->parsing = 1;
}
/*
 * Put the type at the start of "ptype" into the dictionary as MDOC_Vt.
 * A type is one word of alphanumerics and underscores.  After the
 * keywords "struct" and "enum", the following word is registered on
 * its own as well as together with its keyword.
 */
static void
register_type(const char *ptype)
{
	const char	*word_end, *name, *type_end;
	size_t		 kwlen;

	/* Measure the first word. */
	word_end = ptype;
	while ('_' == *word_end || isalnum((unsigned char)*word_end))
		word_end++;
	kwlen = word_end - ptype;
	type_end = word_end;

	if ((6 == kwlen && 0 == strncmp(ptype, "struct", 6)) ||
	    (4 == kwlen && 0 == strncmp(ptype, "enum", 4))) {
		/* The tag follows after any number of blanks. */
		name = word_end;
		while (' ' == *name)
			name++;
		type_end = name;
		while ('_' == *type_end ||
		    isalnum((unsigned char)*type_end))
			type_end++;
		if (type_end > name)
			dict_put(name, type_end - name, MDOC_Vt);
	}

	if (type_end > ptype)
		dict_put(ptype, type_end - ptype, MDOC_Vt);
}
/*
* Just pump out the line in a verbatim block.
* In the SYNOPSIS section, "#include <...>" lines become "In",
* other preprocessor lines become "Fd", and text that looks like a
* function declaration becomes "Ft"/"Fo"/"Fa"/"Fc"; everything else
* goes into a literal display.
* Note that "buf" is modified in place while a declaration is encoded.
* From the perspective of external callers,
* always stays in OUST_NL/wantws mode.
*/
static void
verbatim(struct state *st, char *buf, size_t start, size_t end)
{
/*
* Indices used while parsing a function declaration:
* ift: start of the return type, ifo: last blank before the
* argument list, ifa: the first '(', ifc: the ')' closing it,
* inl: the newline after the declaration (or "end").
* nopen counts currently open parentheses.
*/
size_t i, ift, ifo, ifa, ifc, inl;
char *cp, *cp2;
int nopen;
if ( ! st->parsing || st->paused || start == end)
return;
again:
/*
* If we're in the SYNOPSIS, see if we're an #include block.
* If we are, then print the "In" macro and re-loop.
* This handles any number of inclusions, but only when they
* come before the remaining parts...
*/
if (SECT_SYNOPSIS == st->sect) {
i = start;
while (i < end && buf[i] == ' ')
i++;
if (i == end)
return;
/* We're an include block! */
if (end - i > 10 &&
0 == memcmp(&buf[i], "#include <", 10)) {
start = i + 10;
while (start < end && ' ' == buf[start])
start++;
fputs(".In ", stdout);
/* Stop til the '>' marker or we hit eoln. */
while (start < end &&
'>' != buf[start] && '\n' != buf[start])
putchar(buf[start++]);
putchar('\n');
if (start < end && '>' == buf[start])
start++;
if (start < end && '\n' == buf[start])
start++;
goto again;
}
/* Other preprocessor directives. */
if ('#' == buf[i]) {
fputs(".Fd ", stdout);
start = i;
while(start < end && '\n' != buf[start])
putchar(buf[start++]);
putchar('\n');
if (start < end && '\n' == buf[start])
start++;
goto again;
}
/* Parse function declaration. */
ifo = ifa = ifc = 0;
inl = end;
nopen = 0;
for (ift = i; i < end; i++) {
/* Once the closing ')' is found, only look for the end of line. */
if (ifc) {
if (buf[i] != '\n')
continue;
inl = i;
break;
}
switch (buf[i]) {
case '\t':
/* FALLTHROUGH */
case ' ':
if ( ! ifa)
ifo = i;
break;
case '(':
/* A '(' with no blank before it: give up (ends the loop). */
if (ifo) {
nopen++;
if ( ! ifa)
ifa = i;
} else
i = end;
break;
case ')':
switch (nopen) {
case 0:
/* Unbalanced ')': give up (ends the loop). */
i = end;
break;
case 1:
ifc = i;
break;
default:
nopen--;
break;
}
break;
default:
break;
}
}
/* Encode function declaration. */
if (ifc) {
/* Join an argument list that spans several lines. */
for (i = ifa; i < ifc; i++)
if (buf[i] == '\n')
buf[i] = ' ';
/* Terminate the return type; ifo now points at the name. */
buf[ifo++] = '\0';
register_type(buf + ift);
printf(".Ft %s", buf + ift);
if (buf[ifo] == '*') {
fputs(" *", stdout);
ifo++;
}
putchar('\n');
/* Terminate the function name; ifa now points at the first argument. */
buf[ifa++] = '\0';
printf(".Fo %s\n", buf + ifo);
dict_put(buf + ifo, 0, MDOC_Fo);
/* Terminate the argument list. */
buf[ifc++] = '\0';
/* One "Fa" per comma-separated argument. */
for (;;) {
cp = strchr(buf + ifa, ',');
if (cp != NULL) {
cp2 = cp;
*cp++ = '\0';
} else
cp2 = strchr(buf + ifa, '\0');
/* Walk back over the trailing identifier: the argument name. */
while (isalnum((unsigned char)cp2[-1]) ||
'_' == cp2[-1])
cp2--;
if ('\0' != *cp2)
dict_put(cp2, 0, MDOC_Fa);
register_type(buf + ifa);
printf(".Fa \"%s\"\n", buf + ifa);
if (cp == NULL)
break;
while (*cp == ' ' || *cp == '\t')
cp++;
ifa = cp - buf;
}
puts(".Fc");
if (buf[ifc] == ';')
ifc++;
/* Print whatever trails the declaration on its line. */
if (ifc < inl) {
buf[inl] = '\0';
puts(buf + ifc);
}
start = inl + 1;
if (start < end)
goto again;
return;
}
}
puts(".Bd -literal");
for (last = ' '; start < end; start++) {
/*
* Handle accidental macros (newline starting with
* control character) and escapes.
*/
if ('\n' == last)
if ('.' == buf[start] || '\'' == buf[start])
printf("\\&");
putchar(last = buf[start]);
if ('\\' == buf[start])
printf("e");
}
putchar(last = '\n');
puts(".Ed");
}
/*
 * Helper for dosynopsisop(): scan buf[start] up to but excluding
 * buf[end] for a ']' that does not close a '[' opened within that
 * range.  Returns 1 if there is one, 0 otherwise.
 */
static int
hasmatch(const char *buf, size_t start, size_t end)
{
	size_t		 depth, pos;

	depth = 0;
	pos = start;
	while (pos < end) {
		switch (buf[pos++]) {
		case '[':
			depth++;
			break;
		case ']':
			if (0 == depth)
				return(1);
			depth--;
			break;
		default:
			break;
		}
	}
	return(0);
}
/*
 * If we're in the SYNOPSIS section and we've encountered brackets in
 * an ordinary paragraph, then try to see whether we're an [-option].
 * Do this, if we're an opening bracket, by first seeing if we have a
 * matching end via hasmatch().
 * If we're an ending bracket, see if we have a stack already.
 *
 * buf[*start] must be '[' or ']'; "opstack" counts the currently open
 * "Oo" blocks.  On success, ".Oo" or ".Oc" is printed on a line of its
 * own, *start is moved past the bracket and the blanks following it
 * (but never past "end"), and 1 is returned.
 * Otherwise nothing changes and 0 is returned.
 */
static int
dosynopsisop(struct state *st, const char *buf,
	size_t *start, size_t end, size_t *opstack)
{
	assert('[' == buf[*start] || ']' == buf[*start]);

	if ('[' == buf[*start]) {
		/* Only open what will be closed again. */
		if ( ! hasmatch(buf, *start + 1, end))
			return(0);
		mdoc_newln(st);
		puts(".Oo");
		(*opstack)++;
	} else {
		/* Only close what we opened. */
		if (0 == *opstack)
			return(0);
		mdoc_newln(st);
		puts(".Oc");
		(*opstack)--;
	}

	(*start)++;
	last = '\n';
	while (*start < end && ' ' == buf[*start])
		(*start)++;
	return(1);
}
/*
 * Format multiple "Nm" manpage names in the NAME section.
 * The names are separated by commas; each one goes on an "Nm" line of
 * its own, followed by " ," unless it is the last.
 * If there is nothing but blanks, ".Nm unknown" is printed.
 * From the perspective of external callers,
 * always stays in OUST_NL/wantws mode,
 * but its children do use OUST_MAC.
 */
static void
donamenm(struct state *st, const char *buf, size_t *start, size_t end)
{
	size_t		 comma;

	assert(OUST_NL == st->oust);
	assert(st->wantws);

	for ( ; *start < end; (*start)++)
		if (' ' != buf[*start])
			break;

	if (*start == end) {
		puts(".Nm unknown");
		return;
	}

	while (*start < end) {
		/* Locate the end of this name. */
		comma = *start;
		while (comma < end && ',' != buf[comma])
			comma++;

		formatcodeln(st, "Nm", buf, start, comma, 1);
		if (*start == end) {
			mdoc_newln(st);
			break;
		}

		/* More names follow: add the comma and move on. */
		assert(',' == buf[*start]);
		printf(" ,");
		mdoc_newln(st);
		for ((*start)++; *start < end; (*start)++)
			if (' ' != buf[*start])
				break;
	}
}
/*
* Ordinary paragraph.
* Well, this is really the hardest--POD seems to assume that, for
* example, a leading space implies a newline, and so on.
* Lots of other snakes in the grass: escaping a newline followed by a
* period (accidental mdoc(7) control), double-newlines after macro
* passages, etc.
*
* The paragraph spans buf[start] up to but excluding buf[end].
* Nothing is done unless we are parsing and not paused.
*
* Uses formatcode() to go to OUST_MAC mode
* and outbuf_flush() to go to OUST_TXT mode.
* In text mode, wantws requests white space before the text
* currently contained in the outbuf, not before upcoming text.
* Must make sure to go back to OUST_NL/wantws mode before returning.
*/
static void
ordinary(struct state *st, const char *buf, size_t start, size_t end)
{
size_t i, j, opstack, wend;
enum mdoc_type mtype;
int eos, noeos, seq;
if ( ! st->parsing || st->paused)
return;
/*
* Special-case: the NAME section.
* If we find a "-" when searching from the end, assume that
* we're in "name - description" format.
* To wit, print out a "Nm" and "Nd" in that format.
*/
if (SECT_NAME == st->sect) {
for (i = end - 2; i > start; i--)
if ('-' == buf[i] && ' ' == buf[i + 1])
break;
if ('-' == buf[i]) {
/* j remembers the last dash; i moves back to the end of the names. */
j = i;
/* Roll over multiple "-". */
for ( ; i > start; i--)
if ('-' != buf[i])
break;
donamenm(st, buf, &start, i + 1);
start = j + 1;
while (start < end && ' ' == buf[start])
start++;
formatcodeln(st, "Nd", buf, &start, end, 1);
mdoc_newln(st);
return;
}
}
if ( ! st->haspar)
puts(".Pp");
st->haspar = 0;
last = '\n';
/* Number of "Oo" blocks currently open in the SYNOPSIS. */
opstack = 0;
/* seq is handed to formatcode() as "pos": 0 only for the first round. */
for (seq = 0; start < end; seq++) {
/*
* Loop til we get either to a newline or escape.
* Escape initial control characters.
*/
while (start < end) {
if (start < end - 1 && '<' == buf[start + 1] &&
'A' <= buf[start] && 'Z' >= buf[start])
break;
else if ('\n' == buf[start])
break;
else if ('\n' == last && '.' == buf[start])
outbuf_addstr(st, "\\&");
else if ('\n' == last && '\'' == buf[start])
outbuf_addstr(st, "\\&");
/*
* If we're in the SYNOPSIS, have square
* brackets indicate that we're opening and
* closing an optional context.
*/
if (SECT_SYNOPSIS == st->sect &&
('[' == buf[start] ||
']' == buf[start]) &&
dosynopsisop(st, buf,
&start, end, &opstack))
continue;
/* Merely buffer non-whitespace. */
last = buf[start++];
if ( ! isspace(last))
outbuf_addchar(st);
if (start < end &&
! isspace((unsigned char)buf[start]))
continue;
/*
* Found the end of a word.
* Rewind trailing delimiters.
* eos: saw '.', '!' or '?'; noeos: saw '|', ',', ';' or ':'.
* Quotes and closing brackets are skipped without effect.
*/
eos = noeos = 0;
for (wend = st->outbuflen; wend; wend--)
if ('.' == st->outbuf[wend - 1] ||
'!' == st->outbuf[wend - 1] ||
'?' == st->outbuf[wend - 1])
eos = 1;
else if ('|' == st->outbuf[wend - 1] ||
',' == st->outbuf[wend - 1] ||
';' == st->outbuf[wend - 1] ||
':' == st->outbuf[wend - 1])
noeos = 1;
else if ('\'' != st->outbuf[wend - 1] &&
'"' != st->outbuf[wend - 1] &&
')' != st->outbuf[wend - 1] &&
']' != st->outbuf[wend - 1])
break;
eos &= ! noeos;
/*
* Detect function names.
* A word ending in "()" is looked up in the dictionary:
* a known function becomes "Fn", an unknown word "Xr".
*/
mtype = MDOC_Fa;
if (wend && ')' == st->outbuf[wend] &&
'(' == st->outbuf[wend - 1]) {
mtype = dict_get(st->outbuf, --wend);
if (MDOC_Fo == mtype || MDOC_MAX == mtype) {
/* Cut off "()" and what follows; it is printed separately below. */
st->outbuflen = wend;
st->outbuf[wend] = '\0';
mdoc_newln(st);
if (MDOC_Fo == mtype)
fputs(".Fn ", stdout);
else
fputs(".Xr ", stdout);
st->oust = OUST_MAC;
}
}
/*
* On whitespace, flush the output buffer
* and allow breaking to a macro line.
*/
outbuf_flush(st);
/*
* End macro lines, and
* end text lines at the end of sentences.
* NOTE(review): st->outbuf is read here after the flush;
* this appears to rely on the flush only clearing the
* first byte, so the old contents are still in place.
*/
if (OUST_MAC == st->oust || (eos && wend > 1 &&
islower((unsigned char)st->outbuf[wend - 1]))) {
if (MDOC_MAX == mtype)
fputs(" 3", stdout);
/* Print the punctuation after "()" as separate arguments. */
if (MDOC_Fa != mtype)
for (wend += 2;
'\0' != st->outbuf[wend];
wend++)
printf(" %c",
st->outbuf[wend]);
mdoc_newln(st);
}
/* Advance to the next word. */
while ('\n' != buf[start] &&
isspace((unsigned char)buf[start]))
start++;
st->wantws = 1;
}
if (start < end - 1 && '<' == buf[start + 1] &&
'A' <= buf[start] && 'Z' >= buf[start]) {
formatcode(st, buf, &start, end, 0, seq);
if (OUST_MAC == st->oust) {
/*
* Let mdoc(7) handle trailing punctuation.
* XXX Some punctuation characters
* are not handled yet.
*/
if ((start == end - 1 ||
(start < end - 1 &&
(' ' == buf[start + 1] ||
'\n' == buf[start + 1]))) &&
('.' == buf[start] ||
',' == buf[start])) {
putchar(' ');
putchar(buf[start++]);
}
if (st->wantws ||
' ' == buf[start] ||
'\n' == buf[start])
mdoc_newln(st);
/*
* Consume all whitespace
* so we don't accidentally start
* an implicit literal line.
*/
while (start < end && ' ' == buf[start])
start++;
/*
* Some text is following.
* Implement requested spacing.
*/
if ( ! st->wantws && start < end &&
('<' != buf[start + 1] ||
'A' > buf[start] ||
'Z' < buf[start])) {
printf(" Ns ");
st->wantws = 1;
}
}
} else if (start < end && '\n' == buf[start]) {
outbuf_flush(st);
mdoc_newln(st);
if (++start >= end)
continue;
/*
* If we have whitespace next, eat it to prevent
* mdoc(7) from thinking that it's meant for
* verbatim text.
* It is--but if we start with that, we can't
* have a macro subsequent it, which may be
* possible if we have an escape next.
*/
if (' ' == buf[start] || '\t' == buf[start])
puts(".br");
for ( ; start < end; start++)
if (' ' != buf[start] && '\t' != buf[start])
break;
}
}
outbuf_flush(st);
mdoc_newln(st);
}
/*
 * Dispatch one paragraph, buf[start] up to but excluding buf[end],
 * by its first character: a blank or tab means verbatim, "=" means
 * a command, and anything else is an ordinary paragraph.
 * Empty paragraphs are ignored.
 * Must be entered in OUST_NL/wantws mode.
 */
static void
dopar(struct state *st, char *buf, size_t start, size_t end)
{
	assert(OUST_NL == st->oust);
	assert(st->wantws);

	if (start == end)
		return;

	switch (buf[start]) {
	case ' ':
		/* FALLTHROUGH */
	case '\t':
		verbatim(st, buf, start, end);
		break;
	case '=':
		command(st, buf, start, end);
		break;
	default:
		ordinary(st, buf, start, end);
		break;
	}
}
/*
 * Print the mdoc(7) preamble (.Dd, .Dt, .Os) for one input file,
 * then loop around the paragraphs within the document, processing
 * each one in the POD way.
 *
 *   args   command line overrides for title, date, and section;
 *          each member may be NULL
 *   fname  name of the input file, used to derive the title and
 *          the section unless both are overridden
 *   tm     broken-down time used for the date unless a literal
 *          date is given in args->date
 *   buf    the complete file contents (need not be NUL-terminated)
 *   sz     number of valid bytes in buf; nothing at all is printed
 *          when this is zero
 */
static void
dofile(const struct args *args, const char *fname,
	const struct tm *tm, char *buf, size_t sz)
{
	char		 datebuf[64];
	struct state	 st;
	const char	*fbase, *fext, *section, *date, *format;
	char		*title, *cp;
	size_t		 sup, end, i, cur = 0;

	if (0 == sz)
		return;

	/*
	 * Parsing the filename is almost always required,
	 * except when both the title and the section
	 * are provided on the command line.
	 */

	fbase = fname;
	fext = NULL;
	if (NULL == args->title || NULL == args->section) {
		if (NULL != (fbase = strrchr(fname, '/')))
			fbase++;
		else
			fbase = fname;
		fext = strrchr(fbase, '.');
	}

	/*
	 * The title will be converted to uppercase,
	 * so it needs to be copied.
	 */

	if (NULL != args->title)
		title = strdup(args->title);
	else if (NULL != fext)
		title = strndup(fbase, fext - fbase);
	else
		title = strdup(fbase);

	if (NULL == title) {
		perror(NULL);
		exit(EXIT_FAILURE);
	}

	/* Section is 1 unless suffix is "pm". */

	section = (NULL != args->section) ? args->section :
	    (NULL == fext || strcmp(fext + 1, "pm")) ? "1" :
	    PERL_SECTION;

	/*
	 * Date.  Without an override, format the given "tm" in the
	 * conventional mdoc(7) style.  The special override "Mdocdate"
	 * formats the given "tm" as a Mdocdate keyword instead.
	 * Any other override is copied to the output verbatim.
	 *
	 * The keyword literal is deliberately split after the dollar
	 * sign: written in one piece, version control keyword
	 * expansion replaces the strftime(3) conversions with the
	 * date of the last commit to this very file.
	 */

	date = args->date;
	format = (NULL == date) ? "%B %d, %Y" :
	    strcmp(date, "Mdocdate") ? NULL :
	    "$" "Mdocdate: %B %d %Y $";
	if (NULL != format) {
		/*
		 * When strftime(3) returns 0, the buffer content is
		 * unspecified, so fall back to an empty date rather
		 * than printing garbage.  Do the same when no time
		 * is available at all.
		 */
		if (NULL == tm ||
		    0 == strftime(datebuf, sizeof(datebuf), format, tm))
			datebuf[0] = '\0';
		date = datebuf;
	}

	/*
	 * The <ctype.h> functions require an argument representable
	 * as unsigned char (or EOF); a plain char may be negative.
	 */

	for (cp = title; '\0' != *cp; cp++)
		*cp = toupper((unsigned char)*cp);

	/* The usual mdoc(7) preamble. */

	printf(".Dd %s\n", date);
	printf(".Dt %s %s\n", title, section);
	puts(".Os");

	free(title);

	dict_init();
	memset(&st, 0, sizeof(struct state));
	st.oust = OUST_NL;
	st.wantws = 1;

	/* Main loop over file contents. */

	while (cur < sz) {
		/* Read until next paragraph. */
		for (i = cur + 1; i < sz; i++)
			if ('\n' == buf[i] && '\n' == buf[i - 1]) {
				/* Consume blank paragraphs. */
				while (i + 1 < sz && '\n' == buf[i + 1])
					i++;
				break;
			}

		/*
		 * The paragraph is buf[cur..end).
		 * If a separator was found, i is its last newline,
		 * the paragraph ends just before the newline preceding
		 * it, and the next paragraph starts right after i.
		 * At EOF, drop one trailing newline, if there is one.
		 */
		end = i < sz ? i - 1 :
		    ('\n' == buf[sz - 1] ? sz - 1 : sz);
		sup = i < sz ? end + 2 : sz;

		/* Process paragraph and adjust start. */
		dopar(&st, buf, cur, end);
		cur = sup;
	}
	dict_destroy();
}
/*
 * Read a single file fully into memory.
 * If the file is "-", do it from stdin.
 * If successfully read, send the input buffer to dofile() for further
 * processing.
 *
 * The time passed on to dofile() is the modification time of the
 * file; the current time is used instead for stdin, when fstat(2)
 * fails, or when the modification time cannot be converted.
 *
 * Returns 1 on success or 0 if the file cannot be opened or read,
 * after reporting the problem on stderr.
 * Running out of memory terminates the program.
 */
static int
readfile(const struct args *args, const char *fname)
{
	int		 fd, isstdin, rc;
	char		*buf, *nbuf;
	size_t		 bufsz, cur;
	ssize_t		 ssz;
	struct tm	*tm;
	time_t		 ttm;
	struct stat	 st;

	/*
	 * Remember whether stdin was requested rather than comparing
	 * descriptors later on: if the process was started with
	 * descriptor 0 closed, open(2) may well return 0 for a
	 * regular file, which must still be named properly and closed.
	 */

	isstdin = 0 == strcmp("-", fname);
	fd = isstdin ? STDIN_FILENO : open(fname, O_RDONLY, 0);
	if (-1 == fd) {
		perror(fname);
		return(0);
	}

	/* localtime(3) may fail and return NULL, so check it. */

	tm = NULL;
	if ( ! isstdin && -1 != fstat(fd, &st))
		tm = localtime(&st.st_mtime);
	if (NULL == tm) {
		ttm = time(NULL);
		tm = localtime(&ttm);
	}
	if (NULL == tm) {
		perror(fname);
		if ( ! isstdin)
			close(fd);
		return(0);
	}

	/*
	 * Arbitrarily-sized initial buffer.
	 * Should be big enough for most files...
	 */

	cur = 0;
	bufsz = 1 << 14;
	if (NULL == (buf = malloc(bufsz))) {
		perror(NULL);
		exit(EXIT_FAILURE);
	}

	while ((ssz = read(fd, buf + cur, bufsz - cur)) > 0) {
		/* Double buffer size on fill. */
		if ((size_t)ssz == bufsz - cur) {
			bufsz *= 2;
			if (NULL == (nbuf = realloc(buf, bufsz))) {
				perror(NULL);
				exit(EXIT_FAILURE);
			}
			buf = nbuf;
		}
		cur += (size_t)ssz;
	}

	/*
	 * Report a read error before doing anything else that
	 * might clobber errno, then share the cleanup with the
	 * success path, such that the descriptor is not leaked.
	 */

	if (ssz < 0) {
		perror(fname);
		rc = 0;
	} else {
		dofile(args, isstdin ? "STDIN" : fname, tm, buf, cur);
		rc = 1;
	}

	free(buf);
	if ( ! isstdin)
		close(fd);
	return(rc);
}
/*
 * Program entry point.
 * Parse the command line, then convert at most one input file,
 * reading from standard input ("-") when no file is named.
 *
 * Options -d, -n, and -s override the date, the title, and the
 * section, respectively.  Options -c, -h, -l, -o, -q, -r, -u, and -v
 * are accepted (consuming an argument in the case of -c and -q) but
 * have no effect.
 * NOTE(review): presumably these are tolerated for command line
 * compatibility with other POD converters -- confirm.
 *
 * Exits with EXIT_SUCCESS if readfile() succeeds; exits with
 * EXIT_FAILURE if it fails or, after printing a usage message,
 * if the command line is invalid.
 */
int
main(int argc, char *argv[])
{
	struct args	 args;
	const char	*progname, *slash, *input;
	int		 ch, badopt;

	/* Only the last path component goes into the usage message. */
	slash = strrchr(argv[0], '/');
	progname = (NULL == slash) ? argv[0] : slash + 1;

	memset(&args, 0, sizeof(args));
	input = "-";
	badopt = 0;

	while (0 == badopt &&
	    -1 != (ch = getopt(argc, argv, "c:d:hln:oq:rs:uv"))) {
		switch (ch) {
		case 'd':
			args.date = optarg;
			break;
		case 'n':
			args.title = optarg;
			break;
		case 's':
			args.section = optarg;
			break;
		case 'c':
		case 'h':
		case 'l':
		case 'o':
		case 'q':
		case 'r':
		case 'u':
		case 'v':
			/* Accepted, but without any effect. */
			break;
		default:
			badopt = 1;
			break;
		}
	}

	if (0 == badopt) {
		argc -= optind;
		argv += optind;

		/* Accept only a single input file. */
		if (argc <= 1) {
			if (1 == argc)
				input = argv[0];
			return(readfile(&args, input) ?
			    EXIT_SUCCESS : EXIT_FAILURE);
		}
	}

	fprintf(stderr, "usage: %s [-d date] "
	    "[-n title] [-s section] [file]\n", progname);
	return(EXIT_FAILURE);
}