/* $Id: pod2mdoc.c,v 1.62 2016/11/03 15:50:28 schwarze Exp $ */
/*
 * Copyright (c) 2014 Kristaps Dzonsons
 * Copyright (c) 2014, 2015 Ingo Schwarze
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */
/*
 * NOTE(review): the ten directives below have lost their header names
 * (they look stripped by whatever produced this copy of the file).
 * The code in this file calls assert(), the <ctype.h> classifiers,
 * stdio, realloc()/exit()/abort() and the <string.h> functions, so at
 * least those headers are required; the full original list must be
 * restored from a pristine copy.
 */
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include "dict.h"

/*
 * In what section can we find Perl module manuals?
 * Sometimes (Mac OS X) it's 3pm, sometimes (OpenBSD, etc.) 3p.
 * XXX IF YOU CHANGE THIS, CHANGE POD2MDOC.1 AS WELL.
 */
#define PERL_SECTION "3p"

/* Overrides for the document prologue. */
struct args {
	const char *title; /* override "Dt" title */
	const char *date; /* override "Dd" date */
	const char *section; /* override "Dt" section */
};

/*
 * Kinds of mdoc(7) lists.
 * LIST__MAX doubles as "list seen in =over but not opened yet"
 * on the list stack (see command()).
 */
enum list {
	LIST_BULLET = 0,
	LIST_ENUM,
	LIST_TAG,
	LIST__MAX
};

/* Sections that get special treatment. */
enum sect {
	SECT_NONE = 0,
	SECT_NAME, /* NAME section */
	SECT_SYNOPSIS, /* SYNOPSIS section */
};

/* What the last thing written to the output stream was. */
enum outstate {
	OUST_NL = 0, /* just started a new output line */
	OUST_TXT, /* text line output in progress */
	OUST_MAC /* macro line output in progress */
};

/* Parser and output state for one input file. */
struct state {
	const char *fname; /* file being parsed */
	int parsing; /* after =cut or before command */
	int paused; /* in =begin and before =end */
	enum sect sect; /* which section are we in? */
#define LIST_STACKSZ 128
	enum list lstack[LIST_STACKSZ]; /* open lists */
	size_t lpos; /* where in list stack */
	int haspar; /* in paragraph: do we need Pp? */
	enum outstate oust; /* state of the mdoc output stream */
	int wantws; /* let mdoc(7) output whitespace here */
	char *outbuf; /* text buffered for output */
	size_t outbufsz; /* allocated size of outbuf */
	size_t outbuflen; /* current length of outbuf */
	size_t outlnlen; /* chars so far on this output line */
};

/*
 * POD formatting codes, in the same order as the fmts[] letter table.
 * FMT__MAX is used by formatcode() to mean "not a known code".
 */
enum fmt {
	FMT_ITALIC,
	FMT_BOLD,
	FMT_CODE,
	FMT_LINK,
	FMT_ESCAPE,
	FMT_FILE,
	FMT_NBSP,
	FMT_INDEX,
	FMT_NULL,
	FMT__MAX
};

/*
 * POD command paragraphs, in the same order as the cmds[] name table.
 * CMD__MAX is used by command() to mean "unknown command".
 */
enum cmd {
	CMD_POD = 0,
	CMD_HEAD1,
	CMD_HEAD2,
	CMD_HEAD3,
	CMD_HEAD4,
	CMD_OVER,
	CMD_ITEM,
	CMD_BACK,
	CMD_BEGIN,
	CMD_END,
	CMD_FOR,
	CMD_ENCODING,
	CMD_CUT,
	CMD__MAX
};

/* Forward declarations of the file-local functions. */
static void command(struct state *, const char *, size_t, size_t);
static void dofile(const struct args *, const char *,
	const struct tm *, char *, size_t);
static void donamenm(struct state *, const char *, size_t *, size_t);
static void dopar(struct state *, char *, size_t, size_t);
static void dosynopsisfl(const char *, size_t *, size_t);
static int dosynopsisop(struct state *, const char *, size_t *,
	size_t, size_t *);
static int formatcode(struct state *, const char *, size_t *,
	size_t, int, int);
static void formatcodeln(struct state *, const char *, const char *,
	size_t *, size_t, int);
static void formatescape(struct state *, const char *, size_t *,
	size_t);
static int hasmatch(const char *, size_t, size_t);
static void ordinary(struct state *, const char *, size_t, size_t);
static void outbuf_addchar(struct state *);
static void outbuf_addstr(struct state *, const char *);
static void outbuf_flush(struct state *);
static void outbuf_grow(struct state *, size_t);
static enum list listguess(const char *, size_t, size_t);
static void mdoc_newln(struct state *);
static int readfile(const struct args *, const char *);
static void register_type(const char *);
static int trylink(const char *, size_t *, size_t, size_t);
static void verbatim(struct state *, char *, size_t, size_t); static const char *const cmds[CMD__MAX] = { "pod", /* CMD_POD */ "head1", /* CMD_HEAD1 */ "head2", /* CMD_HEAD2 */ "head3", /* CMD_HEAD3 */ "head4", /* CMD_HEAD4 */ "over", /* CMD_OVER */ "item", /* CMD_ITEM */ "back", /* CMD_BACK */ "begin", /* CMD_BEGIN */ "end", /* CMD_END */ "for", /* CMD_FOR */ "encoding", /* CMD_ENCODING */ "cut" /* CMD_CUT */ }; static const char fmts[FMT__MAX] = { 'I', /* FMT_ITALIC */ 'B', /* FMT_BOLD */ 'C', /* FMT_CODE */ 'L', /* FMT_LINK */ 'E', /* FMT_ESCAPE */ 'F', /* FMT_FILE */ 'S', /* FMT_NBSP */ 'X', /* FMT_INDEX */ 'Z' /* FMT_NULL */ }; static unsigned char last; static void outbuf_grow(struct state *st, size_t by) { st->outbufsz += (by / 128 + 1) * 128; st->outbuf = realloc(st->outbuf, st->outbufsz); if (NULL == st->outbuf) { perror(NULL); exit(EXIT_FAILURE); } } static void outbuf_addchar(struct state *st) { if (st->outbuflen + 2 >= st->outbufsz) outbuf_grow(st, 1); st->outbuf[st->outbuflen++] = last; if ('\\' == last) st->outbuf[st->outbuflen++] = 'e'; st->outbuf[st->outbuflen] = '\0'; } static void outbuf_addstr(struct state *st, const char *str) { size_t slen; slen = strlen(str); if (st->outbuflen + slen >= st->outbufsz) outbuf_grow(st, slen); memcpy(st->outbuf + st->outbuflen, str, slen+1); st->outbuflen += slen; last = str[slen - 1]; } static void outbuf_flush(struct state *st) { if (0 == st->outbuflen) return; st->outlnlen += st->outbuflen; if (OUST_TXT == st->oust && st->wantws) { if (++st->outlnlen > 72) { putchar('\n'); st->oust = OUST_NL; st->outlnlen = st->outbuflen; } } if (OUST_NL != st->oust && st->wantws) putchar(' '); if (OUST_MAC == st->oust && '"' == *st->outbuf) printf("\\(dq%s", st->outbuf + 1); else fputs(st->outbuf, stdout); *st->outbuf = '\0'; st->outbuflen = 0; if (OUST_NL == st->oust) st->oust = OUST_TXT; } static void mdoc_newln(struct state *st) { if (OUST_NL == st->oust) return; putchar('\n'); last = '\n'; st->oust = OUST_NL; st->outlnlen 
= 0; st->wantws = 1; } /* * Given buf[*start] is at the start of an escape name, read til the end * of the escape ('>') then try to do something with it. * Sets start to be one after the '>'. * * This function does not care about output modes, * it merely appends text to the output buffer, * which can then be used in any mode. */ static void formatescape(struct state *st, const char *buf, size_t *start, size_t end) { char esc[16]; /* no more needed */ size_t i, max; max = sizeof(esc) - 1; i = 0; /* Read til our buffer is full. */ while (*start < end && '>' != buf[*start] && i < max) esc[i++] = buf[(*start)++]; esc[i] = '\0'; if (i == max) { /* Too long... skip til we end. */ while (*start < end && '>' != buf[*start]) (*start)++; return; } else if (*start >= end) return; assert('>' == buf[*start]); (*start)++; /* * TODO: right now, we only recognise the named escapes. * Just let the rest of them go. */ if (0 == strcmp(esc, "lt")) outbuf_addstr(st, "\\(la"); else if (0 == strcmp(esc, "gt")) outbuf_addstr(st, "\\(ra"); else if (0 == strcmp(esc, "verbar")) outbuf_addstr(st, "\\(ba"); else if (0 == strcmp(esc, "sol")) outbuf_addstr(st, "\\(sl"); } /* * Run some heuristics to intuit a link format. * I set "start" to be the end of the sequence (last right-carrot) so * that the caller can safely just continue processing. * If this is just an empty tag, I'll return 0. * * Always operates in OUST_MAC mode. * Mode handling is done by the caller. */ static int trylink(const char *buf, size_t *start, size_t end, size_t dsz) { size_t linkstart, realend, linkend, i, j, textsz, stack; /* * Scan to the start of the terminus. * This function is more or less replicated in the formatcode() * for null or index formatting codes. * However, we're slightly different because we might have * nested escapes we need to ignore. 
*/ stack = 0; for (linkstart = realend = *start; realend < end; realend++) { if ('<' == buf[realend]) stack++; if ('>' != buf[realend]) continue; else if (stack-- > 0) continue; if (dsz == 1) break; assert(realend > 0); if (' ' != buf[realend - 1]) continue; for (i = realend, j = 0; i < end && j < dsz; j++) if ('>' != buf[i++]) break; if (dsz == j) break; } /* Ignore stubs. */ if (realend == end || realend == *start) return(0); /* Set linkend to the end of content. */ linkend = dsz > 1 ? realend - 1 : realend; /* Re-scan to see if we have a title or section. */ for (textsz = *start; textsz < linkend; textsz++) if ('|' == buf[textsz] || '/' == buf[textsz]) break; if (textsz < linkend && '|' == buf[textsz]) { /* With title: set start, then end at section. */ linkstart = textsz + 1; textsz = textsz - *start; for (i = linkstart; i < linkend; i++) if ('/' == buf[i]) break; if (i < linkend) linkend = i; } else if (textsz < linkend && '/' == buf[textsz]) { /* With section: set end at section. */ linkend = textsz; textsz = 0; } else /* No title, no section. */ textsz = 0; *start = realend; j = linkend - linkstart; /* Do we have only subsection material? */ if (0 == j && '/' == buf[linkend]) { linkstart = linkend + 1; linkend = dsz > 1 ? realend - 1 : realend; if (0 == (j = linkend - linkstart)) return(0); printf("Sx %.*s", (int)j, &buf[linkstart]); return(1); } else if (0 == j) return(0); /* See if we qualify as being a link or not. */ if ((j > 4 && 0 == memcmp("http:", &buf[linkstart], j)) || (j > 5 && 0 == memcmp("https:", &buf[linkstart], j)) || (j > 3 && 0 == memcmp("ftp:", &buf[linkstart], j)) || (j > 4 && 0 == memcmp("sftp:", &buf[linkstart], j)) || (j > 3 && 0 == memcmp("smb:", &buf[linkstart], j)) || (j > 3 && 0 == memcmp("afs:", &buf[linkstart], j))) { /* Gross. */ printf("Lk %.*s", (int)((dsz > 1 ? realend - 1 : realend) - linkstart), &buf[linkstart]); return(1); } /* See if we qualify as a mailto. 
*/ if (j > 6 && 0 == memcmp("mailto:", &buf[linkstart], j)) { printf("Mt %.*s", (int)j, &buf[linkstart]); return(1); } /* See if we're a foo(5), foo(5x), or foo(5xx) manpage. */ if ((j > 3 && ')' == buf[linkend - 1]) && ('(' == buf[linkend - 3])) { printf("Xr %.*s %c", (int)(j - 3), &buf[linkstart], buf[linkend - 2]); return(1); } else if ((j > 4 && ')' == buf[linkend - 1]) && ('(' == buf[linkend - 4])) { printf("Xr %.*s %.*s", (int)(j - 4), &buf[linkstart], 2, &buf[linkend - 3]); return(1); } else if ((j > 5 && ')' == buf[linkend - 1]) && ('(' == buf[linkend - 5])) { printf("Xr %.*s %.*s", (int)(j - 5), &buf[linkstart], 3, &buf[linkend - 4]); return(1); } /* Last try: do we have a double-colon? */ for (i = linkstart + 1; i < linkend; i++) if (':' == buf[i] && ':' == buf[i - 1]) break; if (i < linkend) printf("Xr %.*s " PERL_SECTION, (int)j, &buf[linkstart]); else printf("Xr %.*s 1", (int)j, &buf[linkstart]); return(1); } /* * Doclifting: if we're a bold "-xx" and we're in the SYNOPSIS section, * then it's likely that we're a flag. * Our flag might be followed by an argument, so make sure that we're * accounting for that, too. * If we don't have a flag at all, however, then assume we're an "Ar". * * Always operates in OUST_MAC mode. * Mode handlinf is done by the caller. */ static void dosynopsisfl(const char *buf, size_t *start, size_t end) { size_t i; again: assert(*start + 1 < end); assert('-' == buf[*start]); if ( ! isalnum((int)buf[*start + 1]) && '?' != buf[*start + 1] && '-' != buf[*start + 1]) { (*start)--; fputs("Ar", stdout); return; } (*start)++; for (i = *start; i < end; i++) if (isalnum((int)buf[i])) continue; else if ('?' == buf[i]) continue; else if ('-' == buf[i]) continue; else if ('_' == buf[i]) continue; else break; assert(i < end); if ( ! 
(' ' == buf[i] || '>' == buf[i])) { fputs("Ar", stdout); return; } printf("Fl "); if (end - *start > 1 && isupper((int)buf[*start]) && islower((int)buf[*start + 1]) && (end - *start == 2 || ' ' == buf[*start + 2])) printf("\\&"); printf("%.*s", (int)(i - *start), &buf[*start]); *start = i; if (' ' == buf[i]) { while (i < end && ' ' == buf[i]) i++; assert(i < end); if ('-' == buf[i]) { *start = i; goto again; } fputs("Ar", stdout); *start = i; } } /* * We're at the character in front of a format code, which is structured * like X<...> and can contain nested format codes. * This consumes the whole format code, and any nested format codes, til * the end of matched production. * If "nomacro", then we don't print any macros, just contained data * (e.g., following "Sh" or "Nm"). * "pos" is only significant in SYNOPSIS, and should be 0 when invoked * as the first format code on a line (for decoration as an "Nm"), * non-zero otherwise. * * Output mode handling is most complicated here. * We may enter in any mode. * We usually exit in OUST_MAC mode, except when * entering without OUST_MAC and the code is invalid. */ static int formatcode(struct state *st, const char *buf, size_t *start, size_t end, int nomacro, int pos) { size_t i, j, dsz; enum fmt fmt; unsigned char uc; int gotmacro, wantws; assert(*start + 1 < end); assert('<' == buf[*start + 1]); /* * First, look up the format code. * If it's not valid, treat it as a NOOP. */ for (fmt = 0; fmt < FMT__MAX; fmt++) if (buf[*start] == fmts[fmt]) break; /* * Determine whether we're overriding our delimiter. * According to POD, if we have more than one '<' followed by a * space, then we need a space followed by matching '>' to close * the expression. * Otherwise we use the usual '<' and '>' matched pair. */ i = *start + 1; while (i < end && '<' == buf[i]) i++; assert(i > *start + 1); dsz = i - (*start + 1); if (dsz > 1 && (i >= end || ' ' != buf[i])) dsz = 1; /* Remember, if dsz>1, to jump the trailing space. 
*/
	*start += dsz + 1 + (dsz > 1 ? 1 : 0);

	/*
	 * Escapes and ignored codes (NULL and INDEX) don't print macro
	 * sequences, so just output them like normal text before
	 * processing for real macros.
	 */
	if (FMT_ESCAPE == fmt) {
		formatescape(st, buf, start, end);
		return(0);
	} else if (FMT_NULL == fmt || FMT_INDEX == fmt) {
		/*
		 * Just consume til the end delimiter, accounting for
		 * whether it's a custom one.
		 */
		for ( ; *start < end; (*start)++) {
			if ('>' != buf[*start])
				continue;
			else if (dsz == 1)
				break;
			assert(*start > 0);
			if (' ' != buf[*start - 1])
				continue;
			/* Count the consecutive closing carrots. */
			i = *start;
			for (j = 0; i < end && j < dsz; j++)
				if ('>' != buf[i++])
					break;
			if (dsz != j)
				continue;
			(*start) += dsz;
			break;
		}
		/*
		 * NOTE(review): after a custom delimiter, *start has
		 * already been moved beyond all closing carrots, so the
		 * assertion below seems to test the character following
		 * the code rather than the delimiter - to be confirmed.
		 */
		if (*start < end) {
			assert('>' == buf[*start]);
			(*start)++;
		}
		/* Do not leave two whitespace gaps around a hidden code. */
		if (isspace(last))
			while (*start < end && isspace((int)buf[*start]))
				(*start)++;
		return(0);
	}

	/*
	 * Check whether we're supposed to print macro stuff (this is
	 * suppressed in, e.g., "Nm" and "Sh" macros).
	 */
	if (FMT__MAX != fmt && !nomacro) {
		/*
		 * Do we need spacing before the upcoming macro,
		 * after any pending text already in the outbuf?
		 * We may already have wantws if there was whitespace
		 * before the code ("text Bwantws && ! st->outbuflen);
		 *
		 * NOTE(review): the line above looks damaged; part of
		 * this comment and apparently a statement are missing.
		 * No assignment to the local "wantws" is visible before
		 * it is read below - restore it from a pristine copy.
		 */
		/*
		 * If we are on a text line and there is no
		 * whitespace before our content, we have to make
		 * the previous word a prefix to the macro line.
		 */
		if (OUST_MAC != st->oust && ! wantws) {
			if (OUST_NL != st->oust)
				mdoc_newln(st);
			fputs(".Pf", stdout);
			st->oust = OUST_MAC;
			st->wantws = wantws = 1;
		}

		outbuf_flush(st);

		/* Whitespace is easier to suppress on macro lines. */
		if (OUST_MAC == st->oust && ! wantws)
			printf(" Ns");

		/* Unless we are on a macro line, start one. */
		if (OUST_MAC != st->oust) {
			if (OUST_NL != st->oust)
				mdoc_newln(st);
			putchar('.');
			st->oust = OUST_MAC;
		} else
			putchar(' ');
		st->wantws = 1;

		/*
		 * Print the macro corresponding to this format code,
		 * and update the output state afterwards.
		 */
		switch (fmt) {
		case (FMT_BOLD):
			/* In the SYNOPSIS, bold means Fl, Nm or Ar. */
			if (SECT_SYNOPSIS == st->sect) {
				if (1 == dsz && '-' == buf[*start])
					dosynopsisfl(buf, start, end);
				else if (0 == pos)
					fputs("Nm", stdout);
				else
					fputs("Ar", stdout);
				break;
			}
			/* FALLTHROUGH */
		case (FMT_ITALIC):
			/*
			 * Measure the leading run of word characters and
			 * blanks; it only counts as a name if it is
			 * directly followed by '=' or '>'.
			 */
			i = 0;
			uc = buf[*start];
			while (isalnum(uc) || '_' == uc || ' ' == uc)
				uc = buf[*start + ++i];
			if ('=' != uc && '>' != uc)
				i = 0;
			if (4 == i && ! strncmp(buf + *start, "NULL", 4)) {
				fputs("Dv", stdout);
				break;
			}
			/* Names found in the dictionary select the macro. */
			switch (i ? dict_get(buf + *start, i) : MDOC_MAX) {
			case MDOC_Fa:
				fputs("Fa", stdout);
				break;
			case MDOC_Vt:
				fputs("Vt", stdout);
				break;
			default:
				fputs(FMT_BOLD == fmt ? "Sy" : "Em", stdout);
				break;
			}
			break;
		case (FMT_CODE):
			fputs("Qo Li", stdout);
			break;
		case (FMT_LINK):
			/* Try to link; use "No" if it's empty. */
			if ( ! trylink(buf, start, end, dsz))
				fputs("No", stdout);
			break;
		case (FMT_FILE):
			fputs("Pa", stdout);
			break;
		case (FMT_NBSP):
			fputs("No", stdout);
			break;
		default:
			abort();
		}
	} else {
		/* No macro is printed: just close out the pending word. */
		outbuf_flush(st);
		st->wantws = 0;
	}

	/*
	 * Process until we reach the end marker (e.g., '>') or until we
	 * find a nested format code.
	 * Don't emit any newlines: since we're on a macro line, we
	 * don't want to break the line.
	 */
	gotmacro = 0;
	while (*start < end) {
		if ('>' == buf[*start] && 1 == dsz) {
			(*start)++;
			break;
		} else if ('>' == buf[*start] &&
		    ' ' == buf[*start - 1]) {
			/*
			 * Handle custom delimiters.
			 * These require a certain number of
			 * space-preceded carrots before we're really at
			 * the end.
			 */
			i = *start;
			for (j = 0; i < end && j < dsz; j++)
				if ('>' != buf[i++])
					break;
			if (dsz == j) {
				*start += dsz;
				break;
			}
		}
		/* A capital letter followed by '<' opens a nested code. */
		if (*start + 1 < end && '<' == buf[*start + 1] &&
		    'A' <= buf[*start] && 'Z' >= buf[*start]) {
			gotmacro = formatcode(st, buf,
			    start, end, nomacro, 1);
			continue;
		}

		/* Suppress newlines and multiple spaces.
*/ last = buf[(*start)++]; if (isspace(last)) { outbuf_flush(st); st->wantws = 1; gotmacro = 0; while (*start < end && isspace((unsigned char)buf[*start])) (*start)++; continue; } if (OUST_MAC == st->oust && FMT__MAX != fmt) { if (gotmacro && ! st->wantws) { printf(" Ns"); st->wantws = 1; } gotmacro = 0; /* * Escape macro-like words. * This matches "Xx " and "XxEOLN". */ if (*start < end && ! st->outbuflen && isupper(last) && islower((unsigned char)buf[*start]) && (end - *start == 1 || ' ' == buf[*start + 1] || '>' == buf[*start + 1])) outbuf_addstr(st, "\\&"); last = buf[*start - 1]; } outbuf_addchar(st); } if (FMT__MAX == fmt) return(0); outbuf_flush(st); if ( ! nomacro && FMT_CODE == fmt) fputs(" Qc", stdout); st->wantws = ' ' == last; return(1); } /* * Calls formatcode() til the end of a paragraph. * Goes to OUST_MAC mode and stays there when returning, * such that the caller can add arguments to the macro line * before closing it out. */ static void formatcodeln(struct state *st, const char *linemac, const char *buf, size_t *start, size_t end, int nomacro) { int gotmacro; assert(OUST_NL == st->oust); assert(st->wantws); printf(".%s", linemac); st->oust = OUST_MAC; gotmacro = 0; while (*start < end) { if (*start + 1 < end && '<' == buf[*start + 1] && 'A' <= buf[*start] && 'Z' >= buf[*start]) { gotmacro = formatcode(st, buf, start, end, nomacro, 1); continue; } /* Suppress newlines and multiple spaces. */ last = buf[(*start)++]; if (isspace(last)) { outbuf_flush(st); st->wantws = 1; while (*start < end && isspace((unsigned char)buf[*start])) (*start)++; continue; } if (gotmacro) { if (*start < end) { if (st->wantws) printf(" No"); else printf(" Ns"); } st->wantws = 1; gotmacro = 0; } /* * Since we're already on a macro line, we want to make * sure that we don't inadvertently invoke a macro. * We need to do this carefully because section names * are used in troff and we don't want to escape * something that needn't be escaped. */ if (*start < end && ! 
st->outbuflen && isupper(last) && islower((unsigned char)buf[*start]) && (end - *start == 1 || ' ' == buf[*start + 1])) { outbuf_addstr(st, "\\&"); last = buf[*start - 1]; } outbuf_addchar(st); } outbuf_flush(st); st->wantws = 1; } /* * Guess at what kind of list we are. * These are taken straight from the POD manual. * I don't know what people do in real life. */ static enum list listguess(const char *buf, size_t start, size_t end) { size_t len = end - start; assert(end >= start); if (len == 1 && '*' == buf[start]) return(LIST_BULLET); if (len == 2 && '1' == buf[start] && '.' == buf[start + 1]) return(LIST_ENUM); else if (len == 1 && '1' == buf[start]) return(LIST_ENUM); else return(LIST_TAG); } /* * A command paragraph, as noted in the perlpod manual, just indicates * that we should do something, optionally with some text to print as * well. * From the perspective of external callers, * always stays in OUST_NL/wantws mode, * but its children do use OUST_MAC. */ static void command(struct state *st, const char *buf, size_t start, size_t end) { size_t len, csz; enum cmd cmd; assert('=' == buf[start]); start++; len = end - start; for (cmd = 0; cmd < CMD__MAX; cmd++) { csz = strlen(cmds[cmd]); if (len < csz) continue; if (0 == memcmp(&buf[start], cmd[cmds], csz)) break; } /* Ignore bogus commands. */ if (CMD__MAX == cmd) return; start += csz; while (start < end && ' ' == buf[start]) start++; len = end - start; if (st->paused) { st->paused = CMD_END != cmd; return; } switch (cmd) { case (CMD_POD): break; case (CMD_HEAD1): /* * The behaviour of head= follows from a quick glance at * how pod2man handles it. 
*/ st->sect = SECT_NONE; if (end - start == 4) { if (0 == memcmp(&buf[start], "NAME", 4)) st->sect = SECT_NAME; } else if (end - start == 8) { if (0 == memcmp(&buf[start], "SYNOPSIS", 8)) st->sect = SECT_SYNOPSIS; } formatcodeln(st, "Sh", buf, &start, end, 1); mdoc_newln(st); st->haspar = 1; break; case (CMD_HEAD2): formatcodeln(st, "Ss", buf, &start, end, 1); mdoc_newln(st); st->haspar = 1; break; case (CMD_HEAD3): puts(".Pp"); formatcodeln(st, "Em", buf, &start, end, 0); mdoc_newln(st); puts(".Pp"); st->haspar = 1; break; case (CMD_HEAD4): puts(".Pp"); formatcodeln(st, "No", buf, &start, end, 0); mdoc_newln(st); puts(".Pp"); st->haspar = 1; break; case (CMD_OVER): /* * If we have an existing list that hasn't had an =item * yet, then make sure that we open it now. * We use the default list type, but that can't be * helped (we haven't seen any items yet). */ if (st->lpos > 0) if (LIST__MAX == st->lstack[st->lpos - 1]) { st->lstack[st->lpos - 1] = LIST_TAG; puts(".Bl -tag -width Ds"); } st->lpos++; assert(st->lpos < LIST_STACKSZ); st->lstack[st->lpos - 1] = LIST__MAX; break; case (CMD_ITEM): if (0 == st->lpos) { /* * Bad markup. * Try to compensate. */ st->lstack[st->lpos] = LIST__MAX; st->lpos++; } assert(st->lpos > 0); /* * If we're the first =item, guess at what our content * will be: "*" is a bullet list, "1." is a numbered * list, and everything is tagged. */ if (LIST__MAX == st->lstack[st->lpos - 1]) { st->lstack[st->lpos - 1] = listguess(buf, start, end); switch (st->lstack[st->lpos - 1]) { case (LIST_BULLET): puts(".Bl -bullet"); break; case (LIST_ENUM): puts(".Bl -enum"); break; default: puts(".Bl -tag -width Ds"); break; } } switch (st->lstack[st->lpos - 1]) { case (LIST_TAG): formatcodeln(st, "It", buf, &start, end, 0); mdoc_newln(st); break; case (LIST_ENUM): /* FALLTHROUGH */ case (LIST_BULLET): /* * Abandon the remainder of the paragraph * because we're going to be a bulletted or * numbered list. 
*/ puts(".It"); break; default: abort(); } st->haspar = 1; break; case (CMD_BACK): /* Make sure we don't back over the stack. */ if (st->lpos > 0) { st->lpos--; puts(".El"); } break; case (CMD_BEGIN): /* * We disregard all types for now. * TODO: process at least "text" in a -literal block. */ st->paused = 1; break; case (CMD_FOR): /* * We ignore all types of encodings and formats * unilaterally. */ break; case (CMD_ENCODING): break; case (CMD_CUT): st->parsing = 0; return; default: abort(); } /* Any command (but =cut) makes us start parsing. */ st->parsing = 1; } /* * Put the type provided as an argument into the dictionary. */ static void register_type(const char *ptype) { const char *pname, *pend; pname = ptype; while (isalnum((unsigned char)*pname) || '_' == *pname) pname++; if ((pname - ptype == 6 && ! strncmp(ptype, "struct", 6)) || (pname - ptype == 4 && ! strncmp(ptype, "enum", 4))) { while (' ' == *pname) pname++; pend = pname; while (isalnum((unsigned char)*pend) || '_' == *pend) pend++; if (pend > pname) dict_put(pname, pend - pname, MDOC_Vt); } else pend = pname; if (pend > ptype) dict_put(ptype, pend - ptype, MDOC_Vt); } /* * Just pump out the line in a verbatim block. * From the perspective of external callers, * always stays in OUST_NL/wantws mode. */ static void verbatim(struct state *st, char *buf, size_t start, size_t end) { size_t i, ift, ifo, ifa, ifc, inl; char *cp, *cp2; int indisplay, nopen, wantsp; if (st->paused || ! st->parsing) return; indisplay = wantsp = 0; again: if (start == end) { if (indisplay) puts(".Ed"); return; } if ('\n' == buf[start]) { wantsp = 1; start++; goto again; } /* * If we're in the SYNOPSIS, see if we're an #include block. * If we are, then print the "In" macro and re-loop. * This handles any number of inclusions, but only when they * come before the remaining parts... */ if (SECT_SYNOPSIS == st->sect) { i = start; while (i < end && buf[i] == ' ') i++; if (i == end) goto again; /* We're an include block! 
*/
		if (end - i > 10 &&
		    0 == memcmp(&buf[i], "#include <", 10)) {
			start = i + 10;
			while (start < end && ' ' == buf[start])
				start++;
			if (indisplay)
				puts(".Ed");
			indisplay = wantsp = 0;
			fputs(".In ", stdout);
			/* Stop til the '>' marker or we hit eoln. */
			while (start < end &&
			    '>' != buf[start] && '\n' != buf[start])
				putchar(buf[start++]);
			putchar('\n');
			if (start < end && '>' == buf[start])
				start++;
			if (start < end && '\n' == buf[start])
				start++;
			goto again;
		}

		/* Other preprocessor directives. */
		if ('#' == buf[i]) {
			if (indisplay)
				puts(".Ed");
			indisplay = wantsp = 0;
			/* Copy the whole line as an "Fd" argument. */
			fputs(".Fd ", stdout);
			start = i;
			while(start < end && '\n' != buf[start])
				putchar(buf[start++]);
			putchar('\n');
			if (start < end && '\n' == buf[start])
				start++;

			/* Remember #define for Dv or Fn. */
			if (strncmp(buf + i + 1, "define", 6) ||
			    ! isspace((unsigned char)buf[i + 7]))
				goto again;

			/* ifo..ifa delimits the name being defined. */
			ifo = i + 7;
			while (ifo < start &&
			    isspace((unsigned char)buf[ifo]))
				ifo++;
			ifa = ifo;
			while ('_' == buf[ifa] ||
			    isalnum((unsigned char)buf[ifa]))
				ifa++;
			/* A '(' right after the name means function-like. */
			dict_put(buf + ifo, ifa - ifo,
			    '(' == buf[ifa] ? MDOC_Fo : MDOC_Dv);
			goto again;
		}

		/*
		 * Parse function declaration.
		 * ift: start of the type, ifo: last blank before the
		 * first opening parenthesis, ifa: that parenthesis,
		 * ifc: the matching closing parenthesis,
		 * inl: the newline after it, or end.
		 * Setting i to end abandons the attempt.
		 */
		ifo = ifa = ifc = 0;
		inl = end;
		nopen = 0;
		for (ift = i; i < end; i++) {
			/* After the closing parenthesis, find the eoln. */
			if (ifc) {
				if (buf[i] != '\n')
					continue;
				inl = i;
				break;
			}
			switch (buf[i]) {
			case '\t':
				/* FALLTHROUGH */
			case ' ':
				if ( ! ifa)
					ifo = i;
				break;
			case '(':
				if (ifo) {
					nopen++;
					if ( ! ifa)
						ifa = i;
				} else
					i = end;
				break;
			case ')':
				switch (nopen) {
				case 0:
					i = end;
					break;
				case 1:
					ifc = i;
					break;
				default:
					nopen--;
					break;
				}
				break;
			default:
				break;
			}
		}

		/* Encode function declaration.
*/
		if (ifc) {
			/* Join an argument list spanning several lines. */
			for (i = ifa; i < ifc; i++)
				if (buf[i] == '\n')
					buf[i] = ' ';
			/* Split the buffer in place: type, name, arguments. */
			buf[ifo++] = '\0';
			register_type(buf + ift);
			if (indisplay)
				puts(".Ed");
			indisplay = wantsp = 0;
			printf(".Ft %s", buf + ift);
			/* A single '*' before the name belongs to the type. */
			if (buf[ifo] == '*') {
				fputs(" *", stdout);
				ifo++;
			}
			putchar('\n');
			buf[ifa++] = '\0';
			dict_put(buf + ifo, 0, MDOC_Fo);
			buf[ifc++] = '\0';
			if (strcmp(buf + ifa, "void")) {
				printf(".Fo %s\n", buf + ifo);
				/* One "Fa" line per comma-separated argument. */
				for (;;) {
					cp = strchr(buf + ifa, ',');
					if (cp != NULL) {
						cp2 = cp;
						*cp++ = '\0';
					} else
						cp2 = strchr(buf + ifa, '\0');
					/* Walk back over the argument name. */
					while (isalnum((unsigned char)cp2[-1]) ||
					    '_' == cp2[-1])
						cp2--;
					if ('\0' != *cp2)
						dict_put(cp2, 0, MDOC_Fa);
					register_type(buf + ifa);
					/* Quote arguments containing blanks. */
					if (strchr(buf + ifa, ' ') == NULL)
						printf(".Fa %s\n", buf + ifa);
					else
						printf(".Fa \"%s\"\n", buf + ifa);
					if (cp == NULL)
						break;
					while (*cp == ' ' || *cp == '\t')
						cp++;
					ifa = cp - buf;
				}
				puts(".Fc");
			} else
				printf(".Fn %s void\n", buf + ifo);
			/* Print whatever follows the declaration as text. */
			if (buf[ifc] == ';')
				ifc++;
			if (ifc < inl) {
				buf[inl] = '\0';
				puts(buf + ifc);
			}
			start = inl < end ? inl + 1 : end;
			goto again;
		}
	}

	/* Anything else goes into a literal display. */
	if ( ! indisplay)
		puts(".Bd -literal");
	else if (wantsp)
		putchar('\n');
	indisplay = 1;
	wantsp = 0;

	for (last = '\n'; start < end; start++) {
		/*
		 * Handle accidental macros (newline starting with
		 * control character) and escapes.
		 */
		if ('\n' == last) {
			/* A blank line: re-examine what follows. */
			if ('\n' == buf[start])
				goto again;
			if ('.' == buf[start] || '\'' == buf[start])
				printf("\\&");
		}
		putchar(last = buf[start]);
		if ('\\' == buf[start])
			printf("e");
	}
	if ('\n' != last)
		putchar('\n');
	if (indisplay)
		puts(".Ed");
}

/*
 * See dosynopsisop().
 * Returns 1 if buf[start..end) contains a ']' that is not paired
 * with a '[' inside the same range, 0 otherwise.
 */
static int
hasmatch(const char *buf, size_t start, size_t end)
{
	size_t	 stack;

	for (stack = 0; start < end; start++)
		if (buf[start] == '[')
			stack++;
		else if (buf[start] == ']' && 0 == stack)
			return(1);
		else if (buf[start] == ']')
			stack--;
	return(0);
}

/*
 * If we're in the SYNOPSIS section and we've encountered braces in an
 * ordinary paragraph, then try to see whether we're an [-option].
* Do this, if we're an opening bracket, by first seeing if we have a * matching end via hasmatch(). * If we're an ending bracket, see if we have a stack already. */ static int dosynopsisop(struct state *st, const char *buf, size_t *start, size_t end, size_t *opstack) { assert('[' == buf[*start] || ']' == buf[*start]); if ('[' == buf[*start] && hasmatch(buf, *start + 1, end)) { mdoc_newln(st); puts(".Oo"); (*opstack)++; } else if ('[' == buf[*start]) return(0); if (']' == buf[*start] && *opstack > 0) { mdoc_newln(st); puts(".Oc"); (*opstack)--; } else if (']' == buf[*start]) return(0); (*start)++; last = '\n'; while (' ' == buf[*start]) (*start)++; return(1); } /* * Format multiple "Nm" manpage names in the NAME section. * From the perspective of external callers, * always stays in OUST_NL/wantws mode, * but its children do use OUST_MAC. */ static void donamenm(struct state *st, const char *buf, size_t *start, size_t end) { size_t word; assert(OUST_NL == st->oust); assert(st->wantws); while (*start < end && isspace((unsigned char)buf[*start])) (*start)++; if (end == *start) { puts(".Nm unknown"); return; } while (*start < end) { for (word = *start; word < end; word++) if (',' == buf[word]) break; formatcodeln(st, "Nm", buf, start, word, 1); if (*start == end) { mdoc_newln(st); break; } assert(',' == buf[*start]); printf(" ,"); mdoc_newln(st); (*start)++; while (*start < end && isspace((unsigned char)buf[*start])) (*start)++; } } /* * Ordinary paragraph. * Well, this is really the hardest--POD seems to assume that, for * example, a leading space implies a newline, and so on. * Lots of other snakes in the grass: escaping a newline followed by a * period (accidental mdoc(7) control), double-newlines after macro * passages, etc. * * Uses formatcode() to go to OUST_MAC mode * and outbuf_flush() to go to OUST_TXT mode. * In text mode, wantws requests white space before the text * currently contained in the outbuf, not before upcoming text. 
* Must make sure to go back to OUST_NL/wantws mode before returning. */ static void ordinary(struct state *st, const char *buf, size_t start, size_t end) { size_t i, j, opstack, wend; enum mdoc_type mtype; int eos, noeos, seq; char savechar; if ( ! st->parsing || st->paused) return; /* * Special-case: the NAME section. * If we find a "-" when searching from the end, assume that * we're in "name - description" format. * To wit, print out a "Nm" and "Nd" in that format. */ if (SECT_NAME == st->sect) { for (i = end - 2; i > start; i--) if ('-' == buf[i] && isspace((unsigned char)buf[i + 1])) break; if ('-' == buf[i]) { j = i; /* Roll over multiple "-". */ for ( ; i > start; i--) if ('-' != buf[i]) break; donamenm(st, buf, &start, i + 1); start = j + 1; while (start < end && isspace((unsigned char)buf[start])) start++; while (start < end && '.' == buf[end - 1]) end--; formatcodeln(st, "Nd", buf, &start, end, 1); mdoc_newln(st); return; } } if ( ! st->haspar) puts(".Pp"); st->haspar = 0; last = '\n'; opstack = 0; for (seq = 0; start < end; seq++) { /* * Loop til we get either to a newline or escape. * Escape initial control characters. */ while (start < end) { if (start < end - 1 && '<' == buf[start + 1] && 'A' <= buf[start] && 'Z' >= buf[start]) break; else if ('\n' == buf[start]) break; else if ('\n' == last && '.' == buf[start]) outbuf_addstr(st, "\\&"); else if ('\n' == last && '\'' == buf[start]) outbuf_addstr(st, "\\&"); /* * If we're in the SYNOPSIS, have square * brackets indicate that we're opening and * closing an optional context. */ if (SECT_SYNOPSIS == st->sect && ('[' == buf[start] || ']' == buf[start]) && dosynopsisop(st, buf, &start, end, &opstack)) continue; /* Merely buffer non-whitespace. */ last = buf[start++]; if ( ! isspace(last)) outbuf_addchar(st); if (start < end && ! isspace((unsigned char)buf[start - 1]) && ! isspace((unsigned char)buf[start])) continue; /* * Found the end of a word. * Rewind trailing delimiters. 
*/ eos = noeos = 0; for (wend = st->outbuflen; wend; wend--) if ('.' == st->outbuf[wend - 1] || '!' == st->outbuf[wend - 1] || '?' == st->outbuf[wend - 1]) eos = 1; else if ('|' == st->outbuf[wend - 1] || ',' == st->outbuf[wend - 1] || ';' == st->outbuf[wend - 1] || ':' == st->outbuf[wend - 1]) noeos = 1; else if ('\'' != st->outbuf[wend - 1] && '"' != st->outbuf[wend - 1] && ')' != st->outbuf[wend - 1] && ']' != st->outbuf[wend - 1]) break; eos &= ! noeos; /* * Detect function names. */ mtype = MDOC_Fa; savechar = '\0'; if (wend && ')' == st->outbuf[wend] && '(' == st->outbuf[wend - 1]) { mtype = dict_get(st->outbuf, --wend); if (MDOC_Dv == mtype) mtype = MDOC_Fo; if (MDOC_Fo == mtype || MDOC_MAX == mtype) { st->outbuflen = wend; st->outbuf[wend] = '\0'; mdoc_newln(st); if (MDOC_Fo == mtype) fputs(".Fn", stdout); else fputs(".Xr", stdout); st->oust = OUST_MAC; } } else { mtype = dict_get(st->outbuf, wend); if (MDOC_Dv == mtype) { savechar = st->outbuf[wend]; st->outbuf[wend] = '\0'; mdoc_newln(st); fputs(".Dv", stdout); st->oust = OUST_MAC; } else mtype = MDOC_Fa; } /* * On whitespace, flush the output buffer * and allow breaking to a macro line. */ outbuf_flush(st); /* * End macro lines, and * end text lines at the end of sentences. */ if (OUST_MAC == st->oust || (eos && wend > 1 && islower((unsigned char)st->outbuf[wend - 1]))) { if (MDOC_MAX == mtype) fputs(" 3", stdout); if (MDOC_Fa != mtype) { if (MDOC_Dv == mtype) st->outbuf[wend] = savechar; else wend += 2; while ('\0' != st->outbuf[wend]) printf(" %c", st->outbuf[wend++]); } mdoc_newln(st); } /* Advance to the next word. */ while ('\n' != buf[start] && isspace((unsigned char)buf[start])) start++; st->wantws = 1; } if (start < end - 1 && '<' == buf[start + 1] && 'A' <= buf[start] && 'Z' >= buf[start]) { formatcode(st, buf, &start, end, 0, seq); if (OUST_MAC == st->oust) { /* * Let mdoc(7) handle trailing punctuation. * XXX Some punctuation characters * are not handled yet. 
*/ if ((start == end - 1 || (start < end - 1 && (' ' == buf[start + 1] || '\n' == buf[start + 1]))) && NULL != strchr("|.,;:?!)]", buf[start])) { putchar(' '); putchar(buf[start++]); } if (st->wantws || ' ' == buf[start] || '\n' == buf[start]) mdoc_newln(st); /* * Consume all whitespace * so we don't accidentally start * an implicit literal line. */ while (start < end && ' ' == buf[start]) start++; /* * Some text is following. * Implement requested spacing. */ if ( ! st->wantws && start < end && ('<' != buf[start + 1] || 'A' > buf[start] || 'Z' < buf[start])) { fputs(" Ns", stdout); st->wantws = 1; } } } else if (start < end && '\n' == buf[start]) { outbuf_flush(st); st->wantws = 1; if (++start >= end) continue; /* * If we have whitespace next, eat it to prevent * mdoc(7) from thinking that it's meant for * verbatim text. * It is--but if we start with that, we can't * have a macro subsequent it, which may be * possible if we have an escape next. */ if (' ' == buf[start] || '\t' == buf[start]) { mdoc_newln(st); puts(".br"); } for ( ; start < end; start++) if (' ' != buf[start] && '\t' != buf[start]) break; } } outbuf_flush(st); mdoc_newln(st); } /* * There are three kinds of paragraphs: verbatim (starts with whitespace * of some sort), ordinary (starts without "=" marker), or a command * (default: starts with "="). */ static void dopar(struct state *st, char *buf, size_t start, size_t end) { assert(OUST_NL == st->oust); assert(st->wantws); if (end == start) return; if (' ' == buf[start] || '\t' == buf[start]) verbatim(st, buf, start, end); else if ('=' != buf[start]) ordinary(st, buf, start, end); else command(st, buf, start, end); } /* * Loop around paragraphs within a document, processing each one in the * POD way. 
*/ static void dofile(const struct args *args, const char *fname, const struct tm *tm, char *buf, size_t sz) { char datebuf[64]; struct state st; const char *fbase, *fext, *section, *date, *format; char *title, *cp; size_t cur, end; int verb; if (0 == sz) return; /* * Parsing the filename is almost always required, * except when both the title and the section * are provided on the command line. */ if (NULL == args->title || NULL == args->section) { fbase = strrchr(fname, '/'); if (NULL == fbase) fbase = fname; else fbase++; fext = strrchr(fbase, '.'); } else fext = NULL; /* * The title will be converted to uppercase, * so it needs to be copied. */ title = (NULL != args->title) ? strdup(args->title) : (NULL != fext) ? strndup(fbase, fext - fbase) : strdup(fbase); if (NULL == title) { perror(NULL); exit(EXIT_FAILURE); } /* Section is 1 unless suffix is "pm". */ section = (NULL != args->section) ? args->section : (NULL == fext || strcmp(fext + 1, "pm")) ? "1" : PERL_SECTION; /* Date. Or the given "tm" if not supplied. */ date = args->date; format = (NULL == date) ? "%B %d, %Y" : strcmp(date, "Mdocdate") ? NULL : "$" "Mdocdate: %B %d %Y $"; if (NULL != format) { strftime(datebuf, sizeof(datebuf), format, tm); date = datebuf; } for (cp = title; '\0' != *cp; cp++) *cp = toupper((int)*cp); /* The usual mdoc(7) preamble. */ printf(".Dd %s\n", date); printf(".Dt %s %s\n", title, section); puts(".Os"); free(title); dict_init(); memset(&st, 0, sizeof(struct state)); st.oust = OUST_NL; st.wantws = 1; assert(sz > 0); /* Main loop over file contents. */ cur = 0; for (;;) { while (cur < sz && '\n' == buf[cur]) cur++; if (cur >= sz) break; verb = isspace((unsigned char)buf[cur]); /* Read until next paragraph. */ for (end = cur + 1; end + 1 < sz; end++) if ('\n' == buf[end] && '\n' == buf[end + 1] && !(verb && end + 2 < sz && isspace((unsigned char)buf[end + 2]))) break; /* Adjust end marker for EOF. */ if (end < sz && '\n' != buf[end]) end++; /* Process paragraph and adjust start. 
*/ dopar(&st, buf, cur, end); cur = end + 2; } dict_destroy(); } /* * Read a single file fully into memory. * If the file is "-", do it from stdin. * If successfully read, send the input buffer to dofile() for further * processing. */ static int readfile(const struct args *args, const char *fname) { int fd; char *buf; size_t bufsz, cur; ssize_t ssz; struct tm *tm; time_t ttm; struct stat st; fd = 0 != strcmp("-", fname) ? open(fname, O_RDONLY, 0) : STDIN_FILENO; if (-1 == fd) { perror(fname); return(0); } if (STDIN_FILENO == fd || -1 == fstat(fd, &st)) { ttm = time(NULL); tm = localtime(&ttm); } else tm = localtime(&st.st_mtime); /* * Arbitrarily-sized initial buffer. * Should be big enough for most files... */ cur = 0; bufsz = 1 << 14; if (NULL == (buf = malloc(bufsz))) { perror(NULL); exit(EXIT_FAILURE); } while ((ssz = read(fd, buf + cur, bufsz - cur)) > 0) { /* Double buffer size on fill. */ if ((size_t)ssz == bufsz - cur) { bufsz *= 2; if (NULL == (buf = realloc(buf, bufsz))) { perror(NULL); exit(EXIT_FAILURE); } } cur += (size_t)ssz; } if (ssz < 0) { perror(fname); free(buf); return(0); } dofile(args, STDIN_FILENO == fd ? "STDIN" : fname, tm, buf, cur); free(buf); if (STDIN_FILENO != fd) close(fd); return(1); } int main(int argc, char *argv[]) { const char *fname, *name; struct args args; int c; name = strrchr(argv[0], '/'); if (name == NULL) name = argv[0]; else ++name; memset(&args, 0, sizeof(struct args)); fname = "-"; /* Accept no arguments for now. */ while (-1 != (c = getopt(argc, argv, "c:d:hln:oq:rs:uv"))) switch (c) { case ('h'): /* FALLTHROUGH */ case ('l'): /* FALLTHROUGH */ case ('c'): /* FALLTHROUGH */ case ('o'): /* FALLTHROUGH */ case ('q'): /* FALLTHROUGH */ case ('r'): /* FALLTHROUGH */ case ('u'): /* FALLTHROUGH */ case ('v'): /* Ignore these. 
*/ break; case ('d'): args.date = optarg; break; case ('n'): args.title = optarg; break; case ('s'): args.section = optarg; break; default: goto usage; } argc -= optind; argv += optind; /* Accept only a single input file. */ if (argc > 1) goto usage; else if (1 == argc) fname = *argv; return(readfile(&args, fname) ? EXIT_SUCCESS : EXIT_FAILURE); usage: fprintf(stderr, "usage: %s [-d date] " "[-n title] [-s section] [file]\n", name); return(EXIT_FAILURE); }