/*	$Id: pod2mdoc.c,v 1.34 2014/07/19 00:42:22 schwarze Exp $ */
/*
 * Copyright (c) 2014 Kristaps Dzonsons
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */
#include <sys/stat.h>
#include <sys/time.h>

#include <assert.h>
#include <ctype.h>
#include <fcntl.h>
#include <getopt.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <unistd.h>

/*
 * In what section can we find Perl module manuals?
 * Sometimes (Mac OS X) it's 3pm, sometimes (OpenBSD, etc.) 3p.
 * XXX IF YOU CHANGE THIS, CHANGE POD2MDOC.1 AS WELL.
 */
#define	PERL_SECTION	"3p"

/*
 * Command-line overrides for the mdoc(7) preamble.
 * A NULL member means "derive the value from the input file".
 */
struct	args {
	const char	*title; /* override "Dt" title */
	const char	*date; /* override "Dd" date */
	const char	*section; /* override "Dt" section */
};

/*
 * Kind of an open list.
 * LIST__MAX doubles as "opened by =over, type not yet known".
 */
enum	list {
	LIST_BULLET = 0,
	LIST_ENUM,
	LIST_TAG,
	LIST__MAX
};

enum	sect {
	SECT_NONE = 0,
	SECT_NAME, /* NAME section */
	SECT_SYNOPSIS, /* SYNOPSIS section */
};

enum	outstate {
	OUST_NL = 0,	/* just started a new output line */
	OUST_TXT,	/* text line output in progress */
	OUST_MAC	/* macro line output in progress */
};

struct	state {
	const char	*fname; /* file being parsed */
	int		 parsing; /* zero after =cut or before a command */
	int		 paused; /* in =begin and before =end */
	enum sect	 sect; /* which section are we in? */
#define	LIST_STACKSZ	 128
	enum list	 lstack[LIST_STACKSZ]; /* open lists */
	size_t		 lpos; /* where in list stack */
	int		 haspar; /* set after headings/items: skip next Pp */
	enum outstate	 oust; /* state of the mdoc output stream */
	int		 wantws; /* let mdoc(7) output whitespace here */
	char		*outbuf; /* text buffered for output */
	size_t		 outbufsz; /* allocated size of outbuf */
	size_t		 outbuflen; /* current length of outbuf */
};

enum	fmt {
	FMT_ITALIC,
	FMT_BOLD,
	FMT_CODE,
	FMT_LINK,
	FMT_ESCAPE,
	FMT_FILE,
	FMT_NBSP,
	FMT_INDEX,
	FMT_NULL,
	FMT__MAX
};

enum	cmd {
	CMD_POD = 0,
	CMD_HEAD1,
	CMD_HEAD2,
	CMD_HEAD3,
	CMD_HEAD4,
	CMD_OVER,
	CMD_ITEM,
	CMD_BACK,
	CMD_BEGIN,
	CMD_END,
	CMD_FOR,
	CMD_ENCODING,
	CMD_CUT,
	CMD__MAX
};

/* Command names as they follow the '=' sign, indexed by enum cmd. */
static	const char *const cmds[CMD__MAX] = {
	"pod",		/* CMD_POD */
	"head1",	/* CMD_HEAD1 */
	"head2",	/* CMD_HEAD2 */
	"head3",	/* CMD_HEAD3 */
	"head4",	/* CMD_HEAD4 */
	"over",		/* CMD_OVER */
	"item",		/* CMD_ITEM */
	"back",		/* CMD_BACK */
	"begin",	/* CMD_BEGIN */
	"end",		/* CMD_END */
	"for",		/* CMD_FOR */
	"encoding",	/* CMD_ENCODING */
	"cut"		/* CMD_CUT */
};

/* Format code letters (the X in X<...>), indexed by enum fmt. */
static	const char fmts[FMT__MAX] = {
	'I',		/* FMT_ITALIC */
	'B',		/* FMT_BOLD */
	'C',		/* FMT_CODE */
	'L',		/* FMT_LINK */
	'E',		/* FMT_ESCAPE */
	'F',		/* FMT_FILE */
	'S',		/* FMT_NBSP */
	'X',		/* FMT_INDEX */
	'Z'		/* FMT_NULL */
};

/* The most recent character handed to the output (or buffered). */
static	int		 last;

/*
 * Enlarge the output buffer so that at least "by" more bytes fit.
 * The size grows in multiples of 128 bytes.
 * Exits the program if memory cannot be obtained.
 */
static void
outbuf_grow(struct state *st, size_t by)
{

	st->outbufsz += (by / 128 + 1) * 128;
	st->outbuf = realloc(st->outbuf, st->outbufsz);
	if (NULL == st->outbuf) {
		perror(NULL);
		exit(EXIT_FAILURE);
	}
}

/*
 * Append the character in "last" to the output buffer.
 * A backslash is stored as "\e" so that roff prints it literally.
 * Clears wantws because text is now pending.
 */
static void
outbuf_addchar(struct state *st)
{

	/* Room for up to two characters plus the terminating NUL. */
	if (st->outbuflen + 2 >= st->outbufsz)
		outbuf_grow(st, 1);
	st->outbuf[st->outbuflen++] = last;
	if ('\\' == last)
		st->outbuf[st->outbuflen++] = 'e';
	st->outbuf[st->outbuflen] = '\0';
	st->wantws = 0;
}

/*
 * Append a NUL-terminated string verbatim to the output buffer
 * and remember its final character in "last".
 * Clears wantws because text is now pending.
 */
static void
outbuf_addstr(struct state *st, const char *str)
{
	size_t	 slen;

	slen = strlen(str);
	if (st->outbuflen + slen >= st->outbufsz)
		outbuf_grow(st, slen);
	memcpy(st->outbuf + st->outbuflen, str, slen+1);
	st->outbuflen += slen;
	/* An empty string has no final character to remember. */
	if (slen > 0)
		last = str[slen - 1];
	st->wantws = 0;
}

/*
 * Write any buffered text to standard output and empty the buffer.
 * If something was written at the start of a line,
 * the stream is now in OUST_TXT mode.
 */
static void
outbuf_flush(struct state *st)
{

	if (0 == st->outbuflen)
		return;

	fputs(st->outbuf, stdout);
	*st->outbuf = '\0';
	st->outbuflen = 0;

	if (OUST_NL == st->oust)
		st->oust = OUST_TXT;
}

/*
 * Terminate the current output line unless we are already
 * at the beginning of one; go to OUST_NL/wantws mode.
 */
static void
mdoc_newln(struct state *st)
{

	if (OUST_NL == st->oust)
		return;

	putchar('\n');
	last = '\n';
	st->oust = OUST_NL;
	st->wantws = 1;
}

/*
 * Given buf[*start] is at the start of an escape name, read til the end
 * of the escape ('>') then try to do something with it.
 * Sets start to be one after the '>'.
 *
 * This function does not care about output modes,
 * it merely appends text to the output buffer,
 * which can then be used in any mode.
 */
static void
formatescape(struct state *st, const char *buf, size_t *start, size_t end)
{
	char		 esc[16]; /* no more needed */
	size_t		 i, max;

	max = sizeof(esc) - 1;
	i = 0;
	/* Read til our buffer is full. */
	while (*start < end && '>' != buf[*start] && i < max)
		esc[i++] = buf[(*start)++];
	esc[i] = '\0';

	if (i == max) {
		/* Too long... skip til we end. */
		while (*start < end && '>' != buf[*start])
			(*start)++;
		return;
	} else if (*start >= end)
		return;

	assert('>' == buf[*start]);
	(*start)++;

	/*
	 * TODO: right now, we only recognise the named escapes.
	 * Just let the rest of them go.
	 */
	if (0 == strcmp(esc, "lt"))
		outbuf_addstr(st, "\\(la");
	else if (0 == strcmp(esc, "gt"))
		outbuf_addstr(st, "\\(ra");
	else if (0 == strcmp(esc, "verbar"))
		outbuf_addstr(st, "\\(ba");
	else if (0 == strcmp(esc, "sol"))
		outbuf_addstr(st, "\\(sl");
}

/*
 * Return non-zero if the "len" bytes at "s" are exactly the URI
 * scheme "scheme" (which includes its trailing colon).
 * The link target is cut at the first slash before we get here,
 * so for "http://host/" the candidate is just "http:".
 * Never reads beyond the scheme literal.
 */
static int
isscheme(const char *s, size_t len, const char *scheme)
{
	size_t	 ssz;

	ssz = strlen(scheme);
	return(len == ssz && 0 == memcmp(scheme, s, ssz));
}

/*
 * Run some heuristics to intuit a link format.
 * I set "start" to be the end of the sequence (last right-caret) so
 * that the caller can safely just continue processing.
 * If this is just an empty tag, I'll return 0.
 * "dsz" is the number of '<' characters that opened the code.
 *
 * Always operates in OUST_MAC mode.
 * Mode handling is done by the caller.
 */
static int
trylink(const char *buf, size_t *start, size_t end, size_t dsz)
{
	size_t		 linkstart, realend, linkend, i, j, sep, stack;

	/*
	 * Scan to the start of the terminus.
	 * This function is more or less replicated in the formatcode()
	 * for null or index formatting codes.
	 * However, we're slightly different because we might have
	 * nested escapes we need to ignore.
	 */
	stack = 0;
	for (linkstart = realend = *start; realend < end; realend++) {
		if ('<' == buf[realend])
			stack++;
		if ('>' != buf[realend])
			continue;
		else if (stack > 0) {
			/*
			 * Closes a nested code.
			 * Only count down while something is open,
			 * lest an unbalanced '>' wrap the counter
			 * and hide the real terminator.
			 */
			stack--;
			continue;
		}
		if (dsz == 1)
			break;
		assert(realend > 0);
		if (' ' != buf[realend - 1])
			continue;
		for (i = realend, j = 0; i < end && j < dsz; j++)
			if ('>' != buf[i++])
				break;
		if (dsz == j)
			break;
	}

	/* Ignore stubs. */
	if (realend == end || realend == *start)
		return(0);

	/* Set linkend to the end of content. */
	linkend = dsz > 1 ? realend - 1 : realend;

	/* Re-scan to see if we have a title or section. */
	for (sep = *start; sep < linkend; sep++)
		if ('|' == buf[sep] || '/' == buf[sep])
			break;

	if (sep < linkend && '|' == buf[sep]) {
		/* With title: set start, then end at section. */
		linkstart = sep + 1;
		for (i = linkstart; i < linkend; i++)
			if ('/' == buf[i])
				break;
		if (i < linkend)
			linkend = i;
	} else if (sep < linkend && '/' == buf[sep]) {
		/* With section: set end at section. */
		linkend = sep;
	}

	*start = realend;
	j = linkend - linkstart;

	/* Do we have only subsection material? */
	if (0 == j && '/' == buf[linkend]) {
		linkstart = linkend + 1;
		linkend = dsz > 1 ? realend - 1 : realend;
		if (0 == (j = linkend - linkstart))
			return(0);
		printf("Sx %.*s", (int)j, &buf[linkstart]);
		return(1);
	} else if (0 == j)
		return(0);

	/* See if we qualify as being a link or not. */
	if (isscheme(&buf[linkstart], j, "http:") ||
	    isscheme(&buf[linkstart], j, "https:") ||
	    isscheme(&buf[linkstart], j, "ftp:") ||
	    isscheme(&buf[linkstart], j, "sftp:") ||
	    isscheme(&buf[linkstart], j, "smb:") ||
	    isscheme(&buf[linkstart], j, "afs:")) {
		/*
		 * Gross.
		 * The slash scan above cut us off after the scheme,
		 * so print all the way to the end of the content.
		 */
		printf("Lk %.*s", (int)((dsz > 1 ? realend - 1 :
			realend) - linkstart), &buf[linkstart]);
		return(1);
	}

	/*
	 * See if we qualify as a mailto.
	 * Only the prefix is compared, and the address without
	 * the scheme is handed to "Mt".
	 */
	if (j > 7 && 0 == memcmp("mailto:", &buf[linkstart], 7)) {
		printf("Mt %.*s", (int)(j - 7), &buf[linkstart + 7]);
		return(1);
	}

	/* See if we're a foo(5), foo(5x), or foo(5xx) manpage. */
	if ((j > 3 && ')' == buf[linkend - 1]) &&
		('(' == buf[linkend - 3])) {
		printf("Xr %.*s %c", (int)(j - 3),
			&buf[linkstart], buf[linkend - 2]);
		return(1);
	} else if ((j > 4 && ')' == buf[linkend - 1]) &&
		('(' == buf[linkend - 4])) {
		printf("Xr %.*s %.*s", (int)(j - 4),
			&buf[linkstart], 2, &buf[linkend - 3]);
		return(1);
	} else if ((j > 5 && ')' == buf[linkend - 1]) &&
		('(' == buf[linkend - 5])) {
		printf("Xr %.*s %.*s", (int)(j - 5),
			&buf[linkstart], 3, &buf[linkend - 4]);
		return(1);
	}

	/* Last try: do we have a double-colon? */
	for (i = linkstart + 1; i < linkend; i++)
		if (':' == buf[i] && ':' == buf[i - 1])
			break;

	if (i < linkend)
		printf("Xr %.*s " PERL_SECTION,
			(int)j, &buf[linkstart]);
	else
		printf("Xr %.*s 1", (int)j, &buf[linkstart]);

	return(1);
}

/*
 * Doclifting: if we're a bold "-xx" and we're in the SYNOPSIS section,
 * then it's likely that we're a flag.
 * Our flag might be followed by an argument, so make sure that we're
 * accounting for that, too.
 * If we don't have a flag at all, however, then assume we're an "Ar".
 *
 * Always operates in OUST_MAC mode.
 * Mode handling is done by the caller.
 *
 * NOTE(review): the assertions below trip on a code that is not
 * terminated within the paragraph; confirm whether that should be
 * handled gracefully instead.
 */
static void
dosynopsisfl(const char *buf, size_t *start, size_t end)
{
	size_t	 i;
again:
	assert(*start + 1 < end);
	assert('-' == buf[*start]);

	if ( ! isalnum((unsigned char)buf[*start + 1]) &&
		'?' != buf[*start + 1] &&
		'-' != buf[*start + 1]) {
		(*start)--;
		fputs("Ar ", stdout);
		return;
	}

	/* Skip the dash and find the end of the flag name. */
	(*start)++;
	for (i = *start; i < end; i++)
		if (isalnum((unsigned char)buf[i]))
			continue;
		else if ('?' == buf[i])
			continue;
		else if ('-' == buf[i])
			continue;
		else if ('_' == buf[i])
			continue;
		else
			break;

	assert(i < end);

	if ( ! (' ' == buf[i] || '>' == buf[i])) {
		printf("Ar ");
		return;
	}

	printf("Fl ");
	/* Escape flag names that look like two-letter macros. */
	if (end - *start > 1 &&
		isupper((unsigned char)buf[*start]) &&
		islower((unsigned char)buf[*start + 1]) &&
		(end - *start == 2 ||
		 ' ' == buf[*start + 2]))
		printf("\\&");
	printf("%.*s ", (int)(i - *start), &buf[*start]);
	*start = i;

	if (' ' == buf[i]) {
		while (i < end && ' ' == buf[i])
			i++;
		assert(i < end);
		/* Another flag follows: handle it the same way. */
		if ('-' == buf[i]) {
			*start = i;
			goto again;
		}
		printf("Ar ");
		*start = i;
	}
}

/*
 * We're at the character in front of a format code, which is structured
 * like X<...> and can contain nested format codes.
 * This consumes the whole format code, and any nested format codes, til
 * the end of matched production.
 * If "nomacro", then we don't print any macros, just contained data
 * (e.g., following "Sh" or "Nm").
 * "pos" is only significant in SYNOPSIS, and should be 0 when invoked
 * as the first format code on a line (for decoration as an "Nm"),
 * non-zero otherwise.
 * Returns 1 if a known, non-ignored format code was processed,
 * 0 otherwise.
 *
 * Output mode handling is most complicated here.
 * We may enter in any mode.
 * We usually exit in OUST_MAC mode, except when
 * entering without OUST_MAC and the code is invalid.
 */
static int
formatcode(struct state *st, const char *buf, size_t *start,
	size_t end, int nomacro, int pos)
{
	enum fmt	 fmt;
	size_t		 i, j, dsz;

	assert(*start + 1 < end);
	assert('<' == buf[*start + 1]);

	/*
	 * First, look up the format code.
	 * If it's not valid, treat it as a NOOP.
	 */
	for (fmt = 0; fmt < FMT__MAX; fmt++)
		if (buf[*start] == fmts[fmt])
			break;

	/*
	 * Determine whether we're overriding our delimiter.
	 * According to POD, if we have more than one '<' followed by a
	 * space, then we need a space followed by matching '>' to close
	 * the expression.
	 * Otherwise we use the usual '<' and '>' matched pair.
	 */
	i = *start + 1;
	while (i < end && '<' == buf[i])
		i++;
	assert(i > *start + 1);
	dsz = i - (*start + 1);
	if (dsz > 1 && (i >= end || ' ' != buf[i]))
		dsz = 1;

	/* Remember, if dsz>1, to jump the trailing space. */
	*start += dsz + 1 + (dsz > 1 ? 1 : 0);

	/*
	 * Escapes and ignored codes (NULL and INDEX) don't print macro
	 * sequences, so just output them like normal text before
	 * processing for real macros.
	 */
	if (FMT_ESCAPE == fmt) {
		formatescape(st, buf, start, end);
		return(0);
	} else if (FMT_NULL == fmt || FMT_INDEX == fmt) {
		/*
		 * Just consume til the end delimiter, accounting for
		 * whether it's a custom one.
		 */
		for ( ; *start < end; (*start)++) {
			if ('>' != buf[*start])
				continue;
			else if (dsz == 1)
				break;
			assert(*start > 0);
			if (' ' != buf[*start - 1])
				continue;
			i = *start;
			for (j = 0; i < end && j < dsz; j++)
				if ('>' != buf[i++])
					break;
			if (dsz != j)
				continue;
			/*
			 * Stop on the last '>' of the delimiter
			 * such that the code below steps over it.
			 */
			(*start) += dsz - 1;
			break;
		}
		if (*start < end) {
			assert('>' == buf[*start]);
			(*start)++;
		}
		/* Do not leave two blanks around the removed code. */
		if (isspace((unsigned char)last))
			while (*start < end &&
			    isspace((unsigned char)buf[*start]))
				(*start)++;
		return(0);
	}

	/*
	 * Check whether we're supposed to print macro stuff (this is
	 * suppressed in, e.g., "Nm" and "Sh" macros).
	 */
	if (FMT__MAX != fmt && !nomacro) {

		/*
		 * We may already have wantws if there was whitespace
		 * before the code, but initial whitespace inside our
		 * scope allows to break at this point as well.
		 */

		st->wantws |= (*start < end && ' ' == buf[*start]);

		/*
		 * If we are on a text line and there is no
		 * whitespace before our content, we have to make
		 * the previous word a prefix to the macro line.
		 * In the following, mdoc_newln() must not be used
		 * lest we clobber our output state.
		 */

		if (OUST_MAC != st->oust && !st->wantws) {
			if (OUST_NL != st->oust)
				putchar('\n');
			printf(".Pf ");
		}

		outbuf_flush(st);

		/* Whitespace is easier to suppress on macro lines. */

		if (OUST_MAC == st->oust && !st->wantws)
			printf(" Ns ");

		/* Unless we are on a macro line, start one. */

		if (OUST_MAC != st->oust && st->wantws) {
			if (OUST_NL != st->oust)
				putchar('\n');
			putchar('.');
		} else
			putchar(' ');

		/*
		 * Print the macro corresponding to this format code,
		 * and update the output state afterwards.
		 */

		switch (fmt) {
		case (FMT_ITALIC):
			printf("Em ");
			break;
		case (FMT_BOLD):
			if (SECT_SYNOPSIS == st->sect) {
				if (1 == dsz && *start < end &&
				    '-' == buf[*start])
					dosynopsisfl(buf, start, end);
				else if (0 == pos)
					printf("Nm ");
				else
					printf("Ar ");
				break;
			}
			/* Only look at bytes inside the paragraph. */
			if (end - *start > 4 &&
			    0 == strncmp(buf + *start, "NULL", 4) &&
			    ('=' == buf[*start + 4] ||
			     '>' == buf[*start + 4]))
				printf("Dv ");
			else
				printf("Sy ");
			break;
		case (FMT_CODE):
			printf("Qo Li ");
			break;
		case (FMT_LINK):
			/* Try to link; use "No" if it's empty. */
			if ( ! trylink(buf, start, end, dsz))
				printf("No ");
			break;
		case (FMT_FILE):
			printf("Pa ");
			break;
		case (FMT_NBSP):
			printf("No ");
			break;
		default:
			abort();
		}
		st->oust = OUST_MAC;
		st->wantws = 1;
	} else
		outbuf_flush(st);

	/*
	 * Process until we reach the end marker (e.g., '>') or until we
	 * find a nested format code.
	 * Don't emit any newlines: since we're on a macro line, we
	 * don't want to break the line.
	 */
	while (*start < end) {
		if ('>' == buf[*start] && 1 == dsz) {
			(*start)++;
			break;
		} else if ('>' == buf[*start] &&
				' ' == buf[*start - 1]) {
			/*
			 * Handle custom delimiters.
			 * These require a certain number of
			 * space-preceded carets before we're really at
			 * the end.
			 */
			i = *start;
			for (j = 0; i < end && j < dsz; j++)
				if ('>' != buf[i++])
					break;
			if (dsz == j) {
				*start += dsz;
				break;
			}
		}
		if (*start + 1 < end && '<' == buf[*start + 1] &&
		    'A' <= buf[*start] && 'Z' >= buf[*start]) {
			formatcode(st, buf, start, end, nomacro, 1);
			continue;
		}

		/* Suppress newlines and multiple spaces. */

		last = buf[(*start)++];
		if (' ' == last || '\n' == last) {
			putchar(' ');
			while (*start < end && ' ' == buf[*start])
				(*start)++;
			continue;
		}

		if (OUST_MAC == st->oust && FMT__MAX != fmt) {
			if ( ! st->wantws) {
				printf(" Ns ");
				st->wantws = 1;
			}

			/*
			 * Escape macro-like words.
			 * This matches "Xx " and "XxEOLN".
			 */

			if (end - *start > 0 &&
			    isupper((unsigned char)last) &&
			    islower((unsigned char)buf[*start]) &&
			    (end - *start == 1 ||
			     ' ' == buf[*start + 1] ||
			     '>' == buf[*start + 1]))
				printf("\\&");
		}

		putchar(last);

		/* Protect against character escapes. */

		if ('\\' == last)
			putchar('e');
	}

	if (FMT__MAX == fmt)
		return(0);

	if ( ! nomacro && FMT_CODE == fmt)
		printf(" Qc ");

	st->wantws = ' ' == last;
	return(1);
}

/*
 * Calls formatcode() til the end of a paragraph.
 * "linemac" is the macro that opens the line, without the dot.
 * Goes to OUST_MAC mode and stays there when returning,
 * such that the caller can add arguments to the macro line
 * before closing it out.
 */
static void
formatcodeln(struct state *st, const char *linemac,
	const char *buf, size_t *start, size_t end, int nomacro)
{
	int	 gotmacro, wantws;

	assert(OUST_NL == st->oust);
	assert(st->wantws);
	printf(".%s ", linemac);
	st->oust = OUST_MAC;

	gotmacro = 0;
	while (*start < end)  {
		/* Collapse a run of blanks into one pending blank. */
		wantws = ' ' == buf[*start] || '\n' == buf[*start];
		if (wantws) {
			last = ' ';
			do {
				(*start)++;
			} while (*start < end && ' ' == buf[*start]);
		}

		if (*start + 1 < end && '<' == buf[*start + 1] &&
		    'A' <= buf[*start] && 'Z' >= buf[*start]) {
			st->wantws |= wantws;
			gotmacro = formatcode(st, buf,
			    start, end, nomacro, 1);
			continue;
		}

		/*
		 * Plain text after a macro: go back to normal text,
		 * with ("No") or without ("Ns") intervening space.
		 */
		if (gotmacro) {
			if (*start < end || st->outbuflen) {
				if (st->wantws ||
				    (wantws && !st->outbuflen))
					printf(" No ");
				else
					printf(" Ns ");
			}
			gotmacro = 0;
		}
		outbuf_flush(st);
		st->wantws = wantws;

		if (*start >= end)
			break;

		if (st->wantws) {
			putchar(' ');
			st->wantws = 0;
		}

		/*
		 * Since we're already on a macro line, we want to make
		 * sure that we don't inadvertently invoke a macro.
		 * We need to do this carefully because section names
		 * are used in troff and we don't want to escape
		 * something that needn't be escaped.
		 */
		if (' ' == last && end - *start > 1 &&
		    isupper((unsigned char)buf[*start]) &&
		    islower((unsigned char)buf[*start + 1]) &&
		    (end - *start == 2 ||
		     ' ' == buf[*start + 2]))
			printf("\\&");

		putchar(last = buf[*start]);

		/* Protect against character escapes. */

		if ('\\' == last)
			putchar('e');

		(*start)++;
	}
}

/*
 * Guess at what kind of list we are.
 * These are taken straight from the POD manual.
 * I don't know what people do in real life.
 */
static enum list
listguess(const char *buf, size_t start, size_t end)
{
	size_t		 len = end - start;

	assert(end >= start);

	if (len == 1 && '*' == buf[start])
		return(LIST_BULLET);
	if (len == 2 && '1' == buf[start] && '.' == buf[start + 1])
		return(LIST_ENUM);
	else if (len == 1 && '1' == buf[start])
		return(LIST_ENUM);
	else
		return(LIST_TAG);
}

/*
 * A command paragraph, as noted in the perlpod manual, just indicates
 * that we should do something, optionally with some text to print as
 * well.
 * From the perspective of external callers,
 * always stays in OUST_NL/wantws mode,
 * but its children do use OUST_MAC.
 */
static void
command(struct state *st, const char *buf, size_t start, size_t end)
{
	size_t		 len, csz;
	enum cmd	 cmd;

	assert('=' == buf[start]);
	start++;
	len = end - start;

	/* The first command name that is a prefix of the text wins. */
	for (cmd = 0; cmd < CMD__MAX; cmd++) {
		csz = strlen(cmds[cmd]);
		if (len < csz)
			continue;
		if (0 == memcmp(&buf[start], cmds[cmd], csz))
			break;
	}

	/* Ignore bogus commands. */

	if (CMD__MAX == cmd)
		return;

	start += csz;
	while (start < end && ' ' == buf[start])
		start++;

	/* Inside =begin, the only thing of interest is =end. */
	if (st->paused) {
		st->paused = CMD_END != cmd;
		return;
	}

	switch (cmd) {
	case (CMD_POD):
		break;
	case (CMD_HEAD1):
		/*
		 * The behaviour of head= follows from a quick glance at
		 * how pod2man handles it.
		 */
		st->sect = SECT_NONE;
		if (end - start == 4) {
			if (0 == memcmp(&buf[start], "NAME", 4))
				st->sect = SECT_NAME;
		} else if (end - start == 8) {
			if (0 == memcmp(&buf[start], "SYNOPSIS", 8))
				st->sect = SECT_SYNOPSIS;
		}
		formatcodeln(st, "Sh", buf, &start, end, 1);
		mdoc_newln(st);
		st->haspar = 1;
		break;
	case (CMD_HEAD2):
		formatcodeln(st, "Ss", buf, &start, end, 1);
		mdoc_newln(st);
		st->haspar = 1;
		break;
	case (CMD_HEAD3):
		puts(".Pp");
		formatcodeln(st, "Em", buf, &start, end, 0);
		mdoc_newln(st);
		puts(".Pp");
		st->haspar = 1;
		break;
	case (CMD_HEAD4):
		puts(".Pp");
		formatcodeln(st, "No", buf, &start, end, 0);
		mdoc_newln(st);
		puts(".Pp");
		st->haspar = 1;
		break;
	case (CMD_OVER):
		/*
		 * If we have an existing list that hasn't had an =item
		 * yet, then make sure that we open it now.
		 * We use the default list type, but that can't be
		 * helped (we haven't seen any items yet).
		 */
		if (st->lpos > 0)
			if (LIST__MAX == st->lstack[st->lpos - 1]) {
				st->lstack[st->lpos - 1] = LIST_TAG;
				puts(".Bl -tag -width Ds");
			}
		st->lpos++;
		assert(st->lpos < LIST_STACKSZ);
		st->lstack[st->lpos - 1] = LIST__MAX;
		break;
	case (CMD_ITEM):
		if (0 == st->lpos) {
			/*
			 * Bad markup.
			 * Try to compensate.
			 */
			st->lstack[st->lpos] = LIST__MAX;
			st->lpos++;
		}
		assert(st->lpos > 0);
		/*
		 * If we're the first =item, guess at what our content
		 * will be: "*" is a bullet list, "1." is a numbered
		 * list, and everything is tagged.
		 */
		if (LIST__MAX == st->lstack[st->lpos - 1]) {
			st->lstack[st->lpos - 1] =
				listguess(buf, start, end);
			switch (st->lstack[st->lpos - 1]) {
			case (LIST_BULLET):
				puts(".Bl -bullet");
				break;
			case (LIST_ENUM):
				puts(".Bl -enum");
				break;
			default:
				puts(".Bl -tag -width Ds");
				break;
			}
		}
		switch (st->lstack[st->lpos - 1]) {
		case (LIST_TAG):
			formatcodeln(st, "It", buf, &start, end, 0);
			mdoc_newln(st);
			break;
		case (LIST_ENUM):
			/* FALLTHROUGH */
		case (LIST_BULLET):
			/*
			 * Abandon the remainder of the paragraph
			 * because we're going to be a bulletted or
			 * numbered list.
			 */
			puts(".It");
			break;
		default:
			abort();
		}
		st->haspar = 1;
		break;
	case (CMD_BACK):
		/* Make sure we don't back over the stack. */
		if (st->lpos > 0) {
			st->lpos--;
			puts(".El");
		}
		break;
	case (CMD_BEGIN):
		/*
		 * We disregard all types for now.
		 * TODO: process at least "text" in a -literal block.
		 */
		st->paused = 1;
		break;
	case (CMD_END):
		/*
		 * Bad markup: =end without a preceding =begin.
		 * There is nothing to close, so just ignore it
		 * rather than aborting on user input.
		 */
		break;
	case (CMD_FOR):
		/*
		 * We ignore all types of encodings and formats
		 * unilaterally.
		 */
		break;
	case (CMD_ENCODING):
		break;
	case (CMD_CUT):
		st->parsing = 0;
		return;
	default:
		abort();
	}

	/* Any command (but =cut) makes us start parsing. */
	st->parsing = 1;
}

/*
 * Just pump out the line in a verbatim block.
 * From the perspective of external callers,
 * always stays in OUST_NL/wantws mode.
 */
static void
verbatim(struct state *st, const char *buf, size_t start, size_t end)
{
	size_t		 i;

	if ( ! st->parsing || st->paused)
		return;
again:
	/*
	 * If we're in the SYNOPSIS, see if we're an #include block.
	 * If we are, then print the "In" macro and re-loop.
	 * This handles any number of inclusions, but only when they
	 * come before the remaining parts...
	 */
	if (SECT_SYNOPSIS == st->sect) {
		for (i = start; i < end && ' ' == buf[i]; i++)
			/* Spin. */ ;
		if (i == end)
			return;
		/* We're an include block! */
		if (end - i > 10 &&
			0 == memcmp(&buf[i], "#include <", 10)) {
			start = i + 10;
			while (start < end && ' ' == buf[start])
				start++;
			fputs(".In ", stdout);
			/* Stop til the '>' marker or we hit eoln. */
			while (start < end &&
				'>' != buf[start] && '\n' != buf[start])
				putchar(buf[start++]);
			putchar('\n');
			if (start < end && '>' == buf[start])
				start++;
			if (start < end && '\n' == buf[start])
				start++;
			if (start < end)
				goto again;
			return;
		}
	}

	if (start == end)
		return;
	puts(".Bd -literal");
	for (last = ' '; start < end; start++) {
		/*
		 * Handle accidental macros (newline starting with
		 * control character) and escapes.
		 */
		if ('\n' == last)
			if ('.' == buf[start] || '\'' == buf[start])
				printf("\\&");
		putchar(last = buf[start]);
		if ('\\' == buf[start])
			printf("e");
	}
	putchar(last = '\n');
	puts(".Ed");
}

/*
 * See dosynopsisop().
 * Returns 1 if the text in [start, end) contains the closing bracket
 * for a bracket opened just before "start", honouring nesting.
 */
static int
hasmatch(const char *buf, size_t start, size_t end)
{
	size_t	 stack;

	for (stack = 0; start < end; start++)
		if (buf[start] == '[')
			stack++;
		else if (buf[start] == ']' && 0 == stack)
			return(1);
		else if (buf[start] == ']')
			stack--;
	return(0);
}

/*
 * If we're in the SYNOPSIS section and we've encountered brackets in
 * an ordinary paragraph, then try to see whether we're an [-option].
 * Do this, if we're an opening bracket, by first seeing if we have a
 * matching end via hasmatch().
 * If we're an ending bracket, see if we have a stack already.
 * Returns 1 if the bracket was consumed and translated to "Oo" or
 * "Oc", or 0 if the caller should treat it as ordinary text.
 */
static int
dosynopsisop(struct state *st, const char *buf,
	size_t *start, size_t end, size_t *opstack)
{

	assert('[' == buf[*start] || ']' == buf[*start]);

	if ('[' == buf[*start] && hasmatch(buf, *start + 1, end)) {
		mdoc_newln(st);
		puts(".Oo");
		(*opstack)++;
	} else if ('[' == buf[*start])
		return(0);

	if (']' == buf[*start] && *opstack > 0) {
		mdoc_newln(st);
		puts(".Oc");
		(*opstack)--;
	} else if (']' == buf[*start])
		return(0);

	(*start)++;
	last = '\n';
	/* Skip blanks after the bracket, staying in the paragraph. */
	while (*start < end && ' ' == buf[*start])
		(*start)++;
	return(1);
}

/*
 * Format multiple "Nm" manpage names in the NAME section.
 * The names are separated by commas; each one gets its own "Nm" line.
 * From the perspective of external callers,
 * always stays in OUST_NL/wantws mode,
 * but its children do use OUST_MAC.
 */
static void
donamenm(struct state *st, const char *buf, size_t *start, size_t end)
{
	size_t		 word;

	assert(OUST_NL == st->oust);
	assert(st->wantws);

	while (*start < end && ' ' == buf[*start])
		(*start)++;

	if (end == *start) {
		puts(".Nm unknown");
		return;
	}

	while (*start < end) {
		for (word = *start; word < end; word++)
			if (',' == buf[word])
				break;
		formatcodeln(st, "Nm", buf, start, word, 1);
		if (*start == end) {
			mdoc_newln(st);
			break;
		}
		assert(',' == buf[*start]);
		printf(" ,");
		mdoc_newln(st);
		(*start)++;
		while (*start < end && ' ' == buf[*start])
			(*start)++;
	}
}

/*
 * Ordinary paragraph.
 * Well, this is really the hardest--POD seems to assume that, for
 * example, a leading space implies a newline, and so on.
 * Lots of other snakes in the grass: escaping a newline followed by a
 * period (accidental mdoc(7) control), double-newlines after macro
 * passages, etc.
 *
 * Uses formatcode() to go to OUST_MAC mode
 * and outbuf_flush() to go to OUST_TXT mode.
 * Main text mode wantws handling is in this function.
 * Must make sure to go back to OUST_NL/wantws mode before returning.
 */
static void
ordinary(struct state *st, const char *buf, size_t start, size_t end)
{
	size_t		 i, j, opstack;
	int		 seq;

	if ( ! st->parsing || st->paused)
		return;

	/*
	 * Special-case: the NAME section.
	 * If we find a "-" when searching from the end, assume that
	 * we're in "name - description" format.
	 * To wit, print out a "Nm" and "Nd" in that format.
	 * A paragraph shorter than two bytes cannot have that format.
	 */
	if (SECT_NAME == st->sect && end - start > 1) {
		for (i = end - 2; i > start; i--)
			if ('-' == buf[i] && ' ' == buf[i + 1])
				break;
		if ('-' == buf[i]) {
			j = i;
			/* Roll over multiple "-". */
			for ( ; i > start; i--)
				if ('-' != buf[i])
					break;
			donamenm(st, buf, &start, i + 1);
			start = j + 1;
			while (start < end && ' ' == buf[start])
				start++;
			formatcodeln(st, "Nd", buf, &start, end, 1);
			mdoc_newln(st);
			return;
		}
	}

	if ( ! st->haspar)
		puts(".Pp");

	st->haspar = 0;
	last = '\n';
	opstack = 0;

	for (seq = 0; start < end; seq++) {
		/*
		 * Loop til we get either to a newline or escape.
		 * Escape initial control characters.
		 */
		while (start < end) {
			if (start < end - 1 && '<' == buf[start + 1] &&
			    'A' <= buf[start] && 'Z' >= buf[start])
				break;
			else if ('\n' == buf[start])
				break;
			else if ('\n' == last && '.' == buf[start])
				outbuf_addstr(st, "\\&");
			else if ('\n' == last && '\'' == buf[start])
				outbuf_addstr(st, "\\&");
			/*
			 * If we're in the SYNOPSIS, have square
			 * brackets indicate that we're opening and
			 * closing an optional context.
			 */

			if (SECT_SYNOPSIS == st->sect &&
				('[' == buf[start] ||
				 ']' == buf[start]) &&
				dosynopsisop(st, buf,
				    &start, end, &opstack))
				continue;

			/*
			 * On whitespace, flush the output buffer
			 * and allow breaking to a macro line.
			 * Otherwise, buffer text and clear wantws.
			 */

			last = buf[start++];
			if (' ' == last) {
				outbuf_flush(st);
				putchar(' ');
				st->wantws = 1;
			} else
				outbuf_addchar(st);
		}

		if (start < end - 1 && '<' == buf[start + 1] &&
		    'A' <= buf[start] && 'Z' >= buf[start]) {
			formatcode(st, buf, &start, end, 0, seq);
			if (OUST_MAC == st->oust) {
				/*
				 * Let mdoc(7) handle trailing punctuation.
				 * XXX Some punctuation characters
				 *     are not handled yet.
				 */
				if ((start == end - 1 ||
					(start < end - 1 &&
					 (' ' == buf[start + 1] ||
					  '\n' == buf[start + 1]))) &&
					('.' == buf[start] ||
					 ',' == buf[start])) {
					putchar(' ');
					putchar(buf[start++]);
				}

				/*
				 * The end of the paragraph closes the
				 * macro line just like whitespace does;
				 * do not look at bytes beyond it.
				 */
				if (st->wantws || start >= end ||
				    ' ' == buf[start] ||
				    '\n' == buf[start])
					mdoc_newln(st);

				/*
				 * Consume all whitespace
				 * so we don't accidentally start
				 * an implicit literal line.
				 */

				while (start < end && ' ' == buf[start])
					start++;

				/*
				 * Some text is following.
				 * Implement requested spacing.
				 */

				if ( ! st->wantws && start < end &&
				    (start + 1 >= end ||
				     '<' != buf[start + 1] ||
				     'A' > buf[start] ||
				     'Z' < buf[start])) {
					printf(" Ns ");
					st->wantws = 1;
				}
			}
		} else if (start < end && '\n' == buf[start]) {
			outbuf_flush(st);
			mdoc_newln(st);
			if (++start >= end)
				continue;
			/*
			 * If we have whitespace next, eat it to prevent
			 * mdoc(7) from thinking that it's meant for
			 * verbatim text.
			 * It is--but if we start with that, we can't
			 * have a macro subsequent it, which may be
			 * possible if we have an escape next.
			 */
			if (' ' == buf[start] || '\t' == buf[start])
				puts(".br");
			for ( ; start < end; start++)
				if (' ' != buf[start] && '\t' != buf[start])
					break;
		}
	}
	outbuf_flush(st);
	mdoc_newln(st);
}

/*
 * There are three kinds of paragraphs: verbatim (starts with whitespace
 * of some sort), ordinary (starts without "=" marker), or a command
 * (default: starts with "=").
 */
static void
dopar(struct state *st, const char *buf, size_t start, size_t end)
{

	assert(OUST_NL == st->oust);
	assert(st->wantws);

	if (end == start)
		return;
	if (' ' == buf[start] || '\t' == buf[start])
		verbatim(st, buf, start, end);
	else if ('=' != buf[start])
		ordinary(st, buf, start, end);
	else
		command(st, buf, start, end);
}

/*
 * Loop around paragraphs within a document, processing each one in the
 * POD way.
 * Prints the mdoc(7) preamble first; "tm" supplies the date unless
 * one was given on the command line.
 */
static void
dofile(const struct args *args, const char *fname,
	const struct tm *tm, const char *buf, size_t sz)
{
	char		 datebuf[64];
	struct state	 st;
	const char	*fbase, *fext, *section, *date;
	char		*title, *cp;
	size_t		 sup, end, i, cur = 0;

	if (0 == sz)
		return;

	/*
	 * Parsing the filename is almost always required,
	 * except when both the title and the section
	 * are provided on the command line.
	 */

	fbase = fname;
	fext = NULL;
	if (NULL == args->title || NULL == args->section) {
		fbase = strrchr(fname, '/');
		if (NULL == fbase)
			fbase = fname;
		else
			fbase++;
		fext = strrchr(fbase, '.');
	}

	/*
	 * The title will be converted to uppercase,
	 * so it needs to be copied.
	 */

	title = (NULL != args->title) ? strdup(args->title) :
		(NULL != fext) ? strndup(fbase, fext - fbase) :
		strdup(fbase);

	if (NULL == title) {
		perror(NULL);
		exit(EXIT_FAILURE);
	}

	/* Section is 1 unless suffix is "pm". */

	section = (NULL != args->section) ? args->section :
	    (NULL == fext || strcmp(fext + 1, "pm")) ? "1" :
	    PERL_SECTION;

	/* Date.  Or the given "tm" if not supplied. */

	if (NULL == (date = args->date)) {
		strftime(datebuf, sizeof(datebuf), "%B %d, %Y", tm);
		date = datebuf;
	}

	for (cp = title; '\0' != *cp; cp++)
		*cp = toupper((unsigned char)*cp);

	/* The usual mdoc(7) preamble. */

	printf(".Dd %s\n", date);
	printf(".Dt %s %s\n", title, section);
	puts(".Os");

	free(title);

	memset(&st, 0, sizeof(struct state));
	st.fname = fname;
	st.oust = OUST_NL;
	st.wantws = 1;

	assert(sz > 0);

	/* Main loop over file contents. */

	while (cur < sz) {
		/* Read until next paragraph. */
		for (i = cur + 1; i < sz; i++)
			if ('\n' == buf[i] && '\n' == buf[i - 1]) {
				/* Consume blank paragraphs. */
				while (i + 1 < sz && '\n' == buf[i + 1])
					i++;
				break;
			}

		/* Adjust end marker for EOF. */
		end = i < sz ? i - 1 :
			('\n' == buf[sz - 1] ? sz - 1 : sz);
		sup = i < sz ? end + 2 : sz;

		/* Process paragraph and adjust start. */
		dopar(&st, buf, cur, end);
		cur = sup;
	}

	/* The output buffer is private to this conversion. */
	free(st.outbuf);
}

/*
 * Read a single file fully into memory.
 * If the file is "-", do it from stdin.
 * If successfully read, send the input buffer to dofile() for further
 * processing.
 * Returns 1 on success or 0 if the file could not be opened or read.
 */
static int
readfile(const struct args *args, const char *fname)
{
	int		 fd, rc;
	char		*buf;
	size_t		 bufsz, cur;
	ssize_t		 ssz;
	struct tm	*tm;
	time_t		 ttm;
	struct stat	 st;

	fd = 0 != strcmp("-", fname) ?
		open(fname, O_RDONLY, 0) : STDIN_FILENO;

	if (-1 == fd) {
		perror(fname);
		return(0);
	}

	/* Use the modification time of regular input, else "now". */
	if (STDIN_FILENO == fd || -1 == fstat(fd, &st)) {
		ttm = time(NULL);
		tm = localtime(&ttm);
	} else
		tm = localtime(&st.st_mtime);

	/*
	 * Arbitrarily-sized initial buffer.
	 * Should be big enough for most files...
	 */
	cur = 0;
	bufsz = 1 << 14;
	if (NULL == (buf = malloc(bufsz))) {
		perror(NULL);
		exit(EXIT_FAILURE);
	}

	while ((ssz = read(fd, buf + cur, bufsz - cur)) > 0) {
		/* Double buffer size on fill. */
		if ((size_t)ssz == bufsz - cur)  {
			bufsz *= 2;
			if (NULL == (buf = realloc(buf, bufsz))) {
				perror(NULL);
				exit(EXIT_FAILURE);
			}
		}
		cur += (size_t)ssz;
	}

	if (ssz < 0) {
		perror(fname);
		rc = 0;
	} else {
		dofile(args, STDIN_FILENO == fd ?
			"STDIN" : fname, tm, buf, cur);
		rc = 1;
	}

	/* Release resources on the error path as well. */
	free(buf);
	if (STDIN_FILENO != fd)
		close(fd);
	return(rc);
}

/*
 * Parse the command line and convert at most one input file
 * (standard input by default) to mdoc(7) on standard output.
 */
int
main(int argc, char *argv[])
{
	const char	*fname, *name;
	struct args	 args;
	int		 c;

	name = strrchr(argv[0], '/');
	if (name == NULL)
		name = argv[0];
	else
		++name;

	memset(&args, 0, sizeof(struct args));
	fname = "-";

	/*
	 * Only -d, -n and -s have an effect.
	 * The remaining options are accepted and ignored.
	 */
	while (-1 != (c = getopt(argc, argv, "c:d:hln:oq:rs:uv")))
		switch (c) {
		case ('h'):
			/* FALLTHROUGH */
		case ('l'):
			/* FALLTHROUGH */
		case ('c'):
			/* FALLTHROUGH */
		case ('o'):
			/* FALLTHROUGH */
		case ('q'):
			/* FALLTHROUGH */
		case ('r'):
			/* FALLTHROUGH */
		case ('u'):
			/* FALLTHROUGH */
		case ('v'):
			/* Ignore these. */
			break;
		case ('d'):
			args.date = optarg;
			break;
		case ('n'):
			args.title = optarg;
			break;
		case ('s'):
			args.section = optarg;
			break;
		default:
			goto usage;
		}

	argc -= optind;
	argv += optind;

	/* Accept only a single input file. */

	if (argc > 1)
		goto usage;
	else if (1 == argc)
		fname = *argv;

	return(readfile(&args, fname) ?
		EXIT_SUCCESS : EXIT_FAILURE);

usage:
	fprintf(stderr, "usage: %s [-d date] "
	    "[-n title] [-s section] [file]\n", name);

	return(EXIT_FAILURE);
}