=================================================================== RCS file: /cvs/pod2mdoc/pod2mdoc.c,v retrieving revision 1.7 retrieving revision 1.21 diff -u -p -r1.7 -r1.21 --- pod2mdoc/pod2mdoc.c 2014/03/23 23:35:59 1.7 +++ pod2mdoc/pod2mdoc.c 2014/04/03 11:55:01 1.21 @@ -1,4 +1,4 @@ -/* $Id: pod2mdoc.c,v 1.7 2014/03/23 23:35:59 kristaps Exp $ */ +/* $Id: pod2mdoc.c,v 1.21 2014/04/03 11:55:01 kristaps Exp $ */ /* * Copyright (c) 2014 Kristaps Dzonsons * @@ -26,6 +26,13 @@ #include #include +/* + * In what section can we find Perl module manuals? + * Sometimes (Mac OS X) it's 3pm, sometimes (OpenBSD, etc.) 3p. + * XXX IF YOU CHANGE THIS, CHANGE POD2MDOC.1 AS WELL. + */ +#define PERL_SECTION "3p" + struct args { const char *title; /* override "Dt" title */ const char *date; /* override "Dd" date */ @@ -39,11 +46,17 @@ enum list { LIST__MAX }; +enum sect { + SECT_NONE = 0, + SECT_NAME, /* NAME section */ + SECT_SYNOPSIS, /* SYNOPSIS section */ +}; + struct state { int parsing; /* after =cut of before command */ int paused; /* in =begin and before =end */ int haspar; /* in paragraph: do we need Pp? */ - int isname; /* are we the NAME section? */ + enum sect sect; /* which section are we in? */ const char *fname; /* file being parsed */ #define LIST_STACKSZ 128 enum list lstack[LIST_STACKSZ]; /* open lists */ @@ -158,20 +171,201 @@ formatescape(const char *buf, size_t *start, size_t en } /* - * Skip space characters. + * Run some heuristics to intuit a link format. + * I set "start" to be the end of the sequence (last right angle bracket) so + * that the caller can safely just continue processing. + * If this is just an empty tag, I'll return 0; otherwise I print a macro and return 1. 
*/ static int -skipspace(const char *buf, size_t *start, size_t end) +trylink(const char *buf, size_t *start, size_t end, size_t dsz) { - size_t sv = *start; + size_t linkstart, realend, linkend, + i, j, textsz, stack; + const char *text; - while (*start < end && ' ' == buf[*start]) - (*start)++; + /* + * Scan to the start of the terminus. + * This function is more or less replicated in the formatcode() + * for null or index formatting codes. + */ + stack = 0; + for (linkstart = realend = *start; realend < end; realend++) { + if ('>' != buf[realend]) + continue; + else if (dsz == 1) + break; + assert(realend > 0); + if (' ' != buf[realend - 1]) + continue; + for (i = realend, j = 0; i < end && j < dsz; j++) + if ('>' != buf[i++]) + break; + if (dsz == j) + break; + } - return(*start > sv); + /* Ignore stubs. */ + if (realend == end || realend == *start) + return(0); + + /* Set linkend to the end of content. */ + linkend = dsz > 1 ? realend - 1 : realend; + + /* Re-scan to see if we have a title or section. */ + text = &buf[*start]; + for (textsz = *start; textsz < linkend; textsz++) + if ('|' == buf[textsz] || '/' == buf[textsz]) + break; + + if (textsz < linkend && '|' == buf[textsz]) { + /* With title: set start, then end at section. */ + linkstart = textsz + 1; + textsz = textsz - *start; + for (i = linkstart; i < linkend; i++) + if ('/' == buf[i]) + break; + if (i < linkend) + linkend = i; + } else if (textsz < linkend && '/' == buf[textsz]) { + /* With section: set end at section. */ + linkend = textsz; + textsz = 0; + } else + /* No title, no section. */ + textsz = 0; + + *start = realend; + j = linkend - linkstart; + + /* Do we have only subsection material? */ + if (0 == j && '/' == buf[linkend]) { + linkstart = linkend + 1; + linkend = dsz > 1 ? 
realend - 1 : realend; + if (0 == (j = linkend - linkstart)) + return(0); + printf("Sx %.*s", (int)j, &buf[linkstart]); + return(1); + } else if (0 == j) + return(0); + + /* See if we start with a URI scheme; compare the scheme prefix only, never past the literal. */ + if ((j > 4 && 0 == memcmp("http:", &buf[linkstart], 5)) || + (j > 5 && 0 == memcmp("https:", &buf[linkstart], 6)) || + (j > 3 && 0 == memcmp("ftp:", &buf[linkstart], 4)) || + (j > 4 && 0 == memcmp("sftp:", &buf[linkstart], 5)) || + (j > 3 && 0 == memcmp("smb:", &buf[linkstart], 4)) || + (j > 3 && 0 == memcmp("afs:", &buf[linkstart], 4))) { + /* Gross: the '/' scan above may have cut linkend short, so print through the real end. */ + printf("Lk %.*s", (int)((dsz > 1 ? realend - 1 : + realend) - linkstart), &buf[linkstart]); + return(1); + } + + /* See if we qualify as a mailto (prefix match only). */ + if (j > 6 && 0 == memcmp("mailto:", &buf[linkstart], 7)) { + printf("Mt %.*s", (int)j, &buf[linkstart]); + return(1); + } + + /* See if we're a foo(5), foo(5x), or foo(5xx) manpage. */ + if ((j > 3 && ')' == buf[linkend - 1]) && + ('(' == buf[linkend - 3])) { + printf("Xr %.*s %c", (int)(j - 3), + &buf[linkstart], buf[linkend - 2]); + return(1); + } else if ((j > 4 && ')' == buf[linkend - 1]) && + ('(' == buf[linkend - 4])) { + printf("Xr %.*s %.*s", (int)(j - 4), + &buf[linkstart], 2, &buf[linkend - 3]); + return(1); + } else if ((j > 5 && ')' == buf[linkend - 1]) && + ('(' == buf[linkend - 5])) { + printf("Xr %.*s %.*s", (int)(j - 5), + &buf[linkstart], 3, &buf[linkend - 4]); + return(1); + } + + /* Last try: do we have a double-colon? */ + for (i = linkstart + 1; i < linkend; i++) + if (':' == buf[i] && ':' == buf[i - 1]) + break; + + if (i < linkend) + printf("Xr %.*s " PERL_SECTION, + (int)j, &buf[linkstart]); + else + printf("Xr %.*s 1", (int)j, &buf[linkstart]); + + return(1); } /* + * Doclifting: if we're a bold "-xx" and we're in the SYNOPSIS section, + * then it's likely that we're a flag. + * Our flag might be followed by an argument, so make sure that we're + * accounting for that, too. 
+ * If we don't have a flag at all, however, then assume we're an "Ar". + */ +static void +dosynopsisfl(const char *buf, size_t *start, size_t end) +{ + size_t i; +again: + assert(*start + 1 < end); + assert('-' == buf[*start]); + + if ( ! isalnum((unsigned char)buf[*start + 1]) && + '?' != buf[*start + 1] && + '-' != buf[*start + 1]) { + (*start)--; + fputs("Ar ", stdout); + return; + } + + (*start)++; + for (i = *start; i < end; i++) + if (isalnum((unsigned char)buf[i])) + continue; + else if ('?' == buf[i]) + continue; + else if ('-' == buf[i]) + continue; + else if ('_' == buf[i]) + continue; + else + break; + + assert(i < end); + + if ( ! (' ' == buf[i] || '>' == buf[i])) { + printf("Ar "); + return; + } + + printf("Fl "); + if (end - *start > 1 && + isupper((unsigned char)buf[*start]) && + islower((unsigned char)buf[*start + 1]) && + (end - *start == 2 || + ' ' == buf[*start + 2])) + printf("\\&"); + printf("%.*s ", (int)(i - *start), &buf[*start]); + *start = i; + + if (' ' == buf[i]) { + while (i < end && ' ' == buf[i]) + i++; + assert(i < end); + if ('-' == buf[i]) { + *start = i; + goto again; + } + printf("Ar "); + *start = i; + } +} + +/* * We're at the character in front of a format code, which is structured * like X<...> and can contain nested format codes. * This consumes the whole format code, and any nested format codes, til @@ -180,13 +374,16 @@ skipspace(const char *buf, size_t *start, size_t end) * been printed to the current line. * If "nomacro", then we don't print any macros, just contained data * (e.g., following "Sh" or "Nm"). + * "pos" is only significant in SYNOPSIS, and should be 0 when invoked + * as the first format code on a line (for decoration as an "Nm"), + * non-zero otherwise. * Return whether we've printed a macro or not--in other words, whether * this should trigger a subsequent newline (this should be ignored when * reentrant). 
*/ static int -formatcode(const char *buf, size_t *start, - size_t end, int reentrant, int nomacro) +formatcode(struct state *st, const char *buf, size_t *start, + size_t end, int reentrant, int nomacro, int pos) { enum fmt fmt; size_t i, j, dsz; @@ -204,6 +401,8 @@ formatcode(const char *buf, size_t *start, if (FMT__MAX == fmt) { putchar(last = buf[(*start)++]); + if ('\\' == last) + putchar('e'); return(0); } @@ -290,19 +489,29 @@ formatcode(const char *buf, size_t *start, printf("Em "); break; case (FMT_BOLD): + if (SECT_SYNOPSIS == st->sect) { + if (1 == dsz && '-' == buf[*start]) + dosynopsisfl(buf, start, end); + else if (0 == pos) + printf("Nm "); + else + printf("Ar "); + break; + } printf("Sy "); break; case (FMT_CODE): printf("Qo Li "); break; case (FMT_LINK): - printf("Lk "); + /* Try to link; use "No" if it's empty. */ + if ( ! trylink(buf, start, end, dsz)) + printf("No "); break; case (FMT_FILE): printf("Pa "); break; case (FMT_NBSP): - /* TODO. */ printf("No "); break; default: @@ -338,7 +547,7 @@ formatcode(const char *buf, size_t *start, } } if (*start + 1 < end && '<' == buf[*start + 1]) { - formatcode(buf, start, end, 1, nomacro); + formatcode(st, buf, start, end, 1, nomacro, 1); continue; } @@ -363,6 +572,10 @@ formatcode(const char *buf, size_t *start, else putchar(last = buf[*start]); + /* Protect against character escapes. */ + if ('\\' == last) + putchar('e'); + (*start)++; if (' ' == last) @@ -389,13 +602,14 @@ formatcode(const char *buf, size_t *start, * Calls formatcode() til the end of a paragraph. 
*/ static void -formatcodeln(const char *buf, size_t *start, size_t end, int nomacro) +formatcodeln(struct state *st, const char *buf, + size_t *start, size_t end, int nomacro) { last = ' '; while (*start < end) { if (*start + 1 < end && '<' == buf[*start + 1]) { - formatcode(buf, start, end, 1, nomacro); + formatcode(st, buf, start, end, 1, nomacro, 1); continue; } /* @@ -412,10 +626,15 @@ formatcodeln(const char *buf, size_t *start, size_t en ' ' == buf[*start + 2])) printf("\\&"); - if ('\n' != buf[*start]) - putchar(last = buf[*start]); - else + if ('\n' == buf[*start]) putchar(last = ' '); + else + putchar(last = buf[*start]); + + /* Protect against character escapes. */ + if ('\\' == last) + putchar('e'); + (*start)++; } } @@ -471,7 +690,9 @@ command(struct state *st, const char *buf, size_t star return; start += csz; - skipspace(buf, &start, end); + while (start < end && ' ' == buf[start]) + start++; + len = end - start; if (st->paused) { @@ -488,24 +709,28 @@ command(struct state *st, const char *buf, size_t star * how pod2man handles it. 
*/ printf(".Sh "); - st->isname = 0; - if (end - start == 4) + st->sect = SECT_NONE; + if (end - start == 4) { if (0 == memcmp(&buf[start], "NAME", 4)) - st->isname = 1; - formatcodeln(buf, &start, end, 1); + st->sect = SECT_NAME; + } else if (end - start == 8) { + if (0 == memcmp(&buf[start], "SYNOPSIS", 8)) + st->sect = SECT_SYNOPSIS; + } + formatcodeln(st, buf, &start, end, 1); putchar('\n'); st->haspar = 1; break; case (CMD_HEAD2): printf(".Ss "); - formatcodeln(buf, &start, end, 1); + formatcodeln(st, buf, &start, end, 1); putchar('\n'); st->haspar = 1; break; case (CMD_HEAD3): puts(".Pp"); printf(".Em "); - formatcodeln(buf, &start, end, 0); + formatcodeln(st, buf, &start, end, 0); putchar('\n'); puts(".Pp"); st->haspar = 1; @@ -513,7 +738,7 @@ command(struct state *st, const char *buf, size_t star case (CMD_HEAD4): puts(".Pp"); printf(".No "); - formatcodeln(buf, &start, end, 0); + formatcodeln(st, buf, &start, end, 0); putchar('\n'); puts(".Pp"); st->haspar = 1; @@ -567,7 +792,7 @@ command(struct state *st, const char *buf, size_t star switch (st->lstack[st->lpos - 1]) { case (LIST_TAG): printf(".It "); - formatcodeln(buf, &start, end, 0); + formatcodeln(st, buf, &start, end, 0); putchar('\n'); break; case (LIST_ENUM): @@ -624,23 +849,118 @@ command(struct state *st, const char *buf, size_t star static void verbatim(struct state *st, const char *buf, size_t start, size_t end) { - size_t sv = start; + int last; if ( ! st->parsing || st->paused) return; puts(".Bd -literal"); - while (start < end) { - if (start > sv && '\n' == buf[start - 1]) + for (last = ' '; start < end; start++) { + /* + * Guard against accidental macros (a line starting with a + * period or apostrophe) and follow each backslash with an e. + */ + if ('\n' == last) if ('.' == buf[start] || '\'' == buf[start]) printf("\\&"); - putchar(buf[start++]); + putchar(last = buf[start]); + if ('\\' == buf[start]) + printf("e"); } putchar('\n'); puts(".Ed"); } /* + * Return 1 if a closing bracket is found in the range before any unclosed nested opening bracket pair, else 0; see dosynopsisop(). 
+ */ +static int +hasmatch(const char *buf, size_t start, size_t end) +{ + size_t stack; + + for (stack = 0; start < end; start++) + if (buf[start] == '[') + stack++; + else if (buf[start] == ']' && 0 == stack) + return(1); + else if (buf[start] == ']') + stack--; + return(0); +} + +/* + * If we're in the SYNOPSIS section and we've encountered square brackets in an + * ordinary paragraph, then try to see whether we're an [-option]. + * Do this, if we're an opening bracket, by first seeing if we have a + * matching end via hasmatch(). + * If we're an ending bracket, see if we have a stack already. Return 1 if we consumed the bracket and printed Oo or Oc, 0 if we left it untouched. + */ +static int +dosynopsisop(const char *buf, int *last, + size_t *start, size_t end, size_t *opstack) +{ + + assert('[' == buf[*start] || ']' == buf[*start]); + + if ('[' == buf[*start] && hasmatch(buf, *start + 1, end)) { + if ('\n' != *last) + putchar('\n'); + puts(".Oo"); + (*opstack)++; + } else if ('[' == buf[*start]) + return(0); + + if (']' == buf[*start] && *opstack > 0) { + if ('\n' != *last) + putchar('\n'); + puts(".Oc"); + (*opstack)--; + } else if (']' == buf[*start]) + return(0); + + (*start)++; + *last = '\n'; + while (*start < end && ' ' == buf[*start]) + (*start)++; + return(1); +} + +/* + * Format multiple "Nm" manpage names in the NAME section. + */ +static void +donamenm(struct state *st, const char *buf, size_t *start, size_t end) +{ + size_t word; + + while (*start < end && ' ' == buf[*start]) + (*start)++; + + if (end == *start) { + puts(".Nm unknown"); + return; + } + + while (*start < end) { + fputs(".Nm ", stdout); + for (word = *start; word < end; word++) + if (',' == buf[word]) + break; + formatcodeln(st, buf, start, word, 1); + if (*start == end) { + putchar('\n'); + continue; + } + assert(',' == buf[*start]); + puts(" ,"); + (*start)++; + while (*start < end && ' ' == buf[*start]) + (*start)++; + } +} + +/* * Ordinary paragraph. * Well, this is really the hardest--POD seems to assume that, for * example, a leading space implies a newline, and so on. 
@@ -651,7 +971,8 @@ verbatim(struct state *st, const char *buf, size_t sta static void ordinary(struct state *st, const char *buf, size_t start, size_t end) { - size_t i, j; + size_t i, j, opstack; + int seq; if ( ! st->parsing || st->paused) return; @@ -662,9 +983,9 @@ ordinary(struct state *st, const char *buf, size_t sta * we're in "name - description" format. * To wit, print out a "Nm" and "Nd" in that format. */ - if (st->isname) { - for (i = end - 1; i > start; i--) - if ('-' == buf[i]) + if (SECT_NAME == st->sect) { + for (i = end - 2; i > start; i--) + if ('-' == buf[i] && ' ' == buf[i + 1]) break; if ('-' == buf[i]) { j = i; @@ -672,12 +993,12 @@ ordinary(struct state *st, const char *buf, size_t sta for ( ; i > start; i--) if ('-' != buf[i]) break; - printf(".Nm "); - formatcodeln(buf, &start, i + 1, 1); - putchar('\n'); + donamenm(st, buf, &start, i + 1); start = j + 1; - printf(".Nd "); - formatcodeln(buf, &start, end, 1); + while (start < end && ' ' == buf[start]) + start++; + fputs(".Nd ", stdout); + formatcodeln(st, buf, &start, end, 1); putchar('\n'); return; } @@ -688,8 +1009,9 @@ ordinary(struct state *st, const char *buf, size_t sta st->haspar = 0; last = '\n'; + opstack = 0; - while (start < end) { + for (seq = 0; start < end; seq++) { /* * Loop til we get either to a newline or escape. * Escape initial control characters. @@ -703,7 +1025,20 @@ ordinary(struct state *st, const char *buf, size_t sta printf("\\&"); else if ('\n' == last && '\'' == buf[start]) printf("\\&"); + /* + * If we're in the SYNOPSIS, have square + * brackets indicate that we're opening and + * closing an optional context. 
+ */ + if (SECT_SYNOPSIS == st->sect && + ('[' == buf[start] || + ']' == buf[start]) && + dosynopsisop(buf, &last, + &start, end, &opstack)) + continue; putchar(last = buf[start++]); + if ('\\' == last) + putchar('e'); } if (start < end - 1 && '<' == buf[start + 1]) { @@ -713,8 +1048,21 @@ ordinary(struct state *st, const char *buf, size_t sta * what, so print a newline now. * Then print the (possibly nested) macros and * following that, a newline. + * Consume all whitespace so we don't + * accidentally start an implicit literal line. + * If the macro ends with a flush comma or + * period, let mdoc(7) handle it for us. */ - if (formatcode(buf, &start, end, 0, 0)) { + if (formatcode(st, buf, &start, end, 0, 0, seq)) { + if ((start == end - 1 || + (start < end - 1 && + (' ' == buf[start + 1] || + '\n' == buf[start + 1]))) && + ('.' == buf[start] || + ',' == buf[start])) { + putchar(' '); + putchar(buf[start++]); + } putchar(last = '\n'); while (start < end && ' ' == buf[start]) start++; @@ -743,17 +1091,7 @@ ordinary(struct state *st, const char *buf, size_t sta for ( ; start < end; start++) if (' ' != buf[start] && '\t' != buf[start]) break; - } else if (start < end) { - /* - * Default: print the character. - * Escape initial control characters. - */ - if ('\n' == last && '.' == buf[start]) - printf("\\&"); - else if ('\n' == last && '\'' == buf[start]) - printf("\\&"); - putchar(last = buf[start++]); - } + } } if (last != '\n') @@ -817,7 +1155,7 @@ dofile(const struct args *args, const char *fname, if (NULL != (cp = strrchr(title, '.'))) { *cp++ = '\0'; if (0 == strcmp(cp, "pm")) - section = "3p"; + section = PERL_SECTION; } }