=================================================================== RCS file: /cvs/pod2mdoc/pod2mdoc.c,v retrieving revision 1.5 retrieving revision 1.53 diff -u -p -r1.5 -r1.53 --- pod2mdoc/pod2mdoc.c 2014/03/23 13:00:24 1.5 +++ pod2mdoc/pod2mdoc.c 2015/02/19 13:50:45 1.53 @@ -1,6 +1,7 @@ -/* $Id: pod2mdoc.c,v 1.5 2014/03/23 13:00:24 kristaps Exp $ */ +/* $Id: pod2mdoc.c,v 1.53 2015/02/19 13:50:45 schwarze Exp $ */ /* * Copyright (c) 2014 Kristaps Dzonsons + * Copyright (c) 2014, 2015 Ingo Schwarze * * Permission to use, copy, modify, and distribute this software for any * purpose with or without fee is hereby granted, provided that the above @@ -26,6 +27,15 @@ #include #include +#include "dict.h" + +/* + * In what section can we find Perl module manuals? + * Sometimes (Mac OS X) it's 3pm, sometimes (OpenBSD, etc.) 3p. + * XXX IF YOU CHANGE THIS, CHANGE POD2MDOC.1 AS WELL. + */ +#define PERL_SECTION "3p" + struct args { const char *title; /* override "Dt" title */ const char *date; /* override "Dd" date */ @@ -39,15 +49,32 @@ enum list { LIST__MAX }; +enum sect { + SECT_NONE = 0, + SECT_NAME, /* NAME section */ + SECT_SYNOPSIS, /* SYNOPSIS section */ +}; + +enum outstate { + OUST_NL = 0, /* just started a new output line */ + OUST_TXT, /* text line output in progress */ + OUST_MAC /* macro line output in progress */ +}; + struct state { + const char *fname; /* file being parsed */ int parsing; /* after =cut of before command */ int paused; /* in =begin and before =end */ - int haspar; /* in paragraph: do we need Pp? */ - int isname; /* are we the NAME section? */ - const char *fname; /* file being parsed */ + enum sect sect; /* which section are we in? */ #define LIST_STACKSZ 128 enum list lstack[LIST_STACKSZ]; /* open lists */ size_t lpos; /* where in list stack */ + int haspar; /* in paragraph: do we need Pp? */ + enum outstate oust; /* state of the mdoc output stream */ + int wantws; /* let mdoc(7) output whitespace here */ + char *outbuf; /* text buffered for output */ + size_t outbufsz; /* allocated size of outbuf */ + size_t outbuflen; /* current length of outbuf */ }; enum fmt { @@ -108,13 +135,88 @@ static const char fmts[FMT__MAX] = { 'Z' /* FMT_NULL */ }; +static unsigned char last; + + +static void +outbuf_grow(struct state *st, size_t by) +{ + + st->outbufsz += (by / 128 + 1) * 128; + st->outbuf = realloc(st->outbuf, st->outbufsz); + if (NULL == st->outbuf) { + perror(NULL); + exit(EXIT_FAILURE); + } +} + +static void +outbuf_addchar(struct state *st) +{ + + if (st->outbuflen + 2 >= st->outbufsz) + outbuf_grow(st, 1); + st->outbuf[st->outbuflen++] = last; + if ('\\' == last) + st->outbuf[st->outbuflen++] = 'e'; + st->outbuf[st->outbuflen] = '\0'; +} + +static void +outbuf_addstr(struct state *st, const char *str) +{ + size_t slen; + + slen = strlen(str); + if (st->outbuflen + slen >= st->outbufsz) + outbuf_grow(st, slen); + memcpy(st->outbuf + st->outbuflen, str, slen+1); + st->outbuflen += slen; + last = str[slen - 1]; +} + +static void +outbuf_flush(struct state *st) +{ + + if (0 == st->outbuflen) + return; + + if (OUST_TXT == st->oust && st->wantws) + putchar(' '); + + fputs(st->outbuf, stdout); + *st->outbuf = '\0'; + st->outbuflen = 0; + + if (OUST_NL == st->oust) + st->oust = OUST_TXT; +} + +static void +mdoc_newln(struct state *st) +{ + + if (OUST_NL == st->oust) + return; + + putchar('\n'); + last = '\n'; + st->oust = OUST_NL; + st->wantws = 1; +} + /* * Given buf[*start] is at the start of an escape name, read til the end * of the escape ('>') then try to do something with it. * Sets start to be one after the '>'. + * + * This function does not care about output modes, + * it merely appends text to the output buffer, + * which can then be used in any mode. */ static void -formatescape(const char *buf, size_t *start, size_t end) +formatescape(struct state *st, const char *buf, size_t *start, size_t end) { char esc[16]; /* no more needed */ size_t i, max; @@ -141,51 +243,257 @@ formatescape(const char *buf, size_t *start, size_t en * TODO: right now, we only recognise the named escapes. * Just let the rest of them go. */ - if (0 == strcmp(esc, "lt")) - printf("\\(la"); + if (0 == strcmp(esc, "lt")) + outbuf_addstr(st, "\\(la"); else if (0 == strcmp(esc, "gt")) - printf("\\(ra"); - else if (0 == strcmp(esc, "vb")) - printf("\\(ba"); + outbuf_addstr(st, "\\(ra"); + else if (0 == strcmp(esc, "verbar")) + outbuf_addstr(st, "\\(ba"); else if (0 == strcmp(esc, "sol")) - printf("\\(sl"); + outbuf_addstr(st, "\\(sl"); } /* - * Skip space characters. + * Run some heuristics to intuit a link format. + * I set "start" to be the end of the sequence (last right-carrot) so + * that the caller can safely just continue processing. + * If this is just an empty tag, I'll return 0. + * + * Always operates in OUST_MAC mode. + * Mode handling is done by the caller. */ static int -skipspace(const char *buf, size_t *start, size_t end) +trylink(const char *buf, size_t *start, size_t end, size_t dsz) { - size_t sv = *start; + size_t linkstart, realend, linkend, + i, j, textsz, stack; - while (*start < end && ' ' == buf[*start]) - (*start)++; + /* + * Scan to the start of the terminus. + * This function is more or less replicated in the formatcode() + * for null or index formatting codes. + * However, we're slightly different because we might have + * nested escapes we need to ignore. + */ + stack = 0; + for (linkstart = realend = *start; realend < end; realend++) { + if ('<' == buf[realend]) + stack++; + if ('>' != buf[realend]) + continue; + else if (stack-- > 0) + continue; + if (dsz == 1) + break; + assert(realend > 0); + if (' ' != buf[realend - 1]) + continue; + for (i = realend, j = 0; i < end && j < dsz; j++) + if ('>' != buf[i++]) + break; + if (dsz == j) + break; + } - return(*start > sv); + /* Ignore stubs. */ + if (realend == end || realend == *start) + return(0); + + /* Set linkend to the end of content. */ + linkend = dsz > 1 ? realend - 1 : realend; + + /* Re-scan to see if we have a title or section. */ + for (textsz = *start; textsz < linkend; textsz++) + if ('|' == buf[textsz] || '/' == buf[textsz]) + break; + + if (textsz < linkend && '|' == buf[textsz]) { + /* With title: set start, then end at section. */ + linkstart = textsz + 1; + textsz = textsz - *start; + for (i = linkstart; i < linkend; i++) + if ('/' == buf[i]) + break; + if (i < linkend) + linkend = i; + } else if (textsz < linkend && '/' == buf[textsz]) { + /* With section: set end at section. */ + linkend = textsz; + textsz = 0; + } else + /* No title, no section. */ + textsz = 0; + + *start = realend; + j = linkend - linkstart; + + /* Do we have only subsection material? */ + if (0 == j && '/' == buf[linkend]) { + linkstart = linkend + 1; + linkend = dsz > 1 ? realend - 1 : realend; + if (0 == (j = linkend - linkstart)) + return(0); + printf("Sx %.*s", (int)j, &buf[linkstart]); + return(1); + } else if (0 == j) + return(0); + + /* See if we qualify as being a link or not. */ + if ((j > 4 && 0 == memcmp("http:", &buf[linkstart], j)) || + (j > 5 && 0 == memcmp("https:", &buf[linkstart], j)) || + (j > 3 && 0 == memcmp("ftp:", &buf[linkstart], j)) || + (j > 4 && 0 == memcmp("sftp:", &buf[linkstart], j)) || + (j > 3 && 0 == memcmp("smb:", &buf[linkstart], j)) || + (j > 3 && 0 == memcmp("afs:", &buf[linkstart], j))) { + /* Gross. */ + printf("Lk %.*s", (int)((dsz > 1 ? realend - 1 : + realend) - linkstart), &buf[linkstart]); + return(1); + } + + /* See if we qualify as a mailto. */ + if (j > 6 && 0 == memcmp("mailto:", &buf[linkstart], j)) { + printf("Mt %.*s", (int)j, &buf[linkstart]); + return(1); + } + + /* See if we're a foo(5), foo(5x), or foo(5xx) manpage. */ + if ((j > 3 && ')' == buf[linkend - 1]) && + ('(' == buf[linkend - 3])) { + printf("Xr %.*s %c", (int)(j - 3), + &buf[linkstart], buf[linkend - 2]); + return(1); + } else if ((j > 4 && ')' == buf[linkend - 1]) && + ('(' == buf[linkend - 4])) { + printf("Xr %.*s %.*s", (int)(j - 4), + &buf[linkstart], 2, &buf[linkend - 3]); + return(1); + } else if ((j > 5 && ')' == buf[linkend - 1]) && + ('(' == buf[linkend - 5])) { + printf("Xr %.*s %.*s", (int)(j - 5), + &buf[linkstart], 3, &buf[linkend - 4]); + return(1); + } + + /* Last try: do we have a double-colon? */ + for (i = linkstart + 1; i < linkend; i++) + if (':' == buf[i] && ':' == buf[i - 1]) + break; + + if (i < linkend) + printf("Xr %.*s " PERL_SECTION, + (int)j, &buf[linkstart]); + else + printf("Xr %.*s 1", (int)j, &buf[linkstart]); + + return(1); } /* + * Doclifting: if we're a bold "-xx" and we're in the SYNOPSIS section, + * then it's likely that we're a flag. + * Our flag might be followed by an argument, so make sure that we're + * accounting for that, too. + * If we don't have a flag at all, however, then assume we're an "Ar". + * + * Always operates in OUST_MAC mode. + * Mode handlinf is done by the caller. + */ +static void +dosynopsisfl(const char *buf, size_t *start, size_t end) +{ + size_t i; +again: + assert(*start + 1 < end); + assert('-' == buf[*start]); + + if ( ! isalnum((int)buf[*start + 1]) && + '?' != buf[*start + 1] && + '-' != buf[*start + 1]) { + (*start)--; + fputs("Ar ", stdout); + return; + } + + (*start)++; + for (i = *start; i < end; i++) + if (isalnum((int)buf[i])) + continue; + else if ('?' == buf[i]) + continue; + else if ('-' == buf[i]) + continue; + else if ('_' == buf[i]) + continue; + else + break; + + assert(i < end); + + if ( ! (' ' == buf[i] || '>' == buf[i])) { + printf("Ar "); + return; + } + + printf("Fl "); + if (end - *start > 1 && + isupper((int)buf[*start]) && + islower((int)buf[*start + 1]) && + (end - *start == 2 || + ' ' == buf[*start + 2])) + printf("\\&"); + printf("%.*s ", (int)(i - *start), &buf[*start]); + *start = i; + + if (' ' == buf[i]) { + while (i < end && ' ' == buf[i]) + i++; + assert(i < end); + if ('-' == buf[i]) { + *start = i; + goto again; + } + printf("Ar "); + *start = i; + } +} + +/* * We're at the character in front of a format code, which is structured * like X<...> and can contain nested format codes. * This consumes the whole format code, and any nested format codes, til * the end of matched production. - * If "reentrant", then we're being called after a macro has already - * been printed to the current line. - * "last" is set to the last read character: this is used to determine - * whether we should buffer with space or not. - * If "nomacro", then we don't print any macros, just contained data. + * If "nomacro", then we don't print any macros, just contained data + * (e.g., following "Sh" or "Nm"). + * "pos" is only significant in SYNOPSIS, and should be 0 when invoked + * as the first format code on a line (for decoration as an "Nm"), + * non-zero otherwise. + * + * Output mode handling is most complicated here. + * We may enter in any mode. + * We usually exit in OUST_MAC mode, except when + * entering without OUST_MAC and the code is invalid. */ static int -formatcode(const char *buf, size_t *start, - size_t end, int reentrant, int last, int nomacro) +formatcode(struct state *st, const char *buf, size_t *start, + size_t end, int nomacro, int pos) { - enum fmt fmt; size_t i, j, dsz; + enum fmt fmt; + int wantws; + unsigned char uc; assert(*start + 1 < end); assert('<' == buf[*start + 1]); + /* + * First, look up the format code. + * If it's not valid, treat it as a NOOP. + */ + for (fmt = 0; fmt < FMT__MAX; fmt++) + if (buf[*start] == fmts[fmt]) + break; + /* * Determine whether we're overriding our delimiter. * According to POD, if we have more than one '<' followed by a @@ -201,33 +509,21 @@ formatcode(const char *buf, size_t *start, if (dsz > 1 && (i >= end || ' ' != buf[i])) dsz = 1; - for (fmt = 0; fmt < FMT__MAX; fmt++) - if (buf[*start] == fmts[fmt]) - break; - - /* Invalid macros are just regular text. */ - - if (FMT__MAX == fmt) { - putchar(buf[*start]); - (*start)++; - return(0); - } - /* Remember, if dsz>1, to jump the trailing space. */ *start += dsz + 1 + (dsz > 1 ? 1 : 0); /* - * Escapes don't print macro sequences, so just output them like - * normal text before processing for macros. + * Escapes and ignored codes (NULL and INDEX) don't print macro + * sequences, so just output them like normal text before + * processing for real macros. */ if (FMT_ESCAPE == fmt) { - formatescape(buf, start, end); + formatescape(st, buf, start, end); return(0); } else if (FMT_NULL == fmt || FMT_INDEX == fmt) { /* - * For indices and nulls, just consume. - * Be wary of encountering custom delimiters (dsz>1), - * which require special handling. + * Just consume til the end delimiter, accounting for + * whether it's a custom one. */ for ( ; *start < end; (*start)++) { if ('>' != buf[*start]) @@ -246,57 +542,128 @@ formatcode(const char *buf, size_t *start, (*start) += dsz; break; } + if (*start < end) { + assert('>' == buf[*start]); + (*start)++; + } + if (isspace(last)) + while (*start < end && isspace((int)buf[*start])) + (*start)++; return(0); } - if ( ! nomacro) { + /* + * Check whether we're supposed to print macro stuff (this is + * suppressed in, e.g., "Nm" and "Sh" macros). + */ + if (FMT__MAX != fmt && !nomacro) { + /* - * Print out the macro describing this format code. - * If we're not "reentrant" (not yet on a macro line) - * then print a newline, if necessary, and the macro - * indicator. - * Otherwise, offset us with a space. + * We may already have wantws if there was whitespace + * before the code ("text Boust ? st->wantws : ! st->outbuflen); + + /* + * If we are on a text line and there is no + * whitespace before our content, we have to make + * the previous word a prefix to the macro line. + * In the following, mdoc_newln() must not be used + * lest we clobber out output state. + */ + + if (OUST_MAC != st->oust && ! wantws) { + if (OUST_NL != st->oust) + putchar('\n'); + printf(".Pf "); + st->wantws = 0; + } + + outbuf_flush(st); + + /* Whitespace is easier to suppress on macro lines. */ + + if (OUST_MAC == st->oust && ! wantws) + printf(" Ns "); + + /* Unless we are on a macro line, start one. */ + + if (OUST_MAC != st->oust && wantws) { + if (OUST_NL != st->oust) + putchar('\n'); putchar('.'); - else + } else putchar(' '); - + /* - * If we don't have whitespace before us, then suppress - * macro whitespace with Ns. + * Print the macro corresponding to this format code, + * and update the output state afterwards. */ - if (' ' != last) - printf("Ns "); + switch (fmt) { case (FMT_ITALIC): printf("Em "); break; case (FMT_BOLD): - printf("Sy "); + if (SECT_SYNOPSIS == st->sect) { + if (1 == dsz && '-' == buf[*start]) + dosynopsisfl(buf, start, end); + else if (0 == pos) + printf("Nm "); + else + printf("Ar "); + break; + } + i = 0; + uc = buf[*start]; + while (isalnum(uc) || '_' == uc || ' ' == uc) + uc = buf[*start + ++i]; + if ('=' != uc && '>' != uc) + i = 0; + if (4 == i && ! strncmp(buf + *start, "NULL", 4)) { + printf("Dv "); + break; + } + switch (i ? dict_get(buf + *start, i) : MDOC_MAX) { + case MDOC_Fa: + printf("Fa "); + break; + case MDOC_Vt: + printf("Vt "); + break; + default: + printf("Sy "); + break; + } break; case (FMT_CODE): printf("Qo Li "); break; case (FMT_LINK): - printf("Lk "); + /* Try to link; use "No" if it's empty. */ + if ( ! trylink(buf, start, end, dsz)) + printf("No "); break; case (FMT_FILE): printf("Pa "); break; case (FMT_NBSP): - /* TODO. */ printf("No "); break; default: abort(); } - } + st->oust = OUST_MAC; + st->wantws = 1; + } else + outbuf_flush(st); /* - * Read until we reach the end market (e.g., '>') or until we + * Process until we reach the end marker (e.g., '>') or until we * find a nested format code. * Don't emit any newlines: since we're on a macro line, we * don't want to break the line. @@ -322,81 +689,114 @@ formatcode(const char *buf, size_t *start, break; } } - if (*start + 1 < end && '<' == buf[*start + 1]) { - formatcode(buf, start, end, 1, last, nomacro); + if (*start + 1 < end && '<' == buf[*start + 1] && + 'A' <= buf[*start] && 'Z' >= buf[*start]) { + if ( ! formatcode(st, buf, start, end, nomacro, 1)) + st->wantws = 1; continue; } - /* - * Make sure that any macro-like words (or - * really any word starting with a capital - * letter) is assumed to be a macro that must be - * escaped. - * This matches "Xx " and "XxEOLN". - */ - if ((' ' == last || '\n' == last) && - end - *start > 1 && - isupper((int)buf[*start]) && - islower((int)buf[*start + 1]) && - (end - *start == 2 || - ' ' == buf[*start + 2])) - printf("\\&"); + /* Suppress newlines and multiple spaces. */ - /* Suppress newline. */ - if ('\n' == (last = buf[(*start)++])) - last = ' '; + last = buf[(*start)++]; + if (' ' == last || '\n' == last) { + putchar(' '); + while (*start < end && ' ' == buf[*start]) + (*start)++; + continue; + } - putchar(last); - } + if (OUST_MAC == st->oust && FMT__MAX != fmt) { + if ( ! st->wantws) { + printf(" Ns "); + st->wantws = 1; + } - if ( ! nomacro && FMT_CODE == fmt) - printf(" Qc "); + /* + * Escape macro-like words. + * This matches "Xx " and "XxEOLN". + */ - if (reentrant) - return(1); + if (end - *start > 0 && + isupper((unsigned char)last) && + islower((unsigned char)buf[*start]) && + (end - *start == 1 || + ' ' == buf[*start + 1] || + '>' == buf[*start + 1])) + printf("\\&"); + } - /* FIXME: with the "Qc", this doens't work good. */ + putchar(last); - /* - * If we're not reentrant, we want to put ending punctuation on - * the macro line so that it's properly handled by being - * smooshed against the terminal word. - */ - skipspace(buf, start, end); + /* Protect against character escapes. */ - if (',' != buf[*start] && '.' != buf[*start] && - '!' != buf[*start] && '?' != buf[*start] && - ')' != buf[*start]) - return(1); - while (*start < end) { - if (',' != buf[*start] && - '.' != buf[*start] && - '!' != buf[*start] && - '?' != buf[*start] && - ')' != buf[*start]) - break; - putchar(' '); - putchar(buf[*start]); - (*start)++; + if ('\\' == last) + putchar('e'); } - skipspace(buf, start, end); - return(1); + + if ( ! nomacro && FMT_CODE == fmt) + printf(" Qc "); + + st->wantws = ' ' == last; + return(FMT__MAX != fmt); } /* * Calls formatcode() til the end of a paragraph. + * Goes to OUST_MAC mode and stays there when returning, + * such that the caller can add arguments to the macro line + * before closing it out. */ static void -formatcodeln(const char *buf, size_t *start, size_t end, int nomacro) +formatcodeln(struct state *st, const char *linemac, + const char *buf, size_t *start, size_t end, int nomacro) { - int last; + int gotmacro, wantws; - last = ' '; + assert(OUST_NL == st->oust); + assert(st->wantws); + printf(".%s ", linemac); + st->oust = OUST_MAC; + + gotmacro = 0; while (*start < end) { - if (*start + 1 < end && '<' == buf[*start + 1]) { - formatcode(buf, start, end, 1, last, nomacro); + wantws = ' ' == buf[*start] || '\n' == buf[*start]; + if (wantws) { + last = ' '; + do { + (*start)++; + } while (*start < end && ' ' == buf[*start]); + } + + if (*start + 1 < end && '<' == buf[*start + 1] && + 'A' <= buf[*start] && 'Z' >= buf[*start]) { + st->wantws |= wantws; + gotmacro = formatcode(st, buf, + start, end, nomacro, 1); continue; } + + if (gotmacro) { + if (*start < end || st->outbuflen) { + if (st->wantws || + (wantws && !st->outbuflen)) + printf(" No "); + else + printf(" Ns "); + } + gotmacro = 0; + } + outbuf_flush(st); + st->wantws = wantws; + + if (*start >= end) + break; + + if (st->wantws) { + putchar(' '); + st->wantws = 0; + } + /* * Since we're already on a macro line, we want to make * sure that we don't inadvertently invoke a macro. @@ -405,16 +805,18 @@ formatcodeln(const char *buf, size_t *start, size_t en * something that needn't be escaped. */ if (' ' == last && end - *start > 1 && - isupper((int)buf[*start]) && - islower((int)buf[*start + 1]) && - (end - *start == 2 || - ' ' == buf[*start + 2])) + isupper((unsigned char)buf[*start]) && + islower((unsigned char)buf[*start + 1]) && + (end - *start == 2 || ' ' == buf[*start + 2])) printf("\\&"); - if ('\n' != buf[*start]) - putchar(last = buf[*start]); - else - putchar(last = ' '); + putchar(last = buf[*start]); + + /* Protect against character escapes. */ + + if ('\\' == last) + putchar('e'); + (*start)++; } } @@ -445,6 +847,9 @@ listguess(const char *buf, size_t start, size_t end) * A command paragraph, as noted in the perlpod manual, just indicates * that we should do something, optionally with some text to print as * well. + * From the perspective of external callers, + * always stays in OUST_NL/wantws mode, + * but its children do use OUST_MAC. */ static void command(struct state *st, const char *buf, size_t start, size_t end) @@ -470,7 +875,9 @@ command(struct state *st, const char *buf, size_t star return; start += csz; - skipspace(buf, &start, end); + while (start < end && ' ' == buf[start]) + start++; + len = end - start; if (st->paused) { @@ -486,34 +893,34 @@ command(struct state *st, const char *buf, size_t star * The behaviour of head= follows from a quick glance at * how pod2man handles it. */ - printf(".Sh "); - st->isname = 0; - if (end - start == 4) + st->sect = SECT_NONE; + if (end - start == 4) { if (0 == memcmp(&buf[start], "NAME", 4)) - st->isname = 1; - formatcodeln(buf, &start, end, 1); - putchar('\n'); + st->sect = SECT_NAME; + } else if (end - start == 8) { + if (0 == memcmp(&buf[start], "SYNOPSIS", 8)) + st->sect = SECT_SYNOPSIS; + } + formatcodeln(st, "Sh", buf, &start, end, 1); + mdoc_newln(st); st->haspar = 1; break; case (CMD_HEAD2): - printf(".Ss "); - formatcodeln(buf, &start, end, 1); - putchar('\n'); + formatcodeln(st, "Ss", buf, &start, end, 1); + mdoc_newln(st); st->haspar = 1; break; case (CMD_HEAD3): puts(".Pp"); - printf(".Em "); - formatcodeln(buf, &start, end, 0); - putchar('\n'); + formatcodeln(st, "Em", buf, &start, end, 0); + mdoc_newln(st); puts(".Pp"); st->haspar = 1; break; case (CMD_HEAD4): puts(".Pp"); - printf(".No "); - formatcodeln(buf, &start, end, 0); - putchar('\n'); + formatcodeln(st, "No", buf, &start, end, 0); + mdoc_newln(st); puts(".Pp"); st->haspar = 1; break; @@ -534,6 +941,14 @@ command(struct state *st, const char *buf, size_t star st->lstack[st->lpos - 1] = LIST__MAX; break; case (CMD_ITEM): + if (0 == st->lpos) { + /* + * Bad markup. + * Try to compensate. + */ + st->lstack[st->lpos] = LIST__MAX; + st->lpos++; + } assert(st->lpos > 0); /* * If we're the first =item, guess at what our content @@ -557,9 +972,8 @@ command(struct state *st, const char *buf, size_t star } switch (st->lstack[st->lpos - 1]) { case (LIST_TAG): - printf(".It "); - formatcodeln(buf, &start, end, 0); - putchar('\n'); + formatcodeln(st, "It", buf, &start, end, 0); + mdoc_newln(st); break; case (LIST_ENUM): /* FALLTHROUGH */ @@ -610,33 +1024,370 @@ command(struct state *st, const char *buf, size_t star } /* + * Put the type provided as an argument into the dictionary. + */ +static void +register_type(const char *ptype) +{ + const char *pname, *pend; + + pname = ptype; + while (isalnum((unsigned char)*pname) || '_' == *pname) + pname++; + if ((pname - ptype == 6 && ! strncmp(ptype, "struct", 6)) || + (pname - ptype == 4 && ! strncmp(ptype, "enum", 4))) { + while (' ' == *pname) + pname++; + pend = pname; + while (isalnum((unsigned char)*pend) || '_' == *pend) + pend++; + if (pend > pname) + dict_put(pname, pend - pname, MDOC_Vt); + } else + pend = pname; + if (pend > ptype) + dict_put(ptype, pend - ptype, MDOC_Vt); +} + +/* * Just pump out the line in a verbatim block. + * From the perspective of external callers, + * always stays in OUST_NL/wantws mode. */ static void -verbatim(struct state *st, const char *buf, size_t start, size_t end) +verbatim(struct state *st, char *buf, size_t start, size_t end) { + size_t i, ift, ifo, ifa, ifc, inl; + char *cp, *cp2; + int indisplay, nopen, wantsp; - if ( ! st->parsing || st->paused) + if (st->paused || ! st->parsing) return; - puts(".Bd -literal"); - printf("%.*s\n", (int)(end - start), &buf[start]); - puts(".Ed"); + indisplay = wantsp = 0; + +again: + if (start == end) { + if (indisplay) + puts(".Ed"); + return; + } + + if ('\n' == buf[start]) { + wantsp = 1; + start++; + goto again; + } + + /* + * If we're in the SYNOPSIS, see if we're an #include block. + * If we are, then print the "In" macro and re-loop. + * This handles any number of inclusions, but only when they + * come before the remaining parts... + */ + if (SECT_SYNOPSIS == st->sect) { + i = start; + while (i < end && buf[i] == ' ') + i++; + if (i == end) + goto again; + + /* We're an include block! */ + if (end - i > 10 && + 0 == memcmp(&buf[i], "#include <", 10)) { + start = i + 10; + while (start < end && ' ' == buf[start]) + start++; + if (indisplay) + puts(".Ed"); + indisplay = wantsp = 0; + fputs(".In ", stdout); + /* Stop til the '>' marker or we hit eoln. */ + while (start < end && + '>' != buf[start] && '\n' != buf[start]) + putchar(buf[start++]); + putchar('\n'); + if (start < end && '>' == buf[start]) + start++; + if (start < end && '\n' == buf[start]) + start++; + goto again; + } + + /* Other preprocessor directives. */ + if ('#' == buf[i]) { + if (indisplay) + puts(".Ed"); + indisplay = wantsp = 0; + fputs(".Fd ", stdout); + start = i; + while(start < end && '\n' != buf[start]) + putchar(buf[start++]); + putchar('\n'); + if (start < end && '\n' == buf[start]) + start++; + + /* Remember #define for Dv or Fn. */ + + if (strncmp(buf + i + 1, "define", 6) || + ! isspace((unsigned char)buf[i + 7])) + goto again; + + ifo = i + 7; + while (ifo < start && + isspace((unsigned char)buf[ifo])) + ifo++; + ifa = ifo; + while ('_' == buf[ifa] || + isalnum((unsigned char)buf[ifa])) + ifa++; + dict_put(buf + ifo, ifa - ifo, + '(' == buf[ifa] ? MDOC_Fo : MDOC_Dv); + + goto again; + } + + /* Parse function declaration. */ + ifo = ifa = ifc = 0; + inl = end; + nopen = 0; + for (ift = i; i < end; i++) { + if (ifc) { + if (buf[i] != '\n') + continue; + inl = i; + break; + } + switch (buf[i]) { + case '\t': + /* FALLTHROUGH */ + case ' ': + if ( ! ifa) + ifo = i; + break; + case '(': + if (ifo) { + nopen++; + if ( ! ifa) + ifa = i; + } else + i = end; + break; + case ')': + switch (nopen) { + case 0: + i = end; + break; + case 1: + ifc = i; + break; + default: + nopen--; + break; + } + break; + default: + break; + } + } + + /* Encode function declaration. */ + if (ifc) { + for (i = ifa; i < ifc; i++) + if (buf[i] == '\n') + buf[i] = ' '; + buf[ifo++] = '\0'; + register_type(buf + ift); + if (indisplay) + puts(".Ed"); + indisplay = wantsp = 0; + printf(".Ft %s", buf + ift); + if (buf[ifo] == '*') { + fputs(" *", stdout); + ifo++; + } + putchar('\n'); + buf[ifa++] = '\0'; + printf(".Fo %s\n", buf + ifo); + dict_put(buf + ifo, 0, MDOC_Fo); + buf[ifc++] = '\0'; + for (;;) { + cp = strchr(buf + ifa, ','); + if (cp != NULL) { + cp2 = cp; + *cp++ = '\0'; + } else + cp2 = strchr(buf + ifa, '\0'); + while (isalnum((unsigned char)cp2[-1]) || + '_' == cp2[-1]) + cp2--; + if ('\0' != *cp2) + dict_put(cp2, 0, MDOC_Fa); + register_type(buf + ifa); + if (strchr(buf + ifa, ' ') == NULL) + printf(".Fa %s\n", buf + ifa); + else + printf(".Fa \"%s\"\n", buf + ifa); + if (cp == NULL) + break; + while (*cp == ' ' || *cp == '\t') + cp++; + ifa = cp - buf; + } + puts(".Fc"); + if (buf[ifc] == ';') + ifc++; + if (ifc < inl) { + buf[inl] = '\0'; + puts(buf + ifc); + } + start = inl < end ? inl + 1 : end; + goto again; + } + } + + if ( ! indisplay) + puts(".Bd -literal"); + else if (wantsp) + putchar('\n'); + indisplay = 1; + wantsp = 0; + + for (last = '\n'; start < end; start++) { + /* + * Handle accidental macros (newline starting with + * control character) and escapes. + */ + if ('\n' == last) { + if ('\n' == buf[start]) + goto again; + if ('.' == buf[start] || '\'' == buf[start]) + printf("\\&"); + } + putchar(last = buf[start]); + if ('\\' == buf[start]) + printf("e"); + } + if ('\n' != last) + putchar('\n'); + if (indisplay) + puts(".Ed"); } /* + * See dosynopsisop(). + */ +static int +hasmatch(const char *buf, size_t start, size_t end) +{ + size_t stack; + + for (stack = 0; start < end; start++) + if (buf[start] == '[') + stack++; + else if (buf[start] == ']' && 0 == stack) + return(1); + else if (buf[start] == ']') + stack--; + return(0); +} + +/* + * If we're in the SYNOPSIS section and we've encounter braces in an + * ordinary paragraph, then try to see whether we're an [-option]. + * Do this, if we're an opening bracket, by first seeing if we have a + * matching end via hasmatch(). + * If we're an ending bracket, see if we have a stack already. + */ +static int +dosynopsisop(struct state *st, const char *buf, + size_t *start, size_t end, size_t *opstack) +{ + + assert('[' == buf[*start] || ']' == buf[*start]); + + if ('[' == buf[*start] && hasmatch(buf, *start + 1, end)) { + mdoc_newln(st); + puts(".Oo"); + (*opstack)++; + } else if ('[' == buf[*start]) + return(0); + + if (']' == buf[*start] && *opstack > 0) { + mdoc_newln(st); + puts(".Oc"); + (*opstack)--; + } else if (']' == buf[*start]) + return(0); + + (*start)++; + last = '\n'; + while (' ' == buf[*start]) + (*start)++; + return(1); +} + +/* + * Format multiple "Nm" manpage names in the NAME section. + * From the perspective of external callers, + * always stays in OUST_NL/wantws mode, + * but its children do use OUST_MAC. + */ +static void +donamenm(struct state *st, const char *buf, size_t *start, size_t end) +{ + size_t word; + + assert(OUST_NL == st->oust); + assert(st->wantws); + + while (*start < end && isspace((unsigned char)buf[*start])) + (*start)++; + + if (end == *start) { + puts(".Nm unknown"); + return; + } + + while (*start < end) { + for (word = *start; word < end; word++) + if (',' == buf[word]) + break; + formatcodeln(st, "Nm", buf, start, word, 1); + if (*start == end) { + mdoc_newln(st); + break; + } + assert(',' == buf[*start]); + printf(" ,"); + mdoc_newln(st); + (*start)++; + while (*start < end && isspace((unsigned char)buf[*start])) + (*start)++; + } +} + +/* * Ordinary paragraph. * Well, this is really the hardest--POD seems to assume that, for * example, a leading space implies a newline, and so on. * Lots of other snakes in the grass: escaping a newline followed by a * period (accidental mdoc(7) control), double-newlines after macro * passages, etc. + * + * Uses formatcode() to go to OUST_MAC mode + * and outbuf_flush() to go to OUST_TXT mode. + * In text mode, wantws requests white space before the text + * currently contained in the outbuf, not before upcoming text. + * Must make sure to go back to OUST_NL/wantws mode before returning. */ static void ordinary(struct state *st, const char *buf, size_t start, size_t end) { - int last; - size_t i, j; + size_t i, j, opstack, wend; + enum mdoc_type mtype; + int eos, noeos, seq; + char savechar; if ( ! st->parsing || st->paused) return; @@ -647,9 +1398,10 @@ ordinary(struct state *st, const char *buf, size_t sta * we're in "name - description" format. * To wit, print out a "Nm" and "Nd" in that format. */ - if (st->isname) { - for (i = end - 1; i > start; i--) - if ('-' == buf[i]) + if (SECT_NAME == st->sect) { + for (i = end - 2; i > start; i--) + if ('-' == buf[i] && + isspace((unsigned char)buf[i + 1])) break; if ('-' == buf[i]) { j = i; @@ -657,13 +1409,13 @@ ordinary(struct state *st, const char *buf, size_t sta for ( ; i > start; i--) if ('-' != buf[i]) break; - printf(".Nm "); - formatcodeln(buf, &start, i + 1, 1); - putchar('\n'); + donamenm(st, buf, &start, i + 1); start = j + 1; - printf(".Nd "); - formatcodeln(buf, &start, end, 1); - putchar('\n'); + while (start < end && + isspace((unsigned char)buf[start])) + start++; + formatcodeln(st, "Nd", buf, &start, end, 1); + mdoc_newln(st); return; } } @@ -673,41 +1425,186 @@ ordinary(struct state *st, const char *buf, size_t sta st->haspar = 0; last = '\n'; + opstack = 0; - while (start < end) { + for (seq = 0; start < end; seq++) { /* * Loop til we get either to a newline or escape. * Escape initial control characters. */ while (start < end) { - if (start < end - 1 && '<' == buf[start + 1]) + if (start < end - 1 && '<' == buf[start + 1] && + 'A' <= buf[start] && 'Z' >= buf[start]) break; else if ('\n' == buf[start]) break; else if ('\n' == last && '.' == buf[start]) - printf("\\&"); + outbuf_addstr(st, "\\&"); else if ('\n' == last && '\'' == buf[start]) - printf("\\&"); - putchar(last = buf[start++]); - } + outbuf_addstr(st, "\\&"); + /* + * If we're in the SYNOPSIS, have square + * brackets indicate that we're opening and + * closing an optional context. + */ - if (start < end - 1 && '<' == buf[start + 1]) { + if (SECT_SYNOPSIS == st->sect && + ('[' == buf[start] || + ']' == buf[start]) && + dosynopsisop(st, buf, + &start, end, &opstack)) + continue; + + /* Merely buffer non-whitespace. */ + + last = buf[start++]; + if ( ! isspace(last)) + outbuf_addchar(st); + if (start < end && + ! isspace((unsigned char)buf[start - 1]) && + ! isspace((unsigned char)buf[start])) + continue; + /* - * We've encountered a format code. - * This is going to trigger a macro no matter - * what, so print a newline now. - * Then print the (possibly nested) macros and - * following that, a newline. + * Found the end of a word. + * Rewind trailing delimiters. */ - if (formatcode(buf, &start, end, 0, last, 0)) - putchar(last = '\n'); - } else if (start < end && '\n' == buf[start]) { + + eos = noeos = 0; + for (wend = st->outbuflen; wend; wend--) + if ('.' == st->outbuf[wend - 1] || + '!' == st->outbuf[wend - 1] || + '?' == st->outbuf[wend - 1]) + eos = 1; + else if ('|' == st->outbuf[wend - 1] || + ',' == st->outbuf[wend - 1] || + ';' == st->outbuf[wend - 1] || + ':' == st->outbuf[wend - 1]) + noeos = 1; + else if ('\'' != st->outbuf[wend - 1] && + '"' != st->outbuf[wend - 1] && + ')' != st->outbuf[wend - 1] && + ']' != st->outbuf[wend - 1]) + break; + eos &= ! noeos; + /* - * Print the newline only if we haven't already - * printed a newline. + * Detect function names. */ - if (last != '\n') - putchar(last = buf[start]); + + mtype = MDOC_Fa; + savechar = '\0'; + if (wend && ')' == st->outbuf[wend] && + '(' == st->outbuf[wend - 1]) { + mtype = dict_get(st->outbuf, --wend); + if (MDOC_Dv == mtype) + mtype = MDOC_Fo; + if (MDOC_Fo == mtype || MDOC_MAX == mtype) { + st->outbuflen = wend; + st->outbuf[wend] = '\0'; + mdoc_newln(st); + if (MDOC_Fo == mtype) + fputs(".Fn ", stdout); + else + fputs(".Xr ", stdout); + st->oust = OUST_MAC; + } + } else { + mtype = dict_get(st->outbuf, wend); + if (MDOC_Dv == mtype) { + savechar = st->outbuf[wend]; + st->outbuf[wend] = '\0'; + mdoc_newln(st); + fputs(".Dv ", stdout); + st->oust = OUST_MAC; + } else + mtype = MDOC_Fa; + } + + /* + * On whitespace, flush the output buffer + * and allow breaking to a macro line. + */ + + outbuf_flush(st); + + /* + * End macro lines, and + * end text lines at the end of sentences. + */ + + if (OUST_MAC == st->oust || (eos && wend > 1 && + islower((unsigned char)st->outbuf[wend - 1]))) { + if (MDOC_MAX == mtype) + fputs(" 3", stdout); + if (MDOC_Fa != mtype) { + if (MDOC_Dv == mtype) + st->outbuf[wend] = savechar; + else + wend += 2; + while ('\0' != st->outbuf[wend]) + printf(" %c", + st->outbuf[wend++]); + } + mdoc_newln(st); + } + + /* Advance to the next word. */ + + while ('\n' != buf[start] && + isspace((unsigned char)buf[start])) + start++; + st->wantws = 1; + } + + if (start < end - 1 && '<' == buf[start + 1] && + 'A' <= buf[start] && 'Z' >= buf[start]) { + formatcode(st, buf, &start, end, 0, seq); + if (OUST_MAC == st->oust) { + /* + * Let mdoc(7) handle trailing punctuation. + * XXX Some punctuation characters + * are not handled yet. + */ + if ((start == end - 1 || + (start < end - 1 && + (' ' == buf[start + 1] || + '\n' == buf[start + 1]))) && + NULL != strchr("|.,;:?!)]", buf[start])) { + putchar(' '); + putchar(buf[start++]); + } + + if (st->wantws || + ' ' == buf[start] || + '\n' == buf[start]) + mdoc_newln(st); + + /* + * Consume all whitespace + * so we don't accidentally start + * an implicit literal line. + */ + + while (start < end && ' ' == buf[start]) + start++; + + /* + * Some text is following. + * Implement requested spacing. + */ + + if ( ! st->wantws && start < end && + ('<' != buf[start + 1] || + 'A' > buf[start] || + 'Z' < buf[start])) { + printf(" Ns "); + st->wantws = 1; + } + } + } else if (start < end && '\n' == buf[start]) { + outbuf_flush(st); + mdoc_newln(st); if (++start >= end) continue; /* @@ -718,28 +1615,15 @@ ordinary(struct state *st, const char *buf, size_t sta * have a macro subsequent it, which may be * possible if we have an escape next. */ - if (' ' == buf[start] || '\t' == buf[start]) { + if (' ' == buf[start] || '\t' == buf[start]) puts(".br"); - last = '\n'; - } for ( ; start < end; start++) if (' ' != buf[start] && '\t' != buf[start]) break; - } else if (start < end) { - /* - * Default: print the character. - * Escape initial control characters. - */ - if ('\n' == last && '.' == buf[start]) - printf("\\&"); - else if ('\n' == last && '\'' == buf[start]) - printf("\\&"); - putchar(last = buf[start++]); - } + } } - - if (last != '\n') - putchar('\n'); + outbuf_flush(st); + mdoc_newln(st); } /* @@ -748,9 +1632,12 @@ ordinary(struct state *st, const char *buf, size_t sta * (default: starts with "="). */ static void -dopar(struct state *st, const char *buf, size_t start, size_t end) +dopar(struct state *st, char *buf, size_t start, size_t end) { + assert(OUST_NL == st->oust); + assert(st->wantws); + if (end == start) return; if (' ' == buf[start] || '\t' == buf[start]) @@ -767,26 +1654,43 @@ dopar(struct state *st, const char *buf, size_t start, */ static void dofile(const struct args *args, const char *fname, - const struct tm *tm, const char *buf, size_t sz) + const struct tm *tm, char *buf, size_t sz) { - size_t sup, end, i, cur = 0; - struct state st; - const char *section, *date; char datebuf[64]; + struct state st; + const char *fbase, *fext, *section, *date, *format; char *title, *cp; + size_t cur, end; + int verb; if (0 == sz) return; - /* Title is last path component of the filename. */ + /* + * Parsing the filename is almost always required, + * except when both the title and the section + * are provided on the command line. + */ - if (NULL != args->title) - title = strdup(args->title); - else if (NULL != (cp = strrchr(fname, '/'))) - title = strdup(cp + 1); - else - title = strdup(fname); - + if (NULL == args->title || NULL == args->section) { + fbase = strrchr(fname, '/'); + if (NULL == fbase) + fbase = fname; + else + fbase++; + fext = strrchr(fbase, '.'); + } else + fext = NULL; + + /* + * The title will be converted to uppercase, + * so it needs to be copied. + */ + + title = (NULL != args->title) ? strdup(args->title) : + (NULL != fext) ? strndup(fbase, fext - fbase) : + strdup(fbase); + if (NULL == title) { perror(NULL); exit(EXIT_FAILURE); @@ -794,19 +1698,18 @@ dofile(const struct args *args, const char *fname, /* Section is 1 unless suffix is "pm". */ - if (NULL == (section = args->section)) { - section = "1"; - if (NULL != (cp = strrchr(title, '.'))) { - *cp++ = '\0'; - if (0 == strcmp(cp, "pm")) - section = "3p"; - } - } + section = (NULL != args->section) ? args->section : + (NULL == fext || strcmp(fext + 1, "pm")) ? "1" : + PERL_SECTION; /* Date. Or the given "tm" if not supplied. */ - if (NULL == (date = args->date)) { - strftime(datebuf, sizeof(datebuf), "%B %d, %Y", tm); + date = args->date; + format = (NULL == date) ? "%B %d, %Y" : + strcmp(date, "Mdocdate") ? NULL : "$" "Mdocdate: %B %d %Y $"; + + if (NULL != format) { + strftime(datebuf, sizeof(datebuf), format, tm); date = datebuf; } @@ -821,30 +1724,43 @@ dofile(const struct args *args, const char *fname, free(title); + dict_init(); memset(&st, 0, sizeof(struct state)); + st.oust = OUST_NL; + st.wantws = 1; + assert(sz > 0); /* Main loop over file contents. */ - while (cur < sz) { + cur = 0; + for (;;) { + while (cur < sz && '\n' == buf[cur]) + cur++; + if (cur >= sz) + break; + + verb = isspace((unsigned char)buf[cur]); + /* Read until next paragraph. */ - for (i = cur + 1; i < sz; i++) - if ('\n' == buf[i] && '\n' == buf[i - 1]) { - /* Consume blank paragraphs. */ - while (i + 1 < sz && '\n' == buf[i + 1]) - i++; + + for (end = cur + 1; end + 1 < sz; end++) + if ('\n' == buf[end] && '\n' == buf[end + 1] && + !(verb && end + 2 < sz && + isspace((unsigned char)buf[end + 2]))) break; - } /* Adjust end marker for EOF. */ - end = i < sz ? i - 1 : - ('\n' == buf[sz - 1] ? sz - 1 : sz); - sup = i < sz ? end + 2 : sz; + if (end < sz && '\n' != buf[end]) + end++; + /* Process paragraph and adjust start. */ + dopar(&st, buf, cur, end); - cur = sup; + cur = end + 2; } + dict_destroy(); } /* @@ -864,8 +1780,6 @@ readfile(const struct args *args, const char *fname) time_t ttm; struct stat st; - assert(NULL != fname); - fd = 0 != strcmp("-", fname) ? open(fname, O_RDONLY, 0) : STDIN_FILENO; @@ -971,8 +1885,8 @@ main(int argc, char *argv[]) /* Accept only a single input file. */ - if (argc > 2) - return(EXIT_FAILURE); + if (argc > 1) + goto usage; else if (1 == argc) fname = *argv; @@ -981,7 +1895,7 @@ main(int argc, char *argv[]) usage: fprintf(stderr, "usage: %s [-d date] " - "[-n title] [-s section]\n", name); + "[-n title] [-s section] [file]\n", name); return(EXIT_FAILURE); }