=================================================================== RCS file: /cvs/pod2mdoc/pod2mdoc.c,v retrieving revision 1.5 retrieving revision 1.12 diff -u -p -r1.5 -r1.12 --- pod2mdoc/pod2mdoc.c 2014/03/23 13:00:24 1.5 +++ pod2mdoc/pod2mdoc.c 2014/04/01 11:58:32 1.12 @@ -1,4 +1,4 @@ -/* $Id: pod2mdoc.c,v 1.5 2014/03/23 13:00:24 kristaps Exp $ */ +/* $Id: pod2mdoc.c,v 1.12 2014/04/01 11:58:32 kristaps Exp $ */ /* * Copyright (c) 2014 Kristaps Dzonsons * @@ -26,6 +26,11 @@ #include #include +/* + * In what section can we find Perl manuals? + */ +#define PERL_SECTION "3p" + struct args { const char *title; /* override "Dt" title */ const char *date; /* override "Dd" date */ @@ -39,11 +44,17 @@ enum list { LIST__MAX }; +enum sect { + SECT_NONE = 0, + SECT_NAME, /* NAME section */ + SECT_SYNOPSIS, /* SYNOPSIS section */ +}; + struct state { int parsing; /* after =cut of before command */ int paused; /* in =begin and before =end */ int haspar; /* in paragraph: do we need Pp? */ - int isname; /* are we the NAME section? */ + enum sect sect; /* which section are we in? */ const char *fname; /* file being parsed */ #define LIST_STACKSZ 128 enum list lstack[LIST_STACKSZ]; /* open lists */ @@ -108,6 +119,8 @@ static const char fmts[FMT__MAX] = { 'Z' /* FMT_NULL */ }; +static int last; + /* * Given buf[*start] is at the start of an escape name, read til the end * of the escape ('>') then try to do something with it. @@ -141,7 +154,7 @@ formatescape(const char *buf, size_t *start, size_t en * TODO: right now, we only recognise the named escapes. * Just let the rest of them go. */ - if (0 == strcmp(esc, "lt")) + if (0 == strcmp(esc, "lt")) printf("\\(la"); else if (0 == strcmp(esc, "gt")) printf("\\(ra"); @@ -149,20 +162,81 @@ formatescape(const char *buf, size_t *start, size_t en printf("\\(ba"); else if (0 == strcmp(esc, "sol")) printf("\\(sl"); + else + return; + + last = 'a'; } /* - * Skip space characters. + * Run some heuristics to intuit a link format. + * I recognise L as a Perl manpage, printing it in section 3p; + * or a general UNIX foo(5) manpage. + * If I recognise one, I set "start" to be the end of the sequence so + * that the caller can safely just continue processing. + * Otherwise, I don't touch "start". */ static int -skipspace(const char *buf, size_t *start, size_t end) +trylink(const char *buf, size_t *start, size_t end, size_t dsz) { - size_t sv = *start; + size_t sv, nstart, nend, i, j; + int hasdouble; - while (*start < end && ' ' == buf[*start]) - (*start)++; + /* + * Scan to the start of the terminus. + * This function is more or less replicated in the formatcode() + * for null or index formatting codes. + */ + hasdouble = 0; + for (sv = nstart = *start; nstart < end; nstart++) { + /* Do we have a double-colon? */ + if (':' == buf[nstart] && + nstart > sv && + ':' == buf[nstart - 1]) + hasdouble = 1; + if ('>' != buf[nstart]) + continue; + else if (dsz == 1) + break; + assert(nstart > 0); + if (' ' != buf[nstart - 1]) + continue; + i = nstart; + for (j = 0; i < end && j < dsz; j++) + if ('>' != buf[i++]) + break; + if (dsz == j) + break; + } + + /* We don't care about stubs. */ + if (nstart == end || nstart == *start) + return(0); - return(*start > sv); + /* Set nend to the end of content. */ + nend = nstart; + if (dsz > 1) + nend--; + + /* + * Provide for some common invocations of the link primitive. + * First, allow us to link to other Perl manuals. + */ + if (hasdouble) + printf("Xr %.*s " PERL_SECTION, + (int)(nend - sv), &buf[sv]); + else if (nend - sv > 3 && isalnum(buf[sv]) && + ')' == buf[nend - 1] && + isdigit((int)buf[nend - 2]) && + '(' == buf[nend - 3]) + printf("Xr %.*s %c", + (int)(nend - 3 - sv), + &buf[sv], buf[nend - 2]); + else + return(0); + + *start = nstart; + return(1); } /* @@ -172,13 +246,15 @@ skipspace(const char *buf, size_t *start, size_t end) * the end of matched production. * If "reentrant", then we're being called after a macro has already * been printed to the current line. - * "last" is set to the last read character: this is used to determine - * whether we should buffer with space or not. - * If "nomacro", then we don't print any macros, just contained data. + * If "nomacro", then we don't print any macros, just contained data + * (e.g., following "Sh" or "Nm"). + * Return whether we've printed a macro or not--in other words, whether + * this should trigger a subsequent newline (this should be ignored when + * reentrant). */ static int -formatcode(const char *buf, size_t *start, - size_t end, int reentrant, int last, int nomacro) +formatcode(struct state *st, const char *buf, + size_t *start, size_t end, int reentrant, int nomacro) { enum fmt fmt; size_t i, j, dsz; @@ -186,6 +262,21 @@ formatcode(const char *buf, size_t *start, assert(*start + 1 < end); assert('<' == buf[*start + 1]); + /* + * First, look up the format code. + * If it's not valid, then exit immediately. + */ + for (fmt = 0; fmt < FMT__MAX; fmt++) + if (buf[*start] == fmts[fmt]) + break; + + if (FMT__MAX == fmt) { + putchar(last = buf[(*start)++]); + if ('\\' == last) + putchar('e'); + return(0); + } + /* * Determine whether we're overriding our delimiter. * According to POD, if we have more than one '<' followed by a @@ -201,33 +292,21 @@ formatcode(const char *buf, size_t *start, if (dsz > 1 && (i >= end || ' ' != buf[i])) dsz = 1; - for (fmt = 0; fmt < FMT__MAX; fmt++) - if (buf[*start] == fmts[fmt]) - break; - - /* Invalid macros are just regular text. */ - - if (FMT__MAX == fmt) { - putchar(buf[*start]); - (*start)++; - return(0); - } - /* Remember, if dsz>1, to jump the trailing space. */ *start += dsz + 1 + (dsz > 1 ? 1 : 0); /* - * Escapes don't print macro sequences, so just output them like - * normal text before processing for macros. + * Escapes and ignored codes (NULL and INDEX) don't print macro + * sequences, so just output them like normal text before + * processing for real macros. */ if (FMT_ESCAPE == fmt) { formatescape(buf, start, end); return(0); } else if (FMT_NULL == fmt || FMT_INDEX == fmt) { /* - * For indices and nulls, just consume. - * Be wary of encountering custom delimiters (dsz>1), - * which require special handling. + * Just consume til the end delimiter, accounting for + * whether it's a custom one. */ for ( ; *start < end; (*start)++) { if ('>' != buf[*start]) @@ -249,6 +328,10 @@ formatcode(const char *buf, size_t *start, return(0); } + /* + * Check whether we're supposed to print macro stuff (this is + * suppressed in, e.g., "Nm" and "Sh" macros). + */ if ( ! nomacro) { /* * Print out the macro describing this format code. @@ -257,37 +340,61 @@ formatcode(const char *buf, size_t *start, * indicator. * Otherwise, offset us with a space. */ - if ( ! reentrant && last != '\n') - putchar('\n'); - if ( ! reentrant) + if ( ! reentrant) { + if (last != '\n') + putchar('\n'); putchar('.'); - else + } else putchar(' '); /* - * If we don't have whitespace before us, then suppress - * macro whitespace with Ns. + * If we don't have whitespace before us (and none after + * the opening delimiter), then suppress macro + * whitespace with Pf. */ - if (' ' != last) - printf("Ns "); + if (' ' != last && '\n' != last && ' ' != buf[*start]) + printf("Pf "); + switch (fmt) { case (FMT_ITALIC): printf("Em "); break; case (FMT_BOLD): + /* + * Doclifting: if we're a bold "-xx" and we're + * in the SYNOPSIS section, then it's likely + * that we're a flag. + * Be really strict: only do this when the dash + * is followed by alnums til the end marker, + * which mustn't be a custom. + */ + if (SECT_SYNOPSIS == st->sect && + end - *start > 1 && + '-' == buf[*start] && + (isalnum((int)buf[*start + 1]) || + '?' == buf[*start + 1])) { + for (i = *start + 1; i < end; i++) + if ( ! isalnum((int)buf[i])) + break; + if (i < end && '>' == buf[i]) { + (*start)++; + printf("Fl "); + break; + } + } printf("Sy "); break; case (FMT_CODE): printf("Qo Li "); break; case (FMT_LINK): - printf("Lk "); + if ( ! trylink(buf, start, end, dsz)) + printf("No "); break; case (FMT_FILE): printf("Pa "); break; case (FMT_NBSP): - /* TODO. */ printf("No "); break; default: @@ -296,7 +403,7 @@ formatcode(const char *buf, size_t *start, } /* - * Read until we reach the end market (e.g., '>') or until we + * Process until we reach the end marker (e.g., '>') or until we * find a nested format code. * Don't emit any newlines: since we're on a macro line, we * don't want to break the line. @@ -323,7 +430,7 @@ formatcode(const char *buf, size_t *start, } } if (*start + 1 < end && '<' == buf[*start + 1]) { - formatcode(buf, start, end, 1, last, nomacro); + formatcode(st, buf, start, end, 1, nomacro); continue; } @@ -343,43 +450,34 @@ formatcode(const char *buf, size_t *start, printf("\\&"); /* Suppress newline. */ - if ('\n' == (last = buf[(*start)++])) - last = ' '; + if ('\n' == buf[*start]) + putchar(last = ' '); + else + putchar(last = buf[*start]); - putchar(last); + /* Protect against character escapes. */ + if ('\\' == last) + putchar('e'); + + (*start)++; + + if (' ' == last) + while (*start < end && ' ' == buf[*start]) + (*start)++; } if ( ! nomacro && FMT_CODE == fmt) printf(" Qc "); - if (reentrant) - return(1); - - /* FIXME: with the "Qc", this doens't work good. */ - /* - * If we're not reentrant, we want to put ending punctuation on - * the macro line so that it's properly handled by being - * smooshed against the terminal word. + * We're now subsequent the format code. + * If there isn't a space (or newline) here, and we haven't just + * printed a space, then suppress space. */ - skipspace(buf, start, end); + if ( ! nomacro && ' ' != last) + if (' ' != buf[*start] && '\n' != buf[*start]) + printf(" Ns "); - if (',' != buf[*start] && '.' != buf[*start] && - '!' != buf[*start] && '?' != buf[*start] && - ')' != buf[*start]) - return(1); - while (*start < end) { - if (',' != buf[*start] && - '.' != buf[*start] && - '!' != buf[*start] && - '?' != buf[*start] && - ')' != buf[*start]) - break; - putchar(' '); - putchar(buf[*start]); - (*start)++; - } - skipspace(buf, start, end); return(1); } @@ -387,14 +485,14 @@ formatcode(const char *buf, size_t *start, * Calls formatcode() til the end of a paragraph. */ static void -formatcodeln(const char *buf, size_t *start, size_t end, int nomacro) +formatcodeln(struct state *st, const char *buf, + size_t *start, size_t end, int nomacro) { - int last; last = ' '; while (*start < end) { if (*start + 1 < end && '<' == buf[*start + 1]) { - formatcode(buf, start, end, 1, last, nomacro); + formatcode(st, buf, start, end, 1, nomacro); continue; } /* @@ -411,10 +509,15 @@ formatcodeln(const char *buf, size_t *start, size_t en ' ' == buf[*start + 2])) printf("\\&"); - if ('\n' != buf[*start]) - putchar(last = buf[*start]); - else + if ('\n' == buf[*start]) putchar(last = ' '); + else + putchar(last = buf[*start]); + + /* Protect against character escapes. */ + if ('\\' == last) + putchar('e'); + (*start)++; } } @@ -470,7 +573,9 @@ command(struct state *st, const char *buf, size_t star return; start += csz; - skipspace(buf, &start, end); + while (start < end && ' ' == buf[start]) + start++; + len = end - start; if (st->paused) { @@ -487,24 +592,28 @@ command(struct state *st, const char *buf, size_t star * how pod2man handles it. */ printf(".Sh "); - st->isname = 0; - if (end - start == 4) + st->sect = SECT_NONE; + if (end - start == 4) { if (0 == memcmp(&buf[start], "NAME", 4)) - st->isname = 1; - formatcodeln(buf, &start, end, 1); + st->sect = SECT_NAME; + } else if (end - start == 8) { + if (0 == memcmp(&buf[start], "SYNOPSIS", 8)) + st->sect = SECT_SYNOPSIS; + } + formatcodeln(st, buf, &start, end, 1); putchar('\n'); st->haspar = 1; break; case (CMD_HEAD2): printf(".Ss "); - formatcodeln(buf, &start, end, 1); + formatcodeln(st, buf, &start, end, 1); putchar('\n'); st->haspar = 1; break; case (CMD_HEAD3): puts(".Pp"); printf(".Em "); - formatcodeln(buf, &start, end, 0); + formatcodeln(st, buf, &start, end, 0); putchar('\n'); puts(".Pp"); st->haspar = 1; @@ -512,7 +621,7 @@ command(struct state *st, const char *buf, size_t star case (CMD_HEAD4): puts(".Pp"); printf(".No "); - formatcodeln(buf, &start, end, 0); + formatcodeln(st, buf, &start, end, 0); putchar('\n'); puts(".Pp"); st->haspar = 1; @@ -534,6 +643,14 @@ command(struct state *st, const char *buf, size_t star st->lstack[st->lpos - 1] = LIST__MAX; break; case (CMD_ITEM): + if (0 == st->lpos) { + /* + * Bad markup. + * Try to compensate. + */ + st->lstack[st->lpos] = LIST__MAX; + st->lpos++; + } assert(st->lpos > 0); /* * If we're the first =item, guess at what our content @@ -558,7 +675,7 @@ command(struct state *st, const char *buf, size_t star switch (st->lstack[st->lpos - 1]) { case (LIST_TAG): printf(".It "); - formatcodeln(buf, &start, end, 0); + formatcodeln(st, buf, &start, end, 0); putchar('\n'); break; case (LIST_ENUM): @@ -615,12 +732,25 @@ command(struct state *st, const char *buf, size_t star static void verbatim(struct state *st, const char *buf, size_t start, size_t end) { + int last; if ( ! st->parsing || st->paused) return; puts(".Bd -literal"); - printf("%.*s\n", (int)(end - start), &buf[start]); + for (last = ' '; start < end; start++) { + /* + * Handle accidental macros (newline starting with + * control character) and escapes. + */ + if ('\n' == last) + if ('.' == buf[start] || '\'' == buf[start]) + printf("\\&"); + putchar(last = buf[start]); + if ('\\' == buf[start]) + printf("e"); + } + putchar('\n'); puts(".Ed"); } @@ -635,7 +765,6 @@ verbatim(struct state *st, const char *buf, size_t sta static void ordinary(struct state *st, const char *buf, size_t start, size_t end) { - int last; size_t i, j; if ( ! st->parsing || st->paused) @@ -647,7 +776,7 @@ ordinary(struct state *st, const char *buf, size_t sta * we're in "name - description" format. * To wit, print out a "Nm" and "Nd" in that format. */ - if (st->isname) { + if (SECT_NAME == st->sect) { for (i = end - 1; i > start; i--) if ('-' == buf[i]) break; @@ -658,11 +787,11 @@ ordinary(struct state *st, const char *buf, size_t sta if ('-' != buf[i]) break; printf(".Nm "); - formatcodeln(buf, &start, i + 1, 1); + formatcodeln(st, buf, &start, i + 1, 1); putchar('\n'); start = j + 1; printf(".Nd "); - formatcodeln(buf, &start, end, 1); + formatcodeln(st, buf, &start, end, 1); putchar('\n'); return; } @@ -688,7 +817,29 @@ ordinary(struct state *st, const char *buf, size_t sta printf("\\&"); else if ('\n' == last && '\'' == buf[start]) printf("\\&"); +#if notyet + /* + * If we're in the SYNOPSIS, have square + * brackets indicate that we're opening and + * closing an optional context. + */ + if (SECT_SYNOPSIS == st->sect) { + if ('[' == buf[start] || + ']' == buf[start]) { + if (last != '\n') + putchar('\n'); + if ('[' == buf[start]) + printf(".Oo\n"); + else + printf(".Oc\n"); + start++; + continue; + } + } +#endif putchar(last = buf[start++]); + if ('\\' == last) + putchar('e'); } if (start < end - 1 && '<' == buf[start + 1]) { @@ -698,9 +849,14 @@ ordinary(struct state *st, const char *buf, size_t sta * what, so print a newline now. * Then print the (possibly nested) macros and * following that, a newline. + * Consume all whitespace so we don't + * accidentally start an implicit literal line. */ - if (formatcode(buf, &start, end, 0, last, 0)) + if (formatcode(st, buf, &start, end, 0, 0)) { putchar(last = '\n'); + while (start < end && ' ' == buf[start]) + start++; + } } else if (start < end && '\n' == buf[start]) { /* * Print the newline only if we haven't already @@ -725,17 +881,7 @@ ordinary(struct state *st, const char *buf, size_t sta for ( ; start < end; start++) if (' ' != buf[start] && '\t' != buf[start]) break; - } else if (start < end) { - /* - * Default: print the character. - * Escape initial control characters. - */ - if ('\n' == last && '.' == buf[start]) - printf("\\&"); - else if ('\n' == last && '\'' == buf[start]) - printf("\\&"); - putchar(last = buf[start++]); - } + } } if (last != '\n') @@ -799,7 +945,7 @@ dofile(const struct args *args, const char *fname, if (NULL != (cp = strrchr(title, '.'))) { *cp++ = '\0'; if (0 == strcmp(cp, "pm")) - section = "3p"; + section = PERL_SECTION; } }