=================================================================== RCS file: /cvs/pod2mdoc/pod2mdoc.c,v retrieving revision 1.3 retrieving revision 1.7 diff -u -p -r1.3 -r1.7 --- pod2mdoc/pod2mdoc.c 2014/03/20 15:18:56 1.3 +++ pod2mdoc/pod2mdoc.c 2014/03/23 23:35:59 1.7 @@ -1,4 +1,4 @@ -/* $Id: pod2mdoc.c,v 1.3 2014/03/20 15:18:56 schwarze Exp $ */ +/* $Id: pod2mdoc.c,v 1.7 2014/03/23 23:35:59 kristaps Exp $ */ /* * Copyright (c) 2014 Kristaps Dzonsons * @@ -32,12 +32,22 @@ struct args { const char *section; /* override "Dt" section */ }; +enum list { + LIST_BULLET = 0, + LIST_ENUM, + LIST_TAG, + LIST__MAX +}; + struct state { int parsing; /* after =cut of before command */ int paused; /* in =begin and before =end */ int haspar; /* in paragraph: do we need Pp? */ int isname; /* are we the NAME section? */ const char *fname; /* file being parsed */ +#define LIST_STACKSZ 128 + enum list lstack[LIST_STACKSZ]; /* open lists */ + size_t lpos; /* where in list stack */ }; enum fmt { @@ -98,6 +108,8 @@ static const char fmts[FMT__MAX] = { 'Z' /* FMT_NULL */ }; +static int last; + /* * Given buf[*start] is at the start of an escape name, read til the end * of the escape ('>') then try to do something with it. @@ -131,7 +143,7 @@ formatescape(const char *buf, size_t *start, size_t en * TODO: right now, we only recognise the named escapes. * Just let the rest of them go. */ - if (0 == strcmp(esc, "lt")) + if (0 == strcmp(esc, "lt")) printf("\\(la"); else if (0 == strcmp(esc, "gt")) printf("\\(ra"); @@ -139,17 +151,24 @@ formatescape(const char *buf, size_t *start, size_t en printf("\\(ba"); else if (0 == strcmp(esc, "sol")) printf("\\(sl"); + else + return; + + last = 'a'; } /* * Skip space characters. */ -static void +static int skipspace(const char *buf, size_t *start, size_t end) { + size_t sv = *start; while (*start < end && ' ' == buf[*start]) (*start)++; + + return(*start > sv); } /* @@ -159,49 +178,90 @@ skipspace(const char *buf, size_t *start, size_t end) * the end of matched production. * If "reentrant", then we're being called after a macro has already * been printed to the current line. - * "last" is set to the last read character: this is used to determine - * whether we should buffer with space or not. - * If "nomacro", then we don't print any macros, just contained data. + * If "nomacro", then we don't print any macros, just contained data + * (e.g., following "Sh" or "Nm"). + * Return whether we've printed a macro or not--in other words, whether + * this should trigger a subsequent newline (this should be ignored when + * reentrant). */ static int formatcode(const char *buf, size_t *start, - size_t end, int reentrant, int last, int nomacro) + size_t end, int reentrant, int nomacro) { enum fmt fmt; + size_t i, j, dsz; assert(*start + 1 < end); assert('<' == buf[*start + 1]); + /* + * First, look up the format code. + * If it's not valid, then exit immediately. + */ for (fmt = 0; fmt < FMT__MAX; fmt++) if (buf[*start] == fmts[fmt]) break; - /* Invalid macros are just regular text. */ - if (FMT__MAX == fmt) { - putchar(buf[*start]); - (*start)++; + putchar(last = buf[(*start)++]); return(0); } - *start += 2; + /* + * Determine whether we're overriding our delimiter. + * According to POD, if we have more than one '<' followed by a + * space, then we need a space followed by matching '>' to close + * the expression. + * Otherwise we use the usual '<' and '>' matched pair. + */ + i = *start + 1; + while (i < end && '<' == buf[i]) + i++; + assert(i > *start + 1); + dsz = i - (*start + 1); + if (dsz > 1 && (i >= end || ' ' != buf[i])) + dsz = 1; + /* Remember, if dsz>1, to jump the trailing space. */ + *start += dsz + 1 + (dsz > 1 ? 1 : 0); + /* - * Escapes don't print macro sequences, so just output them like - * normal text before processing for macros. + * Escapes and ignored codes (NULL and INDEX) don't print macro + * sequences, so just output them like normal text before + * processing for real macros. */ if (FMT_ESCAPE == fmt) { formatescape(buf, start, end); return(0); } else if (FMT_NULL == fmt || FMT_INDEX == fmt) { - /* For indices and nulls, just consume. */ - while (*start < end && '>' != buf[*start]) - (*start)++; - if (*start < end) - (*start)++; + /* + * Just consume til the end delimiter, accounting for + * whether it's a custom one. + */ + for ( ; *start < end; (*start)++) { + if ('>' != buf[*start]) + continue; + else if (dsz == 1) + break; + assert(*start > 0); + if (' ' != buf[*start - 1]) + continue; + i = *start; + for (j = 0; i < end && j < dsz; j++) + if ('>' != buf[i++]) + break; + if (dsz != j) + continue; + (*start) += dsz; + break; + } return(0); } + /* + * Check whether we're supposed to print macro stuff (this is + * suppressed in, e.g., "Nm" and "Sh" macros). + */ if ( ! nomacro) { /* * Print out the macro describing this format code. @@ -210,19 +270,21 @@ formatcode(const char *buf, size_t *start, * indicator. * Otherwise, offset us with a space. */ - if ( ! reentrant && last != '\n') - putchar('\n'); - if ( ! reentrant) + if ( ! reentrant) { + if (last != '\n') + putchar('\n'); putchar('.'); - else + } else putchar(' '); /* - * If we don't have whitespace before us, then suppress - * macro whitespace with Ns. + * If we don't have whitespace before us (and none after + * the opening delimiter), then suppress macro + * whitespace with Pf. */ - if (' ' != last) - printf("Ns "); + if (' ' != last && '\n' != last && ' ' != buf[*start]) + printf("Pf "); + switch (fmt) { case (FMT_ITALIC): printf("Em "); @@ -249,68 +311,77 @@ formatcode(const char *buf, size_t *start, } /* - * Read until we reach the end market ('>') or until we find a - * nested format code. + * Process until we reach the end marker (e.g., '>') or until we + * find a nested format code. * Don't emit any newlines: since we're on a macro line, we * don't want to break the line. */ while (*start < end) { - if ('>' == buf[*start]) { + if ('>' == buf[*start] && 1 == dsz) { (*start)++; break; + } else if ('>' == buf[*start] && + ' ' == buf[*start - 1]) { + /* + * Handle custom delimiters. + * These require a certain number of + * space-preceded carrots before we're really at + * the end. + */ + i = *start; + for (j = 0; i < end && j < dsz; j++) + if ('>' != buf[i++]) + break; + if (dsz == j) { + *start += dsz; + break; + } } if (*start + 1 < end && '<' == buf[*start + 1]) { - formatcode(buf, start, end, 1, last, nomacro); + formatcode(buf, start, end, 1, nomacro); continue; } - /* - * Make sure that any macro-like words (or - * really any word starting with a capital - * letter) is assumed to be a macro that must be - * escaped. - * XXX: should this be isalpha()? - */ - if ((' ' == last || '\n' == last) && - isupper(buf[*start])) - printf("\\&"); + /* + * Make sure that any macro-like words (or + * really any word starting with a capital + * letter) is assumed to be a macro that must be + * escaped. + * This matches "Xx " and "XxEOLN". + */ + if ((' ' == last || '\n' == last) && + end - *start > 1 && + isupper((int)buf[*start]) && + islower((int)buf[*start + 1]) && + (end - *start == 2 || + ' ' == buf[*start + 2])) + printf("\\&"); - last = buf[*start]; - if ('\n' == last) - last = ' '; - putchar(last); + /* Suppress newline. */ + if ('\n' == buf[*start]) + putchar(last = ' '); + else + putchar(last = buf[*start]); (*start)++; + + if (' ' == last) + while (*start < end && ' ' == buf[*start]) + (*start)++; } if ( ! nomacro && FMT_CODE == fmt) printf(" Qc "); - if (reentrant) - return(1); - /* - * If we're not reentrant, we want to put ending punctuation on - * the macro line so that it's properly handled by being - * smooshed against the terminal word. + * We're now subsequent the format code. + * If there isn't a space (or newline) here, and we haven't just + * printed a space, then suppress space. */ - skipspace(buf, start, end); - if (',' != buf[*start] && '.' != buf[*start] && - '!' != buf[*start] && '?' != buf[*start] && - ')' != buf[*start]) - return(1); - while (*start < end) { - if (',' != buf[*start] && - '.' != buf[*start] && - '!' != buf[*start] && - '?' != buf[*start] && - ')' != buf[*start]) - break; - putchar(' '); - putchar(buf[*start]); - (*start)++; - } - skipspace(buf, start, end); + if ( ! nomacro && ' ' != last) + if (' ' != buf[*start] && '\n' != buf[*start]) + printf(" Ns "); + return(1); } @@ -320,21 +391,58 @@ formatcode(const char *buf, size_t *start, static void formatcodeln(const char *buf, size_t *start, size_t end, int nomacro) { - int last; - last = '\n'; + last = ' '; while (*start < end) { if (*start + 1 < end && '<' == buf[*start + 1]) { - formatcode(buf, start, end, 1, last, nomacro); + formatcode(buf, start, end, 1, nomacro); continue; } + /* + * Since we're already on a macro line, we want to make + * sure that we don't inadvertently invoke a macro. + * We need to do this carefully because section names + * are used in troff and we don't want to escape + * something that needn't be escaped. + */ + if (' ' == last && end - *start > 1 && + isupper((int)buf[*start]) && + islower((int)buf[*start + 1]) && + (end - *start == 2 || + ' ' == buf[*start + 2])) + printf("\\&"); + if ('\n' != buf[*start]) putchar(last = buf[*start]); + else + putchar(last = ' '); (*start)++; } } /* + * Guess at what kind of list we are. + * These are taken straight from the POD manual. + * I don't know what people do in real life. + */ +static enum list +listguess(const char *buf, size_t start, size_t end) +{ + size_t len = end - start; + + assert(end >= start); + + if (len == 1 && '*' == buf[start]) + return(LIST_BULLET); + if (len == 2 && '1' == buf[start] && '.' == buf[start + 1]) + return(LIST_ENUM); + else if (len == 1 && '1' == buf[start]) + return(LIST_ENUM); + else + return(LIST_TAG); +} + +/* * A command paragraph, as noted in the perlpod manual, just indicates * that we should do something, optionally with some text to print as * well. @@ -411,21 +519,78 @@ command(struct state *st, const char *buf, size_t star st->haspar = 1; break; case (CMD_OVER): - /* - * TODO: we should be doing this after we process the - * first =item to see whether we'll do an -enum, - * -bullet, or something else. + /* + * If we have an existing list that hasn't had an =item + * yet, then make sure that we open it now. + * We use the default list type, but that can't be + * helped (we haven't seen any items yet). */ - puts(".Bl -tag -width Ds"); + if (st->lpos > 0) + if (LIST__MAX == st->lstack[st->lpos - 1]) { + st->lstack[st->lpos - 1] = LIST_TAG; + puts(".Bl -tag -width Ds"); + } + st->lpos++; + assert(st->lpos < LIST_STACKSZ); + st->lstack[st->lpos - 1] = LIST__MAX; break; case (CMD_ITEM): - printf(".It "); - formatcodeln(buf, &start, end, 0); - putchar('\n'); + if (0 == st->lpos) { + /* + * Bad markup. + * Try to compensate. + */ + st->lstack[st->lpos] = LIST__MAX; + st->lpos++; + } + assert(st->lpos > 0); + /* + * If we're the first =item, guess at what our content + * will be: "*" is a bullet list, "1." is a numbered + * list, and everything is tagged. + */ + if (LIST__MAX == st->lstack[st->lpos - 1]) { + st->lstack[st->lpos - 1] = + listguess(buf, start, end); + switch (st->lstack[st->lpos - 1]) { + case (LIST_BULLET): + puts(".Bl -bullet"); + break; + case (LIST_ENUM): + puts(".Bl -enum"); + break; + default: + puts(".Bl -tag -width Ds"); + break; + } + } + switch (st->lstack[st->lpos - 1]) { + case (LIST_TAG): + printf(".It "); + formatcodeln(buf, &start, end, 0); + putchar('\n'); + break; + case (LIST_ENUM): + /* FALLTHROUGH */ + case (LIST_BULLET): + /* + * Abandon the remainder of the paragraph + * because we're going to be a bulletted or + * numbered list. + */ + puts(".It"); + break; + default: + abort(); + } st->haspar = 1; break; case (CMD_BACK): - puts(".El"); + /* Make sure we don't back over the stack. */ + if (st->lpos > 0) { + st->lpos--; + puts(".El"); + } break; case (CMD_BEGIN): /* @@ -459,12 +624,19 @@ command(struct state *st, const char *buf, size_t star static void verbatim(struct state *st, const char *buf, size_t start, size_t end) { + size_t sv = start; if ( ! st->parsing || st->paused) return; puts(".Bd -literal"); - printf("%.*s\n", (int)(end - start), &buf[start]); + while (start < end) { + if (start > sv && '\n' == buf[start - 1]) + if ('.' == buf[start] || '\'' == buf[start]) + printf("\\&"); + putchar(buf[start++]); + } + putchar('\n'); puts(".Ed"); } @@ -479,7 +651,6 @@ verbatim(struct state *st, const char *buf, size_t sta static void ordinary(struct state *st, const char *buf, size_t start, size_t end) { - int last; size_t i, j; if ( ! st->parsing || st->paused) @@ -501,10 +672,13 @@ ordinary(struct state *st, const char *buf, size_t sta for ( ; i > start; i--) if ('-' != buf[i]) break; - printf(".Nm %.*s\n", - (int)((i + 1) - start), &buf[start]); - printf(".Nd %.*s\n", - (int)(end - (j + 1)), &buf[j + 1]); + printf(".Nm "); + formatcodeln(buf, &start, i + 1, 1); + putchar('\n'); + start = j + 1; + printf(".Nd "); + formatcodeln(buf, &start, end, 1); + putchar('\n'); return; } } @@ -540,8 +714,11 @@ ordinary(struct state *st, const char *buf, size_t sta * Then print the (possibly nested) macros and * following that, a newline. */ - if (formatcode(buf, &start, end, 0, last, 0)) + if (formatcode(buf, &start, end, 0, 0)) { putchar(last = '\n'); + while (start < end && ' ' == buf[start]) + start++; + } } else if (start < end && '\n' == buf[start]) { /* * Print the newline only if we haven't already