/* $Id: pod2mdoc.c,v 1.2 2014/03/20 15:15:32 schwarze Exp $ */ /* * Copyright (c) 2014 Kristaps Dzonsons * * Permission to use, copy, modify, and distribute this software for any * purpose with or without fee is hereby granted, provided that the above * copyright notice and this permission notice appear in all copies. * * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ #include #include #include #include #include #include #include #include #include #include struct args { const char *title; /* override "Dt" title */ const char *date; /* override "Dd" date */ const char *section; /* override "Dt" section */ }; struct state { int parsing; /* after =cut of before command */ int paused; /* in =begin and before =end */ int haspar; /* in paragraph: do we need Pp? */ int isname; /* are we the NAME section? */ const char *fname; /* file being parsed */ }; enum fmt { FMT_ITALIC, FMT_BOLD, FMT_CODE, FMT_LINK, FMT_ESCAPE, FMT_FILE, FMT_NBSP, FMT_INDEX, FMT_NULL, FMT__MAX }; enum cmd { CMD_POD = 0, CMD_HEAD1, CMD_HEAD2, CMD_HEAD3, CMD_HEAD4, CMD_OVER, CMD_ITEM, CMD_BACK, CMD_BEGIN, CMD_END, CMD_FOR, CMD_ENCODING, CMD_CUT, CMD__MAX }; static const char *const cmds[CMD__MAX] = { "pod", /* CMD_POD */ "head1", /* CMD_HEAD1 */ "head2", /* CMD_HEAD2 */ "head3", /* CMD_HEAD3 */ "head4", /* CMD_HEAD4 */ "over", /* CMD_OVER */ "item", /* CMD_ITEM */ "back", /* CMD_BACK */ "begin", /* CMD_BEGIN */ "end", /* CMD_END */ "for", /* CMD_FOR */ "encoding", /* CMD_ENCODING */ "cut" /* CMD_CUT */ }; static const char fmts[FMT__MAX] = { 'I', /* FMT_ITALIC */ 'B', /* FMT_BOLD */ 'C', /* FMT_CODE */ 'L', /* FMT_LINK */ 'E', /* FMT_ESCAPE */ 'F', /* FMT_FILE */ 'S', /* FMT_NBSP */ 'X', /* FMT_INDEX */ 'Z' /* FMT_NULL */ }; /* * Given buf[*start] is at the start of an escape name, read til the end * of the escape ('>') then try to do something with it. * Sets start to be one after the '>'. */ static void formatescape(const char *buf, size_t *start, size_t end) { char esc[16]; /* no more needed */ size_t i, max; max = sizeof(esc) - 1; i = 0; /* Read til our buffer is full. */ while (*start < end && '>' != buf[*start] && i < max) esc[i++] = buf[(*start)++]; esc[i] = '\0'; if (i == max) { /* Too long... skip til we end. */ while (*start < end && '>' != buf[*start]) (*start)++; return; } else if (*start >= end) return; assert('>' == buf[*start]); (*start)++; /* * TODO: right now, we only recognise the named escapes. * Just let the rest of them go. */ if (0 == strcmp(esc, "lt")) printf("\\(la"); else if (0 == strcmp(esc, "gt")) printf("\\(ra"); else if (0 == strcmp(esc, "vb")) printf("\\(ba"); else if (0 == strcmp(esc, "sol")) printf("\\(sl"); } /* * Skip space characters. */ static void skipspace(const char *buf, size_t *start, size_t end) { while (*start < end && ' ' == buf[*start]) (*start)++; } /* * We're at the character in front of a format code, which is structured * like X<...> and can contain nested format codes. * This consumes the whole format code, and any nested format codes, til * the end of matched production. * If "reentrant", then we're being called after a macro has already * been printed to the current line. * "last" is set to the last read character: this is used to determine * whether we should buffer with space or not. * If "nomacro", then we don't print any macros, just contained data. */ static int formatcode(const char *buf, size_t *start, size_t end, int reentrant, int last, int nomacro) { enum fmt fmt; assert(*start + 1 < end); assert('<' == buf[*start + 1]); for (fmt = 0; fmt < FMT__MAX; fmt++) if (buf[*start] == fmts[fmt]) break; /* Invalid macros are just regular text. */ if (FMT__MAX == fmt) { putchar(buf[*start]); (*start)++; return(0); } *start += 2; /* * Escapes don't print macro sequences, so just output them like * normal text before processing for macros. */ if (FMT_ESCAPE == fmt) { formatescape(buf, start, end); return(0); } else if (FMT_NULL == fmt || FMT_INDEX == fmt) { /* For indices and nulls, just consume. */ while (*start < end && '>' != buf[*start]) (*start)++; if (*start < end) (*start)++; return(0); } if ( ! nomacro) { /* * Print out the macro describing this format code. * If we're not "reentrant" (not yet on a macro line) * then print a newline, if necessary, and the macro * indicator. * Otherwise, offset us with a space. */ if ( ! reentrant && last != '\n') putchar('\n'); if ( ! reentrant) putchar('.'); else putchar(' '); /* * If we don't have whitespace before us, then suppress * macro whitespace with Ns. */ if (' ' != last) printf("Ns "); switch (fmt) { case (FMT_ITALIC): printf("Em "); break; case (FMT_BOLD): printf("Sy "); break; case (FMT_CODE): printf("Qo Li "); break; case (FMT_LINK): printf("Lk "); break; case (FMT_FILE): printf("Pa "); break; case (FMT_NBSP): /* TODO. */ printf("No "); break; default: abort(); } } /* * Read until we reach the end market ('>') or until we find a * nested format code. * Don't emit any newlines: since we're on a macro line, we * don't want to break the line. */ while (*start < end) { if ('>' == buf[*start]) { (*start)++; break; } if (*start + 1 < end && '<' == buf[*start + 1]) { formatcode(buf, start, end, 1, last, nomacro); continue; } if ('\n' != buf[*start]) { /* * Make sure that any macro-like words (or * really any word starting with a capital * letter) is assumed to be a macro that must be * escaped. * XXX: should this be isalpha()? */ if ((' ' == last || '\n' == last) && isupper(buf[*start])) printf("\\&"); putchar(last = buf[*start]); } (*start)++; } if ( ! nomacro && FMT_CODE == fmt) printf(" Qc "); if (reentrant) return(1); /* * If we're not reentrant, we want to put ending punctuation on * the macro line so that it's properly handled by being * smooshed against the terminal word. */ skipspace(buf, start, end); if (',' != buf[*start] && '.' != buf[*start] && '!' != buf[*start] && '?' != buf[*start] && ')' != buf[*start]) return(1); while (*start < end) { if (',' != buf[*start] && '.' != buf[*start] && '!' != buf[*start] && '?' != buf[*start] && ')' != buf[*start]) break; putchar(' '); putchar(buf[*start]); (*start)++; } skipspace(buf, start, end); return(1); } /* * Calls formatcode() til the end of a paragraph. */ static void formatcodeln(const char *buf, size_t *start, size_t end, int nomacro) { int last; last = '\n'; while (*start < end) { if (*start + 1 < end && '<' == buf[*start + 1]) { formatcode(buf, start, end, 1, last, nomacro); continue; } if ('\n' != buf[*start]) putchar(last = buf[*start]); (*start)++; } } /* * A command paragraph, as noted in the perlpod manual, just indicates * that we should do something, optionally with some text to print as * well. */ static void command(struct state *st, const char *buf, size_t start, size_t end) { size_t len, csz; enum cmd cmd; assert('=' == buf[start]); start++; len = end - start; for (cmd = 0; cmd < CMD__MAX; cmd++) { csz = strlen(cmds[cmd]); if (len < csz) continue; if (0 == memcmp(&buf[start], cmd[cmds], csz)) break; } /* Ignore bogus commands. */ if (CMD__MAX == cmd) return; start += csz; skipspace(buf, &start, end); len = end - start; if (st->paused) { st->paused = CMD_END != cmd; return; } switch (cmd) { case (CMD_POD): break; case (CMD_HEAD1): /* * The behaviour of head= follows from a quick glance at * how pod2man handles it. */ printf(".Sh "); st->isname = 0; if (end - start == 4) if (0 == memcmp(&buf[start], "NAME", 4)) st->isname = 1; formatcodeln(buf, &start, end, 1); putchar('\n'); st->haspar = 1; break; case (CMD_HEAD2): printf(".Ss "); formatcodeln(buf, &start, end, 1); putchar('\n'); st->haspar = 1; break; case (CMD_HEAD3): puts(".Pp"); printf(".Em "); formatcodeln(buf, &start, end, 0); putchar('\n'); puts(".Pp"); st->haspar = 1; break; case (CMD_HEAD4): puts(".Pp"); printf(".No "); formatcodeln(buf, &start, end, 0); putchar('\n'); puts(".Pp"); st->haspar = 1; break; case (CMD_OVER): /* * TODO: we should be doing this after we process the * first =item to see whether we'll do an -enum, * -bullet, or something else. */ puts(".Bl -tag -width Ds"); break; case (CMD_ITEM): printf(".It "); formatcodeln(buf, &start, end, 0); putchar('\n'); st->haspar = 1; break; case (CMD_BACK): puts(".El"); break; case (CMD_BEGIN): /* * We disregard all types for now. * TODO: process at least "text" in a -literal block. */ st->paused = 1; break; case (CMD_FOR): /* * We ignore all types of encodings and formats * unilaterally. */ break; case (CMD_ENCODING): break; case (CMD_CUT): st->parsing = 0; return; default: abort(); } /* Any command (but =cut) makes us start parsing. */ st->parsing = 1; } /* * Just pump out the line in a verbatim block. */ static void verbatim(struct state *st, const char *buf, size_t start, size_t end) { if ( ! st->parsing || st->paused) return; puts(".Bd -literal"); printf("%.*s\n", (int)(end - start), &buf[start]); puts(".Ed"); } /* * Ordinary paragraph. * Well, this is really the hardest--POD seems to assume that, for * example, a leading space implies a newline, and so on. * Lots of other snakes in the grass: escaping a newline followed by a * period (accidental mdoc(7) control), double-newlines after macro * passages, etc. */ static void ordinary(struct state *st, const char *buf, size_t start, size_t end) { int last; size_t i, j; if ( ! st->parsing || st->paused) return; /* * Special-case: the NAME section. * If we find a "-" when searching from the end, assume that * we're in "name - description" format. * To wit, print out a "Nm" and "Nd" in that format. */ if (st->isname) { for (i = end - 1; i > start; i--) if ('-' == buf[i]) break; if ('-' == buf[i]) { j = i; /* Roll over multiple "-". */ for ( ; i > start; i--) if ('-' != buf[i]) break; printf(".Nm %.*s\n", (int)((i + 1) - start), &buf[start]); printf(".Nd %.*s\n", (int)(end - (j + 1)), &buf[j + 1]); return; } } if ( ! st->haspar) puts(".Pp"); st->haspar = 0; last = '\n'; while (start < end) { /* * Loop til we get either to a newline or escape. * Escape initial control characters. */ while (start < end) { if (start < end - 1 && '<' == buf[start + 1]) break; else if ('\n' == buf[start]) break; else if ('\n' == last && '.' == buf[start]) printf("\\&"); else if ('\n' == last && '\'' == buf[start]) printf("\\&"); putchar(last = buf[start++]); } if (start < end - 1 && '<' == buf[start + 1]) { /* * We've encountered a format code. * This is going to trigger a macro no matter * what, so print a newline now. * Then print the (possibly nested) macros and * following that, a newline. */ if (formatcode(buf, &start, end, 0, last, 0)) putchar(last = '\n'); } else if (start < end && '\n' == buf[start]) { /* * Print the newline only if we haven't already * printed a newline. */ if (last != '\n') putchar(last = buf[start]); if (++start >= end) continue; /* * If we have whitespace next, eat it to prevent * mdoc(7) from thinking that it's meant for * verbatim text. * It is--but if we start with that, we can't * have a macro subsequent it, which may be * possible if we have an escape next. */ if (' ' == buf[start] || '\t' == buf[start]) { puts(".br"); last = '\n'; } for ( ; start < end; start++) if (' ' != buf[start] && '\t' != buf[start]) break; } else if (start < end) { /* * Default: print the character. * Escape initial control characters. */ if ('\n' == last && '.' == buf[start]) printf("\\&"); else if ('\n' == last && '\'' == buf[start]) printf("\\&"); putchar(last = buf[start++]); } } if (last != '\n') putchar('\n'); } /* * There are three kinds of paragraphs: verbatim (starts with whitespace * of some sort), ordinary (starts without "=" marker), or a command * (default: starts with "="). */ static void dopar(struct state *st, const char *buf, size_t start, size_t end) { if (end == start) return; if (' ' == buf[start] || '\t' == buf[start]) verbatim(st, buf, start, end); else if ('=' != buf[start]) ordinary(st, buf, start, end); else command(st, buf, start, end); } /* * Loop around paragraphs within a document, processing each one in the * POD way. */ static void dofile(const struct args *args, const char *fname, const struct tm *tm, const char *buf, size_t sz) { size_t sup, end, i, cur = 0; struct state st; const char *section, *date; char datebuf[64]; char *title, *cp; if (0 == sz) return; /* Title is last path component of the filename. */ if (NULL != args->title) title = strdup(args->title); else if (NULL != (cp = strrchr(fname, '/'))) title = strdup(cp + 1); else title = strdup(fname); if (NULL == title) { perror(NULL); exit(EXIT_FAILURE); } /* Section is 1 unless suffix is "pm". */ if (NULL == (section = args->section)) { section = "1"; if (NULL != (cp = strrchr(title, '.'))) { *cp++ = '\0'; if (0 == strcmp(cp, "pm")) section = "3p"; } } /* Date. Or the given "tm" if not supplied. */ if (NULL == (date = args->date)) { strftime(datebuf, sizeof(datebuf), "%B %d, %Y", tm); date = datebuf; } for (cp = title; '\0' != *cp; cp++) *cp = toupper((int)*cp); /* The usual mdoc(7) preamble. */ printf(".Dd %s\n", date); printf(".Dt %s %s\n", title, section); puts(".Os"); free(title); memset(&st, 0, sizeof(struct state)); assert(sz > 0); /* Main loop over file contents. */ while (cur < sz) { /* Read until next paragraph. */ for (i = cur + 1; i < sz; i++) if ('\n' == buf[i] && '\n' == buf[i - 1]) { /* Consume blank paragraphs. */ while (i + 1 < sz && '\n' == buf[i + 1]) i++; break; } /* Adjust end marker for EOF. */ end = i < sz ? i - 1 : ('\n' == buf[sz - 1] ? sz - 1 : sz); sup = i < sz ? end + 2 : sz; /* Process paragraph and adjust start. */ dopar(&st, buf, cur, end); cur = sup; } } /* * Read a single file fully into memory. * If the file is "-", do it from stdin. * If successfully read, send the input buffer to dofile() for further * processing. */ static int readfile(const struct args *args, const char *fname) { int fd; char *buf; size_t bufsz, cur; ssize_t ssz; struct tm *tm; time_t ttm; struct stat st; assert(NULL != fname); fd = 0 != strcmp("-", fname) ? open(fname, O_RDONLY, 0) : STDIN_FILENO; if (-1 == fd) { perror(fname); return(0); } if (STDIN_FILENO == fd || -1 == fstat(fd, &st)) { ttm = time(NULL); tm = localtime(&ttm); } else tm = localtime(&st.st_mtime); /* * Arbitrarily-sized initial buffer. * Should be big enough for most files... */ cur = 0; bufsz = 1 << 14; if (NULL == (buf = malloc(bufsz))) { perror(NULL); exit(EXIT_FAILURE); } while ((ssz = read(fd, buf + cur, bufsz - cur)) > 0) { /* Double buffer size on fill. */ if ((size_t)ssz == bufsz - cur) { bufsz *= 2; if (NULL == (buf = realloc(buf, bufsz))) { perror(NULL); exit(EXIT_FAILURE); } } cur += (size_t)ssz; } if (ssz < 0) { perror(fname); free(buf); return(0); } dofile(args, STDIN_FILENO == fd ? "STDIN" : fname, tm, buf, cur); free(buf); if (STDIN_FILENO != fd) close(fd); return(1); } int main(int argc, char *argv[]) { const char *fname, *name; struct args args; int c; name = strrchr(argv[0], '/'); if (name == NULL) name = argv[0]; else ++name; memset(&args, 0, sizeof(struct args)); fname = "-"; /* Accept no arguments for now. */ while (-1 != (c = getopt(argc, argv, "c:d:hln:oq:rs:uv"))) switch (c) { case ('h'): /* FALLTHROUGH */ case ('l'): /* FALLTHROUGH */ case ('c'): /* FALLTHROUGH */ case ('o'): /* FALLTHROUGH */ case ('q'): /* FALLTHROUGH */ case ('r'): /* FALLTHROUGH */ case ('u'): /* FALLTHROUGH */ case ('v'): /* Ignore these. */ break; case ('d'): args.date = optarg; break; case ('n'): args.title = optarg; break; case ('s'): args.section = optarg; break; default: goto usage; } argc -= optind; argv += optind; /* Accept only a single input file. */ if (argc > 2) return(EXIT_FAILURE); else if (1 == argc) fname = *argv; return(readfile(&args, fname) ? EXIT_SUCCESS : EXIT_FAILURE); usage: fprintf(stderr, "usage: %s [-d date] " "[-n title] [-s section]\n", name); return(EXIT_FAILURE); }