Annotation of pod2mdoc/pod2mdoc.c, Revision 1.20
1.20 ! kristaps 1: /* $Id: pod2mdoc.c,v 1.19 2014/04/03 10:17:14 kristaps Exp $ */
1.1 schwarze 2: /*
3: * Copyright (c) 2014 Kristaps Dzonsons <kristaps@bsd.lv>
4: *
5: * Permission to use, copy, modify, and distribute this software for any
6: * purpose with or without fee is hereby granted, provided that the above
7: * copyright notice and this permission notice appear in all copies.
8: *
9: * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
10: * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11: * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
12: * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13: * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14: * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15: * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16: */
17: #include <sys/stat.h>
18: #include <sys/time.h>
19:
20: #include <assert.h>
21: #include <ctype.h>
22: #include <fcntl.h>
23: #include <getopt.h>
24: #include <stdio.h>
25: #include <stdlib.h>
26: #include <string.h>
27: #include <unistd.h>
28:
/*
 * In what section can we find Perl module manuals?
 * Sometimes (Mac OS X) it's 3pm, sometimes (OpenBSD, etc.) 3p.
 * This is used as the default "Dt" section for inputs ending in ".pm"
 * and as the "Xr" section for links containing a double-colon.
 * XXX IF YOU CHANGE THIS, CHANGE POD2MDOC.1 AS WELL.
 */
#define	PERL_SECTION	"3p"
35:
/*
 * Overrides for the mdoc(7) prologue.
 * A NULL member means "no override": the value is then derived from
 * the file name or the supplied time instead.
 */
struct	args {
	const char	*title; /* override "Dt" title */
	const char	*date; /* override "Dd" date */
	const char	*section; /* override "Dt" section */
};
41:
/*
 * The kind of mdoc(7) list an =over block maps to.
 * LIST__MAX doubles as "opened by =over, but no =item seen yet, so
 * the type is still unknown".
 */
enum	list {
	LIST_BULLET = 0, /* "Bl -bullet" */
	LIST_ENUM, /* "Bl -enum" */
	LIST_TAG, /* "Bl -tag" */
	LIST__MAX
};
48:
/*
 * Sections (set by =head1) that get special treatment when
 * formatting their content.
 */
enum	sect {
	SECT_NONE = 0, /* any other section */
	SECT_NAME, /* NAME section */
	SECT_SYNOPSIS, /* SYNOPSIS section */
};
54:
/*
 * Parser state carried from paragraph to paragraph.
 */
struct	state {
	int		 parsing; /* after =cut or before command */
	int		 paused; /* in =begin and before =end */
	int		 haspar; /* non-zero: next paragraph needs no Pp */
	enum sect	 sect; /* which section are we in? */
	const char	*fname; /* file being parsed */
#define	LIST_STACKSZ	 128
	enum list	 lstack[LIST_STACKSZ]; /* open lists */
	size_t		 lpos; /* where in list stack */
};
65:
/*
 * Formatting codes: the letter in front of "<...>".
 * Each value indexes fmts[].
 */
enum	fmt {
	FMT_ITALIC = 0,
	FMT_BOLD = 1,
	FMT_CODE = 2,
	FMT_LINK = 3,
	FMT_ESCAPE = 4,
	FMT_FILE = 5,
	FMT_NBSP = 6,
	FMT_INDEX = 7,
	FMT_NULL = 8,
	FMT__MAX
};

/*
 * Command paragraphs: the word following "=".
 * Each value indexes cmds[].
 */
enum	cmd {
	CMD_POD = 0,
	CMD_HEAD1 = 1,
	CMD_HEAD2 = 2,
	CMD_HEAD3 = 3,
	CMD_HEAD4 = 4,
	CMD_OVER = 5,
	CMD_ITEM = 6,
	CMD_BACK = 7,
	CMD_BEGIN = 8,
	CMD_END = 9,
	CMD_FOR = 10,
	CMD_ENCODING = 11,
	CMD_CUT = 12,
	CMD__MAX
};

/* Command names, keyed by enum cmd. */
static	const char *const cmds[CMD__MAX] = {
	[CMD_POD]	= "pod",
	[CMD_HEAD1]	= "head1",
	[CMD_HEAD2]	= "head2",
	[CMD_HEAD3]	= "head3",
	[CMD_HEAD4]	= "head4",
	[CMD_OVER]	= "over",
	[CMD_ITEM]	= "item",
	[CMD_BACK]	= "back",
	[CMD_BEGIN]	= "begin",
	[CMD_END]	= "end",
	[CMD_FOR]	= "for",
	[CMD_ENCODING]	= "encoding",
	[CMD_CUT]	= "cut"
};

/* Formatting code letters, keyed by enum fmt. */
static	const char fmts[FMT__MAX] = {
	[FMT_ITALIC]	= 'I',
	[FMT_BOLD]	= 'B',
	[FMT_CODE]	= 'C',
	[FMT_LINK]	= 'L',
	[FMT_ESCAPE]	= 'E',
	[FMT_FILE]	= 'F',
	[FMT_NBSP]	= 'S',
	[FMT_INDEX]	= 'X',
	[FMT_NULL]	= 'Z'
};
123:
/*
 * The character most recently written to the output by the
 * formatting routines.
 * It is consulted to decide whether a newline, a space, or an
 * escape is needed before the next piece of output.
 */
static	int last;

/*
 * Given buf[*start] is at the start of an escape name, read til the end
 * of the escape ('>') then try to do something with it.
 * Sets start to be one after the '>', whether or not the name was
 * recognised; if no '>' is found before "end", start is set to "end".
 * Only the named escapes lt, gt, vb and sol produce output; all
 * others are silently dropped.
 */
static void
formatescape(const char *buf, size_t *start, size_t end)
{
	char		 esc[16]; /* no more needed */
	size_t		 i, max;

	max = sizeof(esc) - 1;
	i = 0;
	/* Read til our buffer is full. */
	while (*start < end && '>' != buf[*start] && i < max)
		esc[i++] = buf[(*start)++];
	esc[i] = '\0';

	/*
	 * If the name was too long for our buffer, skip the rest of it.
	 * A truncated name can't match any of the escapes we know
	 * (all are shorter than the buffer), so it falls through the
	 * comparisons below without output.
	 */
	while (*start < end && '>' != buf[*start])
		(*start)++;

	/* Unterminated escape: nothing more to consume. */
	if (*start >= end)
		return;

	/* Always step over the closing '>' so that it isn't printed. */
	(*start)++;

	/*
	 * TODO: right now, we only recognise the named escapes.
	 * Just let the rest of them go.
	 */
	if (0 == strcmp(esc, "lt"))
		printf("\\(la");
	else if (0 == strcmp(esc, "gt"))
		printf("\\(ra");
	else if (0 == strcmp(esc, "vb"))
		printf("\\(ba");
	else if (0 == strcmp(esc, "sol"))
		printf("\\(sl");
	else
		return;

	/* The roff escapes above all end in 'a'. */
	last = 'a';
}
172:
173: /*
1.9 kristaps 174: * Run some heuristics to intuit a link format.
1.19 kristaps 175: * I set "start" to be the end of the sequence (last right-carrot) so
1.9 kristaps 176: * that the caller can safely just continue processing.
1.19 kristaps 177: * If this is just an empty tag, I'll return 0.
1.9 kristaps 178: */
179: static int
180: trylink(const char *buf, size_t *start, size_t end, size_t dsz)
181: {
1.19 kristaps 182: size_t linkstart, realend, linkend, i, j, textsz;
1.18 kristaps 183: const char *text;
1.9 kristaps 184:
185: /*
186: * Scan to the start of the terminus.
187: * This function is more or less replicated in the formatcode()
188: * for null or index formatting codes.
189: */
1.19 kristaps 190: for (linkstart = realend = *start; realend < end; realend++) {
191: if ('>' != buf[realend])
1.9 kristaps 192: continue;
193: else if (dsz == 1)
194: break;
1.19 kristaps 195: assert(realend > 0);
196: if (' ' != buf[realend - 1])
1.9 kristaps 197: continue;
1.19 kristaps 198: for (i = realend, j = 0; i < end && j < dsz; j++)
1.9 kristaps 199: if ('>' != buf[i++])
200: break;
201: if (dsz == j)
202: break;
203: }
1.19 kristaps 204:
205: /* Ignore stubs. */
206: if (realend == end || realend == *start)
1.9 kristaps 207: return(0);
208:
1.19 kristaps 209: /* Set linkend to the end of content. */
210: linkend = dsz > 1 ? realend - 1 : realend;
1.18 kristaps 211:
1.19 kristaps 212: /* Re-scan to see if we have a title or section. */
1.18 kristaps 213: text = &buf[*start];
1.19 kristaps 214: for (textsz = *start; textsz < linkend; textsz++)
215: if ('|' == buf[textsz] || '/' == buf[textsz])
1.18 kristaps 216: break;
217:
1.19 kristaps 218: if (textsz < linkend && '|' == buf[textsz]) {
1.20 ! kristaps 219: /* With title: set start, then end at section. */
1.19 kristaps 220: linkstart = textsz + 1;
1.18 kristaps 221: textsz = textsz - *start;
1.19 kristaps 222: for (i = linkstart; i < linkend; i++)
223: if ('/' == buf[i])
224: break;
225: if (i < linkend)
226: linkend = i;
1.20 ! kristaps 227: } else if (textsz < linkend && '/' == buf[textsz]) {
! 228: /* With section: set end at section. */
! 229: linkend = textsz;
! 230: textsz = 0;
! 231: } else
! 232: /* No title, no section. */
1.18 kristaps 233: textsz = 0;
1.19 kristaps 234:
235: *start = realend;
236: j = linkend - linkstart;
237:
1.20 ! kristaps 238: /* Do we have only subsection material? */
! 239: if (0 == j && '/' == buf[linkend]) {
! 240: linkstart = linkend + 1;
! 241: linkend = dsz > 1 ? realend - 1 : realend;
! 242: if (0 == (j = linkend - linkstart))
! 243: return(0);
! 244: printf("Sx %.*s", (int)j, &buf[linkstart]);
! 245: return(1);
! 246: } else if (0 == j)
1.19 kristaps 247: return(0);
248:
249: /* See if we qualify as being a link or not. */
1.20 ! kristaps 250: if ((j > 4 && 0 == memcmp("http:", &buf[linkstart], j)) ||
! 251: (j > 5 && 0 == memcmp("https:", &buf[linkstart], j)) ||
! 252: (j > 3 && 0 == memcmp("ftp:", &buf[linkstart], j)) ||
! 253: (j > 4 && 0 == memcmp("sftp:", &buf[linkstart], j)) ||
! 254: (j > 3 && 0 == memcmp("smb:", &buf[linkstart], j)) ||
! 255: (j > 3 && 0 == memcmp("afs:", &buf[linkstart], j))) {
! 256: /* Gross. */
! 257: printf("Lk %.*s", (int)((dsz > 1 ? realend - 1 :
! 258: realend) - linkstart), &buf[linkstart]);
1.19 kristaps 259: return(1);
260: }
261:
262: /* See if we qualify as a mailto. */
1.20 ! kristaps 263: if (j > 6 && 0 == memcmp("mailto:", &buf[linkstart], j)) {
1.19 kristaps 264: printf("Mt %.*s", (int)j, &buf[linkstart]);
265: return(1);
266: }
267:
268: /* See if we're a foo(5), foo(5x), or foo(5xx) manpage. */
269: if ((j > 3 && ')' == buf[linkend - 1]) &&
270: ('(' == buf[linkend - 3])) {
271: printf("Xr %.*s %c", (int)(j - 3),
272: &buf[linkstart], buf[linkend - 2]);
273: return(1);
274: } else if ((j > 4 && ')' == buf[linkend - 1]) &&
275: ('(' == buf[linkend - 4])) {
276: printf("Xr %.*s %.*s", (int)(j - 4),
277: &buf[linkstart], 2, &buf[linkend - 3]);
278: return(1);
279: } else if ((j > 5 && ')' == buf[linkend - 1]) &&
280: ('(' == buf[linkend - 5])) {
281: printf("Xr %.*s %.*s", (int)(j - 5),
282: &buf[linkstart], 3, &buf[linkend - 4]);
283: return(1);
284: }
285:
286: /* Last try: do we have a double-colon? */
287: for (i = linkstart + 1; i < linkend; i++)
288: if (':' == buf[i] && ':' == buf[i - 1])
1.18 kristaps 289: break;
1.9 kristaps 290:
1.19 kristaps 291: if (i < linkend)
1.10 kristaps 292: printf("Xr %.*s " PERL_SECTION,
1.19 kristaps 293: (int)j, &buf[linkstart]);
1.9 kristaps 294: else
1.19 kristaps 295: printf("Xr %.*s 1", (int)j, &buf[linkstart]);
1.9 kristaps 296:
297: return(1);
298: }
299:
/*
 * Doclifting: if we're a bold "-xx" and we're in the SYNOPSIS section,
 * then it's likely that we're a flag.
 * Our flag might be followed by an argument, so make sure that we're
 * accounting for that, too.
 * If we don't have a flag at all, however, then assume we're an "Ar".
 * On entry buf[*start] must be the '-' and at least one more byte
 * must precede "end".
 * Prints "Fl " and the flag name (repeatedly, for consecutive
 * flags) and/or "Ar ", and advances "start" past what was printed.
 */
static void
dosynopsisfl(const char *buf, size_t *start, size_t end)
{
	size_t	 i;
again:
	assert(*start + 1 < end);
	assert('-' == buf[*start]);

	/*
	 * The ctype functions require a value representable as
	 * unsigned char, so never hand them a plain (maybe negative)
	 * char.
	 */
	if ( ! isalnum((unsigned char)buf[*start + 1]) &&
	    '?' != buf[*start + 1] &&
	    '-' != buf[*start + 1]) {
		(*start)--;
		fputs("Ar ", stdout);
		return;
	}

	/* Step over the dash and find the end of the flag name. */
	(*start)++;
	for (i = *start; i < end; i++)
		if (isalnum((unsigned char)buf[i]))
			continue;
		else if ('?' == buf[i])
			continue;
		else if ('-' == buf[i])
			continue;
		else if ('_' == buf[i])
			continue;
		else
			break;

	assert(i < end);

	/* A flag name must be followed by a space or the closing '>'. */
	if ( ! (' ' == buf[i] || '>' == buf[i])) {
		printf("Ar ");
		return;
	}

	printf("Fl ");
	/* Escape flag names that look like a two-letter macro. */
	if (end - *start > 1 &&
	    isupper((unsigned char)buf[*start]) &&
	    islower((unsigned char)buf[*start + 1]) &&
	    (end - *start == 2 ||
	     ' ' == buf[*start + 2]))
		printf("\\&");
	printf("%.*s ", (int)(i - *start), &buf[*start]);
	*start = i;

	/* After a space: either another flag, or an argument. */
	if (' ' == buf[i]) {
		while (i < end && ' ' == buf[i])
			i++;
		assert(i < end);
		if ('-' == buf[i]) {
			*start = i;
			goto again;
		}
		printf("Ar ");
		*start = i;
	}
}
365:
1.9 kristaps 366: /*
1.1 schwarze 367: * We're at the character in front of a format code, which is structured
368: * like X<...> and can contain nested format codes.
369: * This consumes the whole format code, and any nested format codes, til
370: * the end of matched production.
371: * If "reentrant", then we're being called after a macro has already
372: * been printed to the current line.
1.6 kristaps 373: * If "nomacro", then we don't print any macros, just contained data
374: * (e.g., following "Sh" or "Nm").
1.15 kristaps 375: * "pos" is only significant in SYNOPSIS, and should be 0 when invoked
376: * as the first format code on a line (for decoration as an "Nm"),
377: * non-zero otherwise.
1.6 kristaps 378: * Return whether we've printed a macro or not--in other words, whether
379: * this should trigger a subsequent newline (this should be ignored when
380: * reentrant).
1.1 schwarze 381: */
382: static int
1.15 kristaps 383: formatcode(struct state *st, const char *buf, size_t *start,
384: size_t end, int reentrant, int nomacro, int pos)
1.1 schwarze 385: {
386: enum fmt fmt;
1.5 kristaps 387: size_t i, j, dsz;
1.1 schwarze 388:
389: assert(*start + 1 < end);
390: assert('<' == buf[*start + 1]);
391:
1.6 kristaps 392: /*
393: * First, look up the format code.
394: * If it's not valid, then exit immediately.
395: */
396: for (fmt = 0; fmt < FMT__MAX; fmt++)
397: if (buf[*start] == fmts[fmt])
398: break;
399:
400: if (FMT__MAX == fmt) {
401: putchar(last = buf[(*start)++]);
1.8 kristaps 402: if ('\\' == last)
403: putchar('e');
1.6 kristaps 404: return(0);
405: }
406:
1.5 kristaps 407: /*
408: * Determine whether we're overriding our delimiter.
409: * According to POD, if we have more than one '<' followed by a
410: * space, then we need a space followed by matching '>' to close
411: * the expression.
412: * Otherwise we use the usual '<' and '>' matched pair.
413: */
414: i = *start + 1;
415: while (i < end && '<' == buf[i])
416: i++;
417: assert(i > *start + 1);
418: dsz = i - (*start + 1);
419: if (dsz > 1 && (i >= end || ' ' != buf[i]))
420: dsz = 1;
421:
422: /* Remember, if dsz>1, to jump the trailing space. */
423: *start += dsz + 1 + (dsz > 1 ? 1 : 0);
1.1 schwarze 424:
425: /*
1.6 kristaps 426: * Escapes and ignored codes (NULL and INDEX) don't print macro
427: * sequences, so just output them like normal text before
428: * processing for real macros.
1.1 schwarze 429: */
430: if (FMT_ESCAPE == fmt) {
431: formatescape(buf, start, end);
432: return(0);
433: } else if (FMT_NULL == fmt || FMT_INDEX == fmt) {
1.5 kristaps 434: /*
1.6 kristaps 435: * Just consume til the end delimiter, accounting for
436: * whether it's a custom one.
1.5 kristaps 437: */
438: for ( ; *start < end; (*start)++) {
439: if ('>' != buf[*start])
440: continue;
441: else if (dsz == 1)
442: break;
443: assert(*start > 0);
444: if (' ' != buf[*start - 1])
445: continue;
446: i = *start;
447: for (j = 0; i < end && j < dsz; j++)
448: if ('>' != buf[i++])
449: break;
450: if (dsz != j)
451: continue;
452: (*start) += dsz;
453: break;
454: }
1.1 schwarze 455: return(0);
456: }
457:
1.6 kristaps 458: /*
459: * Check whether we're supposed to print macro stuff (this is
460: * suppressed in, e.g., "Nm" and "Sh" macros).
461: */
1.1 schwarze 462: if ( ! nomacro) {
463: /*
464: * Print out the macro describing this format code.
465: * If we're not "reentrant" (not yet on a macro line)
466: * then print a newline, if necessary, and the macro
467: * indicator.
468: * Otherwise, offset us with a space.
469: */
1.6 kristaps 470: if ( ! reentrant) {
471: if (last != '\n')
472: putchar('\n');
1.1 schwarze 473: putchar('.');
1.6 kristaps 474: } else
1.1 schwarze 475: putchar(' ');
476:
477: /*
1.6 kristaps 478: * If we don't have whitespace before us (and none after
479: * the opening delimiter), then suppress macro
480: * whitespace with Pf.
1.1 schwarze 481: */
1.6 kristaps 482: if (' ' != last && '\n' != last && ' ' != buf[*start])
483: printf("Pf ");
484:
1.1 schwarze 485: switch (fmt) {
486: case (FMT_ITALIC):
487: printf("Em ");
488: break;
489: case (FMT_BOLD):
1.14 kristaps 490: if (SECT_SYNOPSIS == st->sect) {
491: if (1 == dsz && '-' == buf[*start])
492: dosynopsisfl(buf, start, end);
1.15 kristaps 493: else if (0 == pos)
494: printf("Nm ");
1.14 kristaps 495: else
496: printf("Ar ");
497: break;
498: }
499: printf("Sy ");
1.1 schwarze 500: break;
501: case (FMT_CODE):
1.2 schwarze 502: printf("Qo Li ");
1.1 schwarze 503: break;
504: case (FMT_LINK):
1.19 kristaps 505: /* Try to link; use "No" if it's empty. */
1.9 kristaps 506: if ( ! trylink(buf, start, end, dsz))
507: printf("No ");
1.1 schwarze 508: break;
509: case (FMT_FILE):
510: printf("Pa ");
511: break;
512: case (FMT_NBSP):
513: printf("No ");
514: break;
515: default:
516: abort();
517: }
518: }
519:
520: /*
1.6 kristaps 521: * Process until we reach the end marker (e.g., '>') or until we
1.5 kristaps 522: * find a nested format code.
1.1 schwarze 523: * Don't emit any newlines: since we're on a macro line, we
524: * don't want to break the line.
525: */
526: while (*start < end) {
1.5 kristaps 527: if ('>' == buf[*start] && 1 == dsz) {
1.1 schwarze 528: (*start)++;
529: break;
1.5 kristaps 530: } else if ('>' == buf[*start] &&
531: ' ' == buf[*start - 1]) {
532: /*
533: * Handle custom delimiters.
534: * These require a certain number of
535: * space-preceded carrots before we're really at
536: * the end.
537: */
538: i = *start;
539: for (j = 0; i < end && j < dsz; j++)
540: if ('>' != buf[i++])
541: break;
542: if (dsz == j) {
543: *start += dsz;
544: break;
545: }
1.1 schwarze 546: }
547: if (*start + 1 < end && '<' == buf[*start + 1]) {
1.15 kristaps 548: formatcode(st, buf, start, end, 1, nomacro, 1);
1.1 schwarze 549: continue;
550: }
1.3 schwarze 551:
1.4 schwarze 552: /*
553: * Make sure that any macro-like words (or
554: * really any word starting with a capital
555: * letter) is assumed to be a macro that must be
556: * escaped.
557: * This matches "Xx " and "XxEOLN".
558: */
559: if ((' ' == last || '\n' == last) &&
560: end - *start > 1 &&
561: isupper((int)buf[*start]) &&
562: islower((int)buf[*start + 1]) &&
563: (end - *start == 2 ||
564: ' ' == buf[*start + 2]))
565: printf("\\&");
1.3 schwarze 566:
1.4 schwarze 567: /* Suppress newline. */
1.6 kristaps 568: if ('\n' == buf[*start])
569: putchar(last = ' ');
570: else
571: putchar(last = buf[*start]);
1.4 schwarze 572:
1.8 kristaps 573: /* Protect against character escapes. */
574: if ('\\' == last)
575: putchar('e');
576:
1.6 kristaps 577: (*start)++;
578:
579: if (' ' == last)
580: while (*start < end && ' ' == buf[*start])
581: (*start)++;
1.1 schwarze 582: }
1.2 schwarze 583:
584: if ( ! nomacro && FMT_CODE == fmt)
585: printf(" Qc ");
1.1 schwarze 586:
587: /*
1.6 kristaps 588: * We're now subsequent the format code.
589: * If there isn't a space (or newline) here, and we haven't just
590: * printed a space, then suppress space.
1.1 schwarze 591: */
1.6 kristaps 592: if ( ! nomacro && ' ' != last)
593: if (' ' != buf[*start] && '\n' != buf[*start])
594: printf(" Ns ");
1.5 kristaps 595:
1.1 schwarze 596: return(1);
597: }
598:
599: /*
600: * Calls formatcode() til the end of a paragraph.
601: */
602: static void
1.11 kristaps 603: formatcodeln(struct state *st, const char *buf,
604: size_t *start, size_t end, int nomacro)
1.1 schwarze 605: {
606:
1.4 schwarze 607: last = ' ';
1.1 schwarze 608: while (*start < end) {
609: if (*start + 1 < end && '<' == buf[*start + 1]) {
1.15 kristaps 610: formatcode(st, buf, start, end, 1, nomacro, 1);
1.1 schwarze 611: continue;
612: }
1.4 schwarze 613: /*
614: * Since we're already on a macro line, we want to make
615: * sure that we don't inadvertently invoke a macro.
616: * We need to do this carefully because section names
617: * are used in troff and we don't want to escape
618: * something that needn't be escaped.
619: */
620: if (' ' == last && end - *start > 1 &&
621: isupper((int)buf[*start]) &&
622: islower((int)buf[*start + 1]) &&
623: (end - *start == 2 ||
624: ' ' == buf[*start + 2]))
625: printf("\\&");
626:
1.8 kristaps 627: if ('\n' == buf[*start])
628: putchar(last = ' ');
629: else
1.1 schwarze 630: putchar(last = buf[*start]);
1.8 kristaps 631:
632: /* Protect against character escapes. */
633: if ('\\' == last)
634: putchar('e');
635:
1.1 schwarze 636: (*start)++;
637: }
638: }
639:
640: /*
1.4 schwarze 641: * Guess at what kind of list we are.
642: * These are taken straight from the POD manual.
643: * I don't know what people do in real life.
644: */
645: static enum list
646: listguess(const char *buf, size_t start, size_t end)
647: {
648: size_t len = end - start;
649:
650: assert(end >= start);
651:
652: if (len == 1 && '*' == buf[start])
653: return(LIST_BULLET);
654: if (len == 2 && '1' == buf[start] && '.' == buf[start + 1])
655: return(LIST_ENUM);
656: else if (len == 1 && '1' == buf[start])
657: return(LIST_ENUM);
658: else
659: return(LIST_TAG);
660: }
661:
662: /*
1.1 schwarze 663: * A command paragraph, as noted in the perlpod manual, just indicates
664: * that we should do something, optionally with some text to print as
665: * well.
666: */
667: static void
668: command(struct state *st, const char *buf, size_t start, size_t end)
669: {
670: size_t len, csz;
671: enum cmd cmd;
672:
673: assert('=' == buf[start]);
674: start++;
675: len = end - start;
676:
677: for (cmd = 0; cmd < CMD__MAX; cmd++) {
678: csz = strlen(cmds[cmd]);
679: if (len < csz)
680: continue;
681: if (0 == memcmp(&buf[start], cmd[cmds], csz))
682: break;
683: }
684:
685: /* Ignore bogus commands. */
686:
687: if (CMD__MAX == cmd)
688: return;
689:
690: start += csz;
1.8 kristaps 691: while (start < end && ' ' == buf[start])
692: start++;
693:
1.1 schwarze 694: len = end - start;
695:
696: if (st->paused) {
697: st->paused = CMD_END != cmd;
698: return;
699: }
700:
701: switch (cmd) {
702: case (CMD_POD):
703: break;
704: case (CMD_HEAD1):
705: /*
706: * The behaviour of head= follows from a quick glance at
707: * how pod2man handles it.
708: */
709: printf(".Sh ");
1.11 kristaps 710: st->sect = SECT_NONE;
711: if (end - start == 4) {
1.1 schwarze 712: if (0 == memcmp(&buf[start], "NAME", 4))
1.11 kristaps 713: st->sect = SECT_NAME;
714: } else if (end - start == 8) {
715: if (0 == memcmp(&buf[start], "SYNOPSIS", 8))
716: st->sect = SECT_SYNOPSIS;
717: }
718: formatcodeln(st, buf, &start, end, 1);
1.1 schwarze 719: putchar('\n');
720: st->haspar = 1;
721: break;
722: case (CMD_HEAD2):
723: printf(".Ss ");
1.11 kristaps 724: formatcodeln(st, buf, &start, end, 1);
1.1 schwarze 725: putchar('\n');
726: st->haspar = 1;
727: break;
728: case (CMD_HEAD3):
729: puts(".Pp");
730: printf(".Em ");
1.11 kristaps 731: formatcodeln(st, buf, &start, end, 0);
1.1 schwarze 732: putchar('\n');
733: puts(".Pp");
734: st->haspar = 1;
735: break;
736: case (CMD_HEAD4):
737: puts(".Pp");
738: printf(".No ");
1.11 kristaps 739: formatcodeln(st, buf, &start, end, 0);
1.1 schwarze 740: putchar('\n');
741: puts(".Pp");
742: st->haspar = 1;
743: break;
744: case (CMD_OVER):
1.4 schwarze 745: /*
746: * If we have an existing list that hasn't had an =item
747: * yet, then make sure that we open it now.
748: * We use the default list type, but that can't be
749: * helped (we haven't seen any items yet).
1.1 schwarze 750: */
1.4 schwarze 751: if (st->lpos > 0)
752: if (LIST__MAX == st->lstack[st->lpos - 1]) {
753: st->lstack[st->lpos - 1] = LIST_TAG;
754: puts(".Bl -tag -width Ds");
755: }
756: st->lpos++;
757: assert(st->lpos < LIST_STACKSZ);
758: st->lstack[st->lpos - 1] = LIST__MAX;
1.1 schwarze 759: break;
760: case (CMD_ITEM):
1.6 kristaps 761: if (0 == st->lpos) {
762: /*
763: * Bad markup.
764: * Try to compensate.
765: */
766: st->lstack[st->lpos] = LIST__MAX;
767: st->lpos++;
768: }
1.4 schwarze 769: assert(st->lpos > 0);
770: /*
771: * If we're the first =item, guess at what our content
772: * will be: "*" is a bullet list, "1." is a numbered
773: * list, and everything is tagged.
774: */
775: if (LIST__MAX == st->lstack[st->lpos - 1]) {
776: st->lstack[st->lpos - 1] =
777: listguess(buf, start, end);
778: switch (st->lstack[st->lpos - 1]) {
779: case (LIST_BULLET):
780: puts(".Bl -bullet");
781: break;
782: case (LIST_ENUM):
783: puts(".Bl -enum");
784: break;
785: default:
786: puts(".Bl -tag -width Ds");
787: break;
788: }
789: }
790: switch (st->lstack[st->lpos - 1]) {
791: case (LIST_TAG):
792: printf(".It ");
1.11 kristaps 793: formatcodeln(st, buf, &start, end, 0);
1.4 schwarze 794: putchar('\n');
795: break;
796: case (LIST_ENUM):
797: /* FALLTHROUGH */
798: case (LIST_BULLET):
799: /*
800: * Abandon the remainder of the paragraph
801: * because we're going to be a bulletted or
802: * numbered list.
803: */
804: puts(".It");
805: break;
806: default:
807: abort();
808: }
1.1 schwarze 809: st->haspar = 1;
810: break;
811: case (CMD_BACK):
1.4 schwarze 812: /* Make sure we don't back over the stack. */
813: if (st->lpos > 0) {
814: st->lpos--;
815: puts(".El");
816: }
1.1 schwarze 817: break;
818: case (CMD_BEGIN):
819: /*
820: * We disregard all types for now.
821: * TODO: process at least "text" in a -literal block.
822: */
823: st->paused = 1;
824: break;
825: case (CMD_FOR):
826: /*
827: * We ignore all types of encodings and formats
828: * unilaterally.
829: */
830: break;
831: case (CMD_ENCODING):
832: break;
833: case (CMD_CUT):
834: st->parsing = 0;
835: return;
836: default:
837: abort();
838: }
839:
840: /* Any command (but =cut) makes us start parsing. */
841: st->parsing = 1;
842: }
843:
844: /*
845: * Just pump out the line in a verbatim block.
846: */
847: static void
848: verbatim(struct state *st, const char *buf, size_t start, size_t end)
849: {
1.8 kristaps 850: int last;
1.1 schwarze 851:
852: if ( ! st->parsing || st->paused)
853: return;
854:
855: puts(".Bd -literal");
1.8 kristaps 856: for (last = ' '; start < end; start++) {
857: /*
858: * Handle accidental macros (newline starting with
859: * control character) and escapes.
860: */
861: if ('\n' == last)
1.7 kristaps 862: if ('.' == buf[start] || '\'' == buf[start])
863: printf("\\&");
1.8 kristaps 864: putchar(last = buf[start]);
865: if ('\\' == buf[start])
866: printf("e");
1.7 kristaps 867: }
868: putchar('\n');
1.1 schwarze 869: puts(".Ed");
870: }
871:
/*
 * Helper for dosynopsisop(): scan buf[start, end), the text after
 * an opening bracket, for the closing bracket that matches it.
 * Bracket pairs nested within the text are stepped over.
 * Returns 1 if the match was found, 0 if not.
 */
static int
hasmatch(const char *buf, size_t start, size_t end)
{
	size_t		 depth, pos;

	depth = 0;
	for (pos = start; pos < end; pos++) {
		switch (buf[pos]) {
		case ('['):
			depth++;
			break;
		case (']'):
			/* Not inside a nested pair: it's ours. */
			if (0 == depth)
				return(1);
			depth--;
			break;
		default:
			break;
		}
	}

	return(0);
}
889:
/*
 * If we're in the SYNOPSIS section and we've encounter braces in an
 * ordinary paragraph, then try to see whether we're an [-option].
 * Do this, if we're an opening bracket, by first seeing if we have a
 * matching end via hasmatch().
 * If we're an ending bracket, see if we have a stack already.
 * buf[*start] must be '[' or ']'.
 * "opstack" counts the option blocks currently open and "last" is
 * the last character written to the output.
 * Returns 1 if an "Oo" or "Oc" line was printed; then the bracket
 * and the spaces following it have been consumed and "last" is set
 * to a newline.
 * Returns 0, with nothing changed, if the bracket is to be printed
 * as ordinary text.
 */
static int
dosynopsisop(const char *buf, int *last,
	size_t *start, size_t end, size_t *opstack)
{

	assert('[' == buf[*start] || ']' == buf[*start]);

	if ('[' == buf[*start] && hasmatch(buf, *start + 1, end)) {
		if ('\n' != *last)
			putchar('\n');
		puts(".Oo");
		(*opstack)++;
	} else if ('[' == buf[*start])
		return(0);

	if (']' == buf[*start] && *opstack > 0) {
		if ('\n' != *last)
			putchar('\n');
		puts(".Oc");
		(*opstack)--;
	} else if (']' == buf[*start])
		return(0);

	(*start)++;
	*last = '\n';
	/* Eat following spaces, but never run off the paragraph. */
	while (*start < end && ' ' == buf[*start])
		(*start)++;
	return(1);
}
926:
/*
 * Format the manpage names of the NAME section, found in
 * buf[*start, end) as a comma-separated list, one "Nm" line each.
 * If there's nothing but spaces, print "Nm unknown".
 * On return, "start" has been advanced over what was processed.
 */
static void
donamenm(struct state *st, const char *buf, size_t *start, size_t end)
{
	size_t		 comma;

	/* Skip leading spaces. */
	for ( ; *start < end; (*start)++)
		if (' ' != buf[*start])
			break;

	if (*start == end) {
		puts(".Nm unknown");
		return;
	}

	while (*start < end) {
		fputs(".Nm ", stdout);

		/* The current name runs up to the next comma, if any. */
		comma = *start;
		while (comma < end && ',' != buf[comma])
			comma++;

		formatcodeln(st, buf, start, comma, 1);

		/* That was the last name: finish the line and stop. */
		if (*start == end) {
			putchar('\n');
			break;
		}

		/* Print and step over the comma and following spaces. */
		assert(',' == buf[*start]);
		puts(" ,");
		(*start)++;
		for ( ; *start < end; (*start)++)
			if (' ' != buf[*start])
				break;
	}
}
960:
961: /*
1.1 schwarze 962: * Ordinary paragraph.
963: * Well, this is really the hardest--POD seems to assume that, for
964: * example, a leading space implies a newline, and so on.
965: * Lots of other snakes in the grass: escaping a newline followed by a
966: * period (accidental mdoc(7) control), double-newlines after macro
967: * passages, etc.
968: */
969: static void
970: ordinary(struct state *st, const char *buf, size_t start, size_t end)
971: {
1.13 kristaps 972: size_t i, j, opstack;
1.15 kristaps 973: int seq;
1.1 schwarze 974:
975: if ( ! st->parsing || st->paused)
976: return;
977:
978: /*
979: * Special-case: the NAME section.
980: * If we find a "-" when searching from the end, assume that
981: * we're in "name - description" format.
982: * To wit, print out a "Nm" and "Nd" in that format.
983: */
1.11 kristaps 984: if (SECT_NAME == st->sect) {
1.15 kristaps 985: for (i = end - 2; i > start; i--)
986: if ('-' == buf[i] && ' ' == buf[i + 1])
1.1 schwarze 987: break;
988: if ('-' == buf[i]) {
989: j = i;
990: /* Roll over multiple "-". */
991: for ( ; i > start; i--)
992: if ('-' != buf[i])
993: break;
1.17 kristaps 994: donamenm(st, buf, &start, i + 1);
1.5 kristaps 995: start = j + 1;
1.17 kristaps 996: while (start < end && ' ' == buf[start])
997: start++;
1.15 kristaps 998: fputs(".Nd ", stdout);
1.11 kristaps 999: formatcodeln(st, buf, &start, end, 1);
1.5 kristaps 1000: putchar('\n');
1.1 schwarze 1001: return;
1002: }
1003: }
1004:
1005: if ( ! st->haspar)
1006: puts(".Pp");
1007:
1008: st->haspar = 0;
1009: last = '\n';
1.13 kristaps 1010: opstack = 0;
1.1 schwarze 1011:
1.15 kristaps 1012: for (seq = 0; start < end; seq++) {
1.1 schwarze 1013: /*
1014: * Loop til we get either to a newline or escape.
1015: * Escape initial control characters.
1016: */
1017: while (start < end) {
1018: if (start < end - 1 && '<' == buf[start + 1])
1019: break;
1020: else if ('\n' == buf[start])
1021: break;
1022: else if ('\n' == last && '.' == buf[start])
1023: printf("\\&");
1024: else if ('\n' == last && '\'' == buf[start])
1025: printf("\\&");
1.12 kristaps 1026: /*
1027: * If we're in the SYNOPSIS, have square
1028: * brackets indicate that we're opening and
1029: * closing an optional context.
1030: */
1.13 kristaps 1031: if (SECT_SYNOPSIS == st->sect &&
1032: ('[' == buf[start] ||
1033: ']' == buf[start]) &&
1034: dosynopsisop(buf, &last,
1035: &start, end, &opstack))
1036: continue;
1.1 schwarze 1037: putchar(last = buf[start++]);
1.8 kristaps 1038: if ('\\' == last)
1039: putchar('e');
1.1 schwarze 1040: }
1041:
1042: if (start < end - 1 && '<' == buf[start + 1]) {
1043: /*
1044: * We've encountered a format code.
1045: * This is going to trigger a macro no matter
1046: * what, so print a newline now.
1047: * Then print the (possibly nested) macros and
1048: * following that, a newline.
1.8 kristaps 1049: * Consume all whitespace so we don't
1050: * accidentally start an implicit literal line.
1.16 kristaps 1051: * If the macro ends with a flush comma or
1052: * period, let mdoc(7) handle it for us.
1.1 schwarze 1053: */
1.15 kristaps 1054: if (formatcode(st, buf, &start, end, 0, 0, seq)) {
1.16 kristaps 1055: if ((start == end - 1 ||
1056: (start < end - 1 &&
1057: (' ' == buf[start + 1] ||
1058: '\n' == buf[start + 1]))) &&
1059: ('.' == buf[start] ||
1060: ',' == buf[start])) {
1061: putchar(' ');
1062: putchar(buf[start++]);
1063: }
1.1 schwarze 1064: putchar(last = '\n');
1.6 kristaps 1065: while (start < end && ' ' == buf[start])
1066: start++;
1067: }
1.1 schwarze 1068: } else if (start < end && '\n' == buf[start]) {
1069: /*
1070: * Print the newline only if we haven't already
1071: * printed a newline.
1072: */
1073: if (last != '\n')
1074: putchar(last = buf[start]);
1075: if (++start >= end)
1076: continue;
1077: /*
1078: * If we have whitespace next, eat it to prevent
1079: * mdoc(7) from thinking that it's meant for
1080: * verbatim text.
1081: * It is--but if we start with that, we can't
1082: * have a macro subsequent it, which may be
1083: * possible if we have an escape next.
1084: */
1085: if (' ' == buf[start] || '\t' == buf[start]) {
1086: puts(".br");
1087: last = '\n';
1088: }
1089: for ( ; start < end; start++)
1090: if (' ' != buf[start] && '\t' != buf[start])
1091: break;
1.12 kristaps 1092: }
1.1 schwarze 1093: }
1094:
1095: if (last != '\n')
1096: putchar('\n');
1097: }
1098:
/*
 * Dispatch the paragraph buf[start, end) by its first byte:
 * a space or tab makes it verbatim, "=" makes it a command, and
 * anything else is an ordinary paragraph.
 * Empty paragraphs are ignored.
 */
static void
dopar(struct state *st, const char *buf, size_t start, size_t end)
{

	if (start == end)
		return;

	switch (buf[start]) {
	case (' '):
		/* FALLTHROUGH */
	case ('\t'):
		verbatim(st, buf, start, end);
		break;
	case ('='):
		command(st, buf, start, end);
		break;
	default:
		ordinary(st, buf, start, end);
		break;
	}
}
1117:
1118: /*
1119: * Loop around paragraphs within a document, processing each one in the
1120: * POD way.
1121: */
1122: static void
1123: dofile(const struct args *args, const char *fname,
1124: const struct tm *tm, const char *buf, size_t sz)
1125: {
1126: size_t sup, end, i, cur = 0;
1127: struct state st;
1128: const char *section, *date;
1129: char datebuf[64];
1130: char *title, *cp;
1131:
1132: if (0 == sz)
1133: return;
1134:
1135: /* Title is last path component of the filename. */
1136:
1137: if (NULL != args->title)
1138: title = strdup(args->title);
1139: else if (NULL != (cp = strrchr(fname, '/')))
1140: title = strdup(cp + 1);
1141: else
1142: title = strdup(fname);
1143:
1144: if (NULL == title) {
1145: perror(NULL);
1146: exit(EXIT_FAILURE);
1147: }
1148:
1149: /* Section is 1 unless suffix is "pm". */
1150:
1151: if (NULL == (section = args->section)) {
1152: section = "1";
1153: if (NULL != (cp = strrchr(title, '.'))) {
1154: *cp++ = '\0';
1155: if (0 == strcmp(cp, "pm"))
1.10 kristaps 1156: section = PERL_SECTION;
1.1 schwarze 1157: }
1158: }
1159:
1160: /* Date. Or the given "tm" if not supplied. */
1161:
1162: if (NULL == (date = args->date)) {
1163: strftime(datebuf, sizeof(datebuf), "%B %d, %Y", tm);
1164: date = datebuf;
1165: }
1166:
1167: for (cp = title; '\0' != *cp; cp++)
1168: *cp = toupper((int)*cp);
1169:
1170: /* The usual mdoc(7) preamble. */
1171:
1172: printf(".Dd %s\n", date);
1173: printf(".Dt %s %s\n", title, section);
1174: puts(".Os");
1175:
1176: free(title);
1177:
1178: memset(&st, 0, sizeof(struct state));
1179: assert(sz > 0);
1180:
1181: /* Main loop over file contents. */
1182:
1183: while (cur < sz) {
1184: /* Read until next paragraph. */
1185: for (i = cur + 1; i < sz; i++)
1186: if ('\n' == buf[i] && '\n' == buf[i - 1]) {
1187: /* Consume blank paragraphs. */
1188: while (i + 1 < sz && '\n' == buf[i + 1])
1189: i++;
1190: break;
1191: }
1192:
1193: /* Adjust end marker for EOF. */
1194: end = i < sz ? i - 1 :
1195: ('\n' == buf[sz - 1] ? sz - 1 : sz);
1196: sup = i < sz ? end + 2 : sz;
1197:
1198: /* Process paragraph and adjust start. */
1199: dopar(&st, buf, cur, end);
1200: cur = sup;
1201: }
1202: }
1203:
1204: /*
1205: * Read a single file fully into memory.
1206: * If the file is "-", do it from stdin.
1207: * If successfully read, send the input buffer to dofile() for further
1208: * processing.
1209: */
1210: static int
1211: readfile(const struct args *args, const char *fname)
1212: {
1213: int fd;
1214: char *buf;
1215: size_t bufsz, cur;
1216: ssize_t ssz;
1217: struct tm *tm;
1218: time_t ttm;
1219: struct stat st;
1220:
1221: assert(NULL != fname);
1222:
1223: fd = 0 != strcmp("-", fname) ?
1224: open(fname, O_RDONLY, 0) : STDIN_FILENO;
1225:
1226: if (-1 == fd) {
1227: perror(fname);
1228: return(0);
1229: }
1230:
1231: if (STDIN_FILENO == fd || -1 == fstat(fd, &st)) {
1232: ttm = time(NULL);
1233: tm = localtime(&ttm);
1234: } else
1235: tm = localtime(&st.st_mtime);
1236:
1237: /*
1238: * Arbitrarily-sized initial buffer.
1239: * Should be big enough for most files...
1240: */
1241: cur = 0;
1242: bufsz = 1 << 14;
1243: if (NULL == (buf = malloc(bufsz))) {
1244: perror(NULL);
1245: exit(EXIT_FAILURE);
1246: }
1247:
1248: while ((ssz = read(fd, buf + cur, bufsz - cur)) > 0) {
1249: /* Double buffer size on fill. */
1250: if ((size_t)ssz == bufsz - cur) {
1251: bufsz *= 2;
1252: if (NULL == (buf = realloc(buf, bufsz))) {
1253: perror(NULL);
1254: exit(EXIT_FAILURE);
1255: }
1256: }
1257: cur += (size_t)ssz;
1258: }
1259: if (ssz < 0) {
1260: perror(fname);
1261: free(buf);
1262: return(0);
1263: }
1264:
1265: dofile(args, STDIN_FILENO == fd ?
1266: "STDIN" : fname, tm, buf, cur);
1267: free(buf);
1268: if (STDIN_FILENO != fd)
1269: close(fd);
1270: return(1);
1271: }
1272:
1273: int
1274: main(int argc, char *argv[])
1275: {
1276: const char *fname, *name;
1277: struct args args;
1278: int c;
1279:
1280: name = strrchr(argv[0], '/');
1281: if (name == NULL)
1282: name = argv[0];
1283: else
1284: ++name;
1285:
1286: memset(&args, 0, sizeof(struct args));
1287: fname = "-";
1288:
1289: /* Accept no arguments for now. */
1290:
1291: while (-1 != (c = getopt(argc, argv, "c:d:hln:oq:rs:uv")))
1292: switch (c) {
1293: case ('h'):
1294: /* FALLTHROUGH */
1295: case ('l'):
1296: /* FALLTHROUGH */
1297: case ('c'):
1298: /* FALLTHROUGH */
1299: case ('o'):
1300: /* FALLTHROUGH */
1301: case ('q'):
1302: /* FALLTHROUGH */
1303: case ('r'):
1304: /* FALLTHROUGH */
1305: case ('u'):
1306: /* FALLTHROUGH */
1307: case ('v'):
1308: /* Ignore these. */
1309: break;
1310: case ('d'):
1311: args.date = optarg;
1312: break;
1313: case ('n'):
1314: args.title = optarg;
1315: break;
1316: case ('s'):
1317: args.section = optarg;
1318: break;
1319: default:
1320: goto usage;
1321: }
1322:
1323: argc -= optind;
1324: argv += optind;
1325:
1326: /* Accept only a single input file. */
1327:
1328: if (argc > 2)
1329: return(EXIT_FAILURE);
1330: else if (1 == argc)
1331: fname = *argv;
1332:
1333: return(readfile(&args, fname) ?
1334: EXIT_SUCCESS : EXIT_FAILURE);
1335:
1336: usage:
1337: fprintf(stderr, "usage: %s [-d date] "
1338: "[-n title] [-s section]\n", name);
1339:
1340: return(EXIT_FAILURE);
1341: }
CVSweb