Annotation of pod2mdoc/pod2mdoc.c, Revision 1.27
1.27 ! schwarze 1: /* $Id: pod2mdoc.c,v 1.26 2014/07/11 09:07:33 schwarze Exp $ */
1.1 schwarze 2: /*
3: * Copyright (c) 2014 Kristaps Dzonsons <kristaps@bsd.lv>
4: *
5: * Permission to use, copy, modify, and distribute this software for any
6: * purpose with or without fee is hereby granted, provided that the above
7: * copyright notice and this permission notice appear in all copies.
8: *
9: * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
10: * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11: * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
12: * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13: * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14: * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15: * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16: */
17: #include <sys/stat.h>
18: #include <sys/time.h>
19:
20: #include <assert.h>
21: #include <ctype.h>
22: #include <fcntl.h>
23: #include <getopt.h>
24: #include <stdio.h>
25: #include <stdlib.h>
26: #include <string.h>
27: #include <unistd.h>
28:
/*
 * Manual section used when cross-referencing Perl modules.
 * Mac OS X files them under "3pm"; OpenBSD and others use "3p".
 * XXX: if this value changes, pod2mdoc.1 must be changed to match.
 */
#define	PERL_SECTION	"3p"
35:
/*
 * Optional overrides for values that end up in the mdoc(7) prologue.
 */
struct args {
	const char	*title;		/* replaces the "Dt" title */
	const char	*date;		/* replaces the "Dd" date */
	const char	*section;	/* replaces the "Dt" section */
};
41:
/*
 * Kinds of list an "=over" block can turn into.
 * LIST__MAX is also stored on the list stack to mean "opened by =over,
 * type not decided yet".
 */
enum list {
	LIST_BULLET = 0,
	LIST_ENUM = 1,
	LIST_TAG = 2,
	LIST__MAX = 3
};
48:
/*
 * Sections that receive special treatment; everything else is SECT_NONE.
 */
enum sect {
	SECT_NONE = 0,		/* any other section */
	SECT_NAME = 1,		/* the NAME section */
	SECT_SYNOPSIS = 2	/* the SYNOPSIS section */
};
54:
1.1 schwarze 55: struct state {
56: int parsing; /* after =cut of before command */
57: int paused; /* in =begin and before =end */
58: int haspar; /* in paragraph: do we need Pp? */
1.11 kristaps 59: enum sect sect; /* which section are we in? */
1.1 schwarze 60: const char *fname; /* file being parsed */
1.4 schwarze 61: #define LIST_STACKSZ 128
62: enum list lstack[LIST_STACKSZ]; /* open lists */
63: size_t lpos; /* where in list stack */
1.1 schwarze 64: };
65:
/*
 * POD formatting codes, in the same order as the fmts[] letter table.
 */
enum fmt {
	FMT_ITALIC = 0,
	FMT_BOLD = 1,
	FMT_CODE = 2,
	FMT_LINK = 3,
	FMT_ESCAPE = 4,
	FMT_FILE = 5,
	FMT_NBSP = 6,
	FMT_INDEX = 7,
	FMT_NULL = 8,
	FMT__MAX = 9
};
78:
/*
 * POD command paragraphs ("=name"), in the same order as the cmds[]
 * name table.
 */
enum cmd {
	CMD_POD = 0,
	CMD_HEAD1 = 1,
	CMD_HEAD2 = 2,
	CMD_HEAD3 = 3,
	CMD_HEAD4 = 4,
	CMD_OVER = 5,
	CMD_ITEM = 6,
	CMD_BACK = 7,
	CMD_BEGIN = 8,
	CMD_END = 9,
	CMD_FOR = 10,
	CMD_ENCODING = 11,
	CMD_CUT = 12,
	CMD__MAX = 13
};
95:
96: static const char *const cmds[CMD__MAX] = {
97: "pod", /* CMD_POD */
98: "head1", /* CMD_HEAD1 */
99: "head2", /* CMD_HEAD2 */
100: "head3", /* CMD_HEAD3 */
101: "head4", /* CMD_HEAD4 */
102: "over", /* CMD_OVER */
103: "item", /* CMD_ITEM */
104: "back", /* CMD_BACK */
105: "begin", /* CMD_BEGIN */
106: "end", /* CMD_END */
107: "for", /* CMD_FOR */
108: "encoding", /* CMD_ENCODING */
109: "cut" /* CMD_CUT */
110: };
111:
112: static const char fmts[FMT__MAX] = {
113: 'I', /* FMT_ITALIC */
114: 'B', /* FMT_BOLD */
115: 'C', /* FMT_CODE */
116: 'L', /* FMT_LINK */
117: 'E', /* FMT_ESCAPE */
118: 'F', /* FMT_FILE */
119: 'S', /* FMT_NBSP */
120: 'X', /* FMT_INDEX */
121: 'Z' /* FMT_NULL */
122: };
123:
/*
 * The character most recently written to the output by the formatting
 * routines; they consult it to decide on newlines, spacing and escapes.
 */
static int	 last;
125:
1.1 schwarze 126: /*
127: * Given buf[*start] is at the start of an escape name, read til the end
128: * of the escape ('>') then try to do something with it.
129: * Sets start to be one after the '>'.
130: */
131: static void
132: formatescape(const char *buf, size_t *start, size_t end)
133: {
134: char esc[16]; /* no more needed */
135: size_t i, max;
136:
137: max = sizeof(esc) - 1;
138: i = 0;
139: /* Read til our buffer is full. */
140: while (*start < end && '>' != buf[*start] && i < max)
141: esc[i++] = buf[(*start)++];
142: esc[i] = '\0';
143:
144: if (i == max) {
145: /* Too long... skip til we end. */
146: while (*start < end && '>' != buf[*start])
147: (*start)++;
148: return;
149: } else if (*start >= end)
150: return;
151:
152: assert('>' == buf[*start]);
153: (*start)++;
154:
155: /*
156: * TODO: right now, we only recognise the named escapes.
157: * Just let the rest of them go.
158: */
1.6 kristaps 159: if (0 == strcmp(esc, "lt"))
1.1 schwarze 160: printf("\\(la");
161: else if (0 == strcmp(esc, "gt"))
162: printf("\\(ra");
163: else if (0 == strcmp(esc, "vb"))
164: printf("\\(ba");
165: else if (0 == strcmp(esc, "sol"))
166: printf("\\(sl");
1.6 kristaps 167: else
168: return;
169:
170: last = 'a';
1.1 schwarze 171: }
172:
173: /*
1.9 kristaps 174: * Run some heuristics to intuit a link format.
1.19 kristaps 175: * I set "start" to be the end of the sequence (last right-carrot) so
1.9 kristaps 176: * that the caller can safely just continue processing.
1.19 kristaps 177: * If this is just an empty tag, I'll return 0.
1.9 kristaps 178: */
179: static int
180: trylink(const char *buf, size_t *start, size_t end, size_t dsz)
181: {
1.21 kristaps 182: size_t linkstart, realend, linkend,
183: i, j, textsz, stack;
1.18 kristaps 184: const char *text;
1.9 kristaps 185:
186: /*
187: * Scan to the start of the terminus.
188: * This function is more or less replicated in the formatcode()
189: * for null or index formatting codes.
1.23 kristaps 190: * However, we're slightly different because we might have
191: * nested escapes we need to ignore.
1.9 kristaps 192: */
1.21 kristaps 193: stack = 0;
1.19 kristaps 194: for (linkstart = realend = *start; realend < end; realend++) {
1.23 kristaps 195: if ('<' == buf[realend])
196: stack++;
1.19 kristaps 197: if ('>' != buf[realend])
1.9 kristaps 198: continue;
1.23 kristaps 199: else if (stack-- > 0)
200: continue;
201: if (dsz == 1)
1.9 kristaps 202: break;
1.19 kristaps 203: assert(realend > 0);
204: if (' ' != buf[realend - 1])
1.9 kristaps 205: continue;
1.19 kristaps 206: for (i = realend, j = 0; i < end && j < dsz; j++)
1.9 kristaps 207: if ('>' != buf[i++])
208: break;
209: if (dsz == j)
210: break;
211: }
1.19 kristaps 212:
213: /* Ignore stubs. */
214: if (realend == end || realend == *start)
1.9 kristaps 215: return(0);
216:
1.19 kristaps 217: /* Set linkend to the end of content. */
218: linkend = dsz > 1 ? realend - 1 : realend;
1.18 kristaps 219:
1.19 kristaps 220: /* Re-scan to see if we have a title or section. */
1.18 kristaps 221: text = &buf[*start];
1.19 kristaps 222: for (textsz = *start; textsz < linkend; textsz++)
223: if ('|' == buf[textsz] || '/' == buf[textsz])
1.18 kristaps 224: break;
225:
1.19 kristaps 226: if (textsz < linkend && '|' == buf[textsz]) {
1.20 kristaps 227: /* With title: set start, then end at section. */
1.19 kristaps 228: linkstart = textsz + 1;
1.18 kristaps 229: textsz = textsz - *start;
1.19 kristaps 230: for (i = linkstart; i < linkend; i++)
231: if ('/' == buf[i])
232: break;
233: if (i < linkend)
234: linkend = i;
1.20 kristaps 235: } else if (textsz < linkend && '/' == buf[textsz]) {
236: /* With section: set end at section. */
237: linkend = textsz;
238: textsz = 0;
239: } else
240: /* No title, no section. */
1.18 kristaps 241: textsz = 0;
1.19 kristaps 242:
243: *start = realend;
244: j = linkend - linkstart;
245:
1.20 kristaps 246: /* Do we have only subsection material? */
247: if (0 == j && '/' == buf[linkend]) {
248: linkstart = linkend + 1;
249: linkend = dsz > 1 ? realend - 1 : realend;
250: if (0 == (j = linkend - linkstart))
251: return(0);
252: printf("Sx %.*s", (int)j, &buf[linkstart]);
253: return(1);
254: } else if (0 == j)
1.19 kristaps 255: return(0);
256:
257: /* See if we qualify as being a link or not. */
1.20 kristaps 258: if ((j > 4 && 0 == memcmp("http:", &buf[linkstart], j)) ||
259: (j > 5 && 0 == memcmp("https:", &buf[linkstart], j)) ||
260: (j > 3 && 0 == memcmp("ftp:", &buf[linkstart], j)) ||
261: (j > 4 && 0 == memcmp("sftp:", &buf[linkstart], j)) ||
262: (j > 3 && 0 == memcmp("smb:", &buf[linkstart], j)) ||
263: (j > 3 && 0 == memcmp("afs:", &buf[linkstart], j))) {
264: /* Gross. */
265: printf("Lk %.*s", (int)((dsz > 1 ? realend - 1 :
266: realend) - linkstart), &buf[linkstart]);
1.19 kristaps 267: return(1);
268: }
269:
270: /* See if we qualify as a mailto. */
1.20 kristaps 271: if (j > 6 && 0 == memcmp("mailto:", &buf[linkstart], j)) {
1.19 kristaps 272: printf("Mt %.*s", (int)j, &buf[linkstart]);
273: return(1);
274: }
275:
276: /* See if we're a foo(5), foo(5x), or foo(5xx) manpage. */
277: if ((j > 3 && ')' == buf[linkend - 1]) &&
278: ('(' == buf[linkend - 3])) {
279: printf("Xr %.*s %c", (int)(j - 3),
280: &buf[linkstart], buf[linkend - 2]);
281: return(1);
282: } else if ((j > 4 && ')' == buf[linkend - 1]) &&
283: ('(' == buf[linkend - 4])) {
284: printf("Xr %.*s %.*s", (int)(j - 4),
285: &buf[linkstart], 2, &buf[linkend - 3]);
286: return(1);
287: } else if ((j > 5 && ')' == buf[linkend - 1]) &&
288: ('(' == buf[linkend - 5])) {
289: printf("Xr %.*s %.*s", (int)(j - 5),
290: &buf[linkstart], 3, &buf[linkend - 4]);
291: return(1);
292: }
293:
294: /* Last try: do we have a double-colon? */
295: for (i = linkstart + 1; i < linkend; i++)
296: if (':' == buf[i] && ':' == buf[i - 1])
1.18 kristaps 297: break;
1.9 kristaps 298:
1.19 kristaps 299: if (i < linkend)
1.10 kristaps 300: printf("Xr %.*s " PERL_SECTION,
1.19 kristaps 301: (int)j, &buf[linkstart]);
1.9 kristaps 302: else
1.19 kristaps 303: printf("Xr %.*s 1", (int)j, &buf[linkstart]);
1.9 kristaps 304:
305: return(1);
306: }
307:
/*
 * Doclifting: a bold "-xx" in the SYNOPSIS section is most likely a
 * command-line flag, so print it as "Fl xx".
 * A flag may be followed by blanks and either another flag (handled in
 * the same call) or an argument, for which "Ar " is printed and the
 * text itself is left for the caller.
 * If what follows the dash does not look like a flag at all, only
 * "Ar " is printed and the text, dash included, is left for the caller.
 * On entry buf[*start] must be '-' and at least one more character must
 * precede "end"; *start is advanced past whatever was printed here.
 */
static void
dosynopsisfl(const char *buf, size_t *start, size_t end)
{
	size_t		 i;
again:
	assert(*start + 1 < end);
	assert('-' == buf[*start]);

	/* ctype functions need unsigned char values, hence the casts. */
	if ( ! isalnum((unsigned char)buf[*start + 1]) &&
	    '?' != buf[*start + 1] &&
	    '-' != buf[*start + 1]) {
		/*
		 * NOTE(review): this steps back to the character in
		 * front of the dash, which the caller will then print
		 * too; kept as found, confirm that this is intended.
		 */
		(*start)--;
		fputs("Ar ", stdout);
		return;
	}

	/* Find the end of the flag name. */
	(*start)++;
	for (i = *start; i < end; i++)
		if (isalnum((unsigned char)buf[i]))
			continue;
		else if ('?' == buf[i])
			continue;
		else if ('-' == buf[i])
			continue;
		else if ('_' == buf[i])
			continue;
		else
			break;

	assert(i < end);

	if ( ! (' ' == buf[i] || '>' == buf[i])) {
		/*
		 * Not a plain flag (e.g. "-foo=bar").
		 * Step back onto the dash so that the caller prints the
		 * word in full rather than losing its leading '-'.
		 */
		(*start)--;
		printf("Ar ");
		return;
	}

	printf("Fl ");
	/* Protect a flag name that looks like a two-letter macro. */
	if (end - *start > 1 &&
	    isupper((unsigned char)buf[*start]) &&
	    islower((unsigned char)buf[*start + 1]) &&
	    (end - *start == 2 ||
	     ' ' == buf[*start + 2]))
		printf("\\&");
	printf("%.*s ", (int)(i - *start), &buf[*start]);
	*start = i;

	if (' ' == buf[i]) {
		while (i < end && ' ' == buf[i])
			i++;
		assert(i < end);
		if ('-' == buf[i]) {
			/* Another flag follows: go around again. */
			*start = i;
			goto again;
		}
		/* Anything else is taken to be the flag's argument. */
		printf("Ar ");
		*start = i;
	}
}
373:
1.9 kristaps 374: /*
1.1 schwarze 375: * We're at the character in front of a format code, which is structured
376: * like X<...> and can contain nested format codes.
377: * This consumes the whole format code, and any nested format codes, til
378: * the end of matched production.
379: * If "reentrant", then we're being called after a macro has already
380: * been printed to the current line.
1.6 kristaps 381: * If "nomacro", then we don't print any macros, just contained data
382: * (e.g., following "Sh" or "Nm").
1.15 kristaps 383: * "pos" is only significant in SYNOPSIS, and should be 0 when invoked
384: * as the first format code on a line (for decoration as an "Nm"),
385: * non-zero otherwise.
1.6 kristaps 386: * Return whether we've printed a macro or not--in other words, whether
387: * this should trigger a subsequent newline (this should be ignored when
388: * reentrant).
1.1 schwarze 389: */
390: static int
1.15 kristaps 391: formatcode(struct state *st, const char *buf, size_t *start,
392: size_t end, int reentrant, int nomacro, int pos)
1.1 schwarze 393: {
394: enum fmt fmt;
1.5 kristaps 395: size_t i, j, dsz;
1.1 schwarze 396:
397: assert(*start + 1 < end);
398: assert('<' == buf[*start + 1]);
399:
1.6 kristaps 400: /*
401: * First, look up the format code.
402: * If it's not valid, then exit immediately.
403: */
404: for (fmt = 0; fmt < FMT__MAX; fmt++)
405: if (buf[*start] == fmts[fmt])
406: break;
407:
408: if (FMT__MAX == fmt) {
409: putchar(last = buf[(*start)++]);
1.8 kristaps 410: if ('\\' == last)
411: putchar('e');
1.6 kristaps 412: return(0);
413: }
414:
1.5 kristaps 415: /*
416: * Determine whether we're overriding our delimiter.
417: * According to POD, if we have more than one '<' followed by a
418: * space, then we need a space followed by matching '>' to close
419: * the expression.
420: * Otherwise we use the usual '<' and '>' matched pair.
421: */
422: i = *start + 1;
423: while (i < end && '<' == buf[i])
424: i++;
425: assert(i > *start + 1);
426: dsz = i - (*start + 1);
427: if (dsz > 1 && (i >= end || ' ' != buf[i]))
428: dsz = 1;
429:
430: /* Remember, if dsz>1, to jump the trailing space. */
431: *start += dsz + 1 + (dsz > 1 ? 1 : 0);
1.1 schwarze 432:
433: /*
1.6 kristaps 434: * Escapes and ignored codes (NULL and INDEX) don't print macro
435: * sequences, so just output them like normal text before
436: * processing for real macros.
1.1 schwarze 437: */
438: if (FMT_ESCAPE == fmt) {
439: formatescape(buf, start, end);
440: return(0);
441: } else if (FMT_NULL == fmt || FMT_INDEX == fmt) {
1.5 kristaps 442: /*
1.6 kristaps 443: * Just consume til the end delimiter, accounting for
444: * whether it's a custom one.
1.5 kristaps 445: */
446: for ( ; *start < end; (*start)++) {
447: if ('>' != buf[*start])
448: continue;
449: else if (dsz == 1)
450: break;
451: assert(*start > 0);
452: if (' ' != buf[*start - 1])
453: continue;
454: i = *start;
455: for (j = 0; i < end && j < dsz; j++)
456: if ('>' != buf[i++])
457: break;
458: if (dsz != j)
459: continue;
460: (*start) += dsz;
461: break;
462: }
1.24 kristaps 463: if (*start < end) {
464: assert('>' == buf[*start]);
465: (*start)++;
466: }
467: if (isspace(last))
468: while (*start < end && isspace((int)buf[*start]))
469: (*start)++;
1.1 schwarze 470: return(0);
471: }
472:
1.6 kristaps 473: /*
474: * Check whether we're supposed to print macro stuff (this is
475: * suppressed in, e.g., "Nm" and "Sh" macros).
476: */
1.1 schwarze 477: if ( ! nomacro) {
478: /*
479: * Print out the macro describing this format code.
480: * If we're not "reentrant" (not yet on a macro line)
481: * then print a newline, if necessary, and the macro
482: * indicator.
483: * Otherwise, offset us with a space.
484: */
1.6 kristaps 485: if ( ! reentrant) {
486: if (last != '\n')
487: putchar('\n');
1.1 schwarze 488: putchar('.');
1.6 kristaps 489: } else
1.1 schwarze 490: putchar(' ');
491:
492: /*
1.6 kristaps 493: * If we don't have whitespace before us (and none after
494: * the opening delimiter), then suppress macro
495: * whitespace with Pf.
1.1 schwarze 496: */
1.6 kristaps 497: if (' ' != last && '\n' != last && ' ' != buf[*start])
498: printf("Pf ");
499:
1.1 schwarze 500: switch (fmt) {
501: case (FMT_ITALIC):
502: printf("Em ");
503: break;
504: case (FMT_BOLD):
1.14 kristaps 505: if (SECT_SYNOPSIS == st->sect) {
506: if (1 == dsz && '-' == buf[*start])
507: dosynopsisfl(buf, start, end);
1.15 kristaps 508: else if (0 == pos)
509: printf("Nm ");
1.14 kristaps 510: else
511: printf("Ar ");
512: break;
513: }
1.27 ! schwarze 514: if (0 == strncmp(buf + *start, "NULL", 4) &&
! 515: ('=' == buf[*start + 4] ||
! 516: '>' == buf[*start + 4]))
! 517: printf("Dv ");
! 518: else
! 519: printf("Sy ");
1.1 schwarze 520: break;
521: case (FMT_CODE):
1.2 schwarze 522: printf("Qo Li ");
1.1 schwarze 523: break;
524: case (FMT_LINK):
1.19 kristaps 525: /* Try to link; use "No" if it's empty. */
1.9 kristaps 526: if ( ! trylink(buf, start, end, dsz))
527: printf("No ");
1.1 schwarze 528: break;
529: case (FMT_FILE):
530: printf("Pa ");
531: break;
532: case (FMT_NBSP):
533: printf("No ");
534: break;
535: default:
536: abort();
537: }
538: }
539:
540: /*
1.6 kristaps 541: * Process until we reach the end marker (e.g., '>') or until we
1.5 kristaps 542: * find a nested format code.
1.1 schwarze 543: * Don't emit any newlines: since we're on a macro line, we
544: * don't want to break the line.
545: */
546: while (*start < end) {
1.5 kristaps 547: if ('>' == buf[*start] && 1 == dsz) {
1.1 schwarze 548: (*start)++;
549: break;
1.5 kristaps 550: } else if ('>' == buf[*start] &&
551: ' ' == buf[*start - 1]) {
552: /*
553: * Handle custom delimiters.
554: * These require a certain number of
555: * space-preceded carrots before we're really at
556: * the end.
557: */
558: i = *start;
559: for (j = 0; i < end && j < dsz; j++)
560: if ('>' != buf[i++])
561: break;
562: if (dsz == j) {
563: *start += dsz;
564: break;
565: }
1.1 schwarze 566: }
567: if (*start + 1 < end && '<' == buf[*start + 1]) {
1.15 kristaps 568: formatcode(st, buf, start, end, 1, nomacro, 1);
1.1 schwarze 569: continue;
570: }
1.3 schwarze 571:
1.4 schwarze 572: /*
573: * Make sure that any macro-like words (or
574: * really any word starting with a capital
575: * letter) is assumed to be a macro that must be
576: * escaped.
577: * This matches "Xx " and "XxEOLN".
578: */
579: if ((' ' == last || '\n' == last) &&
580: end - *start > 1 &&
581: isupper((int)buf[*start]) &&
582: islower((int)buf[*start + 1]) &&
583: (end - *start == 2 ||
584: ' ' == buf[*start + 2]))
585: printf("\\&");
1.3 schwarze 586:
1.4 schwarze 587: /* Suppress newline. */
1.6 kristaps 588: if ('\n' == buf[*start])
589: putchar(last = ' ');
590: else
591: putchar(last = buf[*start]);
1.4 schwarze 592:
1.8 kristaps 593: /* Protect against character escapes. */
594: if ('\\' == last)
595: putchar('e');
596:
1.6 kristaps 597: (*start)++;
598:
599: if (' ' == last)
600: while (*start < end && ' ' == buf[*start])
601: (*start)++;
1.1 schwarze 602: }
1.2 schwarze 603:
604: if ( ! nomacro && FMT_CODE == fmt)
605: printf(" Qc ");
1.1 schwarze 606:
607: /*
1.6 kristaps 608: * We're now subsequent the format code.
609: * If there isn't a space (or newline) here, and we haven't just
610: * printed a space, then suppress space.
1.1 schwarze 611: */
1.6 kristaps 612: if ( ! nomacro && ' ' != last)
613: if (' ' != buf[*start] && '\n' != buf[*start])
614: printf(" Ns ");
1.5 kristaps 615:
1.1 schwarze 616: return(1);
617: }
618:
619: /*
620: * Calls formatcode() til the end of a paragraph.
621: */
622: static void
1.11 kristaps 623: formatcodeln(struct state *st, const char *buf,
624: size_t *start, size_t end, int nomacro)
1.1 schwarze 625: {
626:
1.4 schwarze 627: last = ' ';
1.1 schwarze 628: while (*start < end) {
629: if (*start + 1 < end && '<' == buf[*start + 1]) {
1.15 kristaps 630: formatcode(st, buf, start, end, 1, nomacro, 1);
1.1 schwarze 631: continue;
632: }
1.4 schwarze 633: /*
634: * Since we're already on a macro line, we want to make
635: * sure that we don't inadvertently invoke a macro.
636: * We need to do this carefully because section names
637: * are used in troff and we don't want to escape
638: * something that needn't be escaped.
639: */
640: if (' ' == last && end - *start > 1 &&
641: isupper((int)buf[*start]) &&
642: islower((int)buf[*start + 1]) &&
643: (end - *start == 2 ||
644: ' ' == buf[*start + 2]))
645: printf("\\&");
646:
1.8 kristaps 647: if ('\n' == buf[*start])
648: putchar(last = ' ');
649: else
1.1 schwarze 650: putchar(last = buf[*start]);
1.8 kristaps 651:
652: /* Protect against character escapes. */
653: if ('\\' == last)
654: putchar('e');
655:
1.1 schwarze 656: (*start)++;
657: }
658: }
659:
660: /*
1.4 schwarze 661: * Guess at what kind of list we are.
662: * These are taken straight from the POD manual.
663: * I don't know what people do in real life.
664: */
665: static enum list
666: listguess(const char *buf, size_t start, size_t end)
667: {
668: size_t len = end - start;
669:
670: assert(end >= start);
671:
672: if (len == 1 && '*' == buf[start])
673: return(LIST_BULLET);
674: if (len == 2 && '1' == buf[start] && '.' == buf[start + 1])
675: return(LIST_ENUM);
676: else if (len == 1 && '1' == buf[start])
677: return(LIST_ENUM);
678: else
679: return(LIST_TAG);
680: }
681:
682: /*
1.1 schwarze 683: * A command paragraph, as noted in the perlpod manual, just indicates
684: * that we should do something, optionally with some text to print as
685: * well.
686: */
687: static void
688: command(struct state *st, const char *buf, size_t start, size_t end)
689: {
690: size_t len, csz;
691: enum cmd cmd;
692:
693: assert('=' == buf[start]);
694: start++;
695: len = end - start;
696:
697: for (cmd = 0; cmd < CMD__MAX; cmd++) {
698: csz = strlen(cmds[cmd]);
699: if (len < csz)
700: continue;
701: if (0 == memcmp(&buf[start], cmd[cmds], csz))
702: break;
703: }
704:
705: /* Ignore bogus commands. */
706:
707: if (CMD__MAX == cmd)
708: return;
709:
710: start += csz;
1.8 kristaps 711: while (start < end && ' ' == buf[start])
712: start++;
713:
1.1 schwarze 714: len = end - start;
715:
716: if (st->paused) {
717: st->paused = CMD_END != cmd;
718: return;
719: }
720:
721: switch (cmd) {
722: case (CMD_POD):
723: break;
724: case (CMD_HEAD1):
725: /*
726: * The behaviour of head= follows from a quick glance at
727: * how pod2man handles it.
728: */
729: printf(".Sh ");
1.11 kristaps 730: st->sect = SECT_NONE;
731: if (end - start == 4) {
1.1 schwarze 732: if (0 == memcmp(&buf[start], "NAME", 4))
1.11 kristaps 733: st->sect = SECT_NAME;
734: } else if (end - start == 8) {
735: if (0 == memcmp(&buf[start], "SYNOPSIS", 8))
736: st->sect = SECT_SYNOPSIS;
737: }
738: formatcodeln(st, buf, &start, end, 1);
1.1 schwarze 739: putchar('\n');
740: st->haspar = 1;
741: break;
742: case (CMD_HEAD2):
743: printf(".Ss ");
1.11 kristaps 744: formatcodeln(st, buf, &start, end, 1);
1.1 schwarze 745: putchar('\n');
746: st->haspar = 1;
747: break;
748: case (CMD_HEAD3):
749: puts(".Pp");
750: printf(".Em ");
1.11 kristaps 751: formatcodeln(st, buf, &start, end, 0);
1.1 schwarze 752: putchar('\n');
753: puts(".Pp");
754: st->haspar = 1;
755: break;
756: case (CMD_HEAD4):
757: puts(".Pp");
758: printf(".No ");
1.11 kristaps 759: formatcodeln(st, buf, &start, end, 0);
1.1 schwarze 760: putchar('\n');
761: puts(".Pp");
762: st->haspar = 1;
763: break;
764: case (CMD_OVER):
1.4 schwarze 765: /*
766: * If we have an existing list that hasn't had an =item
767: * yet, then make sure that we open it now.
768: * We use the default list type, but that can't be
769: * helped (we haven't seen any items yet).
1.1 schwarze 770: */
1.4 schwarze 771: if (st->lpos > 0)
772: if (LIST__MAX == st->lstack[st->lpos - 1]) {
773: st->lstack[st->lpos - 1] = LIST_TAG;
774: puts(".Bl -tag -width Ds");
775: }
776: st->lpos++;
777: assert(st->lpos < LIST_STACKSZ);
778: st->lstack[st->lpos - 1] = LIST__MAX;
1.1 schwarze 779: break;
780: case (CMD_ITEM):
1.6 kristaps 781: if (0 == st->lpos) {
782: /*
783: * Bad markup.
784: * Try to compensate.
785: */
786: st->lstack[st->lpos] = LIST__MAX;
787: st->lpos++;
788: }
1.4 schwarze 789: assert(st->lpos > 0);
790: /*
791: * If we're the first =item, guess at what our content
792: * will be: "*" is a bullet list, "1." is a numbered
793: * list, and everything is tagged.
794: */
795: if (LIST__MAX == st->lstack[st->lpos - 1]) {
796: st->lstack[st->lpos - 1] =
797: listguess(buf, start, end);
798: switch (st->lstack[st->lpos - 1]) {
799: case (LIST_BULLET):
800: puts(".Bl -bullet");
801: break;
802: case (LIST_ENUM):
803: puts(".Bl -enum");
804: break;
805: default:
806: puts(".Bl -tag -width Ds");
807: break;
808: }
809: }
810: switch (st->lstack[st->lpos - 1]) {
811: case (LIST_TAG):
812: printf(".It ");
1.11 kristaps 813: formatcodeln(st, buf, &start, end, 0);
1.4 schwarze 814: putchar('\n');
815: break;
816: case (LIST_ENUM):
817: /* FALLTHROUGH */
818: case (LIST_BULLET):
819: /*
820: * Abandon the remainder of the paragraph
821: * because we're going to be a bulletted or
822: * numbered list.
823: */
824: puts(".It");
825: break;
826: default:
827: abort();
828: }
1.1 schwarze 829: st->haspar = 1;
830: break;
831: case (CMD_BACK):
1.4 schwarze 832: /* Make sure we don't back over the stack. */
833: if (st->lpos > 0) {
834: st->lpos--;
835: puts(".El");
836: }
1.1 schwarze 837: break;
838: case (CMD_BEGIN):
839: /*
840: * We disregard all types for now.
841: * TODO: process at least "text" in a -literal block.
842: */
843: st->paused = 1;
844: break;
845: case (CMD_FOR):
846: /*
847: * We ignore all types of encodings and formats
848: * unilaterally.
849: */
850: break;
851: case (CMD_ENCODING):
852: break;
853: case (CMD_CUT):
854: st->parsing = 0;
855: return;
856: default:
857: abort();
858: }
859:
860: /* Any command (but =cut) makes us start parsing. */
861: st->parsing = 1;
862: }
863:
864: /*
865: * Just pump out the line in a verbatim block.
866: */
867: static void
868: verbatim(struct state *st, const char *buf, size_t start, size_t end)
869: {
1.8 kristaps 870: int last;
1.22 kristaps 871: size_t i;
1.1 schwarze 872:
873: if ( ! st->parsing || st->paused)
874: return;
1.22 kristaps 875: again:
876: /*
877: * If we're in the SYNOPSIS, see if we're an #include block.
878: * If we are, then print the "In" macro and re-loop.
879: * This handles any number of inclusions, but only when they
880: * come before the remaining parts...
881: */
882: if (SECT_SYNOPSIS == st->sect) {
883: i = start;
884: for (i = start; i < end && ' ' == buf[i]; i++)
885: /* Spin. */ ;
886: if (i == end)
887: return;
888: /* We're an include block! */
889: if (end - i > 10 &&
890: 0 == memcmp(&buf[i], "#include <", 10)) {
891: start = i + 10;
892: while (start < end && ' ' == buf[start])
893: start++;
894: fputs(".In ", stdout);
895: /* Stop til the '>' marker or we hit eoln. */
896: while (start < end &&
897: '>' != buf[start] && '\n' != buf[start])
898: putchar(buf[start++]);
899: putchar('\n');
900: if (start < end && '>' == buf[start])
901: start++;
902: if (start < end && '\n' == buf[start])
903: start++;
904: if (start < end)
905: goto again;
906: return;
907: }
908: }
909:
910: if (start == end)
911: return;
1.1 schwarze 912: puts(".Bd -literal");
1.8 kristaps 913: for (last = ' '; start < end; start++) {
914: /*
915: * Handle accidental macros (newline starting with
916: * control character) and escapes.
917: */
918: if ('\n' == last)
1.7 kristaps 919: if ('.' == buf[start] || '\'' == buf[start])
920: printf("\\&");
1.8 kristaps 921: putchar(last = buf[start]);
922: if ('\\' == buf[start])
923: printf("e");
1.7 kristaps 924: }
925: putchar('\n');
1.1 schwarze 926: puts(".Ed");
927: }
928:
/*
 * Report whether buf[start .. end) contains a ']' that closes a
 * bracket opened before "start", i.e. one that is not paired with a
 * '[' inside the range itself.
 * Returns 1 if so, 0 if not.
 * Used by dosynopsisop().
 */
static int
hasmatch(const char *buf, size_t start, size_t end)
{
	size_t		 depth, pos;

	depth = 0;
	for (pos = start; pos < end; pos++) {
		switch (buf[pos]) {
		case '[':
			depth++;
			break;
		case ']':
			if (depth == 0)
				return(1);
			depth--;
			break;
		default:
			break;
		}
	}

	return(0);
}
946:
/*
 * In the SYNOPSIS section, try to treat a square bracket in an
 * ordinary paragraph as the start or end of an [-option].
 * buf[*start] must be '[' or ']'.
 * An opening bracket qualifies if hasmatch() finds its closing
 * partner; it prints "Oo" and increments *opstack.
 * A closing bracket qualifies if *opstack is non-zero; it prints "Oc"
 * and decrements *opstack.
 * On success the bracket and any blanks after it are consumed, *last is
 * set to a newline, and 1 is returned.
 * Otherwise nothing is printed or changed and 0 is returned.
 */
static int
dosynopsisop(const char *buf, int *last,
	size_t *start, size_t end, size_t *opstack)
{

	assert('[' == buf[*start] || ']' == buf[*start]);

	if ('[' == buf[*start] && hasmatch(buf, *start + 1, end)) {
		if ('\n' != *last)
			putchar('\n');
		puts(".Oo");
		(*opstack)++;
	} else if ('[' == buf[*start])
		return(0);

	if (']' == buf[*start] && *opstack > 0) {
		if ('\n' != *last)
			putchar('\n');
		puts(".Oc");
		(*opstack)--;
	} else if (']' == buf[*start])
		return(0);

	(*start)++;
	*last = '\n';
	/* Eat blanks after the bracket, but never run past "end". */
	while (*start < end && ' ' == buf[*start])
		(*start)++;
	return(1);
}
983:
/*
 * Print the manual names of the NAME section, one "Nm" line each.
 * buf[*start .. end) holds the names, separated by commas; all but the
 * last are followed by a "," on the macro line.
 * If there is nothing but blanks, ".Nm unknown" is printed instead.
 */
static void
donamenm(struct state *st, const char *buf, size_t *start, size_t end)
{
	size_t		 stop;

	/* Leading blanks are not part of the first name. */
	while (*start < end && buf[*start] == ' ')
		(*start)++;

	if (*start == end) {
		puts(".Nm unknown");
		return;
	}

	while (*start < end) {
		fputs(".Nm ", stdout);

		/* The name runs up to the next comma, if any. */
		stop = *start;
		while (stop < end && buf[stop] != ',')
			stop++;
		formatcodeln(st, buf, start, stop, 1);

		/* That was the last name. */
		if (*start == end) {
			putchar('\n');
			break;
		}

		assert(buf[*start] == ',');
		puts(" ,");
		(*start)++;
		while (*start < end && buf[*start] == ' ')
			(*start)++;
	}
}
1017:
1018: /*
1.1 schwarze 1019: * Ordinary paragraph.
1020: * Well, this is really the hardest--POD seems to assume that, for
1021: * example, a leading space implies a newline, and so on.
1022: * Lots of other snakes in the grass: escaping a newline followed by a
1023: * period (accidental mdoc(7) control), double-newlines after macro
1024: * passages, etc.
1025: */
1026: static void
1027: ordinary(struct state *st, const char *buf, size_t start, size_t end)
1028: {
1.13 kristaps 1029: size_t i, j, opstack;
1.15 kristaps 1030: int seq;
1.1 schwarze 1031:
1032: if ( ! st->parsing || st->paused)
1033: return;
1034:
1035: /*
1036: * Special-case: the NAME section.
1037: * If we find a "-" when searching from the end, assume that
1038: * we're in "name - description" format.
1039: * To wit, print out a "Nm" and "Nd" in that format.
1040: */
1.11 kristaps 1041: if (SECT_NAME == st->sect) {
1.15 kristaps 1042: for (i = end - 2; i > start; i--)
1043: if ('-' == buf[i] && ' ' == buf[i + 1])
1.1 schwarze 1044: break;
1045: if ('-' == buf[i]) {
1046: j = i;
1047: /* Roll over multiple "-". */
1048: for ( ; i > start; i--)
1049: if ('-' != buf[i])
1050: break;
1.17 kristaps 1051: donamenm(st, buf, &start, i + 1);
1.5 kristaps 1052: start = j + 1;
1.17 kristaps 1053: while (start < end && ' ' == buf[start])
1054: start++;
1.15 kristaps 1055: fputs(".Nd ", stdout);
1.11 kristaps 1056: formatcodeln(st, buf, &start, end, 1);
1.5 kristaps 1057: putchar('\n');
1.1 schwarze 1058: return;
1059: }
1060: }
1061:
1062: if ( ! st->haspar)
1063: puts(".Pp");
1064:
1065: st->haspar = 0;
1066: last = '\n';
1.13 kristaps 1067: opstack = 0;
1.1 schwarze 1068:
1.15 kristaps 1069: for (seq = 0; start < end; seq++) {
1.1 schwarze 1070: /*
1071: * Loop til we get either to a newline or escape.
1072: * Escape initial control characters.
1073: */
1074: while (start < end) {
1075: if (start < end - 1 && '<' == buf[start + 1])
1076: break;
1077: else if ('\n' == buf[start])
1078: break;
1079: else if ('\n' == last && '.' == buf[start])
1080: printf("\\&");
1081: else if ('\n' == last && '\'' == buf[start])
1082: printf("\\&");
1.12 kristaps 1083: /*
1084: * If we're in the SYNOPSIS, have square
1085: * brackets indicate that we're opening and
1086: * closing an optional context.
1087: */
1.13 kristaps 1088: if (SECT_SYNOPSIS == st->sect &&
1089: ('[' == buf[start] ||
1090: ']' == buf[start]) &&
1091: dosynopsisop(buf, &last,
1092: &start, end, &opstack))
1093: continue;
1.1 schwarze 1094: putchar(last = buf[start++]);
1.8 kristaps 1095: if ('\\' == last)
1096: putchar('e');
1.1 schwarze 1097: }
1098:
1099: if (start < end - 1 && '<' == buf[start + 1]) {
1100: /*
1101: * We've encountered a format code.
1102: * This is going to trigger a macro no matter
1103: * what, so print a newline now.
1104: * Then print the (possibly nested) macros and
1105: * following that, a newline.
1.8 kristaps 1106: * Consume all whitespace so we don't
1107: * accidentally start an implicit literal line.
1.16 kristaps 1108: * If the macro ends with a flush comma or
1109: * period, let mdoc(7) handle it for us.
1.1 schwarze 1110: */
1.15 kristaps 1111: if (formatcode(st, buf, &start, end, 0, 0, seq)) {
1.16 kristaps 1112: if ((start == end - 1 ||
1113: (start < end - 1 &&
1114: (' ' == buf[start + 1] ||
1115: '\n' == buf[start + 1]))) &&
1116: ('.' == buf[start] ||
1117: ',' == buf[start])) {
1118: putchar(' ');
1119: putchar(buf[start++]);
1120: }
1.1 schwarze 1121: putchar(last = '\n');
1.6 kristaps 1122: while (start < end && ' ' == buf[start])
1123: start++;
1124: }
1.1 schwarze 1125: } else if (start < end && '\n' == buf[start]) {
1126: /*
1127: * Print the newline only if we haven't already
1128: * printed a newline.
1129: */
1130: if (last != '\n')
1131: putchar(last = buf[start]);
1132: if (++start >= end)
1133: continue;
1134: /*
1135: * If we have whitespace next, eat it to prevent
1136: * mdoc(7) from thinking that it's meant for
1137: * verbatim text.
1138: * It is--but if we start with that, we can't
1139: * have a macro subsequent it, which may be
1140: * possible if we have an escape next.
1141: */
1142: if (' ' == buf[start] || '\t' == buf[start]) {
1143: puts(".br");
1144: last = '\n';
1145: }
1146: for ( ; start < end; start++)
1147: if (' ' != buf[start] && '\t' != buf[start])
1148: break;
1.12 kristaps 1149: }
1.1 schwarze 1150: }
1151:
1152: if (last != '\n')
1153: putchar('\n');
1154: }
1155:
/*
 * Dispatch one paragraph, buf[start..end), to its handler.
 * The first byte decides the kind of paragraph:
 *   - a space or tab:  verbatim paragraph
 *   - an "=" marker:   command paragraph
 *   - anything else:   ordinary paragraph
 * Empty paragraphs are silently skipped.
 */
static void
dopar(struct state *st, const char *buf, size_t start, size_t end)
{
	char		 first;

	/* Nothing to do for an empty paragraph. */
	if (start == end)
		return;

	first = buf[start];

	if (' ' == first || '\t' == first) {
		verbatim(st, buf, start, end);
		return;
	}

	if ('=' == first) {
		command(st, buf, start, end);
		return;
	}

	ordinary(st, buf, start, end);
}
1174:
1175: /*
1176: * Loop around paragraphs within a document, processing each one in the
1177: * POD way.
1178: */
1179: static void
1180: dofile(const struct args *args, const char *fname,
1181: const struct tm *tm, const char *buf, size_t sz)
1182: {
1183: size_t sup, end, i, cur = 0;
1184: struct state st;
1185: const char *section, *date;
1186: char datebuf[64];
1187: char *title, *cp;
1188:
1189: if (0 == sz)
1190: return;
1191:
1192: /* Title is last path component of the filename. */
1193:
1194: if (NULL != args->title)
1195: title = strdup(args->title);
1196: else if (NULL != (cp = strrchr(fname, '/')))
1197: title = strdup(cp + 1);
1198: else
1199: title = strdup(fname);
1200:
1201: if (NULL == title) {
1202: perror(NULL);
1203: exit(EXIT_FAILURE);
1204: }
1205:
1206: /* Section is 1 unless suffix is "pm". */
1207:
1208: if (NULL == (section = args->section)) {
1209: section = "1";
1210: if (NULL != (cp = strrchr(title, '.'))) {
1211: *cp++ = '\0';
1212: if (0 == strcmp(cp, "pm"))
1.10 kristaps 1213: section = PERL_SECTION;
1.1 schwarze 1214: }
1215: }
1216:
1217: /* Date. Or the given "tm" if not supplied. */
1218:
1219: if (NULL == (date = args->date)) {
1220: strftime(datebuf, sizeof(datebuf), "%B %d, %Y", tm);
1221: date = datebuf;
1222: }
1223:
1224: for (cp = title; '\0' != *cp; cp++)
1225: *cp = toupper((int)*cp);
1226:
1227: /* The usual mdoc(7) preamble. */
1228:
1229: printf(".Dd %s\n", date);
1230: printf(".Dt %s %s\n", title, section);
1231: puts(".Os");
1232:
1233: free(title);
1234:
1235: memset(&st, 0, sizeof(struct state));
1236: assert(sz > 0);
1237:
1238: /* Main loop over file contents. */
1239:
1240: while (cur < sz) {
1241: /* Read until next paragraph. */
1242: for (i = cur + 1; i < sz; i++)
1243: if ('\n' == buf[i] && '\n' == buf[i - 1]) {
1244: /* Consume blank paragraphs. */
1245: while (i + 1 < sz && '\n' == buf[i + 1])
1246: i++;
1247: break;
1248: }
1249:
1250: /* Adjust end marker for EOF. */
1251: end = i < sz ? i - 1 :
1252: ('\n' == buf[sz - 1] ? sz - 1 : sz);
1253: sup = i < sz ? end + 2 : sz;
1254:
1255: /* Process paragraph and adjust start. */
1256: dopar(&st, buf, cur, end);
1257: cur = sup;
1258: }
1259: }
1260:
1261: /*
1262: * Read a single file fully into memory.
1263: * If the file is "-", do it from stdin.
1264: * If successfully read, send the input buffer to dofile() for further
1265: * processing.
1266: */
1267: static int
1268: readfile(const struct args *args, const char *fname)
1269: {
1270: int fd;
1271: char *buf;
1272: size_t bufsz, cur;
1273: ssize_t ssz;
1274: struct tm *tm;
1275: time_t ttm;
1276: struct stat st;
1277:
1278: fd = 0 != strcmp("-", fname) ?
1279: open(fname, O_RDONLY, 0) : STDIN_FILENO;
1280:
1281: if (-1 == fd) {
1282: perror(fname);
1283: return(0);
1284: }
1285:
1286: if (STDIN_FILENO == fd || -1 == fstat(fd, &st)) {
1287: ttm = time(NULL);
1288: tm = localtime(&ttm);
1289: } else
1290: tm = localtime(&st.st_mtime);
1291:
1292: /*
1293: * Arbitrarily-sized initial buffer.
1294: * Should be big enough for most files...
1295: */
1296: cur = 0;
1297: bufsz = 1 << 14;
1298: if (NULL == (buf = malloc(bufsz))) {
1299: perror(NULL);
1300: exit(EXIT_FAILURE);
1301: }
1302:
1303: while ((ssz = read(fd, buf + cur, bufsz - cur)) > 0) {
1304: /* Double buffer size on fill. */
1305: if ((size_t)ssz == bufsz - cur) {
1306: bufsz *= 2;
1307: if (NULL == (buf = realloc(buf, bufsz))) {
1308: perror(NULL);
1309: exit(EXIT_FAILURE);
1310: }
1311: }
1312: cur += (size_t)ssz;
1313: }
1314: if (ssz < 0) {
1315: perror(fname);
1316: free(buf);
1317: return(0);
1318: }
1319:
1320: dofile(args, STDIN_FILENO == fd ?
1321: "STDIN" : fname, tm, buf, cur);
1322: free(buf);
1323: if (STDIN_FILENO != fd)
1324: close(fd);
1325: return(1);
1326: }
1327:
1328: int
1329: main(int argc, char *argv[])
1330: {
1331: const char *fname, *name;
1332: struct args args;
1333: int c;
1334:
1335: name = strrchr(argv[0], '/');
1336: if (name == NULL)
1337: name = argv[0];
1338: else
1339: ++name;
1340:
1341: memset(&args, 0, sizeof(struct args));
1342: fname = "-";
1343:
1344: /* Accept no arguments for now. */
1345:
1346: while (-1 != (c = getopt(argc, argv, "c:d:hln:oq:rs:uv")))
1347: switch (c) {
1348: case ('h'):
1349: /* FALLTHROUGH */
1350: case ('l'):
1351: /* FALLTHROUGH */
1352: case ('c'):
1353: /* FALLTHROUGH */
1354: case ('o'):
1355: /* FALLTHROUGH */
1356: case ('q'):
1357: /* FALLTHROUGH */
1358: case ('r'):
1359: /* FALLTHROUGH */
1360: case ('u'):
1361: /* FALLTHROUGH */
1362: case ('v'):
1363: /* Ignore these. */
1364: break;
1365: case ('d'):
1366: args.date = optarg;
1367: break;
1368: case ('n'):
1369: args.title = optarg;
1370: break;
1371: case ('s'):
1372: args.section = optarg;
1373: break;
1374: default:
1375: goto usage;
1376: }
1377:
1378: argc -= optind;
1379: argv += optind;
1380:
1381: /* Accept only a single input file. */
1382:
1.25 schwarze 1383: if (argc > 1)
1384: goto usage;
1.1 schwarze 1385: else if (1 == argc)
1386: fname = *argv;
1387:
1388: return(readfile(&args, fname) ?
1389: EXIT_SUCCESS : EXIT_FAILURE);
1390:
1391: usage:
1392: fprintf(stderr, "usage: %s [-d date] "
1.25 schwarze 1393: "[-n title] [-s section] [file]\n", name);
1.1 schwarze 1394:
1395: return(EXIT_FAILURE);
1396: }
CVSweb