Annotation of pod2mdoc/pod2mdoc.c, Revision 1.30
1.30 ! schwarze 1: /* $Id: pod2mdoc.c,v 1.29 2014/07/11 20:45:55 schwarze Exp $ */
1.1 schwarze 2: /*
3: * Copyright (c) 2014 Kristaps Dzonsons <kristaps@bsd.lv>
4: *
5: * Permission to use, copy, modify, and distribute this software for any
6: * purpose with or without fee is hereby granted, provided that the above
7: * copyright notice and this permission notice appear in all copies.
8: *
9: * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
10: * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11: * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
12: * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13: * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14: * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15: * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16: */
17: #include <sys/stat.h>
18: #include <sys/time.h>
19:
20: #include <assert.h>
21: #include <ctype.h>
22: #include <fcntl.h>
23: #include <getopt.h>
24: #include <stdio.h>
25: #include <stdlib.h>
26: #include <string.h>
27: #include <unistd.h>
28:
/*
 * Manual section in which Perl module manuals are looked up.
 * This differs between systems: Mac OS X uses "3pm", while OpenBSD and
 * others use "3p".
 * XXX: whenever this value changes, pod2mdoc.1 must change with it.
 */
#define	PERL_SECTION	"3p"
35:
/*
 * Values that override the arguments of the "Dd" and "Dt" macros.
 */
struct args {
	const char	*title;		/* replaces the "Dt" title */
	const char	*date;		/* replaces the "Dd" date */
	const char	*section;	/* replaces the "Dt" section */
};
41:
/*
 * The kinds of list that can sit on the stack of open lists.
 * LIST__MAX doubles as the marker for a list that has been announced
 * by "=over" but whose kind has not been decided yet.
 */
enum list {
	LIST_BULLET = 0,	/* bulleted list */
	LIST_ENUM,		/* numbered list */
	LIST_TAG,		/* tagged list */
	LIST__MAX		/* count; also "kind not yet known" */
};
48:
/*
 * Sections that receive special treatment while converting.
 */
enum sect {
	SECT_NONE = 0,		/* any other section */
	SECT_NAME,		/* the NAME section */
	SECT_SYNOPSIS		/* the SYNOPSIS section */
};
54:
1.1 schwarze 55: struct state {
56: int parsing; /* after =cut of before command */
57: int paused; /* in =begin and before =end */
58: int haspar; /* in paragraph: do we need Pp? */
1.11 kristaps 59: enum sect sect; /* which section are we in? */
1.1 schwarze 60: const char *fname; /* file being parsed */
1.4 schwarze 61: #define LIST_STACKSZ 128
62: enum list lstack[LIST_STACKSZ]; /* open lists */
63: size_t lpos; /* where in list stack */
1.1 schwarze 64: };
65:
/*
 * POD formatting codes, in the same order as the letters in fmts[].
 */
enum fmt {
	FMT_ITALIC = 0,		/* I<> */
	FMT_BOLD,		/* B<> */
	FMT_CODE,		/* C<> */
	FMT_LINK,		/* L<> */
	FMT_ESCAPE,		/* E<> */
	FMT_FILE,		/* F<> */
	FMT_NBSP,		/* S<> */
	FMT_INDEX,		/* X<> */
	FMT_NULL,		/* Z<> */
	FMT__MAX		/* count; also "unknown code" */
};
78:
/*
 * POD command paragraphs, in the same order as the names in cmds[].
 */
enum cmd {
	CMD_POD = 0,		/* =pod */
	CMD_HEAD1,		/* =head1 */
	CMD_HEAD2,		/* =head2 */
	CMD_HEAD3,		/* =head3 */
	CMD_HEAD4,		/* =head4 */
	CMD_OVER,		/* =over */
	CMD_ITEM,		/* =item */
	CMD_BACK,		/* =back */
	CMD_BEGIN,		/* =begin */
	CMD_END,		/* =end */
	CMD_FOR,		/* =for */
	CMD_ENCODING,		/* =encoding */
	CMD_CUT,		/* =cut */
	CMD__MAX		/* count; also "unknown command" */
};
95:
96: static const char *const cmds[CMD__MAX] = {
97: "pod", /* CMD_POD */
98: "head1", /* CMD_HEAD1 */
99: "head2", /* CMD_HEAD2 */
100: "head3", /* CMD_HEAD3 */
101: "head4", /* CMD_HEAD4 */
102: "over", /* CMD_OVER */
103: "item", /* CMD_ITEM */
104: "back", /* CMD_BACK */
105: "begin", /* CMD_BEGIN */
106: "end", /* CMD_END */
107: "for", /* CMD_FOR */
108: "encoding", /* CMD_ENCODING */
109: "cut" /* CMD_CUT */
110: };
111:
112: static const char fmts[FMT__MAX] = {
113: 'I', /* FMT_ITALIC */
114: 'B', /* FMT_BOLD */
115: 'C', /* FMT_CODE */
116: 'L', /* FMT_LINK */
117: 'E', /* FMT_ESCAPE */
118: 'F', /* FMT_FILE */
119: 'S', /* FMT_NBSP */
120: 'X', /* FMT_INDEX */
121: 'Z' /* FMT_NULL */
122: };
123:
/*
 * The character most recently written to the output by the text
 * formatters; consulted to decide on newlines, spacing and escaping.
 * After a named escape is printed it is set to 'a' as a stand-in for
 * "some non-blank character".
 */
static int last;
125:
1.1 schwarze 126: /*
127: * Given buf[*start] is at the start of an escape name, read til the end
128: * of the escape ('>') then try to do something with it.
129: * Sets start to be one after the '>'.
130: */
131: static void
132: formatescape(const char *buf, size_t *start, size_t end)
133: {
134: char esc[16]; /* no more needed */
135: size_t i, max;
136:
137: max = sizeof(esc) - 1;
138: i = 0;
139: /* Read til our buffer is full. */
140: while (*start < end && '>' != buf[*start] && i < max)
141: esc[i++] = buf[(*start)++];
142: esc[i] = '\0';
143:
144: if (i == max) {
145: /* Too long... skip til we end. */
146: while (*start < end && '>' != buf[*start])
147: (*start)++;
148: return;
149: } else if (*start >= end)
150: return;
151:
152: assert('>' == buf[*start]);
153: (*start)++;
154:
155: /*
156: * TODO: right now, we only recognise the named escapes.
157: * Just let the rest of them go.
158: */
1.6 kristaps 159: if (0 == strcmp(esc, "lt"))
1.1 schwarze 160: printf("\\(la");
161: else if (0 == strcmp(esc, "gt"))
162: printf("\\(ra");
163: else if (0 == strcmp(esc, "vb"))
164: printf("\\(ba");
165: else if (0 == strcmp(esc, "sol"))
166: printf("\\(sl");
1.6 kristaps 167: else
168: return;
169:
170: last = 'a';
1.1 schwarze 171: }
172:
173: /*
1.9 kristaps 174: * Run some heuristics to intuit a link format.
1.19 kristaps 175: * I set "start" to be the end of the sequence (last right-carrot) so
1.9 kristaps 176: * that the caller can safely just continue processing.
1.19 kristaps 177: * If this is just an empty tag, I'll return 0.
1.9 kristaps 178: */
179: static int
180: trylink(const char *buf, size_t *start, size_t end, size_t dsz)
181: {
1.21 kristaps 182: size_t linkstart, realend, linkend,
183: i, j, textsz, stack;
1.9 kristaps 184:
185: /*
186: * Scan to the start of the terminus.
187: * This function is more or less replicated in the formatcode()
188: * for null or index formatting codes.
1.23 kristaps 189: * However, we're slightly different because we might have
190: * nested escapes we need to ignore.
1.9 kristaps 191: */
1.21 kristaps 192: stack = 0;
1.19 kristaps 193: for (linkstart = realend = *start; realend < end; realend++) {
1.23 kristaps 194: if ('<' == buf[realend])
195: stack++;
1.19 kristaps 196: if ('>' != buf[realend])
1.9 kristaps 197: continue;
1.23 kristaps 198: else if (stack-- > 0)
199: continue;
200: if (dsz == 1)
1.9 kristaps 201: break;
1.19 kristaps 202: assert(realend > 0);
203: if (' ' != buf[realend - 1])
1.9 kristaps 204: continue;
1.19 kristaps 205: for (i = realend, j = 0; i < end && j < dsz; j++)
1.9 kristaps 206: if ('>' != buf[i++])
207: break;
208: if (dsz == j)
209: break;
210: }
1.19 kristaps 211:
212: /* Ignore stubs. */
213: if (realend == end || realend == *start)
1.9 kristaps 214: return(0);
215:
1.19 kristaps 216: /* Set linkend to the end of content. */
217: linkend = dsz > 1 ? realend - 1 : realend;
1.18 kristaps 218:
1.19 kristaps 219: /* Re-scan to see if we have a title or section. */
220: for (textsz = *start; textsz < linkend; textsz++)
221: if ('|' == buf[textsz] || '/' == buf[textsz])
1.18 kristaps 222: break;
223:
1.19 kristaps 224: if (textsz < linkend && '|' == buf[textsz]) {
1.20 kristaps 225: /* With title: set start, then end at section. */
1.19 kristaps 226: linkstart = textsz + 1;
1.18 kristaps 227: textsz = textsz - *start;
1.19 kristaps 228: for (i = linkstart; i < linkend; i++)
229: if ('/' == buf[i])
230: break;
231: if (i < linkend)
232: linkend = i;
1.20 kristaps 233: } else if (textsz < linkend && '/' == buf[textsz]) {
234: /* With section: set end at section. */
235: linkend = textsz;
236: textsz = 0;
237: } else
238: /* No title, no section. */
1.18 kristaps 239: textsz = 0;
1.19 kristaps 240:
241: *start = realend;
242: j = linkend - linkstart;
243:
1.20 kristaps 244: /* Do we have only subsection material? */
245: if (0 == j && '/' == buf[linkend]) {
246: linkstart = linkend + 1;
247: linkend = dsz > 1 ? realend - 1 : realend;
248: if (0 == (j = linkend - linkstart))
249: return(0);
250: printf("Sx %.*s", (int)j, &buf[linkstart]);
251: return(1);
252: } else if (0 == j)
1.19 kristaps 253: return(0);
254:
255: /* See if we qualify as being a link or not. */
1.20 kristaps 256: if ((j > 4 && 0 == memcmp("http:", &buf[linkstart], j)) ||
257: (j > 5 && 0 == memcmp("https:", &buf[linkstart], j)) ||
258: (j > 3 && 0 == memcmp("ftp:", &buf[linkstart], j)) ||
259: (j > 4 && 0 == memcmp("sftp:", &buf[linkstart], j)) ||
260: (j > 3 && 0 == memcmp("smb:", &buf[linkstart], j)) ||
261: (j > 3 && 0 == memcmp("afs:", &buf[linkstart], j))) {
262: /* Gross. */
263: printf("Lk %.*s", (int)((dsz > 1 ? realend - 1 :
264: realend) - linkstart), &buf[linkstart]);
1.19 kristaps 265: return(1);
266: }
267:
268: /* See if we qualify as a mailto. */
1.20 kristaps 269: if (j > 6 && 0 == memcmp("mailto:", &buf[linkstart], j)) {
1.19 kristaps 270: printf("Mt %.*s", (int)j, &buf[linkstart]);
271: return(1);
272: }
273:
274: /* See if we're a foo(5), foo(5x), or foo(5xx) manpage. */
275: if ((j > 3 && ')' == buf[linkend - 1]) &&
276: ('(' == buf[linkend - 3])) {
277: printf("Xr %.*s %c", (int)(j - 3),
278: &buf[linkstart], buf[linkend - 2]);
279: return(1);
280: } else if ((j > 4 && ')' == buf[linkend - 1]) &&
281: ('(' == buf[linkend - 4])) {
282: printf("Xr %.*s %.*s", (int)(j - 4),
283: &buf[linkstart], 2, &buf[linkend - 3]);
284: return(1);
285: } else if ((j > 5 && ')' == buf[linkend - 1]) &&
286: ('(' == buf[linkend - 5])) {
287: printf("Xr %.*s %.*s", (int)(j - 5),
288: &buf[linkstart], 3, &buf[linkend - 4]);
289: return(1);
290: }
291:
292: /* Last try: do we have a double-colon? */
293: for (i = linkstart + 1; i < linkend; i++)
294: if (':' == buf[i] && ':' == buf[i - 1])
1.18 kristaps 295: break;
1.9 kristaps 296:
1.19 kristaps 297: if (i < linkend)
1.10 kristaps 298: printf("Xr %.*s " PERL_SECTION,
1.19 kristaps 299: (int)j, &buf[linkstart]);
1.9 kristaps 300: else
1.19 kristaps 301: printf("Xr %.*s 1", (int)j, &buf[linkstart]);
1.9 kristaps 302:
303: return(1);
304: }
305:
/*
 * Doclifting: if we're a bold "-xx" and we're in the SYNOPSIS section,
 * then it's likely that we're a flag.
 * Our flag might be followed by an argument, so make sure that we're
 * accounting for that, too.
 * If we don't have a flag at all, however, then assume we're an "Ar".
 * On entry buf[*start] must be '-' and at least one more character
 * must follow before "end".
 * Prints "Fl" and/or "Ar" macro names (with the flag text) and moves
 * "start" past whatever has been printed.
 */
static void
dosynopsisfl(const char *buf, size_t *start, size_t end)
{
	size_t		 i;
again:
	assert(*start + 1 < end);
	assert('-' == buf[*start]);

	/*
	 * Characters are cast to unsigned char before being handed to
	 * the <ctype.h> functions: passing a negative value (a byte
	 * >= 0x80 where char is signed) is undefined.
	 */
	if ( ! isalnum((unsigned char)buf[*start + 1]) &&
	    '?' != buf[*start + 1] &&
	    '-' != buf[*start + 1]) {
		(*start)--;
		fputs("Ar ", stdout);
		return;
	}

	/* Skip the dash, then find the end of the flag name. */
	(*start)++;
	for (i = *start; i < end; i++)
		if (isalnum((unsigned char)buf[i]))
			continue;
		else if ('?' == buf[i])
			continue;
		else if ('-' == buf[i])
			continue;
		else if ('_' == buf[i])
			continue;
		else
			break;

	assert(i < end);

	/* A flag name must be followed by a blank or the closer. */
	if ( ! (' ' == buf[i] || '>' == buf[i])) {
		printf("Ar ");
		return;
	}

	printf("Fl ");
	/* Protect a flag name that looks like a two-letter macro. */
	if (end - *start > 1 &&
	    isupper((unsigned char)buf[*start]) &&
	    islower((unsigned char)buf[*start + 1]) &&
	    (end - *start == 2 ||
	     ' ' == buf[*start + 2]))
		printf("\\&");
	printf("%.*s ", (int)(i - *start), &buf[*start]);
	*start = i;

	if (' ' == buf[i]) {
		while (i < end && ' ' == buf[i])
			i++;
		assert(i < end);
		/* Another flag follows: start over. */
		if ('-' == buf[i]) {
			*start = i;
			goto again;
		}
		/* Otherwise what follows is the flag's argument. */
		printf("Ar ");
		*start = i;
	}
}
371:
1.9 kristaps 372: /*
1.1 schwarze 373: * We're at the character in front of a format code, which is structured
374: * like X<...> and can contain nested format codes.
375: * This consumes the whole format code, and any nested format codes, til
376: * the end of matched production.
377: * If "reentrant", then we're being called after a macro has already
378: * been printed to the current line.
1.6 kristaps 379: * If "nomacro", then we don't print any macros, just contained data
380: * (e.g., following "Sh" or "Nm").
1.15 kristaps 381: * "pos" is only significant in SYNOPSIS, and should be 0 when invoked
382: * as the first format code on a line (for decoration as an "Nm"),
383: * non-zero otherwise.
1.6 kristaps 384: * Return whether we've printed a macro or not--in other words, whether
385: * this should trigger a subsequent newline (this should be ignored when
386: * reentrant).
1.1 schwarze 387: */
388: static int
1.15 kristaps 389: formatcode(struct state *st, const char *buf, size_t *start,
390: size_t end, int reentrant, int nomacro, int pos)
1.1 schwarze 391: {
392: enum fmt fmt;
1.5 kristaps 393: size_t i, j, dsz;
1.1 schwarze 394:
395: assert(*start + 1 < end);
396: assert('<' == buf[*start + 1]);
397:
1.6 kristaps 398: /*
399: * First, look up the format code.
1.30 ! schwarze 400: * If it's not valid, treat it as a NOOP.
1.6 kristaps 401: */
402: for (fmt = 0; fmt < FMT__MAX; fmt++)
403: if (buf[*start] == fmts[fmt])
404: break;
405:
1.5 kristaps 406: /*
407: * Determine whether we're overriding our delimiter.
408: * According to POD, if we have more than one '<' followed by a
409: * space, then we need a space followed by matching '>' to close
410: * the expression.
411: * Otherwise we use the usual '<' and '>' matched pair.
412: */
413: i = *start + 1;
414: while (i < end && '<' == buf[i])
415: i++;
416: assert(i > *start + 1);
417: dsz = i - (*start + 1);
418: if (dsz > 1 && (i >= end || ' ' != buf[i]))
419: dsz = 1;
420:
421: /* Remember, if dsz>1, to jump the trailing space. */
422: *start += dsz + 1 + (dsz > 1 ? 1 : 0);
1.1 schwarze 423:
424: /*
1.6 kristaps 425: * Escapes and ignored codes (NULL and INDEX) don't print macro
426: * sequences, so just output them like normal text before
427: * processing for real macros.
1.1 schwarze 428: */
429: if (FMT_ESCAPE == fmt) {
430: formatescape(buf, start, end);
431: return(0);
432: } else if (FMT_NULL == fmt || FMT_INDEX == fmt) {
1.5 kristaps 433: /*
1.6 kristaps 434: * Just consume til the end delimiter, accounting for
435: * whether it's a custom one.
1.5 kristaps 436: */
437: for ( ; *start < end; (*start)++) {
438: if ('>' != buf[*start])
439: continue;
440: else if (dsz == 1)
441: break;
442: assert(*start > 0);
443: if (' ' != buf[*start - 1])
444: continue;
445: i = *start;
446: for (j = 0; i < end && j < dsz; j++)
447: if ('>' != buf[i++])
448: break;
449: if (dsz != j)
450: continue;
451: (*start) += dsz;
452: break;
453: }
1.24 kristaps 454: if (*start < end) {
455: assert('>' == buf[*start]);
456: (*start)++;
457: }
458: if (isspace(last))
459: while (*start < end && isspace((int)buf[*start]))
460: (*start)++;
1.1 schwarze 461: return(0);
462: }
463:
1.6 kristaps 464: /*
465: * Check whether we're supposed to print macro stuff (this is
466: * suppressed in, e.g., "Nm" and "Sh" macros).
467: */
1.30 ! schwarze 468: if (FMT__MAX != fmt && !nomacro) {
1.1 schwarze 469: /*
470: * Print out the macro describing this format code.
471: * If we're not "reentrant" (not yet on a macro line)
472: * then print a newline, if necessary, and the macro
473: * indicator.
474: * Otherwise, offset us with a space.
475: */
1.6 kristaps 476: if ( ! reentrant) {
477: if (last != '\n')
478: putchar('\n');
1.1 schwarze 479: putchar('.');
1.6 kristaps 480: } else
1.1 schwarze 481: putchar(' ');
482:
483: /*
1.6 kristaps 484: * If we don't have whitespace before us (and none after
485: * the opening delimiter), then suppress macro
486: * whitespace with Pf.
1.1 schwarze 487: */
1.6 kristaps 488: if (' ' != last && '\n' != last && ' ' != buf[*start])
489: printf("Pf ");
490:
1.1 schwarze 491: switch (fmt) {
492: case (FMT_ITALIC):
493: printf("Em ");
494: break;
495: case (FMT_BOLD):
1.14 kristaps 496: if (SECT_SYNOPSIS == st->sect) {
497: if (1 == dsz && '-' == buf[*start])
498: dosynopsisfl(buf, start, end);
1.15 kristaps 499: else if (0 == pos)
500: printf("Nm ");
1.14 kristaps 501: else
502: printf("Ar ");
503: break;
504: }
1.27 schwarze 505: if (0 == strncmp(buf + *start, "NULL", 4) &&
506: ('=' == buf[*start + 4] ||
507: '>' == buf[*start + 4]))
508: printf("Dv ");
509: else
510: printf("Sy ");
1.1 schwarze 511: break;
512: case (FMT_CODE):
1.2 schwarze 513: printf("Qo Li ");
1.1 schwarze 514: break;
515: case (FMT_LINK):
1.19 kristaps 516: /* Try to link; use "No" if it's empty. */
1.9 kristaps 517: if ( ! trylink(buf, start, end, dsz))
518: printf("No ");
1.1 schwarze 519: break;
520: case (FMT_FILE):
521: printf("Pa ");
522: break;
523: case (FMT_NBSP):
524: printf("No ");
525: break;
526: default:
527: abort();
528: }
529: }
530:
531: /*
1.6 kristaps 532: * Process until we reach the end marker (e.g., '>') or until we
1.5 kristaps 533: * find a nested format code.
1.1 schwarze 534: * Don't emit any newlines: since we're on a macro line, we
535: * don't want to break the line.
536: */
537: while (*start < end) {
1.5 kristaps 538: if ('>' == buf[*start] && 1 == dsz) {
1.1 schwarze 539: (*start)++;
540: break;
1.5 kristaps 541: } else if ('>' == buf[*start] &&
542: ' ' == buf[*start - 1]) {
543: /*
544: * Handle custom delimiters.
545: * These require a certain number of
546: * space-preceded carrots before we're really at
547: * the end.
548: */
549: i = *start;
550: for (j = 0; i < end && j < dsz; j++)
551: if ('>' != buf[i++])
552: break;
553: if (dsz == j) {
554: *start += dsz;
555: break;
556: }
1.1 schwarze 557: }
558: if (*start + 1 < end && '<' == buf[*start + 1]) {
1.15 kristaps 559: formatcode(st, buf, start, end, 1, nomacro, 1);
1.1 schwarze 560: continue;
561: }
1.3 schwarze 562:
1.4 schwarze 563: /*
564: * Make sure that any macro-like words (or
565: * really any word starting with a capital
566: * letter) is assumed to be a macro that must be
567: * escaped.
568: * This matches "Xx " and "XxEOLN".
569: */
570: if ((' ' == last || '\n' == last) &&
571: end - *start > 1 &&
572: isupper((int)buf[*start]) &&
573: islower((int)buf[*start + 1]) &&
574: (end - *start == 2 ||
575: ' ' == buf[*start + 2]))
576: printf("\\&");
1.3 schwarze 577:
1.4 schwarze 578: /* Suppress newline. */
1.6 kristaps 579: if ('\n' == buf[*start])
580: putchar(last = ' ');
581: else
582: putchar(last = buf[*start]);
1.4 schwarze 583:
1.8 kristaps 584: /* Protect against character escapes. */
585: if ('\\' == last)
586: putchar('e');
587:
1.6 kristaps 588: (*start)++;
589:
590: if (' ' == last)
591: while (*start < end && ' ' == buf[*start])
592: (*start)++;
1.1 schwarze 593: }
1.2 schwarze 594:
1.30 ! schwarze 595: if (FMT__MAX == fmt)
! 596: return(0);
! 597:
1.2 schwarze 598: if ( ! nomacro && FMT_CODE == fmt)
599: printf(" Qc ");
1.1 schwarze 600:
601: /*
1.6 kristaps 602: * We're now subsequent the format code.
603: * If there isn't a space (or newline) here, and we haven't just
604: * printed a space, then suppress space.
1.1 schwarze 605: */
1.6 kristaps 606: if ( ! nomacro && ' ' != last)
607: if (' ' != buf[*start] && '\n' != buf[*start])
608: printf(" Ns ");
1.5 kristaps 609:
1.1 schwarze 610: return(1);
611: }
612:
613: /*
614: * Calls formatcode() til the end of a paragraph.
615: */
616: static void
1.11 kristaps 617: formatcodeln(struct state *st, const char *buf,
618: size_t *start, size_t end, int nomacro)
1.1 schwarze 619: {
620:
1.4 schwarze 621: last = ' ';
1.1 schwarze 622: while (*start < end) {
623: if (*start + 1 < end && '<' == buf[*start + 1]) {
1.15 kristaps 624: formatcode(st, buf, start, end, 1, nomacro, 1);
1.1 schwarze 625: continue;
626: }
1.4 schwarze 627: /*
628: * Since we're already on a macro line, we want to make
629: * sure that we don't inadvertently invoke a macro.
630: * We need to do this carefully because section names
631: * are used in troff and we don't want to escape
632: * something that needn't be escaped.
633: */
634: if (' ' == last && end - *start > 1 &&
635: isupper((int)buf[*start]) &&
636: islower((int)buf[*start + 1]) &&
637: (end - *start == 2 ||
638: ' ' == buf[*start + 2]))
639: printf("\\&");
640:
1.8 kristaps 641: if ('\n' == buf[*start])
642: putchar(last = ' ');
643: else
1.1 schwarze 644: putchar(last = buf[*start]);
1.8 kristaps 645:
646: /* Protect against character escapes. */
647: if ('\\' == last)
648: putchar('e');
649:
1.1 schwarze 650: (*start)++;
651: }
652: }
653:
654: /*
1.4 schwarze 655: * Guess at what kind of list we are.
656: * These are taken straight from the POD manual.
657: * I don't know what people do in real life.
658: */
659: static enum list
660: listguess(const char *buf, size_t start, size_t end)
661: {
662: size_t len = end - start;
663:
664: assert(end >= start);
665:
666: if (len == 1 && '*' == buf[start])
667: return(LIST_BULLET);
668: if (len == 2 && '1' == buf[start] && '.' == buf[start + 1])
669: return(LIST_ENUM);
670: else if (len == 1 && '1' == buf[start])
671: return(LIST_ENUM);
672: else
673: return(LIST_TAG);
674: }
675:
676: /*
1.1 schwarze 677: * A command paragraph, as noted in the perlpod manual, just indicates
678: * that we should do something, optionally with some text to print as
679: * well.
680: */
681: static void
682: command(struct state *st, const char *buf, size_t start, size_t end)
683: {
684: size_t len, csz;
685: enum cmd cmd;
686:
687: assert('=' == buf[start]);
688: start++;
689: len = end - start;
690:
691: for (cmd = 0; cmd < CMD__MAX; cmd++) {
692: csz = strlen(cmds[cmd]);
693: if (len < csz)
694: continue;
695: if (0 == memcmp(&buf[start], cmd[cmds], csz))
696: break;
697: }
698:
699: /* Ignore bogus commands. */
700:
701: if (CMD__MAX == cmd)
702: return;
703:
704: start += csz;
1.8 kristaps 705: while (start < end && ' ' == buf[start])
706: start++;
707:
1.1 schwarze 708: len = end - start;
709:
710: if (st->paused) {
711: st->paused = CMD_END != cmd;
712: return;
713: }
714:
715: switch (cmd) {
716: case (CMD_POD):
717: break;
718: case (CMD_HEAD1):
719: /*
720: * The behaviour of head= follows from a quick glance at
721: * how pod2man handles it.
722: */
723: printf(".Sh ");
1.11 kristaps 724: st->sect = SECT_NONE;
725: if (end - start == 4) {
1.1 schwarze 726: if (0 == memcmp(&buf[start], "NAME", 4))
1.11 kristaps 727: st->sect = SECT_NAME;
728: } else if (end - start == 8) {
729: if (0 == memcmp(&buf[start], "SYNOPSIS", 8))
730: st->sect = SECT_SYNOPSIS;
731: }
732: formatcodeln(st, buf, &start, end, 1);
1.1 schwarze 733: putchar('\n');
734: st->haspar = 1;
735: break;
736: case (CMD_HEAD2):
737: printf(".Ss ");
1.11 kristaps 738: formatcodeln(st, buf, &start, end, 1);
1.1 schwarze 739: putchar('\n');
740: st->haspar = 1;
741: break;
742: case (CMD_HEAD3):
743: puts(".Pp");
744: printf(".Em ");
1.11 kristaps 745: formatcodeln(st, buf, &start, end, 0);
1.1 schwarze 746: putchar('\n');
747: puts(".Pp");
748: st->haspar = 1;
749: break;
750: case (CMD_HEAD4):
751: puts(".Pp");
752: printf(".No ");
1.11 kristaps 753: formatcodeln(st, buf, &start, end, 0);
1.1 schwarze 754: putchar('\n');
755: puts(".Pp");
756: st->haspar = 1;
757: break;
758: case (CMD_OVER):
1.4 schwarze 759: /*
760: * If we have an existing list that hasn't had an =item
761: * yet, then make sure that we open it now.
762: * We use the default list type, but that can't be
763: * helped (we haven't seen any items yet).
1.1 schwarze 764: */
1.4 schwarze 765: if (st->lpos > 0)
766: if (LIST__MAX == st->lstack[st->lpos - 1]) {
767: st->lstack[st->lpos - 1] = LIST_TAG;
768: puts(".Bl -tag -width Ds");
769: }
770: st->lpos++;
771: assert(st->lpos < LIST_STACKSZ);
772: st->lstack[st->lpos - 1] = LIST__MAX;
1.1 schwarze 773: break;
774: case (CMD_ITEM):
1.6 kristaps 775: if (0 == st->lpos) {
776: /*
777: * Bad markup.
778: * Try to compensate.
779: */
780: st->lstack[st->lpos] = LIST__MAX;
781: st->lpos++;
782: }
1.4 schwarze 783: assert(st->lpos > 0);
784: /*
785: * If we're the first =item, guess at what our content
786: * will be: "*" is a bullet list, "1." is a numbered
787: * list, and everything is tagged.
788: */
789: if (LIST__MAX == st->lstack[st->lpos - 1]) {
790: st->lstack[st->lpos - 1] =
791: listguess(buf, start, end);
792: switch (st->lstack[st->lpos - 1]) {
793: case (LIST_BULLET):
794: puts(".Bl -bullet");
795: break;
796: case (LIST_ENUM):
797: puts(".Bl -enum");
798: break;
799: default:
800: puts(".Bl -tag -width Ds");
801: break;
802: }
803: }
804: switch (st->lstack[st->lpos - 1]) {
805: case (LIST_TAG):
806: printf(".It ");
1.11 kristaps 807: formatcodeln(st, buf, &start, end, 0);
1.4 schwarze 808: putchar('\n');
809: break;
810: case (LIST_ENUM):
811: /* FALLTHROUGH */
812: case (LIST_BULLET):
813: /*
814: * Abandon the remainder of the paragraph
815: * because we're going to be a bulletted or
816: * numbered list.
817: */
818: puts(".It");
819: break;
820: default:
821: abort();
822: }
1.1 schwarze 823: st->haspar = 1;
824: break;
825: case (CMD_BACK):
1.4 schwarze 826: /* Make sure we don't back over the stack. */
827: if (st->lpos > 0) {
828: st->lpos--;
829: puts(".El");
830: }
1.1 schwarze 831: break;
832: case (CMD_BEGIN):
833: /*
834: * We disregard all types for now.
835: * TODO: process at least "text" in a -literal block.
836: */
837: st->paused = 1;
838: break;
839: case (CMD_FOR):
840: /*
841: * We ignore all types of encodings and formats
842: * unilaterally.
843: */
844: break;
845: case (CMD_ENCODING):
846: break;
847: case (CMD_CUT):
848: st->parsing = 0;
849: return;
850: default:
851: abort();
852: }
853:
854: /* Any command (but =cut) makes us start parsing. */
855: st->parsing = 1;
856: }
857:
858: /*
859: * Just pump out the line in a verbatim block.
860: */
861: static void
862: verbatim(struct state *st, const char *buf, size_t start, size_t end)
863: {
1.8 kristaps 864: int last;
1.22 kristaps 865: size_t i;
1.1 schwarze 866:
867: if ( ! st->parsing || st->paused)
868: return;
1.22 kristaps 869: again:
870: /*
871: * If we're in the SYNOPSIS, see if we're an #include block.
872: * If we are, then print the "In" macro and re-loop.
873: * This handles any number of inclusions, but only when they
874: * come before the remaining parts...
875: */
876: if (SECT_SYNOPSIS == st->sect) {
877: i = start;
878: for (i = start; i < end && ' ' == buf[i]; i++)
879: /* Spin. */ ;
880: if (i == end)
881: return;
882: /* We're an include block! */
883: if (end - i > 10 &&
884: 0 == memcmp(&buf[i], "#include <", 10)) {
885: start = i + 10;
886: while (start < end && ' ' == buf[start])
887: start++;
888: fputs(".In ", stdout);
889: /* Stop til the '>' marker or we hit eoln. */
890: while (start < end &&
891: '>' != buf[start] && '\n' != buf[start])
892: putchar(buf[start++]);
893: putchar('\n');
894: if (start < end && '>' == buf[start])
895: start++;
896: if (start < end && '\n' == buf[start])
897: start++;
898: if (start < end)
899: goto again;
900: return;
901: }
902: }
903:
904: if (start == end)
905: return;
1.1 schwarze 906: puts(".Bd -literal");
1.8 kristaps 907: for (last = ' '; start < end; start++) {
908: /*
909: * Handle accidental macros (newline starting with
910: * control character) and escapes.
911: */
912: if ('\n' == last)
1.7 kristaps 913: if ('.' == buf[start] || '\'' == buf[start])
914: printf("\\&");
1.8 kristaps 915: putchar(last = buf[start]);
916: if ('\\' == buf[start])
917: printf("e");
1.7 kristaps 918: }
919: putchar('\n');
1.1 schwarze 920: puts(".Ed");
921: }
922:
/*
 * Helper for dosynopsisop().
 * Scan buf[start] up to but excluding buf[end] for a ']' that is not
 * paired with a '[' found earlier in that same range.
 * Return 1 if there is one and 0 otherwise.
 */
static int
hasmatch(const char *buf, size_t start, size_t end)
{
	size_t		 depth;

	depth = 0;
	while (start < end) {
		switch (buf[start++]) {
		case ('['):
			depth++;
			break;
		case (']'):
			/* Closes the bracket our caller is sitting on. */
			if (0 == depth)
				return(1);
			depth--;
			break;
		default:
			break;
		}
	}

	return(0);
}
940:
/*
 * If we're in the SYNOPSIS section and we've encounter braces in an
 * ordinary paragraph, then try to see whether we're an [-option].
 * Do this, if we're an opening bracket, by first seeing if we have a
 * matching end via hasmatch().
 * If we're an ending bracket, see if we have a stack already.
 * On entry buf[*start] must be '[' or ']'; "opstack" counts the
 * optional contexts that are currently open.
 * If the bracket is accepted, print "Oo" or "Oc" on a line of its own,
 * step over the bracket and the blanks after it, set "last" to a
 * newline and return 1.
 * Otherwise change nothing and return 0, so that the caller can treat
 * the bracket as ordinary text.
 */
static int
dosynopsisop(const char *buf, int *last,
	size_t *start, size_t end, size_t *opstack)
{

	assert('[' == buf[*start] || ']' == buf[*start]);

	if ('[' == buf[*start] && hasmatch(buf, *start + 1, end)) {
		if ('\n' != *last)
			putchar('\n');
		puts(".Oo");
		(*opstack)++;
	} else if ('[' == buf[*start])
		return(0);

	if (']' == buf[*start] && *opstack > 0) {
		if ('\n' != *last)
			putchar('\n');
		puts(".Oc");
		(*opstack)--;
	} else if (']' == buf[*start])
		return(0);

	(*start)++;
	*last = '\n';
	/* Never look at, or move beyond, the end of our input. */
	while (*start < end && ' ' == buf[*start])
		(*start)++;
	return(1);
}
977:
/*
 * Print the manual names of the NAME section, which are found in
 * buf[*start] up to but excluding buf[end] and separated by commas.
 * Each name gets an "Nm" line of its own; every name that is followed
 * by a comma has " ," appended to its line.
 * If there is nothing but blanks, ".Nm unknown" is printed instead.
 */
static void
donamenm(struct state *st, const char *buf, size_t *start, size_t end)
{
	const char	*comma;
	size_t		 stop;

	for ( ; *start < end; (*start)++)
		if (' ' != buf[*start])
			break;

	if (*start == end) {
		puts(".Nm unknown");
		return;
	}

	do {
		fputs(".Nm ", stdout);

		/* The name extends to the next comma, or to the end. */
		comma = memchr(&buf[*start], ',', end - *start);
		stop = NULL == comma ? end : (size_t)(comma - buf);
		formatcodeln(st, buf, start, stop, 1);

		/* That was the last name. */
		if (*start == end) {
			putchar('\n');
			break;
		}

		assert(',' == buf[*start]);
		puts(" ,");

		/* Step over the comma and the blanks following it. */
		for ((*start)++; *start < end; (*start)++)
			if (' ' != buf[*start])
				break;
	} while (*start < end);
}
1011:
1012: /*
1.1 schwarze 1013: * Ordinary paragraph.
1014: * Well, this is really the hardest--POD seems to assume that, for
1015: * example, a leading space implies a newline, and so on.
1016: * Lots of other snakes in the grass: escaping a newline followed by a
1017: * period (accidental mdoc(7) control), double-newlines after macro
1018: * passages, etc.
1019: */
1020: static void
1021: ordinary(struct state *st, const char *buf, size_t start, size_t end)
1022: {
1.13 kristaps 1023: size_t i, j, opstack;
1.15 kristaps 1024: int seq;
1.1 schwarze 1025:
1026: if ( ! st->parsing || st->paused)
1027: return;
1028:
1029: /*
1030: * Special-case: the NAME section.
1031: * If we find a "-" when searching from the end, assume that
1032: * we're in "name - description" format.
1033: * To wit, print out a "Nm" and "Nd" in that format.
1034: */
1.11 kristaps 1035: if (SECT_NAME == st->sect) {
1.15 kristaps 1036: for (i = end - 2; i > start; i--)
1037: if ('-' == buf[i] && ' ' == buf[i + 1])
1.1 schwarze 1038: break;
1039: if ('-' == buf[i]) {
1040: j = i;
1041: /* Roll over multiple "-". */
1042: for ( ; i > start; i--)
1043: if ('-' != buf[i])
1044: break;
1.17 kristaps 1045: donamenm(st, buf, &start, i + 1);
1.5 kristaps 1046: start = j + 1;
1.17 kristaps 1047: while (start < end && ' ' == buf[start])
1048: start++;
1.15 kristaps 1049: fputs(".Nd ", stdout);
1.11 kristaps 1050: formatcodeln(st, buf, &start, end, 1);
1.5 kristaps 1051: putchar('\n');
1.1 schwarze 1052: return;
1053: }
1054: }
1055:
1056: if ( ! st->haspar)
1057: puts(".Pp");
1058:
1059: st->haspar = 0;
1060: last = '\n';
1.13 kristaps 1061: opstack = 0;
1.1 schwarze 1062:
1.15 kristaps 1063: for (seq = 0; start < end; seq++) {
1.1 schwarze 1064: /*
1065: * Loop til we get either to a newline or escape.
1066: * Escape initial control characters.
1067: */
1068: while (start < end) {
1069: if (start < end - 1 && '<' == buf[start + 1])
1070: break;
1071: else if ('\n' == buf[start])
1072: break;
1073: else if ('\n' == last && '.' == buf[start])
1074: printf("\\&");
1075: else if ('\n' == last && '\'' == buf[start])
1076: printf("\\&");
1.12 kristaps 1077: /*
1078: * If we're in the SYNOPSIS, have square
1079: * brackets indicate that we're opening and
1080: * closing an optional context.
1081: */
1.13 kristaps 1082: if (SECT_SYNOPSIS == st->sect &&
1083: ('[' == buf[start] ||
1084: ']' == buf[start]) &&
1085: dosynopsisop(buf, &last,
1086: &start, end, &opstack))
1087: continue;
1.1 schwarze 1088: putchar(last = buf[start++]);
1.8 kristaps 1089: if ('\\' == last)
1090: putchar('e');
1.1 schwarze 1091: }
1092:
1093: if (start < end - 1 && '<' == buf[start + 1]) {
1.15 kristaps 1094: if (formatcode(st, buf, &start, end, 0, 0, seq)) {
1.30 ! schwarze 1095: /*
! 1096: * Let mdoc(7) handle trailing punctuation.
! 1097: * XXX Some punctuation characters
! 1098: * are not handled yet.
! 1099: */
1.16 kristaps 1100: if ((start == end - 1 ||
1101: (start < end - 1 &&
1102: (' ' == buf[start + 1] ||
1103: '\n' == buf[start + 1]))) &&
1104: ('.' == buf[start] ||
1105: ',' == buf[start])) {
1106: putchar(' ');
1107: putchar(buf[start++]);
1108: }
1.30 ! schwarze 1109: /* End the macro line. */
1.1 schwarze 1110: putchar(last = '\n');
1.30 ! schwarze 1111: /*
! 1112: * Consume all whitespace
! 1113: * so we don't accidentally start
! 1114: * an implicit literal line.
! 1115: */
1.6 kristaps 1116: while (start < end && ' ' == buf[start])
1117: start++;
1118: }
1.1 schwarze 1119: } else if (start < end && '\n' == buf[start]) {
1120: /*
1121: * Print the newline only if we haven't already
1122: * printed a newline.
1123: */
1124: if (last != '\n')
1125: putchar(last = buf[start]);
1126: if (++start >= end)
1127: continue;
1128: /*
1129: * If we have whitespace next, eat it to prevent
1130: * mdoc(7) from thinking that it's meant for
1131: * verbatim text.
1132: * It is--but if we start with that, we can't
1133: * have a macro subsequent it, which may be
1134: * possible if we have an escape next.
1135: */
1136: if (' ' == buf[start] || '\t' == buf[start]) {
1137: puts(".br");
1138: last = '\n';
1139: }
1140: for ( ; start < end; start++)
1141: if (' ' != buf[start] && '\t' != buf[start])
1142: break;
1.12 kristaps 1143: }
1.1 schwarze 1144: }
1145:
1146: if (last != '\n')
1147: putchar('\n');
1148: }
1149:
/*
 * Dispatch a single paragraph, buf[start..end), to the matching handler
 * based on its first character:
 *   - blank or tab: verbatim paragraph,
 *   - "=":          POD command paragraph,
 *   - anything else: ordinary text paragraph.
 * An empty paragraph (start == end) is silently ignored.
 */
static void
dopar(struct state *st, const char *buf, size_t start, size_t end)
{

	/* Nothing to classify in an empty paragraph. */
	if (start == end)
		return;

	switch (buf[start]) {
	case ' ':
		/* FALLTHROUGH */
	case '\t':
		verbatim(st, buf, start, end);
		break;
	case '=':
		command(st, buf, start, end);
		break;
	default:
		ordinary(st, buf, start, end);
		break;
	}
}
1168:
1169: /*
1170: * Loop around paragraphs within a document, processing each one in the
1171: * POD way.
1172: */
1173: static void
1174: dofile(const struct args *args, const char *fname,
1175: const struct tm *tm, const char *buf, size_t sz)
1176: {
1.29 schwarze 1177: char datebuf[64];
1.1 schwarze 1178: struct state st;
1.29 schwarze 1179: const char *fbase, *fext, *section, *date;
1.1 schwarze 1180: char *title, *cp;
1.29 schwarze 1181: size_t sup, end, i, cur = 0;
1.1 schwarze 1182:
1183: if (0 == sz)
1184: return;
1185:
1.29 schwarze 1186: /*
1187: * Parsing the filename is almost always required,
1188: * except when both the title and the section
1189: * are provided on the command line.
1190: */
1191:
1192: if (NULL == args->title || NULL == args->section) {
1193: fbase = strrchr(fname, '/');
1194: if (NULL == fbase)
1195: fbase = fname;
1196: else
1197: fbase++;
1198: fext = strrchr(fbase, '.');
1199: } else
1200: fext = NULL;
1201:
1202: /*
1203: * The title will be converted to uppercase,
1204: * so it needs to be copied.
1205: */
1206:
1207: title = (NULL != args->title) ? strdup(args->title) :
1208: (NULL != fext) ? strndup(fbase, fext - fbase) :
1209: strdup(fbase);
1.1 schwarze 1210:
1211: if (NULL == title) {
1212: perror(NULL);
1213: exit(EXIT_FAILURE);
1214: }
1215:
1216: /* Section is 1 unless suffix is "pm". */
1217:
1.29 schwarze 1218: section = (NULL != args->section) ? args->section :
1219: (NULL == fext || strcmp(fext + 1, "pm")) ? "1" :
1220: PERL_SECTION;
1.1 schwarze 1221:
1222: /* Date. Or the given "tm" if not supplied. */
1223:
1224: if (NULL == (date = args->date)) {
1225: strftime(datebuf, sizeof(datebuf), "%B %d, %Y", tm);
1226: date = datebuf;
1227: }
1228:
1229: for (cp = title; '\0' != *cp; cp++)
1230: *cp = toupper((int)*cp);
1231:
1232: /* The usual mdoc(7) preamble. */
1233:
1234: printf(".Dd %s\n", date);
1235: printf(".Dt %s %s\n", title, section);
1236: puts(".Os");
1237:
1238: free(title);
1239:
1240: memset(&st, 0, sizeof(struct state));
1241: assert(sz > 0);
1242:
1243: /* Main loop over file contents. */
1244:
1245: while (cur < sz) {
1246: /* Read until next paragraph. */
1247: for (i = cur + 1; i < sz; i++)
1248: if ('\n' == buf[i] && '\n' == buf[i - 1]) {
1249: /* Consume blank paragraphs. */
1250: while (i + 1 < sz && '\n' == buf[i + 1])
1251: i++;
1252: break;
1253: }
1254:
1255: /* Adjust end marker for EOF. */
1256: end = i < sz ? i - 1 :
1257: ('\n' == buf[sz - 1] ? sz - 1 : sz);
1258: sup = i < sz ? end + 2 : sz;
1259:
1260: /* Process paragraph and adjust start. */
1261: dopar(&st, buf, cur, end);
1262: cur = sup;
1263: }
1264: }
1265:
/*
 * Read a single file fully into memory.
 * If the file is "-", do it from stdin.
 * If successfully read, send the input buffer to dofile() for further
 * processing.
 *
 * The time handed to dofile() is the file's modification time, or the
 * current time when reading stdin or when fstat() fails.
 *
 * Returns 1 on success and 0 if the file cannot be opened or read.
 * Exits the program if memory cannot be allocated.
 */
static int
readfile(const struct args *args, const char *fname)
{
	int		 fd;
	char		*buf;
	size_t		 bufsz, cur;
	ssize_t		 ssz;
	struct tm	*tm;
	time_t		 ttm;
	struct stat	 st;

	fd = 0 != strcmp("-", fname) ?
		open(fname, O_RDONLY, 0) : STDIN_FILENO;

	if (-1 == fd) {
		perror(fname);
		return(0);
	}

	if (STDIN_FILENO == fd || -1 == fstat(fd, &st)) {
		ttm = time(NULL);
		tm = localtime(&ttm);
	} else
		tm = localtime(&st.st_mtime);

	/*
	 * Arbitrarily-sized initial buffer.
	 * Should be big enough for most files...
	 */
	cur = 0;
	bufsz = 1 << 14;
	if (NULL == (buf = malloc(bufsz))) {
		perror(NULL);
		exit(EXIT_FAILURE);
	}

	while ((ssz = read(fd, buf + cur, bufsz - cur)) > 0) {
		/*
		 * Double buffer size on fill, so that the next
		 * read() never gets a zero-length request.
		 */
		if ((size_t)ssz == bufsz - cur) {
			bufsz *= 2;
			if (NULL == (buf = realloc(buf, bufsz))) {
				perror(NULL);
				exit(EXIT_FAILURE);
			}
		}
		cur += (size_t)ssz;
	}
	if (ssz < 0) {
		/* Report first: free() and close() may clobber errno. */
		perror(fname);
		free(buf);
		/* Do not leak the descriptor we opened ourselves. */
		if (STDIN_FILENO != fd)
			close(fd);
		return(0);
	}

	dofile(args, STDIN_FILENO == fd ?
		"STDIN" : fname, tm, buf, cur);
	free(buf);
	if (STDIN_FILENO != fd)
		close(fd);
	return(1);
}
1332:
1333: int
1334: main(int argc, char *argv[])
1335: {
1336: const char *fname, *name;
1337: struct args args;
1338: int c;
1339:
1340: name = strrchr(argv[0], '/');
1341: if (name == NULL)
1342: name = argv[0];
1343: else
1344: ++name;
1345:
1346: memset(&args, 0, sizeof(struct args));
1347: fname = "-";
1348:
1349: /* Accept no arguments for now. */
1350:
1351: while (-1 != (c = getopt(argc, argv, "c:d:hln:oq:rs:uv")))
1352: switch (c) {
1353: case ('h'):
1354: /* FALLTHROUGH */
1355: case ('l'):
1356: /* FALLTHROUGH */
1357: case ('c'):
1358: /* FALLTHROUGH */
1359: case ('o'):
1360: /* FALLTHROUGH */
1361: case ('q'):
1362: /* FALLTHROUGH */
1363: case ('r'):
1364: /* FALLTHROUGH */
1365: case ('u'):
1366: /* FALLTHROUGH */
1367: case ('v'):
1368: /* Ignore these. */
1369: break;
1370: case ('d'):
1371: args.date = optarg;
1372: break;
1373: case ('n'):
1374: args.title = optarg;
1375: break;
1376: case ('s'):
1377: args.section = optarg;
1378: break;
1379: default:
1380: goto usage;
1381: }
1382:
1383: argc -= optind;
1384: argv += optind;
1385:
1386: /* Accept only a single input file. */
1387:
1.25 schwarze 1388: if (argc > 1)
1389: goto usage;
1.1 schwarze 1390: else if (1 == argc)
1391: fname = *argv;
1392:
1393: return(readfile(&args, fname) ?
1394: EXIT_SUCCESS : EXIT_FAILURE);
1395:
1396: usage:
1397: fprintf(stderr, "usage: %s [-d date] "
1.25 schwarze 1398: "[-n title] [-s section] [file]\n", name);
1.1 schwarze 1399:
1400: return(EXIT_FAILURE);
1401: }
CVSweb