Annotation of pod2mdoc/pod2mdoc.c, Revision 1.22
1.22 ! kristaps 1: /* $Id: pod2mdoc.c,v 1.21 2014/04/03 11:55:01 kristaps Exp $ */
1.1 schwarze 2: /*
3: * Copyright (c) 2014 Kristaps Dzonsons <kristaps@bsd.lv>
4: *
5: * Permission to use, copy, modify, and distribute this software for any
6: * purpose with or without fee is hereby granted, provided that the above
7: * copyright notice and this permission notice appear in all copies.
8: *
9: * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
10: * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11: * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
12: * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13: * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14: * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15: * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16: */
17: #include <sys/stat.h>
18: #include <sys/time.h>
19:
20: #include <assert.h>
21: #include <ctype.h>
22: #include <fcntl.h>
23: #include <getopt.h>
24: #include <stdio.h>
25: #include <stdlib.h>
26: #include <string.h>
27: #include <unistd.h>
28:
/*
 * Manual section used when cross-referencing Perl module manuals.
 * Some systems (Mac OS X) file these under 3pm, others (OpenBSD, etc.)
 * under 3p.
 * XXX: if you change this value, change pod2mdoc.1 as well.
 */
#define	PERL_SECTION	"3p"
35:
/*
 * Optional overrides for values placed in the "Dd" and "Dt" macros.
 */
struct args {
	const char	*title;   /* replaces the "Dt" title */
	const char	*date;    /* replaces the "Dd" date */
	const char	*section; /* replaces the "Dt" section */
};
41:
/*
 * Kinds of list we know how to emit.
 * LIST__MAX is also stored on the list stack for a list that has been
 * pushed by =over but whose type has not been decided yet.
 */
enum	list {
	LIST_BULLET = 0,
	LIST_ENUM = 1,
	LIST_TAG = 2,
	LIST__MAX = 3
};
48:
/*
 * Sections (=head1) that receive special treatment.
 */
enum	sect {
	SECT_NONE = 0,		/* any other section */
	SECT_NAME = 1,		/* the NAME section */
	SECT_SYNOPSIS = 2	/* the SYNOPSIS section */
};
54:
1.1 schwarze 55: struct state {
56: int parsing; /* after =cut of before command */
57: int paused; /* in =begin and before =end */
58: int haspar; /* in paragraph: do we need Pp? */
1.11 kristaps 59: enum sect sect; /* which section are we in? */
1.1 schwarze 60: const char *fname; /* file being parsed */
1.4 schwarze 61: #define LIST_STACKSZ 128
62: enum list lstack[LIST_STACKSZ]; /* open lists */
63: size_t lpos; /* where in list stack */
1.1 schwarze 64: };
65:
/*
 * POD formatting codes, in the same order as the letters in "fmts".
 */
enum	fmt {
	FMT_ITALIC = 0,
	FMT_BOLD = 1,
	FMT_CODE = 2,
	FMT_LINK = 3,
	FMT_ESCAPE = 4,
	FMT_FILE = 5,
	FMT_NBSP = 6,
	FMT_INDEX = 7,
	FMT_NULL = 8,
	FMT__MAX = 9
};
78:
/*
 * POD command paragraphs, in the same order as the names in "cmds".
 */
enum	cmd {
	CMD_POD = 0,
	CMD_HEAD1 = 1,
	CMD_HEAD2 = 2,
	CMD_HEAD3 = 3,
	CMD_HEAD4 = 4,
	CMD_OVER = 5,
	CMD_ITEM = 6,
	CMD_BACK = 7,
	CMD_BEGIN = 8,
	CMD_END = 9,
	CMD_FOR = 10,
	CMD_ENCODING = 11,
	CMD_CUT = 12,
	CMD__MAX = 13
};
95:
96: static const char *const cmds[CMD__MAX] = {
97: "pod", /* CMD_POD */
98: "head1", /* CMD_HEAD1 */
99: "head2", /* CMD_HEAD2 */
100: "head3", /* CMD_HEAD3 */
101: "head4", /* CMD_HEAD4 */
102: "over", /* CMD_OVER */
103: "item", /* CMD_ITEM */
104: "back", /* CMD_BACK */
105: "begin", /* CMD_BEGIN */
106: "end", /* CMD_END */
107: "for", /* CMD_FOR */
108: "encoding", /* CMD_ENCODING */
109: "cut" /* CMD_CUT */
110: };
111:
112: static const char fmts[FMT__MAX] = {
113: 'I', /* FMT_ITALIC */
114: 'B', /* FMT_BOLD */
115: 'C', /* FMT_CODE */
116: 'L', /* FMT_LINK */
117: 'E', /* FMT_ESCAPE */
118: 'F', /* FMT_FILE */
119: 'S', /* FMT_NBSP */
120: 'X', /* FMT_INDEX */
121: 'Z' /* FMT_NULL */
122: };
123:
1.6 kristaps 124: static int last;
125:
1.1 schwarze 126: /*
127: * Given buf[*start] is at the start of an escape name, read til the end
128: * of the escape ('>') then try to do something with it.
129: * Sets start to be one after the '>'.
130: */
131: static void
132: formatescape(const char *buf, size_t *start, size_t end)
133: {
134: char esc[16]; /* no more needed */
135: size_t i, max;
136:
137: max = sizeof(esc) - 1;
138: i = 0;
139: /* Read til our buffer is full. */
140: while (*start < end && '>' != buf[*start] && i < max)
141: esc[i++] = buf[(*start)++];
142: esc[i] = '\0';
143:
144: if (i == max) {
145: /* Too long... skip til we end. */
146: while (*start < end && '>' != buf[*start])
147: (*start)++;
148: return;
149: } else if (*start >= end)
150: return;
151:
152: assert('>' == buf[*start]);
153: (*start)++;
154:
155: /*
156: * TODO: right now, we only recognise the named escapes.
157: * Just let the rest of them go.
158: */
1.6 kristaps 159: if (0 == strcmp(esc, "lt"))
1.1 schwarze 160: printf("\\(la");
161: else if (0 == strcmp(esc, "gt"))
162: printf("\\(ra");
163: else if (0 == strcmp(esc, "vb"))
164: printf("\\(ba");
165: else if (0 == strcmp(esc, "sol"))
166: printf("\\(sl");
1.6 kristaps 167: else
168: return;
169:
170: last = 'a';
1.1 schwarze 171: }
172:
173: /*
1.9 kristaps 174: * Run some heuristics to intuit a link format.
1.19 kristaps 175: * I set "start" to be the end of the sequence (last right-carrot) so
1.9 kristaps 176: * that the caller can safely just continue processing.
1.19 kristaps 177: * If this is just an empty tag, I'll return 0.
1.9 kristaps 178: */
179: static int
180: trylink(const char *buf, size_t *start, size_t end, size_t dsz)
181: {
1.21 kristaps 182: size_t linkstart, realend, linkend,
183: i, j, textsz, stack;
1.18 kristaps 184: const char *text;
1.9 kristaps 185:
186: /*
187: * Scan to the start of the terminus.
188: * This function is more or less replicated in the formatcode()
189: * for null or index formatting codes.
190: */
1.21 kristaps 191: stack = 0;
1.19 kristaps 192: for (linkstart = realend = *start; realend < end; realend++) {
193: if ('>' != buf[realend])
1.9 kristaps 194: continue;
195: else if (dsz == 1)
196: break;
1.19 kristaps 197: assert(realend > 0);
198: if (' ' != buf[realend - 1])
1.9 kristaps 199: continue;
1.19 kristaps 200: for (i = realend, j = 0; i < end && j < dsz; j++)
1.9 kristaps 201: if ('>' != buf[i++])
202: break;
203: if (dsz == j)
204: break;
205: }
1.19 kristaps 206:
207: /* Ignore stubs. */
208: if (realend == end || realend == *start)
1.9 kristaps 209: return(0);
210:
1.19 kristaps 211: /* Set linkend to the end of content. */
212: linkend = dsz > 1 ? realend - 1 : realend;
1.18 kristaps 213:
1.19 kristaps 214: /* Re-scan to see if we have a title or section. */
1.18 kristaps 215: text = &buf[*start];
1.19 kristaps 216: for (textsz = *start; textsz < linkend; textsz++)
217: if ('|' == buf[textsz] || '/' == buf[textsz])
1.18 kristaps 218: break;
219:
1.19 kristaps 220: if (textsz < linkend && '|' == buf[textsz]) {
1.20 kristaps 221: /* With title: set start, then end at section. */
1.19 kristaps 222: linkstart = textsz + 1;
1.18 kristaps 223: textsz = textsz - *start;
1.19 kristaps 224: for (i = linkstart; i < linkend; i++)
225: if ('/' == buf[i])
226: break;
227: if (i < linkend)
228: linkend = i;
1.20 kristaps 229: } else if (textsz < linkend && '/' == buf[textsz]) {
230: /* With section: set end at section. */
231: linkend = textsz;
232: textsz = 0;
233: } else
234: /* No title, no section. */
1.18 kristaps 235: textsz = 0;
1.19 kristaps 236:
237: *start = realend;
238: j = linkend - linkstart;
239:
1.20 kristaps 240: /* Do we have only subsection material? */
241: if (0 == j && '/' == buf[linkend]) {
242: linkstart = linkend + 1;
243: linkend = dsz > 1 ? realend - 1 : realend;
244: if (0 == (j = linkend - linkstart))
245: return(0);
246: printf("Sx %.*s", (int)j, &buf[linkstart]);
247: return(1);
248: } else if (0 == j)
1.19 kristaps 249: return(0);
250:
251: /* See if we qualify as being a link or not. */
1.20 kristaps 252: if ((j > 4 && 0 == memcmp("http:", &buf[linkstart], j)) ||
253: (j > 5 && 0 == memcmp("https:", &buf[linkstart], j)) ||
254: (j > 3 && 0 == memcmp("ftp:", &buf[linkstart], j)) ||
255: (j > 4 && 0 == memcmp("sftp:", &buf[linkstart], j)) ||
256: (j > 3 && 0 == memcmp("smb:", &buf[linkstart], j)) ||
257: (j > 3 && 0 == memcmp("afs:", &buf[linkstart], j))) {
258: /* Gross. */
259: printf("Lk %.*s", (int)((dsz > 1 ? realend - 1 :
260: realend) - linkstart), &buf[linkstart]);
1.19 kristaps 261: return(1);
262: }
263:
264: /* See if we qualify as a mailto. */
1.20 kristaps 265: if (j > 6 && 0 == memcmp("mailto:", &buf[linkstart], j)) {
1.19 kristaps 266: printf("Mt %.*s", (int)j, &buf[linkstart]);
267: return(1);
268: }
269:
270: /* See if we're a foo(5), foo(5x), or foo(5xx) manpage. */
271: if ((j > 3 && ')' == buf[linkend - 1]) &&
272: ('(' == buf[linkend - 3])) {
273: printf("Xr %.*s %c", (int)(j - 3),
274: &buf[linkstart], buf[linkend - 2]);
275: return(1);
276: } else if ((j > 4 && ')' == buf[linkend - 1]) &&
277: ('(' == buf[linkend - 4])) {
278: printf("Xr %.*s %.*s", (int)(j - 4),
279: &buf[linkstart], 2, &buf[linkend - 3]);
280: return(1);
281: } else if ((j > 5 && ')' == buf[linkend - 1]) &&
282: ('(' == buf[linkend - 5])) {
283: printf("Xr %.*s %.*s", (int)(j - 5),
284: &buf[linkstart], 3, &buf[linkend - 4]);
285: return(1);
286: }
287:
288: /* Last try: do we have a double-colon? */
289: for (i = linkstart + 1; i < linkend; i++)
290: if (':' == buf[i] && ':' == buf[i - 1])
1.18 kristaps 291: break;
1.9 kristaps 292:
1.19 kristaps 293: if (i < linkend)
1.10 kristaps 294: printf("Xr %.*s " PERL_SECTION,
1.19 kristaps 295: (int)j, &buf[linkstart]);
1.9 kristaps 296: else
1.19 kristaps 297: printf("Xr %.*s 1", (int)j, &buf[linkstart]);
1.9 kristaps 298:
299: return(1);
300: }
301:
/*
 * Doclifting: if we're a bold "-xx" and we're in the SYNOPSIS section,
 * then it's likely that we're a flag.
 * On entry, buf[*start] is the '-'.
 * If the text looks like a flag, print "Fl" and the flag name and
 * advance *start beyond it.
 * A flag may be followed by an argument (we then print "Ar" and leave
 * *start at the argument) or by further flags, which are processed in
 * turn.
 * If we don't have a flag at all, print "Ar" instead.
 * Truncated input (no closing '>' before "end") is tolerated.
 */
static void
dosynopsisfl(const char *buf, size_t *start, size_t end)
{
	size_t		 i;
again:
	assert(*start < end);
	assert('-' == buf[*start]);

	/* A dash that ends the buffer cannot introduce a flag. */
	if (*start + 1 >= end) {
		fputs("Ar ", stdout);
		return;
	}

	if ( ! isalnum((unsigned char)buf[*start + 1]) &&
	    '?' != buf[*start + 1] &&
	    '-' != buf[*start + 1]) {
		/*
		 * NOTE(review): the original code steps back one
		 * character here; the reason is not evident from this
		 * function, so the behaviour is kept -- confirm.
		 */
		(*start)--;
		fputs("Ar ", stdout);
		return;
	}

	/* Find the end of the flag name. */
	(*start)++;
	for (i = *start; i < end; i++)
		if ( ! isalnum((unsigned char)buf[i]) &&
		    '?' != buf[i] && '-' != buf[i] && '_' != buf[i])
			break;

	/* A flag name ends with a space, the delimiter, or the input. */
	if (i < end && ' ' != buf[i] && '>' != buf[i]) {
		printf("Ar ");
		return;
	}

	printf("Fl ");
	/* Protect flag names that look like two-letter macros. */
	if (end - *start > 1 &&
	    isupper((unsigned char)buf[*start]) &&
	    islower((unsigned char)buf[*start + 1]) &&
	    (end - *start == 2 ||
	     ' ' == buf[*start + 2]))
		printf("\\&");
	printf("%.*s ", (int)(i - *start), &buf[*start]);
	*start = i;

	if (i < end && ' ' == buf[i]) {
		while (i < end && ' ' == buf[i])
			i++;
		*start = i;
		/* Nothing but trailing space: we're done. */
		if (i == end)
			return;
		if ('-' == buf[i])
			goto again;
		printf("Ar ");
	}
}
367:
1.9 kristaps 368: /*
1.1 schwarze 369: * We're at the character in front of a format code, which is structured
370: * like X<...> and can contain nested format codes.
371: * This consumes the whole format code, and any nested format codes, til
372: * the end of matched production.
373: * If "reentrant", then we're being called after a macro has already
374: * been printed to the current line.
1.6 kristaps 375: * If "nomacro", then we don't print any macros, just contained data
376: * (e.g., following "Sh" or "Nm").
1.15 kristaps 377: * "pos" is only significant in SYNOPSIS, and should be 0 when invoked
378: * as the first format code on a line (for decoration as an "Nm"),
379: * non-zero otherwise.
1.6 kristaps 380: * Return whether we've printed a macro or not--in other words, whether
381: * this should trigger a subsequent newline (this should be ignored when
382: * reentrant).
1.1 schwarze 383: */
384: static int
1.15 kristaps 385: formatcode(struct state *st, const char *buf, size_t *start,
386: size_t end, int reentrant, int nomacro, int pos)
1.1 schwarze 387: {
388: enum fmt fmt;
1.5 kristaps 389: size_t i, j, dsz;
1.1 schwarze 390:
391: assert(*start + 1 < end);
392: assert('<' == buf[*start + 1]);
393:
1.6 kristaps 394: /*
395: * First, look up the format code.
396: * If it's not valid, then exit immediately.
397: */
398: for (fmt = 0; fmt < FMT__MAX; fmt++)
399: if (buf[*start] == fmts[fmt])
400: break;
401:
402: if (FMT__MAX == fmt) {
403: putchar(last = buf[(*start)++]);
1.8 kristaps 404: if ('\\' == last)
405: putchar('e');
1.6 kristaps 406: return(0);
407: }
408:
1.5 kristaps 409: /*
410: * Determine whether we're overriding our delimiter.
411: * According to POD, if we have more than one '<' followed by a
412: * space, then we need a space followed by matching '>' to close
413: * the expression.
414: * Otherwise we use the usual '<' and '>' matched pair.
415: */
416: i = *start + 1;
417: while (i < end && '<' == buf[i])
418: i++;
419: assert(i > *start + 1);
420: dsz = i - (*start + 1);
421: if (dsz > 1 && (i >= end || ' ' != buf[i]))
422: dsz = 1;
423:
424: /* Remember, if dsz>1, to jump the trailing space. */
425: *start += dsz + 1 + (dsz > 1 ? 1 : 0);
1.1 schwarze 426:
427: /*
1.6 kristaps 428: * Escapes and ignored codes (NULL and INDEX) don't print macro
429: * sequences, so just output them like normal text before
430: * processing for real macros.
1.1 schwarze 431: */
432: if (FMT_ESCAPE == fmt) {
433: formatescape(buf, start, end);
434: return(0);
435: } else if (FMT_NULL == fmt || FMT_INDEX == fmt) {
1.5 kristaps 436: /*
1.6 kristaps 437: * Just consume til the end delimiter, accounting for
438: * whether it's a custom one.
1.5 kristaps 439: */
440: for ( ; *start < end; (*start)++) {
441: if ('>' != buf[*start])
442: continue;
443: else if (dsz == 1)
444: break;
445: assert(*start > 0);
446: if (' ' != buf[*start - 1])
447: continue;
448: i = *start;
449: for (j = 0; i < end && j < dsz; j++)
450: if ('>' != buf[i++])
451: break;
452: if (dsz != j)
453: continue;
454: (*start) += dsz;
455: break;
456: }
1.1 schwarze 457: return(0);
458: }
459:
1.6 kristaps 460: /*
461: * Check whether we're supposed to print macro stuff (this is
462: * suppressed in, e.g., "Nm" and "Sh" macros).
463: */
1.1 schwarze 464: if ( ! nomacro) {
465: /*
466: * Print out the macro describing this format code.
467: * If we're not "reentrant" (not yet on a macro line)
468: * then print a newline, if necessary, and the macro
469: * indicator.
470: * Otherwise, offset us with a space.
471: */
1.6 kristaps 472: if ( ! reentrant) {
473: if (last != '\n')
474: putchar('\n');
1.1 schwarze 475: putchar('.');
1.6 kristaps 476: } else
1.1 schwarze 477: putchar(' ');
478:
479: /*
1.6 kristaps 480: * If we don't have whitespace before us (and none after
481: * the opening delimiter), then suppress macro
482: * whitespace with Pf.
1.1 schwarze 483: */
1.6 kristaps 484: if (' ' != last && '\n' != last && ' ' != buf[*start])
485: printf("Pf ");
486:
1.1 schwarze 487: switch (fmt) {
488: case (FMT_ITALIC):
489: printf("Em ");
490: break;
491: case (FMT_BOLD):
1.14 kristaps 492: if (SECT_SYNOPSIS == st->sect) {
493: if (1 == dsz && '-' == buf[*start])
494: dosynopsisfl(buf, start, end);
1.15 kristaps 495: else if (0 == pos)
496: printf("Nm ");
1.14 kristaps 497: else
498: printf("Ar ");
499: break;
500: }
501: printf("Sy ");
1.1 schwarze 502: break;
503: case (FMT_CODE):
1.2 schwarze 504: printf("Qo Li ");
1.1 schwarze 505: break;
506: case (FMT_LINK):
1.19 kristaps 507: /* Try to link; use "No" if it's empty. */
1.9 kristaps 508: if ( ! trylink(buf, start, end, dsz))
509: printf("No ");
1.1 schwarze 510: break;
511: case (FMT_FILE):
512: printf("Pa ");
513: break;
514: case (FMT_NBSP):
515: printf("No ");
516: break;
517: default:
518: abort();
519: }
520: }
521:
522: /*
1.6 kristaps 523: * Process until we reach the end marker (e.g., '>') or until we
1.5 kristaps 524: * find a nested format code.
1.1 schwarze 525: * Don't emit any newlines: since we're on a macro line, we
526: * don't want to break the line.
527: */
528: while (*start < end) {
1.5 kristaps 529: if ('>' == buf[*start] && 1 == dsz) {
1.1 schwarze 530: (*start)++;
531: break;
1.5 kristaps 532: } else if ('>' == buf[*start] &&
533: ' ' == buf[*start - 1]) {
534: /*
535: * Handle custom delimiters.
536: * These require a certain number of
537: * space-preceded carrots before we're really at
538: * the end.
539: */
540: i = *start;
541: for (j = 0; i < end && j < dsz; j++)
542: if ('>' != buf[i++])
543: break;
544: if (dsz == j) {
545: *start += dsz;
546: break;
547: }
1.1 schwarze 548: }
549: if (*start + 1 < end && '<' == buf[*start + 1]) {
1.15 kristaps 550: formatcode(st, buf, start, end, 1, nomacro, 1);
1.1 schwarze 551: continue;
552: }
1.3 schwarze 553:
1.4 schwarze 554: /*
555: * Make sure that any macro-like words (or
556: * really any word starting with a capital
557: * letter) is assumed to be a macro that must be
558: * escaped.
559: * This matches "Xx " and "XxEOLN".
560: */
561: if ((' ' == last || '\n' == last) &&
562: end - *start > 1 &&
563: isupper((int)buf[*start]) &&
564: islower((int)buf[*start + 1]) &&
565: (end - *start == 2 ||
566: ' ' == buf[*start + 2]))
567: printf("\\&");
1.3 schwarze 568:
1.4 schwarze 569: /* Suppress newline. */
1.6 kristaps 570: if ('\n' == buf[*start])
571: putchar(last = ' ');
572: else
573: putchar(last = buf[*start]);
1.4 schwarze 574:
1.8 kristaps 575: /* Protect against character escapes. */
576: if ('\\' == last)
577: putchar('e');
578:
1.6 kristaps 579: (*start)++;
580:
581: if (' ' == last)
582: while (*start < end && ' ' == buf[*start])
583: (*start)++;
1.1 schwarze 584: }
1.2 schwarze 585:
586: if ( ! nomacro && FMT_CODE == fmt)
587: printf(" Qc ");
1.1 schwarze 588:
589: /*
1.6 kristaps 590: * We're now subsequent the format code.
591: * If there isn't a space (or newline) here, and we haven't just
592: * printed a space, then suppress space.
1.1 schwarze 593: */
1.6 kristaps 594: if ( ! nomacro && ' ' != last)
595: if (' ' != buf[*start] && '\n' != buf[*start])
596: printf(" Ns ");
1.5 kristaps 597:
1.1 schwarze 598: return(1);
599: }
600:
601: /*
602: * Calls formatcode() til the end of a paragraph.
603: */
604: static void
1.11 kristaps 605: formatcodeln(struct state *st, const char *buf,
606: size_t *start, size_t end, int nomacro)
1.1 schwarze 607: {
608:
1.4 schwarze 609: last = ' ';
1.1 schwarze 610: while (*start < end) {
611: if (*start + 1 < end && '<' == buf[*start + 1]) {
1.15 kristaps 612: formatcode(st, buf, start, end, 1, nomacro, 1);
1.1 schwarze 613: continue;
614: }
1.4 schwarze 615: /*
616: * Since we're already on a macro line, we want to make
617: * sure that we don't inadvertently invoke a macro.
618: * We need to do this carefully because section names
619: * are used in troff and we don't want to escape
620: * something that needn't be escaped.
621: */
622: if (' ' == last && end - *start > 1 &&
623: isupper((int)buf[*start]) &&
624: islower((int)buf[*start + 1]) &&
625: (end - *start == 2 ||
626: ' ' == buf[*start + 2]))
627: printf("\\&");
628:
1.8 kristaps 629: if ('\n' == buf[*start])
630: putchar(last = ' ');
631: else
1.1 schwarze 632: putchar(last = buf[*start]);
1.8 kristaps 633:
634: /* Protect against character escapes. */
635: if ('\\' == last)
636: putchar('e');
637:
1.1 schwarze 638: (*start)++;
639: }
640: }
641:
642: /*
1.4 schwarze 643: * Guess at what kind of list we are.
644: * These are taken straight from the POD manual.
645: * I don't know what people do in real life.
646: */
647: static enum list
648: listguess(const char *buf, size_t start, size_t end)
649: {
650: size_t len = end - start;
651:
652: assert(end >= start);
653:
654: if (len == 1 && '*' == buf[start])
655: return(LIST_BULLET);
656: if (len == 2 && '1' == buf[start] && '.' == buf[start + 1])
657: return(LIST_ENUM);
658: else if (len == 1 && '1' == buf[start])
659: return(LIST_ENUM);
660: else
661: return(LIST_TAG);
662: }
663:
664: /*
1.1 schwarze 665: * A command paragraph, as noted in the perlpod manual, just indicates
666: * that we should do something, optionally with some text to print as
667: * well.
668: */
669: static void
670: command(struct state *st, const char *buf, size_t start, size_t end)
671: {
672: size_t len, csz;
673: enum cmd cmd;
674:
675: assert('=' == buf[start]);
676: start++;
677: len = end - start;
678:
679: for (cmd = 0; cmd < CMD__MAX; cmd++) {
680: csz = strlen(cmds[cmd]);
681: if (len < csz)
682: continue;
683: if (0 == memcmp(&buf[start], cmd[cmds], csz))
684: break;
685: }
686:
687: /* Ignore bogus commands. */
688:
689: if (CMD__MAX == cmd)
690: return;
691:
692: start += csz;
1.8 kristaps 693: while (start < end && ' ' == buf[start])
694: start++;
695:
1.1 schwarze 696: len = end - start;
697:
698: if (st->paused) {
699: st->paused = CMD_END != cmd;
700: return;
701: }
702:
703: switch (cmd) {
704: case (CMD_POD):
705: break;
706: case (CMD_HEAD1):
707: /*
708: * The behaviour of head= follows from a quick glance at
709: * how pod2man handles it.
710: */
711: printf(".Sh ");
1.11 kristaps 712: st->sect = SECT_NONE;
713: if (end - start == 4) {
1.1 schwarze 714: if (0 == memcmp(&buf[start], "NAME", 4))
1.11 kristaps 715: st->sect = SECT_NAME;
716: } else if (end - start == 8) {
717: if (0 == memcmp(&buf[start], "SYNOPSIS", 8))
718: st->sect = SECT_SYNOPSIS;
719: }
720: formatcodeln(st, buf, &start, end, 1);
1.1 schwarze 721: putchar('\n');
722: st->haspar = 1;
723: break;
724: case (CMD_HEAD2):
725: printf(".Ss ");
1.11 kristaps 726: formatcodeln(st, buf, &start, end, 1);
1.1 schwarze 727: putchar('\n');
728: st->haspar = 1;
729: break;
730: case (CMD_HEAD3):
731: puts(".Pp");
732: printf(".Em ");
1.11 kristaps 733: formatcodeln(st, buf, &start, end, 0);
1.1 schwarze 734: putchar('\n');
735: puts(".Pp");
736: st->haspar = 1;
737: break;
738: case (CMD_HEAD4):
739: puts(".Pp");
740: printf(".No ");
1.11 kristaps 741: formatcodeln(st, buf, &start, end, 0);
1.1 schwarze 742: putchar('\n');
743: puts(".Pp");
744: st->haspar = 1;
745: break;
746: case (CMD_OVER):
1.4 schwarze 747: /*
748: * If we have an existing list that hasn't had an =item
749: * yet, then make sure that we open it now.
750: * We use the default list type, but that can't be
751: * helped (we haven't seen any items yet).
1.1 schwarze 752: */
1.4 schwarze 753: if (st->lpos > 0)
754: if (LIST__MAX == st->lstack[st->lpos - 1]) {
755: st->lstack[st->lpos - 1] = LIST_TAG;
756: puts(".Bl -tag -width Ds");
757: }
758: st->lpos++;
759: assert(st->lpos < LIST_STACKSZ);
760: st->lstack[st->lpos - 1] = LIST__MAX;
1.1 schwarze 761: break;
762: case (CMD_ITEM):
1.6 kristaps 763: if (0 == st->lpos) {
764: /*
765: * Bad markup.
766: * Try to compensate.
767: */
768: st->lstack[st->lpos] = LIST__MAX;
769: st->lpos++;
770: }
1.4 schwarze 771: assert(st->lpos > 0);
772: /*
773: * If we're the first =item, guess at what our content
774: * will be: "*" is a bullet list, "1." is a numbered
775: * list, and everything is tagged.
776: */
777: if (LIST__MAX == st->lstack[st->lpos - 1]) {
778: st->lstack[st->lpos - 1] =
779: listguess(buf, start, end);
780: switch (st->lstack[st->lpos - 1]) {
781: case (LIST_BULLET):
782: puts(".Bl -bullet");
783: break;
784: case (LIST_ENUM):
785: puts(".Bl -enum");
786: break;
787: default:
788: puts(".Bl -tag -width Ds");
789: break;
790: }
791: }
792: switch (st->lstack[st->lpos - 1]) {
793: case (LIST_TAG):
794: printf(".It ");
1.11 kristaps 795: formatcodeln(st, buf, &start, end, 0);
1.4 schwarze 796: putchar('\n');
797: break;
798: case (LIST_ENUM):
799: /* FALLTHROUGH */
800: case (LIST_BULLET):
801: /*
802: * Abandon the remainder of the paragraph
803: * because we're going to be a bulletted or
804: * numbered list.
805: */
806: puts(".It");
807: break;
808: default:
809: abort();
810: }
1.1 schwarze 811: st->haspar = 1;
812: break;
813: case (CMD_BACK):
1.4 schwarze 814: /* Make sure we don't back over the stack. */
815: if (st->lpos > 0) {
816: st->lpos--;
817: puts(".El");
818: }
1.1 schwarze 819: break;
820: case (CMD_BEGIN):
821: /*
822: * We disregard all types for now.
823: * TODO: process at least "text" in a -literal block.
824: */
825: st->paused = 1;
826: break;
827: case (CMD_FOR):
828: /*
829: * We ignore all types of encodings and formats
830: * unilaterally.
831: */
832: break;
833: case (CMD_ENCODING):
834: break;
835: case (CMD_CUT):
836: st->parsing = 0;
837: return;
838: default:
839: abort();
840: }
841:
842: /* Any command (but =cut) makes us start parsing. */
843: st->parsing = 1;
844: }
845:
846: /*
847: * Just pump out the line in a verbatim block.
848: */
849: static void
850: verbatim(struct state *st, const char *buf, size_t start, size_t end)
851: {
1.8 kristaps 852: int last;
1.22 ! kristaps 853: size_t i;
1.1 schwarze 854:
855: if ( ! st->parsing || st->paused)
856: return;
1.22 ! kristaps 857: again:
! 858: /*
! 859: * If we're in the SYNOPSIS, see if we're an #include block.
! 860: * If we are, then print the "In" macro and re-loop.
! 861: * This handles any number of inclusions, but only when they
! 862: * come before the remaining parts...
! 863: */
! 864: if (SECT_SYNOPSIS == st->sect) {
! 865: i = start;
! 866: for (i = start; i < end && ' ' == buf[i]; i++)
! 867: /* Spin. */ ;
! 868: if (i == end)
! 869: return;
! 870: /* We're an include block! */
! 871: if (end - i > 10 &&
! 872: 0 == memcmp(&buf[i], "#include <", 10)) {
! 873: start = i + 10;
! 874: while (start < end && ' ' == buf[start])
! 875: start++;
! 876: fputs(".In ", stdout);
! 877: /* Stop til the '>' marker or we hit eoln. */
! 878: while (start < end &&
! 879: '>' != buf[start] && '\n' != buf[start])
! 880: putchar(buf[start++]);
! 881: putchar('\n');
! 882: if (start < end && '>' == buf[start])
! 883: start++;
! 884: if (start < end && '\n' == buf[start])
! 885: start++;
! 886: if (start < end)
! 887: goto again;
! 888: return;
! 889: }
! 890: }
! 891:
! 892: if (start == end)
! 893: return;
1.1 schwarze 894: puts(".Bd -literal");
1.8 kristaps 895: for (last = ' '; start < end; start++) {
896: /*
897: * Handle accidental macros (newline starting with
898: * control character) and escapes.
899: */
900: if ('\n' == last)
1.7 kristaps 901: if ('.' == buf[start] || '\'' == buf[start])
902: printf("\\&");
1.8 kristaps 903: putchar(last = buf[start]);
904: if ('\\' == buf[start])
905: printf("e");
1.7 kristaps 906: }
907: putchar('\n');
1.1 schwarze 908: puts(".Ed");
909: }
910:
/*
 * Look through buf[start..end) for a ']' that closes a bracket opened
 * before "start": nested "[...]" pairs inside the range are skipped.
 * Return 1 if such a ']' exists, 0 otherwise.
 * Used by dosynopsisop() to check that an opening bracket has an end.
 */
static int
hasmatch(const char *buf, size_t start, size_t end)
{
	size_t		 depth;

	depth = 0;
	while (start < end) {
		switch (buf[start++]) {
		case ('['):
			depth++;
			break;
		case (']'):
			if (0 == depth)
				return(1);
			depth--;
			break;
		default:
			break;
		}
	}

	return(0);
}
928:
/*
 * If we're in the SYNOPSIS section and we've encountered brackets in an
 * ordinary paragraph, then try to see whether we're an [-option].
 * On entry, buf[*start] is '[' or ']'.
 * An opening bracket is accepted if hasmatch() finds its end; we then
 * print "Oo" and increment *opstack.
 * A closing bracket is accepted if *opstack is non-zero; we then print
 * "Oc" and decrement *opstack.
 * On acceptance, the bracket and any spaces following it are consumed,
 * *last is set to a newline, and 1 is returned.
 * Otherwise nothing is changed and 0 is returned.
 */
static int
dosynopsisop(const char *buf, int *last,
	size_t *start, size_t end, size_t *opstack)
{

	assert('[' == buf[*start] || ']' == buf[*start]);

	if ('[' == buf[*start] && hasmatch(buf, *start + 1, end)) {
		if ('\n' != *last)
			putchar('\n');
		puts(".Oo");
		(*opstack)++;
	} else if ('[' == buf[*start])
		return(0);

	if (']' == buf[*start] && *opstack > 0) {
		if ('\n' != *last)
			putchar('\n');
		puts(".Oc");
		(*opstack)--;
	} else if (']' == buf[*start])
		return(0);

	(*start)++;
	*last = '\n';
	/* Never look beyond the end of the paragraph. */
	while (*start < end && ' ' == buf[*start])
		(*start)++;
	return(1);
}
965:
/*
 * Format the comma-separated manpage names in buf[*start..end), found
 * in the NAME section, as one "Nm" line per name.
 * If there is nothing but spaces, print "Nm unknown" instead.
 */
static void
donamenm(struct state *st, const char *buf, size_t *start, size_t end)
{
	const char	*comma;
	size_t		 stop;

	/* Skip leading spaces. */
	for ( ; *start < end; (*start)++)
		if (' ' != buf[*start])
			break;

	if (*start == end) {
		puts(".Nm unknown");
		return;
	}

	while (*start < end) {
		fputs(".Nm ", stdout);

		/* The name runs to the next comma, if there is one. */
		comma = memchr(&buf[*start], ',', end - *start);
		stop = NULL == comma ? end : (size_t)(comma - buf);
		formatcodeln(st, buf, start, stop, 1);

		/* That was the last name. */
		if (*start == end) {
			putchar('\n');
			break;
		}

		assert(',' == buf[*start]);
		puts(" ,");

		/* Skip the comma and the spaces following it. */
		for ((*start)++; *start < end; (*start)++)
			if (' ' != buf[*start])
				break;
	}
}
999:
1000: /*
1.1 schwarze 1001: * Ordinary paragraph.
1002: * Well, this is really the hardest--POD seems to assume that, for
1003: * example, a leading space implies a newline, and so on.
1004: * Lots of other snakes in the grass: escaping a newline followed by a
1005: * period (accidental mdoc(7) control), double-newlines after macro
1006: * passages, etc.
1007: */
1008: static void
1009: ordinary(struct state *st, const char *buf, size_t start, size_t end)
1010: {
1.13 kristaps 1011: size_t i, j, opstack;
1.15 kristaps 1012: int seq;
1.1 schwarze 1013:
1014: if ( ! st->parsing || st->paused)
1015: return;
1016:
1017: /*
1018: * Special-case: the NAME section.
1019: * If we find a "-" when searching from the end, assume that
1020: * we're in "name - description" format.
1021: * To wit, print out a "Nm" and "Nd" in that format.
1022: */
1.11 kristaps 1023: if (SECT_NAME == st->sect) {
1.15 kristaps 1024: for (i = end - 2; i > start; i--)
1025: if ('-' == buf[i] && ' ' == buf[i + 1])
1.1 schwarze 1026: break;
1027: if ('-' == buf[i]) {
1028: j = i;
1029: /* Roll over multiple "-". */
1030: for ( ; i > start; i--)
1031: if ('-' != buf[i])
1032: break;
1.17 kristaps 1033: donamenm(st, buf, &start, i + 1);
1.5 kristaps 1034: start = j + 1;
1.17 kristaps 1035: while (start < end && ' ' == buf[start])
1036: start++;
1.15 kristaps 1037: fputs(".Nd ", stdout);
1.11 kristaps 1038: formatcodeln(st, buf, &start, end, 1);
1.5 kristaps 1039: putchar('\n');
1.1 schwarze 1040: return;
1041: }
1042: }
1043:
1044: if ( ! st->haspar)
1045: puts(".Pp");
1046:
1047: st->haspar = 0;
1048: last = '\n';
1.13 kristaps 1049: opstack = 0;
1.1 schwarze 1050:
1.15 kristaps 1051: for (seq = 0; start < end; seq++) {
1.1 schwarze 1052: /*
1053: * Loop til we get either to a newline or escape.
1054: * Escape initial control characters.
1055: */
1056: while (start < end) {
1057: if (start < end - 1 && '<' == buf[start + 1])
1058: break;
1059: else if ('\n' == buf[start])
1060: break;
1061: else if ('\n' == last && '.' == buf[start])
1062: printf("\\&");
1063: else if ('\n' == last && '\'' == buf[start])
1064: printf("\\&");
1.12 kristaps 1065: /*
1066: * If we're in the SYNOPSIS, have square
1067: * brackets indicate that we're opening and
1068: * closing an optional context.
1069: */
1.13 kristaps 1070: if (SECT_SYNOPSIS == st->sect &&
1071: ('[' == buf[start] ||
1072: ']' == buf[start]) &&
1073: dosynopsisop(buf, &last,
1074: &start, end, &opstack))
1075: continue;
1.1 schwarze 1076: putchar(last = buf[start++]);
1.8 kristaps 1077: if ('\\' == last)
1078: putchar('e');
1.1 schwarze 1079: }
1080:
1081: if (start < end - 1 && '<' == buf[start + 1]) {
1082: /*
1083: * We've encountered a format code.
1084: * This is going to trigger a macro no matter
1085: * what, so print a newline now.
1086: * Then print the (possibly nested) macros and
1087: * following that, a newline.
1.8 kristaps 1088: * Consume all whitespace so we don't
1089: * accidentally start an implicit literal line.
1.16 kristaps 1090: * If the macro ends with a flush comma or
1091: * period, let mdoc(7) handle it for us.
1.1 schwarze 1092: */
1.15 kristaps 1093: if (formatcode(st, buf, &start, end, 0, 0, seq)) {
1.16 kristaps 1094: if ((start == end - 1 ||
1095: (start < end - 1 &&
1096: (' ' == buf[start + 1] ||
1097: '\n' == buf[start + 1]))) &&
1098: ('.' == buf[start] ||
1099: ',' == buf[start])) {
1100: putchar(' ');
1101: putchar(buf[start++]);
1102: }
1.1 schwarze 1103: putchar(last = '\n');
1.6 kristaps 1104: while (start < end && ' ' == buf[start])
1105: start++;
1106: }
1.1 schwarze 1107: } else if (start < end && '\n' == buf[start]) {
1108: /*
1109: * Print the newline only if we haven't already
1110: * printed a newline.
1111: */
1112: if (last != '\n')
1113: putchar(last = buf[start]);
1114: if (++start >= end)
1115: continue;
1116: /*
1117: * If we have whitespace next, eat it to prevent
1118: * mdoc(7) from thinking that it's meant for
1119: * verbatim text.
1120: * It is--but if we start with that, we can't
1121: * have a macro subsequent it, which may be
1122: * possible if we have an escape next.
1123: */
1124: if (' ' == buf[start] || '\t' == buf[start]) {
1125: puts(".br");
1126: last = '\n';
1127: }
1128: for ( ; start < end; start++)
1129: if (' ' != buf[start] && '\t' != buf[start])
1130: break;
1.12 kristaps 1131: }
1.1 schwarze 1132: }
1133:
1134: if (last != '\n')
1135: putchar('\n');
1136: }
1137:
/*
 * Hand one paragraph, the bytes of "buf" from "start" up to but not
 * including "end", to the routine for its kind.
 * The first byte decides: a space or tab opens a verbatim paragraph,
 * a "=" opens a command, and anything else is ordinary text.
 * An empty paragraph is ignored.
 */
static void
dopar(struct state *st, const char *buf, size_t start, size_t end)
{

	if (start == end)
		return;

	switch (buf[start]) {
	case (' '):
		/* FALLTHROUGH */
	case ('\t'):
		verbatim(st, buf, start, end);
		break;
	case ('='):
		command(st, buf, start, end);
		break;
	default:
		ordinary(st, buf, start, end);
		break;
	}
}
1156:
1157: /*
1158: * Loop around paragraphs within a document, processing each one in the
1159: * POD way.
1160: */
1161: static void
1162: dofile(const struct args *args, const char *fname,
1163: const struct tm *tm, const char *buf, size_t sz)
1164: {
1165: size_t sup, end, i, cur = 0;
1166: struct state st;
1167: const char *section, *date;
1168: char datebuf[64];
1169: char *title, *cp;
1170:
1171: if (0 == sz)
1172: return;
1173:
1174: /* Title is last path component of the filename. */
1175:
1176: if (NULL != args->title)
1177: title = strdup(args->title);
1178: else if (NULL != (cp = strrchr(fname, '/')))
1179: title = strdup(cp + 1);
1180: else
1181: title = strdup(fname);
1182:
1183: if (NULL == title) {
1184: perror(NULL);
1185: exit(EXIT_FAILURE);
1186: }
1187:
1188: /* Section is 1 unless suffix is "pm". */
1189:
1190: if (NULL == (section = args->section)) {
1191: section = "1";
1192: if (NULL != (cp = strrchr(title, '.'))) {
1193: *cp++ = '\0';
1194: if (0 == strcmp(cp, "pm"))
1.10 kristaps 1195: section = PERL_SECTION;
1.1 schwarze 1196: }
1197: }
1198:
1199: /* Date. Or the given "tm" if not supplied. */
1200:
1201: if (NULL == (date = args->date)) {
1202: strftime(datebuf, sizeof(datebuf), "%B %d, %Y", tm);
1203: date = datebuf;
1204: }
1205:
1206: for (cp = title; '\0' != *cp; cp++)
1207: *cp = toupper((int)*cp);
1208:
1209: /* The usual mdoc(7) preamble. */
1210:
1211: printf(".Dd %s\n", date);
1212: printf(".Dt %s %s\n", title, section);
1213: puts(".Os");
1214:
1215: free(title);
1216:
1217: memset(&st, 0, sizeof(struct state));
1218: assert(sz > 0);
1219:
1220: /* Main loop over file contents. */
1221:
1222: while (cur < sz) {
1223: /* Read until next paragraph. */
1224: for (i = cur + 1; i < sz; i++)
1225: if ('\n' == buf[i] && '\n' == buf[i - 1]) {
1226: /* Consume blank paragraphs. */
1227: while (i + 1 < sz && '\n' == buf[i + 1])
1228: i++;
1229: break;
1230: }
1231:
1232: /* Adjust end marker for EOF. */
1233: end = i < sz ? i - 1 :
1234: ('\n' == buf[sz - 1] ? sz - 1 : sz);
1235: sup = i < sz ? end + 2 : sz;
1236:
1237: /* Process paragraph and adjust start. */
1238: dopar(&st, buf, cur, end);
1239: cur = sup;
1240: }
1241: }
1242:
1243: /*
1244: * Read a single file fully into memory.
1245: * If the file is "-", do it from stdin.
1246: * If successfully read, send the input buffer to dofile() for further
1247: * processing.
1248: */
1249: static int
1250: readfile(const struct args *args, const char *fname)
1251: {
1252: int fd;
1253: char *buf;
1254: size_t bufsz, cur;
1255: ssize_t ssz;
1256: struct tm *tm;
1257: time_t ttm;
1258: struct stat st;
1259:
1260: assert(NULL != fname);
1261:
1262: fd = 0 != strcmp("-", fname) ?
1263: open(fname, O_RDONLY, 0) : STDIN_FILENO;
1264:
1265: if (-1 == fd) {
1266: perror(fname);
1267: return(0);
1268: }
1269:
1270: if (STDIN_FILENO == fd || -1 == fstat(fd, &st)) {
1271: ttm = time(NULL);
1272: tm = localtime(&ttm);
1273: } else
1274: tm = localtime(&st.st_mtime);
1275:
1276: /*
1277: * Arbitrarily-sized initial buffer.
1278: * Should be big enough for most files...
1279: */
1280: cur = 0;
1281: bufsz = 1 << 14;
1282: if (NULL == (buf = malloc(bufsz))) {
1283: perror(NULL);
1284: exit(EXIT_FAILURE);
1285: }
1286:
1287: while ((ssz = read(fd, buf + cur, bufsz - cur)) > 0) {
1288: /* Double buffer size on fill. */
1289: if ((size_t)ssz == bufsz - cur) {
1290: bufsz *= 2;
1291: if (NULL == (buf = realloc(buf, bufsz))) {
1292: perror(NULL);
1293: exit(EXIT_FAILURE);
1294: }
1295: }
1296: cur += (size_t)ssz;
1297: }
1298: if (ssz < 0) {
1299: perror(fname);
1300: free(buf);
1301: return(0);
1302: }
1303:
1304: dofile(args, STDIN_FILENO == fd ?
1305: "STDIN" : fname, tm, buf, cur);
1306: free(buf);
1307: if (STDIN_FILENO != fd)
1308: close(fd);
1309: return(1);
1310: }
1311:
1312: int
1313: main(int argc, char *argv[])
1314: {
1315: const char *fname, *name;
1316: struct args args;
1317: int c;
1318:
1319: name = strrchr(argv[0], '/');
1320: if (name == NULL)
1321: name = argv[0];
1322: else
1323: ++name;
1324:
1325: memset(&args, 0, sizeof(struct args));
1326: fname = "-";
1327:
1328: /* Accept no arguments for now. */
1329:
1330: while (-1 != (c = getopt(argc, argv, "c:d:hln:oq:rs:uv")))
1331: switch (c) {
1332: case ('h'):
1333: /* FALLTHROUGH */
1334: case ('l'):
1335: /* FALLTHROUGH */
1336: case ('c'):
1337: /* FALLTHROUGH */
1338: case ('o'):
1339: /* FALLTHROUGH */
1340: case ('q'):
1341: /* FALLTHROUGH */
1342: case ('r'):
1343: /* FALLTHROUGH */
1344: case ('u'):
1345: /* FALLTHROUGH */
1346: case ('v'):
1347: /* Ignore these. */
1348: break;
1349: case ('d'):
1350: args.date = optarg;
1351: break;
1352: case ('n'):
1353: args.title = optarg;
1354: break;
1355: case ('s'):
1356: args.section = optarg;
1357: break;
1358: default:
1359: goto usage;
1360: }
1361:
1362: argc -= optind;
1363: argv += optind;
1364:
1365: /* Accept only a single input file. */
1366:
1367: if (argc > 2)
1368: return(EXIT_FAILURE);
1369: else if (1 == argc)
1370: fname = *argv;
1371:
1372: return(readfile(&args, fname) ?
1373: EXIT_SUCCESS : EXIT_FAILURE);
1374:
1375: usage:
1376: fprintf(stderr, "usage: %s [-d date] "
1377: "[-n title] [-s section]\n", name);
1378:
1379: return(EXIT_FAILURE);
1380: }
CVSweb