Annotation of pod2mdoc/pod2mdoc.c, Revision 1.28
1.28 ! schwarze 1: /* $Id: pod2mdoc.c,v 1.27 2014/07/11 09:10:50 schwarze Exp $ */
1.1 schwarze 2: /*
3: * Copyright (c) 2014 Kristaps Dzonsons <kristaps@bsd.lv>
4: *
5: * Permission to use, copy, modify, and distribute this software for any
6: * purpose with or without fee is hereby granted, provided that the above
7: * copyright notice and this permission notice appear in all copies.
8: *
9: * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
10: * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11: * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
12: * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13: * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14: * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15: * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16: */
17: #include <sys/stat.h>
18: #include <sys/time.h>
19:
20: #include <assert.h>
21: #include <ctype.h>
22: #include <fcntl.h>
23: #include <getopt.h>
24: #include <stdio.h>
25: #include <stdlib.h>
26: #include <string.h>
27: #include <unistd.h>
28:
/*
 * Manual section used when cross-referencing Perl module manuals.
 * Some systems (Mac OS X) file them under "3pm", others (OpenBSD,
 * etc.) under "3p".
 * XXX IF YOU CHANGE THIS, CHANGE POD2MDOC.1 AS WELL.
 */
#define	PERL_SECTION	"3p"
35:
/*
 * Overrides for the values placed in the document prologue macros.
 */
struct args {
	const char	*title;		/* overrides the "Dt" title */
	const char	*date;		/* overrides the "Dd" date */
	const char	*section;	/* overrides the "Dt" section */
};
41:
/*
 * The kinds of list we can emit.
 * LIST__MAX doubles as the "type not yet decided" marker for a list
 * that has been opened with =over but has not seen an =item yet.
 */
enum list {
	LIST_BULLET = 0,	/* ".Bl -bullet" */
	LIST_ENUM,		/* ".Bl -enum" */
	LIST_TAG,		/* ".Bl -tag" */
	LIST__MAX
};
48:
/*
 * Sections that receive special treatment; everything else is
 * SECT_NONE.
 */
enum sect {
	SECT_NONE = 0,
	SECT_NAME,	/* the NAME section */
	SECT_SYNOPSIS,	/* the SYNOPSIS section */
};
54:
1.1 schwarze 55: struct state {
56: int parsing; /* after =cut of before command */
57: int paused; /* in =begin and before =end */
58: int haspar; /* in paragraph: do we need Pp? */
1.11 kristaps 59: enum sect sect; /* which section are we in? */
1.1 schwarze 60: const char *fname; /* file being parsed */
1.4 schwarze 61: #define LIST_STACKSZ 128
62: enum list lstack[LIST_STACKSZ]; /* open lists */
63: size_t lpos; /* where in list stack */
1.1 schwarze 64: };
65:
/*
 * POD formatting codes; the matching code letters live in fmts[].
 */
enum fmt {
	FMT_ITALIC,	/* I<> */
	FMT_BOLD,	/* B<> */
	FMT_CODE,	/* C<> */
	FMT_LINK,	/* L<> */
	FMT_ESCAPE,	/* E<> */
	FMT_FILE,	/* F<> */
	FMT_NBSP,	/* S<> */
	FMT_INDEX,	/* X<> */
	FMT_NULL,	/* Z<> */
	FMT__MAX
};
78:
/*
 * POD command paragraphs; the matching command names live in cmds[].
 */
enum cmd {
	CMD_POD = 0,	/* =pod */
	CMD_HEAD1,	/* =head1 */
	CMD_HEAD2,	/* =head2 */
	CMD_HEAD3,	/* =head3 */
	CMD_HEAD4,	/* =head4 */
	CMD_OVER,	/* =over */
	CMD_ITEM,	/* =item */
	CMD_BACK,	/* =back */
	CMD_BEGIN,	/* =begin */
	CMD_END,	/* =end */
	CMD_FOR,	/* =for */
	CMD_ENCODING,	/* =encoding */
	CMD_CUT,	/* =cut */
	CMD__MAX
};
95:
96: static const char *const cmds[CMD__MAX] = {
97: "pod", /* CMD_POD */
98: "head1", /* CMD_HEAD1 */
99: "head2", /* CMD_HEAD2 */
100: "head3", /* CMD_HEAD3 */
101: "head4", /* CMD_HEAD4 */
102: "over", /* CMD_OVER */
103: "item", /* CMD_ITEM */
104: "back", /* CMD_BACK */
105: "begin", /* CMD_BEGIN */
106: "end", /* CMD_END */
107: "for", /* CMD_FOR */
108: "encoding", /* CMD_ENCODING */
109: "cut" /* CMD_CUT */
110: };
111:
112: static const char fmts[FMT__MAX] = {
113: 'I', /* FMT_ITALIC */
114: 'B', /* FMT_BOLD */
115: 'C', /* FMT_CODE */
116: 'L', /* FMT_LINK */
117: 'E', /* FMT_ESCAPE */
118: 'F', /* FMT_FILE */
119: 'S', /* FMT_NBSP */
120: 'X', /* FMT_INDEX */
121: 'Z' /* FMT_NULL */
122: };
123:
/*
 * The character most recently written to the output (or a stand-in
 * for it); consulted when deciding about newlines, spacing and
 * escaping.
 */
static int	 last;
125:
1.1 schwarze 126: /*
127: * Given buf[*start] is at the start of an escape name, read til the end
128: * of the escape ('>') then try to do something with it.
129: * Sets start to be one after the '>'.
130: */
131: static void
132: formatescape(const char *buf, size_t *start, size_t end)
133: {
134: char esc[16]; /* no more needed */
135: size_t i, max;
136:
137: max = sizeof(esc) - 1;
138: i = 0;
139: /* Read til our buffer is full. */
140: while (*start < end && '>' != buf[*start] && i < max)
141: esc[i++] = buf[(*start)++];
142: esc[i] = '\0';
143:
144: if (i == max) {
145: /* Too long... skip til we end. */
146: while (*start < end && '>' != buf[*start])
147: (*start)++;
148: return;
149: } else if (*start >= end)
150: return;
151:
152: assert('>' == buf[*start]);
153: (*start)++;
154:
155: /*
156: * TODO: right now, we only recognise the named escapes.
157: * Just let the rest of them go.
158: */
1.6 kristaps 159: if (0 == strcmp(esc, "lt"))
1.1 schwarze 160: printf("\\(la");
161: else if (0 == strcmp(esc, "gt"))
162: printf("\\(ra");
163: else if (0 == strcmp(esc, "vb"))
164: printf("\\(ba");
165: else if (0 == strcmp(esc, "sol"))
166: printf("\\(sl");
1.6 kristaps 167: else
168: return;
169:
170: last = 'a';
1.1 schwarze 171: }
172:
173: /*
1.9 kristaps 174: * Run some heuristics to intuit a link format.
1.19 kristaps 175: * I set "start" to be the end of the sequence (last right-carrot) so
1.9 kristaps 176: * that the caller can safely just continue processing.
1.19 kristaps 177: * If this is just an empty tag, I'll return 0.
1.9 kristaps 178: */
179: static int
180: trylink(const char *buf, size_t *start, size_t end, size_t dsz)
181: {
1.21 kristaps 182: size_t linkstart, realend, linkend,
183: i, j, textsz, stack;
1.9 kristaps 184:
185: /*
186: * Scan to the start of the terminus.
187: * This function is more or less replicated in the formatcode()
188: * for null or index formatting codes.
1.23 kristaps 189: * However, we're slightly different because we might have
190: * nested escapes we need to ignore.
1.9 kristaps 191: */
1.21 kristaps 192: stack = 0;
1.19 kristaps 193: for (linkstart = realend = *start; realend < end; realend++) {
1.23 kristaps 194: if ('<' == buf[realend])
195: stack++;
1.19 kristaps 196: if ('>' != buf[realend])
1.9 kristaps 197: continue;
1.23 kristaps 198: else if (stack-- > 0)
199: continue;
200: if (dsz == 1)
1.9 kristaps 201: break;
1.19 kristaps 202: assert(realend > 0);
203: if (' ' != buf[realend - 1])
1.9 kristaps 204: continue;
1.19 kristaps 205: for (i = realend, j = 0; i < end && j < dsz; j++)
1.9 kristaps 206: if ('>' != buf[i++])
207: break;
208: if (dsz == j)
209: break;
210: }
1.19 kristaps 211:
212: /* Ignore stubs. */
213: if (realend == end || realend == *start)
1.9 kristaps 214: return(0);
215:
1.19 kristaps 216: /* Set linkend to the end of content. */
217: linkend = dsz > 1 ? realend - 1 : realend;
1.18 kristaps 218:
1.19 kristaps 219: /* Re-scan to see if we have a title or section. */
220: for (textsz = *start; textsz < linkend; textsz++)
221: if ('|' == buf[textsz] || '/' == buf[textsz])
1.18 kristaps 222: break;
223:
1.19 kristaps 224: if (textsz < linkend && '|' == buf[textsz]) {
1.20 kristaps 225: /* With title: set start, then end at section. */
1.19 kristaps 226: linkstart = textsz + 1;
1.18 kristaps 227: textsz = textsz - *start;
1.19 kristaps 228: for (i = linkstart; i < linkend; i++)
229: if ('/' == buf[i])
230: break;
231: if (i < linkend)
232: linkend = i;
1.20 kristaps 233: } else if (textsz < linkend && '/' == buf[textsz]) {
234: /* With section: set end at section. */
235: linkend = textsz;
236: textsz = 0;
237: } else
238: /* No title, no section. */
1.18 kristaps 239: textsz = 0;
1.19 kristaps 240:
241: *start = realend;
242: j = linkend - linkstart;
243:
1.20 kristaps 244: /* Do we have only subsection material? */
245: if (0 == j && '/' == buf[linkend]) {
246: linkstart = linkend + 1;
247: linkend = dsz > 1 ? realend - 1 : realend;
248: if (0 == (j = linkend - linkstart))
249: return(0);
250: printf("Sx %.*s", (int)j, &buf[linkstart]);
251: return(1);
252: } else if (0 == j)
1.19 kristaps 253: return(0);
254:
255: /* See if we qualify as being a link or not. */
1.20 kristaps 256: if ((j > 4 && 0 == memcmp("http:", &buf[linkstart], j)) ||
257: (j > 5 && 0 == memcmp("https:", &buf[linkstart], j)) ||
258: (j > 3 && 0 == memcmp("ftp:", &buf[linkstart], j)) ||
259: (j > 4 && 0 == memcmp("sftp:", &buf[linkstart], j)) ||
260: (j > 3 && 0 == memcmp("smb:", &buf[linkstart], j)) ||
261: (j > 3 && 0 == memcmp("afs:", &buf[linkstart], j))) {
262: /* Gross. */
263: printf("Lk %.*s", (int)((dsz > 1 ? realend - 1 :
264: realend) - linkstart), &buf[linkstart]);
1.19 kristaps 265: return(1);
266: }
267:
268: /* See if we qualify as a mailto. */
1.20 kristaps 269: if (j > 6 && 0 == memcmp("mailto:", &buf[linkstart], j)) {
1.19 kristaps 270: printf("Mt %.*s", (int)j, &buf[linkstart]);
271: return(1);
272: }
273:
274: /* See if we're a foo(5), foo(5x), or foo(5xx) manpage. */
275: if ((j > 3 && ')' == buf[linkend - 1]) &&
276: ('(' == buf[linkend - 3])) {
277: printf("Xr %.*s %c", (int)(j - 3),
278: &buf[linkstart], buf[linkend - 2]);
279: return(1);
280: } else if ((j > 4 && ')' == buf[linkend - 1]) &&
281: ('(' == buf[linkend - 4])) {
282: printf("Xr %.*s %.*s", (int)(j - 4),
283: &buf[linkstart], 2, &buf[linkend - 3]);
284: return(1);
285: } else if ((j > 5 && ')' == buf[linkend - 1]) &&
286: ('(' == buf[linkend - 5])) {
287: printf("Xr %.*s %.*s", (int)(j - 5),
288: &buf[linkstart], 3, &buf[linkend - 4]);
289: return(1);
290: }
291:
292: /* Last try: do we have a double-colon? */
293: for (i = linkstart + 1; i < linkend; i++)
294: if (':' == buf[i] && ':' == buf[i - 1])
1.18 kristaps 295: break;
1.9 kristaps 296:
1.19 kristaps 297: if (i < linkend)
1.10 kristaps 298: printf("Xr %.*s " PERL_SECTION,
1.19 kristaps 299: (int)j, &buf[linkstart]);
1.9 kristaps 300: else
1.19 kristaps 301: printf("Xr %.*s 1", (int)j, &buf[linkstart]);
1.9 kristaps 302:
303: return(1);
304: }
305:
/*
 * Doclifting: if we're a bold "-xx" and we're in the SYNOPSIS section,
 * then it's likely that we're a flag.
 * Our flag might be followed by an argument, so make sure that we're
 * accounting for that, too.
 * If we don't have a flag at all, however, then assume we're an "Ar".
 *
 * On entry buf[*start] must be '-' and at least one more byte must
 * precede "end".
 * Prints "Fl name " for each flag found (looping over consecutive
 * space-separated flags) and "Ar " when an argument follows or when
 * the text isn't a flag.
 * On return "start" sits where the caller should resume printing.
 * NOTE(review): in the "not a flag" case "start" is moved back by one
 * byte, as the original code did -- confirm this is intended.
 */
static void
dosynopsisfl(const char *buf, size_t *start, size_t end)
{
	size_t	 i;
again:
	assert(*start + 1 < end);
	assert('-' == buf[*start]);

	/* The <ctype.h> functions need an unsigned char value. */
	if ( ! isalnum((unsigned char)buf[*start + 1]) &&
	    '?' != buf[*start + 1] &&
	    '-' != buf[*start + 1]) {
		(*start)--;
		fputs("Ar ", stdout);
		return;
	}

	/* Scan over the flag name. */
	(*start)++;
	for (i = *start; i < end; i++)
		if (isalnum((unsigned char)buf[i]))
			continue;
		else if ('?' == buf[i])
			continue;
		else if ('-' == buf[i])
			continue;
		else if ('_' == buf[i])
			continue;
		else
			break;

	assert(i < end);

	/* A flag name must be followed by a space or the closer. */
	if ( ! (' ' == buf[i] || '>' == buf[i])) {
		printf("Ar ");
		return;
	}

	printf("Fl ");
	/* Escape a name that looks like a two-letter macro ("Xx"). */
	if (end - *start > 1 &&
	    isupper((unsigned char)buf[*start]) &&
	    islower((unsigned char)buf[*start + 1]) &&
	    (end - *start == 2 ||
	     ' ' == buf[*start + 2]))
		printf("\\&");
	printf("%.*s ", (int)(i - *start), &buf[*start]);
	*start = i;

	if (' ' == buf[i]) {
		while (i < end && ' ' == buf[i])
			i++;
		assert(i < end);
		if ('-' == buf[i]) {
			/* Another flag follows: go around again. */
			*start = i;
			goto again;
		}
		/* Whatever follows is the flag's argument. */
		printf("Ar ");
		*start = i;
	}
}
371:
1.9 kristaps 372: /*
1.1 schwarze 373: * We're at the character in front of a format code, which is structured
374: * like X<...> and can contain nested format codes.
375: * This consumes the whole format code, and any nested format codes, til
376: * the end of matched production.
377: * If "reentrant", then we're being called after a macro has already
378: * been printed to the current line.
1.6 kristaps 379: * If "nomacro", then we don't print any macros, just contained data
380: * (e.g., following "Sh" or "Nm").
1.15 kristaps 381: * "pos" is only significant in SYNOPSIS, and should be 0 when invoked
382: * as the first format code on a line (for decoration as an "Nm"),
383: * non-zero otherwise.
1.6 kristaps 384: * Return whether we've printed a macro or not--in other words, whether
385: * this should trigger a subsequent newline (this should be ignored when
386: * reentrant).
1.1 schwarze 387: */
388: static int
1.15 kristaps 389: formatcode(struct state *st, const char *buf, size_t *start,
390: size_t end, int reentrant, int nomacro, int pos)
1.1 schwarze 391: {
392: enum fmt fmt;
1.5 kristaps 393: size_t i, j, dsz;
1.1 schwarze 394:
395: assert(*start + 1 < end);
396: assert('<' == buf[*start + 1]);
397:
1.6 kristaps 398: /*
399: * First, look up the format code.
400: * If it's not valid, then exit immediately.
401: */
402: for (fmt = 0; fmt < FMT__MAX; fmt++)
403: if (buf[*start] == fmts[fmt])
404: break;
405:
406: if (FMT__MAX == fmt) {
407: putchar(last = buf[(*start)++]);
1.8 kristaps 408: if ('\\' == last)
409: putchar('e');
1.6 kristaps 410: return(0);
411: }
412:
1.5 kristaps 413: /*
414: * Determine whether we're overriding our delimiter.
415: * According to POD, if we have more than one '<' followed by a
416: * space, then we need a space followed by matching '>' to close
417: * the expression.
418: * Otherwise we use the usual '<' and '>' matched pair.
419: */
420: i = *start + 1;
421: while (i < end && '<' == buf[i])
422: i++;
423: assert(i > *start + 1);
424: dsz = i - (*start + 1);
425: if (dsz > 1 && (i >= end || ' ' != buf[i]))
426: dsz = 1;
427:
428: /* Remember, if dsz>1, to jump the trailing space. */
429: *start += dsz + 1 + (dsz > 1 ? 1 : 0);
1.1 schwarze 430:
431: /*
1.6 kristaps 432: * Escapes and ignored codes (NULL and INDEX) don't print macro
433: * sequences, so just output them like normal text before
434: * processing for real macros.
1.1 schwarze 435: */
436: if (FMT_ESCAPE == fmt) {
437: formatescape(buf, start, end);
438: return(0);
439: } else if (FMT_NULL == fmt || FMT_INDEX == fmt) {
1.5 kristaps 440: /*
1.6 kristaps 441: * Just consume til the end delimiter, accounting for
442: * whether it's a custom one.
1.5 kristaps 443: */
444: for ( ; *start < end; (*start)++) {
445: if ('>' != buf[*start])
446: continue;
447: else if (dsz == 1)
448: break;
449: assert(*start > 0);
450: if (' ' != buf[*start - 1])
451: continue;
452: i = *start;
453: for (j = 0; i < end && j < dsz; j++)
454: if ('>' != buf[i++])
455: break;
456: if (dsz != j)
457: continue;
458: (*start) += dsz;
459: break;
460: }
1.24 kristaps 461: if (*start < end) {
462: assert('>' == buf[*start]);
463: (*start)++;
464: }
465: if (isspace(last))
466: while (*start < end && isspace((int)buf[*start]))
467: (*start)++;
1.1 schwarze 468: return(0);
469: }
470:
1.6 kristaps 471: /*
472: * Check whether we're supposed to print macro stuff (this is
473: * suppressed in, e.g., "Nm" and "Sh" macros).
474: */
1.1 schwarze 475: if ( ! nomacro) {
476: /*
477: * Print out the macro describing this format code.
478: * If we're not "reentrant" (not yet on a macro line)
479: * then print a newline, if necessary, and the macro
480: * indicator.
481: * Otherwise, offset us with a space.
482: */
1.6 kristaps 483: if ( ! reentrant) {
484: if (last != '\n')
485: putchar('\n');
1.1 schwarze 486: putchar('.');
1.6 kristaps 487: } else
1.1 schwarze 488: putchar(' ');
489:
490: /*
1.6 kristaps 491: * If we don't have whitespace before us (and none after
492: * the opening delimiter), then suppress macro
493: * whitespace with Pf.
1.1 schwarze 494: */
1.6 kristaps 495: if (' ' != last && '\n' != last && ' ' != buf[*start])
496: printf("Pf ");
497:
1.1 schwarze 498: switch (fmt) {
499: case (FMT_ITALIC):
500: printf("Em ");
501: break;
502: case (FMT_BOLD):
1.14 kristaps 503: if (SECT_SYNOPSIS == st->sect) {
504: if (1 == dsz && '-' == buf[*start])
505: dosynopsisfl(buf, start, end);
1.15 kristaps 506: else if (0 == pos)
507: printf("Nm ");
1.14 kristaps 508: else
509: printf("Ar ");
510: break;
511: }
1.27 schwarze 512: if (0 == strncmp(buf + *start, "NULL", 4) &&
513: ('=' == buf[*start + 4] ||
514: '>' == buf[*start + 4]))
515: printf("Dv ");
516: else
517: printf("Sy ");
1.1 schwarze 518: break;
519: case (FMT_CODE):
1.2 schwarze 520: printf("Qo Li ");
1.1 schwarze 521: break;
522: case (FMT_LINK):
1.19 kristaps 523: /* Try to link; use "No" if it's empty. */
1.9 kristaps 524: if ( ! trylink(buf, start, end, dsz))
525: printf("No ");
1.1 schwarze 526: break;
527: case (FMT_FILE):
528: printf("Pa ");
529: break;
530: case (FMT_NBSP):
531: printf("No ");
532: break;
533: default:
534: abort();
535: }
536: }
537:
538: /*
1.6 kristaps 539: * Process until we reach the end marker (e.g., '>') or until we
1.5 kristaps 540: * find a nested format code.
1.1 schwarze 541: * Don't emit any newlines: since we're on a macro line, we
542: * don't want to break the line.
543: */
544: while (*start < end) {
1.5 kristaps 545: if ('>' == buf[*start] && 1 == dsz) {
1.1 schwarze 546: (*start)++;
547: break;
1.5 kristaps 548: } else if ('>' == buf[*start] &&
549: ' ' == buf[*start - 1]) {
550: /*
551: * Handle custom delimiters.
552: * These require a certain number of
553: * space-preceded carrots before we're really at
554: * the end.
555: */
556: i = *start;
557: for (j = 0; i < end && j < dsz; j++)
558: if ('>' != buf[i++])
559: break;
560: if (dsz == j) {
561: *start += dsz;
562: break;
563: }
1.1 schwarze 564: }
565: if (*start + 1 < end && '<' == buf[*start + 1]) {
1.15 kristaps 566: formatcode(st, buf, start, end, 1, nomacro, 1);
1.1 schwarze 567: continue;
568: }
1.3 schwarze 569:
1.4 schwarze 570: /*
571: * Make sure that any macro-like words (or
572: * really any word starting with a capital
573: * letter) is assumed to be a macro that must be
574: * escaped.
575: * This matches "Xx " and "XxEOLN".
576: */
577: if ((' ' == last || '\n' == last) &&
578: end - *start > 1 &&
579: isupper((int)buf[*start]) &&
580: islower((int)buf[*start + 1]) &&
581: (end - *start == 2 ||
582: ' ' == buf[*start + 2]))
583: printf("\\&");
1.3 schwarze 584:
1.4 schwarze 585: /* Suppress newline. */
1.6 kristaps 586: if ('\n' == buf[*start])
587: putchar(last = ' ');
588: else
589: putchar(last = buf[*start]);
1.4 schwarze 590:
1.8 kristaps 591: /* Protect against character escapes. */
592: if ('\\' == last)
593: putchar('e');
594:
1.6 kristaps 595: (*start)++;
596:
597: if (' ' == last)
598: while (*start < end && ' ' == buf[*start])
599: (*start)++;
1.1 schwarze 600: }
1.2 schwarze 601:
602: if ( ! nomacro && FMT_CODE == fmt)
603: printf(" Qc ");
1.1 schwarze 604:
605: /*
1.6 kristaps 606: * We're now subsequent the format code.
607: * If there isn't a space (or newline) here, and we haven't just
608: * printed a space, then suppress space.
1.1 schwarze 609: */
1.6 kristaps 610: if ( ! nomacro && ' ' != last)
611: if (' ' != buf[*start] && '\n' != buf[*start])
612: printf(" Ns ");
1.5 kristaps 613:
1.1 schwarze 614: return(1);
615: }
616:
617: /*
618: * Calls formatcode() til the end of a paragraph.
619: */
620: static void
1.11 kristaps 621: formatcodeln(struct state *st, const char *buf,
622: size_t *start, size_t end, int nomacro)
1.1 schwarze 623: {
624:
1.4 schwarze 625: last = ' ';
1.1 schwarze 626: while (*start < end) {
627: if (*start + 1 < end && '<' == buf[*start + 1]) {
1.15 kristaps 628: formatcode(st, buf, start, end, 1, nomacro, 1);
1.1 schwarze 629: continue;
630: }
1.4 schwarze 631: /*
632: * Since we're already on a macro line, we want to make
633: * sure that we don't inadvertently invoke a macro.
634: * We need to do this carefully because section names
635: * are used in troff and we don't want to escape
636: * something that needn't be escaped.
637: */
638: if (' ' == last && end - *start > 1 &&
639: isupper((int)buf[*start]) &&
640: islower((int)buf[*start + 1]) &&
641: (end - *start == 2 ||
642: ' ' == buf[*start + 2]))
643: printf("\\&");
644:
1.8 kristaps 645: if ('\n' == buf[*start])
646: putchar(last = ' ');
647: else
1.1 schwarze 648: putchar(last = buf[*start]);
1.8 kristaps 649:
650: /* Protect against character escapes. */
651: if ('\\' == last)
652: putchar('e');
653:
1.1 schwarze 654: (*start)++;
655: }
656: }
657:
658: /*
1.4 schwarze 659: * Guess at what kind of list we are.
660: * These are taken straight from the POD manual.
661: * I don't know what people do in real life.
662: */
663: static enum list
664: listguess(const char *buf, size_t start, size_t end)
665: {
666: size_t len = end - start;
667:
668: assert(end >= start);
669:
670: if (len == 1 && '*' == buf[start])
671: return(LIST_BULLET);
672: if (len == 2 && '1' == buf[start] && '.' == buf[start + 1])
673: return(LIST_ENUM);
674: else if (len == 1 && '1' == buf[start])
675: return(LIST_ENUM);
676: else
677: return(LIST_TAG);
678: }
679:
680: /*
1.1 schwarze 681: * A command paragraph, as noted in the perlpod manual, just indicates
682: * that we should do something, optionally with some text to print as
683: * well.
684: */
685: static void
686: command(struct state *st, const char *buf, size_t start, size_t end)
687: {
688: size_t len, csz;
689: enum cmd cmd;
690:
691: assert('=' == buf[start]);
692: start++;
693: len = end - start;
694:
695: for (cmd = 0; cmd < CMD__MAX; cmd++) {
696: csz = strlen(cmds[cmd]);
697: if (len < csz)
698: continue;
699: if (0 == memcmp(&buf[start], cmd[cmds], csz))
700: break;
701: }
702:
703: /* Ignore bogus commands. */
704:
705: if (CMD__MAX == cmd)
706: return;
707:
708: start += csz;
1.8 kristaps 709: while (start < end && ' ' == buf[start])
710: start++;
711:
1.1 schwarze 712: len = end - start;
713:
714: if (st->paused) {
715: st->paused = CMD_END != cmd;
716: return;
717: }
718:
719: switch (cmd) {
720: case (CMD_POD):
721: break;
722: case (CMD_HEAD1):
723: /*
724: * The behaviour of head= follows from a quick glance at
725: * how pod2man handles it.
726: */
727: printf(".Sh ");
1.11 kristaps 728: st->sect = SECT_NONE;
729: if (end - start == 4) {
1.1 schwarze 730: if (0 == memcmp(&buf[start], "NAME", 4))
1.11 kristaps 731: st->sect = SECT_NAME;
732: } else if (end - start == 8) {
733: if (0 == memcmp(&buf[start], "SYNOPSIS", 8))
734: st->sect = SECT_SYNOPSIS;
735: }
736: formatcodeln(st, buf, &start, end, 1);
1.1 schwarze 737: putchar('\n');
738: st->haspar = 1;
739: break;
740: case (CMD_HEAD2):
741: printf(".Ss ");
1.11 kristaps 742: formatcodeln(st, buf, &start, end, 1);
1.1 schwarze 743: putchar('\n');
744: st->haspar = 1;
745: break;
746: case (CMD_HEAD3):
747: puts(".Pp");
748: printf(".Em ");
1.11 kristaps 749: formatcodeln(st, buf, &start, end, 0);
1.1 schwarze 750: putchar('\n');
751: puts(".Pp");
752: st->haspar = 1;
753: break;
754: case (CMD_HEAD4):
755: puts(".Pp");
756: printf(".No ");
1.11 kristaps 757: formatcodeln(st, buf, &start, end, 0);
1.1 schwarze 758: putchar('\n');
759: puts(".Pp");
760: st->haspar = 1;
761: break;
762: case (CMD_OVER):
1.4 schwarze 763: /*
764: * If we have an existing list that hasn't had an =item
765: * yet, then make sure that we open it now.
766: * We use the default list type, but that can't be
767: * helped (we haven't seen any items yet).
1.1 schwarze 768: */
1.4 schwarze 769: if (st->lpos > 0)
770: if (LIST__MAX == st->lstack[st->lpos - 1]) {
771: st->lstack[st->lpos - 1] = LIST_TAG;
772: puts(".Bl -tag -width Ds");
773: }
774: st->lpos++;
775: assert(st->lpos < LIST_STACKSZ);
776: st->lstack[st->lpos - 1] = LIST__MAX;
1.1 schwarze 777: break;
778: case (CMD_ITEM):
1.6 kristaps 779: if (0 == st->lpos) {
780: /*
781: * Bad markup.
782: * Try to compensate.
783: */
784: st->lstack[st->lpos] = LIST__MAX;
785: st->lpos++;
786: }
1.4 schwarze 787: assert(st->lpos > 0);
788: /*
789: * If we're the first =item, guess at what our content
790: * will be: "*" is a bullet list, "1." is a numbered
791: * list, and everything is tagged.
792: */
793: if (LIST__MAX == st->lstack[st->lpos - 1]) {
794: st->lstack[st->lpos - 1] =
795: listguess(buf, start, end);
796: switch (st->lstack[st->lpos - 1]) {
797: case (LIST_BULLET):
798: puts(".Bl -bullet");
799: break;
800: case (LIST_ENUM):
801: puts(".Bl -enum");
802: break;
803: default:
804: puts(".Bl -tag -width Ds");
805: break;
806: }
807: }
808: switch (st->lstack[st->lpos - 1]) {
809: case (LIST_TAG):
810: printf(".It ");
1.11 kristaps 811: formatcodeln(st, buf, &start, end, 0);
1.4 schwarze 812: putchar('\n');
813: break;
814: case (LIST_ENUM):
815: /* FALLTHROUGH */
816: case (LIST_BULLET):
817: /*
818: * Abandon the remainder of the paragraph
819: * because we're going to be a bulletted or
820: * numbered list.
821: */
822: puts(".It");
823: break;
824: default:
825: abort();
826: }
1.1 schwarze 827: st->haspar = 1;
828: break;
829: case (CMD_BACK):
1.4 schwarze 830: /* Make sure we don't back over the stack. */
831: if (st->lpos > 0) {
832: st->lpos--;
833: puts(".El");
834: }
1.1 schwarze 835: break;
836: case (CMD_BEGIN):
837: /*
838: * We disregard all types for now.
839: * TODO: process at least "text" in a -literal block.
840: */
841: st->paused = 1;
842: break;
843: case (CMD_FOR):
844: /*
845: * We ignore all types of encodings and formats
846: * unilaterally.
847: */
848: break;
849: case (CMD_ENCODING):
850: break;
851: case (CMD_CUT):
852: st->parsing = 0;
853: return;
854: default:
855: abort();
856: }
857:
858: /* Any command (but =cut) makes us start parsing. */
859: st->parsing = 1;
860: }
861:
862: /*
863: * Just pump out the line in a verbatim block.
864: */
865: static void
866: verbatim(struct state *st, const char *buf, size_t start, size_t end)
867: {
1.8 kristaps 868: int last;
1.22 kristaps 869: size_t i;
1.1 schwarze 870:
871: if ( ! st->parsing || st->paused)
872: return;
1.22 kristaps 873: again:
874: /*
875: * If we're in the SYNOPSIS, see if we're an #include block.
876: * If we are, then print the "In" macro and re-loop.
877: * This handles any number of inclusions, but only when they
878: * come before the remaining parts...
879: */
880: if (SECT_SYNOPSIS == st->sect) {
881: i = start;
882: for (i = start; i < end && ' ' == buf[i]; i++)
883: /* Spin. */ ;
884: if (i == end)
885: return;
886: /* We're an include block! */
887: if (end - i > 10 &&
888: 0 == memcmp(&buf[i], "#include <", 10)) {
889: start = i + 10;
890: while (start < end && ' ' == buf[start])
891: start++;
892: fputs(".In ", stdout);
893: /* Stop til the '>' marker or we hit eoln. */
894: while (start < end &&
895: '>' != buf[start] && '\n' != buf[start])
896: putchar(buf[start++]);
897: putchar('\n');
898: if (start < end && '>' == buf[start])
899: start++;
900: if (start < end && '\n' == buf[start])
901: start++;
902: if (start < end)
903: goto again;
904: return;
905: }
906: }
907:
908: if (start == end)
909: return;
1.1 schwarze 910: puts(".Bd -literal");
1.8 kristaps 911: for (last = ' '; start < end; start++) {
912: /*
913: * Handle accidental macros (newline starting with
914: * control character) and escapes.
915: */
916: if ('\n' == last)
1.7 kristaps 917: if ('.' == buf[start] || '\'' == buf[start])
918: printf("\\&");
1.8 kristaps 919: putchar(last = buf[start]);
920: if ('\\' == buf[start])
921: printf("e");
1.7 kristaps 922: }
923: putchar('\n');
1.1 schwarze 924: puts(".Ed");
925: }
926:
927: /*
1.13 kristaps 928: * See dosynopsisop().
929: */
930: static int
931: hasmatch(const char *buf, size_t start, size_t end)
932: {
933: size_t stack;
934:
935: for (stack = 0; start < end; start++)
936: if (buf[start] == '[')
937: stack++;
938: else if (buf[start] == ']' && 0 == stack)
939: return(1);
940: else if (buf[start] == ']')
941: stack--;
942: return(0);
943: }
944:
/*
 * If we're in the SYNOPSIS section and we've encounter braces in an
 * ordinary paragraph, then try to see whether we're an [-option].
 * Do this, if we're an opening bracket, by first seeing if we have a
 * matching end via hasmatch().
 * If we're an ending bracket, see if we have a stack already.
 *
 * buf[*start] must be '[' or ']'.
 * "opstack" counts the option blocks currently open and "last" is the
 * last character written.
 * Returns 1 if we printed an "Oo" or "Oc" line; then the bracket and
 * any spaces after it have been consumed and "last" is a newline.
 * Returns 0, touching nothing, if the bracket is just ordinary text.
 */
static int
dosynopsisop(const char *buf, int *last,
	size_t *start, size_t end, size_t *opstack)
{

	assert('[' == buf[*start] || ']' == buf[*start]);

	if ('[' == buf[*start] && hasmatch(buf, *start + 1, end)) {
		if ('\n' != *last)
			putchar('\n');
		puts(".Oo");
		(*opstack)++;
	} else if ('[' == buf[*start])
		return(0);

	if (']' == buf[*start] && *opstack > 0) {
		if ('\n' != *last)
			putchar('\n');
		puts(".Oc");
		(*opstack)--;
	} else if (']' == buf[*start])
		return(0);

	(*start)++;
	*last = '\n';
	/* Eat trailing spaces, but never run beyond the paragraph. */
	while (*start < end && ' ' == buf[*start])
		(*start)++;
	return(1);
}
981:
982: /*
1.17 kristaps 983: * Format multiple "Nm" manpage names in the NAME section.
984: */
985: static void
986: donamenm(struct state *st, const char *buf, size_t *start, size_t end)
987: {
988: size_t word;
989:
990: while (*start < end && ' ' == buf[*start])
991: (*start)++;
992:
993: if (end == *start) {
994: puts(".Nm unknown");
995: return;
996: }
997:
998: while (*start < end) {
999: fputs(".Nm ", stdout);
1000: for (word = *start; word < end; word++)
1001: if (',' == buf[word])
1002: break;
1003: formatcodeln(st, buf, start, word, 1);
1004: if (*start == end) {
1005: putchar('\n');
1006: continue;
1007: }
1008: assert(',' == buf[*start]);
1009: puts(" ,");
1010: (*start)++;
1011: while (*start < end && ' ' == buf[*start])
1012: (*start)++;
1013: }
1014: }
1015:
1016: /*
1.1 schwarze 1017: * Ordinary paragraph.
1018: * Well, this is really the hardest--POD seems to assume that, for
1019: * example, a leading space implies a newline, and so on.
1020: * Lots of other snakes in the grass: escaping a newline followed by a
1021: * period (accidental mdoc(7) control), double-newlines after macro
1022: * passages, etc.
1023: */
1024: static void
1025: ordinary(struct state *st, const char *buf, size_t start, size_t end)
1026: {
1.13 kristaps 1027: size_t i, j, opstack;
1.15 kristaps 1028: int seq;
1.1 schwarze 1029:
1030: if ( ! st->parsing || st->paused)
1031: return;
1032:
1033: /*
1034: * Special-case: the NAME section.
1035: * If we find a "-" when searching from the end, assume that
1036: * we're in "name - description" format.
1037: * To wit, print out a "Nm" and "Nd" in that format.
1038: */
1.11 kristaps 1039: if (SECT_NAME == st->sect) {
1.15 kristaps 1040: for (i = end - 2; i > start; i--)
1041: if ('-' == buf[i] && ' ' == buf[i + 1])
1.1 schwarze 1042: break;
1043: if ('-' == buf[i]) {
1044: j = i;
1045: /* Roll over multiple "-". */
1046: for ( ; i > start; i--)
1047: if ('-' != buf[i])
1048: break;
1.17 kristaps 1049: donamenm(st, buf, &start, i + 1);
1.5 kristaps 1050: start = j + 1;
1.17 kristaps 1051: while (start < end && ' ' == buf[start])
1052: start++;
1.15 kristaps 1053: fputs(".Nd ", stdout);
1.11 kristaps 1054: formatcodeln(st, buf, &start, end, 1);
1.5 kristaps 1055: putchar('\n');
1.1 schwarze 1056: return;
1057: }
1058: }
1059:
1060: if ( ! st->haspar)
1061: puts(".Pp");
1062:
1063: st->haspar = 0;
1064: last = '\n';
1.13 kristaps 1065: opstack = 0;
1.1 schwarze 1066:
1.15 kristaps 1067: for (seq = 0; start < end; seq++) {
1.1 schwarze 1068: /*
1069: * Loop til we get either to a newline or escape.
1070: * Escape initial control characters.
1071: */
1072: while (start < end) {
1073: if (start < end - 1 && '<' == buf[start + 1])
1074: break;
1075: else if ('\n' == buf[start])
1076: break;
1077: else if ('\n' == last && '.' == buf[start])
1078: printf("\\&");
1079: else if ('\n' == last && '\'' == buf[start])
1080: printf("\\&");
1.12 kristaps 1081: /*
1082: * If we're in the SYNOPSIS, have square
1083: * brackets indicate that we're opening and
1084: * closing an optional context.
1085: */
1.13 kristaps 1086: if (SECT_SYNOPSIS == st->sect &&
1087: ('[' == buf[start] ||
1088: ']' == buf[start]) &&
1089: dosynopsisop(buf, &last,
1090: &start, end, &opstack))
1091: continue;
1.1 schwarze 1092: putchar(last = buf[start++]);
1.8 kristaps 1093: if ('\\' == last)
1094: putchar('e');
1.1 schwarze 1095: }
1096:
1097: if (start < end - 1 && '<' == buf[start + 1]) {
1098: /*
1099: * We've encountered a format code.
1100: * This is going to trigger a macro no matter
1101: * what, so print a newline now.
1102: * Then print the (possibly nested) macros and
1103: * following that, a newline.
1.8 kristaps 1104: * Consume all whitespace so we don't
1105: * accidentally start an implicit literal line.
1.16 kristaps 1106: * If the macro ends with a flush comma or
1107: * period, let mdoc(7) handle it for us.
1.1 schwarze 1108: */
1.15 kristaps 1109: if (formatcode(st, buf, &start, end, 0, 0, seq)) {
1.16 kristaps 1110: if ((start == end - 1 ||
1111: (start < end - 1 &&
1112: (' ' == buf[start + 1] ||
1113: '\n' == buf[start + 1]))) &&
1114: ('.' == buf[start] ||
1115: ',' == buf[start])) {
1116: putchar(' ');
1117: putchar(buf[start++]);
1118: }
1.1 schwarze 1119: putchar(last = '\n');
1.6 kristaps 1120: while (start < end && ' ' == buf[start])
1121: start++;
1122: }
1.1 schwarze 1123: } else if (start < end && '\n' == buf[start]) {
1124: /*
1125: * Print the newline only if we haven't already
1126: * printed a newline.
1127: */
1128: if (last != '\n')
1129: putchar(last = buf[start]);
1130: if (++start >= end)
1131: continue;
1132: /*
1133: * If we have whitespace next, eat it to prevent
1134: * mdoc(7) from thinking that it's meant for
1135: * verbatim text.
1136: * It is--but if we start with that, we can't
1137: * have a macro subsequent it, which may be
1138: * possible if we have an escape next.
1139: */
1140: if (' ' == buf[start] || '\t' == buf[start]) {
1141: puts(".br");
1142: last = '\n';
1143: }
1144: for ( ; start < end; start++)
1145: if (' ' != buf[start] && '\t' != buf[start])
1146: break;
1.12 kristaps 1147: }
1.1 schwarze 1148: }
1149:
1150: if (last != '\n')
1151: putchar('\n');
1152: }
1153:
/*
 * Hand one paragraph, the bytes buf[start] up to but excluding
 * buf[end], to the matching handler.
 * The first byte decides the kind: a space or tab makes it verbatim,
 * an "=" makes it a command, anything else is ordinary text.
 * An empty paragraph is silently dropped.
 */
static void
dopar(struct state *st, const char *buf, size_t start, size_t end)
{
	char	 lead;

	if (start == end)
		return;

	lead = buf[start];
	switch (lead) {
	case ' ':
	case '\t':
		verbatim(st, buf, start, end);
		break;
	case '=':
		command(st, buf, start, end);
		break;
	default:
		ordinary(st, buf, start, end);
		break;
	}
}
1172:
1173: /*
1174: * Loop around paragraphs within a document, processing each one in the
1175: * POD way.
1176: */
1177: static void
1178: dofile(const struct args *args, const char *fname,
1179: const struct tm *tm, const char *buf, size_t sz)
1180: {
1181: size_t sup, end, i, cur = 0;
1182: struct state st;
1183: const char *section, *date;
1184: char datebuf[64];
1185: char *title, *cp;
1186:
1187: if (0 == sz)
1188: return;
1189:
1190: /* Title is last path component of the filename. */
1191:
1192: if (NULL != args->title)
1193: title = strdup(args->title);
1194: else if (NULL != (cp = strrchr(fname, '/')))
1195: title = strdup(cp + 1);
1196: else
1197: title = strdup(fname);
1198:
1199: if (NULL == title) {
1200: perror(NULL);
1201: exit(EXIT_FAILURE);
1202: }
1203:
1204: /* Section is 1 unless suffix is "pm". */
1205:
1206: if (NULL == (section = args->section)) {
1207: section = "1";
1208: if (NULL != (cp = strrchr(title, '.'))) {
1209: *cp++ = '\0';
1210: if (0 == strcmp(cp, "pm"))
1.10 kristaps 1211: section = PERL_SECTION;
1.1 schwarze 1212: }
1213: }
1214:
1215: /* Date. Or the given "tm" if not supplied. */
1216:
1217: if (NULL == (date = args->date)) {
1218: strftime(datebuf, sizeof(datebuf), "%B %d, %Y", tm);
1219: date = datebuf;
1220: }
1221:
1222: for (cp = title; '\0' != *cp; cp++)
1223: *cp = toupper((int)*cp);
1224:
1225: /* The usual mdoc(7) preamble. */
1226:
1227: printf(".Dd %s\n", date);
1228: printf(".Dt %s %s\n", title, section);
1229: puts(".Os");
1230:
1231: free(title);
1232:
1233: memset(&st, 0, sizeof(struct state));
1234: assert(sz > 0);
1235:
1236: /* Main loop over file contents. */
1237:
1238: while (cur < sz) {
1239: /* Read until next paragraph. */
1240: for (i = cur + 1; i < sz; i++)
1241: if ('\n' == buf[i] && '\n' == buf[i - 1]) {
1242: /* Consume blank paragraphs. */
1243: while (i + 1 < sz && '\n' == buf[i + 1])
1244: i++;
1245: break;
1246: }
1247:
1248: /* Adjust end marker for EOF. */
1249: end = i < sz ? i - 1 :
1250: ('\n' == buf[sz - 1] ? sz - 1 : sz);
1251: sup = i < sz ? end + 2 : sz;
1252:
1253: /* Process paragraph and adjust start. */
1254: dopar(&st, buf, cur, end);
1255: cur = sup;
1256: }
1257: }
1258:
/*
 * Read a single file fully into memory.
 * If the file is "-", do it from stdin.
 * If successfully read, send the input buffer to dofile() for further
 * processing.
 *
 * Returns 1 on success, 0 if the file cannot be opened or read (a
 * message is printed with perror() in both cases).
 * Exits the program on memory allocation failure.
 */
static int
readfile(const struct args *args, const char *fname)
{
	int		 fd;
	char		*buf;
	size_t		 bufsz, cur;
	ssize_t		 ssz;
	struct tm	*tm;
	time_t		 ttm;
	struct stat	 st;

	fd = 0 != strcmp("-", fname) ?
		open(fname, O_RDONLY, 0) : STDIN_FILENO;

	if (-1 == fd) {
		perror(fname);
		return(0);
	}

	/*
	 * The date passed on is the file's modification time, or the
	 * current time for stdin or when fstat() fails.
	 */
	if (STDIN_FILENO == fd || -1 == fstat(fd, &st)) {
		ttm = time(NULL);
		tm = localtime(&ttm);
	} else
		tm = localtime(&st.st_mtime);

	/*
	 * Arbitrarily-sized initial buffer.
	 * Should be big enough for most files...
	 */
	cur = 0;
	bufsz = 1 << 14;
	if (NULL == (buf = malloc(bufsz))) {
		perror(NULL);
		exit(EXIT_FAILURE);
	}

	while ((ssz = read(fd, buf + cur, bufsz - cur)) > 0) {
		/* Double buffer size on fill. */
		if ((size_t)ssz == bufsz - cur) {
			bufsz *= 2;
			/* Failure exits at once, so the old pointer is moot. */
			if (NULL == (buf = realloc(buf, bufsz))) {
				perror(NULL);
				exit(EXIT_FAILURE);
			}
		}
		cur += (size_t)ssz;
	}
	if (ssz < 0) {
		/* Report first: close() could overwrite errno. */
		perror(fname);
		free(buf);
		/* Release the descriptor here too, as on the success path. */
		if (STDIN_FILENO != fd)
			close(fd);
		return(0);
	}

	dofile(args, STDIN_FILENO == fd ?
		"STDIN" : fname, tm, buf, cur);
	free(buf);
	if (STDIN_FILENO != fd)
		close(fd);
	return(1);
}
1325:
1326: int
1327: main(int argc, char *argv[])
1328: {
1329: const char *fname, *name;
1330: struct args args;
1331: int c;
1332:
1333: name = strrchr(argv[0], '/');
1334: if (name == NULL)
1335: name = argv[0];
1336: else
1337: ++name;
1338:
1339: memset(&args, 0, sizeof(struct args));
1340: fname = "-";
1341:
1342: /* Accept no arguments for now. */
1343:
1344: while (-1 != (c = getopt(argc, argv, "c:d:hln:oq:rs:uv")))
1345: switch (c) {
1346: case ('h'):
1347: /* FALLTHROUGH */
1348: case ('l'):
1349: /* FALLTHROUGH */
1350: case ('c'):
1351: /* FALLTHROUGH */
1352: case ('o'):
1353: /* FALLTHROUGH */
1354: case ('q'):
1355: /* FALLTHROUGH */
1356: case ('r'):
1357: /* FALLTHROUGH */
1358: case ('u'):
1359: /* FALLTHROUGH */
1360: case ('v'):
1361: /* Ignore these. */
1362: break;
1363: case ('d'):
1364: args.date = optarg;
1365: break;
1366: case ('n'):
1367: args.title = optarg;
1368: break;
1369: case ('s'):
1370: args.section = optarg;
1371: break;
1372: default:
1373: goto usage;
1374: }
1375:
1376: argc -= optind;
1377: argv += optind;
1378:
1379: /* Accept only a single input file. */
1380:
1.25 schwarze 1381: if (argc > 1)
1382: goto usage;
1.1 schwarze 1383: else if (1 == argc)
1384: fname = *argv;
1385:
1386: return(readfile(&args, fname) ?
1387: EXIT_SUCCESS : EXIT_FAILURE);
1388:
1389: usage:
1390: fprintf(stderr, "usage: %s [-d date] "
1.25 schwarze 1391: "[-n title] [-s section] [file]\n", name);
1.1 schwarze 1392:
1393: return(EXIT_FAILURE);
1394: }
CVSweb