Annotation of pod2mdoc/pod2mdoc.c, Revision 1.3
1.3 ! schwarze 1: /* $Id: pod2mdoc.c,v 1.2 2014/03/20 15:15:32 schwarze Exp $ */
1.1 schwarze 2: /*
3: * Copyright (c) 2014 Kristaps Dzonsons <kristaps@bsd.lv>
4: *
5: * Permission to use, copy, modify, and distribute this software for any
6: * purpose with or without fee is hereby granted, provided that the above
7: * copyright notice and this permission notice appear in all copies.
8: *
9: * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
10: * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11: * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
12: * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13: * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14: * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15: * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16: */
#include <sys/stat.h>
#include <sys/time.h>

#include <assert.h>
#include <ctype.h>
#include <fcntl.h>
#include <getopt.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <time.h>
#include <unistd.h>
28:
/*
 * Command-line overrides for the mdoc(7) prologue.
 * A NULL member means the value was not given and is derived instead.
 */
struct args {
	const char	*title;		/* replaces the "Dt" title */
	const char	*date;		/* replaces the "Dd" date */
	const char	*section;	/* replaces the "Dt" section */
};
34:
/*
 * Parser state carried from one paragraph to the next.
 */
struct state {
	int		 parsing;	/* set by any command, cleared by =cut */
	int		 paused;	/* set between =begin and =end */
	int		 haspar;	/* when set, the next ".Pp" is suppressed */
	int		 isname;	/* set while inside the NAME section */
	const char	*fname;		/* file being parsed */
};
42:
/*
 * POD format codes.
 * The values are used as indices into the fmts[] letter table, so the
 * order here and there must agree.
 */
enum fmt {
	FMT_ITALIC = 0,
	FMT_BOLD,
	FMT_CODE,
	FMT_LINK,
	FMT_ESCAPE,
	FMT_FILE,
	FMT_NBSP,
	FMT_INDEX,
	FMT_NULL,
	FMT__MAX	/* number of format codes; not a code itself */
};
55:
/*
 * POD command paragraphs ("=name").
 * The values are used as indices into the cmds[] name table, so the
 * order here and there must agree.
 */
enum cmd {
	CMD_POD,
	CMD_HEAD1,
	CMD_HEAD2,
	CMD_HEAD3,
	CMD_HEAD4,
	CMD_OVER,
	CMD_ITEM,
	CMD_BACK,
	CMD_BEGIN,
	CMD_END,
	CMD_FOR,
	CMD_ENCODING,
	CMD_CUT,
	CMD__MAX	/* number of commands; not a command itself */
};
72:
73: static const char *const cmds[CMD__MAX] = {
74: "pod", /* CMD_POD */
75: "head1", /* CMD_HEAD1 */
76: "head2", /* CMD_HEAD2 */
77: "head3", /* CMD_HEAD3 */
78: "head4", /* CMD_HEAD4 */
79: "over", /* CMD_OVER */
80: "item", /* CMD_ITEM */
81: "back", /* CMD_BACK */
82: "begin", /* CMD_BEGIN */
83: "end", /* CMD_END */
84: "for", /* CMD_FOR */
85: "encoding", /* CMD_ENCODING */
86: "cut" /* CMD_CUT */
87: };
88:
89: static const char fmts[FMT__MAX] = {
90: 'I', /* FMT_ITALIC */
91: 'B', /* FMT_BOLD */
92: 'C', /* FMT_CODE */
93: 'L', /* FMT_LINK */
94: 'E', /* FMT_ESCAPE */
95: 'F', /* FMT_FILE */
96: 'S', /* FMT_NBSP */
97: 'X', /* FMT_INDEX */
98: 'Z' /* FMT_NULL */
99: };
100:
/*
 * Given buf[*start] is at the start of an escape name, read til the end
 * of the escape ('>') then try to do something with it.
 * On return, *start is one after the '>', or equal to "end" if the
 * escape was never terminated.
 * Names that are too long or not recognised are consumed silently.
 */
static void
formatescape(const char *buf, size_t *start, size_t end)
{
	char		 esc[16]; /* no more needed */
	size_t		 i, max;

	max = sizeof(esc) - 1;
	i = 0;
	/* Read til our buffer is full. */
	while (*start < end && '>' != buf[*start] && i < max)
		esc[i++] = buf[(*start)++];
	esc[i] = '\0';

	/*
	 * If the name was too long, skip whatever is left of it.
	 * The truncated copy is max characters long, which is longer
	 * than any name we know, so it will not match anything below.
	 */
	while (*start < end && '>' != buf[*start])
		(*start)++;

	/* Unterminated escape: there is no '>' to step over. */
	if (*start >= end)
		return;

	/* Always consume the '>', so it cannot close an outer code. */
	(*start)++;

	/*
	 * TODO: right now, we only recognise the named escapes.
	 * Just let the rest of them go.
	 * perlpod calls the vertical bar "verbar"; "vb" is kept
	 * because earlier revisions accepted it.
	 */
	if (0 == strcmp(esc, "lt"))
		printf("\\(la");
	else if (0 == strcmp(esc, "gt"))
		printf("\\(ra");
	else if (0 == strcmp(esc, "verbar") || 0 == strcmp(esc, "vb"))
		printf("\\(ba");
	else if (0 == strcmp(esc, "sol"))
		printf("\\(sl");
}
143:
/*
 * Advance *start past a run of ' ' characters, stopping at "end".
 * Only the space character is skipped: tabs and newlines are not.
 */
static void
skipspace(const char *buf, size_t *start, size_t end)
{
	size_t		 pos;

	for (pos = *start; pos < end; pos++)
		if (' ' != buf[pos])
			break;

	*start = pos;
}
154:
155: /*
156: * We're at the character in front of a format code, which is structured
157: * like X<...> and can contain nested format codes.
158: * This consumes the whole format code, and any nested format codes, til
159: * the end of matched production.
160: * If "reentrant", then we're being called after a macro has already
161: * been printed to the current line.
162: * "last" is set to the last read character: this is used to determine
163: * whether we should buffer with space or not.
164: * If "nomacro", then we don't print any macros, just contained data.
165: */
166: static int
167: formatcode(const char *buf, size_t *start,
168: size_t end, int reentrant, int last, int nomacro)
169: {
170: enum fmt fmt;
171:
172: assert(*start + 1 < end);
173: assert('<' == buf[*start + 1]);
174:
175: for (fmt = 0; fmt < FMT__MAX; fmt++)
176: if (buf[*start] == fmts[fmt])
177: break;
178:
179: /* Invalid macros are just regular text. */
180:
181: if (FMT__MAX == fmt) {
182: putchar(buf[*start]);
183: (*start)++;
184: return(0);
185: }
186:
187: *start += 2;
188:
189: /*
190: * Escapes don't print macro sequences, so just output them like
191: * normal text before processing for macros.
192: */
193: if (FMT_ESCAPE == fmt) {
194: formatescape(buf, start, end);
195: return(0);
196: } else if (FMT_NULL == fmt || FMT_INDEX == fmt) {
197: /* For indices and nulls, just consume. */
198: while (*start < end && '>' != buf[*start])
199: (*start)++;
200: if (*start < end)
201: (*start)++;
202: return(0);
203: }
204:
205: if ( ! nomacro) {
206: /*
207: * Print out the macro describing this format code.
208: * If we're not "reentrant" (not yet on a macro line)
209: * then print a newline, if necessary, and the macro
210: * indicator.
211: * Otherwise, offset us with a space.
212: */
213: if ( ! reentrant && last != '\n')
214: putchar('\n');
215: if ( ! reentrant)
216: putchar('.');
217: else
218: putchar(' ');
219:
220: /*
221: * If we don't have whitespace before us, then suppress
222: * macro whitespace with Ns.
223: */
224: if (' ' != last)
225: printf("Ns ");
226: switch (fmt) {
227: case (FMT_ITALIC):
228: printf("Em ");
229: break;
230: case (FMT_BOLD):
231: printf("Sy ");
232: break;
233: case (FMT_CODE):
1.2 schwarze 234: printf("Qo Li ");
1.1 schwarze 235: break;
236: case (FMT_LINK):
237: printf("Lk ");
238: break;
239: case (FMT_FILE):
240: printf("Pa ");
241: break;
242: case (FMT_NBSP):
243: /* TODO. */
244: printf("No ");
245: break;
246: default:
247: abort();
248: }
249: }
250:
251: /*
252: * Read until we reach the end market ('>') or until we find a
253: * nested format code.
254: * Don't emit any newlines: since we're on a macro line, we
255: * don't want to break the line.
256: */
257: while (*start < end) {
258: if ('>' == buf[*start]) {
259: (*start)++;
260: break;
261: }
262: if (*start + 1 < end && '<' == buf[*start + 1]) {
263: formatcode(buf, start, end, 1, last, nomacro);
264: continue;
265: }
1.3 ! schwarze 266:
1.1 schwarze 267: /*
268: * Make sure that any macro-like words (or
269: * really any word starting with a capital
270: * letter) is assumed to be a macro that must be
271: * escaped.
272: * XXX: should this be isalpha()?
273: */
274: if ((' ' == last || '\n' == last) &&
275: isupper(buf[*start]))
276: printf("\\&");
1.3 ! schwarze 277:
! 278: last = buf[*start];
! 279: if ('\n' == last)
! 280: last = ' ';
! 281: putchar(last);
! 282:
1.1 schwarze 283: (*start)++;
284: }
1.2 schwarze 285:
286: if ( ! nomacro && FMT_CODE == fmt)
287: printf(" Qc ");
1.1 schwarze 288:
289: if (reentrant)
290: return(1);
291:
292: /*
293: * If we're not reentrant, we want to put ending punctuation on
294: * the macro line so that it's properly handled by being
295: * smooshed against the terminal word.
296: */
297: skipspace(buf, start, end);
298: if (',' != buf[*start] && '.' != buf[*start] &&
299: '!' != buf[*start] && '?' != buf[*start] &&
300: ')' != buf[*start])
301: return(1);
302: while (*start < end) {
303: if (',' != buf[*start] &&
304: '.' != buf[*start] &&
305: '!' != buf[*start] &&
306: '?' != buf[*start] &&
307: ')' != buf[*start])
308: break;
309: putchar(' ');
310: putchar(buf[*start]);
311: (*start)++;
312: }
313: skipspace(buf, start, end);
314: return(1);
315: }
316:
/*
 * Calls formatcode() til the end of a paragraph.
 * Everything is written to the current output line, which the caller
 * has already started with a macro: format codes are therefore passed
 * down as "reentrant".
 * A newline inside the paragraph is printed as a single space, as
 * formatcode() does, so that the words on either side stay separate.
 * "nomacro" is handed to formatcode() unchanged.
 * On return, *start equals "end".
 */
static void
formatcodeln(const char *buf, size_t *start, size_t end, int nomacro)
{
	int		 last;

	last = '\n';
	while (*start < end) {
		if (*start + 1 < end && '<' == buf[*start + 1]) {
			formatcode(buf, start, end, 1, last, nomacro);
			continue;
		}
		if ('\n' != buf[*start])
			putchar(last = buf[*start]);
		else if (' ' != last)
			putchar(last = ' ');
		(*start)++;
	}
}
336:
337: /*
338: * A command paragraph, as noted in the perlpod manual, just indicates
339: * that we should do something, optionally with some text to print as
340: * well.
341: */
342: static void
343: command(struct state *st, const char *buf, size_t start, size_t end)
344: {
345: size_t len, csz;
346: enum cmd cmd;
347:
348: assert('=' == buf[start]);
349: start++;
350: len = end - start;
351:
352: for (cmd = 0; cmd < CMD__MAX; cmd++) {
353: csz = strlen(cmds[cmd]);
354: if (len < csz)
355: continue;
356: if (0 == memcmp(&buf[start], cmd[cmds], csz))
357: break;
358: }
359:
360: /* Ignore bogus commands. */
361:
362: if (CMD__MAX == cmd)
363: return;
364:
365: start += csz;
366: skipspace(buf, &start, end);
367: len = end - start;
368:
369: if (st->paused) {
370: st->paused = CMD_END != cmd;
371: return;
372: }
373:
374: switch (cmd) {
375: case (CMD_POD):
376: break;
377: case (CMD_HEAD1):
378: /*
379: * The behaviour of head= follows from a quick glance at
380: * how pod2man handles it.
381: */
382: printf(".Sh ");
383: st->isname = 0;
384: if (end - start == 4)
385: if (0 == memcmp(&buf[start], "NAME", 4))
386: st->isname = 1;
387: formatcodeln(buf, &start, end, 1);
388: putchar('\n');
389: st->haspar = 1;
390: break;
391: case (CMD_HEAD2):
392: printf(".Ss ");
393: formatcodeln(buf, &start, end, 1);
394: putchar('\n');
395: st->haspar = 1;
396: break;
397: case (CMD_HEAD3):
398: puts(".Pp");
399: printf(".Em ");
400: formatcodeln(buf, &start, end, 0);
401: putchar('\n');
402: puts(".Pp");
403: st->haspar = 1;
404: break;
405: case (CMD_HEAD4):
406: puts(".Pp");
407: printf(".No ");
408: formatcodeln(buf, &start, end, 0);
409: putchar('\n');
410: puts(".Pp");
411: st->haspar = 1;
412: break;
413: case (CMD_OVER):
414: /*
415: * TODO: we should be doing this after we process the
416: * first =item to see whether we'll do an -enum,
417: * -bullet, or something else.
418: */
419: puts(".Bl -tag -width Ds");
420: break;
421: case (CMD_ITEM):
422: printf(".It ");
423: formatcodeln(buf, &start, end, 0);
424: putchar('\n');
425: st->haspar = 1;
426: break;
427: case (CMD_BACK):
428: puts(".El");
429: break;
430: case (CMD_BEGIN):
431: /*
432: * We disregard all types for now.
433: * TODO: process at least "text" in a -literal block.
434: */
435: st->paused = 1;
436: break;
437: case (CMD_FOR):
438: /*
439: * We ignore all types of encodings and formats
440: * unilaterally.
441: */
442: break;
443: case (CMD_ENCODING):
444: break;
445: case (CMD_CUT):
446: st->parsing = 0;
447: return;
448: default:
449: abort();
450: }
451:
452: /* Any command (but =cut) makes us start parsing. */
453: st->parsing = 1;
454: }
455:
456: /*
457: * Just pump out the line in a verbatim block.
458: */
459: static void
460: verbatim(struct state *st, const char *buf, size_t start, size_t end)
461: {
462:
463: if ( ! st->parsing || st->paused)
464: return;
465:
466: puts(".Bd -literal");
467: printf("%.*s\n", (int)(end - start), &buf[start]);
468: puts(".Ed");
469: }
470:
471: /*
472: * Ordinary paragraph.
473: * Well, this is really the hardest--POD seems to assume that, for
474: * example, a leading space implies a newline, and so on.
475: * Lots of other snakes in the grass: escaping a newline followed by a
476: * period (accidental mdoc(7) control), double-newlines after macro
477: * passages, etc.
478: */
479: static void
480: ordinary(struct state *st, const char *buf, size_t start, size_t end)
481: {
482: int last;
483: size_t i, j;
484:
485: if ( ! st->parsing || st->paused)
486: return;
487:
488: /*
489: * Special-case: the NAME section.
490: * If we find a "-" when searching from the end, assume that
491: * we're in "name - description" format.
492: * To wit, print out a "Nm" and "Nd" in that format.
493: */
494: if (st->isname) {
495: for (i = end - 1; i > start; i--)
496: if ('-' == buf[i])
497: break;
498: if ('-' == buf[i]) {
499: j = i;
500: /* Roll over multiple "-". */
501: for ( ; i > start; i--)
502: if ('-' != buf[i])
503: break;
504: printf(".Nm %.*s\n",
505: (int)((i + 1) - start), &buf[start]);
506: printf(".Nd %.*s\n",
507: (int)(end - (j + 1)), &buf[j + 1]);
508: return;
509: }
510: }
511:
512: if ( ! st->haspar)
513: puts(".Pp");
514:
515: st->haspar = 0;
516: last = '\n';
517:
518: while (start < end) {
519: /*
520: * Loop til we get either to a newline or escape.
521: * Escape initial control characters.
522: */
523: while (start < end) {
524: if (start < end - 1 && '<' == buf[start + 1])
525: break;
526: else if ('\n' == buf[start])
527: break;
528: else if ('\n' == last && '.' == buf[start])
529: printf("\\&");
530: else if ('\n' == last && '\'' == buf[start])
531: printf("\\&");
532: putchar(last = buf[start++]);
533: }
534:
535: if (start < end - 1 && '<' == buf[start + 1]) {
536: /*
537: * We've encountered a format code.
538: * This is going to trigger a macro no matter
539: * what, so print a newline now.
540: * Then print the (possibly nested) macros and
541: * following that, a newline.
542: */
543: if (formatcode(buf, &start, end, 0, last, 0))
544: putchar(last = '\n');
545: } else if (start < end && '\n' == buf[start]) {
546: /*
547: * Print the newline only if we haven't already
548: * printed a newline.
549: */
550: if (last != '\n')
551: putchar(last = buf[start]);
552: if (++start >= end)
553: continue;
554: /*
555: * If we have whitespace next, eat it to prevent
556: * mdoc(7) from thinking that it's meant for
557: * verbatim text.
558: * It is--but if we start with that, we can't
559: * have a macro subsequent it, which may be
560: * possible if we have an escape next.
561: */
562: if (' ' == buf[start] || '\t' == buf[start]) {
563: puts(".br");
564: last = '\n';
565: }
566: for ( ; start < end; start++)
567: if (' ' != buf[start] && '\t' != buf[start])
568: break;
569: } else if (start < end) {
570: /*
571: * Default: print the character.
572: * Escape initial control characters.
573: */
574: if ('\n' == last && '.' == buf[start])
575: printf("\\&");
576: else if ('\n' == last && '\'' == buf[start])
577: printf("\\&");
578: putchar(last = buf[start++]);
579: }
580: }
581:
582: if (last != '\n')
583: putchar('\n');
584: }
585:
/*
 * Dispatch one paragraph on its first character: '=' introduces a
 * command, a space or tab introduces a verbatim paragraph, and
 * anything else is an ordinary paragraph.
 * Empty paragraphs are ignored.
 */
static void
dopar(struct state *st, const char *buf, size_t start, size_t end)
{

	if (start == end)
		return;

	switch (buf[start]) {
	case ('='):
		command(st, buf, start, end);
		break;
	case (' '):
		/* FALLTHROUGH */
	case ('\t'):
		verbatim(st, buf, start, end);
		break;
	default:
		ordinary(st, buf, start, end);
		break;
	}
}
604:
605: /*
606: * Loop around paragraphs within a document, processing each one in the
607: * POD way.
608: */
609: static void
610: dofile(const struct args *args, const char *fname,
611: const struct tm *tm, const char *buf, size_t sz)
612: {
613: size_t sup, end, i, cur = 0;
614: struct state st;
615: const char *section, *date;
616: char datebuf[64];
617: char *title, *cp;
618:
619: if (0 == sz)
620: return;
621:
622: /* Title is last path component of the filename. */
623:
624: if (NULL != args->title)
625: title = strdup(args->title);
626: else if (NULL != (cp = strrchr(fname, '/')))
627: title = strdup(cp + 1);
628: else
629: title = strdup(fname);
630:
631: if (NULL == title) {
632: perror(NULL);
633: exit(EXIT_FAILURE);
634: }
635:
636: /* Section is 1 unless suffix is "pm". */
637:
638: if (NULL == (section = args->section)) {
639: section = "1";
640: if (NULL != (cp = strrchr(title, '.'))) {
641: *cp++ = '\0';
642: if (0 == strcmp(cp, "pm"))
643: section = "3p";
644: }
645: }
646:
647: /* Date. Or the given "tm" if not supplied. */
648:
649: if (NULL == (date = args->date)) {
650: strftime(datebuf, sizeof(datebuf), "%B %d, %Y", tm);
651: date = datebuf;
652: }
653:
654: for (cp = title; '\0' != *cp; cp++)
655: *cp = toupper((int)*cp);
656:
657: /* The usual mdoc(7) preamble. */
658:
659: printf(".Dd %s\n", date);
660: printf(".Dt %s %s\n", title, section);
661: puts(".Os");
662:
663: free(title);
664:
665: memset(&st, 0, sizeof(struct state));
666: assert(sz > 0);
667:
668: /* Main loop over file contents. */
669:
670: while (cur < sz) {
671: /* Read until next paragraph. */
672: for (i = cur + 1; i < sz; i++)
673: if ('\n' == buf[i] && '\n' == buf[i - 1]) {
674: /* Consume blank paragraphs. */
675: while (i + 1 < sz && '\n' == buf[i + 1])
676: i++;
677: break;
678: }
679:
680: /* Adjust end marker for EOF. */
681: end = i < sz ? i - 1 :
682: ('\n' == buf[sz - 1] ? sz - 1 : sz);
683: sup = i < sz ? end + 2 : sz;
684:
685: /* Process paragraph and adjust start. */
686: dopar(&st, buf, cur, end);
687: cur = sup;
688: }
689: }
690:
/*
 * Read a single file fully into memory.
 * If the file is "-", do it from stdin.
 * If successfully read, send the input buffer to dofile() for further
 * processing.
 * The date handed to dofile() is the file's modification time, or the
 * current time for stdin or if the modification time is unavailable.
 * Returns 1 on success and 0 if the file could not be opened or read.
 * Exits the program if memory cannot be allocated.
 */
static int
readfile(const struct args *args, const char *fname)
{
	int		 fd, rc;
	char		*buf;
	size_t		 bufsz, cur;
	ssize_t		 ssz;
	struct tm	*tm;
	time_t		 ttm;
	struct stat	 st;

	assert(NULL != fname);

	fd = 0 != strcmp("-", fname) ?
		open(fname, O_RDONLY, 0) : STDIN_FILENO;

	if (-1 == fd) {
		perror(fname);
		return(0);
	}

	tm = NULL;
	if (STDIN_FILENO != fd && -1 != fstat(fd, &st))
		tm = localtime(&st.st_mtime);
	if (NULL == tm) {
		ttm = time(NULL);
		tm = localtime(&ttm);
	}

	/*
	 * Arbitrarily-sized initial buffer.
	 * Should be big enough for most files...
	 */
	cur = 0;
	bufsz = 1 << 14;
	if (NULL == (buf = malloc(bufsz))) {
		perror(NULL);
		exit(EXIT_FAILURE);
	}

	while ((ssz = read(fd, buf + cur, bufsz - cur)) > 0) {
		/* Double buffer size on fill. */
		if ((size_t)ssz == bufsz - cur)  {
			bufsz *= 2;
			if (NULL == (buf = realloc(buf, bufsz))) {
				perror(NULL);
				exit(EXIT_FAILURE);
			}
		}
		cur += (size_t)ssz;
	}

	rc = 0;
	if (ssz < 0)
		perror(fname);
	else {
		dofile(args, STDIN_FILENO == fd ?
			"STDIN" : fname, tm, buf, cur);
		rc = 1;
	}

	/* Release the buffer and the descriptor on every path. */
	free(buf);
	if (STDIN_FILENO != fd)
		close(fd);
	return(rc);
}
759:
760: int
761: main(int argc, char *argv[])
762: {
763: const char *fname, *name;
764: struct args args;
765: int c;
766:
767: name = strrchr(argv[0], '/');
768: if (name == NULL)
769: name = argv[0];
770: else
771: ++name;
772:
773: memset(&args, 0, sizeof(struct args));
774: fname = "-";
775:
776: /* Accept no arguments for now. */
777:
778: while (-1 != (c = getopt(argc, argv, "c:d:hln:oq:rs:uv")))
779: switch (c) {
780: case ('h'):
781: /* FALLTHROUGH */
782: case ('l'):
783: /* FALLTHROUGH */
784: case ('c'):
785: /* FALLTHROUGH */
786: case ('o'):
787: /* FALLTHROUGH */
788: case ('q'):
789: /* FALLTHROUGH */
790: case ('r'):
791: /* FALLTHROUGH */
792: case ('u'):
793: /* FALLTHROUGH */
794: case ('v'):
795: /* Ignore these. */
796: break;
797: case ('d'):
798: args.date = optarg;
799: break;
800: case ('n'):
801: args.title = optarg;
802: break;
803: case ('s'):
804: args.section = optarg;
805: break;
806: default:
807: goto usage;
808: }
809:
810: argc -= optind;
811: argv += optind;
812:
813: /* Accept only a single input file. */
814:
815: if (argc > 2)
816: return(EXIT_FAILURE);
817: else if (1 == argc)
818: fname = *argv;
819:
820: return(readfile(&args, fname) ?
821: EXIT_SUCCESS : EXIT_FAILURE);
822:
823: usage:
824: fprintf(stderr, "usage: %s [-d date] "
825: "[-n title] [-s section]\n", name);
826:
827: return(EXIT_FAILURE);
828: }
CVSweb