Annotation of docbook2mdoc/docbook2mdoc.c, Revision 1.2
1.2 ! kristaps 1: /* $Id: docbook2mdoc.c,v 1.1.1.1 2014/03/28 02:04:47 kristaps Exp $ */
1.1 kristaps 2: /*
3: * Copyright (c) 2014 Kristaps Dzonsons <kristaps@bsd.lv>
4: *
5: * Permission to use, copy, modify, and distribute this software for any
6: * purpose with or without fee is hereby granted, provided that the above
7: * copyright notice and this permission notice appear in all copies.
8: *
9: * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
10: * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11: * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
12: * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13: * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14: * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15: * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16: */
#include <sys/queue.h>

#include <assert.h>
#include <ctype.h>
#include <expat.h>
#include <fcntl.h>
#include <getopt.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
27:
/*
 * Identifiers for every node type we recognise.
 * The values double as indices into the nodes[] table, so the order
 * here and the order of that table must be kept in step.
 */
enum	nodeid {
	/* Must come first: also means "not inside a tree". */
	NODE_ROOT = 0,

	/* Alphabetically ordered from here on. */
	NODE_CITEREFENTRY,
	NODE_CODE,
	NODE_FUNCSYNOPSIS,
	NODE_FUNCSYNOPSISINFO,
	NODE_MANVOLNUM,
	NODE_PARA,
	NODE_PROGRAMLISTING,
	NODE_REFCLASS,
	NODE_REFDESCRIPTOR,
	NODE_REFENTRY,
	NODE_REFENTRYTITLE,
	NODE_REFMETA,
	NODE_REFMISCINFO,
	NODE_REFNAME,
	NODE_REFNAMEDIV,
	NODE_REFPURPOSE,
	NODE_REFSECT1,
	NODE_REFSYNOPSISDIV,
	NODE_SYNOPSIS,
	NODE_TEXT,
	NODE_TITLE,

	/* Number of node types; not a node itself. */
	NODE__MAX
};
57:
58: /*
59: * Global parse state.
60: * Keep this as simple and small as possible.
61: */
62: struct parse {
63: enum nodeid node; /* current (NODE_ROOT if pre-tree) */
64: int stop; /* should we stop now? */
65: struct pnode *root; /* root of parse tree */
66: struct pnode *cur; /* current node in tree */
67: char *b;
68: size_t bsz;
69: size_t mbsz;
70: };
71:
/*
 * Static description of one node type: its XML element name and the
 * flags governing how its content is handled.
 */
struct	node {
	const char	*name;	/* element name (NULL if it has none) */
	unsigned int	 flags;	/* bit-wise OR of the NODE_* flags below */
#define	NODE_IGNTEXT	 1	/* ignore all contained text */
};
77:
78: TAILQ_HEAD(pnodeq, pnode);
79:
80: struct pnode {
81: enum nodeid node; /* node type */
82: char *b; /* binary data buffer */
83: size_t bsz; /* data buffer size */
84: struct pnode *parent; /* parent (or NULL if top) */
85: struct pnodeq childq; /* queue of children */
86: TAILQ_ENTRY(pnode) child;
87: };
88:
89: static const struct node nodes[NODE__MAX] = {
90: { NULL, 0 },
91: { "citerefentry", NODE_IGNTEXT },
92: { "code", 0 },
93: { "funcsynopsis", NODE_IGNTEXT },
94: { "funcsynopsisinfo", 0 },
95: { "manvolnum", 0 },
96: { "para", 0 },
97: { "programlisting", 0 },
98: { "refclass", NODE_IGNTEXT },
99: { "refdescriptor", NODE_IGNTEXT },
100: { "refentry", NODE_IGNTEXT },
101: { "refentrytitle", 0 },
102: { "refmeta", NODE_IGNTEXT },
103: { "refmiscinfo", NODE_IGNTEXT },
104: { "refname", 0 },
105: { "refnamediv", NODE_IGNTEXT },
106: { "refpurpose", 0 },
107: { "refsect1", 0 },
108: { "refsynopsisdiv", NODE_IGNTEXT },
109: { "synopsis", 0 },
110: { NULL, 0 },
111: { "title", 0 },
112: };
113:
114: /*
115: * Look up whether "parent" is a valid parent for "node".
116: */
117: static int
118: isparent(enum nodeid node, enum nodeid parent)
119: {
120:
121: switch (node) {
122: case (NODE_ROOT):
123: return(0);
124: case (NODE_CITEREFENTRY):
125: switch (parent) {
126: case (NODE_FUNCSYNOPSISINFO):
127: case (NODE_PARA):
128: case (NODE_PROGRAMLISTING):
129: case (NODE_REFDESCRIPTOR):
130: case (NODE_REFENTRYTITLE):
131: case (NODE_REFNAME):
132: case (NODE_REFPURPOSE):
133: case (NODE_SYNOPSIS):
134: case (NODE_TITLE):
135: return(1);
136: default:
137: break;
138: }
139: return(0);
140: case (NODE_CODE):
141: switch (parent) {
142: case (NODE_FUNCSYNOPSISINFO):
143: case (NODE_PARA):
144: case (NODE_PROGRAMLISTING):
145: case (NODE_REFDESCRIPTOR):
146: case (NODE_REFENTRYTITLE):
147: case (NODE_REFNAME):
148: case (NODE_REFPURPOSE):
149: case (NODE_SYNOPSIS):
150: case (NODE_TITLE):
151: return(1);
152: default:
153: break;
154: }
155: return(0);
156: case (NODE_MANVOLNUM):
157: switch (parent) {
158: case (NODE_CITEREFENTRY):
159: case (NODE_REFMETA):
160: return(1);
161: default:
162: break;
163: }
164: return(0);
165: case (NODE_FUNCSYNOPSIS):
166: switch (parent) {
167: case (NODE_PARA):
168: case (NODE_REFSECT1):
169: case (NODE_REFSYNOPSISDIV):
170: return(1);
171: default:
172: break;
173: }
174: return(0);
175: case (NODE_FUNCSYNOPSISINFO):
176: return(NODE_FUNCSYNOPSIS == parent);
177: case (NODE_PARA):
178: switch (parent) {
179: case (NODE_REFSECT1):
180: case (NODE_REFSYNOPSISDIV):
181: return(1);
182: default:
183: break;
184: }
185: return(0);
186: case (NODE_PROGRAMLISTING):
187: switch (parent) {
188: case (NODE_PARA):
189: case (NODE_REFSECT1):
190: case (NODE_REFSYNOPSISDIV):
191: return(1);
192: default:
193: break;
194: }
195: return(0);
196: case (NODE_REFCLASS):
197: return(parent == NODE_REFNAMEDIV);
198: case (NODE_REFDESCRIPTOR):
199: return(parent == NODE_REFNAMEDIV);
200: case (NODE_REFENTRY):
201: return(parent == NODE_ROOT);
202: case (NODE_REFENTRYTITLE):
203: switch (parent) {
204: case (NODE_CITEREFENTRY):
205: case (NODE_REFMETA):
206: return(1);
207: default:
208: break;
209: }
210: case (NODE_REFMETA):
211: return(parent == NODE_REFENTRY);
212: case (NODE_REFMISCINFO):
213: return(parent == NODE_REFMETA);
214: case (NODE_REFNAME):
215: return(parent == NODE_REFNAMEDIV);
216: case (NODE_REFNAMEDIV):
217: return(parent == NODE_REFENTRY);
218: case (NODE_REFPURPOSE):
219: return(parent == NODE_REFNAMEDIV);
220: case (NODE_REFSECT1):
221: return(parent == NODE_REFENTRY);
222: case (NODE_REFSYNOPSISDIV):
223: return(parent == NODE_REFENTRY);
224: case (NODE_SYNOPSIS):
225: switch (parent) {
226: case (NODE_REFSYNOPSISDIV):
227: case (NODE_REFSECT1):
228: return(1);
229: default:
230: break;
231: }
232: return(0);
233: case (NODE_TITLE):
234: switch (parent) {
235: case (NODE_REFSECT1):
236: case (NODE_REFSYNOPSISDIV):
237: return(1);
238: default:
239: break;
240: }
241: return(0);
242: case (NODE_TEXT):
243: return(1);
244: case (NODE__MAX):
245: break;
246: }
247:
248: abort();
249: return(0);
250: }
251:
252: static void
253: xml_char(void *arg, const XML_Char *p, int sz)
254: {
255: struct parse *ps = arg;
256: struct pnode *dat;
257:
258: /* Stopped or no tree yet. */
259: if (ps->stop || NODE_ROOT == ps->node)
260: return;
261:
262: /* Not supposed to be collecting text. */
263: assert(NULL != ps->cur);
264: if (NODE_IGNTEXT & nodes[ps->node].flags)
265: return;
266:
267: /*
268: * Are we in the midst of processing text?
269: * If we're not processing text right now, then create a text
270: * node for doing so.
271: */
272: if (NODE_TEXT != ps->node) {
273: dat = calloc(1, sizeof(struct pnode));
274: if (NULL == dat) {
275: perror(NULL);
276: exit(EXIT_FAILURE);
277: }
278:
279: dat->node = ps->node = NODE_TEXT;
280: dat->parent = ps->cur;
281: TAILQ_INIT(&dat->childq);
282: TAILQ_INSERT_TAIL(&ps->cur->childq, dat, child);
283: ps->cur = dat;
284: assert(NULL != ps->root);
285:
286: }
287:
288: /* Append to current buffer. */
289: assert(sz >= 0);
290: ps->cur->b = realloc(ps->cur->b,
291: ps->cur->bsz + (size_t)sz);
292: if (NULL == ps->cur->b) {
293: perror(NULL);
294: exit(EXIT_FAILURE);
295: }
296: memcpy(ps->cur->b + ps->cur->bsz, p, sz);
297: ps->cur->bsz += (size_t)sz;
298: }
299:
300: /*
301: * Begin an element.
302: * First, look for the element.
303: * If we don't find it and we're not parsing, keep going.
304: * If we don't find it (and we're parsing), puke and exit.
305: * If we find it but we're not parsing yet (i.e., it's not a refentry
306: * and thus out of context), keep going.
307: * If we're at the root and already have a tree, puke and exit.
308: * Make sure that the element is in the right context.
309: * Lastly, put the node onto our parse tree and continue.
310: */
311: static void
312: xml_elem_start(void *arg, const XML_Char *name, const XML_Char **atts)
313: {
314: struct parse *ps = arg;
315: enum nodeid node;
316: struct pnode *dat;
317:
318: if (ps->stop)
319: return;
320:
321: /* Close out text node, if applicable... */
322: if (NODE_TEXT == ps->node) {
323: assert(NULL != ps->cur);
324: ps->cur = ps->cur->parent;
325: assert(NULL != ps->cur);
326: ps->node = ps->cur->node;
327: }
328:
329: for (node = 0; node < NODE__MAX; node++)
330: if (NULL == nodes[node].name)
331: continue;
332: else if (0 == strcmp(nodes[node].name, name))
333: break;
334:
335: if (NODE__MAX == node && NODE_ROOT == ps->node) {
336: fprintf(stderr, "%s: ignoring node\n", name);
337: return;
338: } else if (NODE__MAX == node) {
339: fprintf(stderr, "%s: unknown node\n", name);
340: ps->stop = 1;
341: return;
342: } else if (NODE_ROOT == ps->node && NULL != ps->root) {
343: fprintf(stderr, "%s: reentering?\n", name);
344: ps->stop = 1;
345: return;
346: } else if (NODE_ROOT == ps->node && NODE_REFENTRY != node) {
347: fprintf(stderr, "%s: known node w/o context\n", name);
348: return;
349: } else if ( ! isparent(node, ps->node)) {
350: fprintf(stderr, "%s: bad parent\n", name);
351: ps->stop = 1;
352: return;
353: }
354:
355: if (NULL == (dat = calloc(1, sizeof(struct pnode)))) {
356: perror(NULL);
357: exit(EXIT_FAILURE);
358: }
359:
360: dat->node = ps->node = node;
361: dat->parent = ps->cur;
362: TAILQ_INIT(&dat->childq);
363:
364: if (NULL != ps->cur)
365: TAILQ_INSERT_TAIL(&ps->cur->childq, dat, child);
366:
367: ps->cur = dat;
368: if (NULL == ps->root)
369: ps->root = dat;
370: }
371:
372: /*
373: * Roll up the parse tree.
374: * Does nothing else special.
375: * If we hit the root, then assign ourselves as the NODE_ROOT.
376: */
377: static void
378: xml_elem_end(void *arg, const XML_Char *name)
379: {
380: struct parse *ps = arg;
381:
382: if (ps->stop || NODE_ROOT == ps->node)
383: return;
384:
385: /* Close out text node, if applicable... */
386: if (NODE_TEXT == ps->node) {
387: assert(NULL != ps->cur);
388: ps->cur = ps->cur->parent;
389: assert(NULL != ps->cur);
390: ps->node = ps->cur->node;
391: }
392:
393: if (NULL == (ps->cur = ps->cur->parent))
394: ps->node = NODE_ROOT;
395: else
396: ps->node = ps->cur->node;
397: }
398:
399: static void
400: pnode_free(struct pnode *pn)
401: {
402: struct pnode *pp;
403:
404: if (NULL == pn)
405: return;
406:
407: while (NULL != (pp = TAILQ_FIRST(&pn->childq))) {
408: TAILQ_REMOVE(&pn->childq, pp, child);
409: pnode_free(pp);
410: }
411:
412: free(pn->b);
413: free(pn);
414: }
415:
416: static void
417: pnode_unlink(struct pnode *pn)
418: {
419:
420: if (NULL != pn->parent)
421: TAILQ_REMOVE(&pn->parent->childq, pn, child);
422: pnode_free(pn);
423: }
424:
425: static void
426: bufclear(struct parse *p)
427: {
428:
429: p->b[p->bsz = 0] = '\0';
430: }
431:
432: static void
433: bufappend(struct parse *p, struct pnode *pn)
434: {
435:
436: assert(NODE_TEXT == pn->node);
437: if (p->bsz + pn->bsz + 1 > p->mbsz) {
438: p->mbsz = p->bsz + pn->bsz + 1;
439: if (NULL == (p->b = realloc(p->b, p->mbsz))) {
440: perror(NULL);
441: exit(EXIT_FAILURE);
442: }
443: }
444: memcpy(p->b + p->bsz, pn->b, pn->bsz);
445: p->bsz += pn->bsz;
446: p->b[p->bsz] = '\0';
447: }
448:
449: /*
450: * Print text presumably on a macro line.
451: * Ignore any child macros.
452: * Convert all whitespace to regular spaces.
453: */
454: static void
455: pnode_printmacrolinepart(struct parse *p, struct pnode *pn)
456: {
457: struct pnode *pp;
458: char *cp;
459:
460: bufclear(p);
461: while (NULL != (pp = TAILQ_FIRST(&pn->childq))) {
462: if (NODE_TEXT == pp->node)
463: bufappend(p, pp);
464: pnode_unlink(pp);
465: }
466:
467: /* Convert all space to spaces. */
468: for (cp = p->b; '\0' != *cp; cp++)
469: if (isspace((int)*cp))
470: *cp = ' ';
471:
472: for (cp = p->b; isspace((int)*cp); cp++)
473: /* Spin. */ ;
474:
475: for ( ; '\0' != *cp; cp++) {
476: /* Escape us if we look like a macro. */
477: if ((cp == p->b || ' ' == *(cp - 1)) &&
478: isupper((int)*cp) &&
479: '\0' != *(cp + 1) &&
480: islower((int)*(cp + 1)) &&
481: ('\0' == *(cp + 2) ||
482: ' ' == *(cp + 2) ||
483: (islower((int)*(cp + 2)) &&
484: ('\0' == *(cp + 3) ||
485: ' ' == *(cp + 3)))))
486: fputs("\\&", stdout);
487: putchar(*cp);
488: /* If we're a character escape, escape us. */
489: if ('\\' == *cp)
490: putchar('e');
491: }
492: }
493:
/*
 * Print the text of "pn" as pnode_printmacrolinepart() does, then
 * end the line.
 * With no text at all, only the newline is printed.
 */
static void
pnode_printmacroline(struct parse *p, struct pnode *pn)
{

	pnode_printmacrolinepart(p, pn);
	putchar('\n');
}
505:
506: static void
507: pnode_printrefsect(struct parse *p, struct pnode *pn)
508: {
509: struct pnode *pp;
510:
511: TAILQ_FOREACH(pp, &pn->childq, child)
512: if (NODE_TITLE == pp->node)
513: break;
514:
515: if (NULL != pp) {
516: fputs(".Sh ", stdout);
517: pnode_printmacroline(p, pp);
518: pnode_unlink(pp);
519: } else
520: puts(".Sh UNKNOWN");
521: }
522:
523: static void
524: pnode_printciterefentry(struct parse *p, struct pnode *pn)
525: {
526: struct pnode *pp, *title, *manvol;
527:
528: title = manvol = NULL;
529: TAILQ_FOREACH(pp, &pn->childq, child)
530: if (NODE_MANVOLNUM == pp->node)
531: manvol = pp;
532: else if (NODE_REFENTRYTITLE == pp->node)
533: title = pp;
534:
535: fputs(".Xr ", stdout);
536: if (NULL != title) {
537: pnode_printmacrolinepart(p, title);
538: pnode_unlink(title);
539: } else
540: fputs("unknown", stdout);
541: putchar(' ');
542: if (NULL != manvol) {
543: pnode_printmacroline(p, manvol);
544: pnode_unlink(manvol);
545: } else
546: puts("1");
547: }
548:
549: static void
550: pnode_printrefmeta(struct parse *p, struct pnode *pn)
551: {
552: struct pnode *pp, *title, *manvol;
553:
554: title = manvol = NULL;
555: TAILQ_FOREACH(pp, &pn->childq, child)
556: if (NODE_MANVOLNUM == pp->node)
557: manvol = pp;
558: else if (NODE_REFENTRYTITLE == pp->node)
559: title = pp;
560:
1.2 ! kristaps 561: puts(".Dd $Mdocdate" "$");
1.1 kristaps 562: fputs(".Dt ", stdout);
563:
564: if (NULL != title) {
565: pnode_printmacrolinepart(p, title);
566: pnode_unlink(title);
567: } else
568: fputs("UNKNOWN", stdout);
569: putchar(' ');
570: if (NULL != manvol) {
571: pnode_printmacroline(p, manvol);
572: pnode_unlink(manvol);
573: } else
574: puts("1");
575:
576: puts(".Os");
577: }
578:
579: /*
580: * Print a parsed node (or ignore it--whatever).
581: * This is a recursive function.
582: * FIXME: macro line continuation?
583: */
584: static void
585: pnode_print(struct parse *p, struct pnode *pn)
586: {
587: struct pnode *pp;
588: char *cp;
589: int last;
590:
591: if (NULL == pn)
592: return;
593:
594: if (NODE_TEXT != pn->node && NODE_ROOT != pn->node)
595: printf(".\\\" %s\n", nodes[pn->node].name);
596:
597: switch (pn->node) {
598: case (NODE_CITEREFENTRY):
599: pnode_printciterefentry(p, pn);
600: break;
601: case (NODE_CODE):
602: fputs(".Li ", stdout);
603: pnode_printmacroline(p, pn);
604: break;
605: case (NODE_FUNCSYNOPSISINFO):
606: fputs(".Fd ", stdout);
607: pnode_printmacroline(p, pn);
608: break;
609: case (NODE_PARA):
610: /* FIXME: not always. */
611: puts(".Pp");
612: break;
613: case (NODE_PROGRAMLISTING):
614: puts(".Bd -literal");
615: break;
616: case (NODE_REFMETA):
617: pnode_printrefmeta(p, pn);
618: break;
619: case (NODE_REFNAME):
620: fputs(".Nm ", stdout);
621: pnode_printmacroline(p, pn);
622: return;
623: case (NODE_REFNAMEDIV):
624: puts(".Sh NAME");
625: break;
626: case (NODE_REFPURPOSE):
627: fputs(".Nd ", stdout);
628: pnode_printmacroline(p, pn);
629: return;
630: case (NODE_REFSYNOPSISDIV):
631: puts(".Sh SYNOPSIS");
632: break;
633: case (NODE_REFSECT1):
634: pnode_printrefsect(p, pn);
635: break;
636: case (NODE_TEXT):
637: bufclear(p);
638: bufappend(p, pn);
639: /*
640: * Output all characters, squeezing out whitespace
641: * between newlines.
642: * XXX: all whitespace, including tabs (?).
643: * Remember to escape control characters and escapes.
644: */
645: for (last = '\n', cp = p->b; '\0' != *cp; ) {
646: if ('\n' == last) {
647: /* Consume all whitespace. */
648: if (isspace((int)*cp)) {
649: while (isspace((int)*cp))
650: cp++;
651: continue;
652: } else if ('\'' == *cp || '.' == *cp)
653: fputs("\\&", stdout);
654: }
655: putchar(last = *cp++);
656: /* If we're a character escape, escape us. */
657: if ('\\' == last)
658: putchar('e');
659: }
660: if ('\n' != last)
661: putchar('\n');
662: break;
663: default:
664: break;
665: }
666:
667: TAILQ_FOREACH(pp, &pn->childq, child)
668: pnode_print(p, pp);
669:
670: switch (pn->node) {
671: case (NODE_PROGRAMLISTING):
672: puts(".Ed");
673: break;
674: default:
675: break;
676: }
677: }
678:
679: /*
680: * Loop around the read buffer until we've drained it of all data.
681: * Invoke the parser context with each buffer fill.
682: */
683: static int
684: readfile(XML_Parser xp, int fd,
685: char *b, size_t bsz, const char *fn)
686: {
687: struct parse p;
688: int rc;
689: ssize_t ssz;
690:
691: memset(&p, 0, sizeof(struct parse));
692:
693: p.b = malloc(p.bsz = p.mbsz = 1024);
694:
695: XML_SetCharacterDataHandler(xp, xml_char);
696: XML_SetElementHandler(xp, xml_elem_start, xml_elem_end);
697: XML_SetUserData(xp, &p);
698:
699: while ((ssz = read(fd, b, bsz)) >= 0) {
700: if (0 == (rc = XML_Parse(xp, b, ssz, 0 == ssz)))
701: fprintf(stderr, "%s: %s\n", fn,
702: XML_ErrorString
703: (XML_GetErrorCode(xp)));
704: else if ( ! p.stop && ssz > 0)
705: continue;
706: /*
707: * Exit when we've read all or errors have occured
708: * during the parse sequence.
709: */
710: pnode_print(&p, p.root);
711: pnode_free(p.root);
712: free(p.b);
713: return(0 != rc && ! p.stop);
714: }
715:
716: /* Read error has occured. */
717: perror(fn);
718: pnode_free(p.root);
719: free(p.b);
720: return(0);
721: }
722:
723: int
724: main(int argc, char *argv[])
725: {
726: XML_Parser xp;
727: const char *fname;
728: char *buf;
729: int fd, rc;
730:
731: fname = "-";
732: xp = NULL;
733: buf = NULL;
734: rc = 0;
735:
736: if (-1 != getopt(argc, argv, ""))
737: return(EXIT_FAILURE);
738:
739: argc -= optind;
740: argv += optind;
741:
742: if (argc > 1)
743: return(EXIT_FAILURE);
744: else if (argc > 0)
745: fname = argv[0];
746:
747: /* Read from stdin or a file. */
748: fd = 0 == strcmp(fname, "-") ?
749: STDIN_FILENO : open(fname, O_RDONLY, 0);
750:
751: /*
752: * Open file for reading.
753: * Allocate a read buffer.
754: * Create the parser context.
755: * Dive directly into the parse.
756: */
757: if (-1 == fd)
758: perror(fname);
759: else if (NULL == (buf = malloc(4096)))
760: perror(NULL);
761: else if (NULL == (xp = XML_ParserCreate(NULL)))
762: perror(NULL);
763: else if ( ! readfile(xp, fd, buf, 4096, fname))
764: rc = 1;
765:
766: XML_ParserFree(xp);
767: free(buf);
768: if (STDIN_FILENO != fd)
769: close(fd);
770: return(rc ? EXIT_SUCCESS : EXIT_FAILURE);
771: }
CVSweb