Annotation of docbook2mdoc/statistics.c, Revision 1.25
1.25 ! schwarze 1: /* $Id: statistics.c,v 1.24 2019/04/14 14:00:17 schwarze Exp $ */
1.1 schwarze 2: /*
3: * Copyright (c) 2019 Ingo Schwarze <schwarze@openbsd.org>
4: *
5: * Permission to use, copy, modify, and distribute this software for any
6: * purpose with or without fee is hereby granted, provided that the above
7: * copyright notice and this permission notice appear in all copies.
8: *
9: * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
10: * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11: * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
12: * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13: * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14: * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15: * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16: */
17: #include <assert.h>
18: #include <ctype.h>
19: #include <err.h>
20: #include <fcntl.h>
1.2 schwarze 21: #include <getopt.h>
1.1 schwarze 22: #include <stdio.h>
23: #include <stdlib.h>
24: #include <string.h>
25: #include <unistd.h>
26:
27: /*
28: * Count parent-child element relations in a corpus of DocBook documents.
29: *
30: * Read absolute or relative input file names from standard input,
31: * one per line.
32: * For each parent-child relation, print the total number of occurrences,
33: * the parent name, and the child name, separated by tab characters
34: * and followed by a newline character.
35: *
36: * Typical usage:
37: * statistics < filenames.txt | sort -n
38: * statistics < filenames.txt | grep '\<listitem\>' | sort -n
1.4 schwarze 39: *
40: * Relations already fully implemented are excluded by default.
41: * The option -a shows all relations.
42: *
43: * If two arguments (parent and child) are given, a histogram
44: * of the number of children of that kind per parent element is printed
45: * in addition to the normal output.
46: *
47: * Example usage:
48: * statistics tgroup colspec < filenames.txt | grep colspec
1.25 ! schwarze 49: *
! 50: * Synchronized with parse.c up to rev. 1.42.
1.1 schwarze 51: */
52:
/*
 * One parent-child relation and the number of times it was seen.
 */
struct entry {
	char	*parent;	/* Name of the enclosing element. */
	char	*child;		/* Name of the contained element, */
				/* or NULL as a wildcard while excluding. */
	int	 count;		/* Occurrences; -1 marks an excluded relation. */
};
58:
/* Growable array of counted (or excluded) parent-child relations. */
static struct entry	 *table = NULL;
static size_t		  tablesz = 0;	/* Allocated entries. */
static size_t		  tablei = 0;	/* Entries in use. */

/* Growable stack holding the names of the currently open elements. */
static char		**stack = NULL;
static size_t		  stacksz = 0;	/* Allocated slots. */
static size_t		  stacki = 0;	/* Slots in use. */
66:
/*
 * Number of buckets in the optional parent-child histogram.
 * This must be an integer constant expression because it sizes an
 * array member; a "static const int" object is not one in C, so an
 * enumeration constant is used instead.
 */
enum { nchildsz = 8 };

/*
 * State of the optional histogram requested with two arguments.
 */
struct nchild {
	char	*parent;		/* Parent element name to watch. */
	char	*child;			/* Child element name to watch. */
	int	 freq[nchildsz];	/* freq[k]: parents with k+1 children. */
	int	 count;			/* Children seen in the current parent. */
};

static struct nchild	 nchild;

/* Name of the input file currently being processed. */
static char		*fname;
77:
1.1 schwarze 78:
79: /*
80: * Count one instance of a parent-child relation.
1.2 schwarze 81: * Before the special call table_add(NULL, NULL),
82: * mark relations to not be counted;
83: * in that phase, child can be NULL as a wildcard.
1.1 schwarze 84: */
85: static void
86: table_add(const char *parent, const char *child)
87: {
1.2 schwarze 88: static int init_done;
89: size_t i;
90:
91: if (parent == NULL && child == NULL) {
92: init_done = 1;
93: return;
94: }
1.1 schwarze 95:
1.4 schwarze 96: /* Optional parent-child histogram. */
97:
98: if (init_done && parent != NULL && child != NULL &&
99: nchild.parent != NULL && nchild.child != NULL &&
100: strcmp(parent, nchild.parent) == 0 &&
101: strcmp(child, nchild.child) == 0) {
102: if (nchild.count < nchildsz) {
103: nchild.freq[nchild.count]++;
104: if (nchild.count > 0)
105: nchild.freq[nchild.count - 1]--;
106: } else if (nchild.count == nchildsz)
107: puts(fname);
108: nchild.count++;
109: }
110:
1.1 schwarze 111: /* If the table entry already exists, increment its count. */
112:
113: for (i = 0; i < tablei; i++) {
114: if (strcmp(parent, table[i].parent) == 0 &&
1.2 schwarze 115: (child == NULL || table[i].child == NULL ||
116: strcmp(child, table[i].child) == 0)) {
117: assert(init_done);
118: if (table[i].count != -1)
119: table[i].count++;
1.1 schwarze 120: return;
121: }
122: }
123:
124: /* If the table is full, make room. */
125:
126: if (tablei == tablesz) {
127: tablesz += 64;
128: table = reallocarray(table, tablesz, sizeof(*table));
129: if (table == NULL)
130: err(1, NULL);
131: }
132:
133: /* Add a new entry to the table. */
134:
135: if ((table[tablei].parent = strdup(parent)) == NULL)
136: err(1, NULL);
1.2 schwarze 137: if (child == NULL)
138: table[tablei].child = NULL;
139: else if ((table[tablei].child = strdup(child)) == NULL)
1.1 schwarze 140: err(1, NULL);
1.2 schwarze 141: table[tablei++].count = init_done ? 1 : -1;
1.1 schwarze 142: }
143:
144: /*
145: * Enter an element.
146: */
147: static void
148: stack_push(const char *name)
149: {
1.4 schwarze 150: if (nchild.parent != NULL && strcmp(name, nchild.parent) == 0)
151: nchild.count = 0;
152:
1.1 schwarze 153: if (stacki == stacksz) {
154: stacksz += 8;
155: stack = reallocarray(stack, stacksz, sizeof(*stack));
156: if (stack == NULL)
157: err(1, NULL);
158: }
159: if ((stack[stacki++] = strdup(name)) == NULL)
160: err(1, NULL);
161: }
162:
163: /*
164: * Exit an element.
165: */
166: static void
167: stack_pop(const char *name)
168: {
169: if (stacki > 0 && (name == NULL ||
170: strcmp(name, stack[stacki - 1]) == 0))
171: free(stack[--stacki]);
172: }
173:
/*
 * Simplified version from parse.c.
 *
 * Starting at offset *pend in the buffer b holding rlen bytes, move
 * *pend forward to the first byte contained in charset.  If charset
 * starts with a space character, any whitespace byte stops the scan
 * as well.
 *
 * Return 1 if the end of the data was reached; in that case, b[rlen]
 * is set to NUL, so b needs room for at least rlen + 1 bytes.
 * Return 0 otherwise.
 */
static int
advance(char *b, size_t rlen, size_t *pend, const char *charset)
{
	size_t	 pos;
	int	 stop_at_space;

	stop_at_space = *charset == ' ';
	if (stop_at_space)
		charset++;

	for (pos = *pend; pos < rlen; pos++)
		if ((stop_at_space && isspace((unsigned char)b[pos])) ||
		    strchr(charset, b[pos]) != NULL)
			break;
	*pend = pos;

	if (pos != rlen)
		return 0;
	b[rlen] = '\0';
	return 1;
}
201:
202: /*
203: * Simplified version from parse.c.
204: */
205: static void
206: parse_file(int fd, char *fname)
207: {
208: char b[4096];
1.3 schwarze 209: char *cp;
1.1 schwarze 210: ssize_t rsz; /* Return value from read(2). */
211: size_t rlen; /* Number of bytes in b[]. */
212: size_t poff; /* Parse offset in b[]. */
213: size_t pend; /* Offset of the end of the current word. */
214: int in_tag, in_arg, in_quotes, elem_end;
215:
216: rlen = 0;
217: in_tag = in_arg = in_quotes = 0;
218: while ((rsz = read(fd, b + rlen, sizeof(b) - rlen - 1)) >= 0) {
219: if ((rlen += rsz) == 0)
220: break;
221: pend = 0;
222: for (;;) {
223: if ((poff = pend) == rlen)
224: break;
225: if (isspace((unsigned char)b[pend])) {
226: pend++;
227: continue;
228: }
229: if (in_arg) {
1.5 schwarze 230: if (in_quotes == 0 &&
231: (b[pend] == '\'' || b[pend] == '"')) {
232: in_quotes = b[pend] == '"' ? 2 : 1;
1.1 schwarze 233: pend++;
234: continue;
235: }
236: if (advance(b, rlen, &pend,
1.5 schwarze 237: in_quotes == 2 ? "\"" :
238: in_quotes == 1 ? "'" : " >") && rsz > 0)
1.1 schwarze 239: break;
240: in_arg = in_quotes = elem_end = 0;
241: if (b[pend] == '>') {
242: in_tag = 0;
243: if (pend > 0 && b[pend - 1] == '/') {
244: b[pend - 1] = '\0';
245: elem_end = 1;
246: }
247: }
248: b[pend] = '\0';
249: if (pend < rlen)
250: pend++;
251: if (elem_end)
252: stack_pop(NULL);
253: } else if (in_tag) {
254: if (advance(b, rlen, &pend, " =>") && rsz > 0)
255: break;
256: elem_end = 0;
257: switch (b[pend]) {
258: case '>':
259: in_tag = 0;
260: if (pend > 0 && b[pend - 1] == '/') {
261: b[pend - 1] = '\0';
262: elem_end = 1;
263: }
264: break;
265: case '=':
266: in_arg = 1;
267: break;
268: default:
269: break;
270: }
271: b[pend] = '\0';
272: if (pend < rlen)
273: pend++;
274: if (elem_end)
275: stack_pop(NULL);
276: } else if (b[poff] == '<') {
277: if (advance(b, rlen, &pend, " >") && rsz > 0)
278: break;
1.3 schwarze 279: if (pend > poff + 3 &&
280: strncmp(b + poff, "<!--", 4) == 0) {
281: /* Skip a comment. */
282: cp = strstr(b + pend - 2, "-->");
283: if (cp == NULL) {
284: pend = rlen;
285: if (rsz > 0)
286: break;
287: } else
288: pend = cp + 3 - b;
289: continue;
290: }
1.1 schwarze 291: elem_end = 0;
292: if (b[pend] != '>')
293: in_tag = 1;
294: else if (pend > 0 && b[pend - 1] == '/') {
295: b[pend - 1] = '\0';
296: elem_end = 1;
297: }
298: b[pend] = '\0';
299: if (pend < rlen)
300: pend++;
301: if (b[++poff] == '/') {
302: elem_end = 1;
303: poff++;
304: } else if (b[poff] != '!' && b[poff] != '?') {
305: table_add(stacki > 0 ?
1.2 schwarze 306: stack[stacki - 1] : "ROOT",
1.1 schwarze 307: b + poff);
308: stack_push(b + poff);
1.25 ! schwarze 309: if (strcmp(b + poff, "sbr") == 0)
! 310: elem_end = 1;
1.1 schwarze 311: }
312: if (elem_end)
313: stack_pop(b + poff);
314: } else {
315: advance(b, rlen, &pend, "<");
316: if (stacki > 0)
317: table_add(stack[stacki - 1], "TEXT");
318: }
319: }
320: assert(poff > 0);
321: rlen -= poff;
1.25 ! schwarze 322: memmove(b, b + poff, rlen);
1.1 schwarze 323: }
324: if (rsz < 0)
325: perror(fname);
326: }
327:
328: int
329: main(int argc, char *argv[])
330: {
331: size_t fsz, i;
332: ssize_t rsz;
1.2 schwarze 333: int ch, fd, show_all;
334:
335: show_all = 0;
336: while ((ch = getopt(argc, argv, "a")) != -1) {
337: switch (ch) {
338: case 'a':
339: show_all = 1;
340: break;
341: default:
342: return 1;
343: }
344: }
1.4 schwarze 345: argc -= optind;
346: argv += optind;
347:
348: if (argc > 1) {
349: nchild.parent = argv[0];
350: nchild.child = argv[1];
351: }
1.1 schwarze 352:
1.2 schwarze 353: /* Exclude relations that are already fully implemented. */
354: if (show_all == 0) {
1.11 schwarze 355: table_add("ROOT", "refentry");
1.12 schwarze 356: table_add("acronym", "TEXT");
1.18 schwarze 357: table_add("appendix", NULL);
1.24 schwarze 358: table_add("application", "TEXT");
359: table_add("arg", "option");
1.18 schwarze 360: table_add("article", NULL);
1.24 schwarze 361: table_add("articleinfo", "date");
1.21 schwarze 362: table_add("articleinfo", "pubdate");
363: table_add("articleinfo", "title");
1.20 schwarze 364: table_add("author", "contrib");
365: table_add("author", "email");
366: table_add("author", "firstname");
367: table_add("author", "othername");
368: table_add("author", "surname");
1.23 schwarze 369: table_add("authorgroup", "author");
370: table_add("authorgroup", "othercredit");
1.16 schwarze 371: table_add("blockquote", NULL);
1.18 schwarze 372: table_add("book", NULL);
1.23 schwarze 373: table_add("bookinfo", "authorgroup");
374: table_add("bookinfo", "legalnotice");
1.21 schwarze 375: table_add("bookinfo", "pubdate");
376: table_add("bookinfo", "title");
1.9 schwarze 377: table_add("chapter", NULL);
1.23 schwarze 378: table_add("citerefentry", "manvolnum");
379: table_add("citerefentry", "refentrytitle");
1.24 schwarze 380: table_add("citetitle", "TEXT");
381: table_add("cmdsynopsis", "arg");
382: table_add("cmdsynopsis", "command");
383: table_add("cmdsynopsis", "group");
1.12 schwarze 384: table_add("code", "TEXT");
1.24 schwarze 385: table_add("command", "TEXT");
1.19 schwarze 386: table_add("computeroutput", "TEXT");
1.10 schwarze 387: table_add("constant", "TEXT");
1.21 schwarze 388: table_add("date", "TEXT");
1.24 schwarze 389: table_add("email", "TEXT");
1.8 schwarze 390: table_add("emphasis", "TEXT");
1.6 schwarze 391: table_add("entry", NULL);
1.12 schwarze 392: table_add("errorname", "TEXT");
393: table_add("filename", "TEXT");
1.24 schwarze 394: table_add("firstname", "TEXT");
395: table_add("firstterm", "TEXT");
1.8 schwarze 396: table_add("funcdef", "function");
397: table_add("funcdef", "TEXT");
398: table_add("funcprototype", "funcdef");
399: table_add("funcprototype", "paramdef");
1.12 schwarze 400: table_add("funcsynopsis", "funcprototype");
401: table_add("funcsynopsis", "funcsynopsisinfo");
402: table_add("funcsynopsisinfo", "TEXT");
1.8 schwarze 403: table_add("function", "TEXT");
1.17 schwarze 404: table_add("glossary", "glossdiv");
405: table_add("glossary", "glossentry");
406: table_add("glossdef", "para");
407: table_add("glossdiv", "glossentry");
408: table_add("glossentry", "glossdef");
409: table_add("glossentry", "glossterm");
410: table_add("glossentry", "indexterm");
411: table_add("glosslist", "glossentry");
412: table_add("glossterm", "TEXT");
1.24 schwarze 413: table_add("group", "arg");
1.9 schwarze 414: table_add("indexterm", "primary");
415: table_add("indexterm", "secondary");
1.6 schwarze 416: table_add("informaltable", "tgroup");
1.7 schwarze 417: table_add("itemizedlist", "listitem");
1.24 schwarze 418: table_add("keycap", "TEXT");
419: table_add("keycode", "TEXT");
420: table_add("keysym", "TEXT");
1.18 schwarze 421: table_add("legalnotice", NULL);
1.15 schwarze 422: table_add("link", NULL);
1.7 schwarze 423: table_add("listitem", NULL);
1.12 schwarze 424: table_add("literal", "TEXT");
1.10 schwarze 425: table_add("literallayout", NULL);
1.23 schwarze 426: table_add("manvolnum", "TEXT");
1.19 schwarze 427: table_add("markup", "TEXT");
1.13 schwarze 428: table_add("member", "TEXT");
1.18 schwarze 429: table_add("note", NULL);
1.24 schwarze 430: table_add("option", "TEXT");
1.7 schwarze 431: table_add("orderedlist", "listitem");
1.20 schwarze 432: table_add("othercredit", "contrib");
433: table_add("othercredit", "email");
434: table_add("othercredit", "firstname");
435: table_add("othercredit", "othername");
436: table_add("othercredit", "surname");
1.24 schwarze 437: table_add("othername", "TEXT");
1.2 schwarze 438: table_add("para", NULL);
1.9 schwarze 439: table_add("paramdef", "parameter");
440: table_add("paramdef", "TEXT");
441: table_add("parameter", "TEXT");
1.24 schwarze 442: table_add("personname", "firstname");
443: table_add("personname", "surname");
1.9 schwarze 444: table_add("primary", NULL);
1.10 schwarze 445: table_add("programlisting", NULL);
1.24 schwarze 446: table_add("property", "TEXT");
1.21 schwarze 447: table_add("pubdate", "TEXT");
1.24 schwarze 448: table_add("quote", "TEXT");
1.23 schwarze 449: table_add("refentry", "refentryinfo");
1.11 schwarze 450: table_add("refentry", "refmeta");
451: table_add("refentry", "refnamediv");
452: table_add("refentry", "refsect1");
453: table_add("refentry", "refsynopsisdiv");
1.24 schwarze 454: table_add("refentryinfo", "date");
1.23 schwarze 455: table_add("refentrytitle", "TEXT");
1.11 schwarze 456: table_add("refmeta", "manvolnum");
457: table_add("refmeta", "refentrytitle");
1.23 schwarze 458: table_add("refmeta", "refmiscinfo");
459: table_add("refmiscinfo", "TEXT");
1.11 schwarze 460: table_add("refname", "TEXT");
461: table_add("refnamediv", "refname");
462: table_add("refnamediv", "refpurpose");
463: table_add("refpurpose", "TEXT");
1.9 schwarze 464: table_add("refsect1", NULL);
465: table_add("refsect2", NULL);
1.24 schwarze 466: table_add("refsynopsisdiv", "cmdsynopsis");
1.11 schwarze 467: table_add("refsynopsisdiv", "funcsynopsis");
1.24 schwarze 468: table_add("replaceable", "TEXT");
469: table_add("returnvalue", "TEXT");
1.6 schwarze 470: table_add("row", "entry");
1.10 schwarze 471: table_add("screen", NULL);
1.9 schwarze 472: table_add("secondary", NULL);
473: table_add("section", NULL);
474: table_add("sect1", NULL);
475: table_add("sect2", NULL);
476: table_add("sect3", NULL);
477: table_add("sect4", NULL);
1.12 schwarze 478: table_add("sgmltag", "TEXT");
1.14 schwarze 479: table_add("simpara", NULL);
1.13 schwarze 480: table_add("simplelist", "member");
1.12 schwarze 481: table_add("structfield", "TEXT");
482: table_add("structname", "TEXT");
1.24 schwarze 483: table_add("surname", "TEXT");
1.10 schwarze 484: table_add("symbol", "TEXT");
1.24 schwarze 485: table_add("synopsis", "TEXT");
1.22 schwarze 486: table_add("systemitem", "TEXT");
1.6 schwarze 487: table_add("table", "tgroup");
488: table_add("table", "title");
489: table_add("tbody", "row");
1.7 schwarze 490: table_add("term", NULL);
1.6 schwarze 491: table_add("tgroup", "colspec");
492: table_add("tgroup", "tbody");
493: table_add("tgroup", "thead");
494: table_add("thead", "row");
495: table_add("title", "TEXT");
1.12 schwarze 496: table_add("type", "TEXT");
1.15 schwarze 497: table_add("ulink", NULL);
1.12 schwarze 498: table_add("userinput", "TEXT");
1.7 schwarze 499: table_add("variablelist", "varlistentry");
500: table_add("varlistentry", "listitem");
501: table_add("varlistentry", "term");
1.24 schwarze 502: table_add("varname", "TEXT");
1.2 schwarze 503: }
504: table_add(NULL, NULL);
505:
506: /* Loop over input files. */
1.1 schwarze 507: fd = -1;
508: fname = NULL;
509: while ((rsz = getline(&fname, &fsz, stdin)) != -1) {
510: if (fname[rsz - 1] == '\n')
511: fname[--rsz] = '\0';
512: if ((fd = open(fname, O_RDONLY, 0)) == -1)
513: err(1, "%s", fname);
514: parse_file(fd, fname);
515: close(fd);
516: }
517:
518: /* Cleanup and error handling. */
519: free(fname);
520: if (ferror(stdin))
521: err(1, "standard input");
522: if (fd == -1)
523: errx(1, "No input file names found on standard input");
524:
525: /* Dump results. */
526: for (i = 0; i < tablei; i++)
1.2 schwarze 527: if (table[i].count != -1)
528: printf("%d\t%s\t%s\n", table[i].count,
529: table[i].parent, table[i].child);
1.4 schwarze 530:
531: /* Optional parent-child histogram. */
532: if (nchild.parent != NULL) {
533: printf("%s %s", nchild.parent, nchild.child);
534: for (i = 0; i < nchildsz; i++)
535: printf(" %d", nchild.freq[i]);
536: putchar('\n');
537: }
1.1 schwarze 538: return 0;
539: }
CVSweb