Annotation of docbook2mdoc/statistics.c, Revision 1.15
1.15 ! schwarze 1: /* $Id: statistics.c,v 1.14 2019/04/06 13:45:58 schwarze Exp $ */
1.1 schwarze 2: /*
3: * Copyright (c) 2019 Ingo Schwarze <schwarze@openbsd.org>
4: *
5: * Permission to use, copy, modify, and distribute this software for any
6: * purpose with or without fee is hereby granted, provided that the above
7: * copyright notice and this permission notice appear in all copies.
8: *
9: * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
10: * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11: * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
12: * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13: * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14: * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15: * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16: */
17: #include <assert.h>
18: #include <ctype.h>
19: #include <err.h>
20: #include <fcntl.h>
1.2 schwarze 21: #include <getopt.h>
1.1 schwarze 22: #include <stdio.h>
23: #include <stdlib.h>
24: #include <string.h>
25: #include <unistd.h>
26:
27: /*
28: * Count parent-child element relations in a corpus of DocBook documents.
29: *
30: * Read absolute or relative input file names from standard input,
31: * one per line.
32: * For each parent-child relation, print the total number of occurrences,
33: * the parent name, and the child name, separated by tab characters
34: * and followed by a newline character.
35: *
36: * Typical usage:
37: * statistics < filenames.txt | sort -n
38: * statistics < filenames.txt | grep '\<listitem\>' | sort -n
1.4 schwarze 39: *
40: * Relations already fully implemented are excluded by default.
41: * The option -a shows all relations.
42: *
43: * If two arguments (parent and child) are given, a histogram
44: * of the number of children of that kind per parent element is printed
45: * in addition to the normal output.
46: *
47: * Example usage:
48: * statistics tgroup colspec < filenames.txt | grep colspec
1.1 schwarze 49: */
50:
/*
 * One row of the relation table: how often an element named "child"
 * was seen directly inside an element named "parent".
 */
struct entry {
	char	*parent;	/* Name of the enclosing element. */
	char	*child;		/* Name of the contained element; */
				/* NULL acts as a wildcard. */
	int	 count;		/* Number of occurrences; */
				/* -1 marks a relation that is not counted. */
};
56:
/* Dynamically growing array of parent-child relations. */
static struct entry	 *table = NULL;	/* The array itself. */
static size_t		  tablesz = 0;	/* Number of entries allocated. */
static size_t		  tablei = 0;	/* Number of entries in use. */

/* Dynamically growing stack of the names of the open elements. */
static char		**stack = NULL;	/* The array of names. */
static size_t		  stacksz = 0;	/* Number of slots allocated. */
static size_t		  stacki = 0;	/* Number of slots in use. */
64:
/*
 * Number of bins in the optional parent-child histogram.
 * This has to be an integer constant expression because it is used
 * as an array dimension inside a structure; in C, a "const int"
 * object does not qualify, but an enumeration constant does.
 */
enum { nchildsz = 8 };

/*
 * State of the optional histogram counting how many children
 * of one particular kind occur in each parent of one particular kind.
 */
struct nchild {
	char	*parent;	/* Parent element name, or NULL if unused. */
	char	*child;		/* Child element name, or NULL if unused. */
	int	 freq[nchildsz]; /* freq[n - 1]: parents with n children; */
				/* the last bin also takes larger numbers. */
	int	 count;		/* Children seen in the current parent. */
};

static struct nchild	 nchild;
static char		*fname;	/* Name of the file being processed. */
75:
1.1 schwarze 76:
77: /*
78: * Count one instance of a parent-child relation.
1.2 schwarze 79: * Before the special call table_add(NULL, NULL),
80: * mark relations to not be counted;
81: * in that phase, child can be NULL as a wildcard.
1.1 schwarze 82: */
83: static void
84: table_add(const char *parent, const char *child)
85: {
1.2 schwarze 86: static int init_done;
87: size_t i;
88:
89: if (parent == NULL && child == NULL) {
90: init_done = 1;
91: return;
92: }
1.1 schwarze 93:
1.4 schwarze 94: /* Optional parent-child histogram. */
95:
96: if (init_done && parent != NULL && child != NULL &&
97: nchild.parent != NULL && nchild.child != NULL &&
98: strcmp(parent, nchild.parent) == 0 &&
99: strcmp(child, nchild.child) == 0) {
100: if (nchild.count < nchildsz) {
101: nchild.freq[nchild.count]++;
102: if (nchild.count > 0)
103: nchild.freq[nchild.count - 1]--;
104: } else if (nchild.count == nchildsz)
105: puts(fname);
106: nchild.count++;
107: }
108:
1.1 schwarze 109: /* If the table entry already exists, increment its count. */
110:
111: for (i = 0; i < tablei; i++) {
112: if (strcmp(parent, table[i].parent) == 0 &&
1.2 schwarze 113: (child == NULL || table[i].child == NULL ||
114: strcmp(child, table[i].child) == 0)) {
115: assert(init_done);
116: if (table[i].count != -1)
117: table[i].count++;
1.1 schwarze 118: return;
119: }
120: }
121:
122: /* If the table is full, make room. */
123:
124: if (tablei == tablesz) {
125: tablesz += 64;
126: table = reallocarray(table, tablesz, sizeof(*table));
127: if (table == NULL)
128: err(1, NULL);
129: }
130:
131: /* Add a new entry to the table. */
132:
133: if ((table[tablei].parent = strdup(parent)) == NULL)
134: err(1, NULL);
1.2 schwarze 135: if (child == NULL)
136: table[tablei].child = NULL;
137: else if ((table[tablei].child = strdup(child)) == NULL)
1.1 schwarze 138: err(1, NULL);
1.2 schwarze 139: table[tablei++].count = init_done ? 1 : -1;
1.1 schwarze 140: }
141:
142: /*
143: * Enter an element.
144: */
145: static void
146: stack_push(const char *name)
147: {
1.4 schwarze 148: if (nchild.parent != NULL && strcmp(name, nchild.parent) == 0)
149: nchild.count = 0;
150:
1.1 schwarze 151: if (stacki == stacksz) {
152: stacksz += 8;
153: stack = reallocarray(stack, stacksz, sizeof(*stack));
154: if (stack == NULL)
155: err(1, NULL);
156: }
157: if ((stack[stacki++] = strdup(name)) == NULL)
158: err(1, NULL);
159: }
160:
161: /*
162: * Exit an element.
163: */
164: static void
165: stack_pop(const char *name)
166: {
167: if (stacki > 0 && (name == NULL ||
168: strcmp(name, stack[stacki - 1]) == 0))
169: free(stack[--stacki]);
170: }
171:
/*
 * Simplified version from parse.c.
 *
 * Starting at offset *pend, move *pend forward in the buffer b
 * containing rlen bytes of data until a delimiter is found or
 * the end of the data is reached.
 * The delimiters are the characters in charset; if charset starts
 * with a blank, that blank stands for any whitespace character.
 *
 * Return 1 if the end of the data was reached; in that case,
 * a NUL byte is written to b[rlen], so the buffer must provide
 * room for at least rlen + 1 bytes.  Return 0 if a delimiter
 * was found; *pend is then the offset of that delimiter.
 */
static int
advance(char *b, size_t rlen, size_t *pend, const char *charset)
{
	int	 space;
	char	 c;

	if (*charset == ' ') {
		space = 1;
		charset++;
	} else
		space = 0;

	while (*pend < rlen) {
		c = b[*pend];
		if (space && isspace((unsigned char)c))
			break;
		/*
		 * strchr(3) also matches the NUL terminating charset,
		 * so a NUL byte in the input must not be looked up:
		 * it would count as a delimiter and a caller scanning
		 * text could get stuck on it without making progress.
		 */
		if (c != '\0' && strchr(charset, c) != NULL)
			break;
		++*pend;
	}
	if (*pend == rlen) {
		b[rlen] = '\0';
		return 1;
	} else
		return 0;
}
199:
200: /*
201: * Simplified version from parse.c.
202: */
203: static void
204: parse_file(int fd, char *fname)
205: {
206: char b[4096];
1.3 schwarze 207: char *cp;
1.1 schwarze 208: ssize_t rsz; /* Return value from read(2). */
209: size_t rlen; /* Number of bytes in b[]. */
210: size_t poff; /* Parse offset in b[]. */
211: size_t pend; /* Offset of the end of the current word. */
212: int in_tag, in_arg, in_quotes, elem_end;
213:
214: rlen = 0;
215: in_tag = in_arg = in_quotes = 0;
216: while ((rsz = read(fd, b + rlen, sizeof(b) - rlen - 1)) >= 0) {
217: if ((rlen += rsz) == 0)
218: break;
219: pend = 0;
220: for (;;) {
221: if ((poff = pend) == rlen)
222: break;
223: if (isspace((unsigned char)b[pend])) {
224: pend++;
225: continue;
226: }
227: if (in_arg) {
1.5 schwarze 228: if (in_quotes == 0 &&
229: (b[pend] == '\'' || b[pend] == '"')) {
230: in_quotes = b[pend] == '"' ? 2 : 1;
1.1 schwarze 231: pend++;
232: continue;
233: }
234: if (advance(b, rlen, &pend,
1.5 schwarze 235: in_quotes == 2 ? "\"" :
236: in_quotes == 1 ? "'" : " >") && rsz > 0)
1.1 schwarze 237: break;
238: in_arg = in_quotes = elem_end = 0;
239: if (b[pend] == '>') {
240: in_tag = 0;
241: if (pend > 0 && b[pend - 1] == '/') {
242: b[pend - 1] = '\0';
243: elem_end = 1;
244: }
245: }
246: b[pend] = '\0';
247: if (pend < rlen)
248: pend++;
249: if (elem_end)
250: stack_pop(NULL);
251: } else if (in_tag) {
252: if (advance(b, rlen, &pend, " =>") && rsz > 0)
253: break;
254: elem_end = 0;
255: switch (b[pend]) {
256: case '>':
257: in_tag = 0;
258: if (pend > 0 && b[pend - 1] == '/') {
259: b[pend - 1] = '\0';
260: elem_end = 1;
261: }
262: break;
263: case '=':
264: in_arg = 1;
265: break;
266: default:
267: break;
268: }
269: b[pend] = '\0';
270: if (pend < rlen)
271: pend++;
272: if (elem_end)
273: stack_pop(NULL);
274: } else if (b[poff] == '<') {
275: if (advance(b, rlen, &pend, " >") && rsz > 0)
276: break;
1.3 schwarze 277: if (pend > poff + 3 &&
278: strncmp(b + poff, "<!--", 4) == 0) {
279: /* Skip a comment. */
280: cp = strstr(b + pend - 2, "-->");
281: if (cp == NULL) {
282: pend = rlen;
283: if (rsz > 0)
284: break;
285: } else
286: pend = cp + 3 - b;
287: continue;
288: }
1.1 schwarze 289: elem_end = 0;
290: if (b[pend] != '>')
291: in_tag = 1;
292: else if (pend > 0 && b[pend - 1] == '/') {
293: b[pend - 1] = '\0';
294: elem_end = 1;
295: }
296: b[pend] = '\0';
297: if (pend < rlen)
298: pend++;
299: if (b[++poff] == '/') {
300: elem_end = 1;
301: poff++;
302: } else if (b[poff] != '!' && b[poff] != '?') {
303: table_add(stacki > 0 ?
1.2 schwarze 304: stack[stacki - 1] : "ROOT",
1.1 schwarze 305: b + poff);
306: stack_push(b + poff);
307: }
308: if (elem_end)
309: stack_pop(b + poff);
310: } else {
311: advance(b, rlen, &pend, "<");
312: if (stacki > 0)
313: table_add(stack[stacki - 1], "TEXT");
314: }
315: }
316: assert(poff > 0);
317: memmove(b, b + poff, rlen - poff);
318: rlen -= poff;
319: }
320: if (rsz < 0)
321: perror(fname);
322: }
323:
324: int
325: main(int argc, char *argv[])
326: {
327: size_t fsz, i;
328: ssize_t rsz;
1.2 schwarze 329: int ch, fd, show_all;
330:
331: show_all = 0;
332: while ((ch = getopt(argc, argv, "a")) != -1) {
333: switch (ch) {
334: case 'a':
335: show_all = 1;
336: break;
337: default:
338: return 1;
339: }
340: }
1.4 schwarze 341: argc -= optind;
342: argv += optind;
343:
344: if (argc > 1) {
345: nchild.parent = argv[0];
346: nchild.child = argv[1];
347: }
1.1 schwarze 348:
1.2 schwarze 349: /* Exclude relations that are already fully implemented. */
350: if (show_all == 0) {
1.11 schwarze 351: table_add("ROOT", "refentry");
1.12 schwarze 352: table_add("acronym", "TEXT");
1.9 schwarze 353: table_add("chapter", NULL);
1.12 schwarze 354: table_add("code", "TEXT");
1.10 schwarze 355: table_add("constant", "TEXT");
1.8 schwarze 356: table_add("emphasis", "TEXT");
1.6 schwarze 357: table_add("entry", NULL);
1.12 schwarze 358: table_add("errorname", "TEXT");
359: table_add("filename", "TEXT");
1.8 schwarze 360: table_add("funcdef", "function");
361: table_add("funcdef", "TEXT");
362: table_add("funcprototype", "funcdef");
363: table_add("funcprototype", "paramdef");
1.12 schwarze 364: table_add("funcsynopsis", "funcprototype");
365: table_add("funcsynopsis", "funcsynopsisinfo");
366: table_add("funcsynopsisinfo", "TEXT");
1.8 schwarze 367: table_add("function", "TEXT");
1.9 schwarze 368: table_add("indexterm", "primary");
369: table_add("indexterm", "secondary");
1.6 schwarze 370: table_add("informaltable", "tgroup");
1.7 schwarze 371: table_add("itemizedlist", "listitem");
1.15 ! schwarze 372: table_add("link", NULL);
1.7 schwarze 373: table_add("listitem", NULL);
1.12 schwarze 374: table_add("literal", "TEXT");
1.10 schwarze 375: table_add("literallayout", NULL);
1.13 schwarze 376: table_add("member", "TEXT");
1.7 schwarze 377: table_add("orderedlist", "listitem");
1.2 schwarze 378: table_add("para", NULL);
1.9 schwarze 379: table_add("paramdef", "parameter");
380: table_add("paramdef", "TEXT");
381: table_add("parameter", "TEXT");
382: table_add("primary", NULL);
1.10 schwarze 383: table_add("programlisting", NULL);
1.11 schwarze 384: table_add("refentry", "refmeta");
385: table_add("refentry", "refnamediv");
386: table_add("refentry", "refsect1");
387: table_add("refentry", "refsynopsisdiv");
388: table_add("refmeta", "manvolnum");
389: table_add("refmeta", "refentrytitle");
390: table_add("refname", "TEXT");
391: table_add("refnamediv", "refname");
392: table_add("refnamediv", "refpurpose");
393: table_add("refpurpose", "TEXT");
1.9 schwarze 394: table_add("refsect1", NULL);
395: table_add("refsect2", NULL);
1.11 schwarze 396: table_add("refsynopsisdiv", "funcsynopsis");
1.6 schwarze 397: table_add("row", "entry");
1.10 schwarze 398: table_add("screen", NULL);
1.9 schwarze 399: table_add("secondary", NULL);
400: table_add("section", NULL);
401: table_add("sect1", NULL);
402: table_add("sect2", NULL);
403: table_add("sect3", NULL);
404: table_add("sect4", NULL);
1.12 schwarze 405: table_add("sgmltag", "TEXT");
1.14 schwarze 406: table_add("simpara", NULL);
1.13 schwarze 407: table_add("simplelist", "member");
1.12 schwarze 408: table_add("structfield", "TEXT");
409: table_add("structname", "TEXT");
1.10 schwarze 410: table_add("symbol", "TEXT");
1.6 schwarze 411: table_add("table", "tgroup");
412: table_add("table", "title");
413: table_add("tbody", "row");
1.7 schwarze 414: table_add("term", NULL);
1.6 schwarze 415: table_add("tgroup", "colspec");
416: table_add("tgroup", "tbody");
417: table_add("tgroup", "thead");
418: table_add("thead", "row");
419: table_add("title", "TEXT");
1.12 schwarze 420: table_add("type", "TEXT");
1.15 ! schwarze 421: table_add("ulink", NULL);
1.12 schwarze 422: table_add("userinput", "TEXT");
1.7 schwarze 423: table_add("variablelist", "varlistentry");
424: table_add("varlistentry", "listitem");
425: table_add("varlistentry", "term");
1.2 schwarze 426: }
427: table_add(NULL, NULL);
428:
429: /* Loop over input files. */
1.1 schwarze 430: fd = -1;
431: fname = NULL;
432: while ((rsz = getline(&fname, &fsz, stdin)) != -1) {
433: if (fname[rsz - 1] == '\n')
434: fname[--rsz] = '\0';
435: if ((fd = open(fname, O_RDONLY, 0)) == -1)
436: err(1, "%s", fname);
437: parse_file(fd, fname);
438: close(fd);
439: }
440:
441: /* Cleanup and error handling. */
442: free(fname);
443: if (ferror(stdin))
444: err(1, "standard input");
445: if (fd == -1)
446: errx(1, "No input file names found on standard input");
447:
448: /* Dump results. */
449: for (i = 0; i < tablei; i++)
1.2 schwarze 450: if (table[i].count != -1)
451: printf("%d\t%s\t%s\n", table[i].count,
452: table[i].parent, table[i].child);
1.4 schwarze 453:
454: /* Optional parent-child histogram. */
455: if (nchild.parent != NULL) {
456: printf("%s %s", nchild.parent, nchild.child);
457: for (i = 0; i < nchildsz; i++)
458: printf(" %d", nchild.freq[i]);
459: putchar('\n');
460: }
1.1 schwarze 461: return 0;
462: }
CVSweb