Annotation of docbook2mdoc/statistics.c, Revision 1.3
1.3 ! schwarze 1: /* $Id: statistics.c,v 1.2 2019/03/29 18:09:43 schwarze Exp $ */
1.1 schwarze 2: /*
3: * Copyright (c) 2019 Ingo Schwarze <schwarze@openbsd.org>
4: *
5: * Permission to use, copy, modify, and distribute this software for any
6: * purpose with or without fee is hereby granted, provided that the above
7: * copyright notice and this permission notice appear in all copies.
8: *
9: * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
10: * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11: * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
12: * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13: * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14: * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15: * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16: */
17: #include <assert.h>
18: #include <ctype.h>
19: #include <err.h>
20: #include <fcntl.h>
1.2 schwarze 21: #include <getopt.h>
1.1 schwarze 22: #include <stdio.h>
23: #include <stdlib.h>
24: #include <string.h>
25: #include <unistd.h>
26:
27: /*
28: * Count parent-child element relations in a corpus of DocBook documents.
29: *
30: * Read absolute or relative input file names from standard input,
31: * one per line.
32: * For each parent-child relation, print the total number of occurrences,
33: * the parent name, and the child name, separated by tab characters
34: * and followed by a newline character.
35: *
36: * Typical usage:
37: * statistics < filenames.txt | sort -n
38: * statistics < filenames.txt | grep '\<listitem\>' | sort -n
39: */
40:
/*
 * One row of the statistics table: a parent-child element relation
 * together with the number of times it was seen.
 */
struct entry {
	char	*parent;	/* Name of the enclosing element. */
	char	*child;		/* Name of the contained element; */
				/* NULL matches any child. */
	int	 count;		/* Occurrences seen so far; */
				/* -1 marks a relation that is not counted. */
};
46:
/* Table of all relations seen so far; grown on demand by table_add(). */
static struct entry	*table = NULL;
static size_t		 tablesz = 0;	/* Number of entries allocated. */
static size_t		 tablei = 0;	/* Number of entries in use. */

/* Names of the currently open elements, innermost element last. */
static char		**stack = NULL;
static size_t		 stacksz = 0;	/* Number of slots allocated. */
static size_t		 stacki = 0;	/* Number of slots in use. */
54:
55:
56: /*
57: * Count one instance of a parent-child relation.
1.2 schwarze 58: * Before the special call table_add(NULL, NULL),
59: * mark relations to not be counted;
60: * in that phase, child can be NULL as a wildcard.
1.1 schwarze 61: */
62: static void
63: table_add(const char *parent, const char *child)
64: {
1.2 schwarze 65: static int init_done;
66: size_t i;
67:
68: if (parent == NULL && child == NULL) {
69: init_done = 1;
70: return;
71: }
1.1 schwarze 72:
73: /* If the table entry already exists, increment its count. */
74:
75: for (i = 0; i < tablei; i++) {
76: if (strcmp(parent, table[i].parent) == 0 &&
1.2 schwarze 77: (child == NULL || table[i].child == NULL ||
78: strcmp(child, table[i].child) == 0)) {
79: assert(init_done);
80: if (table[i].count != -1)
81: table[i].count++;
1.1 schwarze 82: return;
83: }
84: }
85:
86: /* If the table is full, make room. */
87:
88: if (tablei == tablesz) {
89: tablesz += 64;
90: table = reallocarray(table, tablesz, sizeof(*table));
91: if (table == NULL)
92: err(1, NULL);
93: }
94:
95: /* Add a new entry to the table. */
96:
97: if ((table[tablei].parent = strdup(parent)) == NULL)
98: err(1, NULL);
1.2 schwarze 99: if (child == NULL)
100: table[tablei].child = NULL;
101: else if ((table[tablei].child = strdup(child)) == NULL)
1.1 schwarze 102: err(1, NULL);
1.2 schwarze 103: table[tablei++].count = init_done ? 1 : -1;
1.1 schwarze 104: }
105:
106: /*
107: * Enter an element.
108: */
109: static void
110: stack_push(const char *name)
111: {
112: if (stacki == stacksz) {
113: stacksz += 8;
114: stack = reallocarray(stack, stacksz, sizeof(*stack));
115: if (stack == NULL)
116: err(1, NULL);
117: }
118: if ((stack[stacki++] = strdup(name)) == NULL)
119: err(1, NULL);
120: }
121:
122: /*
123: * Exit an element.
124: */
125: static void
126: stack_pop(const char *name)
127: {
128: if (stacki > 0 && (name == NULL ||
129: strcmp(name, stack[stacki - 1]) == 0))
130: free(stack[--stacki]);
131: }
132:
/*
 * Simplified version from parse.c.
 *
 * Starting at offset *pend, advance *pend in the buffer b[], which
 * contains rlen bytes of data, until it points to a delimiter.
 * Every byte contained in charset is a delimiter.  If charset starts
 * with a space character, all whitespace characters are delimiters, too.
 * A NUL byte in the data is never a delimiter.
 *
 * The buffer needs room for at least rlen + 1 bytes.
 *
 * Return 0 if a delimiter was found; *pend is then its offset.
 * Return 1 if the end of the data was reached without finding one;
 * in that case, *pend is rlen and b[rlen] is set to NUL.
 */
static int
advance(char *b, size_t rlen, size_t *pend, const char *charset)
{
	unsigned char	 c;
	int		 space;

	if (*charset == ' ') {
		space = 1;
		charset++;
	} else
		space = 0;

	while (*pend < rlen) {
		c = (unsigned char)b[*pend];
		if (space && isspace(c))
			break;
		/*
		 * strchr(3) also matches the NUL that terminates charset,
		 * so NUL has to be excluded explicitly.  Otherwise, a NUL
		 * byte in the input stops the scan at a byte that callers
		 * do not expect and may be unable to step over.
		 */
		if (c != '\0' && strchr(charset, c) != NULL)
			break;
		++*pend;
	}
	if (*pend == rlen) {
		b[rlen] = '\0';
		return 1;
	}
	return 0;
}
160:
161: /*
162: * Simplified version from parse.c.
163: */
164: static void
165: parse_file(int fd, char *fname)
166: {
167: char b[4096];
1.3 ! schwarze 168: char *cp;
1.1 schwarze 169: ssize_t rsz; /* Return value from read(2). */
170: size_t rlen; /* Number of bytes in b[]. */
171: size_t poff; /* Parse offset in b[]. */
172: size_t pend; /* Offset of the end of the current word. */
173: int in_tag, in_arg, in_quotes, elem_end;
174:
175: rlen = 0;
176: in_tag = in_arg = in_quotes = 0;
177: while ((rsz = read(fd, b + rlen, sizeof(b) - rlen - 1)) >= 0) {
178: if ((rlen += rsz) == 0)
179: break;
180: pend = 0;
181: for (;;) {
182: if ((poff = pend) == rlen)
183: break;
184: if (isspace((unsigned char)b[pend])) {
185: pend++;
186: continue;
187: }
188: if (in_arg) {
189: if (in_quotes == 0 && b[pend] == '"') {
190: in_quotes = 1;
191: pend++;
192: continue;
193: }
194: if (advance(b, rlen, &pend,
195: in_quotes ? "\"" : " >") && rsz > 0)
196: break;
197: in_arg = in_quotes = elem_end = 0;
198: if (b[pend] == '>') {
199: in_tag = 0;
200: if (pend > 0 && b[pend - 1] == '/') {
201: b[pend - 1] = '\0';
202: elem_end = 1;
203: }
204: }
205: b[pend] = '\0';
206: if (pend < rlen)
207: pend++;
208: if (elem_end)
209: stack_pop(NULL);
210: } else if (in_tag) {
211: if (advance(b, rlen, &pend, " =>") && rsz > 0)
212: break;
213: elem_end = 0;
214: switch (b[pend]) {
215: case '>':
216: in_tag = 0;
217: if (pend > 0 && b[pend - 1] == '/') {
218: b[pend - 1] = '\0';
219: elem_end = 1;
220: }
221: break;
222: case '=':
223: in_arg = 1;
224: break;
225: default:
226: break;
227: }
228: b[pend] = '\0';
229: if (pend < rlen)
230: pend++;
231: if (elem_end)
232: stack_pop(NULL);
233: } else if (b[poff] == '<') {
234: if (advance(b, rlen, &pend, " >") && rsz > 0)
235: break;
1.3 ! schwarze 236: if (pend > poff + 3 &&
! 237: strncmp(b + poff, "<!--", 4) == 0) {
! 238: /* Skip a comment. */
! 239: cp = strstr(b + pend - 2, "-->");
! 240: if (cp == NULL) {
! 241: pend = rlen;
! 242: if (rsz > 0)
! 243: break;
! 244: } else
! 245: pend = cp + 3 - b;
! 246: continue;
! 247: }
1.1 schwarze 248: elem_end = 0;
249: if (b[pend] != '>')
250: in_tag = 1;
251: else if (pend > 0 && b[pend - 1] == '/') {
252: b[pend - 1] = '\0';
253: elem_end = 1;
254: }
255: b[pend] = '\0';
256: if (pend < rlen)
257: pend++;
258: if (b[++poff] == '/') {
259: elem_end = 1;
260: poff++;
261: } else if (b[poff] != '!' && b[poff] != '?') {
262: table_add(stacki > 0 ?
1.2 schwarze 263: stack[stacki - 1] : "ROOT",
1.1 schwarze 264: b + poff);
265: stack_push(b + poff);
266: }
267: if (elem_end)
268: stack_pop(b + poff);
269: } else {
270: advance(b, rlen, &pend, "<");
271: if (stacki > 0)
272: table_add(stack[stacki - 1], "TEXT");
273: }
274: }
275: assert(poff > 0);
276: memmove(b, b + poff, rlen - poff);
277: rlen -= poff;
278: }
279: if (rsz < 0)
280: perror(fname);
281: }
282:
283: int
284: main(int argc, char *argv[])
285: {
286: char *fname;
287: size_t fsz, i;
288: ssize_t rsz;
1.2 schwarze 289: int ch, fd, show_all;
290:
291: show_all = 0;
292: while ((ch = getopt(argc, argv, "a")) != -1) {
293: switch (ch) {
294: case 'a':
295: show_all = 1;
296: break;
297: default:
298: return 1;
299: }
300: }
1.1 schwarze 301:
1.2 schwarze 302: /* Exclude relations that are already fully implemented. */
303: if (show_all == 0) {
304: table_add("para", NULL);
305: }
306: table_add(NULL, NULL);
307:
308: /* Loop over input files. */
1.1 schwarze 309: fd = -1;
310: fname = NULL;
311: while ((rsz = getline(&fname, &fsz, stdin)) != -1) {
312: if (fname[rsz - 1] == '\n')
313: fname[--rsz] = '\0';
314: if ((fd = open(fname, O_RDONLY, 0)) == -1)
315: err(1, "%s", fname);
316: parse_file(fd, fname);
317: close(fd);
318: }
319:
320: /* Cleanup and error handling. */
321: free(fname);
322: if (ferror(stdin))
323: err(1, "standard input");
324: if (fd == -1)
325: errx(1, "No input file names found on standard input");
326:
327: /* Dump results. */
328: for (i = 0; i < tablei; i++)
1.2 schwarze 329: if (table[i].count != -1)
330: printf("%d\t%s\t%s\n", table[i].count,
331: table[i].parent, table[i].child);
1.1 schwarze 332: return 0;
333: }
CVSweb