Annotation of docbook2mdoc/statistics.c, Revision 1.1
1.1 ! schwarze 1: /* $Id$ */
! 2: /*
! 3: * Copyright (c) 2019 Ingo Schwarze <schwarze@openbsd.org>
! 4: *
! 5: * Permission to use, copy, modify, and distribute this software for any
! 6: * purpose with or without fee is hereby granted, provided that the above
! 7: * copyright notice and this permission notice appear in all copies.
! 8: *
! 9: * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
! 10: * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
! 11: * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
! 12: * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
! 13: * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
! 14: * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
! 15: * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
! 16: */
! 17: #include <assert.h>
! 18: #include <ctype.h>
! 19: #include <err.h>
! 20: #include <fcntl.h>
! 21: #include <stdio.h>
! 22: #include <stdlib.h>
! 23: #include <string.h>
! 24: #include <unistd.h>
! 25:
! 26: /*
! 27: * Count parent-child element relations in a corpus of DocBook documents.
! 28: *
! 29: * Read absolute or relative input file names from standard input,
! 30: * one per line.
! 31: * For each parent-child relation, print the total number of occurrences,
! 32: * the parent name, and the child name, separated by tab characters
! 33: * and followed by a newline character.
! 34: *
! 35: * Typical usage:
! 36: * statistics < filenames.txt | sort -n
! 37: * statistics < filenames.txt | grep '\<listitem\>' | sort -n
! 38: */
! 39:
/*
 * One distinct parent-child relation together with its tally.
 * Both names are private heap copies made by table_add().
 */
struct entry {
	char	*parent;	/* Name of the enclosing element ("" at top level). */
	char	*child;		/* Name of the contained element, or "TEXT". */
	int	 count;		/* How many times this pair has been seen. */
};

/* Growable array holding every relation found so far. */
static struct entry	 *table = NULL;
static size_t		  tablesz = 0;	/* Number of entries allocated. */
static size_t		  tablei = 0;	/* Number of entries in use. */

/* Growable stack holding the names of the currently open elements. */
static char		**stack = NULL;
static size_t		  stacksz = 0;	/* Number of slots allocated. */
static size_t		  stacki = 0;	/* Number of slots in use. */
! 53:
! 54:
! 55: /*
! 56: * Count one instance of a parent-child relation.
! 57: */
! 58: static void
! 59: table_add(const char *parent, const char *child)
! 60: {
! 61: size_t i;
! 62:
! 63: /* If the table entry already exists, increment its count. */
! 64:
! 65: for (i = 0; i < tablei; i++) {
! 66: if (strcmp(parent, table[i].parent) == 0 &&
! 67: strcmp(child, table[i].child) == 0) {
! 68: table[i].count++;
! 69: return;
! 70: }
! 71: }
! 72:
! 73: /* If the table is full, make room. */
! 74:
! 75: if (tablei == tablesz) {
! 76: tablesz += 64;
! 77: table = reallocarray(table, tablesz, sizeof(*table));
! 78: if (table == NULL)
! 79: err(1, NULL);
! 80: }
! 81:
! 82: /* Add a new entry to the table. */
! 83:
! 84: if ((table[tablei].parent = strdup(parent)) == NULL)
! 85: err(1, NULL);
! 86: if ((table[tablei].child = strdup(child)) == NULL)
! 87: err(1, NULL);
! 88: table[tablei++].count = 1;
! 89: }
! 90:
! 91: /*
! 92: * Enter an element.
! 93: */
! 94: static void
! 95: stack_push(const char *name)
! 96: {
! 97: if (stacki == stacksz) {
! 98: stacksz += 8;
! 99: stack = reallocarray(stack, stacksz, sizeof(*stack));
! 100: if (stack == NULL)
! 101: err(1, NULL);
! 102: }
! 103: if ((stack[stacki++] = strdup(name)) == NULL)
! 104: err(1, NULL);
! 105: }
! 106:
! 107: /*
! 108: * Exit an element.
! 109: */
! 110: static void
! 111: stack_pop(const char *name)
! 112: {
! 113: if (stacki > 0 && (name == NULL ||
! 114: strcmp(name, stack[stacki - 1]) == 0))
! 115: free(stack[--stacki]);
! 116: }
! 117:
/*
 * Simplified version from parse.c.
 *
 * Move *pend forward in b[] until it points at a delimiter or
 * reaches rlen, the number of valid bytes in b[].
 * The delimiters are the bytes listed in charset; if charset starts
 * with a space character, any isspace(3) byte is a delimiter, too.
 * NUL bytes in the input are never delimiters.
 *
 * Return 0 if a delimiter was found before rlen.
 * Return 1 if the end of the data was reached; in that case,
 * b[rlen] is set to '\0', so b[] must have room for rlen + 1 bytes.
 */
static int
advance(char *b, size_t rlen, size_t *pend, const char *charset)
{
	int	 space;
	char	 c;

	if (*charset == ' ') {
		space = 1;
		charset++;
	} else
		space = 0;

	while (*pend < rlen) {
		c = b[*pend];
		if (space && isspace((unsigned char)c))
			break;
		/*
		 * strchr(3) also matches the NUL terminating charset,
		 * so NUL bytes must be excluded explicitly.  Otherwise,
		 * a NUL byte in the input would stop the scan without
		 * consuming anything, and a caller that scans again
		 * from the same offset would never make progress.
		 */
		if (c != '\0' && strchr(charset, c) != NULL)
			break;
		++*pend;
	}
	if (*pend == rlen) {
		b[rlen] = '\0';
		return 1;
	}
	return 0;
}
! 145:
! 146: /*
! 147: * Simplified version from parse.c.
! 148: */
! 149: static void
! 150: parse_file(int fd, char *fname)
! 151: {
! 152: char b[4096];
! 153: ssize_t rsz; /* Return value from read(2). */
! 154: size_t rlen; /* Number of bytes in b[]. */
! 155: size_t poff; /* Parse offset in b[]. */
! 156: size_t pend; /* Offset of the end of the current word. */
! 157: int in_tag, in_arg, in_quotes, elem_end;
! 158:
! 159: rlen = 0;
! 160: in_tag = in_arg = in_quotes = 0;
! 161: while ((rsz = read(fd, b + rlen, sizeof(b) - rlen - 1)) >= 0) {
! 162: if ((rlen += rsz) == 0)
! 163: break;
! 164: pend = 0;
! 165: for (;;) {
! 166: if ((poff = pend) == rlen)
! 167: break;
! 168: if (isspace((unsigned char)b[pend])) {
! 169: pend++;
! 170: continue;
! 171: }
! 172: if (in_arg) {
! 173: if (in_quotes == 0 && b[pend] == '"') {
! 174: in_quotes = 1;
! 175: pend++;
! 176: continue;
! 177: }
! 178: if (advance(b, rlen, &pend,
! 179: in_quotes ? "\"" : " >") && rsz > 0)
! 180: break;
! 181: in_arg = in_quotes = elem_end = 0;
! 182: if (b[pend] == '>') {
! 183: in_tag = 0;
! 184: if (pend > 0 && b[pend - 1] == '/') {
! 185: b[pend - 1] = '\0';
! 186: elem_end = 1;
! 187: }
! 188: }
! 189: b[pend] = '\0';
! 190: if (pend < rlen)
! 191: pend++;
! 192: if (elem_end)
! 193: stack_pop(NULL);
! 194: } else if (in_tag) {
! 195: if (advance(b, rlen, &pend, " =>") && rsz > 0)
! 196: break;
! 197: elem_end = 0;
! 198: switch (b[pend]) {
! 199: case '>':
! 200: in_tag = 0;
! 201: if (pend > 0 && b[pend - 1] == '/') {
! 202: b[pend - 1] = '\0';
! 203: elem_end = 1;
! 204: }
! 205: break;
! 206: case '=':
! 207: in_arg = 1;
! 208: break;
! 209: default:
! 210: break;
! 211: }
! 212: b[pend] = '\0';
! 213: if (pend < rlen)
! 214: pend++;
! 215: if (elem_end)
! 216: stack_pop(NULL);
! 217: } else if (b[poff] == '<') {
! 218: if (advance(b, rlen, &pend, " >") && rsz > 0)
! 219: break;
! 220: elem_end = 0;
! 221: if (b[pend] != '>')
! 222: in_tag = 1;
! 223: else if (pend > 0 && b[pend - 1] == '/') {
! 224: b[pend - 1] = '\0';
! 225: elem_end = 1;
! 226: }
! 227: b[pend] = '\0';
! 228: if (pend < rlen)
! 229: pend++;
! 230: if (b[++poff] == '/') {
! 231: elem_end = 1;
! 232: poff++;
! 233: } else if (b[poff] != '!' && b[poff] != '?') {
! 234: table_add(stacki > 0 ?
! 235: stack[stacki - 1] : "",
! 236: b + poff);
! 237: stack_push(b + poff);
! 238: }
! 239: if (elem_end)
! 240: stack_pop(b + poff);
! 241: } else {
! 242: advance(b, rlen, &pend, "<");
! 243: if (stacki > 0)
! 244: table_add(stack[stacki - 1], "TEXT");
! 245: }
! 246: }
! 247: assert(poff > 0);
! 248: memmove(b, b + poff, rlen - poff);
! 249: rlen -= poff;
! 250: }
! 251: if (rsz < 0)
! 252: perror(fname);
! 253: }
! 254:
! 255: int
! 256: main(int argc, char *argv[])
! 257: {
! 258: char *fname;
! 259: size_t fsz, i;
! 260: ssize_t rsz;
! 261: int fd;
! 262:
! 263: fd = -1;
! 264: fname = NULL;
! 265:
! 266: /* Loop over input files. */
! 267: while ((rsz = getline(&fname, &fsz, stdin)) != -1) {
! 268: if (fname[rsz - 1] == '\n')
! 269: fname[--rsz] = '\0';
! 270: if ((fd = open(fname, O_RDONLY, 0)) == -1)
! 271: err(1, "%s", fname);
! 272: parse_file(fd, fname);
! 273: close(fd);
! 274: }
! 275:
! 276: /* Cleanup and error handling. */
! 277: free(fname);
! 278: if (ferror(stdin))
! 279: err(1, "standard input");
! 280: if (fd == -1)
! 281: errx(1, "No input file names found on standard input");
! 282:
! 283: /* Dump results. */
! 284: for (i = 0; i < tablei; i++)
! 285: printf("%d\t%s\t%s\n", table[i].count,
! 286: table[i].parent, table[i].child);
! 287: return 0;
! 288: }
CVSweb