/* $Id: statistics.c,v 1.39 2019/04/29 02:00:50 schwarze Exp $ */
/*
 * Copyright (c) 2019 Ingo Schwarze
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */
#include <assert.h>
#include <ctype.h>
#include <err.h>
#include <fcntl.h>
#include <getopt.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>
#include <unistd.h>

#include "xmalloc.h"

/*
 * Count parent-child element relations in a corpus of DocBook documents.
 *
 * Read absolute or relative input file names from standard input,
 * one per line.
 * For each parent-child relation, print the total number of occurrences,
 * the parent name, and the child name, separated by tab characters
 * and followed by a newline character.
 *
 * Typical usage:
 * statistics < filenames.txt | sort -n
 * statistics < filenames.txt | grep '\' | sort -n
 *
 * Relations already fully implemented are excluded by default.
 * The option -a shows all relations.
 *
 * If two arguments (parent and child) are given, a histogram
 * of the number of children of the kind in each parent is given
 * in addition to the normal output.
 *
 * Example usage:
 * statistics tgroup colspec < filenames.txt | grep colspec
 *
 * Synchronized with parse.c up to rev. 1.42.
*/ struct entry { char *parent; char *child; int count; }; static struct entry *table; static size_t tablesz; static size_t tablei; static char **stack; static size_t stacksz; static size_t stacki; static const int nchildsz = 8; struct nchild { char *parent; char *child; int freq[nchildsz]; int count; }; static struct nchild nchild; static char *fname; /* * Count one instance of a parent-child relation. * Before the special call table_add(NULL, NULL), * mark relations to not be counted; * in that phase, child can be NULL as a wildcard. */ static void table_add(const char *parent, const char *child) { static int init_done; size_t i; if (parent == NULL && child == NULL) { init_done = 1; return; } /* Optional parent-child histogram. */ if (init_done && parent != NULL && child != NULL && nchild.parent != NULL && nchild.child != NULL && strcmp(parent, nchild.parent) == 0 && strcmp(child, nchild.child) == 0) { if (nchild.count < nchildsz) { nchild.freq[nchild.count]++; if (nchild.count > 0) nchild.freq[nchild.count - 1]--; } else if (nchild.count == nchildsz) puts(fname); nchild.count++; } /* If the table entry already exists, increment its count. */ for (i = 0; i < tablei; i++) { if (strcmp(parent, table[i].parent) == 0 && (child == NULL || table[i].child == NULL || strcmp(child, table[i].child) == 0)) { assert(init_done); if (table[i].count != -1) table[i].count++; return; } } /* If the table is full, make room. */ if (tablei == tablesz) { tablesz += 64; table = xreallocarray(table, tablesz, sizeof(*table)); } /* Add a new entry to the table. */ table[tablei].parent = xstrdup(parent); table[tablei].child = child == NULL ? NULL : xstrdup(child); table[tablei++].count = init_done ? 1 : -1; } /* * Enter an element. 
*/ static void stack_push(const char *name) { if (nchild.parent != NULL && strcmp(name, nchild.parent) == 0) nchild.count = 0; if (stacki == stacksz) { stacksz += 8; stack = xreallocarray(stack, stacksz, sizeof(*stack)); } stack[stacki++] = xstrdup(name); } /* * Exit an element. */ static void stack_pop(const char *name) { if (stacki > 0 && (name == NULL || strcmp(name, stack[stacki - 1]) == 0)) free(stack[--stacki]); } /* * Simplified version from parse.c. */ static int advance(char *b, size_t rlen, size_t *pend, const char *charset) { int space; if (*charset == ' ') { space = 1; charset++; } else space = 0; while (*pend < rlen) { if (space && isspace((unsigned char)b[*pend])) break; if (strchr(charset, b[*pend]) != NULL) break; ++*pend; } if (*pend == rlen) { b[rlen] = '\0'; return 1; } else return 0; } /* * Simplified version from parse.c. */ static void parse_file(int fd, char *fname) { char b[4096]; char *cp; ssize_t rsz; /* Return value from read(2). */ size_t rlen; /* Number of bytes in b[]. */ size_t poff; /* Parse offset in b[]. */ size_t pend; /* Offset of the end of the current word. */ int in_tag, in_arg, in_quotes, in_doctype, elem_end; rlen = 0; in_tag = in_arg = in_quotes = in_doctype = 0; while ((rsz = read(fd, b + rlen, sizeof(b) - rlen - 1)) >= 0) { if ((rlen += rsz) == 0) break; pend = 0; for (;;) { if ((poff = pend) == rlen) break; if (isspace((unsigned char)b[pend])) { pend++; continue; } if (in_arg) { if (in_quotes == 0 && (b[pend] == '\'' || b[pend] == '"')) { in_quotes = b[pend] == '"' ? 2 : 1; pend++; continue; } if (advance(b, rlen, &pend, in_quotes == 2 ? "\"" : in_quotes == 1 ? 
"'" : " >") && rsz > 0) break; in_arg = in_quotes = elem_end = 0; if (b[pend] == '>') { in_tag = 0; if (pend > 0 && b[pend - 1] == '/') { b[pend - 1] = '\0'; elem_end = 1; } } b[pend] = '\0'; if (pend < rlen) pend++; if (elem_end) stack_pop(NULL); } else if (in_tag) { if (in_doctype && b[pend] == '[') { in_tag = in_doctype = 0; pend++; continue; } if (advance(b, rlen, &pend, " =>") && rsz > 0) break; elem_end = 0; switch (b[pend]) { case '>': in_tag = 0; if (pend > 0 && b[pend - 1] == '/') { b[pend - 1] = '\0'; elem_end = 1; } break; case '=': in_arg = 1; break; default: break; } b[pend] = '\0'; if (pend < rlen) pend++; if (elem_end) stack_pop(NULL); } else if (b[poff] == '<') { if (advance(b, rlen, &pend, " >") && rsz > 0) break; if (pend > poff + 3 && strncmp(b + poff, ""); if (cp == NULL) { pend = rlen; if (rsz > 0) break; } else pend = cp + 3 - b; continue; } elem_end = 0; if (b[pend] != '>') in_tag = 1; else if (pend > 0 && b[pend - 1] == '/') { b[pend - 1] = '\0'; elem_end = 1; } b[pend] = '\0'; if (pend < rlen) pend++; if (b[++poff] == '/') { elem_end = 1; poff++; } else if (strcasecmp(b + poff, "!DOCTYPE") == 0) { in_doctype = 1; } else if (b[poff] != '!' && b[poff] != '?') { table_add(stacki > 0 ? stack[stacki - 1] : "ROOT", b + poff); stack_push(b + poff); if (strcmp(b + poff, "sbr") == 0) elem_end = 1; } if (elem_end) stack_pop(b + poff); } else { advance(b, rlen, &pend, "<"); if (stacki > 0) table_add(stack[stacki - 1], "TEXT"); } } assert(poff > 0); rlen -= poff; memmove(b, b + poff, rlen); } if (rsz < 0) perror(fname); } int main(int argc, char *argv[]) { size_t fsz, i; ssize_t rsz; int ch, fd, show_all; show_all = 0; while ((ch = getopt(argc, argv, "a")) != -1) { switch (ch) { case 'a': show_all = 1; break; default: return 1; } } argc -= optind; argv += optind; if (argc > 1) { nchild.parent = argv[0]; nchild.child = argv[1]; } /* Exclude relations that are already fully implemented. 
*/ if (show_all == 0) { table_add("ROOT", "appendix"); table_add("ROOT", "article"); table_add("ROOT", "book"); table_add("ROOT", "chapter"); table_add("ROOT", "glossary"); table_add("ROOT", "part"); table_add("ROOT", "preface"); table_add("ROOT", "refentry"); table_add("ROOT", "reference"); table_add("ROOT", "sect1"); table_add("ROOT", "sect2"); table_add("abstract", NULL); table_add("acronym", "TEXT"); table_add("affiliation", "orgdiv"); table_add("affiliation", "orgname"); table_add("appendix", NULL); table_add("application", "TEXT"); table_add("arg", "option"); table_add("article", NULL); table_add("articleinfo", "abstract"); table_add("articleinfo", "author"); table_add("articleinfo", "authorgroup"); table_add("articleinfo", "copyright"); table_add("articleinfo", "date"); table_add("articleinfo", "legalnotice"); table_add("articleinfo", "pubdate"); table_add("articleinfo", "releaseinfo"); table_add("articleinfo", "subtitle"); table_add("articleinfo", "title"); table_add("author", "affiliation"); table_add("author", "contrib"); table_add("author", "email"); table_add("author", "firstname"); table_add("author", "othername"); table_add("author", "surname"); table_add("author", "TEXT"); table_add("authorgroup", "author"); table_add("authorgroup", "editor"); table_add("authorgroup", "othercredit"); table_add("blockquote", NULL); table_add("book", NULL); table_add("bookinfo", "abstract"); table_add("bookinfo", "authorgroup"); table_add("bookinfo", "copyright"); table_add("bookinfo", "legalnotice"); table_add("bookinfo", "pubdate"); table_add("bookinfo", "releaseinfo"); table_add("bookinfo", "subtitle"); table_add("bookinfo", "title"); table_add("caption", "TEXT"); table_add("chapter", NULL); table_add("citerefentry", "manvolnum"); table_add("citerefentry", "refentrytitle"); table_add("citetitle", "TEXT"); table_add("cmdsynopsis", "arg"); table_add("cmdsynopsis", "command"); table_add("cmdsynopsis", "group"); table_add("cmdsynopsis", "sbr"); table_add("code", 
"TEXT"); table_add("command", "TEXT"); table_add("computeroutput", "TEXT"); table_add("constant", "TEXT"); table_add("contrib", "TEXT"); table_add("copyright", "holder"); table_add("copyright", "year"); table_add("date", "TEXT"); table_add("editor", "affiliation"); table_add("editor", "firstname"); table_add("editor", "surname"); table_add("email", "TEXT"); table_add("emphasis", "errorname"); table_add("emphasis", "function"); table_add("emphasis", "TEXT"); table_add("entry", NULL); table_add("errorname", "TEXT"); table_add("figure", "mediaobject"); table_add("figure", "title"); table_add("filename", "TEXT"); table_add("firstname", "TEXT"); table_add("firstterm", "TEXT"); table_add("footnote", "para"); table_add("funcdef", "function"); table_add("funcdef", "TEXT"); table_add("funcprototype", "funcdef"); table_add("funcprototype", "paramdef"); table_add("funcsynopsis", "funcprototype"); table_add("funcsynopsis", "funcsynopsisinfo"); table_add("funcsynopsisinfo", "TEXT"); table_add("function", "replaceable"); table_add("function", "TEXT"); table_add("glossary", "glossdiv"); table_add("glossary", "glossentry"); table_add("glossdef", "para"); table_add("glossdiv", "glossentry"); table_add("glossentry", "glossdef"); table_add("glossentry", "glossterm"); table_add("glossentry", "indexterm"); table_add("glosslist", "glossentry"); table_add("glossterm", "emphasis"); table_add("glossterm", "TEXT"); table_add("group", "arg"); table_add("holder", "TEXT"); table_add("imageobject", "imagedata"); table_add("indexterm", "primary"); table_add("indexterm", "secondary"); table_add("informaltable", "tgroup"); table_add("itemizedlist", "listitem"); table_add("keycap", "TEXT"); table_add("keycode", "TEXT"); table_add("keycombo", "keycap"); table_add("keysym", "TEXT"); table_add("legalnotice", NULL); table_add("link", NULL); table_add("listitem", NULL); table_add("literal", "TEXT"); table_add("literallayout", NULL); table_add("manvolnum", "TEXT"); table_add("markup", "TEXT"); 
table_add("mediaobject", "caption"); table_add("mediaobject", "imageobject"); table_add("member", "constant"); table_add("member", "emphasis"); table_add("member", "function"); table_add("member", "property"); table_add("member", "symbol"); table_add("member", "TEXT"); table_add("note", NULL); table_add("olink", "citetitle"); table_add("olink", "function"); table_add("olink", "TEXT"); table_add("option", "parameter"); table_add("option", "replaceable"); table_add("option", "TEXT"); table_add("orderedlist", "listitem"); table_add("orgdiv", "TEXT"); table_add("orgname", "TEXT"); table_add("othercredit", "affiliation"); table_add("othercredit", "contrib"); table_add("othercredit", "email"); table_add("othercredit", "firstname"); table_add("othercredit", "othername"); table_add("othercredit", "surname"); table_add("othername", "TEXT"); table_add("para", NULL); table_add("paramdef", "parameter"); table_add("paramdef", "TEXT"); table_add("parameter", "TEXT"); table_add("part", NULL); table_add("personname", "firstname"); table_add("personname", "surname"); table_add("phrase", "TEXT"); table_add("preface", NULL); table_add("primary", NULL); table_add("productname", "TEXT"); table_add("programlisting", NULL); table_add("property", "TEXT"); table_add("pubdate", "TEXT"); table_add("quote", "command"); table_add("quote", "filename"); table_add("quote", "literal"); table_add("quote", "TEXT"); table_add("refentry", "refentryinfo"); table_add("refentry", "refmeta"); table_add("refentry", "refnamediv"); table_add("refentry", "refsect1"); table_add("refentry", "refsynopsisdiv"); table_add("refentryinfo", "author"); table_add("refentryinfo", "authorgroup"); table_add("refentryinfo", "copyright"); table_add("refentryinfo", "date"); table_add("refentryinfo", "productname"); table_add("refentrytitle", "TEXT"); table_add("reference", "refentry"); table_add("refmeta", "manvolnum"); table_add("refmeta", "refentrytitle"); table_add("refmeta", "refmiscinfo"); table_add("refmiscinfo", 
"TEXT"); table_add("refname", "TEXT"); table_add("refnamediv", "refname"); table_add("refnamediv", "refpurpose"); table_add("refpurpose", "TEXT"); table_add("refsect1", NULL); table_add("refsect2", NULL); table_add("refsynopsisdiv", "cmdsynopsis"); table_add("refsynopsisdiv", "funcsynopsis"); table_add("releaseinfo", "TEXT"); table_add("replaceable", "TEXT"); table_add("returnvalue", "TEXT"); table_add("row", "entry"); table_add("screen", NULL); table_add("secondary", NULL); table_add("section", NULL); table_add("sect1", NULL); table_add("sect2", NULL); table_add("sect3", NULL); table_add("sect4", NULL); table_add("sgmltag", "TEXT"); table_add("simpara", NULL); table_add("simplelist", "member"); table_add("simplesect", NULL); table_add("structfield", "TEXT"); table_add("structname", "TEXT"); table_add("subscript", "TEXT"); table_add("subtitle", "TEXT"); table_add("superscript", "emphasis"); table_add("superscript", "TEXT"); table_add("surname", "TEXT"); table_add("symbol", "TEXT"); table_add("synopsis", "function"); table_add("synopsis", "parameter"); table_add("synopsis", "type"); table_add("synopsis", "TEXT"); table_add("systemitem", "TEXT"); table_add("table", "tgroup"); table_add("table", "title"); table_add("tbody", "row"); table_add("term", NULL); table_add("tgroup", "colspec"); table_add("tgroup", "tbody"); table_add("tgroup", "thead"); table_add("thead", "row"); table_add("title", "acronym"); table_add("title", "emphasis"); table_add("title", "errorname"); table_add("title", "function"); table_add("title", "literal"); table_add("title", "quote"); table_add("title", "trademark"); table_add("title", "type"); table_add("title", "TEXT"); table_add("trademark", "TEXT"); table_add("type", "TEXT"); table_add("ulink", NULL); table_add("userinput", "TEXT"); table_add("variablelist", "varlistentry"); table_add("varlistentry", "listitem"); table_add("varlistentry", "term"); table_add("varname", "TEXT"); table_add("warning", NULL); table_add("year", "TEXT"); } 
table_add(NULL, NULL); /* Loop over input files. */ fd = -1; fname = NULL; while ((rsz = getline(&fname, &fsz, stdin)) != -1) { if (fname[rsz - 1] == '\n') fname[--rsz] = '\0'; if ((fd = open(fname, O_RDONLY, 0)) == -1) err(1, "%s", fname); parse_file(fd, fname); close(fd); } /* Cleanup and error handling. */ free(fname); if (ferror(stdin)) err(1, "standard input"); if (fd == -1) errx(1, "No input file names found on standard input"); /* Dump results. */ for (i = 0; i < tablei; i++) if (table[i].count != -1) printf("%d\t%s\t%s\n", table[i].count, table[i].parent, table[i].child); /* Optional parent-child histogram. */ if (nchild.parent != NULL) { printf("%s %s", nchild.parent, nchild.child); for (i = 0; i < nchildsz; i++) printf(" %d", nchild.freq[i]); putchar('\n'); } return 0; }