=================================================================== RCS file: /cvs/docbook2mdoc/statistics.c,v retrieving revision 1.2 retrieving revision 1.39 diff -u -p -r1.2 -r1.39 --- docbook2mdoc/statistics.c 2019/03/29 18:09:43 1.2 +++ docbook2mdoc/statistics.c 2019/04/29 02:00:50 1.39 @@ -1,4 +1,4 @@ -/* $Id: statistics.c,v 1.2 2019/03/29 18:09:43 schwarze Exp $ */ +/* $Id: statistics.c,v 1.39 2019/04/29 02:00:50 schwarze Exp $ */ /* * Copyright (c) 2019 Ingo Schwarze * @@ -14,6 +14,8 @@ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ +#include + #include #include #include @@ -24,6 +26,8 @@ #include #include +#include "xmalloc.h" + /* * Count parent-child element relations in a corpus of DocBook documents. * @@ -36,6 +40,18 @@ * Typical usage: * statistics < filenames.txt | sort -n * statistics < filenames.txt | grep '\' | sort -n + * + * Relations already fully implemented are excluded by default. + * The option -a shows all relations. + * + * If two arguments (parent and child) are given, a histogram + * of the number of children of the kind in each parent is given + * in addition to the normal output. + * + * Example usage: + * statistics tgroup colspec < filenames.txt | grep colspec + * + * Synchronized with parse.c up to rev. 1.42. */ struct entry { @@ -52,7 +68,18 @@ static char **stack; static size_t stacksz; static size_t stacki; +static const int nchildsz = 8; +struct nchild { + char *parent; + char *child; + int freq[nchildsz]; + int count; +}; +static struct nchild nchild; +static char *fname; + + /* * Count one instance of a parent-child relation. * Before the special call table_add(NULL, NULL), @@ -70,6 +97,21 @@ table_add(const char *parent, const char *child) return; } + /* Optional parent-child histogram. */ + + if (init_done && parent != NULL && child != NULL && + nchild.parent != NULL && nchild.child != NULL && + strcmp(parent, nchild.parent) == 0 && + strcmp(child, nchild.child) == 0) { + if (nchild.count < nchildsz) { + nchild.freq[nchild.count]++; + if (nchild.count > 0) + nchild.freq[nchild.count - 1]--; + } else if (nchild.count == nchildsz) + puts(fname); + nchild.count++; + } + /* If the table entry already exists, increment its count. */ for (i = 0; i < tablei; i++) { @@ -87,19 +129,13 @@ table_add(const char *parent, const char *child) if (tablei == tablesz) { tablesz += 64; - table = reallocarray(table, tablesz, sizeof(*table)); - if (table == NULL) - err(1, NULL); + table = xreallocarray(table, tablesz, sizeof(*table)); } /* Add a new entry to the table. */ - if ((table[tablei].parent = strdup(parent)) == NULL) - err(1, NULL); - if (child == NULL) - table[tablei].child = NULL; - else if ((table[tablei].child = strdup(child)) == NULL) - err(1, NULL); + table[tablei].parent = xstrdup(parent); + table[tablei].child = child == NULL ? NULL : xstrdup(child); table[tablei++].count = init_done ? 1 : -1; } @@ -109,14 +145,14 @@ table_add(const char *parent, const char *child) static void stack_push(const char *name) { + if (nchild.parent != NULL && strcmp(name, nchild.parent) == 0) + nchild.count = 0; + if (stacki == stacksz) { stacksz += 8; - stack = reallocarray(stack, stacksz, sizeof(*stack)); - if (stack == NULL) - err(1, NULL); + stack = xreallocarray(stack, stacksz, sizeof(*stack)); } - if ((stack[stacki++] = strdup(name)) == NULL) - err(1, NULL); + stack[stacki++] = xstrdup(name); } /* @@ -165,14 +201,15 @@ static void parse_file(int fd, char *fname) { char b[4096]; + char *cp; ssize_t rsz; /* Return value from read(2). */ size_t rlen; /* Number of bytes in b[]. */ size_t poff; /* Parse offset in b[]. */ size_t pend; /* Offset of the end of the current word. */ - int in_tag, in_arg, in_quotes, elem_end; + int in_tag, in_arg, in_quotes, in_doctype, elem_end; rlen = 0; - in_tag = in_arg = in_quotes = 0; + in_tag = in_arg = in_quotes = in_doctype = 0; while ((rsz = read(fd, b + rlen, sizeof(b) - rlen - 1)) >= 0) { if ((rlen += rsz) == 0) break; @@ -185,13 +222,15 @@ parse_file(int fd, char *fname) continue; } if (in_arg) { - if (in_quotes == 0 && b[pend] == '"') { - in_quotes = 1; + if (in_quotes == 0 && + (b[pend] == '\'' || b[pend] == '"')) { + in_quotes = b[pend] == '"' ? 2 : 1; pend++; continue; } if (advance(b, rlen, &pend, - in_quotes ? "\"" : " >") && rsz > 0) + in_quotes == 2 ? "\"" : + in_quotes == 1 ? "'" : " >") && rsz > 0) break; in_arg = in_quotes = elem_end = 0; if (b[pend] == '>') { @@ -207,6 +246,11 @@ parse_file(int fd, char *fname) if (elem_end) stack_pop(NULL); } else if (in_tag) { + if (in_doctype && b[pend] == '[') { + in_tag = in_doctype = 0; + pend++; + continue; + } if (advance(b, rlen, &pend, " =>") && rsz > 0) break; elem_end = 0; @@ -232,6 +276,18 @@ parse_file(int fd, char *fname) } else if (b[poff] == '<') { if (advance(b, rlen, &pend, " >") && rsz > 0) break; + if (pend > poff + 3 && + strncmp(b + poff, ""); + if (cp == NULL) { + pend = rlen; + if (rsz > 0) + break; + } else + pend = cp + 3 - b; + continue; + } elem_end = 0; if (b[pend] != '>') in_tag = 1; @@ -245,11 +301,16 @@ parse_file(int fd, char *fname) if (b[++poff] == '/') { elem_end = 1; poff++; + } else if (strcasecmp(b + poff, + "!DOCTYPE") == 0) { + in_doctype = 1; } else if (b[poff] != '!' && b[poff] != '?') { table_add(stacki > 0 ? stack[stacki - 1] : "ROOT", b + poff); stack_push(b + poff); + if (strcmp(b + poff, "sbr") == 0) + elem_end = 1; } if (elem_end) stack_pop(b + poff); @@ -260,8 +321,8 @@ parse_file(int fd, char *fname) } } assert(poff > 0); - memmove(b, b + poff, rlen - poff); rlen -= poff; + memmove(b, b + poff, rlen); } if (rsz < 0) perror(fname); @@ -270,7 +331,6 @@ parse_file(int fd, char *fname) int main(int argc, char *argv[]) { - char *fname; size_t fsz, i; ssize_t rsz; int ch, fd, show_all; @@ -285,10 +345,256 @@ main(int argc, char *argv[]) return 1; } } + argc -= optind; + argv += optind; + if (argc > 1) { + nchild.parent = argv[0]; + nchild.child = argv[1]; + } + /* Exclude relations that are already fully implemented. */ if (show_all == 0) { + table_add("ROOT", "appendix"); + table_add("ROOT", "article"); + table_add("ROOT", "book"); + table_add("ROOT", "chapter"); + table_add("ROOT", "glossary"); + table_add("ROOT", "part"); + table_add("ROOT", "preface"); + table_add("ROOT", "refentry"); + table_add("ROOT", "reference"); + table_add("ROOT", "sect1"); + table_add("ROOT", "sect2"); + table_add("abstract", NULL); + table_add("acronym", "TEXT"); + table_add("affiliation", "orgdiv"); + table_add("affiliation", "orgname"); + table_add("appendix", NULL); + table_add("application", "TEXT"); + table_add("arg", "option"); + table_add("article", NULL); + table_add("articleinfo", "abstract"); + table_add("articleinfo", "author"); + table_add("articleinfo", "authorgroup"); + table_add("articleinfo", "copyright"); + table_add("articleinfo", "date"); + table_add("articleinfo", "legalnotice"); + table_add("articleinfo", "pubdate"); + table_add("articleinfo", "releaseinfo"); + table_add("articleinfo", "subtitle"); + table_add("articleinfo", "title"); + table_add("author", "affiliation"); + table_add("author", "contrib"); + table_add("author", "email"); + table_add("author", "firstname"); + table_add("author", "othername"); + table_add("author", "surname"); + table_add("author", "TEXT"); + table_add("authorgroup", "author"); + table_add("authorgroup", "editor"); + table_add("authorgroup", "othercredit"); + table_add("blockquote", NULL); + table_add("book", NULL); + table_add("bookinfo", "abstract"); + table_add("bookinfo", "authorgroup"); + table_add("bookinfo", "copyright"); + table_add("bookinfo", "legalnotice"); + table_add("bookinfo", "pubdate"); + table_add("bookinfo", "releaseinfo"); + table_add("bookinfo", "subtitle"); + table_add("bookinfo", "title"); + table_add("caption", "TEXT"); + table_add("chapter", NULL); + table_add("citerefentry", "manvolnum"); + table_add("citerefentry", "refentrytitle"); + table_add("citetitle", "TEXT"); + table_add("cmdsynopsis", "arg"); + table_add("cmdsynopsis", "command"); + table_add("cmdsynopsis", "group"); + table_add("cmdsynopsis", "sbr"); + table_add("code", "TEXT"); + table_add("command", "TEXT"); + table_add("computeroutput", "TEXT"); + table_add("constant", "TEXT"); + table_add("contrib", "TEXT"); + table_add("copyright", "holder"); + table_add("copyright", "year"); + table_add("date", "TEXT"); + table_add("editor", "affiliation"); + table_add("editor", "firstname"); + table_add("editor", "surname"); + table_add("email", "TEXT"); + table_add("emphasis", "errorname"); + table_add("emphasis", "function"); + table_add("emphasis", "TEXT"); + table_add("entry", NULL); + table_add("errorname", "TEXT"); + table_add("figure", "mediaobject"); + table_add("figure", "title"); + table_add("filename", "TEXT"); + table_add("firstname", "TEXT"); + table_add("firstterm", "TEXT"); + table_add("footnote", "para"); + table_add("funcdef", "function"); + table_add("funcdef", "TEXT"); + table_add("funcprototype", "funcdef"); + table_add("funcprototype", "paramdef"); + table_add("funcsynopsis", "funcprototype"); + table_add("funcsynopsis", "funcsynopsisinfo"); + table_add("funcsynopsisinfo", "TEXT"); + table_add("function", "replaceable"); + table_add("function", "TEXT"); + table_add("glossary", "glossdiv"); + table_add("glossary", "glossentry"); + table_add("glossdef", "para"); + table_add("glossdiv", "glossentry"); + table_add("glossentry", "glossdef"); + table_add("glossentry", "glossterm"); + table_add("glossentry", "indexterm"); + table_add("glosslist", "glossentry"); + table_add("glossterm", "emphasis"); + table_add("glossterm", "TEXT"); + table_add("group", "arg"); + table_add("holder", "TEXT"); + table_add("imageobject", "imagedata"); + table_add("indexterm", "primary"); + table_add("indexterm", "secondary"); + table_add("informaltable", "tgroup"); + table_add("itemizedlist", "listitem"); + table_add("keycap", "TEXT"); + table_add("keycode", "TEXT"); + table_add("keycombo", "keycap"); + table_add("keysym", "TEXT"); + table_add("legalnotice", NULL); + table_add("link", NULL); + table_add("listitem", NULL); + table_add("literal", "TEXT"); + table_add("literallayout", NULL); + table_add("manvolnum", "TEXT"); + table_add("markup", "TEXT"); + table_add("mediaobject", "caption"); + table_add("mediaobject", "imageobject"); + table_add("member", "constant"); + table_add("member", "emphasis"); + table_add("member", "function"); + table_add("member", "property"); + table_add("member", "symbol"); + table_add("member", "TEXT"); + table_add("note", NULL); + table_add("olink", "citetitle"); + table_add("olink", "function"); + table_add("olink", "TEXT"); + table_add("option", "parameter"); + table_add("option", "replaceable"); + table_add("option", "TEXT"); + table_add("orderedlist", "listitem"); + table_add("orgdiv", "TEXT"); + table_add("orgname", "TEXT"); + table_add("othercredit", "affiliation"); + table_add("othercredit", "contrib"); + table_add("othercredit", "email"); + table_add("othercredit", "firstname"); + table_add("othercredit", "othername"); + table_add("othercredit", "surname"); + table_add("othername", "TEXT"); table_add("para", NULL); + table_add("paramdef", "parameter"); + table_add("paramdef", "TEXT"); + table_add("parameter", "TEXT"); + table_add("part", NULL); + table_add("personname", "firstname"); + table_add("personname", "surname"); + table_add("phrase", "TEXT"); + table_add("preface", NULL); + table_add("primary", NULL); + table_add("productname", "TEXT"); + table_add("programlisting", NULL); + table_add("property", "TEXT"); + table_add("pubdate", "TEXT"); + table_add("quote", "command"); + table_add("quote", "filename"); + table_add("quote", "literal"); + table_add("quote", "TEXT"); + table_add("refentry", "refentryinfo"); + table_add("refentry", "refmeta"); + table_add("refentry", "refnamediv"); + table_add("refentry", "refsect1"); + table_add("refentry", "refsynopsisdiv"); + table_add("refentryinfo", "author"); + table_add("refentryinfo", "authorgroup"); + table_add("refentryinfo", "copyright"); + table_add("refentryinfo", "date"); + table_add("refentryinfo", "productname"); + table_add("refentrytitle", "TEXT"); + table_add("reference", "refentry"); + table_add("refmeta", "manvolnum"); + table_add("refmeta", "refentrytitle"); + table_add("refmeta", "refmiscinfo"); + table_add("refmiscinfo", "TEXT"); + table_add("refname", "TEXT"); + table_add("refnamediv", "refname"); + table_add("refnamediv", "refpurpose"); + table_add("refpurpose", "TEXT"); + table_add("refsect1", NULL); + table_add("refsect2", NULL); + table_add("refsynopsisdiv", "cmdsynopsis"); + table_add("refsynopsisdiv", "funcsynopsis"); + table_add("releaseinfo", "TEXT"); + table_add("replaceable", "TEXT"); + table_add("returnvalue", "TEXT"); + table_add("row", "entry"); + table_add("screen", NULL); + table_add("secondary", NULL); + table_add("section", NULL); + table_add("sect1", NULL); + table_add("sect2", NULL); + table_add("sect3", NULL); + table_add("sect4", NULL); + table_add("sgmltag", "TEXT"); + table_add("simpara", NULL); + table_add("simplelist", "member"); + table_add("simplesect", NULL); + table_add("structfield", "TEXT"); + table_add("structname", "TEXT"); + table_add("subscript", "TEXT"); + table_add("subtitle", "TEXT"); + table_add("superscript", "emphasis"); + table_add("superscript", "TEXT"); + table_add("surname", "TEXT"); + table_add("symbol", "TEXT"); + table_add("synopsis", "function"); + table_add("synopsis", "parameter"); + table_add("synopsis", "type"); + table_add("synopsis", "TEXT"); + table_add("systemitem", "TEXT"); + table_add("table", "tgroup"); + table_add("table", "title"); + table_add("tbody", "row"); + table_add("term", NULL); + table_add("tgroup", "colspec"); + table_add("tgroup", "tbody"); + table_add("tgroup", "thead"); + table_add("thead", "row"); + table_add("title", "acronym"); + table_add("title", "emphasis"); + table_add("title", "errorname"); + table_add("title", "function"); + table_add("title", "literal"); + table_add("title", "quote"); + table_add("title", "trademark"); + table_add("title", "type"); + table_add("title", "TEXT"); + table_add("trademark", "TEXT"); + table_add("type", "TEXT"); + table_add("ulink", NULL); + table_add("userinput", "TEXT"); + table_add("variablelist", "varlistentry"); + table_add("varlistentry", "listitem"); + table_add("varlistentry", "term"); + table_add("varname", "TEXT"); + table_add("warning", NULL); + table_add("year", "TEXT"); } table_add(NULL, NULL); @@ -316,5 +622,13 @@ main(int argc, char *argv[]) if (table[i].count != -1) printf("%d\t%s\t%s\n", table[i].count, table[i].parent, table[i].child); + + /* Optional parent-child histogram. */ + if (nchild.parent != NULL) { + printf("%s %s", nchild.parent, nchild.child); + for (i = 0; i < nchildsz; i++) + printf(" %d", nchild.freq[i]); + putchar('\n'); + } return 0; }