File: [cvsweb.bsd.lv] / docbook2mdoc / statistics.c (download)
Revision 1.18, Sun Apr 7 19:33:27 2019 UTC (5 years, 5 months ago) by schwarze
Branch: MAIN
Changes since 1.17: +6 -1 lines
handle <appendix>, <article>, <book>, and <legalnotice> similar to <section>
|
/* $Id: statistics.c,v 1.18 2019/04/07 19:33:27 schwarze Exp $ */
/*
* Copyright (c) 2019 Ingo Schwarze <schwarze@openbsd.org>
*
* Permission to use, copy, modify, and distribute this software for any
* purpose with or without fee is hereby granted, provided that the above
* copyright notice and this permission notice appear in all copies.
*
* THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
* WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
* MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
* ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
* WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
* ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
* OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
*/
#include <assert.h>
#include <ctype.h>
#include <err.h>
#include <fcntl.h>
#include <getopt.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
/*
* Count parent-child element relations in a corpus of DocBook documents.
*
* Read absolute or relative input file names from standard input,
* one per line.
* For each parent-child relation, print the total number of occurrences,
* the parent name, and the child name, separated by tab characters
* and followed by a newline character.
*
* Typical usage:
* statistics < filenames.txt | sort -n
* statistics < filenames.txt | grep '\<listitem\>' | sort -n
*
* Relations already fully implemented are excluded by default.
* The option -a shows all relations.
*
* If two arguments (parent and child) are given, a histogram showing
* how many children of that kind each such parent contains is printed
* in addition to the normal output.
*
* Example usage:
* statistics tgroup colspec < filenames.txt | grep colspec
*/
/*
 * One parent-child relation together with its occurrence counter.
 */
struct entry {
	char	*parent;	/* Name of the enclosing element. */
	char	*child;		/* Name of the contained element;
				 * NULL acts as a wildcard. */
	int	 count;		/* Number of occurrences seen so far;
				 * -1 marks a relation that is excluded
				 * from counting and from the output. */
};
/* Relation table: grown in steps by table_add(). */
static struct entry *table;
/* Number of entries allocated for table[]. */
static size_t tablesz;
/* Number of entries of table[] that are in use. */
static size_t tablei;
/* Names of the currently open elements, innermost element last. */
static char **stack;
/* Number of pointers allocated for stack[]. */
static size_t stacksz;
/* Number of elements currently on the stack. */
static size_t stacki;
/*
 * Number of buckets in the optional parent-child histogram.
 * This has to be an integer constant expression because it is used
 * as an array dimension inside a struct; a "static const int" object
 * does not qualify as one in C, so an enumeration constant is used.
 */
enum { nchildsz = 8 };

/*
 * State of the optional histogram that records how many children
 * of one particular kind each parent of one particular kind contains.
 */
struct nchild {
	char	*parent;	/* Parent element name to watch, or NULL. */
	char	*child;		/* Child element name to watch, or NULL. */
	int	 freq[nchildsz]; /* freq[k]: number of parents containing
				 * exactly k + 1 matching children; the
				 * last bucket also collects all parents
				 * containing even more of them. */
	int	 count;		/* Matching children seen so far in the
				 * parent that was entered most recently. */
};
/* Histogram state; parent and child remain NULL unless requested. */
static struct nchild nchild;
/* Name of the input file being processed, as read by main(). */
static char *fname;
/*
* Count one instance of a parent-child relation.
* Before the special call table_add(NULL, NULL),
* mark relations to not be counted;
* in that phase, child can be NULL as a wildcard.
*/
static void
table_add(const char *parent, const char *child)
{
static int init_done;
size_t i;
if (parent == NULL && child == NULL) {
init_done = 1;
return;
}
/* Optional parent-child histogram. */
if (init_done && parent != NULL && child != NULL &&
nchild.parent != NULL && nchild.child != NULL &&
strcmp(parent, nchild.parent) == 0 &&
strcmp(child, nchild.child) == 0) {
if (nchild.count < nchildsz) {
nchild.freq[nchild.count]++;
if (nchild.count > 0)
nchild.freq[nchild.count - 1]--;
} else if (nchild.count == nchildsz)
puts(fname);
nchild.count++;
}
/* If the table entry already exists, increment its count. */
for (i = 0; i < tablei; i++) {
if (strcmp(parent, table[i].parent) == 0 &&
(child == NULL || table[i].child == NULL ||
strcmp(child, table[i].child) == 0)) {
assert(init_done);
if (table[i].count != -1)
table[i].count++;
return;
}
}
/* If the table is full, make room. */
if (tablei == tablesz) {
tablesz += 64;
table = reallocarray(table, tablesz, sizeof(*table));
if (table == NULL)
err(1, NULL);
}
/* Add a new entry to the table. */
if ((table[tablei].parent = strdup(parent)) == NULL)
err(1, NULL);
if (child == NULL)
table[tablei].child = NULL;
else if ((table[tablei].child = strdup(child)) == NULL)
err(1, NULL);
table[tablei++].count = init_done ? 1 : -1;
}
/*
 * Enter an element: put a private copy of its name on top of the
 * stack of open elements.  Entering the parent element selected for
 * the histogram restarts the child counter.
 * Memory allocation failure is fatal.
 */
static void
stack_push(const char *name)
{
	char	*copy;

	if (nchild.parent != NULL && strcmp(name, nchild.parent) == 0)
		nchild.count = 0;

	/* Make room for eight more names whenever the stack is full. */
	if (stacki == stacksz) {
		stacksz += 8;
		stack = reallocarray(stack, stacksz, sizeof(*stack));
		if (stack == NULL)
			err(1, NULL);
	}

	copy = strdup(name);
	if (copy == NULL)
		err(1, NULL);
	stack[stacki++] = copy;
}
/*
 * Exit an element: remove the innermost open element from the stack
 * and free its name.  If a name is given, the element is only removed
 * when the name matches; with a NULL name, it is removed in any case.
 * Nothing happens if the stack is empty.
 */
static void
stack_pop(const char *name)
{
	if (stacki == 0)
		return;
	if (name != NULL && strcmp(name, stack[stacki - 1]) != 0)
		return;
	stacki--;
	free(stack[stacki]);
}
/*
 * Simplified version from parse.c.
 *
 * Move *pend forward in the buffer b, which holds rlen bytes of data,
 * until it is at a byte contained in charset or at the end of the data.
 * A leading blank in charset requests stopping at any whitespace
 * character as well.  A NUL byte in the data always stops the scan.
 *
 * Return 1 if the end of the data was reached, in which case b[rlen]
 * is set to NUL, so the buffer needs room for that additional byte.
 * Return 0 otherwise.
 */
static int
advance(char *b, size_t rlen, size_t *pend, const char *charset)
{
	const int	 stop_at_space = (*charset == ' ');
	size_t		 pos;

	if (stop_at_space)
		charset++;

	for (pos = *pend; pos < rlen; pos++) {
		if (stop_at_space && isspace((unsigned char)b[pos]))
			break;
		if (strchr(charset, b[pos]) != NULL)
			break;
	}
	*pend = pos;

	if (pos != rlen)
		return 0;
	b[rlen] = '\0';
	return 1;
}
/*
 * Simplified version from parse.c.
 *
 * Read the file open on the descriptor fd in chunks and scan it for
 * markup.  Each opening tag counts one relation between the innermost
 * open element (or "ROOT" if there is none) and the new element and
 * then enters the new element.  Each run of non-whitespace text inside
 * an element counts one relation between that element and "TEXT".
 * Closing tags and self-closing tags leave the element again.
 * Comments, attributes, and tags starting with '!' or '?' are skipped.
 *
 * A token that is incomplete at the end of a chunk is moved to the
 * beginning of the buffer and completed by the next read(2), so a
 * single token has to fit into the buffer.
 *
 * The name fname is only used for reporting a read error.
 */
static void
parse_file(int fd, char *fname)
{
	char		 b[4096];
	char		*cp;
	ssize_t		 rsz;	/* Return value from read(2). */
	size_t		 rlen;	/* Number of bytes in b[]. */
	size_t		 poff;	/* Parse offset in b[]. */
	size_t		 pend;	/* Offset of the end of the current word. */
	int		 in_tag, in_arg, in_quotes, elem_end;

	rlen = 0;
	in_tag = in_arg = in_quotes = 0;

	/* One byte of b[] is always kept in reserve for a terminator. */
	while ((rsz = read(fd, b + rlen, sizeof(b) - rlen - 1)) >= 0) {
		if ((rlen += rsz) == 0)
			break;

		/*
		 * Terminate the valid data.  The comment skipper below
		 * uses strstr(3); without the terminator, it could search
		 * stale or uninitialized bytes beyond rlen and move pend
		 * past the end of the valid data.
		 */
		b[rlen] = '\0';

		pend = 0;
		for (;;) {
			if ((poff = pend) == rlen)
				break;
			if (isspace((unsigned char)b[pend])) {
				pend++;
				continue;
			}
			if (in_arg) {
				/* Attribute value: 1 = '...', 2 = "...". */
				if (in_quotes == 0 &&
				    (b[pend] == '\'' || b[pend] == '"')) {
					in_quotes = b[pend] == '"' ? 2 : 1;
					pend++;
					continue;
				}
				/* Incomplete value: wait for more input. */
				if (advance(b, rlen, &pend,
				    in_quotes == 2 ? "\"" :
				    in_quotes == 1 ? "'" : " >") && rsz > 0)
					break;
				in_arg = in_quotes = elem_end = 0;
				if (b[pend] == '>') {
					in_tag = 0;
					/* Self-closing element: "/>". */
					if (pend > 0 && b[pend - 1] == '/') {
						b[pend - 1] = '\0';
						elem_end = 1;
					}
				}
				b[pend] = '\0';
				if (pend < rlen)
					pend++;
				if (elem_end)
					stack_pop(NULL);
			} else if (in_tag) {
				/* Attribute name or end of the tag. */
				if (advance(b, rlen, &pend, " =>") && rsz > 0)
					break;
				elem_end = 0;
				switch (b[pend]) {
				case '>':
					in_tag = 0;
					if (pend > 0 && b[pend - 1] == '/') {
						b[pend - 1] = '\0';
						elem_end = 1;
					}
					break;
				case '=':
					in_arg = 1;
					break;
				default:
					break;
				}
				b[pend] = '\0';
				if (pend < rlen)
					pend++;
				if (elem_end)
					stack_pop(NULL);
			} else if (b[poff] == '<') {
				/* Start of a tag: isolate the element name. */
				if (advance(b, rlen, &pend, " >") && rsz > 0)
					break;
				if (pend > poff + 3 &&
				    strncmp(b + poff, "<!--", 4) == 0) {
					/* Skip a comment. */
					cp = strstr(b + pend - 2, "-->");
					if (cp == NULL) {
						/* Unfinished: read more. */
						pend = rlen;
						if (rsz > 0)
							break;
					} else
						pend = cp + 3 - b;
					continue;
				}
				elem_end = 0;
				if (b[pend] != '>')
					in_tag = 1;
				else if (pend > 0 && b[pend - 1] == '/') {
					b[pend - 1] = '\0';
					elem_end = 1;
				}
				b[pend] = '\0';
				if (pend < rlen)
					pend++;
				if (b[++poff] == '/') {
					/* Closing tag. */
					elem_end = 1;
					poff++;
				} else if (b[poff] != '!' && b[poff] != '?') {
					/* Opening tag: count, then enter. */
					table_add(stacki > 0 ?
					    stack[stacki - 1] : "ROOT",
					    b + poff);
					stack_push(b + poff);
				}
				if (elem_end)
					stack_pop(b + poff);
			} else {
				/* Text up to the next tag. */
				advance(b, rlen, &pend, "<");
				if (stacki > 0)
					table_add(stack[stacki - 1], "TEXT");
			}
		}

		/* Keep the unparsed rest for the next iteration. */
		assert(poff > 0);
		memmove(b, b + poff, rlen - poff);
		rlen -= poff;
	}
	if (rsz < 0)
		perror(fname);
}
/*
 * Parse the command line, register the relations to be excluded
 * unless -a is given, process all files named on standard input,
 * and print the resulting counts, followed by the histogram if
 * a parent and a child were given as arguments.
 * Return 0 on success and 1 for an invalid option; other errors
 * terminate the program by way of err(3) or errx(3).
 */
int
main(int argc, char *argv[])
{
	/*
	 * Relations that are already fully implemented.
	 * A NULL child excludes all children of the parent.
	 * The list must not contain entries matching each other
	 * because table_add() asserts that.
	 */
	static const struct {
		const char	*parent;
		const char	*child;
	} excluded[] = {
		{ "ROOT", "refentry" },
		{ "acronym", "TEXT" },
		{ "appendix", NULL },
		{ "article", NULL },
		{ "blockquote", NULL },
		{ "book", NULL },
		{ "chapter", NULL },
		{ "code", "TEXT" },
		{ "constant", "TEXT" },
		{ "emphasis", "TEXT" },
		{ "entry", NULL },
		{ "errorname", "TEXT" },
		{ "filename", "TEXT" },
		{ "funcdef", "function" },
		{ "funcdef", "TEXT" },
		{ "funcprototype", "funcdef" },
		{ "funcprototype", "paramdef" },
		{ "funcsynopsis", "funcprototype" },
		{ "funcsynopsis", "funcsynopsisinfo" },
		{ "funcsynopsisinfo", "TEXT" },
		{ "function", "TEXT" },
		{ "glossary", "glossdiv" },
		{ "glossary", "glossentry" },
		{ "glossdef", "para" },
		{ "glossdiv", "glossentry" },
		{ "glossentry", "glossdef" },
		{ "glossentry", "glossterm" },
		{ "glossentry", "indexterm" },
		{ "glosslist", "glossentry" },
		{ "glossterm", "TEXT" },
		{ "indexterm", "primary" },
		{ "indexterm", "secondary" },
		{ "informaltable", "tgroup" },
		{ "itemizedlist", "listitem" },
		{ "legalnotice", NULL },
		{ "link", NULL },
		{ "listitem", NULL },
		{ "literal", "TEXT" },
		{ "literallayout", NULL },
		{ "member", "TEXT" },
		{ "note", NULL },
		{ "orderedlist", "listitem" },
		{ "para", NULL },
		{ "paramdef", "parameter" },
		{ "paramdef", "TEXT" },
		{ "parameter", "TEXT" },
		{ "primary", NULL },
		{ "programlisting", NULL },
		{ "refentry", "refmeta" },
		{ "refentry", "refnamediv" },
		{ "refentry", "refsect1" },
		{ "refentry", "refsynopsisdiv" },
		{ "refmeta", "manvolnum" },
		{ "refmeta", "refentrytitle" },
		{ "refname", "TEXT" },
		{ "refnamediv", "refname" },
		{ "refnamediv", "refpurpose" },
		{ "refpurpose", "TEXT" },
		{ "refsect1", NULL },
		{ "refsect2", NULL },
		{ "refsynopsisdiv", "funcsynopsis" },
		{ "row", "entry" },
		{ "screen", NULL },
		{ "secondary", NULL },
		{ "section", NULL },
		{ "sect1", NULL },
		{ "sect2", NULL },
		{ "sect3", NULL },
		{ "sect4", NULL },
		{ "sgmltag", "TEXT" },
		{ "simpara", NULL },
		{ "simplelist", "member" },
		{ "structfield", "TEXT" },
		{ "structname", "TEXT" },
		{ "symbol", "TEXT" },
		{ "table", "tgroup" },
		{ "table", "title" },
		{ "tbody", "row" },
		{ "term", NULL },
		{ "tgroup", "colspec" },
		{ "tgroup", "tbody" },
		{ "tgroup", "thead" },
		{ "thead", "row" },
		{ "title", "TEXT" },
		{ "type", "TEXT" },
		{ "ulink", NULL },
		{ "userinput", "TEXT" },
		{ "variablelist", "varlistentry" },
		{ "varlistentry", "listitem" },
		{ "varlistentry", "term" },
	};
	size_t		 fsz, i;
	ssize_t		 rsz;
	int		 ch, fd, show_all;

	show_all = 0;
	while ((ch = getopt(argc, argv, "a")) != -1) {
		switch (ch) {
		case 'a':
			show_all = 1;
			break;
		default:
			return 1;
		}
	}
	argc -= optind;
	argv += optind;

	/* Two arguments request the parent-child histogram. */
	if (argc > 1) {
		nchild.parent = argv[0];
		nchild.child = argv[1];
	}

	/* Exclude relations that are already fully implemented. */
	if (show_all == 0)
		for (i = 0; i < sizeof(excluded) / sizeof(excluded[0]); i++)
			table_add(excluded[i].parent, excluded[i].child);

	/* End the exclusion phase; from now on, relations are counted. */
	table_add(NULL, NULL);

	/* Loop over input files. */
	fd = -1;
	fname = NULL;
	fsz = 0;	/* Do not hand an indeterminate size to getline(3). */
	while ((rsz = getline(&fname, &fsz, stdin)) != -1) {
		if (fname[rsz - 1] == '\n')
			fname[--rsz] = '\0';
		if ((fd = open(fname, O_RDONLY, 0)) == -1)
			err(1, "%s", fname);
		parse_file(fd, fname);
		close(fd);
	}

	/* Cleanup and error handling. */
	free(fname);
	if (ferror(stdin))
		err(1, "standard input");
	/* The descriptor still has its initial value if no line was read. */
	if (fd == -1)
		errx(1, "No input file names found on standard input");

	/* Dump results. */
	for (i = 0; i < tablei; i++)
		if (table[i].count != -1)
			printf("%d\t%s\t%s\n", table[i].count,
			    table[i].parent, table[i].child);

	/* Optional parent-child histogram. */
	if (nchild.parent != NULL) {
		printf("%s %s", nchild.parent, nchild.child);
		for (i = 0; i < (size_t)nchildsz; i++)
			printf(" %d", nchild.freq[i]);
		putchar('\n');
	}
	return 0;
}