[BACK]Return to statistics.c CVS log [TXT][DIR] Up to [cvsweb.bsd.lv] / docbook2mdoc

File: [cvsweb.bsd.lv] / docbook2mdoc / statistics.c (download)

Revision 1.37, Sun Apr 28 15:03:29 2019 UTC (4 years, 11 months ago) by schwarze
Branch: MAIN
Changes since 1.36: +10 -15 lines

In this program, there is never a need to survive memory allocation
failure, and there are many places allocating memory.  Consequently,
the code can be simplified by providing memory allocation functions
that error out on failure, in the conventional way.

/* $Id: statistics.c,v 1.37 2019/04/28 15:03:29 schwarze Exp $ */
/*
 * Copyright (c) 2019 Ingo Schwarze <schwarze@openbsd.org>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */
#include <sys/types.h>

#include <assert.h>
#include <ctype.h>
#include <err.h>
#include <fcntl.h>
#include <getopt.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include "xmalloc.h"

/*
 * Count parent-child element relations in a corpus of DocBook documents.
 *
 * Read absolute or relative input file names from standard input,
 * one per line. 
 * For each parent-child relation, print the total number of occurrences,
 * the parent name, and the child name, separated by tab characters
 * and followed by a newline character.
 *
 * Typical usage:
 * statistics < filenames.txt | sort -n
 * statistics < filenames.txt | grep '\<listitem\>' | sort -n
 *
 * Relations already fully implemented are excluded by default.
 * The option -a shows all relations.
 *
 * If two arguments (parent and child) are given, a histogram
 * of the number of children of that kind per parent element
 * is printed in addition to the normal output.
 *
 * Example usage:
 * statistics tgroup colspec < filenames.txt | grep colspec
 *
 * Synchronized with parse.c up to rev. 1.42.
 */

/*
 * One row of the relation table:
 * a parent-child pair together with its occurrence counter.
 */
struct entry {
	char	*parent;	/* Parent element name, copied by xstrdup(). */
	char	*child;		/* Child element name; NULL is a wildcard. */
	int	 count;		/* Occurrences; -1 marks an excluded relation. */
};

/* Relation table, grown in steps of 64 entries by table_add(). */
static struct entry	 *table;
static size_t		  tablesz;	/* Number of entries allocated. */
static size_t		  tablei;	/* Number of entries in use. */

/* Names of the currently open elements, innermost last. */
static char		**stack;
static size_t		  stacksz;	/* Number of slots allocated. */
static size_t		  stacki;	/* Number of slots in use. */

/*
 * Number of bins in the optional parent-child histogram.
 * This has to be an enumeration constant: in C, a "static const int"
 * object is not an integer constant expression, so using it as the
 * size of an array that is a struct member is a constraint violation.
 */
enum { nchildsz = 8 };

/*
 * State of the optional histogram counting how many children
 * of one kind occur in each parent of one kind.
 */
struct nchild {
	char	*parent;	/* Parent name to watch, or NULL if unused. */
	char	*child;		/* Child name to watch, or NULL if unused. */
	/*
	 * freq[k] is the number of parents seen with k + 1 matching
	 * children; the last bin also collects all larger numbers.
	 */
	int	 freq[nchildsz];
	int	 count;		/* Matching children in the current parent. */
};

/* Histogram state; parent and child remain NULL unless set in main(). */
static struct nchild	  nchild;
/* Line buffer filled by getline(3) in main(): the current file name. */
static char		 *fname;


/*
 * Count one instance of a parent-child relation.
 * Before the special call table_add(NULL, NULL),
 * mark relations to not be counted;
 * in that phase, child can be NULL as a wildcard.
 */
static void
table_add(const char *parent, const char *child)
{
	static int	 init_done;
	size_t		 i;

	if (parent == NULL && child == NULL) {
		init_done = 1;
		return;
	}

	/* Optional parent-child histogram. */

	if (init_done && parent != NULL && child != NULL &&
	    nchild.parent != NULL && nchild.child != NULL &&
	    strcmp(parent, nchild.parent) == 0 &&
	    strcmp(child, nchild.child) == 0) {
		if (nchild.count < nchildsz) {
			nchild.freq[nchild.count]++;
			if (nchild.count > 0)
				nchild.freq[nchild.count - 1]--;
		} else if (nchild.count == nchildsz)
			puts(fname);
		nchild.count++;
	}

	/* If the table entry already exists, increment its count. */

	for (i = 0; i < tablei; i++) {
		if (strcmp(parent, table[i].parent) == 0 &&
		    (child == NULL || table[i].child == NULL ||
		     strcmp(child, table[i].child) == 0)) {
			assert(init_done);
			if (table[i].count != -1)
				table[i].count++;
			return;
		}
	}

	/* If the table is full, make room. */

	if (tablei == tablesz) {
		tablesz += 64;
		table = xreallocarray(table, tablesz, sizeof(*table));
	}

	/* Add a new entry to the table. */

	table[tablei].parent = xstrdup(parent);
	table[tablei].child = child == NULL ? NULL : xstrdup(child);
	table[tablei++].count = init_done ? 1 : -1;
}

/*
 * Enter an element.
 */
static void
stack_push(const char *name)
{
	const int	 hist_parent = nchild.parent != NULL &&
			    strcmp(name, nchild.parent) == 0;

	/* A freshly opened histogram parent has no children yet. */
	if (hist_parent)
		nchild.count = 0;

	/* Grow the stack by eight slots whenever it is full. */
	if (stacksz == stacki) {
		stacksz += 8;
		stack = xreallocarray(stack, stacksz, sizeof(*stack));
	}

	/* The stack keeps its own copy of the element name. */
	stack[stacki] = xstrdup(name);
	stacki++;
}

/*
 * Exit an element.
 */
static void
stack_pop(const char *name)
{
	/* Nothing is open. */
	if (stacki == 0)
		return;

	/* A non-NULL name only closes the innermost element if it matches. */
	if (name != NULL && strcmp(name, stack[stacki - 1]) != 0)
		return;

	stacki--;
	free(stack[stacki]);
}

/*
 * Simplified version from parse.c.
 */
static int
advance(char *b, size_t rlen, size_t *pend, const char *charset)
{
	const int	 stop_at_space = *charset == ' ';
	size_t		 pos;

	/* A leading blank in charset means "any whitespace character". */
	if (stop_at_space)
		charset++;

	/* Scan forward from *pend until a delimiter or the end of data. */
	for (pos = *pend; pos < rlen; pos++) {
		if (stop_at_space && isspace((unsigned char)b[pos]))
			break;
		if (strchr(charset, b[pos]) != NULL)
			break;
	}
	*pend = pos;

	/* A delimiter was found (or the start was already past the end). */
	if (pos != rlen)
		return 0;

	/* The data ran out: terminate the buffer and report that. */
	b[rlen] = '\0';
	return 1;
}

/*
 * Simplified version from parse.c.
 */
/*
 * Scan one XML file and record every parent-child relation in it.
 * Simplified version from parse.c.
 *
 * fd is a descriptor open for reading; fname is only used to label
 * the error message when read(2) fails.
 *
 * The file is read in chunks into b[].  A word that is cut off at
 * the end of a chunk is moved to the front of the buffer and parsed
 * again after more data has been read.
 */
static void
parse_file(int fd, char *fname)
{
	char		 b[4096];
	char		*cp;
	ssize_t		 rsz;	/* Return value from read(2). */
	size_t		 rlen;  /* Number of bytes in b[]. */
	size_t		 poff;  /* Parse offset in b[]. */
	size_t		 pend;  /* Offset of the end of the current word. */
	int		 in_tag, in_arg, in_quotes, in_doctype, elem_end;

	rlen = 0;
	in_tag = in_arg = in_quotes = in_doctype = 0;
	/* One byte of b[] is always kept free for a terminating NUL. */
	while ((rsz = read(fd, b + rlen, sizeof(b) - rlen - 1)) >= 0) {
		if ((rlen += rsz) == 0)
			break;
		/*
		 * Terminate the data right away.  advance() only does
		 * so when it runs into the end of the data, but the
		 * strstr(3) call in the comment skipper below needs
		 * a terminated string in any case; otherwise, it may
		 * run into stale bytes beyond rlen.
		 */
		b[rlen] = '\0';
		pend = 0;
		for (;;) {
			if ((poff = pend) == rlen)
				break;
			if (isspace((unsigned char)b[pend])) {
				pend++;
				continue;
			}
			if (in_arg) {
				/* Attribute value; in_quotes: 1 = ', 2 = ". */
				if (in_quotes == 0 &&
				    (b[pend] == '\'' || b[pend] == '"')) {
					in_quotes = b[pend] == '"' ? 2 : 1;
					pend++;
					continue;
				}
				/* Incomplete word and not at EOF: read more. */
				if (advance(b, rlen, &pend,
				    in_quotes == 2 ? "\"" :
				    in_quotes == 1 ? "'" : " >") && rsz > 0)
					break;
				in_arg = in_quotes = elem_end = 0;
				if (b[pend] == '>') {
					in_tag = 0;
					/* "/>" closes the element at once. */
					if (pend > 0 && b[pend - 1] == '/') {
						b[pend - 1] = '\0';
						elem_end = 1;
					}
				}
				b[pend] = '\0';
				if (pend < rlen)
					pend++;
				if (elem_end)
					stack_pop(NULL);
			} else if (in_tag) {
				/* Attribute name or the end of the tag. */
				if (in_doctype && b[pend] == '[') {
					in_tag = in_doctype = 0;
					pend++;
					continue;
				}
				if (advance(b, rlen, &pend, " =>") && rsz > 0)
					break;
				elem_end = 0;
				switch (b[pend]) {
				case '>':
					in_tag = 0;
					if (pend > 0 && b[pend - 1] == '/') {
						b[pend - 1] = '\0';
						elem_end = 1;
					}
					break;
				case '=':
					in_arg = 1;
					break;
				default:
					break;
				}
				b[pend] = '\0';
				if (pend < rlen)
					pend++;
				if (elem_end)
					stack_pop(NULL);
			} else if (b[poff] == '<') {
				/* Start of a tag, comment, or declaration. */
				if (advance(b, rlen, &pend, " >") && rsz > 0)
					break;
				if (pend > poff + 3 &&
				    strncmp(b + poff, "<!--", 4) == 0) {
					/* Skip a comment. */
					cp = strstr(b + pend - 2, "-->");
					if (cp == NULL) {
						/*
						 * The end of the comment is
						 * not in the buffer yet.
						 */
						pend = rlen;
						if (rsz > 0)
							break;
					} else
						pend = cp + 3 - b;
					continue;
				}
				elem_end = 0;
				if (b[pend] != '>')
					in_tag = 1;
				else if (pend > 0 && b[pend - 1] == '/') {
					b[pend - 1] = '\0';
					elem_end = 1;
				}
				/* Turn the element name into a string. */
				b[pend] = '\0';
				if (pend < rlen)
					pend++;
				if (b[++poff] == '/') {
					/* Closing tag. */
					elem_end = 1;
					poff++;
				} else if (strcasecmp(b + poff,
				    "!DOCTYPE") == 0) {
					in_doctype = 1;
				} else if (b[poff] != '!' && b[poff] != '?') {
					/* Opening tag: count the relation. */
					table_add(stacki > 0 ?
					    stack[stacki - 1] : "ROOT",
					    b + poff);
					stack_push(b + poff);
					/* sbr is closed right away. */
					if (strcmp(b + poff, "sbr") == 0)
						elem_end = 1;
				}
				if (elem_end)
					stack_pop(b + poff);
			} else {
				/* Text up to the next tag. */
				advance(b, rlen, &pend, "<");
				if (stacki > 0)
					table_add(stack[stacki - 1], "TEXT");
			}
		}
		/*
		 * Keep the unparsed remainder for the next round.
		 * NOTE(review): a single word that fills the whole
		 * buffer leaves poff at 0 and trips this assertion.
		 */
		assert(poff > 0);
		rlen -= poff;
		memmove(b, b + poff, rlen);
	}
	if (rsz < 0)
		perror(fname);
}

/*
 * Usage: statistics [-a] [parent child] < filenames
 *
 * Read file names from standard input, one per line, parse each file,
 * and print one line per counted parent-child relation.
 * With -a, relations that are already implemented are counted, too.
 * With two arguments, also print a histogram for that relation.
 *
 * Returns 0 on success and 1 for an invalid option; exits with
 * status 1 via err(3) if a file cannot be opened, if reading
 * standard input fails, or if no file name was given.
 */
int
main(int argc, char *argv[])
{
	size_t		 fsz, i;
	ssize_t		 rsz;
	int		 ch, fd, show_all;

	show_all = 0;
	while ((ch = getopt(argc, argv, "a")) != -1) {
		switch (ch) {
		case 'a':
			show_all = 1;
			break;
		default:
			return 1;
		}
	}
	argc -= optind;
	argv += optind;

	if (argc > 1) {
		nchild.parent = argv[0];
		nchild.child = argv[1];
	}

	/*
	 * Exclude relations that are already fully implemented.
	 * Each relation may only be listed once here, or the
	 * assertion in table_add() triggers.
	 */
	if (show_all == 0) {
		table_add("ROOT", "appendix");
		table_add("ROOT", "article");
		table_add("ROOT", "book");
		table_add("ROOT", "chapter");
		table_add("ROOT", "glossary");
		table_add("ROOT", "part");
		table_add("ROOT", "preface");
		table_add("ROOT", "refentry");
		table_add("ROOT", "reference");
		table_add("ROOT", "sect1");
		table_add("ROOT", "sect2");
		table_add("acronym", "TEXT");
		table_add("affiliation", "orgdiv");
		table_add("affiliation", "orgname");
		table_add("appendix", NULL);
		table_add("application", "TEXT");
		table_add("arg", "option");
		table_add("article", NULL);
		table_add("articleinfo", "author");
		table_add("articleinfo", "authorgroup");
		table_add("articleinfo", "copyright");
		table_add("articleinfo", "date");
		table_add("articleinfo", "legalnotice");
		table_add("articleinfo", "pubdate");
		table_add("articleinfo", "releaseinfo");
		table_add("articleinfo", "subtitle");
		table_add("articleinfo", "title");
		table_add("author", "affiliation");
		table_add("author", "contrib");
		table_add("author", "email");
		table_add("author", "firstname");
		table_add("author", "othername");
		table_add("author", "surname");
		table_add("author", "TEXT");
		table_add("authorgroup", "author");
		table_add("authorgroup", "editor");
		table_add("authorgroup", "othercredit");
		table_add("blockquote", NULL);
		table_add("book", NULL);
		table_add("bookinfo", "authorgroup");
		table_add("bookinfo", "copyright");
		table_add("bookinfo", "legalnotice");
		table_add("bookinfo", "pubdate");
		table_add("bookinfo", "releaseinfo");
		table_add("bookinfo", "subtitle");
		table_add("bookinfo", "title");
		table_add("caption", "TEXT");
		table_add("chapter", NULL);
		table_add("citerefentry", "manvolnum");
		table_add("citerefentry", "refentrytitle");
		table_add("citetitle", "TEXT");
		table_add("cmdsynopsis", "arg");
		table_add("cmdsynopsis", "command");
		table_add("cmdsynopsis", "group");
		table_add("cmdsynopsis", "sbr");
		table_add("code", "TEXT");
		table_add("command", "TEXT");
		table_add("computeroutput", "TEXT");
		table_add("constant", "TEXT");
		table_add("contrib", "TEXT");
		table_add("copyright", "holder");
		table_add("copyright", "year");
		table_add("date", "TEXT");
		table_add("editor", "affiliation");
		table_add("editor", "firstname");
		table_add("editor", "surname");
		table_add("email", "TEXT");
		table_add("emphasis", "errorname");
		table_add("emphasis", "function");
		table_add("emphasis", "TEXT");
		table_add("entry", NULL);
		table_add("errorname", "TEXT");
		table_add("figure", "mediaobject");
		table_add("figure", "title");
		table_add("filename", "TEXT");
		table_add("firstname", "TEXT");
		table_add("firstterm", "TEXT");
		table_add("footnote", "para");
		table_add("funcdef", "function");
		table_add("funcdef", "TEXT");
		table_add("funcprototype", "funcdef");
		table_add("funcprototype", "paramdef");
		table_add("funcsynopsis", "funcprototype");
		table_add("funcsynopsis", "funcsynopsisinfo");
		table_add("funcsynopsisinfo", "TEXT");
		table_add("function", "replaceable");
		table_add("function", "TEXT");
		table_add("glossary", "glossdiv");
		table_add("glossary", "glossentry");
		table_add("glossdef", "para");
		table_add("glossdiv", "glossentry");
		table_add("glossentry", "glossdef");
		table_add("glossentry", "glossterm");
		table_add("glossentry", "indexterm");
		table_add("glosslist", "glossentry");
		table_add("glossterm", "emphasis");
		table_add("glossterm", "TEXT");
		table_add("group", "arg");
		table_add("holder", "TEXT");
		table_add("imageobject", "imagedata");
		table_add("indexterm", "primary");
		table_add("indexterm", "secondary");
		table_add("informaltable", "tgroup");
		table_add("itemizedlist", "listitem");
		table_add("keycap", "TEXT");
		table_add("keycode", "TEXT");
		table_add("keysym", "TEXT");
		table_add("legalnotice", NULL);
		table_add("link", NULL);
		table_add("listitem", NULL);
		table_add("literal", "TEXT");
		table_add("literallayout", NULL);
		table_add("manvolnum", "TEXT");
		table_add("markup", "TEXT");
		table_add("mediaobject", "caption");
		table_add("mediaobject", "imageobject");
		table_add("member", "constant");
		table_add("member", "emphasis");
		table_add("member", "function");
		table_add("member", "property");
		table_add("member", "symbol");
		table_add("member", "TEXT");
		table_add("note", NULL);
		table_add("olink", "citetitle");
		table_add("olink", "function");
		table_add("olink", "TEXT");
		table_add("option", "parameter");
		table_add("option", "replaceable");
		table_add("option", "TEXT");
		table_add("orderedlist", "listitem");
		table_add("orgdiv", "TEXT");
		table_add("orgname", "TEXT");
		table_add("othercredit", "affiliation");
		table_add("othercredit", "contrib");
		table_add("othercredit", "email");
		table_add("othercredit", "firstname");
		table_add("othercredit", "othername");
		table_add("othercredit", "surname");
		table_add("othername", "TEXT");
		table_add("para", NULL);
		table_add("paramdef", "parameter");
		table_add("paramdef", "TEXT");
		table_add("parameter", "TEXT");
		table_add("part", NULL);
		table_add("personname", "firstname");
		table_add("personname", "surname");
		table_add("phrase", "TEXT");
		table_add("preface", NULL);
		table_add("primary", NULL);
		table_add("productname", "TEXT");
		table_add("programlisting", NULL);
		table_add("property", "TEXT");
		table_add("pubdate", "TEXT");
		table_add("quote", "command");
		table_add("quote", "filename");
		table_add("quote", "literal");
		table_add("quote", "TEXT");
		table_add("refentry", "refentryinfo");
		table_add("refentry", "refmeta");
		table_add("refentry", "refnamediv");
		table_add("refentry", "refsect1");
		table_add("refentry", "refsynopsisdiv");
		table_add("refentryinfo", "author");
		table_add("refentryinfo", "authorgroup");
		table_add("refentryinfo", "copyright");
		table_add("refentryinfo", "date");
		table_add("refentryinfo", "productname");
		table_add("refentrytitle", "TEXT");
		table_add("reference", "refentry");
		table_add("refmeta", "manvolnum");
		table_add("refmeta", "refentrytitle");
		table_add("refmeta", "refmiscinfo");
		table_add("refmiscinfo", "TEXT");
		table_add("refname", "TEXT");
		table_add("refnamediv", "refname");
		table_add("refnamediv", "refpurpose");
		table_add("refpurpose", "TEXT");
		table_add("refsect1", NULL);
		table_add("refsect2", NULL);
		table_add("refsynopsisdiv", "cmdsynopsis");
		table_add("refsynopsisdiv", "funcsynopsis");
		table_add("releaseinfo", "TEXT");
		table_add("replaceable", "TEXT");
		table_add("returnvalue", "TEXT");
		table_add("row", "entry");
		table_add("screen", NULL);
		table_add("secondary", NULL);
		table_add("section", NULL);
		table_add("sect1", NULL);
		table_add("sect2", NULL);
		table_add("sect3", NULL);
		table_add("sect4", NULL);
		table_add("sgmltag", "TEXT");
		table_add("simpara", NULL);
		table_add("simplelist", "member");
		table_add("simplesect", NULL);
		table_add("structfield", "TEXT");
		table_add("structname", "TEXT");
		table_add("subscript", "TEXT");
		table_add("subtitle", "TEXT");
		table_add("superscript", "emphasis");
		table_add("superscript", "TEXT");
		table_add("surname", "TEXT");
		table_add("symbol", "TEXT");
		table_add("synopsis", "function");
		table_add("synopsis", "parameter");
		table_add("synopsis", "type");
		table_add("synopsis", "TEXT");
		table_add("systemitem", "TEXT");
		table_add("table", "tgroup");
		table_add("table", "title");
		table_add("tbody", "row");
		table_add("term", NULL);
		table_add("tgroup", "colspec");
		table_add("tgroup", "tbody");
		table_add("tgroup", "thead");
		table_add("thead", "row");
		table_add("title", "acronym");
		table_add("title", "emphasis");
		table_add("title", "errorname");
		table_add("title", "function");
		table_add("title", "literal");
		table_add("title", "quote");
		table_add("title", "trademark");
		table_add("title", "type");
		table_add("title", "TEXT");
		table_add("trademark", "TEXT");
		table_add("type", "TEXT");
		table_add("ulink", NULL);
		table_add("userinput", "TEXT");
		table_add("variablelist", "varlistentry");
		table_add("varlistentry", "listitem");
		table_add("varlistentry", "term");
		table_add("varname", "TEXT");
		table_add("warning", NULL);
		table_add("year", "TEXT");
	}
	/* End of the exclusion phase; from now on, relations are counted. */
	table_add(NULL, NULL);

	/*
	 * Loop over input files.
	 * Start getline(3) with an empty buffer of size zero rather
	 * than handing it an uninitialized size.
	 */
	fd = -1;
	fname = NULL;
	fsz = 0;
	while ((rsz = getline(&fname, &fsz, stdin)) != -1) {
		if (fname[rsz - 1] == '\n')
			fname[--rsz] = '\0';
		if ((fd = open(fname, O_RDONLY, 0)) == -1)
			err(1, "%s", fname);
		parse_file(fd, fname);
		close(fd);
	}

	/* Cleanup and error handling. */
	free(fname);
	fname = NULL;	/* Do not leave the global pointer dangling. */
	if (ferror(stdin))
		err(1, "standard input");
	/* fd still has its initial value if no line was read at all. */
	if (fd == -1)
		errx(1, "No input file names found on standard input");

	/* Dump results, skipping the excluded relations. */
	for (i = 0; i < tablei; i++)
		if (table[i].count != -1)
			printf("%d\t%s\t%s\n", table[i].count,
			    table[i].parent, table[i].child);

	/* Optional parent-child histogram. */
	if (nchild.parent != NULL) {
		printf("%s %s", nchild.parent, nchild.child);
		for (i = 0; i < nchildsz; i++)
			printf(" %d", nchild.freq[i]);
		putchar('\n');
	}
	return 0;
}