/*
 * Retrieved from the cvsweb.bsd.lv CVS web interface:
 * docbook2mdoc / statistics.c
 *
 * Revision 1.41, Thu May 2 11:58:18 2019 UTC, by schwarze
 * Branch: MAIN
 * CVS Tags: VERSION_1_1_0, HEAD
 * Changes since 1.40: +3 -1 lines
 *
 * Log message: ignore <jobtitle> and <orgdiv>
 */

/* $Id: statistics.c,v 1.41 2019/05/02 11:58:18 schwarze Exp $ */
/*
 * Copyright (c) 2019 Ingo Schwarze <schwarze@openbsd.org>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */
#include <sys/types.h>

#include <assert.h>
#include <ctype.h>
#include <err.h>
#include <fcntl.h>
#include <getopt.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include "xmalloc.h"

/*
 * Count parent-child element relations in a corpus of DocBook documents.
 *
 * Read absolute or relative input file names from standard input,
 * one per line. 
 * For each parent-child relation, print the total number of occurrences,
 * the parent name, and the child name, separated by tab characters
 * and followed by a newline character.
 *
 * Typical usage:
 * statistics < filenames.txt | sort -n
 * statistics < filenames.txt | grep '\<listitem\>' | sort -n
 *
 * Relations already fully implemented are excluded by default.
 * The option -a shows all relations.
 *
 * If two arguments (parent and child) are given, a histogram of the
 * number of children of that kind found in each parent element
 * is printed in addition to the normal output.
 *
 * Example usage:
 * statistics tgroup colspec < filenames.txt | grep colspec
 *
 * Synchronized with parse.c up to rev. 1.42.
 */

/*
 * One parent-child relation together with its occurrence count.
 */
struct entry {
	char	*parent;	/* Name of the containing element. */
	char	*child;		/* Name of the child; NULL is a wildcard. */
	int	 count;		/* Occurrences; -1 excludes it from output. */
};

/* Relation table: array, allocated entries, and entries in use. */
static struct entry	 *table;
static size_t		  tablesz;
static size_t		  tablei;

/* Names of the currently open elements: array, allocated, in use. */
static char		**stack;
static size_t		  stacksz;
static size_t		  stacki;

/*
 * State of the optional histogram counting how many children of
 * one kind occur in each parent of one kind.
 *
 * The array size is an enumeration constant because C requires
 * an integer constant expression for the size of an array that is
 * a structure member; a "static const int" object is not one.
 */
enum { nchildsz = 8 };
struct nchild {
	char	*parent;	/* Parent element name to watch. */
	char	*child;		/* Child element name to watch. */
	int	 freq[nchildsz];	/* freq[n]: parents with n+1 children; */
				/* the last slot also covers more. */
	int	 count;		/* Children seen in the current parent. */
};

static struct nchild	  nchild;	/* Optional histogram state. */
static char		 *fname;	/* Name of the current input file. */


/*
 * Count one instance of a parent-child relation.
 *
 * The special call table_add(NULL, NULL) ends the setup phase.
 * Relations passed in before that call are recorded with a count
 * of -1, which keeps them from being counted later; in that phase,
 * child may be NULL to match every child of the parent.
 * After the special call, a relation that is already in the table
 * gets its count incremented unless it is marked -1, and a relation
 * seen for the first time is appended with a count of 1.
 *
 * When a histogram was requested, relations matching nchild.parent
 * and nchild.child also move the current parent up by one slot in
 * nchild.freq[]; once a parent exceeds the last slot, the name of
 * the current input file is printed once for that parent.
 */
static void
table_add(const char *parent, const char *child)
{
	static int	 init_done;
	size_t		 n;

	/* End of the setup phase. */

	if (parent == NULL && child == NULL) {
		init_done = 1;
		return;
	}

	/* Optional parent-child histogram. */

	if (init_done && parent != NULL && child != NULL &&
	    nchild.parent != NULL && nchild.child != NULL &&
	    strcmp(parent, nchild.parent) == 0 &&
	    strcmp(child, nchild.child) == 0) {
		if (nchild.count == nchildsz)
			puts(fname);
		else if (nchild.count < nchildsz) {
			if (nchild.count > 0)
				nchild.freq[nchild.count - 1]--;
			nchild.freq[nchild.count]++;
		}
		nchild.count++;
	}

	/* Look for an existing entry; a NULL child matches anything. */

	for (n = 0; n < tablei; n++) {
		if (strcmp(parent, table[n].parent) != 0)
			continue;
		if (child != NULL && table[n].child != NULL &&
		    strcmp(child, table[n].child) != 0)
			continue;
		assert(init_done);
		if (table[n].count != -1)
			table[n].count++;
		return;
	}

	/* Not found: grow the table if needed and append an entry. */

	if (tablei == tablesz) {
		tablesz += 64;
		table = xreallocarray(table, tablesz, sizeof(*table));
	}
	n = tablei++;
	table[n].parent = xstrdup(parent);
	table[n].child = child != NULL ? xstrdup(child) : NULL;
	table[n].count = init_done ? 1 : -1;
}

/*
 * Enter an element: put a copy of its name on top of the stack,
 * growing the stack by eight slots whenever it is full.
 * Entering the parent element selected for the histogram
 * restarts the count of children seen in it.
 */
static void
stack_push(const char *name)
{
	if (stacki == stacksz) {
		stacksz += 8;
		stack = xreallocarray(stack, stacksz, sizeof(*stack));
	}
	stack[stacki] = xstrdup(name);
	stacki++;

	if (nchild.parent != NULL && strcmp(name, nchild.parent) == 0)
		nchild.count = 0;
}

/*
 * Exit an element: drop the topmost name from the stack.
 * With a non-NULL name, nothing is done unless the topmost
 * element has that name; a NULL name pops unconditionally.
 * Popping an empty stack is a no-op.
 */
static void
stack_pop(const char *name)
{
	if (stacki == 0)
		return;
	if (name != NULL && strcmp(name, stack[stacki - 1]) != 0)
		return;
	stacki--;
	free(stack[stacki]);
}

/*
 * Simplified version from parse.c.
 *
 * Move *pend forward in b[], which holds rlen bytes of data, until
 * it is at a byte contained in charset or at the end of the data.
 * If charset starts with a blank, any whitespace byte ends the
 * word, too.  Because strchr(3) also finds the terminator of
 * charset, a NUL byte in the data ends the word as well.
 * Return 1 if the end of the data was reached, in which case
 * b[rlen] is set to NUL, or 0 otherwise.
 */
static int
advance(char *b, size_t rlen, size_t *pend, const char *charset)
{
	const int	 stop_at_space = charset[0] == ' ';
	const char	*delims = stop_at_space ? charset + 1 : charset;
	size_t		 pos;

	for (pos = *pend; pos < rlen; pos++) {
		if (stop_at_space && isspace((unsigned char)b[pos]))
			break;
		if (strchr(delims, b[pos]) != NULL)
			break;
	}
	*pend = pos;

	if (pos != rlen)
		return 0;
	b[rlen] = '\0';
	return 1;
}

/*
 * Simplified version from parse.c.
 *
 * Read the file open on fd in chunks and scan it for markup.
 * For every opening tag, count the relation between the innermost
 * open element (or "ROOT" if no element is open) and the new
 * element with table_add(), then push the new element on the stack.
 * For every run of text inside an open element, count a relation
 * with the pseudo-child "TEXT".
 * Closing tags, self-closing tags, and <sbr> pop the stack.
 * Attribute names and values, comments, and markup starting
 * with '!' or '?' are skipped without being counted.
 * The fname argument is only used to report a read(2) failure.
 */
static void
parse_file(int fd, char *fname)
{
	char		 b[4096];
	char		*cp;
	ssize_t		 rsz;	/* Return value from read(2). */
	size_t		 rlen;  /* Number of bytes in b[]. */
	size_t		 poff;  /* Parse offset in b[]. */
	size_t		 pend;  /* Offset of the end of the current word. */
	int		 in_tag, in_arg, in_quotes, in_doctype, elem_end;

	rlen = 0;
	in_tag = in_arg = in_quotes = in_doctype = 0;

	/* The last byte of b[] is reserved for a terminating NUL. */
	while ((rsz = read(fd, b + rlen, sizeof(b) - rlen - 1)) >= 0) {
		if ((rlen += rsz) == 0)
			break;

		/*
		 * Terminate the valid data right away.
		 * The comment handling below calls strstr(3) on b[];
		 * without the terminator, it would search stale or
		 * uninitialized bytes beyond rlen and could move
		 * pend past the end of the data.
		 */
		b[rlen] = '\0';

		pend = 0;
		for (;;) {
			if ((poff = pend) == rlen)
				break;
			if (isspace((unsigned char)b[pend])) {
				pend++;
				continue;
			}
			if (in_arg) {
				/* Attribute value, optionally quoted. */
				if (in_quotes == 0 &&
				    (b[pend] == '\'' || b[pend] == '"')) {
					in_quotes = b[pend] == '"' ? 2 : 1;
					pend++;
					continue;
				}
				/* Incomplete word: wait for more data. */
				if (advance(b, rlen, &pend,
				    in_quotes == 2 ? "\"" :
				    in_quotes == 1 ? "'" : " >") && rsz > 0)
					break;
				in_arg = in_quotes = elem_end = 0;
				if (b[pend] == '>') {
					in_tag = 0;
					if (pend > 0 && b[pend - 1] == '/') {
						b[pend - 1] = '\0';
						elem_end = 1;
					}
				}
				b[pend] = '\0';
				if (pend < rlen)
					pend++;
				if (elem_end)
					stack_pop(NULL);
			} else if (in_tag) {
				/* Attribute name or end of the tag. */
				if (in_doctype && b[pend] == '[') {
					in_tag = in_doctype = 0;
					pend++;
					continue;
				}
				if (advance(b, rlen, &pend, " =>") && rsz > 0)
					break;
				elem_end = 0;
				switch (b[pend]) {
				case '>':
					in_tag = 0;
					if (pend > 0 && b[pend - 1] == '/') {
						b[pend - 1] = '\0';
						elem_end = 1;
					}
					break;
				case '=':
					in_arg = 1;
					break;
				default:
					break;
				}
				b[pend] = '\0';
				if (pend < rlen)
					pend++;
				if (elem_end)
					stack_pop(NULL);
			} else if (b[poff] == '<') {
				/* Start of a tag, comment, or declaration. */
				if (advance(b, rlen, &pend, " >") && rsz > 0)
					break;
				if (pend > poff + 3 &&
				    strncmp(b + poff, "<!--", 4) == 0) {
					/* Skip a comment. */
					cp = strstr(b + pend - 2, "-->");
					if (cp == NULL) {
						pend = rlen;
						if (rsz > 0)
							break;
					} else
						pend = cp + 3 - b;
					continue;
				}
				elem_end = 0;
				if (b[pend] != '>')
					in_tag = 1;
				else if (pend > 0 && b[pend - 1] == '/') {
					b[pend - 1] = '\0';
					elem_end = 1;
				}
				b[pend] = '\0';
				if (pend < rlen)
					pend++;
				if (b[++poff] == '/') {
					elem_end = 1;
					poff++;
				} else if (strcasecmp(b + poff,
				    "!DOCTYPE") == 0) {
					in_doctype = 1;
				} else if (b[poff] != '!' && b[poff] != '?') {
					table_add(stacki > 0 ?
					    stack[stacki - 1] : "ROOT",
					    b + poff);
					stack_push(b + poff);
					/* <sbr> never has a closing tag. */
					if (strcmp(b + poff, "sbr") == 0)
						elem_end = 1;
				}
				if (elem_end)
					stack_pop(b + poff);
			} else {
				/* Text; the return value does not matter. */
				advance(b, rlen, &pend, "<");

				/*
				 * advance() also stops at NUL bytes.
				 * Step over a NUL byte in the input,
				 * or this loop would never terminate.
				 */
				if (pend == poff)
					pend++;
				if (stacki > 0)
					table_add(stack[stacki - 1], "TEXT");
			}
		}

		/*
		 * Keep the unparsed rest for the next round.
		 * A word that fills all of b[] is not supported.
		 */
		assert(poff > 0);
		rlen -= poff;
		memmove(b, b + poff, rlen);
	}
	if (rsz < 0)
		perror(fname);
}

/*
 * Parse the command line, set up the list of relations that are
 * not to be reported, parse every file named on standard input,
 * and print the resulting counts, followed by the histogram
 * if a parent and a child name were given as arguments.
 */
int
main(int argc, char *argv[])
{
	/*
	 * Relations that are already fully implemented, as pairs
	 * of parent and child; a NULL child matches any child.
	 * The entries are handed to table_add() in this order.
	 */
	static const char *const done[][2] = {
		{ "ROOT", "appendix" },
		{ "ROOT", "article" },
		{ "ROOT", "book" },
		{ "ROOT", "chapter" },
		{ "ROOT", "glossary" },
		{ "ROOT", "part" },
		{ "ROOT", "preface" },
		{ "ROOT", "refentry" },
		{ "ROOT", "reference" },
		{ "ROOT", "sect1" },
		{ "ROOT", "sect2" },
		{ "abstract", NULL },
		{ "acronym", "TEXT" },
		{ "affiliation", "jobtitle" },
		{ "affiliation", "orgdiv" },
		{ "affiliation", "orgname" },
		{ "appendix", NULL },
		{ "application", "TEXT" },
		{ "arg", "option" },
		{ "article", NULL },
		{ "articleinfo", "abstract" },
		{ "articleinfo", "author" },
		{ "articleinfo", "authorgroup" },
		{ "articleinfo", "copyright" },
		{ "articleinfo", "date" },
		{ "articleinfo", "legalnotice" },
		{ "articleinfo", "pubdate" },
		{ "articleinfo", "releaseinfo" },
		{ "articleinfo", "subtitle" },
		{ "articleinfo", "title" },
		{ "author", "affiliation" },
		{ "author", "contrib" },
		{ "author", "email" },
		{ "author", "firstname" },
		{ "author", "othername" },
		{ "author", "surname" },
		{ "author", "TEXT" },
		{ "authorgroup", "author" },
		{ "authorgroup", "editor" },
		{ "authorgroup", "othercredit" },
		{ "blockquote", NULL },
		{ "book", NULL },
		{ "bookinfo", "abstract" },
		{ "bookinfo", "authorgroup" },
		{ "bookinfo", "copyright" },
		{ "bookinfo", "legalnotice" },
		{ "bookinfo", "pubdate" },
		{ "bookinfo", "releaseinfo" },
		{ "bookinfo", "subtitle" },
		{ "bookinfo", "title" },
		{ "caption", "TEXT" },
		{ "chapter", NULL },
		{ "citerefentry", "manvolnum" },
		{ "citerefentry", "refentrytitle" },
		{ "citetitle", "TEXT" },
		{ "cmdsynopsis", "arg" },
		{ "cmdsynopsis", "command" },
		{ "cmdsynopsis", "group" },
		{ "cmdsynopsis", "sbr" },
		{ "code", "TEXT" },
		{ "command", "TEXT" },
		{ "computeroutput", "TEXT" },
		{ "constant", "TEXT" },
		{ "contrib", "TEXT" },
		{ "copyright", "holder" },
		{ "copyright", "year" },
		{ "date", "TEXT" },
		{ "editor", "affiliation" },
		{ "editor", "firstname" },
		{ "editor", "surname" },
		{ "email", "TEXT" },
		{ "emphasis", "errorname" },
		{ "emphasis", "function" },
		{ "emphasis", "TEXT" },
		{ "entry", NULL },
		{ "errorname", "TEXT" },
		{ "figure", "mediaobject" },
		{ "figure", "title" },
		{ "filename", "TEXT" },
		{ "firstname", "TEXT" },
		{ "firstterm", "TEXT" },
		{ "footnote", "para" },
		{ "funcdef", "function" },
		{ "funcdef", "TEXT" },
		{ "funcparams", "TEXT" },
		{ "funcprototype", "funcdef" },
		{ "funcprototype", "paramdef" },
		{ "funcprototype", "void" },
		{ "funcsynopsis", "funcprototype" },
		{ "funcsynopsis", "funcsynopsisinfo" },
		{ "funcsynopsisinfo", "TEXT" },
		{ "function", "replaceable" },
		{ "function", "TEXT" },
		{ "glossary", "glossdiv" },
		{ "glossary", "glossentry" },
		{ "glossdef", "para" },
		{ "glossdiv", "glossentry" },
		{ "glossentry", "glossdef" },
		{ "glossentry", "glossterm" },
		{ "glossentry", "indexterm" },
		{ "glosslist", "glossentry" },
		{ "glossterm", "emphasis" },
		{ "glossterm", "TEXT" },
		{ "group", "arg" },
		{ "holder", "TEXT" },
		{ "imageobject", "imagedata" },
		{ "indexterm", "primary" },
		{ "indexterm", "secondary" },
		{ "informaltable", "tgroup" },
		{ "itemizedlist", "listitem" },
		{ "jobtitle", "TEXT" },
		{ "keycap", "TEXT" },
		{ "keycode", "TEXT" },
		{ "keycombo", "keycap" },
		{ "keysym", "TEXT" },
		{ "legalnotice", NULL },
		{ "link", NULL },
		{ "listitem", NULL },
		{ "literal", "TEXT" },
		{ "literallayout", NULL },
		{ "manvolnum", "TEXT" },
		{ "markup", "TEXT" },
		{ "mediaobject", "caption" },
		{ "mediaobject", "imageobject" },
		{ "member", "constant" },
		{ "member", "emphasis" },
		{ "member", "function" },
		{ "member", "property" },
		{ "member", "symbol" },
		{ "member", "TEXT" },
		{ "note", NULL },
		{ "olink", "citetitle" },
		{ "olink", "function" },
		{ "olink", "TEXT" },
		{ "option", "parameter" },
		{ "option", "replaceable" },
		{ "option", "TEXT" },
		{ "orderedlist", "listitem" },
		{ "orgdiv", "TEXT" },
		{ "orgname", "TEXT" },
		{ "othercredit", "affiliation" },
		{ "othercredit", "contrib" },
		{ "othercredit", "email" },
		{ "othercredit", "firstname" },
		{ "othercredit", "othername" },
		{ "othercredit", "surname" },
		{ "othername", "TEXT" },
		{ "para", NULL },
		{ "paramdef", "funcparams" },
		{ "paramdef", "parameter" },
		{ "paramdef", "TEXT" },
		{ "parameter", "TEXT" },
		{ "part", NULL },
		{ "personname", "firstname" },
		{ "personname", "surname" },
		{ "phrase", "TEXT" },
		{ "preface", NULL },
		{ "primary", NULL },
		{ "productname", "TEXT" },
		{ "programlisting", NULL },
		{ "property", "TEXT" },
		{ "pubdate", "TEXT" },
		{ "quote", "command" },
		{ "quote", "filename" },
		{ "quote", "literal" },
		{ "quote", "TEXT" },
		{ "refentry", "refentryinfo" },
		{ "refentry", "refmeta" },
		{ "refentry", "refnamediv" },
		{ "refentry", "refsect1" },
		{ "refentry", "refsynopsisdiv" },
		{ "refentryinfo", "author" },
		{ "refentryinfo", "authorgroup" },
		{ "refentryinfo", "copyright" },
		{ "refentryinfo", "date" },
		{ "refentryinfo", "productname" },
		{ "refentrytitle", "TEXT" },
		{ "reference", "refentry" },
		{ "refmeta", "manvolnum" },
		{ "refmeta", "refentrytitle" },
		{ "refmeta", "refmiscinfo" },
		{ "refmiscinfo", "TEXT" },
		{ "refname", "TEXT" },
		{ "refnamediv", "refname" },
		{ "refnamediv", "refpurpose" },
		{ "refpurpose", "TEXT" },
		{ "refsect1", NULL },
		{ "refsect2", NULL },
		{ "refsynopsisdiv", "cmdsynopsis" },
		{ "refsynopsisdiv", "funcsynopsis" },
		{ "releaseinfo", "TEXT" },
		{ "replaceable", "TEXT" },
		{ "returnvalue", "TEXT" },
		{ "row", "entry" },
		{ "screen", NULL },
		{ "secondary", NULL },
		{ "section", NULL },
		{ "sect1", NULL },
		{ "sect2", NULL },
		{ "sect3", NULL },
		{ "sect4", NULL },
		{ "sgmltag", "TEXT" },
		{ "simpara", NULL },
		{ "simplelist", "member" },
		{ "simplesect", NULL },
		{ "structfield", "TEXT" },
		{ "structname", "TEXT" },
		{ "subscript", "TEXT" },
		{ "subtitle", "TEXT" },
		{ "superscript", "emphasis" },
		{ "superscript", "TEXT" },
		{ "surname", "TEXT" },
		{ "symbol", "TEXT" },
		{ "synopsis", "function" },
		{ "synopsis", "parameter" },
		{ "synopsis", "type" },
		{ "synopsis", "TEXT" },
		{ "systemitem", "TEXT" },
		{ "table", "tgroup" },
		{ "table", "title" },
		{ "tbody", "row" },
		{ "term", NULL },
		{ "tgroup", "colspec" },
		{ "tgroup", "tbody" },
		{ "tgroup", "thead" },
		{ "thead", "row" },
		{ "title", "acronym" },
		{ "title", "emphasis" },
		{ "title", "errorname" },
		{ "title", "function" },
		{ "title", "literal" },
		{ "title", "quote" },
		{ "title", "trademark" },
		{ "title", "type" },
		{ "title", "TEXT" },
		{ "trademark", "TEXT" },
		{ "type", "TEXT" },
		{ "ulink", NULL },
		{ "userinput", "TEXT" },
		{ "variablelist", "varlistentry" },
		{ "varlistentry", "listitem" },
		{ "varlistentry", "term" },
		{ "varname", "TEXT" },
		{ "warning", NULL },
		{ "year", "TEXT" },
	};
	const size_t	 ndone = sizeof(done) / sizeof(done[0]);
	size_t		 fsz, i;
	ssize_t		 rsz;
	int		 ch, fd, show_all;

	/* The only option is -a; anything else is an error. */
	show_all = 0;
	for (;;) {
		ch = getopt(argc, argv, "a");
		if (ch == -1)
			break;
		if (ch != 'a')
			return 1;
		show_all = 1;
	}
	argv += optind;
	argc -= optind;

	/* Two arguments request the parent-child histogram. */
	if (argc >= 2) {
		nchild.parent = argv[0];
		nchild.child = argv[1];
	}

	/* Exclude relations that are already fully implemented. */
	if (!show_all)
		for (i = 0; i < ndone; i++)
			table_add(done[i][0], done[i][1]);
	table_add(NULL, NULL);

	/* Loop over input files. */
	fd = -1;
	fname = NULL;
	for (;;) {
		rsz = getline(&fname, &fsz, stdin);
		if (rsz == -1)
			break;
		if (fname[rsz - 1] == '\n') {
			rsz--;
			fname[rsz] = '\0';
		}
		fd = open(fname, O_RDONLY, 0);
		if (fd == -1)
			err(1, "%s", fname);
		parse_file(fd, fname);
		close(fd);
	}

	/* Cleanup and error handling. */
	free(fname);
	if (ferror(stdin))
		err(1, "standard input");
	if (fd == -1)
		errx(1, "No input file names found on standard input");

	/* Dump results, leaving out the excluded relations. */
	for (i = 0; i < tablei; i++) {
		if (table[i].count == -1)
			continue;
		printf("%d\t%s\t%s\n", table[i].count,
		    table[i].parent, table[i].child);
	}

	/* Optional parent-child histogram. */
	if (nchild.parent == NULL)
		return 0;
	printf("%s %s", nchild.parent, nchild.child);
	for (i = 0; i < nchildsz; i++)
		printf(" %d", nchild.freq[i]);
	putchar('\n');
	return 0;
}