[BACK]Return to statistics.c CVS log [TXT][DIR] Up to [cvsweb.bsd.lv] / docbook2mdoc

File: [cvsweb.bsd.lv] / docbook2mdoc / statistics.c (download)

Revision 1.2, Fri Mar 29 18:09:43 2019 UTC (5 years ago) by schwarze
Branch: MAIN
Changes since 1.1: +44 -12 lines

allow excluding relations that are already fully implemented

/* $Id: statistics.c,v 1.2 2019/03/29 18:09:43 schwarze Exp $ */
/*
 * Copyright (c) 2019 Ingo Schwarze <schwarze@openbsd.org>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */
#include <assert.h>
#include <ctype.h>
#include <err.h>
#include <fcntl.h>
#include <getopt.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

/*
 * Count parent-child element relations in a corpus of DocBook documents.
 *
 * Read absolute or relative input file names from standard input,
 * one per line.
 * For each parent-child relation, print the total number of occurrences,
 * the parent name, and the child name, separated by tab characters
 * and followed by a newline character.
 * Relations that are already fully implemented are excluded from
 * the output unless the -a option is given.
 *
 * Typical usage:
 *	statistics < filenames.txt | sort -n
 *	statistics < filenames.txt | grep '\<listitem\>' | sort -n
 */

/*
 * One parent-child relation together with its occurrence count.
 */
struct entry {
	char	*parent;	/* Name of the parent element. */
	char	*child;		/* Name of the child; NULL matches any child. */
	int	 count;		/* Occurrences seen; -1 marks an excluded relation. */
};

/* Table of all relations recorded so far; grown by table_add(). */
static struct entry	 *table;
static size_t		  tablesz;	/* Number of entries allocated. */
static size_t		  tablei;	/* Number of entries in use. */

/* Names of the elements currently open, innermost last. */
static char		**stack;
static size_t		  stacksz;	/* Number of slots allocated. */
static size_t		  stacki;	/* Number of slots in use. */


/*
 * Count one instance of a parent-child relation.
 * Before the special call table_add(NULL, NULL),
 * mark relations to not be counted;
 * in that phase, child can be NULL as a wildcard.
 */
static void
table_add(const char *parent, const char *child)
{
	static int	 init_done;
	size_t		 i;

	if (parent == NULL && child == NULL) {
		init_done = 1;
		return;
	}

	/* If the table entry already exists, increment its count. */

	for (i = 0; i < tablei; i++) {
		if (strcmp(parent, table[i].parent) == 0 &&
		    (child == NULL || table[i].child == NULL ||
		     strcmp(child, table[i].child) == 0)) {
			assert(init_done);
			if (table[i].count != -1)
				table[i].count++;
			return;
		}
	}

	/* If the table is full, make room. */

	if (tablei == tablesz) {
		tablesz += 64;
		table = reallocarray(table, tablesz, sizeof(*table));
		if (table == NULL)
			err(1, NULL);
	}

	/* Add a new entry to the table. */

	if ((table[tablei].parent = strdup(parent)) == NULL)
		err(1, NULL);
	if (child == NULL)
		table[tablei].child = NULL;
	else if ((table[tablei].child = strdup(child)) == NULL)
		err(1, NULL);
	table[tablei++].count = init_done ? 1 : -1;
}

/*
 * Enter an element: put a private copy of its name on top of the
 * stack of open elements, enlarging the stack by eight slots
 * whenever it is full.  Allocation failure terminates the program.
 */
static void
stack_push(const char *name)
{
	char	*copy;

	if (stacki == stacksz) {
		stacksz += 8;
		if ((stack = reallocarray(stack, stacksz,
		    sizeof(*stack))) == NULL)
			err(1, NULL);
	}

	copy = strdup(name);
	if (copy == NULL)
		err(1, NULL);
	stack[stacki] = copy;
	stacki++;
}

/*
 * Exit an element: drop the innermost open element from the stack
 * and release its name.  When name is not NULL, the element is only
 * dropped if its name equals name; otherwise, and also when the
 * stack is empty, nothing happens.
 */
static void
stack_pop(const char *name)
{
	if (stacki == 0)
		return;
	if (name != NULL && strcmp(name, stack[stacki - 1]) != 0)
		return;
	stacki--;
	free(stack[stacki]);
}

/*
 * Simplified version from parse.c.
 *
 * Move the offset *pend forward in the buffer b, which holds rlen
 * bytes of data, until it points at a delimiter or reaches rlen.
 * The delimiters are the characters listed in charset; if charset
 * starts with a blank, any whitespace character is a delimiter, too.
 *
 * NUL bytes in the data are never delimiters.  They need an explicit
 * check because strchr(3) considers the terminating NUL to be part
 * of the string it searches, so every charset would match them and
 * a caller scanning text could get stuck without making progress.
 *
 * Return 0 if a delimiter was found.  Return 1 if the end of the
 * data was reached; in that case, b[rlen] is set to NUL, so the
 * buffer must have room for at least rlen + 1 bytes.
 */
static int
advance(char *b, size_t rlen, size_t *pend, const char *charset)
{
	int		 space;

	/* A leading blank in charset requests matching of whitespace. */

	if (*charset == ' ') {
		space = 1;
		charset++;
	} else
		space = 0;

	while (*pend < rlen) {
		if (space && isspace((unsigned char)b[*pend]))
			break;
		if (b[*pend] != '\0' && strchr(charset, b[*pend]) != NULL)
			break;
		++*pend;
	}

	if (*pend == rlen) {
		b[rlen] = '\0';
		return 1;
	}
	return 0;
}

/*
 * Simplified version from parse.c.
 *
 * Read the file open on the descriptor fd and split it into tags,
 * attributes, and text.  For each opening tag, count the relation
 * between the innermost open element (or "ROOT" if there is none)
 * and the new element, then enter the new element.  Tags starting
 * with "<!" or "<?" are not counted.  Text outside tags is counted
 * as a "TEXT" child of the innermost open element, if there is one.
 * Closing tags and tags ending in "/>" exit an element.
 *
 * The descriptor is not closed here.  The fname argument is only
 * used for reporting a read(2) failure.
 */
static void
parse_file(int fd, char *fname)
{
	char		 b[4096];
	ssize_t		 rsz;	/* Return value from read(2). */
	size_t		 rlen;  /* Number of bytes in b[]. */
	size_t		 poff;  /* Parse offset in b[]. */
	size_t		 pend;  /* Offset of the end of the current word. */
	int		 in_tag, in_arg, in_quotes, elem_end;

	rlen = 0;
	in_tag = in_arg = in_quotes = 0;

	/* One byte of b[] is reserved for the NUL written by advance(). */

	while ((rsz = read(fd, b + rlen, sizeof(b) - rlen - 1)) >= 0) {

		/* Nothing left over and nothing new: end of file. */

		if ((rlen += rsz) == 0)
			break;
		pend = 0;
		for (;;) {
			if ((poff = pend) == rlen)
				break;
			if (isspace((unsigned char)b[pend])) {
				pend++;
				continue;
			}
			if (in_arg) {

				/* Attribute value, optionally quoted. */

				if (in_quotes == 0 && b[pend] == '"') {
					in_quotes = 1;
					pend++;
					continue;
				}

				/*
				 * If the value runs into the end of the
				 * buffer and the last read(2) returned
				 * data, get more before deciding.
				 */

				if (advance(b, rlen, &pend,
				    in_quotes ? "\"" : " >") && rsz > 0)
					break;
				in_arg = in_quotes = elem_end = 0;
				if (b[pend] == '>') {
					in_tag = 0;
					if (pend > 0 && b[pend - 1] == '/') {
						b[pend - 1] = '\0';
						elem_end = 1;
					}
				}
				b[pend] = '\0';
				if (pend < rlen)
					pend++;
				if (elem_end)
					stack_pop(NULL);
			} else if (in_tag) {

				/* Attribute name, or the end of the tag. */

				if (advance(b, rlen, &pend, " =>") && rsz > 0)
					break;
				elem_end = 0;
				switch (b[pend]) {
				case '>':
					in_tag = 0;
					if (pend > 0 && b[pend - 1] == '/') {
						b[pend - 1] = '\0';
						elem_end = 1;
					}
					break;
				case '=':
					in_arg = 1;
					break;
				default:
					break;
				}
				b[pend] = '\0';
				if (pend < rlen)
					pend++;
				if (elem_end)
					stack_pop(NULL);
			} else if (b[poff] == '<') {

				/* Start of a tag: isolate the element name. */

				if (advance(b, rlen, &pend, " >") && rsz > 0)
					break;
				elem_end = 0;
				if (b[pend] != '>')
					in_tag = 1;
				else if (pend > 0 && b[pend - 1] == '/') {
					b[pend - 1] = '\0';
					elem_end = 1;
				}
				b[pend] = '\0';
				if (pend < rlen)
					pend++;

				/* Skip the '<' and classify the tag. */

				if (b[++poff] == '/') {
					elem_end = 1;
					poff++;
				} else if (b[poff] != '!' && b[poff] != '?') {
					table_add(stacki > 0 ?
					    stack[stacki - 1] : "ROOT",
					    b + poff);
					stack_push(b + poff);
				}
				if (elem_end)
					stack_pop(b + poff);
			} else {

				/* Text up to the next tag or buffer end. */

				advance(b, rlen, &pend, "<");
				if (stacki > 0)
					table_add(stack[stacki - 1], "TEXT");
			}
		}

		/*
		 * Discard what was parsed and keep the incomplete token,
		 * if any, at the start of the buffer for the next pass.
		 * If nothing was parsed, the buffer starts with an
		 * incomplete token; do not abort in that case.  Either
		 * the next read(2) supplies more of the token, or, when
		 * the buffer is already full or the file ends, it returns
		 * zero, and the next pass handles whatever is buffered as
		 * if it were complete.
		 */

		if (poff > 0) {
			memmove(b, b + poff, rlen - poff);
			rlen -= poff;
		}
	}
	if (rsz < 0)
		perror(fname);
}

int
main(int argc, char *argv[])
{
	char		*fname;
	size_t		 fsz, i;
	ssize_t		 rsz;
	int		 ch, fd, show_all;

	show_all = 0;
	while ((ch = getopt(argc, argv, "a")) != -1) {
		switch (ch) {
		case 'a':
			show_all = 1;
			break;
		default:
			return 1;
		}
	}

	/* Exclude relations that are already fully implemented. */
	if (show_all == 0) {
		table_add("para", NULL);
	}
	table_add(NULL, NULL);

	/* Loop over input files. */
	fd = -1;
	fname = NULL;
	while ((rsz = getline(&fname, &fsz, stdin)) != -1) {
		if (fname[rsz - 1] == '\n')
			fname[--rsz] = '\0';
		if ((fd = open(fname, O_RDONLY, 0)) == -1)
			err(1, "%s", fname);
		parse_file(fd, fname);
		close(fd);
	}

	/* Cleanup and error handling. */
	free(fname);
	if (ferror(stdin))
		err(1, "standard input");
	if (fd == -1)
		errx(1, "No input file names found on standard input");

	/* Dump results. */
	for (i = 0; i < tablei; i++)
		if (table[i].count != -1)
			printf("%d\t%s\t%s\n", table[i].count,
			    table[i].parent, table[i].child);
	return 0;
}