[BACK]Return to parse.c CVS log [TXT][DIR] Up to [cvsweb.bsd.lv] / docbook2mdoc

File: [cvsweb.bsd.lv] / docbook2mdoc / parse.c (download)

Revision 1.7, Thu Mar 28 20:41:33 2019 UTC (5 years ago) by schwarze
Branch: MAIN
Changes since 1.6: +5 -4 lines

Implement a formatter for <author> elements,
handling <contrib>, <personname>, <firstname>, <othername>, <surname>,
as well as arbitrary children properly.

This required minor work on the formatting infrastructure:
Improve macro_addnode() such that it also handles text nodes.
Add a companion function print_textnode().
Let print_text() optionally work without ARG_SPACE.

Triggered by a report from Stephen Gregoratto <dev at sgregoratto
dot me> that and how GTK documentation uses <contrib>.

/* $Id: parse.c,v 1.7 2019/03/28 20:41:33 schwarze Exp $ */
/*
 * Copyright (c) 2014 Kristaps Dzonsons <kristaps@bsd.lv>
 * Copyright (c) 2019 Ingo Schwarze <schwarze@openbsd.org>
 *
 * Permission to use, copy, modify, and distribute this software for any
 * purpose with or without fee is hereby granted, provided that the above
 * copyright notice and this permission notice appear in all copies.
 *
 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
 */
#include <assert.h>
#include <ctype.h>
#include <stdarg.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>

#include "node.h"
#include "parse.h"

/*
 * The implementation of the DocBook parser.
 */

/*
 * Global parse state.
 * Keep this as simple and small as possible.
 */
struct	parse {
	const char	*fname;  /* Name of the input file. */
	struct ptree	*tree;   /* Complete parse result. */
	struct pnode	*cur;	 /* Current node in the tree. */
	enum nodeid	 ncur;   /* Type of the current node. */
	int		 line;   /* Line number in the input file. */
	int		 col;	 /* Column number in the input file. */
	int		 nline;  /* Line number of next token. */
	int		 ncol;   /* Column number of next token. */
	int		 del;    /* Levels of nested nodes being deleted. */
	int		 attr;   /* The most recent attribute is valid. */
	int		 warn;
};

struct	element {
	const char	*name;   /* DocBook element name. */
	enum nodeid	 node;   /* Node type to generate. */
};

static	const struct element elements[] = {
	{ "acronym",		NODE_IGNORE },
	{ "affiliation",	NODE_AFFILIATION },
	{ "anchor",		NODE_DELETE },
	{ "application",	NODE_APPLICATION },
	{ "arg",		NODE_ARG },
	{ "author",		NODE_AUTHOR },
	{ "authorgroup",	NODE_AUTHORGROUP },
	{ "blockquote",		NODE_BLOCKQUOTE },
	{ "book",		NODE_BOOK },
	{ "bookinfo",		NODE_BOOKINFO },
	{ "caution",		NODE_CAUTION },
	{ "chapter",		NODE_SECTION },
	{ "citerefentry",	NODE_CITEREFENTRY },
	{ "citetitle",		NODE_CITETITLE },
	{ "cmdsynopsis",	NODE_CMDSYNOPSIS },
	{ "code",		NODE_CODE },
	{ "colspec",		NODE_COLSPEC },
	{ "command",		NODE_COMMAND },
	{ "constant",		NODE_CONSTANT },
	{ "contrib",		NODE_CONTRIB },
	{ "copyright",		NODE_COPYRIGHT },
	{ "date",		NODE_DATE },
	{ "editor",		NODE_EDITOR },
	{ "email",		NODE_EMAIL },
	{ "emphasis",		NODE_EMPHASIS },
	{ "entry",		NODE_ENTRY },
	{ "envar",		NODE_ENVAR },
	{ "fieldsynopsis",	NODE_FIELDSYNOPSIS },
	{ "filename",		NODE_FILENAME },
	{ "firstname",		NODE_PERSONNAME },
	{ "firstterm",		NODE_FIRSTTERM },
	{ "footnote",		NODE_FOOTNOTE },
	{ "funcdef",		NODE_FUNCDEF },
	{ "funcprototype",	NODE_FUNCPROTOTYPE },
	{ "funcsynopsis",	NODE_FUNCSYNOPSIS },
	{ "funcsynopsisinfo",	NODE_FUNCSYNOPSISINFO },
	{ "function",		NODE_FUNCTION },
	{ "glossterm",		NODE_GLOSSTERM },
	{ "group",		NODE_GROUP },
	{ "holder",		NODE_HOLDER },
	{ "index",		NODE_INDEX },
	{ "indexterm",		NODE_DELETE },
	{ "info",		NODE_INFO },
	{ "informalequation",	NODE_INFORMALEQUATION },
	{ "informaltable",	NODE_INFORMALTABLE },
	{ "inlineequation",	NODE_INLINEEQUATION },
	{ "itemizedlist",	NODE_ITEMIZEDLIST },
	{ "keysym",		NODE_KEYSYM },
	{ "legalnotice",	NODE_LEGALNOTICE },
	{ "link",		NODE_LINK },
	{ "listitem",		NODE_LISTITEM },
	{ "literal",		NODE_LITERAL },
	{ "literallayout",	NODE_LITERALLAYOUT },
	{ "manvolnum",		NODE_MANVOLNUM },
	{ "member",		NODE_MEMBER },
	{ "mml:math",		NODE_MML_MATH },
	{ "mml:mfenced",	NODE_MML_MFENCED },
	{ "mml:mfrac",		NODE_MML_MFRAC },
	{ "mml:mi",		NODE_MML_MI },
	{ "mml:mn",		NODE_MML_MN },
	{ "mml:mo",		NODE_MML_MO },
	{ "mml:mrow",		NODE_MML_MROW },
	{ "mml:msub",		NODE_MML_MSUB },
	{ "mml:msup",		NODE_MML_MSUP },
	{ "modifier",		NODE_MODIFIER },
	{ "note",		NODE_NOTE },
	{ "option",		NODE_OPTION },
	{ "orderedlist",	NODE_ORDEREDLIST },
	{ "orgname",		NODE_ORGNAME },
	{ "othername",		NODE_PERSONNAME },
	{ "para",		NODE_PARA },
	{ "paramdef",		NODE_PARAMDEF },
	{ "parameter",		NODE_PARAMETER },
	{ "part",		NODE_SECTION },
	{ "personname",		NODE_PERSONNAME },
	{ "phrase",		NODE_IGNORE },
	{ "preface",		NODE_PREFACE },
	{ "primary",		NODE_DELETE },
	{ "programlisting",	NODE_PROGRAMLISTING },
	{ "prompt",		NODE_PROMPT },
	{ "quote",		NODE_QUOTE },
	{ "refclass",		NODE_REFCLASS },
	{ "refdescriptor",	NODE_REFDESCRIPTOR },
	{ "refentry",		NODE_REFENTRY },
	{ "refentryinfo",	NODE_REFENTRYINFO },
	{ "refentrytitle",	NODE_REFENTRYTITLE },
	{ "refmeta",		NODE_REFMETA },
	{ "refmetainfo",	NODE_REFMETAINFO },
	{ "refmiscinfo",	NODE_REFMISCINFO },
	{ "refname",		NODE_REFNAME },
	{ "refnamediv",		NODE_REFNAMEDIV },
	{ "refpurpose",		NODE_REFPURPOSE },
	{ "refsect1",		NODE_SECTION },
	{ "refsect2",		NODE_SECTION },
	{ "refsect3",		NODE_SECTION },
	{ "refsection",		NODE_SECTION },
	{ "refsynopsisdiv",	NODE_REFSYNOPSISDIV },
	{ "releaseinfo",	NODE_RELEASEINFO },
	{ "replaceable",	NODE_REPLACEABLE },
	{ "row",		NODE_ROW },
	{ "sbr",		NODE_SBR },
	{ "screen",		NODE_SCREEN },
	{ "secondary",		NODE_DELETE },
	{ "sect1",		NODE_SECTION },
	{ "sect2",		NODE_SECTION },
	{ "section",		NODE_SECTION },
	{ "sgmltag",		NODE_SGMLTAG },
	{ "simplelist",		NODE_SIMPLELIST },
	{ "spanspec",		NODE_SPANSPEC },
	{ "structname",		NODE_STRUCTNAME },
	{ "subtitle",		NODE_SUBTITLE },
	{ "surname",		NODE_PERSONNAME },
	{ "synopsis",		NODE_SYNOPSIS },
	{ "table",		NODE_TABLE },
	{ "tbody",		NODE_TBODY },
	{ "term",		NODE_TERM },
	{ "tfoot",		NODE_TFOOT },
	{ "tgroup",		NODE_TGROUP },
	{ "thead",		NODE_THEAD },
	{ "tip",		NODE_TIP },
	{ "title",		NODE_TITLE },
	{ "trademark",		NODE_IGNORE },
	{ "type",		NODE_TYPE },
	{ "ulink",		NODE_ULINK },
	{ "userinput",		NODE_USERINPUT },
	{ "variablelist",	NODE_VARIABLELIST },
	{ "varlistentry",	NODE_VARLISTENTRY },
	{ "varname",		NODE_VARNAME },
	{ "warning",		NODE_WARNING },
	{ "wordasword",		NODE_WORDASWORD },
	{ "xi:include",		NODE_DELETE_WARN },
	{ "year",		NODE_YEAR },
	{ NULL,			NODE_IGNORE }
};

static void
error_msg(struct parse *p, const char *fmt, ...)
{
	va_list		 ap;

	fprintf(stderr, "%s:%d:%d: ", p->fname, p->line, p->col);
	va_start(ap, fmt);
	vfprintf(stderr, fmt, ap);
	va_end(ap);
	fputc('\n', stderr);
	p->tree->flags |= TREE_FAIL;
}

static void
warn_msg(struct parse *p, const char *fmt, ...)
{
	va_list		 ap;

	if (p->warn == 0)
		return;

	fprintf(stderr, "%s:%d:%d: warning: ", p->fname, p->line, p->col);
	va_start(ap, fmt);
	vfprintf(stderr, fmt, ap);
	va_end(ap);
	fputc('\n', stderr);
}

/*
 * Process a string of characters.
 * If a text node is already open, append to it.
 * Otherwise, create a new one as a child of the current node.
 */
static void
xml_char(struct parse *ps, const char *p, int sz)
{
	struct pnode	*dat;

	if (ps->del > 0)
		return;

	if (ps->cur == NULL) {
		error_msg(ps, "discarding text before document: %.*s", sz, p);
		return;
	}

	if (ps->cur->node != NODE_TEXT) {
		if ((dat = calloc(1, sizeof(*dat))) == NULL) {
			perror(NULL);
			exit(1);
		}
		dat->node = NODE_TEXT;
		dat->parent = ps->cur;
		TAILQ_INIT(&dat->childq);
		TAILQ_INIT(&dat->attrq);
		TAILQ_INSERT_TAIL(&ps->cur->childq, dat, child);
		ps->cur = dat;
	}

	if (ps->tree->flags & TREE_CLOSED &&
	    ps->cur->parent == ps->tree->root)
		warn_msg(ps, "text after end of document: %.*s", sz, p);

	/* Append to the current text node. */

	assert(sz >= 0);
	ps->cur->b = realloc(ps->cur->b, ps->cur->bsz + sz + 1);
	if (ps->cur->b == NULL) {
		perror(NULL);
		exit(1);
	}
	memcpy(ps->cur->b + ps->cur->bsz, p, sz);
	ps->cur->bsz += sz;
	ps->cur->b[ps->cur->bsz] = '\0';
	ps->cur->real = ps->cur->b;
}

static void
pnode_trim(struct pnode *pn)
{
	assert(pn->node == NODE_TEXT);
	for (; pn->bsz > 0; pn->b[--pn->bsz] = '\0')
		if (isspace((unsigned char)pn->b[pn->bsz - 1]) == 0)
			break;
}

/*
 * Begin an element.
 */
static void
xml_elem_start(struct parse *ps, const char *name)
{
	const struct element	*elem;
	struct pnode		*dat;

	if (*name == '!' || *name == '?')
		return;

	/*
	 * An ancestor is excluded from the tree;
	 * keep track of the number of levels excluded.
	 */
	if (ps->del > 0) {
		ps->del++;
		return;
	}

	/* Close out the text node, if there is one. */
	if (ps->cur != NULL && ps->cur->node == NODE_TEXT) {
		pnode_trim(ps->cur);
		ps->cur = ps->cur->parent;
	}

	for (elem = elements; elem->name != NULL; elem++)
		if (strcmp(elem->name, name) == 0)
			break;

	if (elem->name == NULL)
		error_msg(ps, "unknown element <%s>", name);

	ps->ncur = elem->node;

	switch (ps->ncur) {
	case NODE_DELETE_WARN:
		warn_msg(ps, "skipping element <%s>", name);
		/* FALLTHROUGH */
	case NODE_DELETE:
		ps->del = 1;
		/* FALLTHROUGH */
	case NODE_IGNORE:
		return;
	case NODE_INLINEEQUATION:
		ps->tree->flags |= TREE_EQN;
		break;
	default:
		break;
	}

	if (ps->tree->flags & TREE_CLOSED && ps->cur->parent == NULL)
		warn_msg(ps, "element after end of document: <%s>", name);

	if ((dat = calloc(1, sizeof(*dat))) == NULL) {
		perror(NULL);
		exit(1);
	}
	dat->node = elem->node;
	dat->parent = ps->cur;
	TAILQ_INIT(&dat->childq);
	TAILQ_INIT(&dat->attrq);

	if (ps->cur != NULL)
		TAILQ_INSERT_TAIL(&ps->cur->childq, dat, child);

	ps->cur = dat;
	if (ps->tree->root == NULL)
		ps->tree->root = dat;
}

static void
xml_attrkey(struct parse *ps, const char *name)
{
	struct pattr	*attr;
	enum attrkey	 key;

	if (ps->del > 0 || *name == '\0')
		return;
	if ((key = attrkey_parse(name)) == ATTRKEY__MAX) {
		ps->attr = 0;
		return;
	}
	if ((attr = calloc(1, sizeof(*attr))) == NULL) {
		perror(NULL);
		exit(1);
	}
	attr->key = key;
	attr->val = ATTRVAL__MAX;
	attr->rawval = NULL;
	TAILQ_INSERT_TAIL(&ps->cur->attrq, attr, child);
	ps->attr = 1;
}

static void
xml_attrval(struct parse *ps, const char *name)
{
	struct pattr	*attr;

	if (ps->del > 0 || ps->attr == 0)
		return;
	if ((attr = TAILQ_LAST(&ps->cur->attrq, pattrq)) == NULL)
		return;
	if ((attr->val = attrval_parse(name)) == ATTRVAL__MAX &&
	    (attr->rawval = strdup(name)) == NULL) {
		perror(NULL);
		exit(1);
	}
}

/*
 * Roll up the parse tree.
 * If we're at a text node, roll that one up first.
 */
static void
xml_elem_end(struct parse *ps, const char *name)
{
	const struct element	*elem;
	enum nodeid		 node;

	/*
	 * An ancestor is excluded from the tree;
	 * keep track of the number of levels excluded.
	 */
	if (ps->del > 1) {
		ps->del--;
		return;
	}

	/* Close out the text node, if there is one. */
	if (ps->del == 0 && ps->cur != NULL && ps->cur->node == NODE_TEXT) {
		pnode_trim(ps->cur);
		ps->cur = ps->cur->parent;
	}

	if (name != NULL) {
		for (elem = elements; elem->name != NULL; elem++)
			if (strcmp(elem->name, name) == 0)
				break;
		node = elem->node;
	} else
		node = ps->ncur;

	switch (node) {
	case NODE_DELETE_WARN:
	case NODE_DELETE:
		if (ps->del > 0)
			ps->del--;
		break;
	case NODE_IGNORE:
		break;
	default:
		if (ps->cur == NULL || node != ps->cur->node) {
			warn_msg(ps, "element not open: </%s>", name);
			break;
		}

		/*
		 * Refrain from actually closing the document element.
		 * If no more content follows, no harm is done, but if
		 * some content still follows, simply processing it is
		 * obviously better than discarding it or crashing.
		 */

		if (ps->cur->parent == NULL)
			ps->tree->flags |= TREE_CLOSED;
		else
			ps->cur = ps->cur->parent;
		break;
	}
	assert(ps->del == 0);
}

struct parse *
parse_alloc(int warn)
{
	struct parse	*p;

	if ((p = calloc(1, sizeof(*p))) == NULL)
		return NULL;

	if ((p->tree = calloc(1, sizeof(*p->tree))) == NULL) {
		free(p);
		return NULL;
	}
	p->warn = warn;
	return p;
}

void
parse_free(struct parse *p)
{
	if (p == NULL)
		return;
	if (p->tree != NULL) {
		pnode_unlink(p->tree->root);
		free(p->tree);
	}
	free(p);
}

/*
 * Advance the pend pointer to the next character in the charset.
 * If the charset starts with a space, it stands for any whitespace.
 * Update the new input file position, used for messages.
 * Do not overrun the buffer b of length rlen.
 * When reaching the end, NUL-terminate the buffer and return 1;
 * otherwise, return 0.
 */
static int
advance(struct parse *p, char *b, size_t rlen, size_t *pend,
    const char *charset)
{
	int		 space;

	if (*charset == ' ') {
		space = 1;
		charset++;
	} else
		space = 0;

	p->nline = p->line;
	p->ncol = p->col;
	while (*pend < rlen) {
		if (b[*pend] == '\n') {
			p->nline++;
			p->ncol = 1;
		} else
			p->ncol++;
		if (space && isspace((unsigned char)b[*pend]))
			break;
		if (strchr(charset, b[*pend]) != NULL)
			break;
		++*pend;
	}
	if (*pend == rlen) {
		b[rlen] = '\0';
		return 1;
	} else
		return 0;
}

struct ptree *
parse_file(struct parse *p, int fd, const char *fname)
{
	char		 b[4096];
	ssize_t		 rsz;	/* Return value from read(2). */
	size_t		 rlen;  /* Number of bytes in b[]. */
	size_t		 poff;  /* Parse offset in b[]. */
	size_t		 pend;  /* Offset of the end of the current word. */
	int		 in_tag, in_arg, in_quotes, elem_end;

	p->fname = fname;
	p->nline = 1;
	p->ncol = 1;
	rlen = 0;
	in_tag = in_arg = in_quotes = 0;

	/*
	 * Read loop.
	 *
	 * We have to enter the read loop once more even on EOF
	 * because the previous token may have been incomplete,
	 * such that it asked for more input.
	 * Once rsz is 0, incomplete tokens will no longer ask
	 * for more input but instead use whatever there is,
	 * and then exit the read loop.
	 * The minus one on the size limit for read(2) is needed
	 * such that advance() can set b[rlen] to NUL when needed.
	 */

	while ((rsz = read(fd, b + rlen, sizeof(b) - rlen - 1)) >= 0) {
		if ((rlen += rsz) == 0)
			break;

		/* Token loop. */

		pend = 0;
		for (;;) {

			/* Proceed to the next token, skipping whitespace. */

			p->line = p->nline;
			p->col = p->ncol;
			if ((poff = pend) == rlen)
				break;
			if (isspace((unsigned char)b[pend])) {
				if (b[pend++] == '\n') {
					p->nline++;
					p->ncol = 1;
				} else
					p->ncol++;
				continue;
			}

			/*
			 * The following three cases (in_arg, in_tag,
			 * and starting a tag) all parse a word or
			 * quoted string.  If that extends beyond the
			 * read buffer and the last read(2) still got
			 * data, they all break out of the token loop
			 * to request more data from the read loop.
			 *
			 * Also, they all detect self-closing tags,
			 * those ending with "/>", setting the flag
			 * elem_end and calling xml_elem_end() at the
			 * very end, after handling the attribute value,
			 * attribute name, or tag name, respectively.
			 */

			/* Parse an attribute value. */

			if (in_arg) {
				if (in_quotes == 0 && b[pend] == '"') {
					in_quotes = 1;
					p->ncol++;
					pend++;
					continue;
				}
				if (advance(p, b, rlen, &pend,
				    in_quotes ? "\"" : " >") && rsz > 0)
					break;
				in_arg = in_quotes = elem_end = 0;
				if (b[pend] == '>') {
					in_tag = 0;
					if (pend > 0 && b[pend - 1] == '/') {
						b[pend - 1] = '\0';
						elem_end = 1;
					}
				}
				b[pend] = '\0';
				if (pend < rlen)
					pend++;
				xml_attrval(p, b + poff);
				if (elem_end)
					xml_elem_end(p, NULL);

			/* Look for an attribute name. */

			} else if (in_tag) {
				if (advance(p, b, rlen, &pend, " =>") &&
				    rsz > 0)
					break;
				elem_end = 0;
				switch (b[pend]) {
				case '>':
					in_tag = 0;
					if (pend > 0 && b[pend - 1] == '/') {
						b[pend - 1] = '\0';
						elem_end = 1;
					}
					break;
				case '=':
					in_arg = 1;
					break;
				default:
					break;
				}
				b[pend] = '\0';
				if (pend < rlen)
					pend++;
				xml_attrkey(p, b + poff);
				if (elem_end)
					xml_elem_end(p, NULL);

			/* Begin an opening or closing tag. */

			} else if (b[poff] == '<') {
				if (advance(p, b, rlen, &pend, " >") &&
				    rsz > 0)
					break;
				elem_end = 0;
				if (b[pend] != '>')
					in_tag = 1;
				else if (pend > 0 && b[pend - 1] == '/') {
					b[pend - 1] = '\0';
					elem_end = 1;
				}
				b[pend] = '\0';
				if (pend < rlen)
					pend++;
				if (b[++poff] == '/') {
					elem_end = 1;
					poff++;
				} else
					xml_elem_start(p, b + poff);
				if (elem_end)
					xml_elem_end(p, b + poff);

			/* Process text up to the next tag. */

			} else {
				if (advance(p, b, rlen, &pend, "<") == 0)
					p->ncol--;
				xml_char(p, b + poff, pend - poff);
			}
		}

		/* Buffer exhausted; shift left and re-fill. */

		assert(poff > 0);
		memmove(b, b + poff, rlen - poff);
		rlen -= poff;
	}
	if (rsz < 0) {
		perror(fname);
		p->tree->flags |= TREE_FAIL;
	}
	if (p->cur != NULL && p->cur->node == NODE_TEXT) {
		pnode_trim(p->cur);
		p->cur = p->cur->parent;
	}
	if ((p->tree->flags & TREE_CLOSED) == 0)
		warn_msg(p, "document not closed");
	return p->tree;
}