===================================================================
RCS file: /cvs/mandoc/mandoc.c,v
retrieving revision 1.39
retrieving revision 1.64
diff -u -p -r1.39 -r1.64
--- mandoc/mandoc.c	2011/03/15 16:23:51	1.39
+++ mandoc/mandoc.c	2012/05/31 22:34:06	1.64
@@ -1,7 +1,7 @@
-/*	$Id: mandoc.c,v 1.39 2011/03/15 16:23:51 kristaps Exp $ */
+/*	$Id: mandoc.c,v 1.64 2012/05/31 22:34:06 schwarze Exp $ */
 /*
- * Copyright (c) 2008, 2009, 2010 Kristaps Dzonsons <kristaps@bsd.lv>
- * Copyright (c) 2011 Ingo Schwarze <schwarze@openbsd.org>
+ * Copyright (c) 2008, 2009, 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv>
+ * Copyright (c) 2011, 2012 Ingo Schwarze <schwarze@openbsd.org>
  *
  * Permission to use, copy, modify, and distribute this software for any
  * purpose with or without fee is hereby granted, provided that the above
@@ -23,6 +23,8 @@
 
 #include <assert.h>
 #include <ctype.h>
+#include <errno.h>
+#include <limits.h>
 #include <stdlib.h>
 #include <stdio.h>
 #include <string.h>
@@ -36,75 +38,195 @@
 static	int	 a2time(time_t *, const char *, const char *);
 static	char	*time2a(time_t);
 
-int
-mandoc_special(char *p)
+
+enum mandoc_esc
+mandoc_escape(const char **end, const char **start, int *sz)
 {
-	int		 len, i;
-	char		 term;
-	char		*sv;
-	
-	len = 0;
+	char		 c, term;
+	int		 i, rlim;
+	const char	*cp, *rstart;
+	enum mandoc_esc	 gly; 
+
+	cp = *end;
+	rstart = cp;
+	if (start)
+		*start = rstart;
+	i = rlim = 0;
+	gly = ESCAPE_ERROR;
 	term = '\0';
-	sv = p;
 
-	assert('\\' == *p);
-	p++;
+	switch ((c = cp[i++])) {
+	/*
+	 * First the glyphs.  There are several different forms of
+	 * these, but each eventually returns a substring of the glyph
+	 * name.
+	 */
+	case ('('):
+		gly = ESCAPE_SPECIAL;
+		rlim = 2;
+		break;
+	case ('['):
+		gly = ESCAPE_SPECIAL;
+		/*
+		 * Unicode escapes are defined in groff as \[uXXXX] to
+		 * \[u10FFFF], where the contained value must be a valid
+		 * Unicode codepoint.  Here, however, only check that the
+		 * 'u' is not immediately followed by the closing ']'.
+		 */
+		if ('u' == cp[i] && ']' != cp[i + 1])
+			gly = ESCAPE_UNICODE;
+		term = ']';
+		break;
+	case ('C'):
+		if ('\'' != cp[i])
+			return(ESCAPE_ERROR);
+		gly = ESCAPE_SPECIAL;
+		term = '\'';
+		break;
 
-	switch (*p++) {
-#if 0
-	case ('Z'):
+	/*
+	 * The \z escape is supposed to output the following
+	 * character without advancing the cursor position.  
+	 * Since we are mostly dealing with terminal mode,
+	 * let us just skip the next character.
+	 */
+	case ('z'):
+		(*end)++;
+		return(ESCAPE_SKIPCHAR);
+
+	/*
+	 * Handle all triggers matching \X(xy, \Xx, and \X[xxxx], where
+	 * 'X' is the trigger.  These have opaque sub-strings.
+	 */
+	case ('F'):
 		/* FALLTHROUGH */
-	case ('X'):
+	case ('g'):
 		/* FALLTHROUGH */
-	case ('x'):
+	case ('k'):
 		/* FALLTHROUGH */
-	case ('S'):
+	case ('M'):
 		/* FALLTHROUGH */
-	case ('R'):
+	case ('m'):
 		/* FALLTHROUGH */
-	case ('N'):
+	case ('n'):
 		/* FALLTHROUGH */
-	case ('l'):
+	case ('V'):
 		/* FALLTHROUGH */
-	case ('L'):
+	case ('Y'):
+		gly = ESCAPE_IGNORE;
 		/* FALLTHROUGH */
-	case ('H'):
+	case ('f'):
+		if (ESCAPE_ERROR == gly)
+			gly = ESCAPE_FONT;
+
+		rstart= &cp[i];
+		if (start) 
+			*start = rstart;
+
+		switch (cp[i++]) {
+		case ('('):
+			rlim = 2;
+			break;
+		case ('['):
+			term = ']';
+			break;
+		default:
+			rlim = 1;
+			i--;
+			break;
+		}
+		break;
+
+	/*
+	 * These escapes are of the form \X'Y', where 'X' is the trigger
+	 * and 'Y' is any string.  These have opaque sub-strings.
+	 */
+	case ('A'):
 		/* FALLTHROUGH */
-	case ('h'):
+	case ('b'):
 		/* FALLTHROUGH */
 	case ('D'):
 		/* FALLTHROUGH */
-	case ('C'):
+	case ('o'):
 		/* FALLTHROUGH */
-	case ('b'):
+	case ('R'):
 		/* FALLTHROUGH */
-	case ('B'):
+	case ('X'):
 		/* FALLTHROUGH */
-	case ('a'):
-		/* FALLTHROUGH */
-	case ('A'):
-		if (*p++ != '\'')
-			return(0);
+	case ('Z'):
+		if ('\'' != cp[i++])
+			return(ESCAPE_ERROR);
+		gly = ESCAPE_IGNORE;
 		term = '\'';
 		break;
-#endif
+
+	/*
+	 * These escapes are of the form \X'N', where 'X' is the trigger
+	 * and 'N' resolves to a numerical expression.
+	 */
+	case ('B'):
+		/* FALLTHROUGH */
 	case ('h'):
 		/* FALLTHROUGH */
+	case ('H'):
+		/* FALLTHROUGH */
+	case ('L'):
+		/* FALLTHROUGH */
+	case ('l'):
+		gly = ESCAPE_NUMBERED;
+		/* FALLTHROUGH */
+	case ('S'):
+		/* FALLTHROUGH */
 	case ('v'):
 		/* FALLTHROUGH */
+	case ('w'):
+		/* FALLTHROUGH */
+	case ('x'):
+		if (ESCAPE_ERROR == gly)
+			gly = ESCAPE_IGNORE;
+		if ('\'' != cp[i++])
+			return(ESCAPE_ERROR);
+		term = '\'';
+		break;
+
+	/*
+	 * Special handling for the numbered character escape.
+	 * XXX Do any other escapes need similar handling?
+	 */
+	case ('N'):
+		if ('\0' == cp[i])
+			return(ESCAPE_ERROR);
+		*end = &cp[++i];
+		if (isdigit((unsigned char)cp[i-1]))
+			return(ESCAPE_IGNORE);
+		while (isdigit((unsigned char)**end))
+			(*end)++;
+		if (start)
+			*start = &cp[i];
+		if (sz)
+			*sz = *end - &cp[i];
+		if ('\0' != **end)
+			(*end)++;
+		return(ESCAPE_NUMBERED);
+
+	/* 
+	 * Sizes get a special category of their own.
+	 */
 	case ('s'):
-		if (ASCII_HYPH == *p)
-			*p = '-';
+		gly = ESCAPE_IGNORE;
 
-		i = 0;
-		if ('+' == *p || '-' == *p) {
-			p++;
-			i = 1;
-		}
+		rstart = &cp[i];
+		if (start) 
+			*start = rstart;
 
-		switch (*p++) {
+		/* Skip a leading sign ('+', '-' or ASCII_HYPH). */
+		c = cp[i];
+		if ('+' == c || '-' == c || ASCII_HYPH == c)
+			++i;
+
+		switch (cp[i++]) {
 		case ('('):
-			len = 2;
+			rlim = 2;
 			break;
 		case ('['):
 			term = ']';
@@ -112,122 +234,113 @@ mandoc_special(char *p)
 		case ('\''):
 			term = '\'';
 			break;
-		case ('0'):
-			i = 1;
-			/* FALLTHROUGH */
 		default:
-			len = 1;
-			p--;
+			rlim = 1;
+			i--;
 			break;
 		}
 
-		if (ASCII_HYPH == *p)
-			*p = '-';
-		if ('+' == *p || '-' == *p) {
-			if (i)
-				return(0);
-			p++;
-		} 
-		
-		/* Handle embedded numerical subexp or escape. */
+		/* Skip a sign ('+', '-' or ASCII_HYPH) here, too. */
+		c = cp[i];
+		if ('+' == c || '-' == c || ASCII_HYPH == c)
+			++i;
 
-		if ('(' == *p) {
-			while (*p && ')' != *p)
-				if ('\\' == *p++) {
-					i = mandoc_special(--p);
-					if (0 == i)
-						return(0);
-					p += i;
-				}
+		break;
 
-			if (')' == *p++)
-				break;
+	/*
+	 * Anything else is assumed to be a glyph.
+	 */
+	default:
+		gly = ESCAPE_SPECIAL;
+		rlim = 1;
+		i--;
+		break;
+	}
 
-			return(0);
-		} else if ('\\' == *p) {
-			if (0 == (i = mandoc_special(p)))
-				return(0);
-			p += i;
+	assert(ESCAPE_ERROR != gly);
+
+	*end = rstart = &cp[i];
+	if (start)
+		*start = rstart;
+
+	/*
+	 * Read up to the terminating character,
+	 * paying attention to nested escapes.
+	 */
+
+	if ('\0' != term) {
+		while (**end != term) {
+			switch (**end) {
+			case ('\0'):
+				return(ESCAPE_ERROR);
+			case ('\\'):
+				(*end)++;
+				if (ESCAPE_ERROR ==
+				    mandoc_escape(end, NULL, NULL))
+					return(ESCAPE_ERROR);
+				break;
+			default:
+				(*end)++;
+				break;
+			}
 		}
+		rlim = (*end)++ - rstart;
+	} else {
+		assert(rlim > 0);
+		if ((size_t)rlim > strlen(rstart))
+			return(ESCAPE_ERROR);
+		*end += rlim;
+	}
+	if (sz)
+		*sz = rlim;
 
-		break;
-#if 0
-	case ('Y'):
-		/* FALLTHROUGH */
-	case ('V'):
-		/* FALLTHROUGH */
-	case ('$'):
-		/* FALLTHROUGH */
-	case ('n'):
-		/* FALLTHROUGH */
-#endif
-	case ('k'):
-		/* FALLTHROUGH */
-	case ('M'):
-		/* FALLTHROUGH */
-	case ('m'):
-		/* FALLTHROUGH */
-	case ('f'):
-		/* FALLTHROUGH */
-	case ('F'):
-		/* FALLTHROUGH */
-	case ('*'):
-		switch (*p++) {
-		case ('('):
-			len = 2;
+	/* Run post-processors. */
+
+	switch (gly) {
+	case (ESCAPE_FONT):
+		/*
+		 * Pretend that the constant-width font modes are the
+		 * same as the regular font modes.
+		 */
+		if (2 == rlim && 'C' == *rstart)
+			rstart++;
+		else if (1 != rlim)
 			break;
-		case ('['):
-			term = ']';
+
+		switch (*rstart) {
+		case ('3'):
+			/* FALLTHROUGH */
+		case ('B'):
+			gly = ESCAPE_FONTBOLD;
 			break;
-		default:
-			len = 1;
-			p--;
+		case ('2'):
+			/* FALLTHROUGH */
+		case ('I'):
+			gly = ESCAPE_FONTITALIC;
 			break;
+		case ('P'):
+			gly = ESCAPE_FONTPREV;
+			break;
+		case ('1'):
+			/* FALLTHROUGH */
+		case ('R'):
+			gly = ESCAPE_FONTROMAN;
+			break;
 		}
 		break;
-	case ('('):
-		len = 2;
-		break;
-	case ('['):
-		term = ']';
-		break;
-	case ('z'):
-		len = 1;
-		if ('\\' == *p) {
-			if (0 == (i = mandoc_special(p)))
-				return(0);
-			p += i;
-			return(*p ? (int)(p - sv) : 0);
-		}
-		break;
-	case ('o'):
-		/* FALLTHROUGH */
-	case ('w'):
-		if ('\'' == *p++) {
-			term = '\'';
+	case (ESCAPE_SPECIAL):
+		if (1 != rlim)
 			break;
-		}
-		/* FALLTHROUGH */
+		if ('c' == *rstart)
+			gly = ESCAPE_NOSPACE;
+		break;
 	default:
-		len = 1;
-		p--;
 		break;
 	}
 
-	if (term) {
-		for ( ; *p && term != *p; p++)
-			if (ASCII_HYPH == *p)
-				*p = '-';
-		return(*p ? (int)(p - sv) : 0);
-	}
-
-	for (i = 0; *p && i < len; i++, p++)
-		if (ASCII_HYPH == *p)
-			*p = '-';
-	return(i == len ? (int)(p - sv) : 0);
+	return(gly);
 }
 
-
 void *
 mandoc_calloc(size_t num, size_t size)
 {
@@ -271,7 +384,17 @@ mandoc_realloc(void *ptr, size_t size)
 	return(ptr);
 }
 
+char *
+mandoc_strndup(const char *ptr, size_t sz)
+{
+	char		*p;
 
+	p = mandoc_malloc(sz + 1);
+	memcpy(p, ptr, sz);
+	p[(int)sz] = '\0';
+	return(p);
+}
+
 char *
 mandoc_strdup(const char *ptr)
 {
@@ -296,18 +419,18 @@ mandoc_strdup(const char *ptr)
  * or to the null byte terminating the argument line.
  */
 char *
-mandoc_getarg(char **cpp, mandocmsg msg, void *data, int ln, int *pos)
+mandoc_getarg(struct mparse *parse, char **cpp, int ln, int *pos)
 {
 	char	 *start, *cp;
 	int	  quoted, pairs, white;
 
 	/* Quoting can only start with a new word. */
 	start = *cpp;
+	quoted = 0;
 	if ('"' == *start) {
 		quoted = 1;
 		start++;
-	} else
-		quoted = 0;
+	} 
 
 	pairs = 0;
 	white = 0;
@@ -343,8 +466,8 @@ mandoc_getarg(char **cpp, mandocmsg msg, void *data, i
 	}
 
 	/* Quoted argument without a closing quote. */
-	if (1 == quoted && msg)
-		(*msg)(MANDOCERR_BADQUOTE, data, ln, *pos, NULL);
+	if (1 == quoted)
+		mandoc_msg(MANDOCERR_BADQUOTE, parse, ln, *pos, NULL);
 
 	/* Null-terminate this argument and move to the next one. */
 	if (pairs)
@@ -357,13 +480,12 @@ mandoc_getarg(char **cpp, mandocmsg msg, void *data, i
 	*pos += (int)(cp - start) + (quoted ? 1 : 0);
 	*cpp = cp;
 
-	if ('\0' == *cp && msg && (white || ' ' == cp[-1]))
-		(*msg)(MANDOCERR_EOLNSPACE, data, ln, *pos, NULL);
+	if ('\0' == *cp && (white || ' ' == cp[-1]))
+		mandoc_msg(MANDOCERR_EOLNSPACE, parse, ln, *pos, NULL);
 
 	return(start);
 }
 
-
 static int
 a2time(time_t *t, const char *fmt, const char *p)
 {
@@ -372,7 +494,10 @@ a2time(time_t *t, const char *fmt, const char *p)
 
 	memset(&tm, 0, sizeof(struct tm));
 
+	pp = NULL;
+#ifdef	HAVE_STRPTIME
 	pp = strptime(p, fmt, &tm);
+#endif
 	if (NULL != pp && '\0' == *pp) {
 		*t = mktime(&tm);
 		return(1);
@@ -381,16 +506,15 @@ a2time(time_t *t, const char *fmt, const char *p)
 	return(0);
 }
 
-
 static char *
 time2a(time_t t)
 {
-	struct tm	 tm;
+	struct tm	*tm;
 	char		*buf, *p;
 	size_t		 ssz;
 	int		 isz;
 
-	localtime_r(&t, &tm);
+	tm = localtime(&t);
 
 	/*
 	 * Reserve space:
@@ -400,15 +524,15 @@ time2a(time_t t)
 	 */
 	p = buf = mandoc_malloc(10 + 4 + 4 + 1);
 
-	if (0 == (ssz = strftime(p, 10 + 1, "%B ", &tm)))
+	if (0 == (ssz = strftime(p, 10 + 1, "%B ", tm)))
 		goto fail;
 	p += (int)ssz;
 
-	if (-1 == (isz = snprintf(p, 4 + 1, "%d, ", tm.tm_mday)))
+	if (-1 == (isz = snprintf(p, 4 + 1, "%d, ", tm->tm_mday)))
 		goto fail;
 	p += isz;
 
-	if (0 == strftime(p, 4 + 1, "%Y", &tm))
+	if (0 == strftime(p, 4 + 1, "%Y", tm))
 		goto fail;
 	return(buf);
 
@@ -417,29 +541,28 @@ fail:
 	return(NULL);
 }
 
-
 char *
-mandoc_normdate(char *in, mandocmsg msg, void *data, int ln, int pos)
+mandoc_normdate(struct mparse *parse, char *in, int ln, int pos)
 {
 	char		*out;
 	time_t		 t;
 
 	if (NULL == in || '\0' == *in ||
 	    0 == strcmp(in, "$" "Mdocdate$")) {
-		(*msg)(MANDOCERR_NODATE, data, ln, pos, NULL);
+		mandoc_msg(MANDOCERR_NODATE, parse, ln, pos, NULL);
 		time(&t);
 	}
+	else if (a2time(&t, "%Y-%m-%d", in))
+		t = 0;
 	else if (!a2time(&t, "$" "Mdocdate: %b %d %Y $", in) &&
-	    !a2time(&t, "%b %d, %Y", in) &&
-	    !a2time(&t, "%Y-%m-%d", in)) {
-		(*msg)(MANDOCERR_BADDATE, data, ln, pos, NULL);
+	    !a2time(&t, "%b %d, %Y", in)) {
+		mandoc_msg(MANDOCERR_BADDATE, parse, ln, pos, NULL);
 		t = 0;
 	}
 	out = t ? time2a(t) : NULL;
 	return(out ? out : mandoc_strdup(in));
 }
 
-
 int
 mandoc_eos(const char *p, size_t sz, int enclosed)
 {
@@ -452,7 +575,7 @@ mandoc_eos(const char *p, size_t sz, int enclosed)
 	/*
 	 * End-of-sentence recognition must include situations where
 	 * some symbols, such as `)', allow prior EOS punctuation to
-	 * propogate outward.
+	 * propagate outward.
 	 */
 
 	found = 0;
@@ -483,30 +606,59 @@ mandoc_eos(const char *p, size_t sz, int enclosed)
 	return(found && !enclosed);
 }
 
-
+/*
+ * Find out whether a line is a macro line or not.  If it is, adjust the
+ * current position and return one; if it isn't, return zero and don't
+ * change the current position.
+ */
 int
-mandoc_hyph(const char *start, const char *c)
+mandoc_getcontrol(const char *cp, int *ppos)
 {
+	int		pos;
 
-	/*
-	 * Choose whether to break at a hyphenated character.  We only
-	 * do this if it's free-standing within a word.
-	 */
+	pos = *ppos;
 
-	/* Skip first/last character of buffer. */
-	if (c == start || '\0' == *(c + 1))
+	if ('\\' == cp[pos] && '.' == cp[pos + 1])
+		pos += 2;
+	else if ('.' == cp[pos] || '\'' == cp[pos])
+		pos++;
+	else
 		return(0);
-	/* Skip first/last character of word. */
-	if ('\t' == *(c + 1) || '\t' == *(c - 1))
-		return(0);
-	if (' ' == *(c + 1) || ' ' == *(c - 1))
-		return(0);
-	/* Skip double invocations. */
-	if ('-' == *(c + 1) || '-' == *(c - 1))
-		return(0);
-	/* Skip escapes. */
-	if ('\\' == *(c - 1))
-		return(0);
 
+	while (' ' == cp[pos] || '\t' == cp[pos])
+		pos++;
+
+	*ppos = pos;
 	return(1);
+}
+
+/*
+ * Convert a string of sz bytes to an int, clamped to the int range.
+ * Return -1 if sz exceeds 31 or the string is empty or malformed.
+ */
+int
+mandoc_strntoi(const char *p, size_t sz, int base)
+{
+	char		 buf[32];
+	char		*ep;
+	long		 v;
+
+	if (sz > 31)
+		return(-1);
+
+	memcpy(buf, p, sz);
+	buf[(int)sz] = '\0';
+
+	errno = 0;
+	v = strtol(buf, &ep, base);
+
+	if (buf[0] == '\0' || *ep != '\0')
+		return(-1);
+
+	if (v > INT_MAX)
+		v = INT_MAX;
+	if (v < INT_MIN)
+		v = INT_MIN;
+
+	return((int)v);
 }