===================================================================
RCS file: /cvs/mandoc/mandocdb.c,v
retrieving revision 1.93
retrieving revision 1.95
diff -u -p -r1.93 -r1.95
--- mandoc/mandocdb.c	2014/01/02 18:52:15	1.93
+++ mandoc/mandocdb.c	2014/01/02 22:19:41	1.95
@@ -1,7 +1,7 @@
-/*	$Id: mandocdb.c,v 1.93 2014/01/02 18:52:15 schwarze Exp $ */
+/*	$Id: mandocdb.c,v 1.95 2014/01/02 22:19:41 schwarze Exp $ */
 /*
  * Copyright (c) 2011, 2012 Kristaps Dzonsons <kristaps@bsd.lv>
- * Copyright (c) 2011, 2012, 2013 Ingo Schwarze <schwarze@openbsd.org>
+ * Copyright (c) 2011, 2012, 2013, 2014 Ingo Schwarze <schwarze@openbsd.org>
  *
  * Permission to use, copy, modify, and distribute this software for any
  * purpose with or without fee is hereby granted, provided that the above
@@ -176,6 +176,7 @@ static	int	 	 use_all; /* use all found files */
 static	int		 nodb; /* no database changes */
 static	int	  	 verb; /* print what we're doing */
 static	int	  	 warnings; /* warn about crap */
+static	int		 write_utf8; /* write UTF-8 output; else ASCII */
 static	int		 exitcode; /* to be returned by main */
 static	enum op	  	 op; /* operational mode */
 static	char		 basedir[PATH_MAX]; /* current base directory */
@@ -351,7 +352,7 @@ main(int argc, char *argv[])
 	path_arg = NULL;
 	op = OP_DEFAULT;
 
-	while (-1 != (ch = getopt(argc, argv, "aC:d:ntu:vW")))
+	while (-1 != (ch = getopt(argc, argv, "aC:d:nT:tu:vW")))
 		switch (ch) {
 		case ('a'):
 			use_all = 1;
@@ -369,6 +370,14 @@ main(int argc, char *argv[])
 		case ('n'):
 			nodb = 1;
 			break;
+		case ('T'):
+			if (strcmp(optarg, "utf8")) {
+				fprintf(stderr, "-T%s: Unsupported "
+				    "output format\n", optarg);
+				goto usage;
+			}
+			write_utf8 = 1;
+			break;
 		case ('t'):
 			CHECKOP(op, ch);
 			dup2(STDOUT_FILENO, STDERR_FILENO);
@@ -490,9 +499,9 @@ out:
 	ohash_delete(&mlinks);
 	return(exitcode);
 usage:
-	fprintf(stderr, "usage: %s [-anvW] [-C file]\n"
-			"       %s [-anvW] dir ...\n"
-			"       %s [-nvW] -d dir [file ...]\n"
+	fprintf(stderr, "usage: %s [-anvW] [-C file] [-Tutf8]\n"
+			"       %s [-anvW] [-Tutf8] dir ...\n"
+			"       %s [-nvW] [-Tutf8] -d dir [file ...]\n"
 			"       %s [-nvW] -u dir [file ...]\n"
 			"       %s -t file ...\n",
 		       progname, progname, progname, 
@@ -522,8 +531,8 @@ treescan(void)
 	FTSENT		*ff;
 	struct mlink	*mlink;
 	int		 dform;
-	char		*fsec;
-	const char	*dsec, *arch, *cp, *path;
+	char		*dsec, *arch, *fsec, *cp;
+	const char	*path;
 	const char	*argv[2];
 
 	argv[0] = ".";
@@ -588,16 +597,14 @@ treescan(void)
 				continue;
 			} else
 				fsec[-1] = '\0';
+
 			mlink = mandoc_calloc(1, sizeof(struct mlink));
 			strlcpy(mlink->file, path, sizeof(mlink->file));
 			mlink->dform = dform;
-			if (NULL != dsec)
-				mlink->dsec = mandoc_strdup(dsec);
-			if (NULL != arch)
-				mlink->arch = mandoc_strdup(arch);
-			mlink->name = mandoc_strdup(ff->fts_name);
-			if (NULL != fsec)
-				mlink->fsec = mandoc_strdup(fsec);
+			mlink->dsec = dsec;
+			mlink->arch = arch;
+			mlink->name = ff->fts_name;
+			mlink->fsec = fsec;
 			mlink_add(mlink, ff->fts_statp);
 			continue;
 		} else if (FTS_D != ff->fts_info &&
@@ -617,8 +624,6 @@ treescan(void)
 			 * Try to infer this from the name.
 			 * If we're not in use_all, enforce it.
 			 */
-			dsec = NULL;
-			dform = FORM_NONE;
 			cp = ff->fts_name;
 			if (FTS_DP == ff->fts_info)
 				break;
@@ -629,6 +634,9 @@ treescan(void)
 			} else if (0 == strncmp(cp, "cat", 3)) {
 				dform = FORM_CAT;
 				dsec = cp + 3;
+			} else {
+				dform = FORM_NONE;
+				dsec = NULL;
 			}
 
 			if (NULL != dsec || use_all) 
@@ -643,9 +651,10 @@ treescan(void)
 			 * Possibly our architecture.
 			 * If we're descending, keep tabs on it.
 			 */
-			arch = NULL;
 			if (FTS_DP != ff->fts_info && NULL != dsec)
 				arch = ff->fts_name;
+			else
+				arch = NULL;
 			break;
 		default:
 			if (FTS_DP == ff->fts_info || use_all)
@@ -719,16 +728,16 @@ filescan(const char *file)
 		*p++ = '\0';
 		if (0 == strncmp(start, "man", 3)) {
 			mlink->dform = FORM_SRC;
-			mlink->dsec = mandoc_strdup(start + 3);
+			mlink->dsec = start + 3;
 		} else if (0 == strncmp(start, "cat", 3)) {
 			mlink->dform = FORM_CAT;
-			mlink->dsec = mandoc_strdup(start + 3);
+			mlink->dsec = start + 3;
 		}
 
 		start = p;
 		if (NULL != mlink->dsec && NULL != (p = strchr(start, '/'))) {
 			*p++ = '\0';
-			mlink->arch = mandoc_strdup(start);
+			mlink->arch = start;
 			start = p;
 		}
 	}
@@ -743,7 +752,7 @@ filescan(const char *file)
 
 	if ('.' == *p) {
 		*p++ = '\0';
-		mlink->fsec = mandoc_strdup(p);
+		mlink->fsec = p;
 	}
 
 	/*
@@ -755,8 +764,6 @@ filescan(const char *file)
 		mlink->name = p + 1;
 		*p = '\0';
 	}
-	mlink->name = mandoc_strdup(mlink->name);
-
 	mlink_add(mlink, &st);
 }
 
@@ -769,14 +776,10 @@ mlink_add(struct mlink *mlink, const struct stat *st)
 
 	assert(NULL != mlink->file);
 
-	if (NULL == mlink->dsec)
-		mlink->dsec = mandoc_strdup("");
-	if (NULL == mlink->arch)
-		mlink->arch = mandoc_strdup("");
-	if (NULL == mlink->name)
-		mlink->name = mandoc_strdup("");
-	if (NULL == mlink->fsec)
-		mlink->fsec = mandoc_strdup("");
+	mlink->dsec = mandoc_strdup(mlink->dsec ? mlink->dsec : "");
+	mlink->arch = mandoc_strdup(mlink->arch ? mlink->arch : "");
+	mlink->name = mandoc_strdup(mlink->name ? mlink->name : "");
+	mlink->fsec = mandoc_strdup(mlink->fsec ? mlink->fsec : "");
 
 	if ('0' == *mlink->fsec) {
 		free(mlink->fsec);
@@ -1728,31 +1731,40 @@ utf8key(struct mchars *mc, struct str *key)
 		 * Parse the escape sequence and see if it's a
 		 * predefined character or special character.
 		 */
+
 		esc = mandoc_escape
 			((const char **)&val, &seq, &len);
 		if (ESCAPE_ERROR == esc)
 			break;
-
 		if (ESCAPE_SPECIAL != esc)
 			continue;
-		if (0 == (u = mchars_spec2cp(mc, seq, len)))
-			continue;
 
 		/*
-		 * If we have a Unicode codepoint, try to convert that
-		 * to a UTF-8 byte string.
+		 * Render the special character
+		 * as either UTF-8 or ASCII.
 		 */
-		cpp = utfbuf;
-		if (0 == (sz = utf8(u, utfbuf)))
-			continue;
 
+		if (write_utf8) {
+			if (0 == (u = mchars_spec2cp(mc, seq, len)))
+				continue;
+			cpp = utfbuf;
+			if (0 == (sz = utf8(u, utfbuf)))
+				continue;
+			sz = strlen(cpp);
+		} else {
+			cpp = mchars_spec2str(mc, seq, len, &sz);
+			if (NULL == cpp)
+				continue;
+			if (ASCII_NBRSP == *cpp) {
+				cpp = " ";
+				sz = 1;
+			}
+		}
+
 		/* Copy the rendered glyph into the stream. */
 
-		sz = strlen(cpp);
 		bsz += sz;
-
 		buf = mandoc_realloc(buf, bsz);
-
 		memcpy(&buf[pos], cpp, sz);
 		pos += sz;
 	}