===================================================================
RCS file: /cvs/pod2mdoc/pod2mdoc.c,v
retrieving revision 1.5
retrieving revision 1.9
diff -u -p -r1.5 -r1.9
--- pod2mdoc/pod2mdoc.c	2014/03/23 13:00:24	1.5
+++ pod2mdoc/pod2mdoc.c	2014/03/24 01:07:30	1.9
@@ -1,4 +1,4 @@
-/*	$Id: pod2mdoc.c,v 1.5 2014/03/23 13:00:24 kristaps Exp $ */
+/*	$Id: pod2mdoc.c,v 1.9 2014/03/24 01:07:30 kristaps Exp $ */
 /*
  * Copyright (c) 2014 Kristaps Dzonsons <kristaps@bsd.lv>
  *
@@ -108,6 +108,8 @@ static	const char fmts[FMT__MAX] = {
 	'Z'		/* FMT_NULL */
 };
 
+static	int 	last;
+
 /*
  * Given buf[*start] is at the start of an escape name, read til the end
  * of the escape ('>') then try to do something with it.
@@ -141,7 +143,7 @@ formatescape(const char *buf, size_t *start, size_t en
 	 * TODO: right now, we only recognise the named escapes.
 	 * Just let the rest of them go. 
 	 */
-	if (0 == strcmp(esc, "lt"))
+	if (0 == strcmp(esc, "lt")) 
 		printf("\\(la");
 	else if (0 == strcmp(esc, "gt"))
 		printf("\\(ra");
@@ -149,20 +151,81 @@ formatescape(const char *buf, size_t *start, size_t en
 		printf("\\(ba");
 	else if (0 == strcmp(esc, "sol"))
 		printf("\\(sl");
+	else
+		return;
+
+	last = 'a';
 }
 
 /*
- * Skip space characters.
+ * Run some heuristics to intuit a link format.
+ * L<foo::bar> is printed as a Perl manpage in section 3p and L<foo(5)>
+ * as a general UNIX manpage; "dsz" is the width of the '<' delimiter.
+ * If I recognise one, I print the "Xr" macro, move "start" onto the
+ * closing delimiter so the caller can just continue, and return 1.
+ * Otherwise, I print nothing, don't touch "start", and return 0.
  */
 static int
-skipspace(const char *buf, size_t *start, size_t end)
+trylink(const char *buf, size_t *start, size_t end, size_t dsz)
 {
-	size_t		 sv = *start;
+	size_t		sv, nstart, nend, i, j;
+	int		hasdouble;
 
-	while (*start < end && ' ' == buf[*start])
-		(*start)++;
+	/*
+	 * Scan to the start of the terminus.
+	 * This loop is more or less replicated in formatcode() for
+	 * null or index formatting codes.
+	 */
+	hasdouble = 0;
+	for (sv = nstart = *start; nstart < end; nstart++) {
+		/* Do we have a double-colon? */
+		if (':' == buf[nstart] &&
+			nstart > sv &&
+			':' == buf[nstart - 1])
+			hasdouble = 1;
+		if ('>' != buf[nstart])
+			continue;
+		else if (dsz == 1)
+			break;
+		assert(nstart > 0);
+		if (' ' != buf[nstart - 1])
+			continue;
+		i = nstart;
+		for (j = 0; i < end && j < dsz; j++)
+			if ('>' != buf[i++])
+				break;
+		if (dsz == j)
+			break;
+	}
+
+	/* We don't care about stubs. */
+	if (nstart == end || nstart == *start)
+		return(0);
 
-	return(*start > sv);
+	/* Set nend to the end of content. */
+	nend = nstart;
+	if (dsz > 1)
+		nend--;
+
+	/*
+	 * Common link forms: other Perl manuals, then foo(N) manpages.
+	 * ctype arguments are unsigned char: negative values are UB.
+	 */
+	if (hasdouble)
+		printf("Xr %.*s 3p",
+			(int)(nend - sv), &buf[sv]);
+	else if (nend - sv > 3 && ')' == buf[nend - 1] &&
+			isalnum((unsigned char)buf[sv]) &&
+			isdigit((unsigned char)buf[nend - 2]) &&
+			'(' == buf[nend - 3])
+		printf("Xr %.*s %c",
+			(int)(nend - 3 - sv),
+			&buf[sv], buf[nend - 2]);
+	else
+		return(0);
+
+	*start = nstart;
+	return(1);
 }
 
 /*
@@ -172,13 +235,15 @@ skipspace(const char *buf, size_t *start, size_t end)
  * the end of matched production.
  * If "reentrant", then we're being called after a macro has already
  * been printed to the current line.
- * "last" is set to the last read character: this is used to determine
- * whether we should buffer with space or not.
- * If "nomacro", then we don't print any macros, just contained data.
+ * If "nomacro", then we don't print any macros, just contained data
+ * (e.g., following "Sh" or "Nm").
+ * Return whether we've printed a macro or not--in other words, whether
+ * this should trigger a subsequent newline (this should be ignored when
+ * reentrant).
  */
 static int
 formatcode(const char *buf, size_t *start, 
-	size_t end, int reentrant, int last, int nomacro)
+	size_t end, int reentrant, int nomacro)
 {
 	enum fmt	 fmt;
 	size_t		 i, j, dsz;
@@ -186,6 +251,21 @@ formatcode(const char *buf, size_t *start, 
 	assert(*start + 1 < end);
 	assert('<' == buf[*start + 1]);
 
+	/* 
+	 * First, look up the format code. 
+	 * If it's not valid, then exit immediately.
+	 */
+	for (fmt = 0; fmt < FMT__MAX; fmt++) 
+		if (buf[*start] == fmts[fmt])
+			break;
+
+	if (FMT__MAX == fmt) {
+		putchar(last = buf[(*start)++]);
+		if ('\\' == last)
+			putchar('e');
+		return(0);
+	}
+
 	/*
 	 * Determine whether we're overriding our delimiter.
 	 * According to POD, if we have more than one '<' followed by a
@@ -201,33 +281,21 @@ formatcode(const char *buf, size_t *start, 
 	if (dsz > 1 && (i >= end || ' ' != buf[i]))
 		dsz = 1;
 
-	for (fmt = 0; fmt < FMT__MAX; fmt++) 
-		if (buf[*start] == fmts[fmt])
-			break;
-
-	/* Invalid macros are just regular text. */
-
-	if (FMT__MAX == fmt) {
-		putchar(buf[*start]);
-		(*start)++;
-		return(0);
-	}
-
 	/* Remember, if dsz>1, to jump the trailing space. */
 	*start += dsz + 1 + (dsz > 1 ? 1 : 0);
 
 	/*
-	 * Escapes don't print macro sequences, so just output them like
-	 * normal text before processing for macros.
+	 * Escapes and ignored codes (NULL and INDEX) don't print macro
+	 * sequences, so just output them like normal text before
+	 * processing for real macros.
 	 */
 	if (FMT_ESCAPE == fmt) {
 		formatescape(buf, start, end);
 		return(0);
 	} else if (FMT_NULL == fmt || FMT_INDEX == fmt) {
 		/* 
-		 * For indices and nulls, just consume. 
-		 * Be wary of encountering custom delimiters (dsz>1),
-		 * which require special handling.
+		 * Just consume til the end delimiter, accounting for
+		 * whether it's a custom one.
 		 */
 		for ( ; *start < end; (*start)++) {
 			if ('>' != buf[*start])
@@ -249,6 +317,10 @@ formatcode(const char *buf, size_t *start, 
 		return(0);
 	}
 
+	/*
+	 * Check whether we're supposed to print macro stuff (this is
+	 * suppressed in, e.g., "Nm" and "Sh" macros).
+	 */
 	if ( ! nomacro) {
 		/*
 		 * Print out the macro describing this format code.
@@ -257,19 +329,21 @@ formatcode(const char *buf, size_t *start, 
 		 * indicator.
 		 * Otherwise, offset us with a space.
 		 */
-		if ( ! reentrant && last != '\n')
-			putchar('\n');
-		if ( ! reentrant)
+		if ( ! reentrant) {
+			if (last != '\n')
+				putchar('\n');
 			putchar('.');
-		else
+		} else 
 			putchar(' ');
 		
 		/*
-		 * If we don't have whitespace before us, then suppress
-		 * macro whitespace with Ns.
+		 * If we don't have whitespace before us (and none after
+		 * the opening delimiter), then suppress macro
+		 * whitespace with Pf.
 		 */
-		if (' ' != last)
-			printf("Ns ");
+		if (' ' != last && '\n' != last && ' ' != buf[*start])
+			printf("Pf ");
+
 		switch (fmt) {
 		case (FMT_ITALIC):
 			printf("Em ");
@@ -281,13 +355,13 @@ formatcode(const char *buf, size_t *start, 
 			printf("Qo Li ");
 			break;
 		case (FMT_LINK):
-			printf("Lk ");
+			if ( ! trylink(buf, start, end, dsz))
+				printf("No ");
 			break;
 		case (FMT_FILE):
 			printf("Pa ");
 			break;
 		case (FMT_NBSP):
-			/* TODO. */
 			printf("No ");
 			break;
 		default:
@@ -296,7 +370,7 @@ formatcode(const char *buf, size_t *start, 
 	}
 
 	/*
-	 * Read until we reach the end market (e.g., '>') or until we
+	 * Process until we reach the end marker (e.g., '>') or until we
 	 * find a nested format code.
 	 * Don't emit any newlines: since we're on a macro line, we
 	 * don't want to break the line.
@@ -323,7 +397,7 @@ formatcode(const char *buf, size_t *start, 
 			}
 		}
 		if (*start + 1 < end && '<' == buf[*start + 1]) {
-			formatcode(buf, start, end, 1, last, nomacro);
+			formatcode(buf, start, end, 1, nomacro);
 			continue;
 		}
 
@@ -343,43 +417,34 @@ formatcode(const char *buf, size_t *start, 
 			printf("\\&");
 
 		/* Suppress newline. */
-		if ('\n' == (last = buf[(*start)++]))
-			last = ' ';
+		if ('\n' == buf[*start])
+			putchar(last = ' ');
+		else
+			putchar(last = buf[*start]);
 
-		putchar(last);
+		/* Protect against character escapes. */
+		if ('\\' == last)
+			putchar('e');
+
+		(*start)++;
+
+		if (' ' == last)
+			while (*start < end && ' ' == buf[*start])
+				(*start)++;
 	}
 
 	if ( ! nomacro && FMT_CODE == fmt)
 		printf(" Qc ");
 
-	if (reentrant)
-		return(1);
-
-	/* FIXME: with the "Qc", this doens't work good. */
-
 	/*
-	 * If we're not reentrant, we want to put ending punctuation on
-	 * the macro line so that it's properly handled by being
-	 * smooshed against the terminal word.
+	 * We're now just past the format code.
+	 * If there isn't a space (or newline) here, and we haven't just
+	 * printed a space, then suppress space.
 	 */
-	skipspace(buf, start, end);
+	if ( ! nomacro && ' ' != last)
+		if (' ' != buf[*start] && '\n' != buf[*start])
+			printf(" Ns ");
 
-	if (',' != buf[*start] && '.' != buf[*start] &&
-		'!' != buf[*start] && '?' != buf[*start] &&
-		')' != buf[*start])
-		return(1);
-	while (*start < end) {
-		if (',' != buf[*start] &&
-			'.' != buf[*start] &&
-			'!' != buf[*start] &&
-			'?' != buf[*start] &&
-			')' != buf[*start])
-			break;
-		putchar(' ');
-		putchar(buf[*start]);
-		(*start)++;
-	}
-	skipspace(buf, start, end);
 	return(1);
 }
 
@@ -389,12 +454,11 @@ formatcode(const char *buf, size_t *start, 
 static void
 formatcodeln(const char *buf, size_t *start, size_t end, int nomacro)
 {
-	int		 last;
 
 	last = ' ';
 	while (*start < end)  {
 		if (*start + 1 < end && '<' == buf[*start + 1]) {
-			formatcode(buf, start, end, 1, last, nomacro);
+			formatcode(buf, start, end, 1, nomacro);
 			continue;
 		}
 		/*
@@ -411,10 +475,15 @@ formatcodeln(const char *buf, size_t *start, size_t en
 				 ' ' == buf[*start + 2]))
 			printf("\\&");
 
-		if ('\n' != buf[*start])
-			putchar(last = buf[*start]);
-		else
+		if ('\n' == buf[*start])
 			putchar(last = ' ');
+		else
+			putchar(last = buf[*start]);
+
+		/* Protect against character escapes. */
+		if ('\\' == last)
+			putchar('e');
+
 		(*start)++;
 	}
 }
@@ -470,7 +539,9 @@ command(struct state *st, const char *buf, size_t star
 		return;
 
 	start += csz;
-	skipspace(buf, &start, end);
+	while (start < end && ' ' == buf[start])
+		start++;
+
 	len = end - start;
 
 	if (st->paused) {
@@ -534,6 +605,14 @@ command(struct state *st, const char *buf, size_t star
 		st->lstack[st->lpos - 1] = LIST__MAX;
 		break;
 	case (CMD_ITEM):
+		if (0 == st->lpos) {
+			/* 
+			 * Bad markup.
+			 * Try to compensate.
+			 */
+			st->lstack[st->lpos] = LIST__MAX;
+			st->lpos++;
+		}
 		assert(st->lpos > 0);
 		/*
 		 * If we're the first =item, guess at what our content
@@ -615,12 +694,25 @@ command(struct state *st, const char *buf, size_t star
 static void
 verbatim(struct state *st, const char *buf, size_t start, size_t end)
 {
+	int		 last;
 
 	if ( ! st->parsing || st->paused)
 		return;
 
 	puts(".Bd -literal");
-	printf("%.*s\n", (int)(end - start), &buf[start]);
+	for (last = ' '; start < end; start++) {
+		/*
+		 * Neutralise accidental macros (a '.' or '\'' right
+		 * after a newline) with "\&"; a backslash gets an 'e'.
+		 */
+		if ('\n' == last)
+			if ('.' == buf[start] || '\'' == buf[start])
+				printf("\\&");
+		putchar(last = buf[start]);
+		if ('\\' == buf[start])
+			printf("e");
+	}
+	putchar('\n');
 	puts(".Ed");
 }
 
@@ -635,7 +727,6 @@ verbatim(struct state *st, const char *buf, size_t sta
 static void
 ordinary(struct state *st, const char *buf, size_t start, size_t end)
 {
-	int		last;
 	size_t		i, j;
 
 	if ( ! st->parsing || st->paused)
@@ -689,6 +780,8 @@ ordinary(struct state *st, const char *buf, size_t sta
 			else if ('\n' == last && '\'' == buf[start])
 				printf("\\&");
 			putchar(last = buf[start++]);
+			if ('\\' == last)
+				putchar('e');
 		}
 
 		if (start < end - 1 && '<' == buf[start + 1]) {
@@ -698,9 +791,14 @@ ordinary(struct state *st, const char *buf, size_t sta
 			 * what, so print a newline now.
 			 * Then print the (possibly nested) macros and
 			 * following that, a newline.
+			 * Consume all whitespace so we don't
+			 * accidentally start an implicit literal line.
 			 */
-			if (formatcode(buf, &start, end, 0, last, 0))
+			if (formatcode(buf, &start, end, 0, 0)) {
 				putchar(last = '\n');
+				while (start < end && ' ' == buf[start])
+					start++;
+			}
 		} else if (start < end && '\n' == buf[start]) {
 			/*
 			 * Print the newline only if we haven't already
@@ -735,6 +833,8 @@ ordinary(struct state *st, const char *buf, size_t sta
 			else if ('\n' == last && '\'' == buf[start])
 				printf("\\&");
 			putchar(last = buf[start++]);
+			if ('\\' == last)
+				putchar('e');
 		}
 	}