===================================================================
RCS file: /cvs/pod2mdoc/pod2mdoc.c,v
retrieving revision 1.12
retrieving revision 1.25
diff -u -p -r1.12 -r1.25
--- pod2mdoc/pod2mdoc.c	2014/04/01 11:58:32	1.12
+++ pod2mdoc/pod2mdoc.c	2014/07/11 09:05:03	1.25
@@ -1,4 +1,4 @@
-/*	$Id: pod2mdoc.c,v 1.12 2014/04/01 11:58:32 kristaps Exp $ */
+/*	$Id: pod2mdoc.c,v 1.25 2014/07/11 09:05:03 schwarze Exp $ */
 /*
  * Copyright (c) 2014 Kristaps Dzonsons <kristaps@bsd.lv>
  *
@@ -27,7 +27,9 @@
 #include <unistd.h>
 
 /*
- * In what section can we find Perl manuals?
+ * In what section can we find Perl module manuals?
+ * Sometimes (Mac OS X) it's 3pm, sometimes (OpenBSD, etc.) 3p.
+ * XXX IF YOU CHANGE THIS, CHANGE POD2MDOC.1 AS WELL.
  */
 #define	PERL_SECTION	"3p"
 
@@ -170,76 +172,206 @@ formatescape(const char *buf, size_t *start, size_t en
 
 /*
  * Run some heuristics to intuit a link format.
- * I recognise L<foo::bar> as a Perl manpage, printing it in section 3p;
- * or a general UNIX foo(5) manpage.
- * If I recognise one, I set "start" to be the end of the sequence so
+ * I set "start" to be the end of the sequence (the last '>') so
  * that the caller can safely just continue processing.
- * Otherwise, I don't touch "start".
+ * If this is just an empty tag, I'll return 0.
  */
 static int
 trylink(const char *buf, size_t *start, size_t end, size_t dsz)
 {
-	size_t		sv, nstart, nend, i, j;
-	int		hasdouble;
+	size_t		 linkstart, realend, linkend, 
+			 i, j, textsz, stack;
+	const char	*text;
 
 	/* 
 	 * Scan to the start of the terminus. 
 	 * This function is more or less replicated in the formatcode()
 	 * for null or index formatting codes.
+	 * However, we're slightly different because we might have
+	 * nested escapes we need to ignore.
 	 */
-	hasdouble = 0;
-	for (sv = nstart = *start; nstart < end; nstart++) {
-		/* Do we have a double-colon? */
-		if (':' == buf[nstart] && 
-			nstart > sv &&
-			':' == buf[nstart - 1])
-			hasdouble = 1;
-		if ('>' != buf[nstart])
+	stack = 0;
+	for (linkstart = realend = *start; realend < end; realend++) {
+		if ('<' == buf[realend])
+			stack++;
+		if ('>' != buf[realend])
 			continue;
-		else if (dsz == 1)
+		else if (stack-- > 0)
+			continue;
+		if (dsz == 1)
 			break;
-		assert(nstart > 0);
-		if (' ' != buf[nstart - 1])
+		assert(realend > 0);
+		if (' ' != buf[realend - 1])
 			continue;
-		i = nstart;
-		for (j = 0; i < end && j < dsz; j++) 
+		for (i = realend, j = 0; i < end && j < dsz; j++) 
 			if ('>' != buf[i++])
 				break;
 		if (dsz == j) 
 			break;
 	}
-	
-	/* We don't care about stubs. */
-	if (nstart == end || nstart == *start)
+
+	/* Ignore stubs. */
+	if (realend == end || realend == *start)
 		return(0);
 
-	/* Set nend to the end of content. */
-	nend = nstart;
-	if (dsz > 1)
-		nend--;
+	/* Set linkend to the end of content. */
+	linkend = dsz > 1 ? realend - 1 : realend;
 
-	/*
-	 * Provide for some common invocations of the link primitive.
-	 * First, allow us to link to other Perl manuals.
-	 */
-	if (hasdouble)
+	/* Re-scan to see if we have a title or section. */
+	text = &buf[*start];
+	for (textsz = *start; textsz < linkend; textsz++)
+		if ('|' == buf[textsz] || '/' == buf[textsz])
+			break;
+
+	if (textsz < linkend && '|' == buf[textsz]) {
+		/* With title: set start, then end at section. */
+		linkstart = textsz + 1;
+		textsz = textsz - *start;
+		for (i = linkstart; i < linkend; i++)
+			if ('/' == buf[i])
+				break;
+		if (i < linkend)
+			linkend = i;
+	} else if (textsz < linkend && '/' == buf[textsz]) {
+		/* With section: set end at section. */
+		linkend = textsz;
+		textsz = 0;
+	} else
+		/* No title, no section. */
+		textsz = 0;
+
+	*start = realend;
+	j = linkend - linkstart;
+
+	/* Do we have only subsection material? */
+	if (0 == j && '/' == buf[linkend]) {
+		linkstart = linkend + 1;
+		linkend = dsz > 1 ? realend - 1 : realend;
+		if (0 == (j = linkend - linkstart))
+			return(0);
+		printf("Sx %.*s", (int)j, &buf[linkstart]);
+		return(1);
+	} else if (0 == j)
+		return(0);
+
+	/* A URL scheme prefix makes this a link; compare the prefix only. */
+	if ((j > 4 && 0 == memcmp("http:", &buf[linkstart], 5)) ||
+		(j > 5 && 0 == memcmp("https:", &buf[linkstart], 6)) ||
+		(j > 3 && 0 == memcmp("ftp:", &buf[linkstart], 4)) ||
+		(j > 4 && 0 == memcmp("sftp:", &buf[linkstart], 5)) ||
+		(j > 3 && 0 == memcmp("smb:", &buf[linkstart], 4)) ||
+		(j > 3 && 0 == memcmp("afs:", &buf[linkstart], 4))) {
+		/* Print up to the real end: a '/' in the URL cut linkend short. */
+		printf("Lk %.*s", (int)((dsz > 1 ? realend - 1 :
+			realend) - linkstart), &buf[linkstart]);
+		return(1);
+	}
+
+	/* "mailto:" followed by anything is handed to "Mt" as-is. */
+	if (j > 6 && 0 == memcmp("mailto:", &buf[linkstart], 7)) {
+		printf("Mt %.*s", (int)j, &buf[linkstart]);
+		return(1);
+	}
+
+	/* See if we're a foo(5), foo(5x), or foo(5xx) manpage. */
+	if ((j > 3 && ')' == buf[linkend - 1]) && 
+		('(' == buf[linkend - 3])) {
+		printf("Xr %.*s %c", (int)(j - 3), 
+			&buf[linkstart], buf[linkend - 2]);
+		return(1);
+	} else if ((j > 4 && ')' == buf[linkend - 1]) &&
+		('(' == buf[linkend - 4])) {
+		printf("Xr %.*s %.*s", (int)(j - 4), 
+			&buf[linkstart], 2, &buf[linkend - 3]);
+		return(1);
+	} else if ((j > 5 && ')' == buf[linkend - 1]) &&
+		('(' == buf[linkend - 5])) {
+		printf("Xr %.*s %.*s", (int)(j - 5), 
+			&buf[linkstart], 3, &buf[linkend - 4]);
+		return(1);
+	}
+
+	/* Last try: do we have a double-colon? */
+	for (i = linkstart + 1; i < linkend; i++)
+		if (':' == buf[i] && ':' == buf[i - 1])
+			break;
+
+	if (i < linkend)
 		printf("Xr %.*s " PERL_SECTION, 
-			(int)(nend - sv), &buf[sv]);
-	else if (nend - sv > 3 && isalnum(buf[sv]) && 
-			')' == buf[nend - 1] && 
-			isdigit((int)buf[nend - 2]) &&
-			'(' == buf[nend - 3]) 
-		printf("Xr %.*s %c", 
-			(int)(nend - 3 - sv),
-			&buf[sv], buf[nend - 2]);
+			(int)j, &buf[linkstart]);
 	else
-		return(0);
+		printf("Xr %.*s 1", (int)j, &buf[linkstart]);
 
-	*start = nstart;
 	return(1);
 }
 
 /*
+ * Doclifting: if we're a bold "-xx" and we're in the SYNOPSIS section,
+ * then it's likely that we're a flag.
+ * Our flag might be followed by an argument, so make sure that we're
+ * accounting for that, too.
+ * If we don't have a flag at all, however, then assume we're an "Ar".
+ */
+static void
+dosynopsisfl(const char *buf, size_t *start, size_t end)
+{
+	size_t	 i;	/* scan position past the flag name */
+again:
+	assert(*start + 1 < end);
+	assert('-' == buf[*start]);
+	/* ctype arguments go through unsigned char: plain char may be negative. */
+	if ( ! isalnum((unsigned char)buf[*start + 1]) &&
+		'?' != buf[*start + 1] &&
+		'-' != buf[*start + 1]) {
+		(*start)--;
+		fputs("Ar ", stdout);
+		return;
+	}
+	/* Step over the dash, then scan across the flag's name. */
+	(*start)++;
+	for (i = *start; i < end; i++)
+		if (isalnum((unsigned char)buf[i]))
+			continue;
+		else if ('?' == buf[i])
+			continue;
+		else if ('-' == buf[i])
+			continue;
+		else if ('_' == buf[i])
+			continue;
+		else
+			break;
+
+	assert(i < end);
+	/* Only a name ending flush at a space or '>' counts as a flag. */
+	if ( ! (' ' == buf[i] || '>' == buf[i])) {
+		printf("Ar ");
+		return;
+	}
+	/* Prefix "\&" to an Xx-shaped name (presumably to dodge macro parsing). */
+	printf("Fl ");
+	if (end - *start > 1 &&
+		isupper((unsigned char)buf[*start]) &&
+		islower((unsigned char)buf[*start + 1]) &&
+		(end - *start == 2 ||
+		 ' ' == buf[*start + 2]))
+		printf("\\&");
+	printf("%.*s ", (int)(i - *start), &buf[*start]);
+	*start = i;
+	/* Spaces mean more follows: another flag, or else an argument. */
+	if (' ' == buf[i]) {
+		while (i < end && ' ' == buf[i])
+			i++;
+		assert(i < end);
+		if ('-' == buf[i]) {
+			*start = i;
+			goto again;
+		}
+		printf("Ar ");
+		*start = i;
+	}
+}
+
+/*
  * We're at the character in front of a format code, which is structured
  * like X<...> and can contain nested format codes.
  * This consumes the whole format code, and any nested format codes, til
@@ -248,13 +380,16 @@ trylink(const char *buf, size_t *start, size_t end, si
  * been printed to the current line.
  * If "nomacro", then we don't print any macros, just contained data
  * (e.g., following "Sh" or "Nm").
+ * "pos" is only significant in SYNOPSIS, and should be 0 when invoked
+ * as the first format code on a line (for decoration as an "Nm"),
+ * non-zero otherwise.
  * Return whether we've printed a macro or not--in other words, whether
  * this should trigger a subsequent newline (this should be ignored when
  * reentrant).
  */
 static int
-formatcode(struct state *st, const char *buf, 
-	size_t *start, size_t end, int reentrant, int nomacro)
+formatcode(struct state *st, const char *buf, size_t *start, 
+	size_t end, int reentrant, int nomacro, int pos)
 {
 	enum fmt	 fmt;
 	size_t		 i, j, dsz;
@@ -325,6 +460,13 @@ formatcode(struct state *st, const char *buf, 
 			(*start) += dsz;
 			break;
 		}
+		if (*start < end) {
+			assert('>' == buf[*start]);
+			(*start)++;
+		}
+		if (isspace(last))
+			while (*start < end && isspace((int)buf[*start]))
+				(*start)++;
 		return(0);
 	}
 
@@ -360,27 +502,14 @@ formatcode(struct state *st, const char *buf, 
 			printf("Em ");
 			break;
 		case (FMT_BOLD):
-			/*
-			 * Doclifting: if we're a bold "-xx" and we're
-			 * in the SYNOPSIS section, then it's likely
-			 * that we're a flag.
-			 * Be really strict: only do this when the dash
-			 * is followed by alnums til the end marker,
-			 * which mustn't be a custom.
-			 */
-			if (SECT_SYNOPSIS == st->sect &&
-				end - *start > 1 &&
-				'-' == buf[*start] &&
-				(isalnum((int)buf[*start + 1]) ||
-				 '?' == buf[*start + 1])) {
-				for (i = *start + 1; i < end; i++)
-					if ( ! isalnum((int)buf[i]))
-						break;
-				if (i < end && '>' == buf[i]) {
-					(*start)++;
-					printf("Fl ");
-					break;
-				}
+			if (SECT_SYNOPSIS == st->sect) { 
+				if (1 == dsz && '-' == buf[*start])
+					dosynopsisfl(buf, start, end);
+				else if (0 == pos)
+					printf("Nm ");
+				else
+					printf("Ar ");
+				break;
 			} 
 			printf("Sy ");
 			break;
@@ -388,6 +517,7 @@ formatcode(struct state *st, const char *buf, 
 			printf("Qo Li ");
 			break;
 		case (FMT_LINK):
+			/* Try to link; use "No" if it's empty. */
 			if ( ! trylink(buf, start, end, dsz))
 				printf("No ");
 			break;
@@ -430,7 +560,7 @@ formatcode(struct state *st, const char *buf, 
 			}
 		}
 		if (*start + 1 < end && '<' == buf[*start + 1]) {
-			formatcode(st, buf, start, end, 1, nomacro);
+			formatcode(st, buf, start, end, 1, nomacro, 1);
 			continue;
 		}
 
@@ -492,7 +622,7 @@ formatcodeln(struct state *st, const char *buf, 
 	last = ' ';
 	while (*start < end)  {
 		if (*start + 1 < end && '<' == buf[*start + 1]) {
-			formatcode(st, buf, start, end, 1, nomacro);
+			formatcode(st, buf, start, end, 1, nomacro, 1);
 			continue;
 		}
 		/*
@@ -733,10 +863,47 @@ static void
 verbatim(struct state *st, const char *buf, size_t start, size_t end)
 {
 	int		 last;
+	size_t		 i;
 
 	if ( ! st->parsing || st->paused)
 		return;
-
+again:
+	/* 
+	 * If we're in the SYNOPSIS, see if we're an #include block.
+	 * If we are, then print the "In" macro and re-loop.
+	 * This handles any number of inclusions, but only when they
+	 * come before the remaining parts...
+	 */
+	if (SECT_SYNOPSIS == st->sect) {
+		i = start;
+		for (i = start; i < end && ' ' == buf[i]; i++)
+			/* Spin. */ ;
+		if (i == end)
+			return;
+		/* We're an include block! */
+		if (end - i > 10 && 
+			0 == memcmp(&buf[i], "#include <", 10)) {
+			start = i + 10;
+			while (start < end && ' ' == buf[start])
+				start++;
+			fputs(".In ", stdout);
+			/* Stop til the '>' marker or we hit eoln. */
+			while (start < end && 
+				'>' != buf[start] && '\n' != buf[start])
+				putchar(buf[start++]);
+			putchar('\n');
+			if (start < end && '>' == buf[start])
+				start++;
+			if (start < end && '\n' == buf[start])
+				start++;
+			if (start < end) 
+				goto again;
+			return;
+		}
+	}
+	
+	if (start == end)
+		return;
 	puts(".Bd -literal");
 	for (last = ' '; start < end; start++) {
 		/*
@@ -755,6 +922,95 @@ verbatim(struct state *st, const char *buf, size_t sta
 }
 
 /*
+ * See dosynopsisop().
+ */
+static int
+hasmatch(const char *buf, size_t start, size_t end)
+{
+	size_t	 depth = 0;	/* '[' seen in the range and not yet closed */
+
+	for ( ; start < end; start++) {
+		if ('[' == buf[start])
+			depth++;
+		else if (']' == buf[start] && 0 == depth--)
+			return(1);
+	}
+	/* Ran out of input without finding an unnested ']'. */
+	return(0);
+}
+
+/*
+ * If we're in the SYNOPSIS section and we've encountered brackets in an
+ * ordinary paragraph, then try to see whether we're an [-option].
+ * Do this, if we're an opening bracket, by first seeing if we have a
+ * matching end via hasmatch().
+ * If we're an ending bracket, see if we have a stack already.
+ */
+static int
+dosynopsisop(const char *buf, int *last,
+	size_t *start, size_t end, size_t *opstack)
+{
+	/* Returns 1 if the bracket was emitted as an Oo/Oc macro, else 0. */
+	assert('[' == buf[*start] || ']' == buf[*start]);
+
+	if ('[' == buf[*start] && hasmatch(buf, *start + 1, end)) {
+		if ('\n' != *last)
+			putchar('\n');
+		puts(".Oo");
+		(*opstack)++;
+	} else if ('[' == buf[*start])
+		return(0);
+
+	if (']' == buf[*start] && *opstack > 0) {
+		if ('\n' != *last)
+			putchar('\n');
+		puts(".Oc");
+		(*opstack)--;
+	} else if (']' == buf[*start])
+		return(0);
+
+	(*start)++;	/* consume the bracket itself */
+	*last = '\n';	/* puts() left the output at a line start */
+	while (*start < end && ' ' == buf[*start])	/* never pass "end" */
+		(*start)++;
+	return(1);
+}
+
+/*
+ * Format multiple "Nm" manpage names in the NAME section.
+ */
+static void
+donamenm(struct state *st, const char *buf, size_t *start, size_t end)
+{
+	size_t	 word;	/* index of the next comma, or "end" if none */
+	/* Skip leading blanks; an empty name list is printed as "unknown". */
+	while (*start < end && ' ' == buf[*start])
+		(*start)++;
+
+	if (end == *start) {
+		puts(".Nm unknown");
+		return;
+	}
+	/* Emit one ".Nm" line per comma-separated name. */
+	while (*start < end) {
+		fputs(".Nm ", stdout);
+		for (word = *start; word < end; word++)
+			if (',' == buf[word])
+				break;
+		formatcodeln(st, buf, start, word, 1);
+		if (*start == end) {
+			putchar('\n');
+			continue;	/* the loop test now ends the loop */
+		}
+		assert(',' == buf[*start]);
+		puts(" ,");	/* the comma goes out as a separate word */
+		(*start)++;
+		while (*start < end && ' ' == buf[*start])
+			(*start)++;
+	}
+}
+
+/*
  * Ordinary paragraph.
  * Well, this is really the hardest--POD seems to assume that, for
  * example, a leading space implies a newline, and so on.
@@ -765,7 +1021,8 @@ verbatim(struct state *st, const char *buf, size_t sta
 static void
 ordinary(struct state *st, const char *buf, size_t start, size_t end)
 {
-	size_t		i, j;
+	size_t		i, j, opstack;
+	int		seq;
 
 	if ( ! st->parsing || st->paused)
 		return;
@@ -777,8 +1034,8 @@ ordinary(struct state *st, const char *buf, size_t sta
 	 * To wit, print out a "Nm" and "Nd" in that format.
 	 */
 	if (SECT_NAME == st->sect) {
-		for (i = end - 1; i > start; i--)
-			if ('-' == buf[i])
+		for (i = end - 2; i > start; i--)
+			if ('-' == buf[i] && ' ' == buf[i + 1])
 				break;
 		if ('-' == buf[i]) {
 			j = i;
@@ -786,11 +1043,11 @@ ordinary(struct state *st, const char *buf, size_t sta
 			for ( ; i > start; i--)
 				if ('-' != buf[i])
 					break;
-			printf(".Nm ");
-			formatcodeln(st, buf, &start, i + 1, 1);
-			putchar('\n');
+			donamenm(st, buf, &start, i + 1);
 			start = j + 1;
-			printf(".Nd ");
+			while (start < end && ' ' == buf[start])
+				start++;
+			fputs(".Nd ", stdout);
 			formatcodeln(st, buf, &start, end, 1);
 			putchar('\n');
 			return;
@@ -802,8 +1059,9 @@ ordinary(struct state *st, const char *buf, size_t sta
 
 	st->haspar = 0;
 	last = '\n';
+	opstack = 0;
 
-	while (start < end) {
+	for (seq = 0; start < end; seq++) {
 		/* 
 		 * Loop til we get either to a newline or escape. 
 		 * Escape initial control characters.
@@ -817,26 +1075,17 @@ ordinary(struct state *st, const char *buf, size_t sta
 				printf("\\&");
 			else if ('\n' == last && '\'' == buf[start])
 				printf("\\&");
-#if notyet
 			/*
 			 * If we're in the SYNOPSIS, have square
 			 * brackets indicate that we're opening and
 			 * closing an optional context.
 			 */
-			if (SECT_SYNOPSIS == st->sect) {
-				if ('[' == buf[start] || 
-					']' == buf[start]) {
-					if (last != '\n')
-						putchar('\n');
-					if ('[' == buf[start]) 
-						printf(".Oo\n");
-					else
-						printf(".Oc\n");
-					start++;
-					continue;
-				}
-			}
-#endif
+			if (SECT_SYNOPSIS == st->sect &&
+				('[' == buf[start] || 
+				 ']' == buf[start]) &&
+				dosynopsisop(buf, &last, 
+					&start, end, &opstack))
+				continue;
 			putchar(last = buf[start++]);
 			if ('\\' == last)
 				putchar('e');
@@ -851,8 +1100,19 @@ ordinary(struct state *st, const char *buf, size_t sta
 			 * following that, a newline.
 			 * Consume all whitespace so we don't
 			 * accidentally start an implicit literal line.
+			 * If the macro ends with a flush comma or
+			 * period, let mdoc(7) handle it for us.
 			 */
-			if (formatcode(st, buf, &start, end, 0, 0)) {
+			if (formatcode(st, buf, &start, end, 0, 0, seq)) {
+				if ((start == end - 1 || 
+					(start < end - 1 && 
+					 (' ' == buf[start + 1] ||
+					  '\n' == buf[start + 1]))) &&
+					('.' == buf[start] ||
+					 ',' == buf[start])) {
+					putchar(' ');
+					putchar(buf[start++]);
+				}
 				putchar(last = '\n');
 				while (start < end && ' ' == buf[start])
 					start++;
@@ -1117,8 +1377,8 @@ main(int argc, char *argv[])
 
 	/* Accept only a single input file. */
 
-	if (argc > 2)
-		return(EXIT_FAILURE);
+	if (argc > 1)
+		goto usage;
 	else if (1 == argc)
 		fname = *argv;
 
@@ -1127,7 +1387,7 @@ main(int argc, char *argv[])
 
 usage:
 	fprintf(stderr, "usage: %s [-d date] " 
-		"[-n title] [-s section]\n", name);
+	    "[-n title] [-s section] [file]\n", name);
 
 	return(EXIT_FAILURE);
 }