![]() ![]() | ![]() |
version 1.43, 2011/03/22 14:05:45 | version 1.60, 2011/10/24 20:30:57 | ||
---|---|---|---|
|
|
||
/* $Id$ */ | /* $Id$ */ | ||
/* | /* | ||
* Copyright (c) 2008, 2009, 2010 Kristaps Dzonsons <kristaps@bsd.lv> | * Copyright (c) 2008, 2009, 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv> | ||
* Copyright (c) 2011 Ingo Schwarze <schwarze@openbsd.org> | * Copyright (c) 2011 Ingo Schwarze <schwarze@openbsd.org> | ||
* | * | ||
* Permission to use, copy, modify, and distribute this software for any | * Permission to use, copy, modify, and distribute this software for any | ||
|
|
||
#include <assert.h> | #include <assert.h> | ||
#include <ctype.h> | #include <ctype.h> | ||
#include <errno.h> | |||
#include <limits.h> | |||
#include <stdlib.h> | #include <stdlib.h> | ||
#include <stdio.h> | #include <stdio.h> | ||
#include <string.h> | #include <string.h> | ||
|
|
||
static int a2time(time_t *, const char *, const char *); | static int a2time(time_t *, const char *, const char *); | ||
static char *time2a(time_t); | static char *time2a(time_t); | ||
static int numescape(const char *); | |||
int | /* | ||
mandoc_special(char *p) | * Pass over recursive numerical expressions. The context of this | ||
* function is important: it's only called within character-terminating | |||
* escapes (e.g., \s[xxxyyy]), so all we need to do is handle initial | |||
* recursion: we don't care about what's in these blocks. | |||
* This returns the number of characters skipped or -1 if an error | |||
* occurs (the caller should bail). | |||
*/ | |||
static int | |||
numescape(const char *start) | |||
{ | { | ||
int len, i; | int i; | ||
char term; | size_t sz; | ||
char *sv; | const char *cp; | ||
len = 0; | |||
term = '\0'; | |||
sv = p; | |||
assert('\\' == *p); | i = 0; | ||
p++; | |||
switch (*p++) { | /* The expression consists of a subexpression. */ | ||
#if 0 | |||
case ('Z'): | if ('\\' == start[i]) { | ||
cp = &start[++i]; | |||
/* | |||
* Read past the end of the subexpression. | |||
* Bail immediately on errors. | |||
*/ | |||
if (ESCAPE_ERROR == mandoc_escape(&cp, NULL, NULL)) | |||
return(-1); | |||
return(i + cp - &start[i]); | |||
} | |||
if ('(' != start[i++]) | |||
return(0); | |||
/* | |||
* A parenthesised subexpression. Read until the closing | |||
* parenthesis, making sure to handle any nested subexpressions | |||
* that might ruin our parse. | |||
*/ | |||
while (')' != start[i]) { | |||
sz = strcspn(&start[i], ")\\"); | |||
i += (int)sz; | |||
if ('\0' == start[i]) | |||
return(-1); | |||
else if ('\\' != start[i]) | |||
continue; | |||
cp = &start[++i]; | |||
if (ESCAPE_ERROR == mandoc_escape(&cp, NULL, NULL)) | |||
return(-1); | |||
i += cp - &start[i]; | |||
} | |||
/* Read past the terminating ')'. */ | |||
return(++i); | |||
} | |||
enum mandoc_esc | |||
mandoc_escape(const char **end, const char **start, int *sz) | |||
{ | |||
char c, term, numeric; | |||
int i, lim, ssz, rlim; | |||
const char *cp, *rstart; | |||
enum mandoc_esc gly; | |||
cp = *end; | |||
rstart = cp; | |||
if (start) | |||
*start = rstart; | |||
i = lim = 0; | |||
gly = ESCAPE_ERROR; | |||
term = numeric = '\0'; | |||
switch ((c = cp[i++])) { | |||
/* | |||
* First the glyphs. There are several different forms of | |||
* these, but each eventually returns a substring of the glyph | |||
* name. | |||
*/ | |||
case ('('): | |||
gly = ESCAPE_SPECIAL; | |||
lim = 2; | |||
break; | |||
case ('['): | |||
gly = ESCAPE_SPECIAL; | |||
/* | |||
* Unicode escapes are defined in groff as \[uXXXX] to | |||
* \[u10FFFF], where the contained value must be a valid | |||
* Unicode codepoint. Here, however, only check whether | |||
* it's not a zero-width escape. | |||
*/ | |||
if ('u' == cp[i] && ']' != cp[i + 1]) | |||
gly = ESCAPE_UNICODE; | |||
term = ']'; | |||
break; | |||
case ('C'): | |||
if ('\'' != cp[i]) | |||
return(ESCAPE_ERROR); | |||
gly = ESCAPE_SPECIAL; | |||
term = '\''; | |||
break; | |||
/* | |||
* Handle all triggers matching \X(xy, \Xx, and \X[xxxx], where | |||
* 'X' is the trigger. These have opaque sub-strings. | |||
*/ | |||
case ('F'): | |||
/* FALLTHROUGH */ | /* FALLTHROUGH */ | ||
case ('X'): | case ('g'): | ||
/* FALLTHROUGH */ | /* FALLTHROUGH */ | ||
case ('x'): | case ('k'): | ||
/* FALLTHROUGH */ | /* FALLTHROUGH */ | ||
case ('S'): | case ('M'): | ||
/* FALLTHROUGH */ | /* FALLTHROUGH */ | ||
case ('R'): | case ('m'): | ||
/* FALLTHROUGH */ | /* FALLTHROUGH */ | ||
case ('N'): | case ('n'): | ||
/* FALLTHROUGH */ | /* FALLTHROUGH */ | ||
case ('l'): | case ('V'): | ||
/* FALLTHROUGH */ | /* FALLTHROUGH */ | ||
case ('L'): | case ('Y'): | ||
gly = ESCAPE_IGNORE; | |||
/* FALLTHROUGH */ | /* FALLTHROUGH */ | ||
case ('H'): | case ('f'): | ||
if (ESCAPE_ERROR == gly) | |||
gly = ESCAPE_FONT; | |||
rstart= &cp[i]; | |||
if (start) | |||
*start = rstart; | |||
switch (cp[i++]) { | |||
case ('('): | |||
lim = 2; | |||
break; | |||
case ('['): | |||
term = ']'; | |||
break; | |||
default: | |||
lim = 1; | |||
i--; | |||
break; | |||
} | |||
break; | |||
/* | |||
* These escapes are of the form \X'Y', where 'X' is the trigger | |||
* and 'Y' is any string. These have opaque sub-strings. | |||
*/ | |||
case ('A'): | |||
/* FALLTHROUGH */ | /* FALLTHROUGH */ | ||
case ('h'): | case ('b'): | ||
/* FALLTHROUGH */ | /* FALLTHROUGH */ | ||
case ('D'): | case ('D'): | ||
/* FALLTHROUGH */ | /* FALLTHROUGH */ | ||
case ('C'): | case ('o'): | ||
/* FALLTHROUGH */ | /* FALLTHROUGH */ | ||
case ('b'): | case ('R'): | ||
/* FALLTHROUGH */ | /* FALLTHROUGH */ | ||
case ('B'): | case ('X'): | ||
/* FALLTHROUGH */ | /* FALLTHROUGH */ | ||
case ('a'): | case ('Z'): | ||
/* FALLTHROUGH */ | if ('\'' != cp[i++]) | ||
case ('A'): | return(ESCAPE_ERROR); | ||
if (*p++ != '\'') | gly = ESCAPE_IGNORE; | ||
return(0); | |||
term = '\''; | term = '\''; | ||
break; | break; | ||
#endif | |||
/* | |||
* These escapes are of the form \X'N', where 'X' is the trigger | |||
* and 'N' resolves to a numerical expression. | |||
*/ | |||
case ('B'): | |||
/* FALLTHROUGH */ | |||
case ('h'): | case ('h'): | ||
/* FALLTHROUGH */ | /* FALLTHROUGH */ | ||
case ('H'): | |||
/* FALLTHROUGH */ | |||
case ('L'): | |||
/* FALLTHROUGH */ | |||
case ('l'): | |||
gly = ESCAPE_NUMBERED; | |||
/* FALLTHROUGH */ | |||
case ('S'): | |||
/* FALLTHROUGH */ | |||
case ('v'): | case ('v'): | ||
/* FALLTHROUGH */ | /* FALLTHROUGH */ | ||
case ('w'): | |||
/* FALLTHROUGH */ | |||
case ('x'): | |||
if (ESCAPE_ERROR == gly) | |||
gly = ESCAPE_IGNORE; | |||
if ('\'' != cp[i++]) | |||
return(ESCAPE_ERROR); | |||
term = numeric = '\''; | |||
break; | |||
/* | |||
* Special handling for the numbered character escape. | |||
* XXX Do any other escapes need similar handling? | |||
*/ | |||
case ('N'): | |||
if ('\0' == cp[i]) | |||
return(ESCAPE_ERROR); | |||
*end = &cp[++i]; | |||
if (isdigit((unsigned char)cp[i-1])) | |||
return(ESCAPE_IGNORE); | |||
while (isdigit((unsigned char)**end)) | |||
(*end)++; | |||
if (start) | |||
*start = &cp[i]; | |||
if (sz) | |||
*sz = *end - &cp[i]; | |||
if ('\0' != **end) | |||
(*end)++; | |||
return(ESCAPE_NUMBERED); | |||
/* | |||
* Sizes get a special category of their own. | |||
*/ | |||
case ('s'): | case ('s'): | ||
if (ASCII_HYPH == *p) | gly = ESCAPE_IGNORE; | ||
*p = '-'; | |||
i = 0; | rstart = &cp[i]; | ||
if ('+' == *p || '-' == *p) { | if (start) | ||
p++; | *start = rstart; | ||
i = 1; | |||
} | |||
switch (*p++) { | /* See +/- counts as a sign. */ | ||
c = cp[i]; | |||
if ('+' == c || '-' == c || ASCII_HYPH == c) | |||
++i; | |||
switch (cp[i++]) { | |||
case ('('): | case ('('): | ||
len = 2; | lim = 2; | ||
break; | break; | ||
case ('['): | case ('['): | ||
term = ']'; | term = numeric = ']'; | ||
break; | break; | ||
case ('\''): | case ('\''): | ||
term = '\''; | term = numeric = '\''; | ||
break; | break; | ||
case ('0'): | |||
i = 1; | |||
/* FALLTHROUGH */ | |||
default: | default: | ||
len = 1; | lim = 1; | ||
p--; | i--; | ||
break; | break; | ||
} | } | ||
if (ASCII_HYPH == *p) | /* See +/- counts as a sign. */ | ||
*p = '-'; | c = cp[i]; | ||
if ('+' == *p || '-' == *p) { | if ('+' == c || '-' == c || ASCII_HYPH == c) | ||
if (i) | ++i; | ||
return(0); | |||
p++; | |||
} | |||
/* Handle embedded numerical subexp or escape. */ | |||
if ('(' == *p) { | break; | ||
while (*p && ')' != *p) | |||
if ('\\' == *p++) { | |||
i = mandoc_special(--p); | |||
if (0 == i) | |||
return(0); | |||
p += i; | |||
} | |||
if (')' == *p++) | /* | ||
break; | * Anything else is assumed to be a glyph. | ||
*/ | |||
default: | |||
gly = ESCAPE_SPECIAL; | |||
lim = 1; | |||
i--; | |||
break; | |||
} | |||
return(0); | assert(ESCAPE_ERROR != gly); | ||
} else if ('\\' == *p) { | |||
if (0 == (i = mandoc_special(p))) | |||
return(0); | |||
p += i; | |||
} | |||
break; | rstart = &cp[i]; | ||
#if 0 | if (start) | ||
case ('Y'): | *start = rstart; | ||
/* FALLTHROUGH */ | |||
case ('V'): | /* | ||
/* FALLTHROUGH */ | * If a terminating block has been specified, we need to | ||
case ('$'): | * handle the case of recursion, which could have their | ||
/* FALLTHROUGH */ | * own terminating blocks that mess up our parse. This, by the | ||
case ('n'): | * way, means that the "start" and "size" values will be | ||
/* FALLTHROUGH */ | * effectively meaningless. | ||
#endif | */ | ||
case ('k'): | |||
/* FALLTHROUGH */ | ssz = 0; | ||
case ('M'): | if (numeric && -1 == (ssz = numescape(&cp[i]))) | ||
/* FALLTHROUGH */ | return(ESCAPE_ERROR); | ||
case ('m'): | |||
/* FALLTHROUGH */ | i += ssz; | ||
case ('f'): | rlim = -1; | ||
/* FALLTHROUGH */ | |||
case ('F'): | /* | ||
/* FALLTHROUGH */ | * We have a character terminator. Try to read up to that | ||
case ('*'): | * character. If we can't (i.e., we hit the nil), then return | ||
switch (*p++) { | * an error; if we can, calculate our length, read past the | ||
case ('('): | * terminating character, and exit. | ||
len = 2; | */ | ||
if ('\0' != term) { | |||
*end = strchr(&cp[i], term); | |||
if ('\0' == *end) | |||
return(ESCAPE_ERROR); | |||
rlim = *end - &cp[i]; | |||
if (sz) | |||
*sz = rlim; | |||
(*end)++; | |||
goto out; | |||
} | |||
assert(lim > 0); | |||
/* | |||
* We have a numeric limit. If the string is shorter than that, | |||
* stop and return an error. Else adjust our endpoint, length, | |||
* and return the current glyph. | |||
*/ | |||
if ((size_t)lim > strlen(&cp[i])) | |||
return(ESCAPE_ERROR); | |||
rlim = lim; | |||
if (sz) | |||
*sz = rlim; | |||
*end = &cp[i] + lim; | |||
out: | |||
assert(rlim >= 0 && rstart); | |||
/* Run post-processors. */ | |||
switch (gly) { | |||
case (ESCAPE_FONT): | |||
if (1 != rlim) | |||
break; | break; | ||
case ('['): | switch (*rstart) { | ||
term = ']'; | case ('3'): | ||
/* FALLTHROUGH */ | |||
case ('B'): | |||
gly = ESCAPE_FONTBOLD; | |||
break; | break; | ||
default: | case ('2'): | ||
len = 1; | /* FALLTHROUGH */ | ||
p--; | case ('I'): | ||
gly = ESCAPE_FONTITALIC; | |||
break; | break; | ||
case ('P'): | |||
gly = ESCAPE_FONTPREV; | |||
break; | |||
case ('1'): | |||
/* FALLTHROUGH */ | |||
case ('R'): | |||
gly = ESCAPE_FONTROMAN; | |||
break; | |||
} | } | ||
break; | break; | ||
case ('('): | case (ESCAPE_SPECIAL): | ||
len = 2; | if (1 != rlim) | ||
break; | |||
case ('['): | |||
term = ']'; | |||
break; | |||
case ('z'): | |||
len = 1; | |||
if ('\\' == *p) { | |||
if (0 == (i = mandoc_special(p))) | |||
return(0); | |||
p += i; | |||
return(*p ? (int)(p - sv) : 0); | |||
} | |||
break; | |||
case ('o'): | |||
/* FALLTHROUGH */ | |||
case ('w'): | |||
if ('\'' == *p++) { | |||
term = '\''; | |||
break; | break; | ||
} | if ('c' == *rstart) | ||
/* FALLTHROUGH */ | gly = ESCAPE_NOSPACE; | ||
break; | |||
default: | default: | ||
len = 1; | |||
p--; | |||
break; | break; | ||
} | } | ||
if (term) { | return(gly); | ||
for ( ; *p && term != *p; p++) | |||
if (ASCII_HYPH == *p) | |||
*p = '-'; | |||
return(*p ? (int)(p - sv) : 0); | |||
} | |||
for (i = 0; *p && i < len; i++, p++) | |||
if (ASCII_HYPH == *p) | |||
*p = '-'; | |||
return(i == len ? (int)(p - sv) : 0); | |||
} | } | ||
void * | void * | ||
mandoc_calloc(size_t num, size_t size) | mandoc_calloc(size_t num, size_t size) | ||
{ | { | ||
|
|
||
return(ptr); | return(ptr); | ||
} | } | ||
char * | |||
mandoc_strndup(const char *ptr, size_t sz) | |||
{ | |||
char *p; | |||
p = mandoc_malloc(sz + 1); | |||
memcpy(p, ptr, sz); | |||
p[(int)sz] = '\0'; | |||
return(p); | |||
} | |||
char * | char * | ||
mandoc_strdup(const char *ptr) | mandoc_strdup(const char *ptr) | ||
{ | { | ||
|
|
||
/* Quoting can only start with a new word. */ | /* Quoting can only start with a new word. */ | ||
start = *cpp; | start = *cpp; | ||
quoted = 0; | |||
if ('"' == *start) { | if ('"' == *start) { | ||
quoted = 1; | quoted = 1; | ||
start++; | start++; | ||
} else | } | ||
quoted = 0; | |||
pairs = 0; | pairs = 0; | ||
white = 0; | white = 0; | ||
|
|
||
memset(&tm, 0, sizeof(struct tm)); | memset(&tm, 0, sizeof(struct tm)); | ||
pp = NULL; | |||
#ifdef HAVE_STRPTIME | |||
pp = strptime(p, fmt, &tm); | pp = strptime(p, fmt, &tm); | ||
#endif | |||
if (NULL != pp && '\0' == *pp) { | if (NULL != pp && '\0' == *pp) { | ||
*t = mktime(&tm); | *t = mktime(&tm); | ||
return(1); | return(1); | ||
|
|
||
static char * | static char * | ||
time2a(time_t t) | time2a(time_t t) | ||
{ | { | ||
struct tm tm; | struct tm *tm; | ||
char *buf, *p; | char *buf, *p; | ||
size_t ssz; | size_t ssz; | ||
int isz; | int isz; | ||
localtime_r(&t, &tm); | tm = localtime(&t); | ||
/* | /* | ||
* Reserve space: | * Reserve space: | ||
|
|
||
*/ | */ | ||
p = buf = mandoc_malloc(10 + 4 + 4 + 1); | p = buf = mandoc_malloc(10 + 4 + 4 + 1); | ||
if (0 == (ssz = strftime(p, 10 + 1, "%B ", &tm))) | if (0 == (ssz = strftime(p, 10 + 1, "%B ", tm))) | ||
goto fail; | goto fail; | ||
p += (int)ssz; | p += (int)ssz; | ||
if (-1 == (isz = snprintf(p, 4 + 1, "%d, ", tm.tm_mday))) | if (-1 == (isz = snprintf(p, 4 + 1, "%d, ", tm->tm_mday))) | ||
goto fail; | goto fail; | ||
p += isz; | p += isz; | ||
if (0 == strftime(p, 4 + 1, "%Y", &tm)) | if (0 == strftime(p, 4 + 1, "%Y", tm)) | ||
goto fail; | goto fail; | ||
return(buf); | return(buf); | ||
|
|
||
/* | /* | ||
* End-of-sentence recognition must include situations where | * End-of-sentence recognition must include situations where | ||
* some symbols, such as `)', allow prior EOS punctuation to | * some symbols, such as `)', allow prior EOS punctuation to | ||
* propogate outward. | * propagate outward. | ||
*/ | */ | ||
found = 0; | found = 0; | ||
|
|
||
return(found && !enclosed); | return(found && !enclosed); | ||
} | } | ||
/* | |||
* Find out whether a line is a macro line or not. If it is, adjust the | |||
* current position and return one; if it isn't, return zero and don't | |||
* change the current position. | |||
*/ | |||
int | int | ||
mandoc_hyph(const char *start, const char *c) | mandoc_getcontrol(const char *cp, int *ppos) | ||
{ | { | ||
int pos; | |||
/* | pos = *ppos; | ||
* Choose whether to break at a hyphenated character. We only | |||
* do this if it's free-standing within a word. | |||
*/ | |||
/* Skip first/last character of buffer. */ | if ('\\' == cp[pos] && '.' == cp[pos + 1]) | ||
if (c == start || '\0' == *(c + 1)) | pos += 2; | ||
else if ('.' == cp[pos] || '\'' == cp[pos]) | |||
pos++; | |||
else | |||
return(0); | return(0); | ||
/* Skip first/last character of word. */ | |||
if ('\t' == *(c + 1) || '\t' == *(c - 1)) | |||
return(0); | |||
if (' ' == *(c + 1) || ' ' == *(c - 1)) | |||
return(0); | |||
/* Skip double invocations. */ | |||
if ('-' == *(c + 1) || '-' == *(c - 1)) | |||
return(0); | |||
/* Skip escapes. */ | |||
if ('\\' == *(c - 1)) | |||
return(0); | |||
while (' ' == cp[pos] || '\t' == cp[pos]) | |||
pos++; | |||
*ppos = pos; | |||
return(1); | return(1); | ||
} | } | ||
/* | |||
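* Convert the first sz bytes of a string to an int in the given base, | |||
* clamping the result to [INT_MIN, INT_MAX]. Return -1 if sz exceeds 31, | |||
* the string is empty, or it is not consumed entirely as a number. | |||
* Negative inputs are returned as parsed, not mapped to -1. | |||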
*/ | |||
int | |||
mandoc_strntoi(const char *p, size_t sz, int base) | |||
{ | |||
char buf[32]; | |||
char *ep; | |||
long v; | |||
if (sz > 31) | |||
return(-1); | |||
memcpy(buf, p, sz); | |||
buf[(int)sz] = '\0'; | |||
errno = 0; | |||
v = strtol(buf, &ep, base); | |||
if (buf[0] == '\0' || *ep != '\0') | |||
return(-1); | |||
if (v > INT_MAX) | |||
v = INT_MAX; | |||
if (v < INT_MIN) | |||
v = INT_MIN; | |||
return((int)v); | |||
} |