version 1.11, 2014/11/01 04:08:43 |
version 1.17, 2018/12/13 11:55:47 |
|
|
|
|
#include <sys/types.h> |
#include <sys/types.h> |
|
|
|
#include <assert.h> |
#include <stdio.h> |
#include <stdio.h> |
#include <string.h> |
#include <string.h> |
|
|
#include "mandoc.h" |
#include "mandoc.h" |
|
#include "roff.h" |
|
#include "mandoc_parse.h" |
#include "libmandoc.h" |
#include "libmandoc.h" |
|
|
int |
int |
preconv_encode(struct buf *ib, size_t *ii, struct buf *ob, size_t *oi, |
preconv_encode(const struct buf *ib, size_t *ii, struct buf *ob, size_t *oi, |
int *filenc) |
int *filenc) |
{ |
{ |
size_t i; |
const unsigned char *cu; |
const long one = 1L; |
int nby; |
int state, be; |
unsigned int accum; |
unsigned int accum; |
|
unsigned char cu; |
|
|
|
|
cu = (const unsigned char *)ib->buf + *ii; |
|
assert(*cu & 0x80); |
|
|
if ( ! (*filenc & MPARSE_UTF8)) |
if ( ! (*filenc & MPARSE_UTF8)) |
goto latin; |
goto latin; |
|
|
state = 0; |
nby = 1; |
accum = 0U; |
while (nby < 5 && *cu & (1 << (7 - nby))) |
be = 0; |
nby++; |
|
|
/* Quick test for big-endian value. */ |
switch (nby) { |
|
case 2: |
|
accum = *cu & 0x1f; |
|
if (accum < 0x02) /* Obfuscated ASCII. */ |
|
goto latin; |
|
break; |
|
case 3: |
|
accum = *cu & 0x0f; |
|
break; |
|
case 4: |
|
accum = *cu & 0x07; |
|
if (accum > 0x04) /* Beyond Unicode. */ |
|
goto latin; |
|
break; |
|
default: /* Bad sequence header. */ |
|
goto latin; |
|
} |
|
|
if ( ! (*((const char *)(&one)))) |
cu++; |
be = 1; |
switch (nby) { |
|
case 3: |
|
if ((accum == 0x00 && ! (*cu & 0x20)) || /* Use 2-byte. */ |
|
(accum == 0x0d && *cu & 0x20)) /* Surrogates. */ |
|
goto latin; |
|
break; |
|
case 4: |
|
if ((accum == 0x00 && ! (*cu & 0x30)) || /* Use 3-byte. */ |
|
(accum == 0x04 && *cu & 0x30)) /* Beyond Unicode. */ |
|
goto latin; |
|
break; |
|
default: |
|
break; |
|
} |
|
|
for (i = *ii; i < ib->sz; i++) { |
while (--nby) { |
cu = ib->buf[i]; |
if ((*cu & 0xc0) != 0x80) /* Invalid continuation. */ |
if (state) { |
goto latin; |
if ( ! (cu & 128) || (cu & 64)) { |
accum <<= 6; |
/* Bad sequence header. */ |
accum += *cu & 0x3f; |
break; |
cu++; |
} |
|
|
|
/* Accept only legitimate bit patterns. */ |
|
|
|
if (cu > 191 || cu < 128) { |
|
/* Bad in-sequence bits. */ |
|
break; |
|
} |
|
|
|
accum |= (cu & 63) << --state * 6; |
|
|
|
if (state) |
|
continue; |
|
|
|
/* |
|
* Accum is held in little-endian order as |
|
* stipulated by the UTF-8 sequence coding. We |
|
* need to convert to a native big-endian if our |
|
* architecture requires it. |
|
*/ |
|
|
|
if (be) |
|
accum = (accum >> 24) | |
|
((accum << 8) & 0x00FF0000) | |
|
((accum >> 8) & 0x0000FF00) | |
|
(accum << 24); |
|
|
|
if (accum < 0x80) |
|
ob->buf[(*oi)++] = accum; |
|
else |
|
*oi += snprintf(ob->buf + *oi, |
|
11, "\\[u%.4X]", accum); |
|
*ii = i + 1; |
|
*filenc &= ~MPARSE_LATIN1; |
|
return(1); |
|
} else { |
|
/* |
|
* Entering a UTF-8 state: if we encounter a |
|
* UTF-8 bitmask, calculate the expected UTF-8 |
|
* state from it. |
|
*/ |
|
for (state = 0; state < 7; state++) |
|
if ( ! (cu & (1 << (7 - state)))) |
|
break; |
|
|
|
/* Accept only legitimate bit patterns. */ |
|
|
|
switch (state--) { |
|
case (4): |
|
if (cu <= 244 && cu >= 240) { |
|
accum = (cu & 7) << 18; |
|
continue; |
|
} |
|
/* Bad 4-sequence start bits. */ |
|
break; |
|
case (3): |
|
if (cu <= 239 && cu >= 224) { |
|
accum = (cu & 15) << 12; |
|
continue; |
|
} |
|
/* Bad 3-sequence start bits. */ |
|
break; |
|
case (2): |
|
if (cu <= 223 && cu >= 194) { |
|
accum = (cu & 31) << 6; |
|
continue; |
|
} |
|
/* Bad 2-sequence start bits. */ |
|
break; |
|
default: |
|
/* Bad sequence bit mask. */ |
|
break; |
|
} |
|
break; |
|
} |
|
} |
} |
|
|
/* FALLTHROUGH: Invalid or incomplete UTF-8 sequence. */ |
assert(accum > 0x7f); |
|
assert(accum < 0x110000); |
|
assert(accum < 0xd800 || accum > 0xdfff); |
|
|
|
*oi += snprintf(ob->buf + *oi, 11, "\\[u%.4X]", accum); |
|
*ii = (const char *)cu - ib->buf; |
|
*filenc &= ~MPARSE_LATIN1; |
|
return 1; |
|
|
latin: |
latin: |
if ( ! (*filenc & MPARSE_LATIN1)) |
if ( ! (*filenc & MPARSE_LATIN1)) |
return(0); |
return 0; |
|
|
*oi += snprintf(ob->buf + *oi, 11, |
*oi += snprintf(ob->buf + *oi, 11, |
"\\[u%.4X]", (unsigned char)ib->buf[(*ii)++]); |
"\\[u%.4X]", (unsigned char)ib->buf[(*ii)++]); |
|
|
*filenc &= ~MPARSE_UTF8; |
*filenc &= ~MPARSE_UTF8; |
return(1); |
return 1; |
} |
} |
|
|
int |
int |
Line 158 preconv_cue(const struct buf *b, size_t offset) |
|
Line 124 preconv_cue(const struct buf *b, size_t offset) |
|
|
|
/* Check if we have the correct header/trailer. */ |
/* Check if we have the correct header/trailer. */ |
|
|
if ((sz = (size_t)(eoln - ln)) < 10 || |
if ((sz = (size_t)(eoln - ln)) < 10 || |
memcmp(ln, ".\\\" -*-", 7) || memcmp(eoln - 3, "-*-", 3)) |
memcmp(ln, ".\\\" -*-", 7) || memcmp(eoln - 3, "-*-", 3)) |
return(MPARSE_UTF8 | MPARSE_LATIN1); |
return MPARSE_UTF8 | MPARSE_LATIN1; |
|
|
/* Move after the header and adjust for the trailer. */ |
/* Move after the header and adjust for the trailer. */ |
|
|
Line 189 preconv_cue(const struct buf *b, size_t offset) |
|
Line 155 preconv_cue(const struct buf *b, size_t offset) |
|
sz -= phsz; |
sz -= phsz; |
ln += phsz; |
ln += phsz; |
continue; |
continue; |
} |
} |
|
|
sz -= 7; |
sz -= 7; |
ln += 7; |
ln += 7; |
Line 199 preconv_cue(const struct buf *b, size_t offset) |
|
Line 165 preconv_cue(const struct buf *b, size_t offset) |
|
sz--; |
sz--; |
} |
} |
if (0 == sz) |
if (0 == sz) |
return(0); |
return 0; |
|
|
/* Check us against known encodings. */ |
/* Check us against known encodings. */ |
|
|
if (phsz > 4 && !strncasecmp(ln, "utf-8", 5)) |
if (phsz > 4 && !strncasecmp(ln, "utf-8", 5)) |
return(MPARSE_UTF8); |
return MPARSE_UTF8; |
if (phsz > 10 && !strncasecmp(ln, "iso-latin-1", 11)) |
if (phsz > 10 && !strncasecmp(ln, "iso-latin-1", 11)) |
return(MPARSE_LATIN1); |
return MPARSE_LATIN1; |
return(0); |
return 0; |
} |
} |
return(MPARSE_UTF8 | MPARSE_LATIN1); |
return MPARSE_UTF8 | MPARSE_LATIN1; |
} |
} |