Annotation of mandoc/preconv.c, Revision 1.16
1.16 ! schwarze 1: /* $Id: preconv.c,v 1.15 2015/10/06 18:32:19 schwarze Exp $ */
1.1 kristaps 2: /*
3: * Copyright (c) 2011 Kristaps Dzonsons <kristaps@bsd.lv>
1.9 schwarze 4: * Copyright (c) 2014 Ingo Schwarze <schwarze@openbsd.org>
1.1 kristaps 5: *
6: * Permission to use, copy, modify, and distribute this software for any
7: * purpose with or without fee is hereby granted, provided that the above
8: * copyright notice and this permission notice appear in all copies.
9: *
10: * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11: * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12: * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13: * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14: * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15: * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16: * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17: */
18: #include "config.h"
19:
1.7 schwarze 20: #include <sys/types.h>
1.1 kristaps 21:
1.13 schwarze 22: #include <assert.h>
1.1 kristaps 23: #include <stdio.h>
24: #include <string.h>
1.9 schwarze 25: #include "mandoc.h"
26: #include "libmandoc.h"
1.1 kristaps 27:
1.9 schwarze 28: int
1.16 ! schwarze 29: preconv_encode(const struct buf *ib, size_t *ii, struct buf *ob, size_t *oi,
1.11 schwarze 30: int *filenc)
1.1 kristaps 31: {
1.16 ! schwarze 32: const unsigned char *cu;
! 33: int nby;
! 34: unsigned int accum;
1.13 schwarze 35:
1.16 ! schwarze 36: cu = (const unsigned char *)ib->buf + *ii;
1.13 schwarze 37: assert(*cu & 0x80);
1.2 kristaps 38:
1.9 schwarze 39: if ( ! (*filenc & MPARSE_UTF8))
40: goto latin;
41:
1.13 schwarze 42: nby = 1;
43: while (nby < 5 && *cu & (1 << (7 - nby)))
44: nby++;
45:
46: switch (nby) {
47: case 2:
48: accum = *cu & 0x1f;
49: if (accum < 0x02) /* Obfuscated ASCII. */
50: goto latin;
51: break;
52: case 3:
53: accum = *cu & 0x0f;
54: break;
55: case 4:
56: accum = *cu & 0x07;
57: if (accum > 0x04) /* Beyond Unicode. */
58: goto latin;
59: break;
60: default: /* Bad sequence header. */
61: goto latin;
62: }
63:
64: cu++;
65: switch (nby) {
66: case 3:
67: if ((accum == 0x00 && ! (*cu & 0x20)) || /* Use 2-byte. */
68: (accum == 0x0d && *cu & 0x20)) /* Surrogates. */
69: goto latin;
70: break;
71: case 4:
72: if ((accum == 0x00 && ! (*cu & 0x30)) || /* Use 3-byte. */
73: (accum == 0x04 && *cu & 0x30)) /* Beyond Unicode. */
74: goto latin;
75: break;
76: default:
77: break;
78: }
1.2 kristaps 79:
1.13 schwarze 80: while (--nby) {
81: if ((*cu & 0xc0) != 0x80) /* Invalid continuation. */
82: goto latin;
83: accum <<= 6;
84: accum += *cu & 0x3f;
85: cu++;
1.1 kristaps 86: }
87:
1.13 schwarze 88: assert(accum > 0x7f);
89: assert(accum < 0x110000);
90: assert(accum < 0xd800 || accum > 0xdfff);
91:
92: *oi += snprintf(ob->buf + *oi, 11, "\\[u%.4X]", accum);
1.16 ! schwarze 93: *ii = (const char *)cu - ib->buf;
1.13 schwarze 94: *filenc &= ~MPARSE_LATIN1;
1.15 schwarze 95: return 1;
1.1 kristaps 96:
1.9 schwarze 97: latin:
98: if ( ! (*filenc & MPARSE_LATIN1))
1.15 schwarze 99: return 0;
1.1 kristaps 100:
1.11 schwarze 101: *oi += snprintf(ob->buf + *oi, 11,
102: "\\[u%.4X]", (unsigned char)ib->buf[(*ii)++]);
1.1 kristaps 103:
1.9 schwarze 104: *filenc &= ~MPARSE_UTF8;
1.15 schwarze 105: return 1;
1.1 kristaps 106: }
107:
1.9 schwarze 108: int
1.11 schwarze 109: preconv_cue(const struct buf *b, size_t offset)
1.3 kristaps 110: {
111: const char *ln, *eoln, *eoph;
1.9 schwarze 112: size_t sz, phsz;
1.3 kristaps 113:
1.11 schwarze 114: ln = b->buf + offset;
115: sz = b->sz - offset;
1.3 kristaps 116:
117: /* Look for the end-of-line. */
118:
119: if (NULL == (eoln = memchr(ln, '\n', sz)))
1.9 schwarze 120: eoln = ln + sz;
1.3 kristaps 121:
122: /* Check if we have the correct header/trailer. */
123:
1.12 schwarze 124: if ((sz = (size_t)(eoln - ln)) < 10 ||
1.10 schwarze 125: memcmp(ln, ".\\\" -*-", 7) || memcmp(eoln - 3, "-*-", 3))
1.15 schwarze 126: return MPARSE_UTF8 | MPARSE_LATIN1;
1.3 kristaps 127:
128: /* Move after the header and adjust for the trailer. */
129:
130: ln += 7;
131: sz -= 10;
132:
133: while (sz > 0) {
134: while (sz > 0 && ' ' == *ln) {
135: ln++;
136: sz--;
137: }
138: if (0 == sz)
139: break;
140:
141: /* Find the end-of-phrase marker (or eoln). */
142:
143: if (NULL == (eoph = memchr(ln, ';', sz)))
144: eoph = eoln - 3;
145: else
146: eoph++;
147:
148: /* Only account for the "coding" phrase. */
149:
1.9 schwarze 150: if ((phsz = eoph - ln) < 7 ||
151: strncasecmp(ln, "coding:", 7)) {
1.3 kristaps 152: sz -= phsz;
153: ln += phsz;
154: continue;
1.12 schwarze 155: }
1.3 kristaps 156:
157: sz -= 7;
158: ln += 7;
159:
160: while (sz > 0 && ' ' == *ln) {
161: ln++;
162: sz--;
163: }
164: if (0 == sz)
1.15 schwarze 165: return 0;
1.3 kristaps 166:
167: /* Check us against known encodings. */
168:
1.9 schwarze 169: if (phsz > 4 && !strncasecmp(ln, "utf-8", 5))
1.15 schwarze 170: return MPARSE_UTF8;
1.9 schwarze 171: if (phsz > 10 && !strncasecmp(ln, "iso-latin-1", 11))
1.15 schwarze 172: return MPARSE_LATIN1;
173: return 0;
1.3 kristaps 174: }
1.15 schwarze 175: return MPARSE_UTF8 | MPARSE_LATIN1;
1.1 kristaps 176: }
CVSweb