Annotation of mandoc/preconv.c, Revision 1.2
1.2 ! kristaps 1: /* $Id: preconv.c,v 1.1 2011/05/26 00:30:11 kristaps Exp $ */
1.1 kristaps 2: /*
3: * Copyright (c) 2011 Kristaps Dzonsons <kristaps@bsd.lv>
4: *
5: * Permission to use, copy, modify, and distribute this software for any
6: * purpose with or without fee is hereby granted, provided that the above
7: * copyright notice and this permission notice appear in all copies.
8: *
9: * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10: * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11: * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12: * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13: * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14: * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15: * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16: */
17: #ifdef HAVE_CONFIG_H
18: #include "config.h"
19: #endif
20:
21: #include <sys/stat.h>
22: #include <sys/mman.h>
23:
24: #include <assert.h>
25: #include <fcntl.h>
26: #include <stdio.h>
27: #include <stdlib.h>
28: #include <string.h>
29: #include <unistd.h>
30:
31: /*
32: * The read_whole_file() and resize_buf() functions are copied from
33: * read.c, including all dependency code (MAP_FILE, etc.).
34: */
35:
36: #ifndef MAP_FILE
37: #define MAP_FILE 0
38: #endif
39:
/*
 * Input encodings understood by this program.
 *
 * The enumerators are used as indices into the encs[] table, so their
 * order has to match the order of the table rows.  ENC__MAX is the
 * number of real encodings; main() also uses it as the "no encoding
 * chosen yet" value.
 */
enum enc {
	ENC_UTF_8 = 0,	/* UTF-8 */
	ENC_US_ASCII,	/* US-ASCII */
	ENC_LATIN_1,	/* Latin-1 */
	ENC__MAX	/* number of encodings, not an encoding itself */
};
46:
/*
 * The complete input, held in memory, plus the position at which
 * conversion should begin.
 */
struct buf {
	char	*buf;	/* raw input bytes (mmap()ed or heap-allocated) */
	size_t	 sz;	/* number of bytes in buf */
	size_t	 offs;	/* index of the first byte to convert */
};
52:
/*
 * One row of the encoding table: the name accepted on the command
 * line and the routine that converts a buffer in that encoding.
 */
struct encode {
	const char	 *name;				/* encoding name */
	int		(*conv)(const struct buf *);	/* converter */
};
57:
58: static int conv_latin_1(const struct buf *);
59: static int conv_us_ascii(const struct buf *);
60: static int conv_utf_8(const struct buf *);
61: static int read_whole_file(const char *, int,
62: struct buf *, int *);
63: static void resize_buf(struct buf *, size_t);
64: static void usage(void);
65:
66: static const struct encode encs[ENC__MAX] = {
67: { "utf-8", conv_utf_8 }, /* ENC_UTF_8 */
68: { "us-ascii", conv_us_ascii }, /* ENC_US_ASCII */
69: { "latin-1", conv_latin_1 }, /* ENC_LATIN_1 */
70: };
71:
72: static const char *progname;
73:
74: static void
75: usage(void)
76: {
77:
78: fprintf(stderr, "usage: %s "
79: "[-D enc] "
80: "[-e ENC] "
81: "[file]\n", progname);
82: }
83:
84: static int
85: conv_latin_1(const struct buf *b)
86: {
87: size_t i;
1.2 ! kristaps 88: unsigned char cu;
1.1 kristaps 89: const char *cp;
90:
91: cp = b->buf + (int)b->offs;
92:
93: /*
94: * Latin-1 falls into the first 256 code-points of Unicode, so
95: * there's no need for any sort of translation. Just make the
96: * 8-bit characters use the Unicode escape.
97: */
98:
99: for (i = b->offs; i < b->sz; i++) {
1.2 ! kristaps 100: cu = (unsigned char)*cp++;
! 101: cu < 128U ? putchar(cu) : printf("\\[u%.4X]", cu);
1.1 kristaps 102: }
103:
104: return(1);
105: }
106:
107: static int
108: conv_us_ascii(const struct buf *b)
109: {
110:
111: /*
112: * US-ASCII has no conversion since it falls into the first 128
113: * bytes of Unicode.
114: */
115:
116: fwrite(b->buf, 1, b->sz, stdout);
117: return(1);
118: }
119:
120: static int
121: conv_utf_8(const struct buf *b)
122: {
1.2 ! kristaps 123: int state, be;
! 124: unsigned int accum;
! 125: size_t i;
! 126: unsigned char cu;
! 127: const char *cp;
! 128: const long one = 1L;
! 129:
! 130: cp = b->buf + (int)b->offs;
! 131: state = 0;
! 132: accum = 0U;
! 133: be = 0;
! 134:
! 135: /* Quick test for big-endian value. */
! 136:
! 137: if ( ! (*((char *)(&one))))
! 138: be = 1;
! 139:
! 140: for (i = b->offs; i < b->sz; i++) {
! 141: cu = (unsigned char)*cp++;
! 142: if (state) {
! 143: if ( ! (cu & 128) || (cu & 64)) {
! 144: /* Bad sequence header. */
! 145: return(0);
! 146: }
! 147:
! 148: /* Accept only legitimate bit patterns. */
! 149:
! 150: if (cu > 191 || cu < 128) {
! 151: /* Bad in-sequence bits. */
! 152: return(0);
! 153: }
! 154:
! 155: accum |= (cu & 63) << --state * 6;
! 156:
! 157: /*
! 158: * Accum is held in little-endian order as
! 159: * stipulated by the UTF-8 sequence coding. We
! 160: * need to convert to a native big-endian if our
! 161: * architecture requires it.
! 162: */
! 163:
! 164: if (0 == state && be)
! 165: accum = (accum >> 24) |
! 166: ((accum << 8) & 0x00FF0000) |
! 167: ((accum >> 8) & 0x0000FF00) |
! 168: (accum << 24);
! 169:
! 170: if (0 == state) {
! 171: accum < 128U ? putchar(accum) :
! 172: printf("\\[u%.4X]", accum);
! 173: accum = 0U;
! 174: }
! 175: } else if (cu & (1 << 7)) {
! 176: /*
! 177: * Entering a UTF-8 state: if we encounter a
! 178: * UTF-8 bitmask, calculate the expected UTF-8
! 179: * state from it.
! 180: */
! 181: for (state = 0; state < 7; state++)
! 182: if ( ! (cu & (1 << (7 - state))))
! 183: break;
! 184:
! 185: /* Accept only legitimate bit patterns. */
! 186:
! 187: switch (state) {
! 188: case (4):
! 189: if (cu <= 244 && cu >= 240) {
! 190: accum = (cu & 7) << 18;
! 191: break;
! 192: }
! 193: /* Bad 4-sequence start bits. */
! 194: return(0);
! 195: case (3):
! 196: if (cu <= 239 && cu >= 224) {
! 197: accum = (cu & 15) << 12;
! 198: break;
! 199: }
! 200: /* Bad 3-sequence start bits. */
! 201: return(0);
! 202: case (2):
! 203: if (cu <= 223 && cu >= 194) {
! 204: accum = (cu & 31) << 6;
! 205: break;
! 206: }
! 207: /* Bad 2-sequence start bits. */
! 208: return(0);
! 209: default:
! 210: /* Bad sequence bit mask. */
! 211: return(0);
! 212: }
! 213: state--;
! 214: } else
! 215: putchar(cu);
! 216: }
! 217:
! 218: if (0 != state) {
! 219: /* Bad trailing bits. */
! 220: return(0);
! 221: }
1.1 kristaps 222:
223: return(1);
224: }
225:
226: static void
227: resize_buf(struct buf *buf, size_t initial)
228: {
229:
230: buf->sz = buf->sz > initial / 2 ?
231: 2 * buf->sz : initial;
232:
233: buf->buf = realloc(buf->buf, buf->sz);
234: if (NULL == buf->buf) {
235: perror(NULL);
236: exit(EXIT_FAILURE);
237: }
238: }
239:
240: static int
241: read_whole_file(const char *f, int fd,
242: struct buf *fb, int *with_mmap)
243: {
244: struct stat st;
245: size_t off;
246: ssize_t ssz;
247:
248: if (-1 == fstat(fd, &st)) {
249: perror(f);
250: return(0);
251: }
252:
253: /*
254: * If we're a regular file, try just reading in the whole entry
255: * via mmap(). This is faster than reading it into blocks, and
256: * since each file is only a few bytes to begin with, I'm not
257: * concerned that this is going to tank any machines.
258: */
259:
260: if (S_ISREG(st.st_mode) && st.st_size >= (1U << 31)) {
261: fprintf(stderr, "%s: input too large\n", f);
262: return(0);
263: }
264:
265: if (S_ISREG(st.st_mode)) {
266: *with_mmap = 1;
267: fb->sz = (size_t)st.st_size;
268: fb->buf = mmap(NULL, fb->sz, PROT_READ,
269: MAP_FILE|MAP_SHARED, fd, 0);
270: if (fb->buf != MAP_FAILED)
271: return(1);
272: }
273:
274: /*
275: * If this isn't a regular file (like, say, stdin), then we must
276: * go the old way and just read things in bit by bit.
277: */
278:
279: *with_mmap = 0;
280: off = 0;
281: fb->sz = 0;
282: fb->buf = NULL;
283: for (;;) {
284: if (off == fb->sz && fb->sz == (1U << 31)) {
285: fprintf(stderr, "%s: input too large\n", f);
286: break;
287: }
288:
289: if (off == fb->sz)
290: resize_buf(fb, 65536);
291:
292: ssz = read(fd, fb->buf + (int)off, fb->sz - off);
293: if (ssz == 0) {
294: fb->sz = off;
295: return(1);
296: }
297: if (ssz == -1) {
298: perror(f);
299: break;
300: }
301: off += (size_t)ssz;
302: }
303:
304: free(fb->buf);
305: fb->buf = NULL;
306: return(0);
307: }
308:
309: int
310: main(int argc, char *argv[])
311: {
312: int i, ch, map, fd, rc;
1.2 ! kristaps 313: struct buf b;
1.1 kristaps 314: const char *fn;
315: enum enc enc, def;
1.2 ! kristaps 316: const char bom[3] = { 0xEF, 0xBB, 0xBF };
1.1 kristaps 317: extern int optind;
318: extern char *optarg;
319:
320: progname = strrchr(argv[0], '/');
321: if (progname == NULL)
322: progname = argv[0];
323: else
324: ++progname;
325:
326: fn = "<stdin>";
327: fd = STDIN_FILENO;
328: rc = EXIT_FAILURE;
329: enc = def = ENC__MAX;
330: map = 0;
331:
1.2 ! kristaps 332: memset(&b, 0, sizeof(struct buf));
1.1 kristaps 333:
334: while (-1 != (ch = getopt(argc, argv, "D:e:rdvh")))
335: switch (ch) {
336: case ('D'):
337: /* FALLTHROUGH */
338: case ('e'):
339: for (i = 0; i < ENC__MAX; i++) {
340: if (strcasecmp(optarg, encs[i].name))
341: continue;
342: break;
343: }
344: if (i < ENC__MAX) {
345: if ('D' == ch)
346: def = (enum enc)i;
347: else
348: enc = (enum enc)i;
349: break;
350: }
351:
352: fprintf(stderr, "%s: Bad encoding\n", optarg);
353: return(EXIT_FAILURE);
354: case ('r'):
355: /* FALLTHROUGH */
356: case ('d'):
357: /* FALLTHROUGH */
358: case ('v'):
359: /* Compatibility with GNU preconv. */
360: break;
361: case ('h'):
362: /* Compatibility with GNU preconv. */
363: /* FALLTHROUGH */
364: default:
365: usage();
366: return(EXIT_FAILURE);
367: }
368:
369: argc -= optind;
370: argv += optind;
371:
372: /*
373: * Open and read the first argument on the command-line.
374: * If we don't have one, we default to stdin.
375: */
376:
377: if (argc > 0) {
378: fn = *argv;
379: fd = open(fn, O_RDONLY, 0);
380: if (-1 == fd) {
381: perror(fn);
382: return(EXIT_FAILURE);
383: }
384: }
385:
1.2 ! kristaps 386: if ( ! read_whole_file(fn, fd, &b, &map))
1.1 kristaps 387: goto out;
388:
1.2 ! kristaps 389: /* Try to read the UTF-8 BOM. */
! 390:
! 391: if (ENC__MAX == enc)
! 392: if (b.sz > 3 && 0 == memcmp(b.buf, bom, 3)) {
! 393: b.offs = 3;
! 394: enc = ENC_UTF_8;
! 395: }
1.1 kristaps 396:
397: /*
398: * No encoding has been detected.
399: * Thus, we either fall into our default encoder, if specified,
400: * or use Latin-1 if all else fails.
401: */
402:
403: if (ENC__MAX == enc)
404: enc = ENC__MAX == def ? ENC_LATIN_1 : def;
405:
1.2 ! kristaps 406: if ( ! (*encs[(int)enc].conv)(&b))
1.1 kristaps 407: goto out;
408:
409: rc = EXIT_SUCCESS;
410: out:
411: if (map)
1.2 ! kristaps 412: munmap(b.buf, b.sz);
1.1 kristaps 413: else
1.2 ! kristaps 414: free(b.buf);
1.1 kristaps 415:
416: if (fd > STDIN_FILENO)
417: close(fd);
418:
419: return(rc);
420: }
CVSweb