Annotation of mandoc/preconv.c, Revision 1.3
1.3 ! kristaps 1: /* $Id: preconv.c,v 1.2 2011/05/26 12:01:14 kristaps Exp $ */
1.1 kristaps 2: /*
3: * Copyright (c) 2011 Kristaps Dzonsons <kristaps@bsd.lv>
4: *
5: * Permission to use, copy, modify, and distribute this software for any
6: * purpose with or without fee is hereby granted, provided that the above
7: * copyright notice and this permission notice appear in all copies.
8: *
9: * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10: * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11: * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12: * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13: * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14: * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15: * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16: */
17: #ifdef HAVE_CONFIG_H
18: #include "config.h"
19: #endif
20:
21: #include <sys/stat.h>
22: #include <sys/mman.h>
23:
24: #include <assert.h>
25: #include <fcntl.h>
26: #include <stdio.h>
27: #include <stdlib.h>
28: #include <string.h>
29: #include <unistd.h>
30:
31: /*
32: * The read_whole_file() and resize_buf() functions are copied from
33: * read.c, including all dependency code (MAP_FILE, etc.).
34: */
35:
/*
 * Fall back to 0 when the platform headers do not define MAP_FILE, so
 * that it can be OR'ed into the mmap() flags unconditionally.
 */
#if !defined(MAP_FILE)
# define MAP_FILE 0
#endif
39:
/*
 * Input encodings understood by this program.
 * The values are used as indices into the encs[] table, so both must be
 * kept in the same order.  ENC__MAX is the number of encodings and is
 * also used as the "not determined" marker.
 */
enum enc {
	ENC_UTF_8 = 0,	/* UTF-8 */
	ENC_US_ASCII,	/* US-ASCII */
	ENC_LATIN_1,	/* Latin-1 */
	ENC__MAX	/* count of the above; not an encoding itself */
};
46:
/*
 * An input file held in memory, together with the position at which
 * conversion should begin.
 */
struct buf {
	char	*buf;	/* raw input bytes (no NUL terminator is added) */
	size_t	 sz;	/* number of bytes in buf */
	size_t	 offs;	/* offset of the first byte to convert */
};
52:
/*
 * One supported encoding: its name as accepted on the command line and
 * in file cues, and the routine converting a buffer in that encoding.
 */
struct encode {
	const char	 *name;				/* encoding name */
	int		(*conv)(const struct buf *);	/* 1 on success, 0 on bad input */
};
57:
/* Forward declarations of the file-local functions. */
static	int	 cue_enc(const struct buf *b, size_t *offs, enum enc *enc);
static	int	 conv_latin_1(const struct buf *b);
static	int	 conv_us_ascii(const struct buf *b);
static	int	 conv_utf_8(const struct buf *b);
static	int	 read_whole_file(const char *f, int fd,
			struct buf *fb, int *with_mmap);
static	void	 resize_buf(struct buf *buf, size_t initial);
static	void	 usage(void);
66:
67: static const struct encode encs[ENC__MAX] = {
68: { "utf-8", conv_utf_8 }, /* ENC_UTF_8 */
69: { "us-ascii", conv_us_ascii }, /* ENC_US_ASCII */
70: { "latin-1", conv_latin_1 }, /* ENC_LATIN_1 */
71: };
72:
/* Basename of argv[0]; set in main() and printed by usage(). */
static	const char	*progname;
74:
75: static void
76: usage(void)
77: {
78:
79: fprintf(stderr, "usage: %s "
80: "[-D enc] "
81: "[-e ENC] "
82: "[file]\n", progname);
83: }
84:
85: static int
86: conv_latin_1(const struct buf *b)
87: {
88: size_t i;
1.2 kristaps 89: unsigned char cu;
1.1 kristaps 90: const char *cp;
91:
92: cp = b->buf + (int)b->offs;
93:
94: /*
95: * Latin-1 falls into the first 256 code-points of Unicode, so
96: * there's no need for any sort of translation. Just make the
97: * 8-bit characters use the Unicode escape.
1.3 ! kristaps 98: * Note that binary values 128 < v < 160 are passed through
! 99: * unmodified to mandoc.
1.1 kristaps 100: */
101:
102: for (i = b->offs; i < b->sz; i++) {
1.2 kristaps 103: cu = (unsigned char)*cp++;
1.3 ! kristaps 104: cu < 160U ? putchar(cu) : printf("\\[u%.4X]", cu);
1.1 kristaps 105: }
106:
107: return(1);
108: }
109:
110: static int
111: conv_us_ascii(const struct buf *b)
112: {
113:
114: /*
115: * US-ASCII has no conversion since it falls into the first 128
116: * bytes of Unicode.
117: */
118:
119: fwrite(b->buf, 1, b->sz, stdout);
120: return(1);
121: }
122:
123: static int
124: conv_utf_8(const struct buf *b)
125: {
1.2 kristaps 126: int state, be;
127: unsigned int accum;
128: size_t i;
129: unsigned char cu;
130: const char *cp;
131: const long one = 1L;
132:
133: cp = b->buf + (int)b->offs;
134: state = 0;
135: accum = 0U;
136: be = 0;
137:
138: /* Quick test for big-endian value. */
139:
140: if ( ! (*((char *)(&one))))
141: be = 1;
142:
143: for (i = b->offs; i < b->sz; i++) {
144: cu = (unsigned char)*cp++;
145: if (state) {
146: if ( ! (cu & 128) || (cu & 64)) {
147: /* Bad sequence header. */
148: return(0);
149: }
150:
151: /* Accept only legitimate bit patterns. */
152:
153: if (cu > 191 || cu < 128) {
154: /* Bad in-sequence bits. */
155: return(0);
156: }
157:
158: accum |= (cu & 63) << --state * 6;
159:
160: /*
161: * Accum is held in little-endian order as
162: * stipulated by the UTF-8 sequence coding. We
163: * need to convert to a native big-endian if our
164: * architecture requires it.
165: */
166:
167: if (0 == state && be)
168: accum = (accum >> 24) |
169: ((accum << 8) & 0x00FF0000) |
170: ((accum >> 8) & 0x0000FF00) |
171: (accum << 24);
172:
173: if (0 == state) {
174: accum < 128U ? putchar(accum) :
175: printf("\\[u%.4X]", accum);
176: accum = 0U;
177: }
178: } else if (cu & (1 << 7)) {
179: /*
180: * Entering a UTF-8 state: if we encounter a
181: * UTF-8 bitmask, calculate the expected UTF-8
182: * state from it.
183: */
184: for (state = 0; state < 7; state++)
185: if ( ! (cu & (1 << (7 - state))))
186: break;
187:
188: /* Accept only legitimate bit patterns. */
189:
190: switch (state) {
191: case (4):
192: if (cu <= 244 && cu >= 240) {
193: accum = (cu & 7) << 18;
194: break;
195: }
196: /* Bad 4-sequence start bits. */
197: return(0);
198: case (3):
199: if (cu <= 239 && cu >= 224) {
200: accum = (cu & 15) << 12;
201: break;
202: }
203: /* Bad 3-sequence start bits. */
204: return(0);
205: case (2):
206: if (cu <= 223 && cu >= 194) {
207: accum = (cu & 31) << 6;
208: break;
209: }
210: /* Bad 2-sequence start bits. */
211: return(0);
212: default:
213: /* Bad sequence bit mask. */
214: return(0);
215: }
216: state--;
217: } else
218: putchar(cu);
219: }
220:
221: if (0 != state) {
222: /* Bad trailing bits. */
223: return(0);
224: }
1.1 kristaps 225:
226: return(1);
227: }
228:
229: static void
230: resize_buf(struct buf *buf, size_t initial)
231: {
232:
233: buf->sz = buf->sz > initial / 2 ?
234: 2 * buf->sz : initial;
235:
236: buf->buf = realloc(buf->buf, buf->sz);
237: if (NULL == buf->buf) {
238: perror(NULL);
239: exit(EXIT_FAILURE);
240: }
241: }
242:
243: static int
244: read_whole_file(const char *f, int fd,
245: struct buf *fb, int *with_mmap)
246: {
247: struct stat st;
248: size_t off;
249: ssize_t ssz;
250:
251: if (-1 == fstat(fd, &st)) {
252: perror(f);
253: return(0);
254: }
255:
256: /*
257: * If we're a regular file, try just reading in the whole entry
258: * via mmap(). This is faster than reading it into blocks, and
259: * since each file is only a few bytes to begin with, I'm not
260: * concerned that this is going to tank any machines.
261: */
262:
263: if (S_ISREG(st.st_mode) && st.st_size >= (1U << 31)) {
264: fprintf(stderr, "%s: input too large\n", f);
265: return(0);
266: }
267:
268: if (S_ISREG(st.st_mode)) {
269: *with_mmap = 1;
270: fb->sz = (size_t)st.st_size;
271: fb->buf = mmap(NULL, fb->sz, PROT_READ,
272: MAP_FILE|MAP_SHARED, fd, 0);
273: if (fb->buf != MAP_FAILED)
274: return(1);
275: }
276:
277: /*
278: * If this isn't a regular file (like, say, stdin), then we must
279: * go the old way and just read things in bit by bit.
280: */
281:
282: *with_mmap = 0;
283: off = 0;
284: fb->sz = 0;
285: fb->buf = NULL;
286: for (;;) {
287: if (off == fb->sz && fb->sz == (1U << 31)) {
288: fprintf(stderr, "%s: input too large\n", f);
289: break;
290: }
291:
292: if (off == fb->sz)
293: resize_buf(fb, 65536);
294:
295: ssz = read(fd, fb->buf + (int)off, fb->sz - off);
296: if (ssz == 0) {
297: fb->sz = off;
298: return(1);
299: }
300: if (ssz == -1) {
301: perror(f);
302: break;
303: }
304: off += (size_t)ssz;
305: }
306:
307: free(fb->buf);
308: fb->buf = NULL;
309: return(0);
310: }
311:
1.3 ! kristaps 312: static int
! 313: cue_enc(const struct buf *b, size_t *offs, enum enc *enc)
! 314: {
! 315: const char *ln, *eoln, *eoph;
! 316: size_t sz, phsz, nsz;
! 317: int i;
! 318:
! 319: ln = b->buf + (int)*offs;
! 320: sz = b->sz - *offs;
! 321:
! 322: /* Look for the end-of-line. */
! 323:
! 324: if (NULL == (eoln = memchr(ln, '\n', sz)))
! 325: return(-1);
! 326:
! 327: /* Set next-line marker. */
! 328:
! 329: *offs = (size_t)((eoln + 1) - b->buf);
! 330:
! 331: /* Check if we have the correct header/trailer. */
! 332:
! 333: if ((sz = (size_t)(eoln - ln)) < 10 ||
! 334: memcmp(ln, ".\\\" -*-", 7) ||
! 335: memcmp(eoln - 3, "-*-", 3))
! 336: return(0);
! 337:
! 338: /* Move after the header and adjust for the trailer. */
! 339:
! 340: ln += 7;
! 341: sz -= 10;
! 342:
! 343: while (sz > 0) {
! 344: while (sz > 0 && ' ' == *ln) {
! 345: ln++;
! 346: sz--;
! 347: }
! 348: if (0 == sz)
! 349: break;
! 350:
! 351: /* Find the end-of-phrase marker (or eoln). */
! 352:
! 353: if (NULL == (eoph = memchr(ln, ';', sz)))
! 354: eoph = eoln - 3;
! 355: else
! 356: eoph++;
! 357:
! 358: /* Only account for the "coding" phrase. */
! 359:
! 360: if ((phsz = (size_t)(eoph - ln)) < 7 ||
! 361: strncasecmp(ln, "coding:", 7)) {
! 362: sz -= phsz;
! 363: ln += phsz;
! 364: continue;
! 365: }
! 366:
! 367: sz -= 7;
! 368: ln += 7;
! 369:
! 370: while (sz > 0 && ' ' == *ln) {
! 371: ln++;
! 372: sz--;
! 373: }
! 374: if (0 == sz)
! 375: break;
! 376:
! 377: /* Check us against known encodings. */
! 378:
! 379: for (i = 0; i < ENC__MAX; i++) {
! 380: nsz = strlen(encs[i].name);
! 381: if (phsz < nsz)
! 382: continue;
! 383: if (strncasecmp(ln, encs[i].name, nsz))
! 384: continue;
! 385:
! 386: *enc = (enum enc)i;
! 387: return(1);
! 388: }
! 389:
! 390: /* Unknown encoding. */
! 391:
! 392: *enc = ENC__MAX;
! 393: return(1);
! 394: }
! 395:
! 396: return(0);
! 397: }
! 398:
1.1 kristaps 399: int
400: main(int argc, char *argv[])
401: {
402: int i, ch, map, fd, rc;
1.2 kristaps 403: struct buf b;
1.1 kristaps 404: const char *fn;
405: enum enc enc, def;
1.2 kristaps 406: const char bom[3] = { 0xEF, 0xBB, 0xBF };
1.3 ! kristaps 407: size_t offs;
1.1 kristaps 408: extern int optind;
409: extern char *optarg;
410:
411: progname = strrchr(argv[0], '/');
412: if (progname == NULL)
413: progname = argv[0];
414: else
415: ++progname;
416:
417: fn = "<stdin>";
418: fd = STDIN_FILENO;
419: rc = EXIT_FAILURE;
420: enc = def = ENC__MAX;
421: map = 0;
422:
1.2 kristaps 423: memset(&b, 0, sizeof(struct buf));
1.1 kristaps 424:
425: while (-1 != (ch = getopt(argc, argv, "D:e:rdvh")))
426: switch (ch) {
427: case ('D'):
428: /* FALLTHROUGH */
429: case ('e'):
430: for (i = 0; i < ENC__MAX; i++) {
431: if (strcasecmp(optarg, encs[i].name))
432: continue;
433: break;
434: }
435: if (i < ENC__MAX) {
436: if ('D' == ch)
437: def = (enum enc)i;
438: else
439: enc = (enum enc)i;
440: break;
441: }
442:
443: fprintf(stderr, "%s: Bad encoding\n", optarg);
444: return(EXIT_FAILURE);
445: case ('r'):
446: /* FALLTHROUGH */
447: case ('d'):
448: /* FALLTHROUGH */
449: case ('v'):
450: /* Compatibility with GNU preconv. */
451: break;
452: case ('h'):
453: /* Compatibility with GNU preconv. */
454: /* FALLTHROUGH */
455: default:
456: usage();
457: return(EXIT_FAILURE);
458: }
459:
460: argc -= optind;
461: argv += optind;
462:
463: /*
464: * Open and read the first argument on the command-line.
465: * If we don't have one, we default to stdin.
466: */
467:
468: if (argc > 0) {
469: fn = *argv;
470: fd = open(fn, O_RDONLY, 0);
471: if (-1 == fd) {
472: perror(fn);
473: return(EXIT_FAILURE);
474: }
475: }
476:
1.2 kristaps 477: if ( ! read_whole_file(fn, fd, &b, &map))
1.1 kristaps 478: goto out;
479:
1.2 kristaps 480: /* Try to read the UTF-8 BOM. */
481:
482: if (ENC__MAX == enc)
483: if (b.sz > 3 && 0 == memcmp(b.buf, bom, 3)) {
484: b.offs = 3;
485: enc = ENC_UTF_8;
486: }
1.1 kristaps 487:
1.3 ! kristaps 488: /* Try reading from the "-*-" cue. */
! 489:
! 490: if (ENC__MAX == enc) {
! 491: offs = b.offs;
! 492: ch = cue_enc(&b, &offs, &enc);
! 493: if (0 == ch)
! 494: ch = cue_enc(&b, &offs, &enc);
! 495: }
! 496:
1.1 kristaps 497: /*
498: * No encoding has been detected.
499: * Thus, we either fall into our default encoder, if specified,
500: * or use Latin-1 if all else fails.
501: */
502:
503: if (ENC__MAX == enc)
504: enc = ENC__MAX == def ? ENC_LATIN_1 : def;
505:
1.3 ! kristaps 506: if ( ! (*encs[(int)enc].conv)(&b)) {
! 507: fprintf(stderr, "%s: Bad encoding\n", fn);
1.1 kristaps 508: goto out;
1.3 ! kristaps 509: }
1.1 kristaps 510:
511: rc = EXIT_SUCCESS;
512: out:
513: if (map)
1.2 kristaps 514: munmap(b.buf, b.sz);
1.1 kristaps 515: else
1.2 kristaps 516: free(b.buf);
1.1 kristaps 517:
518: if (fd > STDIN_FILENO)
519: close(fd);
520:
521: return(rc);
522: }
CVSweb