Annotation of mandoc/preconv.c, Revision 1.7
1.7 ! schwarze 1: /* $Id: preconv.c,v 1.6 2013/06/02 03:52:21 schwarze Exp $ */
1.1 kristaps 2: /*
3: * Copyright (c) 2011 Kristaps Dzonsons <kristaps@bsd.lv>
4: *
5: * Permission to use, copy, modify, and distribute this software for any
6: * purpose with or without fee is hereby granted, provided that the above
7: * copyright notice and this permission notice appear in all copies.
8: *
9: * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10: * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11: * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12: * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13: * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14: * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15: * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16: */
17: #include "config.h"
18:
1.7 ! schwarze 19: #include <sys/types.h>
1.5 kristaps 20: #ifdef HAVE_MMAP
1.1 kristaps 21: #include <sys/stat.h>
22: #include <sys/mman.h>
1.5 kristaps 23: #endif
1.1 kristaps 24:
25: #include <assert.h>
26: #include <fcntl.h>
27: #include <stdio.h>
28: #include <stdlib.h>
29: #include <string.h>
30: #include <unistd.h>
31:
32: /*
33: * The read_whole_file() and resize_buf() functions are copied from
1.6 schwarze 34: * read.c, including all dependency code.
1.1 kristaps 35: */
36:
/*
 * Input encodings known to the converter.
 * The enumerators are used as indices into the encs[] table, and
 * ENC__MAX doubles as the "no encoding selected" marker in main().
 */
enum enc {
	ENC_UTF_8 = 0,	/* UTF-8 */
	ENC_US_ASCII,	/* US-ASCII */
	ENC_LATIN_1,	/* Latin-1 */
	ENC__MAX	/* number of encodings; not itself an encoding */
};
43:
/* The complete input plus the offset at which conversion starts. */
struct buf {
	char	*buf;	/* input bytes, either mmap(2)ed or allocated */
	size_t	 sz;	/* number of bytes held in buf */
	size_t	 offs;	/* index of the first byte to convert */
};
49:
/* Associates an encoding name with the routine converting from it. */
struct encode {
	const char	 *name;	/* name, compared case-insensitively */
	int		(*conv)(const struct buf *); /* 0 means bad input */
};
54:
1.3 kristaps 55: static int cue_enc(const struct buf *, size_t *, enum enc *);
1.1 kristaps 56: static int conv_latin_1(const struct buf *);
57: static int conv_us_ascii(const struct buf *);
58: static int conv_utf_8(const struct buf *);
59: static int read_whole_file(const char *, int,
60: struct buf *, int *);
61: static void resize_buf(struct buf *, size_t);
62: static void usage(void);
63:
64: static const struct encode encs[ENC__MAX] = {
65: { "utf-8", conv_utf_8 }, /* ENC_UTF_8 */
66: { "us-ascii", conv_us_ascii }, /* ENC_US_ASCII */
67: { "latin-1", conv_latin_1 }, /* ENC_LATIN_1 */
68: };
69:
70: static const char *progname;
71:
72: static void
73: usage(void)
74: {
75:
76: fprintf(stderr, "usage: %s "
77: "[-D enc] "
78: "[-e ENC] "
79: "[file]\n", progname);
80: }
81:
82: static int
83: conv_latin_1(const struct buf *b)
84: {
85: size_t i;
1.2 kristaps 86: unsigned char cu;
1.1 kristaps 87: const char *cp;
88:
89: cp = b->buf + (int)b->offs;
90:
91: /*
92: * Latin-1 falls into the first 256 code-points of Unicode, so
93: * there's no need for any sort of translation. Just make the
94: * 8-bit characters use the Unicode escape.
1.3 kristaps 95: * Note that binary values 128 < v < 160 are passed through
96: * unmodified to mandoc.
1.1 kristaps 97: */
98:
99: for (i = b->offs; i < b->sz; i++) {
1.2 kristaps 100: cu = (unsigned char)*cp++;
1.3 kristaps 101: cu < 160U ? putchar(cu) : printf("\\[u%.4X]", cu);
1.1 kristaps 102: }
103:
104: return(1);
105: }
106:
107: static int
108: conv_us_ascii(const struct buf *b)
109: {
110:
111: /*
112: * US-ASCII has no conversion since it falls into the first 128
113: * bytes of Unicode.
114: */
115:
116: fwrite(b->buf, 1, b->sz, stdout);
117: return(1);
118: }
119:
120: static int
121: conv_utf_8(const struct buf *b)
122: {
1.2 kristaps 123: int state, be;
124: unsigned int accum;
125: size_t i;
126: unsigned char cu;
127: const char *cp;
128: const long one = 1L;
129:
130: cp = b->buf + (int)b->offs;
131: state = 0;
132: accum = 0U;
133: be = 0;
134:
135: /* Quick test for big-endian value. */
136:
1.4 kristaps 137: if ( ! (*((const char *)(&one))))
1.2 kristaps 138: be = 1;
139:
140: for (i = b->offs; i < b->sz; i++) {
141: cu = (unsigned char)*cp++;
142: if (state) {
143: if ( ! (cu & 128) || (cu & 64)) {
144: /* Bad sequence header. */
145: return(0);
146: }
147:
148: /* Accept only legitimate bit patterns. */
149:
150: if (cu > 191 || cu < 128) {
151: /* Bad in-sequence bits. */
152: return(0);
153: }
154:
155: accum |= (cu & 63) << --state * 6;
156:
157: /*
158: * Accum is held in little-endian order as
159: * stipulated by the UTF-8 sequence coding. We
160: * need to convert to a native big-endian if our
161: * architecture requires it.
162: */
163:
164: if (0 == state && be)
165: accum = (accum >> 24) |
166: ((accum << 8) & 0x00FF0000) |
167: ((accum >> 8) & 0x0000FF00) |
168: (accum << 24);
169:
170: if (0 == state) {
171: accum < 128U ? putchar(accum) :
172: printf("\\[u%.4X]", accum);
173: accum = 0U;
174: }
175: } else if (cu & (1 << 7)) {
176: /*
177: * Entering a UTF-8 state: if we encounter a
178: * UTF-8 bitmask, calculate the expected UTF-8
179: * state from it.
180: */
181: for (state = 0; state < 7; state++)
182: if ( ! (cu & (1 << (7 - state))))
183: break;
184:
185: /* Accept only legitimate bit patterns. */
186:
187: switch (state) {
188: case (4):
189: if (cu <= 244 && cu >= 240) {
190: accum = (cu & 7) << 18;
191: break;
192: }
193: /* Bad 4-sequence start bits. */
194: return(0);
195: case (3):
196: if (cu <= 239 && cu >= 224) {
197: accum = (cu & 15) << 12;
198: break;
199: }
200: /* Bad 3-sequence start bits. */
201: return(0);
202: case (2):
203: if (cu <= 223 && cu >= 194) {
204: accum = (cu & 31) << 6;
205: break;
206: }
207: /* Bad 2-sequence start bits. */
208: return(0);
209: default:
210: /* Bad sequence bit mask. */
211: return(0);
212: }
213: state--;
214: } else
215: putchar(cu);
216: }
217:
218: if (0 != state) {
219: /* Bad trailing bits. */
220: return(0);
221: }
1.1 kristaps 222:
223: return(1);
224: }
225:
226: static void
227: resize_buf(struct buf *buf, size_t initial)
228: {
229:
230: buf->sz = buf->sz > initial / 2 ?
231: 2 * buf->sz : initial;
232:
233: buf->buf = realloc(buf->buf, buf->sz);
234: if (NULL == buf->buf) {
235: perror(NULL);
236: exit(EXIT_FAILURE);
237: }
238: }
239:
240: static int
241: read_whole_file(const char *f, int fd,
242: struct buf *fb, int *with_mmap)
243: {
244: size_t off;
245: ssize_t ssz;
246:
1.5 kristaps 247: #ifdef HAVE_MMAP
248: struct stat st;
1.1 kristaps 249: if (-1 == fstat(fd, &st)) {
250: perror(f);
251: return(0);
252: }
253:
254: /*
255: * If we're a regular file, try just reading in the whole entry
256: * via mmap(). This is faster than reading it into blocks, and
257: * since each file is only a few bytes to begin with, I'm not
258: * concerned that this is going to tank any machines.
259: */
260:
261: if (S_ISREG(st.st_mode) && st.st_size >= (1U << 31)) {
262: fprintf(stderr, "%s: input too large\n", f);
263: return(0);
264: }
265:
266: if (S_ISREG(st.st_mode)) {
267: *with_mmap = 1;
268: fb->sz = (size_t)st.st_size;
1.6 schwarze 269: fb->buf = mmap(NULL, fb->sz, PROT_READ, MAP_SHARED, fd, 0);
1.1 kristaps 270: if (fb->buf != MAP_FAILED)
271: return(1);
272: }
1.5 kristaps 273: #endif
1.1 kristaps 274:
275: /*
276: * If this isn't a regular file (like, say, stdin), then we must
277: * go the old way and just read things in bit by bit.
278: */
279:
280: *with_mmap = 0;
281: off = 0;
282: fb->sz = 0;
283: fb->buf = NULL;
284: for (;;) {
285: if (off == fb->sz && fb->sz == (1U << 31)) {
286: fprintf(stderr, "%s: input too large\n", f);
287: break;
288: }
289:
290: if (off == fb->sz)
291: resize_buf(fb, 65536);
292:
293: ssz = read(fd, fb->buf + (int)off, fb->sz - off);
294: if (ssz == 0) {
295: fb->sz = off;
296: return(1);
297: }
298: if (ssz == -1) {
299: perror(f);
300: break;
301: }
302: off += (size_t)ssz;
303: }
304:
305: free(fb->buf);
306: fb->buf = NULL;
307: return(0);
308: }
309:
1.3 kristaps 310: static int
311: cue_enc(const struct buf *b, size_t *offs, enum enc *enc)
312: {
313: const char *ln, *eoln, *eoph;
314: size_t sz, phsz, nsz;
315: int i;
316:
317: ln = b->buf + (int)*offs;
318: sz = b->sz - *offs;
319:
320: /* Look for the end-of-line. */
321:
322: if (NULL == (eoln = memchr(ln, '\n', sz)))
323: return(-1);
324:
325: /* Set next-line marker. */
326:
327: *offs = (size_t)((eoln + 1) - b->buf);
328:
329: /* Check if we have the correct header/trailer. */
330:
331: if ((sz = (size_t)(eoln - ln)) < 10 ||
332: memcmp(ln, ".\\\" -*-", 7) ||
333: memcmp(eoln - 3, "-*-", 3))
334: return(0);
335:
336: /* Move after the header and adjust for the trailer. */
337:
338: ln += 7;
339: sz -= 10;
340:
341: while (sz > 0) {
342: while (sz > 0 && ' ' == *ln) {
343: ln++;
344: sz--;
345: }
346: if (0 == sz)
347: break;
348:
349: /* Find the end-of-phrase marker (or eoln). */
350:
351: if (NULL == (eoph = memchr(ln, ';', sz)))
352: eoph = eoln - 3;
353: else
354: eoph++;
355:
356: /* Only account for the "coding" phrase. */
357:
358: if ((phsz = (size_t)(eoph - ln)) < 7 ||
359: strncasecmp(ln, "coding:", 7)) {
360: sz -= phsz;
361: ln += phsz;
362: continue;
363: }
364:
365: sz -= 7;
366: ln += 7;
367:
368: while (sz > 0 && ' ' == *ln) {
369: ln++;
370: sz--;
371: }
372: if (0 == sz)
373: break;
374:
375: /* Check us against known encodings. */
376:
1.4 kristaps 377: for (i = 0; i < (int)ENC__MAX; i++) {
1.3 kristaps 378: nsz = strlen(encs[i].name);
379: if (phsz < nsz)
380: continue;
381: if (strncasecmp(ln, encs[i].name, nsz))
382: continue;
383:
384: *enc = (enum enc)i;
385: return(1);
386: }
387:
388: /* Unknown encoding. */
389:
390: *enc = ENC__MAX;
391: return(1);
392: }
393:
394: return(0);
395: }
396:
1.1 kristaps 397: int
398: main(int argc, char *argv[])
399: {
400: int i, ch, map, fd, rc;
1.2 kristaps 401: struct buf b;
1.1 kristaps 402: const char *fn;
403: enum enc enc, def;
1.4 kristaps 404: unsigned char bom[3] = { 0xEF, 0xBB, 0xBF };
1.3 kristaps 405: size_t offs;
1.1 kristaps 406: extern int optind;
407: extern char *optarg;
408:
409: progname = strrchr(argv[0], '/');
410: if (progname == NULL)
411: progname = argv[0];
412: else
413: ++progname;
414:
415: fn = "<stdin>";
416: fd = STDIN_FILENO;
417: rc = EXIT_FAILURE;
418: enc = def = ENC__MAX;
419: map = 0;
420:
1.2 kristaps 421: memset(&b, 0, sizeof(struct buf));
1.1 kristaps 422:
423: while (-1 != (ch = getopt(argc, argv, "D:e:rdvh")))
424: switch (ch) {
425: case ('D'):
426: /* FALLTHROUGH */
427: case ('e'):
1.4 kristaps 428: for (i = 0; i < (int)ENC__MAX; i++) {
1.1 kristaps 429: if (strcasecmp(optarg, encs[i].name))
430: continue;
431: break;
432: }
1.4 kristaps 433: if (i < (int)ENC__MAX) {
1.1 kristaps 434: if ('D' == ch)
435: def = (enum enc)i;
436: else
437: enc = (enum enc)i;
438: break;
439: }
440:
441: fprintf(stderr, "%s: Bad encoding\n", optarg);
442: return(EXIT_FAILURE);
443: case ('r'):
444: /* FALLTHROUGH */
445: case ('d'):
446: /* FALLTHROUGH */
447: case ('v'):
448: /* Compatibility with GNU preconv. */
449: break;
450: case ('h'):
451: /* Compatibility with GNU preconv. */
452: /* FALLTHROUGH */
453: default:
454: usage();
455: return(EXIT_FAILURE);
456: }
457:
458: argc -= optind;
459: argv += optind;
460:
461: /*
462: * Open and read the first argument on the command-line.
463: * If we don't have one, we default to stdin.
464: */
465:
466: if (argc > 0) {
467: fn = *argv;
468: fd = open(fn, O_RDONLY, 0);
469: if (-1 == fd) {
470: perror(fn);
471: return(EXIT_FAILURE);
472: }
473: }
474:
1.2 kristaps 475: if ( ! read_whole_file(fn, fd, &b, &map))
1.1 kristaps 476: goto out;
477:
1.2 kristaps 478: /* Try to read the UTF-8 BOM. */
479:
480: if (ENC__MAX == enc)
481: if (b.sz > 3 && 0 == memcmp(b.buf, bom, 3)) {
482: b.offs = 3;
483: enc = ENC_UTF_8;
484: }
1.1 kristaps 485:
1.3 kristaps 486: /* Try reading from the "-*-" cue. */
487:
488: if (ENC__MAX == enc) {
489: offs = b.offs;
490: ch = cue_enc(&b, &offs, &enc);
491: if (0 == ch)
492: ch = cue_enc(&b, &offs, &enc);
493: }
494:
1.1 kristaps 495: /*
496: * No encoding has been detected.
497: * Thus, we either fall into our default encoder, if specified,
498: * or use Latin-1 if all else fails.
499: */
500:
501: if (ENC__MAX == enc)
502: enc = ENC__MAX == def ? ENC_LATIN_1 : def;
503:
1.3 kristaps 504: if ( ! (*encs[(int)enc].conv)(&b)) {
505: fprintf(stderr, "%s: Bad encoding\n", fn);
1.1 kristaps 506: goto out;
1.3 kristaps 507: }
1.1 kristaps 508:
509: rc = EXIT_SUCCESS;
510: out:
1.5 kristaps 511: #ifdef HAVE_MMAP
1.1 kristaps 512: if (map)
1.2 kristaps 513: munmap(b.buf, b.sz);
1.1 kristaps 514: else
1.5 kristaps 515: #endif
1.2 kristaps 516: free(b.buf);
1.1 kristaps 517:
518: if (fd > STDIN_FILENO)
519: close(fd);
520:
521: return(rc);
522: }
CVSweb