Annotation of mandoc/preconv.c, Revision 1.6
1.6 ! schwarze 1: /* $Id: preconv.c,v 1.5 2011/07/24 18:15:14 kristaps Exp $ */
1.1 kristaps 2: /*
3: * Copyright (c) 2011 Kristaps Dzonsons <kristaps@bsd.lv>
4: *
5: * Permission to use, copy, modify, and distribute this software for any
6: * purpose with or without fee is hereby granted, provided that the above
7: * copyright notice and this permission notice appear in all copies.
8: *
9: * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10: * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11: * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12: * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13: * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14: * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15: * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16: */
17: #ifdef HAVE_CONFIG_H
18: #include "config.h"
19: #endif
20:
1.5 kristaps 21: #ifdef HAVE_MMAP
1.1 kristaps 22: #include <sys/stat.h>
23: #include <sys/mman.h>
1.5 kristaps 24: #endif
1.1 kristaps 25:
26: #include <assert.h>
27: #include <fcntl.h>
28: #include <stdio.h>
29: #include <stdlib.h>
30: #include <string.h>
31: #include <unistd.h>
32:
/*
 * The read_whole_file() and resize_buf() functions are copied from
 * read.c, including all dependency code.
 */
37:
/*
 * Input encodings understood by the converter.
 * The enumerators double as indices into the encs[] table; ENC__MAX
 * counts the real entries and is also used as the "not yet determined"
 * marker.
 */
enum	enc {
	ENC_UTF_8 = 0,	/* UTF-8 */
	ENC_US_ASCII,	/* US-ASCII */
	ENC_LATIN_1,	/* Latin-1 */
	ENC__MAX	/* number of encodings / "unset" */
};
44:
/*
 * The complete input held in memory, together with the position at
 * which conversion is to begin.
 */
struct	buf {
	char	*buf;	/* start of the raw input bytes */
	size_t	 sz;	/* number of bytes in buf */
	size_t	 offs;	/* offset of the first byte to convert */
};
50:
/*
 * One supported encoding: the name it is selected by, and the routine
 * that converts a buffer in that encoding.  The routine returns
 * non-zero on success and zero when the input is malformed.
 */
struct	encode {
	const char	 *name;				/* encoding name */
	int		(*conv)(const struct buf *);	/* converter */
};
55:
1.3 kristaps 56: static int cue_enc(const struct buf *, size_t *, enum enc *);
1.1 kristaps 57: static int conv_latin_1(const struct buf *);
58: static int conv_us_ascii(const struct buf *);
59: static int conv_utf_8(const struct buf *);
60: static int read_whole_file(const char *, int,
61: struct buf *, int *);
62: static void resize_buf(struct buf *, size_t);
63: static void usage(void);
64:
65: static const struct encode encs[ENC__MAX] = {
66: { "utf-8", conv_utf_8 }, /* ENC_UTF_8 */
67: { "us-ascii", conv_us_ascii }, /* ENC_US_ASCII */
68: { "latin-1", conv_latin_1 }, /* ENC_LATIN_1 */
69: };
70:
71: static const char *progname;
72:
73: static void
74: usage(void)
75: {
76:
77: fprintf(stderr, "usage: %s "
78: "[-D enc] "
79: "[-e ENC] "
80: "[file]\n", progname);
81: }
82:
83: static int
84: conv_latin_1(const struct buf *b)
85: {
86: size_t i;
1.2 kristaps 87: unsigned char cu;
1.1 kristaps 88: const char *cp;
89:
90: cp = b->buf + (int)b->offs;
91:
92: /*
93: * Latin-1 falls into the first 256 code-points of Unicode, so
94: * there's no need for any sort of translation. Just make the
95: * 8-bit characters use the Unicode escape.
1.3 kristaps 96: * Note that binary values 128 < v < 160 are passed through
97: * unmodified to mandoc.
1.1 kristaps 98: */
99:
100: for (i = b->offs; i < b->sz; i++) {
1.2 kristaps 101: cu = (unsigned char)*cp++;
1.3 kristaps 102: cu < 160U ? putchar(cu) : printf("\\[u%.4X]", cu);
1.1 kristaps 103: }
104:
105: return(1);
106: }
107:
108: static int
109: conv_us_ascii(const struct buf *b)
110: {
111:
112: /*
113: * US-ASCII has no conversion since it falls into the first 128
114: * bytes of Unicode.
115: */
116:
117: fwrite(b->buf, 1, b->sz, stdout);
118: return(1);
119: }
120:
121: static int
122: conv_utf_8(const struct buf *b)
123: {
1.2 kristaps 124: int state, be;
125: unsigned int accum;
126: size_t i;
127: unsigned char cu;
128: const char *cp;
129: const long one = 1L;
130:
131: cp = b->buf + (int)b->offs;
132: state = 0;
133: accum = 0U;
134: be = 0;
135:
136: /* Quick test for big-endian value. */
137:
1.4 kristaps 138: if ( ! (*((const char *)(&one))))
1.2 kristaps 139: be = 1;
140:
141: for (i = b->offs; i < b->sz; i++) {
142: cu = (unsigned char)*cp++;
143: if (state) {
144: if ( ! (cu & 128) || (cu & 64)) {
145: /* Bad sequence header. */
146: return(0);
147: }
148:
149: /* Accept only legitimate bit patterns. */
150:
151: if (cu > 191 || cu < 128) {
152: /* Bad in-sequence bits. */
153: return(0);
154: }
155:
156: accum |= (cu & 63) << --state * 6;
157:
158: /*
159: * Accum is held in little-endian order as
160: * stipulated by the UTF-8 sequence coding. We
161: * need to convert to a native big-endian if our
162: * architecture requires it.
163: */
164:
165: if (0 == state && be)
166: accum = (accum >> 24) |
167: ((accum << 8) & 0x00FF0000) |
168: ((accum >> 8) & 0x0000FF00) |
169: (accum << 24);
170:
171: if (0 == state) {
172: accum < 128U ? putchar(accum) :
173: printf("\\[u%.4X]", accum);
174: accum = 0U;
175: }
176: } else if (cu & (1 << 7)) {
177: /*
178: * Entering a UTF-8 state: if we encounter a
179: * UTF-8 bitmask, calculate the expected UTF-8
180: * state from it.
181: */
182: for (state = 0; state < 7; state++)
183: if ( ! (cu & (1 << (7 - state))))
184: break;
185:
186: /* Accept only legitimate bit patterns. */
187:
188: switch (state) {
189: case (4):
190: if (cu <= 244 && cu >= 240) {
191: accum = (cu & 7) << 18;
192: break;
193: }
194: /* Bad 4-sequence start bits. */
195: return(0);
196: case (3):
197: if (cu <= 239 && cu >= 224) {
198: accum = (cu & 15) << 12;
199: break;
200: }
201: /* Bad 3-sequence start bits. */
202: return(0);
203: case (2):
204: if (cu <= 223 && cu >= 194) {
205: accum = (cu & 31) << 6;
206: break;
207: }
208: /* Bad 2-sequence start bits. */
209: return(0);
210: default:
211: /* Bad sequence bit mask. */
212: return(0);
213: }
214: state--;
215: } else
216: putchar(cu);
217: }
218:
219: if (0 != state) {
220: /* Bad trailing bits. */
221: return(0);
222: }
1.1 kristaps 223:
224: return(1);
225: }
226:
227: static void
228: resize_buf(struct buf *buf, size_t initial)
229: {
230:
231: buf->sz = buf->sz > initial / 2 ?
232: 2 * buf->sz : initial;
233:
234: buf->buf = realloc(buf->buf, buf->sz);
235: if (NULL == buf->buf) {
236: perror(NULL);
237: exit(EXIT_FAILURE);
238: }
239: }
240:
241: static int
242: read_whole_file(const char *f, int fd,
243: struct buf *fb, int *with_mmap)
244: {
245: size_t off;
246: ssize_t ssz;
247:
1.5 kristaps 248: #ifdef HAVE_MMAP
249: struct stat st;
1.1 kristaps 250: if (-1 == fstat(fd, &st)) {
251: perror(f);
252: return(0);
253: }
254:
255: /*
256: * If we're a regular file, try just reading in the whole entry
257: * via mmap(). This is faster than reading it into blocks, and
258: * since each file is only a few bytes to begin with, I'm not
259: * concerned that this is going to tank any machines.
260: */
261:
262: if (S_ISREG(st.st_mode) && st.st_size >= (1U << 31)) {
263: fprintf(stderr, "%s: input too large\n", f);
264: return(0);
265: }
266:
267: if (S_ISREG(st.st_mode)) {
268: *with_mmap = 1;
269: fb->sz = (size_t)st.st_size;
1.6 ! schwarze 270: fb->buf = mmap(NULL, fb->sz, PROT_READ, MAP_SHARED, fd, 0);
1.1 kristaps 271: if (fb->buf != MAP_FAILED)
272: return(1);
273: }
1.5 kristaps 274: #endif
1.1 kristaps 275:
276: /*
277: * If this isn't a regular file (like, say, stdin), then we must
278: * go the old way and just read things in bit by bit.
279: */
280:
281: *with_mmap = 0;
282: off = 0;
283: fb->sz = 0;
284: fb->buf = NULL;
285: for (;;) {
286: if (off == fb->sz && fb->sz == (1U << 31)) {
287: fprintf(stderr, "%s: input too large\n", f);
288: break;
289: }
290:
291: if (off == fb->sz)
292: resize_buf(fb, 65536);
293:
294: ssz = read(fd, fb->buf + (int)off, fb->sz - off);
295: if (ssz == 0) {
296: fb->sz = off;
297: return(1);
298: }
299: if (ssz == -1) {
300: perror(f);
301: break;
302: }
303: off += (size_t)ssz;
304: }
305:
306: free(fb->buf);
307: fb->buf = NULL;
308: return(0);
309: }
310:
1.3 kristaps 311: static int
312: cue_enc(const struct buf *b, size_t *offs, enum enc *enc)
313: {
314: const char *ln, *eoln, *eoph;
315: size_t sz, phsz, nsz;
316: int i;
317:
318: ln = b->buf + (int)*offs;
319: sz = b->sz - *offs;
320:
321: /* Look for the end-of-line. */
322:
323: if (NULL == (eoln = memchr(ln, '\n', sz)))
324: return(-1);
325:
326: /* Set next-line marker. */
327:
328: *offs = (size_t)((eoln + 1) - b->buf);
329:
330: /* Check if we have the correct header/trailer. */
331:
332: if ((sz = (size_t)(eoln - ln)) < 10 ||
333: memcmp(ln, ".\\\" -*-", 7) ||
334: memcmp(eoln - 3, "-*-", 3))
335: return(0);
336:
337: /* Move after the header and adjust for the trailer. */
338:
339: ln += 7;
340: sz -= 10;
341:
342: while (sz > 0) {
343: while (sz > 0 && ' ' == *ln) {
344: ln++;
345: sz--;
346: }
347: if (0 == sz)
348: break;
349:
350: /* Find the end-of-phrase marker (or eoln). */
351:
352: if (NULL == (eoph = memchr(ln, ';', sz)))
353: eoph = eoln - 3;
354: else
355: eoph++;
356:
357: /* Only account for the "coding" phrase. */
358:
359: if ((phsz = (size_t)(eoph - ln)) < 7 ||
360: strncasecmp(ln, "coding:", 7)) {
361: sz -= phsz;
362: ln += phsz;
363: continue;
364: }
365:
366: sz -= 7;
367: ln += 7;
368:
369: while (sz > 0 && ' ' == *ln) {
370: ln++;
371: sz--;
372: }
373: if (0 == sz)
374: break;
375:
376: /* Check us against known encodings. */
377:
1.4 kristaps 378: for (i = 0; i < (int)ENC__MAX; i++) {
1.3 kristaps 379: nsz = strlen(encs[i].name);
380: if (phsz < nsz)
381: continue;
382: if (strncasecmp(ln, encs[i].name, nsz))
383: continue;
384:
385: *enc = (enum enc)i;
386: return(1);
387: }
388:
389: /* Unknown encoding. */
390:
391: *enc = ENC__MAX;
392: return(1);
393: }
394:
395: return(0);
396: }
397:
1.1 kristaps 398: int
399: main(int argc, char *argv[])
400: {
401: int i, ch, map, fd, rc;
1.2 kristaps 402: struct buf b;
1.1 kristaps 403: const char *fn;
404: enum enc enc, def;
1.4 kristaps 405: unsigned char bom[3] = { 0xEF, 0xBB, 0xBF };
1.3 kristaps 406: size_t offs;
1.1 kristaps 407: extern int optind;
408: extern char *optarg;
409:
410: progname = strrchr(argv[0], '/');
411: if (progname == NULL)
412: progname = argv[0];
413: else
414: ++progname;
415:
416: fn = "<stdin>";
417: fd = STDIN_FILENO;
418: rc = EXIT_FAILURE;
419: enc = def = ENC__MAX;
420: map = 0;
421:
1.2 kristaps 422: memset(&b, 0, sizeof(struct buf));
1.1 kristaps 423:
424: while (-1 != (ch = getopt(argc, argv, "D:e:rdvh")))
425: switch (ch) {
426: case ('D'):
427: /* FALLTHROUGH */
428: case ('e'):
1.4 kristaps 429: for (i = 0; i < (int)ENC__MAX; i++) {
1.1 kristaps 430: if (strcasecmp(optarg, encs[i].name))
431: continue;
432: break;
433: }
1.4 kristaps 434: if (i < (int)ENC__MAX) {
1.1 kristaps 435: if ('D' == ch)
436: def = (enum enc)i;
437: else
438: enc = (enum enc)i;
439: break;
440: }
441:
442: fprintf(stderr, "%s: Bad encoding\n", optarg);
443: return(EXIT_FAILURE);
444: case ('r'):
445: /* FALLTHROUGH */
446: case ('d'):
447: /* FALLTHROUGH */
448: case ('v'):
449: /* Compatibility with GNU preconv. */
450: break;
451: case ('h'):
452: /* Compatibility with GNU preconv. */
453: /* FALLTHROUGH */
454: default:
455: usage();
456: return(EXIT_FAILURE);
457: }
458:
459: argc -= optind;
460: argv += optind;
461:
462: /*
463: * Open and read the first argument on the command-line.
464: * If we don't have one, we default to stdin.
465: */
466:
467: if (argc > 0) {
468: fn = *argv;
469: fd = open(fn, O_RDONLY, 0);
470: if (-1 == fd) {
471: perror(fn);
472: return(EXIT_FAILURE);
473: }
474: }
475:
1.2 kristaps 476: if ( ! read_whole_file(fn, fd, &b, &map))
1.1 kristaps 477: goto out;
478:
1.2 kristaps 479: /* Try to read the UTF-8 BOM. */
480:
481: if (ENC__MAX == enc)
482: if (b.sz > 3 && 0 == memcmp(b.buf, bom, 3)) {
483: b.offs = 3;
484: enc = ENC_UTF_8;
485: }
1.1 kristaps 486:
1.3 kristaps 487: /* Try reading from the "-*-" cue. */
488:
489: if (ENC__MAX == enc) {
490: offs = b.offs;
491: ch = cue_enc(&b, &offs, &enc);
492: if (0 == ch)
493: ch = cue_enc(&b, &offs, &enc);
494: }
495:
1.1 kristaps 496: /*
497: * No encoding has been detected.
498: * Thus, we either fall into our default encoder, if specified,
499: * or use Latin-1 if all else fails.
500: */
501:
502: if (ENC__MAX == enc)
503: enc = ENC__MAX == def ? ENC_LATIN_1 : def;
504:
1.3 kristaps 505: if ( ! (*encs[(int)enc].conv)(&b)) {
506: fprintf(stderr, "%s: Bad encoding\n", fn);
1.1 kristaps 507: goto out;
1.3 kristaps 508: }
1.1 kristaps 509:
510: rc = EXIT_SUCCESS;
511: out:
1.5 kristaps 512: #ifdef HAVE_MMAP
1.1 kristaps 513: if (map)
1.2 kristaps 514: munmap(b.buf, b.sz);
1.1 kristaps 515: else
1.5 kristaps 516: #endif
1.2 kristaps 517: free(b.buf);
1.1 kristaps 518:
519: if (fd > STDIN_FILENO)
520: close(fd);
521:
522: return(rc);
523: }
CVSweb