Annotation of mandoc/preconv.c, Revision 1.1
1.1 ! kristaps 1: /* $Id: read.c,v 1.14 2011/04/30 10:18:24 kristaps Exp $ */
! 2: /*
! 3: * Copyright (c) 2011 Kristaps Dzonsons <kristaps@bsd.lv>
! 4: *
! 5: * Permission to use, copy, modify, and distribute this software for any
! 6: * purpose with or without fee is hereby granted, provided that the above
! 7: * copyright notice and this permission notice appear in all copies.
! 8: *
! 9: * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
! 10: * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
! 11: * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
! 12: * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
! 13: * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
! 14: * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
! 15: * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
! 16: */
! 17: #ifdef HAVE_CONFIG_H
! 18: #include "config.h"
! 19: #endif
! 20:
! 21: #include <sys/stat.h>
! 22: #include <sys/mman.h>
! 23:
! 24: #include <assert.h>
! 25: #include <fcntl.h>
! 26: #include <stdio.h>
! 27: #include <stdlib.h>
! 28: #include <string.h>
! 29: #include <unistd.h>
! 30:
! 31: /*
! 32: * The read_whole_file() and resize_buf() functions are copied from
! 33: * read.c, including all dependency code (MAP_FILE, etc.).
! 34: */
! 35:
/*
 * Fall back to a no-op flag value when the system headers do not
 * provide MAP_FILE, so that it can still be OR'ed into the mmap()
 * flags used by read_whole_file().
 */
#if !defined(MAP_FILE)
#define	MAP_FILE	0
#endif
! 39:
/*
 * Input encodings understood by the program.
 * The enumerators index the encs[] table; ENC__MAX is the number of
 * encodings and is also used by main() to mean "none selected".
 */
enum	enc {
	ENC_UTF_8 = 0,	/* UTF-8 */
	ENC_US_ASCII,	/* US-ASCII */
	ENC_LATIN_1,	/* Latin-1 */
	ENC__MAX	/* count of the entries above */
};
! 46:
/*
 * The complete input held in memory, either mmap()ed or read into a
 * heap buffer by read_whole_file().
 */
struct	buf {
	char		*buf;	/* raw (binary) input bytes */
	size_t		 sz;	/* number of bytes in buf */
	size_t		 offs;	/* offset in buf where the data starts */
};
! 52:
/*
 * One row of the encoder table: the name accepted on the command line
 * and the routine that converts a buffer in that encoding.
 */
struct	encode {
	const char	 *name;				/* encoding name */
	int		(*conv)(const struct buf *b);	/* converter; main() treats 0 as failure */
};
! 57:
/* Forward declarations of the file-local functions. */
static	int	 conv_latin_1(const struct buf *b);
static	int	 conv_us_ascii(const struct buf *b);
static	int	 conv_utf_8(const struct buf *b);
static	int	 read_whole_file(const char *f, int fd,
			struct buf *fb, int *with_mmap);
static	void	 resize_buf(struct buf *buf, size_t initial);
static	void	 usage(void);
! 65:
! 66: static const struct encode encs[ENC__MAX] = {
! 67: { "utf-8", conv_utf_8 }, /* ENC_UTF_8 */
! 68: { "us-ascii", conv_us_ascii }, /* ENC_US_ASCII */
! 69: { "latin-1", conv_latin_1 }, /* ENC_LATIN_1 */
! 70: };
! 71:
/* Program name shown in the usage message; assigned by main(). */
static	const char	 *progname;

/*
 * Write the command-line synopsis to standard error.
 */
static void
usage(void)
{

	fprintf(stderr, "usage: %s [-D enc] [-e ENC] [file]\n",
	    progname);
}
! 83:
! 84: static int
! 85: conv_latin_1(const struct buf *b)
! 86: {
! 87: size_t i;
! 88: unsigned char c;
! 89: const char *cp;
! 90:
! 91: cp = b->buf + (int)b->offs;
! 92:
! 93: /*
! 94: * Latin-1 falls into the first 256 code-points of Unicode, so
! 95: * there's no need for any sort of translation. Just make the
! 96: * 8-bit characters use the Unicode escape.
! 97: */
! 98:
! 99: for (i = b->offs; i < b->sz; i++) {
! 100: c = (unsigned char)*cp++;
! 101: c < 128 ? putchar(c) : printf("\\[u%.4X]", c);
! 102: }
! 103:
! 104: return(1);
! 105: }
! 106:
! 107: static int
! 108: conv_us_ascii(const struct buf *b)
! 109: {
! 110:
! 111: /*
! 112: * US-ASCII has no conversion since it falls into the first 128
! 113: * bytes of Unicode.
! 114: */
! 115:
! 116: fwrite(b->buf, 1, b->sz, stdout);
! 117: return(1);
! 118: }
! 119:
! 120: static int
! 121: conv_utf_8(const struct buf *b)
! 122: {
! 123:
! 124: return(1);
! 125: }
! 126:
! 127: static void
! 128: resize_buf(struct buf *buf, size_t initial)
! 129: {
! 130:
! 131: buf->sz = buf->sz > initial / 2 ?
! 132: 2 * buf->sz : initial;
! 133:
! 134: buf->buf = realloc(buf->buf, buf->sz);
! 135: if (NULL == buf->buf) {
! 136: perror(NULL);
! 137: exit(EXIT_FAILURE);
! 138: }
! 139: }
! 140:
! 141: static int
! 142: read_whole_file(const char *f, int fd,
! 143: struct buf *fb, int *with_mmap)
! 144: {
! 145: struct stat st;
! 146: size_t off;
! 147: ssize_t ssz;
! 148:
! 149: if (-1 == fstat(fd, &st)) {
! 150: perror(f);
! 151: return(0);
! 152: }
! 153:
! 154: /*
! 155: * If we're a regular file, try just reading in the whole entry
! 156: * via mmap(). This is faster than reading it into blocks, and
! 157: * since each file is only a few bytes to begin with, I'm not
! 158: * concerned that this is going to tank any machines.
! 159: */
! 160:
! 161: if (S_ISREG(st.st_mode) && st.st_size >= (1U << 31)) {
! 162: fprintf(stderr, "%s: input too large\n", f);
! 163: return(0);
! 164: }
! 165:
! 166: if (S_ISREG(st.st_mode)) {
! 167: *with_mmap = 1;
! 168: fb->sz = (size_t)st.st_size;
! 169: fb->buf = mmap(NULL, fb->sz, PROT_READ,
! 170: MAP_FILE|MAP_SHARED, fd, 0);
! 171: if (fb->buf != MAP_FAILED)
! 172: return(1);
! 173: }
! 174:
! 175: /*
! 176: * If this isn't a regular file (like, say, stdin), then we must
! 177: * go the old way and just read things in bit by bit.
! 178: */
! 179:
! 180: *with_mmap = 0;
! 181: off = 0;
! 182: fb->sz = 0;
! 183: fb->buf = NULL;
! 184: for (;;) {
! 185: if (off == fb->sz && fb->sz == (1U << 31)) {
! 186: fprintf(stderr, "%s: input too large\n", f);
! 187: break;
! 188: }
! 189:
! 190: if (off == fb->sz)
! 191: resize_buf(fb, 65536);
! 192:
! 193: ssz = read(fd, fb->buf + (int)off, fb->sz - off);
! 194: if (ssz == 0) {
! 195: fb->sz = off;
! 196: return(1);
! 197: }
! 198: if (ssz == -1) {
! 199: perror(f);
! 200: break;
! 201: }
! 202: off += (size_t)ssz;
! 203: }
! 204:
! 205: free(fb->buf);
! 206: fb->buf = NULL;
! 207: return(0);
! 208: }
! 209:
! 210: int
! 211: main(int argc, char *argv[])
! 212: {
! 213: int i, ch, map, fd, rc;
! 214: struct buf buf;
! 215: const char *fn;
! 216: enum enc enc, def;
! 217: extern int optind;
! 218: extern char *optarg;
! 219:
! 220: progname = strrchr(argv[0], '/');
! 221: if (progname == NULL)
! 222: progname = argv[0];
! 223: else
! 224: ++progname;
! 225:
! 226: fn = "<stdin>";
! 227: fd = STDIN_FILENO;
! 228: rc = EXIT_FAILURE;
! 229: enc = def = ENC__MAX;
! 230: map = 0;
! 231:
! 232: memset(&buf, 0, sizeof(struct buf));
! 233:
! 234: while (-1 != (ch = getopt(argc, argv, "D:e:rdvh")))
! 235: switch (ch) {
! 236: case ('D'):
! 237: /* FALLTHROUGH */
! 238: case ('e'):
! 239: for (i = 0; i < ENC__MAX; i++) {
! 240: if (strcasecmp(optarg, encs[i].name))
! 241: continue;
! 242: break;
! 243: }
! 244: if (i < ENC__MAX) {
! 245: if ('D' == ch)
! 246: def = (enum enc)i;
! 247: else
! 248: enc = (enum enc)i;
! 249: break;
! 250: }
! 251:
! 252: fprintf(stderr, "%s: Bad encoding\n", optarg);
! 253: return(EXIT_FAILURE);
! 254: case ('r'):
! 255: /* FALLTHROUGH */
! 256: case ('d'):
! 257: /* FALLTHROUGH */
! 258: case ('v'):
! 259: /* Compatibility with GNU preconv. */
! 260: break;
! 261: case ('h'):
! 262: /* Compatibility with GNU preconv. */
! 263: /* FALLTHROUGH */
! 264: default:
! 265: usage();
! 266: return(EXIT_FAILURE);
! 267: }
! 268:
! 269: argc -= optind;
! 270: argv += optind;
! 271:
! 272: /*
! 273: * Open and read the first argument on the command-line.
! 274: * If we don't have one, we default to stdin.
! 275: */
! 276:
! 277: if (argc > 0) {
! 278: fn = *argv;
! 279: fd = open(fn, O_RDONLY, 0);
! 280: if (-1 == fd) {
! 281: perror(fn);
! 282: return(EXIT_FAILURE);
! 283: }
! 284: }
! 285:
! 286: if ( ! read_whole_file(fn, fd, &buf, &map))
! 287: goto out;
! 288:
! 289: if (ENC__MAX == enc) {
! 290: /* TODO: search for BOM. */
! 291: }
! 292:
! 293: /*
! 294: * No encoding has been detected.
! 295: * Thus, we either fall into our default encoder, if specified,
! 296: * or use Latin-1 if all else fails.
! 297: */
! 298:
! 299: if (ENC__MAX == enc)
! 300: enc = ENC__MAX == def ? ENC_LATIN_1 : def;
! 301:
! 302: if ( ! (*encs[(int)enc].conv)(&buf))
! 303: goto out;
! 304:
! 305: rc = EXIT_SUCCESS;
! 306: out:
! 307: if (map)
! 308: munmap(buf.buf, buf.sz);
! 309: else
! 310: free(buf.buf);
! 311:
! 312: if (fd > STDIN_FILENO)
! 313: close(fd);
! 314:
! 315: return(rc);
! 316: }
CVSweb