mandoc/preconv.c - annotate

Return to preconv.c CVS log
Up to [cvsweb.bsd.lv] / mandoc
Annotation of mandoc/preconv.c, Revision 1.1

1.1     ! kristaps    1: /*     $Id: read.c,v 1.14 2011/04/30 10:18:24 kristaps Exp $ */
        !             2: /*
        !             3:  * Copyright (c) 2011 Kristaps Dzonsons <kristaps@bsd.lv>
        !             4:  *
        !             5:  * Permission to use, copy, modify, and distribute this software for any
        !             6:  * purpose with or without fee is hereby granted, provided that the above
        !             7:  * copyright notice and this permission notice appear in all copies.
        !             8:  *
        !             9:  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
        !            10:  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
        !            11:  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
        !            12:  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
        !            13:  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
        !            14:  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
        !            15:  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
        !            16:  */
        !            17: #ifdef HAVE_CONFIG_H
        !            18: #include "config.h"
        !            19: #endif
        !            20:
        !            21: #include <sys/stat.h>
        !            22: #include <sys/mman.h>
        !            23:
        !            24: #include <assert.h>
        !            25: #include <fcntl.h>
        !            26: #include <stdio.h>
        !            27: #include <stdlib.h>
        !            28: #include <string.h>
        !            29: #include <unistd.h>
        !            30:
        !            31: /*
        !            32:  * The read_whole_file() and resize_buf() functions are copied from
        !            33:  * read.c, including all dependency code (MAP_FILE, etc.).
        !            34:  */
        !            35:
        !            36: #ifndef MAP_FILE /* Fallback for systems whose headers lack MAP_FILE. */
        !            37: #define        MAP_FILE        0 /* OR'ed into the mmap() flags in read_whole_file(); 0 contributes nothing */
        !            38: #endif
        !            39:
        !            40: enum   enc { /* Input encodings; each value is an index into encs[]. */
        !            41:        ENC_UTF_8, /* UTF-8 */
        !            42:        ENC_US_ASCII, /* US-ASCII */
        !            43:        ENC_LATIN_1, /* Latin-1; main() falls back to this when nothing is specified */
        !            44:        ENC__MAX /* number of encodings; main() also uses it to mean "not specified" */
        !            45: };
        !            46:
        !            47: struct buf { /* Whole-file input buffer filled in by read_whole_file(). */
        !            48:        char             *buf; /* input bytes: either mmap()ed or heap-allocated via resize_buf() */
        !            49:        size_t            sz; /* number of valid bytes in buf */
        !            50:        size_t            offs; /* offset of the first byte a converter should consume; zeroed in main() */
        !            51: };
        !            52:
        !            53: struct encode { /* One entry of the encs[] dispatch table. */
        !            54:        const char       *name; /* name compared case-insensitively with the -D/-e argument */
        !            55:        int             (*conv)(const struct buf *); /* converter; main() treats a zero return as failure */
        !            56: };
        !            57:
        !            58: static int      conv_latin_1(const struct buf *); /* Forward declarations of the file-local functions defined below. */
        !            59: static int      conv_us_ascii(const struct buf *);
        !            60: static int      conv_utf_8(const struct buf *);
        !            61: static int      read_whole_file(const char *, int,
        !            62:                        struct buf *, int *); /* arguments: name for messages, fd, output buffer, out-flag set when mmap()ed */
        !            63: static void     resize_buf(struct buf *, size_t); /* grows the buffer; exits the process on allocation failure */
        !            64: static void     usage(void);
        !            65:
        !            66: static const struct encode encs[ENC__MAX] = { /* Indexed by enum enc, so entry order must match the enum. */
        !            67:        { "utf-8", conv_utf_8 }, /* ENC_UTF_8 */
        !            68:        { "us-ascii", conv_us_ascii }, /* ENC_US_ASCII */
        !            69:        { "latin-1", conv_latin_1 }, /* ENC_LATIN_1 */
        !            70: };
        !            71:
        !            72: static const char       *progname; /* basename of argv[0]; set at the top of main(), printed by usage() */
        !            73:
        !            74: static void
        !            75: usage(void)
        !            76: {
        !            77:
        !            78:        fprintf(stderr, "usage: %s "
        !            79:                        "[-D enc] "
        !            80:                        "[-e ENC] "
        !            81:                        "[file]\n", progname);
        !            82: }
        !            83:
        !            84: static int
        !            85: conv_latin_1(const struct buf *b)
        !            86: {
        !            87:        size_t           i;
        !            88:        unsigned char    c;
        !            89:        const char      *cp;
        !            90:
        !            91:        cp = b->buf + (int)b->offs;
        !            92:
        !            93:        /*
        !            94:         * Latin-1 falls into the first 256 code-points of Unicode, so
        !            95:         * there's no need for any sort of translation.  Just make the
        !            96:         * 8-bit characters use the Unicode escape.
        !            97:         */
        !            98:
        !            99:        for (i = b->offs; i < b->sz; i++) {
        !           100:                c = (unsigned char)*cp++;
        !           101:                c < 128 ? putchar(c) : printf("\\[u%.4X]", c);
        !           102:        }
        !           103:
        !           104:        return(1);
        !           105: }
        !           106:
        !           107: static int
        !           108: conv_us_ascii(const struct buf *b)
        !           109: {
        !           110:
        !           111:        /*
        !           112:         * US-ASCII has no conversion since it falls into the first 128
        !           113:         * bytes of Unicode.
        !           114:         */
        !           115:
        !           116:        fwrite(b->buf, 1, b->sz, stdout);
        !           117:        return(1);
        !           118: }
        !           119:
        !           120: static int
        !           121: conv_utf_8(const struct buf *b)
        !           122: {
        !           123:
        !           124:        return(1);
        !           125: }
        !           126:
        !           127: static void
        !           128: resize_buf(struct buf *buf, size_t initial)
        !           129: {
        !           130:
        !           131:        buf->sz = buf->sz > initial / 2 ?
        !           132:                2 * buf->sz : initial;
        !           133:
        !           134:        buf->buf = realloc(buf->buf, buf->sz);
        !           135:        if (NULL == buf->buf) {
        !           136:                perror(NULL);
        !           137:                exit(EXIT_FAILURE);
        !           138:        }
        !           139: }
        !           140:
        !           141: static int
        !           142: read_whole_file(const char *f, int fd,
        !           143:                struct buf *fb, int *with_mmap)
        !           144: {
        !           145:        struct stat      st;
        !           146:        size_t           off;
        !           147:        ssize_t          ssz;
        !           148:
        !           149:        if (-1 == fstat(fd, &st)) {
        !           150:                perror(f);
        !           151:                return(0);
        !           152:        }
        !           153:
        !           154:        /*
        !           155:         * If we're a regular file, try just reading in the whole entry
        !           156:         * via mmap().  This is faster than reading it into blocks, and
        !           157:         * since each file is only a few bytes to begin with, I'm not
        !           158:         * concerned that this is going to tank any machines.
        !           159:         */
        !           160:
        !           161:        if (S_ISREG(st.st_mode) && st.st_size >= (1U << 31)) {
        !           162:                fprintf(stderr, "%s: input too large\n", f);
        !           163:                return(0);
        !           164:        }
        !           165:
        !           166:        if (S_ISREG(st.st_mode)) {
        !           167:                *with_mmap = 1;
        !           168:                fb->sz = (size_t)st.st_size;
        !           169:                fb->buf = mmap(NULL, fb->sz, PROT_READ,
        !           170:                                MAP_FILE|MAP_SHARED, fd, 0);
        !           171:                if (fb->buf != MAP_FAILED)
        !           172:                        return(1);
        !           173:        }
        !           174:
        !           175:        /*
        !           176:         * If this isn't a regular file (like, say, stdin), then we must
        !           177:         * go the old way and just read things in bit by bit.
        !           178:         */
        !           179:
        !           180:        *with_mmap = 0;
        !           181:        off = 0;
        !           182:        fb->sz = 0;
        !           183:        fb->buf = NULL;
        !           184:        for (;;) {
        !           185:                if (off == fb->sz && fb->sz == (1U << 31)) {
        !           186:                        fprintf(stderr, "%s: input too large\n", f);
        !           187:                        break;
        !           188:                }
        !           189:
        !           190:                if (off == fb->sz)
        !           191:                        resize_buf(fb, 65536);
        !           192:
        !           193:                ssz = read(fd, fb->buf + (int)off, fb->sz - off);
        !           194:                if (ssz == 0) {
        !           195:                        fb->sz = off;
        !           196:                        return(1);
        !           197:                }
        !           198:                if (ssz == -1) {
        !           199:                        perror(f);
        !           200:                        break;
        !           201:                }
        !           202:                off += (size_t)ssz;
        !           203:        }
        !           204:
        !           205:        free(fb->buf);
        !           206:        fb->buf = NULL;
        !           207:        return(0);
        !           208: }
        !           209:
        !           210: int
        !           211: main(int argc, char *argv[])
        !           212: {
        !           213:        int              i, ch, map, fd, rc;
        !           214:        struct buf       buf;
        !           215:        const char      *fn;
        !           216:        enum enc         enc, def;
        !           217:        extern int       optind;
        !           218:        extern char     *optarg;
        !           219:
        !           220:        progname = strrchr(argv[0], '/');
        !           221:        if (progname == NULL)
        !           222:                progname = argv[0];
        !           223:        else
        !           224:                ++progname;
        !           225:
        !           226:        fn = "<stdin>";
        !           227:        fd = STDIN_FILENO;
        !           228:        rc = EXIT_FAILURE;
        !           229:        enc = def = ENC__MAX;
        !           230:        map = 0;
        !           231:
        !           232:        memset(&buf, 0, sizeof(struct buf));
        !           233:
        !           234:        while (-1 != (ch = getopt(argc, argv, "D:e:rdvh")))
        !           235:                switch (ch) {
        !           236:                case ('D'):
        !           237:                        /* FALLTHROUGH */
        !           238:                case ('e'):
        !           239:                        for (i = 0; i < ENC__MAX; i++) {
        !           240:                                if (strcasecmp(optarg, encs[i].name))
        !           241:                                        continue;
        !           242:                                break;
        !           243:                        }
        !           244:                        if (i < ENC__MAX) {
        !           245:                                if ('D' == ch)
        !           246:                                        def = (enum enc)i;
        !           247:                                else
        !           248:                                        enc = (enum enc)i;
        !           249:                                break;
        !           250:                        }
        !           251:
        !           252:                        fprintf(stderr, "%s: Bad encoding\n", optarg);
        !           253:                        return(EXIT_FAILURE);
        !           254:                case ('r'):
        !           255:                        /* FALLTHROUGH */
        !           256:                case ('d'):
        !           257:                        /* FALLTHROUGH */
        !           258:                case ('v'):
        !           259:                        /* Compatibility with GNU preconv. */
        !           260:                        break;
        !           261:                case ('h'):
        !           262:                        /* Compatibility with GNU preconv. */
        !           263:                        /* FALLTHROUGH */
        !           264:                default:
        !           265:                        usage();
        !           266:                        return(EXIT_FAILURE);
        !           267:                }
        !           268:
        !           269:        argc -= optind;
        !           270:        argv += optind;
        !           271:
        !           272:        /*
        !           273:         * Open and read the first argument on the command-line.
        !           274:         * If we don't have one, we default to stdin.
        !           275:         */
        !           276:
        !           277:        if (argc > 0) {
        !           278:                fn = *argv;
        !           279:                fd = open(fn, O_RDONLY, 0);
        !           280:                if (-1 == fd) {
        !           281:                        perror(fn);
        !           282:                        return(EXIT_FAILURE);
        !           283:                }
        !           284:        }
        !           285:
        !           286:        if ( ! read_whole_file(fn, fd, &buf, &map))
        !           287:                goto out;
        !           288:
        !           289:        if (ENC__MAX == enc) {
        !           290:                /* TODO: search for BOM. */
        !           291:        }
        !           292:
        !           293:        /*
        !           294:         * No encoding has been detected.
        !           295:         * Thus, we either fall into our default encoder, if specified,
        !           296:         * or use Latin-1 if all else fails.
        !           297:         */
        !           298:
        !           299:        if (ENC__MAX == enc)
        !           300:                enc = ENC__MAX == def ? ENC_LATIN_1 : def;
        !           301:
        !           302:        if ( ! (*encs[(int)enc].conv)(&buf))
        !           303:                goto out;
        !           304:
        !           305:        rc = EXIT_SUCCESS;
        !           306: out:
        !           307:        if (map)
        !           308:                munmap(buf.buf, buf.sz);
        !           309:        else
        !           310:                free(buf.buf);
        !           311:
        !           312:        if (fd > STDIN_FILENO)
        !           313:                close(fd);
        !           314:
        !           315:        return(rc);
        !           316: }
CVSweb