/* $Id: util.c,v 1.4 2015/02/21 17:00:33 kristaps Exp $ */ /* * Copyright (c) 2015 Kristaps Dzonsons * * Permission to use, copy, modify, and distribute this software for any * purpose with or without fee is hereby granted, provided that the above * copyright notice and this permission notice appear in all copies. * * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. */ #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include "extern.h" /* * Unmap the top-most file in the stack of files currently opened (that * is, nested calls to parsefile()). */ void texifilepop(struct texi *p) { struct texifile *f; assert(p->filepos > 0); f = &p->files[--p->filepos]; munmap(f->map, f->mapsz); } /* * Unmap all files that we're currently using and free all resources * that we've allocated during the parse. * The utility should exit(...) after this is called. */ void texiexit(struct texi *p) { size_t i; /* Make sure we're newline-terminated. */ if (p->outcol) putchar('\n'); /* Unmap all files. */ while (p->filepos > 0) texifilepop(p); for (i = 0; i < p->dirsz; i++) free(p->dirs[i]); for (i = 0; i < p->indexsz; i++) free(p->indexs[i]); for (i = 0; i < p->valsz; i++) { free(p->vals[i].value); free(p->vals[i].key); } free(p->vals); free(p->indexs); free(p->dirs); free(p->subtitle); free(p->title); } /* * Fatal error: unmap all files and exit. * The "errstring" is passed to perror(3). */ void texiabort(struct texi *p, const char *errstring) { perror(errstring); texiexit(p); exit(EXIT_FAILURE); } /* * Print a generic warning message (to stderr) tied to our current * location in the parse sequence. */ void texiwarn(const struct texi *p, const char *fmt, ...) { va_list ap; fprintf(stderr, "%s:%zu:%zu: warning: ", p->files[p->filepos - 1].name, p->files[p->filepos - 1].line + 1, p->files[p->filepos - 1].col + 1); va_start(ap, fmt); vfprintf(stderr, fmt, ap); va_end(ap); fputc('\n', stderr); } /* * Print an error message (to stderr) tied to our current location in * the parse sequence, invoke texiexit(), then die. */ void texierr(struct texi *p, const char *fmt, ...) { va_list ap; fprintf(stderr, "%s:%zu:%zu: error: ", p->files[p->filepos - 1].name, p->files[p->filepos - 1].line + 1, p->files[p->filepos - 1].col + 1); va_start(ap, fmt); vfprintf(stderr, fmt, ap); va_end(ap); fputc('\n', stderr); texiexit(p); exit(EXIT_FAILURE); } /* * Put a single data character to the output if we're not ignoring. * Makes sure we don't spurriously start a macro. * Adjusts our output status. * This shouldn't be called for macros: just for ordinary text. */ void texiputchar(struct texi *p, char c) { if (p->ign) return; if ('.' == c && 0 == p->outcol) fputs("\\&", stdout); putchar(c); p->seenvs = 0; if ('\n' == c) { p->outcol = 0; p->seenws = 0; } else p->outcol++; } /* * Put multiple characters (see texiputchar()). * This shouldn't be called for macros: just for ordinary text. */ void texiputchars(struct texi *p, const char *s) { while ('\0' != *s) texiputchar(p, *s++); } /* * Close an mdoc(7) macro opened with teximacroopen(). * If there are no more macros on the line, prints a newline. */ void teximacroclose(struct texi *p) { if (p->ign) return; if (0 == --p->outmacro) { putchar('\n'); p->outcol = p->seenws = 0; } } /* * Open a mdoc(7) macro. * This is used for line macros, e.g., Qq [foo bar baz]. * It can be invoked for nested macros, e.g., Qq Li foo . * TODO: flush-right punctuation (e.g., parenthesis). */ void teximacroopen(struct texi *p, const char *s) { int rc; if (p->ign) return; if (p->outcol && 0 == p->outmacro) { putchar('\n'); p->outcol = 0; } if (0 == p->outmacro) putchar('.'); else putchar(' '); if (EOF != (rc = fputs(s, stdout))) p->outcol += rc; putchar(' '); p->outcol++; p->outmacro++; p->seenws = 0; } /* * Put a stadnalone mdoc(7) command with the trailing newline. */ void teximacro(struct texi *p, const char *s) { if (p->ign) return; if (p->outmacro) texierr(p, "\"%s\" in open line scope!?", s); if (p->literal) texierr(p, "\"%s\" in a literal scope!?", s); if (p->outcol) putchar('\n'); putchar('.'); puts(s); p->outcol = p->seenws = 0; } /* * Introduce vertical space during normal (non-macro) input. */ void texivspace(struct texi *p) { if (p->seenvs) return; teximacro(p, "Pp"); p->seenvs = 1; } /* * Advance by a single byte in the input stream, adjusting our location * in the current input file. */ void advance(struct texi *p, const char *buf, size_t *pos) { if ('\n' == buf[*pos]) { p->files[p->filepos - 1].line++; p->files[p->filepos - 1].col = 0; } else p->files[p->filepos - 1].col++; (*pos)++; } /* * It's common to wait punctuation to float on the right side of macro * lines in mdoc(7), e.g., ".Em hello ) ." * This function does so, and should be called before teximacroclose(). * It will detect that it's the last in the nested macros and * appropriately flush-left punctuation alongside the macro. */ void texipunctuate(struct texi *p, const char *buf, size_t sz, size_t *pos) { size_t start, end; if (1 != p->outmacro) return; for (start = end = *pos; end < sz; end++) { switch (buf[end]) { case (','): case (')'): case ('.'): case ('"'): case (':'): case ('!'): case ('?'): continue; default: break; } break; } if (end == *pos) return; if (end + 1 == sz || ' ' == buf[end] || '\n' == buf[end]) { for ( ; start < end; start++) { texiputchar(p, ' '); texiputchar(p, buf[start]); advance(p, buf, pos); } } } /* * Advance to the next non-whitespace word in the input stream. * If we're in literal mode, then print all of the whitespace as we're * doing so. */ static size_t advancenext(struct texi *p, const char *buf, size_t sz, size_t *pos) { if (p->literal) { while (*pos < sz && ismspace(buf[*pos])) { if (*pos && '\n' == buf[*pos] && '\\' == buf[*pos - 1]) texiputchar(p, 'e'); texiputchar(p, buf[*pos]); advance(p, buf, pos); } return(*pos); } while (*pos < sz && ismspace(buf[*pos])) { p->seenws = 1; /* * If it looks like we've printed a double-line, then * output a paragraph. * FIXME: this is stupid. */ if (*pos && '\n' == buf[*pos] && '\n' == buf[*pos - 1]) texivspace(p); advance(p, buf, pos); } return(*pos); } /* * Advance to the EOLN in the input stream. * NOTE: THIS SHOULD NOT BE CALLED ON BLANK TEXT, as it will read up to * the @\n. */ size_t advanceeoln(struct texi *p, const char *buf, size_t sz, size_t *pos, int consumenl) { while (*pos < sz && '\n' != buf[*pos]) advance(p, buf, pos); if (*pos < sz && consumenl) advance(p, buf, pos); return(*pos); } /* * Advance to position "end", which is an absolute position in the * current buffer greater than or equal to the current position. */ void advanceto(struct texi *p, const char *buf, size_t *pos, size_t end) { assert(*pos <= end); while (*pos < end) advance(p, buf, pos); } /* * Output a free-form word in the input stream, progressing to the next * command or white-space. * This also will advance the input stream. */ static void texiword(struct texi *p, const char *buf, size_t sz, size_t *pos, char extra) { if (p->seenws && 0 == p->outmacro && p->outcol > 72 && 0 == p->literal) texiputchar(p, '\n'); /* FIXME: abstract this: we use it elsewhere. */ if (p->seenws && p->outcol && 0 == p->literal) texiputchar(p, ' '); p->seenws = 0; while (*pos < sz && ! ismspace(buf[*pos])) { switch (buf[*pos]) { case ('@'): case ('}'): case ('{'): return; } if ('\0' != extra && buf[*pos] == extra) return; if (*pos < sz - 1 && '`' == buf[*pos] && '`' == buf[*pos + 1]) { texiputchars(p, "\\(lq"); advance(p, buf, pos); } else if (*pos < sz - 1 && '\'' == buf[*pos] && '\'' == buf[*pos + 1]) { texiputchars(p, "\\(rq"); advance(p, buf, pos); } else texiputchar(p, buf[*pos]); advance(p, buf, pos); } } /* * Look up the command at position "pos" in the buffer, returning it (or * TEXICMD__MAX if none found) and setting "end" to be the absolute * index after the command name. */ enum texicmd texicmd(struct texi *p, const char *buf, size_t pos, size_t sz, size_t *end) { size_t i, len, toksz; assert('@' == buf[pos]); if ((*end = pos) == sz) return(TEXICMD__MAX); else if ((*end = ++pos) == sz) return(TEXICMD__MAX); /* Alphabetic commands are special. */ if ( ! isalpha(buf[pos])) { if ((*end = pos + 1) == sz) return(TEXICMD__MAX); for (i = 0; i < TEXICMD__MAX; i++) { if (1 != texitoks[i].len) continue; if (0 == strncmp(texitoks[i].tok, &buf[pos], 1)) return(i); } texiwarn(p, "bad command: @%c", buf[pos]); return(TEXICMD__MAX); } /* Scan to the end of the possible command name. */ for (*end = pos; *end < sz && ! ismspace(buf[*end]); (*end)++) if ((*end > pos && ('@' == buf[*end] || '{' == buf[*end] || '}' == buf[*end]))) break; /* Look for the command. */ len = *end - pos; for (i = 0; i < TEXICMD__MAX; i++) { if (len != texitoks[i].len) continue; if (0 == strncmp(texitoks[i].tok, &buf[pos], len)) return(i); } /* Look for it in our indices. */ for (i = 0; i < p->indexsz; i++) { toksz = strlen(p->indexs[i]); if (len != 5 + toksz) continue; if (strncmp(&buf[pos], p->indexs[i], toksz)) continue; if (0 == strncmp(&buf[pos + toksz], "index", 5)) return(TEXICMD_INDEX); } texiwarn(p, "bad command: @%.*s", (int)len, &buf[pos]); return(TEXICMD__MAX); } /* * Parse an argument from a bracketed command, e.g., @url{foo, baz}. * Num should be set to the argument we're currently parsing, although * it suffixes for it to be zero or non-zero. * This will return 1 if there are more arguments, 0 otherwise. * This will stop (returning 0) in the event of EOF or if we're not at a * bracket for the zeroth parse. */ int parsearg(struct texi *p, const char *buf, size_t sz, size_t *pos, size_t num) { size_t end; enum texicmd cmd; while (*pos < sz && ismspace(buf[*pos])) advance(p, buf, pos); if (*pos == sz || (0 == num && '{' != buf[*pos])) return(0); if (0 == num) advance(p, buf, pos); while ((*pos = advancenext(p, buf, sz, pos)) < sz) { switch (buf[*pos]) { case (','): advance(p, buf, pos); return(1); case ('}'): advance(p, buf, pos); return(0); case ('{'): if (0 == p->ign) texiwarn(p, "unexpected \"{\""); advance(p, buf, pos); continue; case ('@'): break; default: texiword(p, buf, sz, pos, ','); continue; } cmd = texicmd(p, buf, *pos, sz, &end); advanceto(p, buf, pos, end); if (TEXICMD__MAX == cmd) continue; if (NULL != texitoks[cmd].fp) (*texitoks[cmd].fp)(p, cmd, buf, sz, pos); } return(0); } /* * Parse until the end of a bracketed statement, e.g., @foo{bar baz}. * This will stop in the event of EOF or if we're not at a bracket. */ void parsebracket(struct texi *p, const char *buf, size_t sz, size_t *pos) { size_t end; enum texicmd cmd; while (*pos < sz && ismspace(buf[*pos])) advance(p, buf, pos); if (*pos == sz || '{' != buf[*pos]) return; advance(p, buf, pos); while ((*pos = advancenext(p, buf, sz, pos)) < sz) { switch (buf[*pos]) { case ('}'): advance(p, buf, pos); return; case ('{'): if (0 == p->ign) texiwarn(p, "unexpected \"{\""); advance(p, buf, pos); continue; case ('@'): break; default: texiword(p, buf, sz, pos, '\0'); continue; } cmd = texicmd(p, buf, *pos, sz, &end); advanceto(p, buf, pos, end); if (TEXICMD__MAX == cmd) continue; if (NULL != texitoks[cmd].fp) (*texitoks[cmd].fp)(p, cmd, buf, sz, pos); } } /* * This should be invoked when we're on a macro line and want to process * to the end of the current input line, doing all of our macros along * the way. */ void parseeoln(struct texi *p, const char *buf, size_t sz, size_t *pos) { size_t end; enum texicmd cmd; while (*pos < sz && '\n' != buf[*pos]) { while (*pos < sz && isws(buf[*pos])) { p->seenws = 1; if (p->literal) texiputchar(p, buf[*pos]); advance(p, buf, pos); } switch (buf[*pos]) { case ('}'): if (0 == p->ign) texiwarn(p, "unexpected \"}\""); advance(p, buf, pos); continue; case ('{'): if (0 == p->ign) texiwarn(p, "unexpected \"{\""); advance(p, buf, pos); continue; case ('@'): break; default: texiword(p, buf, sz, pos, '\0'); continue; } cmd = texicmd(p, buf, *pos, sz, &end); advanceto(p, buf, pos, end); if (TEXICMD__MAX == cmd) continue; if (NULL != texitoks[cmd].fp) (*texitoks[cmd].fp)(p, cmd, buf, sz, pos); } } /* * Parse a single word or command. * This will return immediately at the EOF. */ void parsesingle(struct texi *p, const char *buf, size_t sz, size_t *pos) { size_t end; enum texicmd cmd; if ((*pos = advancenext(p, buf, sz, pos)) >= sz) return; switch (buf[*pos]) { case ('}'): if (0 == p->ign) texiwarn(p, "unexpected \"}\""); advance(p, buf, pos); return; case ('{'): if (0 == p->ign) texiwarn(p, "unexpected \"{\""); advance(p, buf, pos); return; case ('@'): break; default: texiword(p, buf, sz, pos, '\0'); return; } cmd = texicmd(p, buf, *pos, sz, &end); advanceto(p, buf, pos, end); if (TEXICMD__MAX == cmd) return; if (NULL != texitoks[cmd].fp) (*texitoks[cmd].fp)(p, cmd, buf, sz, pos); } /* * This is used in the @deffn type of command. * These have an arbitrary number of line arguments; however, these * arguments may or may not be surrounded by brackets. * In this function, we parse each one as either a bracketed or * non-bracketed argument, returning 0 when we've reached the end of * line or 1 otherwise. */ int parselinearg(struct texi *p, const char *buf, size_t sz, size_t *pos) { while (*pos < sz && isws(buf[*pos])) { p->seenws = 1; advance(p, buf, pos); } if (*pos < sz && '{' == buf[*pos]) parsebracket(p, buf, sz, pos); else if (*pos < sz && '\n' != buf[*pos]) parsesingle(p, buf, sz, pos); else return(0); return(1); } /* * Parse til the end of the buffer. */ void parseeof(struct texi *p, const char *buf, size_t sz) { size_t pos; for (pos = 0; pos < sz; ) parsesingle(p, buf, sz, &pos); } /* * Parse a block sequence until we have the "@end endtoken" command * invocation. * This will return immediately at EOF. */ void parseto(struct texi *p, const char *buf, size_t sz, size_t *pos, const char *endtoken) { size_t end; enum texicmd cmd; size_t endtoksz; endtoksz = strlen(endtoken); assert(endtoksz > 0); while ((*pos = advancenext(p, buf, sz, pos)) < sz) { switch (buf[*pos]) { case ('}'): if (0 == p->ign) texiwarn(p, "unexpected \"}\""); advance(p, buf, pos); continue; case ('{'): if (0 == p->ign) texiwarn(p, "unexpected \"{\""); advance(p, buf, pos); continue; case ('@'): break; default: texiword(p, buf, sz, pos, '\0'); continue; } cmd = texicmd(p, buf, *pos, sz, &end); advanceto(p, buf, pos, end); if (TEXICMD_END == cmd) { while (*pos < sz && isws(buf[*pos])) advance(p, buf, pos); /* * FIXME: check the full word, not just its * initial substring! */ if (sz - *pos >= endtoksz && 0 == strncmp (&buf[*pos], endtoken, endtoksz)) { advanceeoln(p, buf, sz, pos, 0); break; } if (0 == p->ign) texiwarn(p, "unexpected \"end\""); advanceeoln(p, buf, sz, pos, 0); continue; } else if (TEXICMD__MAX != cmd) if (NULL != texitoks[cmd].fp) (*texitoks[cmd].fp)(p, cmd, buf, sz, pos); } } /* * Memory-map the file "fname" and begin parsing it unless "parse" is * zero, in which case we just dump the file to stdout (making sure it * doesn't trip up mdoc(7) along the way). * This can be called in a nested context. */ void parsefile(struct texi *p, const char *fname, int parse) { struct texifile *f; int fd; struct stat st; size_t i; assert(p->filepos < 64); f = &p->files[p->filepos]; memset(f, 0, sizeof(struct texifile)); f->name = fname; if (-1 == (fd = open(fname, O_RDONLY, 0))) { texiabort(p, fname); } else if (-1 == fstat(fd, &st)) { close(fd); texiabort(p, fname); } f->mapsz = st.st_size; f->map = mmap(NULL, f->mapsz, PROT_READ, MAP_SHARED, fd, 0); close(fd); if (MAP_FAILED == f->map) texiabort(p, fname); p->filepos++; if ( ! parse) { /* * We're printing verbatim output. * Make sure it doesn't get interpreted as mdoc by * escaping escapes and making sure leading dots don't * trigger mdoc(7) expansion. */ for (i = 0; i < f->mapsz; i++) { if (i > 0 && '.' == f->map[i]) if ('\n' == f->map[i - 1]) fputs("\\&", stdout); putchar(f->map[i]); if ('\\' == f->map[i]) putchar('e'); } } else parseeof(p, f->map, f->mapsz); texifilepop(p); } /* * Look up the value to a stored pair's value starting in "buf" from * start to end. * Return the pointer to the value memory, which can be NULL if the * pointer key does not exist. * The pointer can point to NULL if the value has been unset. */ static char ** valuequery(const struct texi *p, const char *buf, size_t start, size_t end) { size_t i, sz, len; assert(end >= start); /* Ignore zero-length. */ if (0 == (len = (end - start))) return(NULL); for (i = 0; i < p->valsz; i++) { sz = strlen(p->vals[i].key); if (sz != len) continue; if (0 == strncmp(p->vals[i].key, &buf[start], len)) return(&p->vals[i].value); } return(NULL); } /* * Parse a key until the end of line, e.g., @clear foo\n, and return the * pointer to its value via valuequery(). */ static char ** valuelquery(struct texi *p, const char *buf, size_t sz, size_t *pos) { size_t start, end; char **ret; while (*pos < sz && isws(buf[*pos])) advance(p, buf, pos); if (*pos == sz) return(NULL); for (start = end = *pos; end < sz; end++) if ('\n' == buf[end]) break; advanceto(p, buf, pos, end); if (*pos < sz) { assert('\n' == buf[*pos]); advance(p, buf, pos); } if (NULL == (ret = valuequery(p, buf, start, end))) return(NULL); return(ret); } void valuelclear(struct texi *p, const char *buf, size_t sz, size_t *pos) { char **ret; if (NULL == (ret = valuelquery(p, buf, sz, pos))) return; free(*ret); *ret = NULL; } const char * valuellookup(struct texi *p, const char *buf, size_t sz, size_t *pos) { char **ret; if (NULL == (ret = valuelquery(p, buf, sz, pos))) return(NULL); return(*ret); } /* * Parse a key from a bracketed string, e.g., @value{foo}, and return * the pointer to its value. * If the returned pointer is NULL, either there was no string within * the brackets (or no brackets), or the value was not found, or the * value had previously been unset. */ const char * valueblookup(struct texi *p, const char *buf, size_t sz, size_t *pos) { size_t start, end; char **ret; while (*pos < sz && isws(buf[*pos])) advance(p, buf, pos); if (*pos == sz || '{' != buf[*pos]) return(NULL); advance(p, buf, pos); for (start = end = *pos; end < sz; end++) if ('}' == buf[end]) break; advanceto(p, buf, pos, end); if (*pos < sz) { assert('}' == buf[*pos]); advance(p, buf, pos); } if (NULL == (ret = valuequery(p, buf, start, end))) return(NULL); return(*ret); } void valueadd(struct texi *p, char *key, char *val) { size_t i; assert(NULL != key); assert(NULL != val); for (i = 0; i < p->valsz; i++) if (0 == strcmp(p->vals[i].key, key)) break; if (i < p->valsz) { free(key); free(p->vals[i].value); p->vals[i].value = val; } else { /* FIXME: reallocarray() */ p->vals = realloc(p->vals, (p->valsz + 1) * sizeof(struct texivalue)); if (NULL == p->vals) texiabort(p, NULL); p->vals[p->valsz].key = key; p->vals[p->valsz].value = val; p->valsz++; } }