Annotation of mandoc/db.c, Revision 1.1
1.1 ! kristaps 1: /* $Id: html.c,v 1.2 2011/11/04 15:53:19 kristaps Exp $ */
! 2: /*
! 3: * Copyright (c) 2011 Kristaps Dzonsons <kristaps@bsd.lv>
! 4: *
! 5: * Permission to use, copy, modify, and distribute this software for any
! 6: * purpose with or without fee is hereby granted, provided that the above
! 7: * copyright notice and this permission notice appear in all copies.
! 8: *
! 9: * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
! 10: * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
! 11: * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
! 12: * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
! 13: * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
! 14: * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
! 15: * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
! 16: */
! 17: #include <assert.h>
! 18: #include <fcntl.h>
! 19: #include <regex.h>
! 20: #include <stdarg.h>
! 21: #include <stdlib.h>
! 22: #include <string.h>
! 23:
! 24: #ifdef __linux__
! 25: # include <db_185.h>
! 26: #else
! 27: # include <db.h>
! 28: #endif
! 29:
! 30: #include "apropos.h"
! 31: #include "mandoc.h"
! 32:
! 33: static DB *btree_open(void);
! 34: static int btree_read(const DBT *, const struct mchars *, char **);
! 35: static DB *index_open(void);
! 36: static int index_read(const DBT *, const DBT *,
! 37: const struct mchars *, struct rec *);
! 38: static void norm_string(const char *,
! 39: const struct mchars *, char **);
! 40: static size_t norm_utf8(unsigned int, char[7]);
! 41:
! 42: /*
! 43: * Open the keyword mandoc-db database.
! 44: */
! 45: static DB *
! 46: btree_open(void)
! 47: {
! 48: BTREEINFO info;
! 49: DB *db;
! 50:
! 51: memset(&info, 0, sizeof(BTREEINFO));
! 52: info.flags = R_DUP;
! 53:
! 54: db = dbopen("mandoc.db", O_RDONLY, 0, DB_BTREE, &info);
! 55: if (NULL != db)
! 56: return(db);
! 57:
! 58: return(NULL);
! 59: }
! 60:
! 61: /*
! 62: * Read a keyword from the database and normalise it.
! 63: * Return 0 if the database is insane, else 1.
! 64: */
! 65: static int
! 66: btree_read(const DBT *v, const struct mchars *mc, char **buf)
! 67: {
! 68:
! 69: /* Sanity: are we nil-terminated? */
! 70:
! 71: assert(v->size > 0);
! 72: if ('\0' != ((char *)v->data)[(int)v->size - 1])
! 73: return(0);
! 74:
! 75: norm_string((char *)v->data, mc, buf);
! 76: return(1);
! 77: }
! 78:
/*
 * Encode the codepoint "cp" into "out" using the original (up to
 * six byte) UTF-8 packaging and nil-terminate the result.
 *
 * Returns the number of encoded bytes (1-6, not counting the
 * terminator), or 0 if "cp" is above 0x7FFFFFFF, in which case "out"
 * is left untouched.
 */
static size_t
norm_utf8(unsigned int cp, char out[7])
{
	size_t		 nbytes, i;
	unsigned int	 lead, mask;

	/* Plain ASCII is stored verbatim. */

	if (cp <= 0x0000007F) {
		out[0] = (char)cp;
		out[1] = '\0';
		return(1);
	}

	/*
	 * Pick the sequence length, the marker bits of the leading
	 * byte and the mask for the payload bits it carries.
	 */

	if (cp <= 0x000007FF) {
		nbytes = 2;
		lead = 192;
		mask = 31;
	} else if (cp <= 0x0000FFFF) {
		nbytes = 3;
		lead = 224;
		mask = 15;
	} else if (cp <= 0x001FFFFF) {
		nbytes = 4;
		lead = 240;
		mask = 7;
	} else if (cp <= 0x03FFFFFF) {
		nbytes = 5;
		lead = 248;
		mask = 3;
	} else if (cp <= 0x7FFFFFFF) {
		nbytes = 6;
		lead = 252;
		mask = 1;
	} else
		return(0);

	/*
	 * Continuation bytes carry six payload bits each; fill them
	 * in from the tail so the remaining high bits end up in the
	 * leading byte.
	 */

	for (i = nbytes - 1; i > 0; i--) {
		out[i] = (char)((cp & 63) | 128);
		cp >>= 6;
	}

	out[0] = (char)((cp & mask) | lead);
	out[nbytes] = '\0';
	return(nbytes);
}
! 131:
! 132: /*
! 133: * Normalise strings from the index and database.
! 134: * These strings are escaped as defined by mandoc_char(7) along with
! 135: * other goop in mandoc.h (e.g., soft hyphens).
! 136: * This function normalises these into a nice UTF-8 string.
! 137: * Returns 0 if the database is fucked.
! 138: */
! 139: static void
! 140: norm_string(const char *val, const struct mchars *mc, char **buf)
! 141: {
! 142: size_t sz, bsz;
! 143: char utfbuf[7];
! 144: const char *seq, *cpp;
! 145: int len, u, pos;
! 146: enum mandoc_esc esc;
! 147: static const char res[] = { '\\', '\t',
! 148: ASCII_NBRSP, ASCII_HYPH, '\0' };
! 149:
! 150: /* Pre-allocate by the length of the input */
! 151:
! 152: bsz = strlen(val) + 1;
! 153: *buf = mandoc_realloc(*buf, bsz);
! 154: pos = 0;
! 155:
! 156: while ('\0' != *val) {
! 157: /*
! 158: * Halt on the first escape sequence.
! 159: * This also halts on the end of string, in which case
! 160: * we just copy, fallthrough, and exit the loop.
! 161: */
! 162: if ((sz = strcspn(val, res)) > 0) {
! 163: memcpy(&(*buf)[pos], val, sz);
! 164: pos += (int)sz;
! 165: val += (int)sz;
! 166: }
! 167:
! 168: if (ASCII_HYPH == *val) {
! 169: (*buf)[pos++] = '-';
! 170: val++;
! 171: continue;
! 172: } else if ('\t' == *val || ASCII_NBRSP == *val) {
! 173: (*buf)[pos++] = ' ';
! 174: val++;
! 175: continue;
! 176: } else if ('\\' != *val)
! 177: break;
! 178:
! 179: /* Read past the slash. */
! 180:
! 181: val++;
! 182: u = 0;
! 183:
! 184: /*
! 185: * Parse the escape sequence and see if it's a
! 186: * predefined character or special character.
! 187: */
! 188:
! 189: esc = mandoc_escape(&val, &seq, &len);
! 190: if (ESCAPE_ERROR == esc)
! 191: break;
! 192:
! 193: /*
! 194: * XXX - this just does UTF-8, but we need to know
! 195: * beforehand whether we should do text substitution.
! 196: */
! 197:
! 198: switch (esc) {
! 199: case (ESCAPE_SPECIAL):
! 200: if (0 != (u = mchars_spec2cp(mc, seq, len)))
! 201: break;
! 202: /* FALLTHROUGH */
! 203: default:
! 204: continue;
! 205: }
! 206:
! 207: /*
! 208: * If we have a Unicode codepoint, try to convert that
! 209: * to a UTF-8 byte string.
! 210: */
! 211:
! 212: cpp = utfbuf;
! 213: if (0 == (sz = norm_utf8(u, utfbuf)))
! 214: continue;
! 215:
! 216: /* Copy the rendered glyph into the stream. */
! 217:
! 218: sz = strlen(cpp);
! 219: bsz += sz;
! 220:
! 221: *buf = mandoc_realloc(*buf, bsz);
! 222:
! 223: memcpy(&(*buf)[pos], cpp, sz);
! 224: pos += (int)sz;
! 225: }
! 226:
! 227: (*buf)[pos] = '\0';
! 228: }
! 229:
! 230: /*
! 231: * Open the filename-index mandoc-db database.
! 232: * Returns NULL if opening failed.
! 233: */
! 234: static DB *
! 235: index_open(void)
! 236: {
! 237: DB *db;
! 238:
! 239: db = dbopen("mandoc.index", O_RDONLY, 0, DB_RECNO, NULL);
! 240: if (NULL != db)
! 241: return(db);
! 242:
! 243: return(NULL);
! 244: }
! 245:
! 246: /*
! 247: * Safely unpack from an index file record into the structure.
! 248: * Returns 1 if an entry was unpacked, 0 if the database is insane.
! 249: */
! 250: static int
! 251: index_read(const DBT *key, const DBT *val,
! 252: const struct mchars *mc, struct rec *rec)
! 253: {
! 254: size_t left;
! 255: char *np, *cp;
! 256:
! 257: #define INDEX_BREAD(_dst) \
! 258: do { \
! 259: if (NULL == (np = memchr(cp, '\0', left))) \
! 260: return(0); \
! 261: norm_string(cp, mc, &(_dst)); \
! 262: left -= (np - cp) + 1; \
! 263: cp = np + 1; \
! 264: } while (/* CONSTCOND */ 0)
! 265:
! 266: left = val->size;
! 267: cp = (char *)val->data;
! 268:
! 269: rec->rec = *(recno_t *)key->data;
! 270:
! 271: INDEX_BREAD(rec->file);
! 272: INDEX_BREAD(rec->cat);
! 273: INDEX_BREAD(rec->title);
! 274: INDEX_BREAD(rec->arch);
! 275: INDEX_BREAD(rec->desc);
! 276: return(1);
! 277: }
! 278:
! 279: /*
! 280: * Search the mandocdb database for the regular expression "q".
! 281: * Filter out by "opts".
! 282: * Call "res" with the results, which may be zero.
! 283: */
! 284: void
! 285: apropos_search(const struct opts *opts, const char *q, void *arg,
! 286: void (*res)(struct rec *, size_t, void *))
! 287: {
! 288: int i, len, root, leaf;
! 289: regex_t reg;
! 290: DBT key, val;
! 291: DB *btree, *idx;
! 292: struct mchars *mc;
! 293: int ch;
! 294: char *buf;
! 295: recno_t rec;
! 296: struct rec *recs;
! 297: struct rec srec;
! 298:
! 299: root = -1;
! 300: leaf = -1;
! 301: btree = NULL;
! 302: idx = NULL;
! 303: mc = NULL;
! 304: buf = NULL;
! 305: recs = NULL;
! 306: len = 0;
! 307:
! 308: memset(&srec, 0, sizeof(struct rec));
! 309:
! 310: if (NULL != q && '\0' == *q)
! 311: q = NULL;
! 312:
! 313: ch = REG_EXTENDED | REG_NOSUB |
! 314: (OPTS_INSENS & opts->flags ? REG_ICASE : 0);
! 315:
! 316: /* XXX: error out with bad regexp? */
! 317:
! 318: if (NULL == q || regcomp(®, q, ch)) {
! 319: (*res)(NULL, 0, arg);
! 320: return;
! 321: }
! 322:
! 323: mc = mchars_alloc();
! 324:
! 325: /* XXX: return fact that we've errored? */
! 326:
! 327: if (NULL == (btree = btree_open()))
! 328: goto out;
! 329: if (NULL == (idx = index_open()))
! 330: goto out;
! 331:
! 332: while (0 == (ch = (*btree->seq)(btree, &key, &val, R_NEXT))) {
! 333: /*
! 334: * Low-water mark for key and value.
! 335: * The key must have something in it, and the value must
! 336: * have the correct tags/recno mix.
! 337: */
! 338: if (key.size < 2 || 8 != val.size)
! 339: break;
! 340:
! 341: if ( ! (*(int32_t *)val.data & opts->types))
! 342: continue;
! 343:
! 344: if ( ! btree_read(&key, mc, &buf))
! 345: break;
! 346: if (regexec(®, buf, 0, NULL, 0))
! 347: continue;
! 348:
! 349: memcpy(&rec, val.data + 4, sizeof(recno_t));
! 350:
! 351: /*
! 352: * O(log n) scan for prior records. Since a record
! 353: * number is unbounded, this has decent performance over
! 354: * a complex hash function.
! 355: */
! 356:
! 357: for (leaf = root; leaf >= 0; )
! 358: if (rec > recs[leaf].rec && recs[leaf].rhs >= 0)
! 359: leaf = recs[leaf].rhs;
! 360: else if (rec < recs[leaf].rec && recs[leaf].lhs >= 0)
! 361: leaf = recs[leaf].lhs;
! 362: else
! 363: break;
! 364:
! 365: if (leaf >= 0 && recs[leaf].rec == rec)
! 366: continue;
! 367:
! 368: /*
! 369: * Now we actually extract the manpage's metadata from
! 370: * the index database.
! 371: */
! 372:
! 373: key.data = &rec;
! 374: key.size = sizeof(recno_t);
! 375:
! 376: if (0 != (*idx->get)(idx, &key, &val, 0))
! 377: break;
! 378:
! 379: srec.lhs = srec.rhs = -1;
! 380: if ( ! index_read(&key, &val, mc, &srec))
! 381: break;
! 382:
! 383: if (opts->cat && strcasecmp(opts->cat, srec.cat))
! 384: continue;
! 385: if (opts->arch && strcasecmp(opts->arch, srec.arch))
! 386: continue;
! 387:
! 388: recs = mandoc_realloc
! 389: (recs, (len + 1) * sizeof(struct rec));
! 390:
! 391: memcpy(&recs[len], &srec, sizeof(struct rec));
! 392:
! 393: /* Append to our tree. */
! 394:
! 395: if (leaf >= 0) {
! 396: if (rec > recs[leaf].rec)
! 397: recs[leaf].rhs = len;
! 398: else
! 399: recs[leaf].lhs = len;
! 400: } else
! 401: root = len;
! 402:
! 403: memset(&srec, 0, sizeof(struct rec));
! 404: len++;
! 405: }
! 406:
! 407: if (1 == ch)
! 408: (*res)(recs, len, arg);
! 409:
! 410: /* XXX: else? corrupt database error? */
! 411: out:
! 412: for (i = 0; i < len; i++) {
! 413: free(recs[i].file);
! 414: free(recs[i].cat);
! 415: free(recs[i].title);
! 416: free(recs[i].arch);
! 417: free(recs[i].desc);
! 418: }
! 419:
! 420: free(srec.file);
! 421: free(srec.cat);
! 422: free(srec.title);
! 423: free(srec.arch);
! 424: free(srec.desc);
! 425:
! 426: if (mc)
! 427: mchars_free(mc);
! 428: if (btree)
! 429: (*btree->close)(btree);
! 430: if (idx)
! 431: (*idx->close)(idx);
! 432:
! 433: free(buf);
! 434: free(recs);
! 435: regfree(®);
! 436: }
CVSweb