]> git.cameronkatri.com Git - mandoc.git/blob - apropos_db.c
89b21fbb034fbd950349806aeba6952769589814
[mandoc.git] / apropos_db.c
1 /* $Id: apropos_db.c,v 1.3 2011/11/13 11:10:27 schwarze Exp $ */
2 /*
3 * Copyright (c) 2011 Kristaps Dzonsons <kristaps@bsd.lv>
4 * Copyright (c) 2011 Ingo Schwarze <schwarze@openbsd.org>
5 *
6 * Permission to use, copy, modify, and distribute this software for any
7 * purpose with or without fee is hereby granted, provided that the above
8 * copyright notice and this permission notice appear in all copies.
9 *
10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17 */
18 #include <assert.h>
19 #include <fcntl.h>
20 #include <regex.h>
21 #include <stdarg.h>
22 #include <stdlib.h>
23 #include <string.h>
24
25 #ifdef __linux__
26 # include <db_185.h>
27 #else
28 # include <db.h>
29 #endif
30
31 #include "mandocdb.h"
32 #include "apropos_db.h"
33 #include "mandoc.h"
34
35 struct expr {
36 int regex;
37 int mask;
38 char *v;
39 regex_t re;
40 };
41
42 struct type {
43 int mask;
44 const char *name;
45 };
46
47 static const struct type types[] = {
48 { TYPE_An, "An" },
49 { TYPE_Cd, "Cd" },
50 { TYPE_Er, "Er" },
51 { TYPE_Ev, "Ev" },
52 { TYPE_Fn, "Fn" },
53 { TYPE_Fn, "Fo" },
54 { TYPE_In, "In" },
55 { TYPE_Nd, "Nd" },
56 { TYPE_Nm, "Nm" },
57 { TYPE_Pa, "Pa" },
58 { TYPE_St, "St" },
59 { TYPE_Va, "Va" },
60 { TYPE_Va, "Vt" },
61 { TYPE_Xr, "Xr" },
62 { INT_MAX, "any" },
63 { 0, NULL }
64 };
65
66 static DB *btree_open(void);
67 static int btree_read(const DBT *, const struct mchars *, char **);
68 static int exprexec(const struct expr *, char *, int);
69 static DB *index_open(void);
70 static int index_read(const DBT *, const DBT *,
71 const struct mchars *, struct rec *);
72 static void norm_string(const char *,
73 const struct mchars *, char **);
74 static size_t norm_utf8(unsigned int, char[7]);
75
76 /*
77 * Open the keyword mandoc-db database.
78 */
79 static DB *
80 btree_open(void)
81 {
82 BTREEINFO info;
83 DB *db;
84
85 memset(&info, 0, sizeof(BTREEINFO));
86 info.flags = R_DUP;
87
88 db = dbopen(MANDOC_DB, O_RDONLY, 0, DB_BTREE, &info);
89 if (NULL != db)
90 return(db);
91
92 return(NULL);
93 }
94
95 /*
96 * Read a keyword from the database and normalise it.
97 * Return 0 if the database is insane, else 1.
98 */
99 static int
100 btree_read(const DBT *v, const struct mchars *mc, char **buf)
101 {
102
103 /* Sanity: are we nil-terminated? */
104
105 assert(v->size > 0);
106 if ('\0' != ((char *)v->data)[(int)v->size - 1])
107 return(0);
108
109 norm_string((char *)v->data, mc, buf);
110 return(1);
111 }
112
113 /*
114 * Take a Unicode codepoint and produce its UTF-8 encoding.
115 * This isn't the best way to do this, but it works.
116 * The magic numbers are from the UTF-8 packaging.
117 * They're not as scary as they seem: read the UTF-8 spec for details.
118 */
119 static size_t
120 norm_utf8(unsigned int cp, char out[7])
121 {
122 size_t rc;
123
124 rc = 0;
125
126 if (cp <= 0x0000007F) {
127 rc = 1;
128 out[0] = (char)cp;
129 } else if (cp <= 0x000007FF) {
130 rc = 2;
131 out[0] = (cp >> 6 & 31) | 192;
132 out[1] = (cp & 63) | 128;
133 } else if (cp <= 0x0000FFFF) {
134 rc = 3;
135 out[0] = (cp >> 12 & 15) | 224;
136 out[1] = (cp >> 6 & 63) | 128;
137 out[2] = (cp & 63) | 128;
138 } else if (cp <= 0x001FFFFF) {
139 rc = 4;
140 out[0] = (cp >> 18 & 7) | 240;
141 out[1] = (cp >> 12 & 63) | 128;
142 out[2] = (cp >> 6 & 63) | 128;
143 out[3] = (cp & 63) | 128;
144 } else if (cp <= 0x03FFFFFF) {
145 rc = 5;
146 out[0] = (cp >> 24 & 3) | 248;
147 out[1] = (cp >> 18 & 63) | 128;
148 out[2] = (cp >> 12 & 63) | 128;
149 out[3] = (cp >> 6 & 63) | 128;
150 out[4] = (cp & 63) | 128;
151 } else if (cp <= 0x7FFFFFFF) {
152 rc = 6;
153 out[0] = (cp >> 30 & 1) | 252;
154 out[1] = (cp >> 24 & 63) | 128;
155 out[2] = (cp >> 18 & 63) | 128;
156 out[3] = (cp >> 12 & 63) | 128;
157 out[4] = (cp >> 6 & 63) | 128;
158 out[5] = (cp & 63) | 128;
159 } else
160 return(0);
161
162 out[rc] = '\0';
163 return(rc);
164 }
165
166 /*
167 * Normalise strings from the index and database.
168 * These strings are escaped as defined by mandoc_char(7) along with
169 * other goop in mandoc.h (e.g., soft hyphens).
170 * This function normalises these into a nice UTF-8 string.
171 * Returns 0 if the database is fucked.
172 */
173 static void
174 norm_string(const char *val, const struct mchars *mc, char **buf)
175 {
176 size_t sz, bsz;
177 char utfbuf[7];
178 const char *seq, *cpp;
179 int len, u, pos;
180 enum mandoc_esc esc;
181 static const char res[] = { '\\', '\t',
182 ASCII_NBRSP, ASCII_HYPH, '\0' };
183
184 /* Pre-allocate by the length of the input */
185
186 bsz = strlen(val) + 1;
187 *buf = mandoc_realloc(*buf, bsz);
188 pos = 0;
189
190 while ('\0' != *val) {
191 /*
192 * Halt on the first escape sequence.
193 * This also halts on the end of string, in which case
194 * we just copy, fallthrough, and exit the loop.
195 */
196 if ((sz = strcspn(val, res)) > 0) {
197 memcpy(&(*buf)[pos], val, sz);
198 pos += (int)sz;
199 val += (int)sz;
200 }
201
202 if (ASCII_HYPH == *val) {
203 (*buf)[pos++] = '-';
204 val++;
205 continue;
206 } else if ('\t' == *val || ASCII_NBRSP == *val) {
207 (*buf)[pos++] = ' ';
208 val++;
209 continue;
210 } else if ('\\' != *val)
211 break;
212
213 /* Read past the slash. */
214
215 val++;
216 u = 0;
217
218 /*
219 * Parse the escape sequence and see if it's a
220 * predefined character or special character.
221 */
222
223 esc = mandoc_escape(&val, &seq, &len);
224 if (ESCAPE_ERROR == esc)
225 break;
226
227 /*
228 * XXX - this just does UTF-8, but we need to know
229 * beforehand whether we should do text substitution.
230 */
231
232 switch (esc) {
233 case (ESCAPE_SPECIAL):
234 if (0 != (u = mchars_spec2cp(mc, seq, len)))
235 break;
236 /* FALLTHROUGH */
237 default:
238 continue;
239 }
240
241 /*
242 * If we have a Unicode codepoint, try to convert that
243 * to a UTF-8 byte string.
244 */
245
246 cpp = utfbuf;
247 if (0 == (sz = norm_utf8(u, utfbuf)))
248 continue;
249
250 /* Copy the rendered glyph into the stream. */
251
252 sz = strlen(cpp);
253 bsz += sz;
254
255 *buf = mandoc_realloc(*buf, bsz);
256
257 memcpy(&(*buf)[pos], cpp, sz);
258 pos += (int)sz;
259 }
260
261 (*buf)[pos] = '\0';
262 }
263
264 /*
265 * Open the filename-index mandoc-db database.
266 * Returns NULL if opening failed.
267 */
268 static DB *
269 index_open(void)
270 {
271 DB *db;
272
273 db = dbopen(MANDOC_IDX, O_RDONLY, 0, DB_RECNO, NULL);
274 if (NULL != db)
275 return(db);
276
277 return(NULL);
278 }
279
280 /*
281 * Safely unpack from an index file record into the structure.
282 * Returns 1 if an entry was unpacked, 0 if the database is insane.
283 */
284 static int
285 index_read(const DBT *key, const DBT *val,
286 const struct mchars *mc, struct rec *rec)
287 {
288 size_t left;
289 char *np, *cp;
290
291 #define INDEX_BREAD(_dst) \
292 do { \
293 if (NULL == (np = memchr(cp, '\0', left))) \
294 return(0); \
295 norm_string(cp, mc, &(_dst)); \
296 left -= (np - cp) + 1; \
297 cp = np + 1; \
298 } while (/* CONSTCOND */ 0)
299
300 left = val->size;
301 cp = (char *)val->data;
302
303 rec->rec = *(recno_t *)key->data;
304
305 INDEX_BREAD(rec->file);
306 INDEX_BREAD(rec->cat);
307 INDEX_BREAD(rec->title);
308 INDEX_BREAD(rec->arch);
309 INDEX_BREAD(rec->desc);
310 return(1);
311 }
312
313 /*
314 * Search the mandocdb database for the expression "expr".
315 * Filter out by "opts".
316 * Call "res" with the results, which may be zero.
317 */
318 void
319 apropos_search(const struct opts *opts, const struct expr *expr,
320 void *arg, void (*res)(struct rec *, size_t, void *))
321 {
322 int i, len, root, leaf;
323 DBT key, val;
324 DB *btree, *idx;
325 struct mchars *mc;
326 int ch;
327 char *buf;
328 recno_t rec;
329 struct rec *recs;
330 struct rec srec;
331
332 root = -1;
333 leaf = -1;
334 btree = NULL;
335 idx = NULL;
336 mc = NULL;
337 buf = NULL;
338 recs = NULL;
339 len = 0;
340
341 memset(&srec, 0, sizeof(struct rec));
342
343 /* XXX: error out with bad regexp? */
344
345 mc = mchars_alloc();
346
347 /* XXX: return fact that we've errored? */
348
349 if (NULL == (btree = btree_open()))
350 goto out;
351 if (NULL == (idx = index_open()))
352 goto out;
353
354 while (0 == (ch = (*btree->seq)(btree, &key, &val, R_NEXT))) {
355 /*
356 * Low-water mark for key and value.
357 * The key must have something in it, and the value must
358 * have the correct tags/recno mix.
359 */
360 if (key.size < 2 || 8 != val.size)
361 break;
362 if ( ! btree_read(&key, mc, &buf))
363 break;
364
365 if ( ! exprexec(expr, buf, *(int *)val.data))
366 continue;
367
368 memcpy(&rec, val.data + 4, sizeof(recno_t));
369
370 /*
371 * O(log n) scan for prior records. Since a record
372 * number is unbounded, this has decent performance over
373 * a complex hash function.
374 */
375
376 for (leaf = root; leaf >= 0; )
377 if (rec > recs[leaf].rec && recs[leaf].rhs >= 0)
378 leaf = recs[leaf].rhs;
379 else if (rec < recs[leaf].rec && recs[leaf].lhs >= 0)
380 leaf = recs[leaf].lhs;
381 else
382 break;
383
384 if (leaf >= 0 && recs[leaf].rec == rec)
385 continue;
386
387 /*
388 * Now we actually extract the manpage's metadata from
389 * the index database.
390 */
391
392 key.data = &rec;
393 key.size = sizeof(recno_t);
394
395 if (0 != (*idx->get)(idx, &key, &val, 0))
396 break;
397
398 srec.lhs = srec.rhs = -1;
399 if ( ! index_read(&key, &val, mc, &srec))
400 break;
401
402 if (opts->cat && strcasecmp(opts->cat, srec.cat))
403 continue;
404 if (opts->arch && strcasecmp(opts->arch, srec.arch))
405 continue;
406
407 recs = mandoc_realloc
408 (recs, (len + 1) * sizeof(struct rec));
409
410 memcpy(&recs[len], &srec, sizeof(struct rec));
411
412 /* Append to our tree. */
413
414 if (leaf >= 0) {
415 if (rec > recs[leaf].rec)
416 recs[leaf].rhs = len;
417 else
418 recs[leaf].lhs = len;
419 } else
420 root = len;
421
422 memset(&srec, 0, sizeof(struct rec));
423 len++;
424 }
425
426 if (1 == ch)
427 (*res)(recs, len, arg);
428
429 /* XXX: else? corrupt database error? */
430 out:
431 for (i = 0; i < len; i++) {
432 free(recs[i].file);
433 free(recs[i].cat);
434 free(recs[i].title);
435 free(recs[i].arch);
436 free(recs[i].desc);
437 }
438
439 free(srec.file);
440 free(srec.cat);
441 free(srec.title);
442 free(srec.arch);
443 free(srec.desc);
444
445 if (mc)
446 mchars_free(mc);
447 if (btree)
448 (*btree->close)(btree);
449 if (idx)
450 (*idx->close)(idx);
451
452 free(buf);
453 free(recs);
454 }
455
456 struct expr *
457 exprcomp(int argc, char *argv[])
458 {
459 struct expr *p;
460 struct expr e;
461 char *key;
462 int i, icase;
463
464 if (0 >= argc)
465 return(NULL);
466
467 /*
468 * Choose regex or substring match.
469 */
470
471 if (NULL == (e.v = strpbrk(*argv, "=~"))) {
472 e.regex = 0;
473 e.v = *argv;
474 } else {
475 e.regex = '~' == *e.v;
476 *e.v++ = '\0';
477 }
478
479 /*
480 * Determine the record types to search for.
481 */
482
483 icase = 0;
484 e.mask = 0;
485 if (*argv < e.v) {
486 while (NULL != (key = strsep(argv, ","))) {
487 if ('i' == key[0] && '\0' == key[1]) {
488 icase = REG_ICASE;
489 continue;
490 }
491 i = 0;
492 while (types[i].mask &&
493 strcmp(types[i].name, key))
494 i++;
495 e.mask |= types[i].mask;
496 }
497 }
498 if (0 == e.mask)
499 e.mask = TYPE_Nm | TYPE_Nd;
500
501 if (e.regex &&
502 regcomp(&e.re, e.v, REG_EXTENDED | REG_NOSUB | icase))
503 return(NULL);
504
505 e.v = mandoc_strdup(e.v);
506
507 p = mandoc_calloc(1, sizeof(struct expr));
508 memcpy(p, &e, sizeof(struct expr));
509 return(p);
510 }
511
512 void
513 exprfree(struct expr *p)
514 {
515
516 if (NULL == p)
517 return;
518
519 if (p->regex)
520 regfree(&p->re);
521
522 free(p->v);
523 free(p);
524 }
525
526 static int
527 exprexec(const struct expr *p, char *cp, int mask)
528 {
529
530 if ( ! (mask & p->mask))
531 return(0);
532
533 if (p->regex)
534 return(0 == regexec(&p->re, cp, 0, NULL, 0));
535 else
536 return(NULL != strcasestr(cp, p->v));
537 }