]>
git.cameronkatri.com Git - mandoc.git/blob - apropos_db.c
89b21fbb034fbd950349806aeba6952769589814
1 /* $Id: apropos_db.c,v 1.3 2011/11/13 11:10:27 schwarze Exp $ */
3 * Copyright (c) 2011 Kristaps Dzonsons <kristaps@bsd.lv>
4 * Copyright (c) 2011 Ingo Schwarze <schwarze@openbsd.org>
6 * Permission to use, copy, modify, and distribute this software for any
7 * purpose with or without fee is hereby granted, provided that the above
8 * copyright notice and this permission notice appear in all copies.
10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
32 #include "apropos_db.h"
47 static const struct type types
[] = {
66 static DB
*btree_open(void);
67 static int btree_read(const DBT
*, const struct mchars
*, char **);
68 static int exprexec(const struct expr
*, char *, int);
69 static DB
*index_open(void);
70 static int index_read(const DBT
*, const DBT
*,
71 const struct mchars
*, struct rec
*);
72 static void norm_string(const char *,
73 const struct mchars
*, char **);
74 static size_t norm_utf8(unsigned int, char[7]);
77 * Open the keyword mandoc-db database.
85 memset(&info
, 0, sizeof(BTREEINFO
));
88 db
= dbopen(MANDOC_DB
, O_RDONLY
, 0, DB_BTREE
, &info
);
96 * Read a keyword from the database and normalise it.
97 * Return 0 if the database is insane, else 1.
100 btree_read(const DBT
*v
, const struct mchars
*mc
, char **buf
)
103 /* Sanity: is the data NUL-terminated? */
106 if ('\0' != ((char *)v
->data
)[(int)v
->size
- 1])
109 norm_string((char *)v
->data
, mc
, buf
);
114 * Take a Unicode codepoint and produce its UTF-8 encoding.
115 * This isn't the best way to do this, but it works.
116 * The magic numbers are from the UTF-8 packaging.
117 * They're not as scary as they seem: read the UTF-8 spec for details.
120 norm_utf8(unsigned int cp
, char out
[7])
126 if (cp
<= 0x0000007F) {
129 } else if (cp
<= 0x000007FF) {
131 out
[0] = (cp
>> 6 & 31) | 192;
132 out
[1] = (cp
& 63) | 128;
133 } else if (cp
<= 0x0000FFFF) {
135 out
[0] = (cp
>> 12 & 15) | 224;
136 out
[1] = (cp
>> 6 & 63) | 128;
137 out
[2] = (cp
& 63) | 128;
138 } else if (cp
<= 0x001FFFFF) {
140 out
[0] = (cp
>> 18 & 7) | 240;
141 out
[1] = (cp
>> 12 & 63) | 128;
142 out
[2] = (cp
>> 6 & 63) | 128;
143 out
[3] = (cp
& 63) | 128;
144 } else if (cp
<= 0x03FFFFFF) {
146 out
[0] = (cp
>> 24 & 3) | 248;
147 out
[1] = (cp
>> 18 & 63) | 128;
148 out
[2] = (cp
>> 12 & 63) | 128;
149 out
[3] = (cp
>> 6 & 63) | 128;
150 out
[4] = (cp
& 63) | 128;
151 } else if (cp
<= 0x7FFFFFFF) {
153 out
[0] = (cp
>> 30 & 1) | 252;
154 out
[1] = (cp
>> 24 & 63) | 128;
155 out
[2] = (cp
>> 18 & 63) | 128;
156 out
[3] = (cp
>> 12 & 63) | 128;
157 out
[4] = (cp
>> 6 & 63) | 128;
158 out
[5] = (cp
& 63) | 128;
167 * Normalise strings from the index and database.
168 * These strings are escaped as defined by mandoc_char(7) along with
169 * other goop in mandoc.h (e.g., soft hyphens).
170 * This function normalises these into a nice UTF-8 string.
171 * Returns nothing; the normalised string is stored in *buf.
174 norm_string(const char *val
, const struct mchars
*mc
, char **buf
)
178 const char *seq
, *cpp
;
181 static const char res
[] = { '\\', '\t',
182 ASCII_NBRSP
, ASCII_HYPH
, '\0' };
184 /* Pre-allocate by the length of the input */
186 bsz
= strlen(val
) + 1;
187 *buf
= mandoc_realloc(*buf
, bsz
);
190 while ('\0' != *val
) {
192 * Halt on the first escape sequence.
193 * This also halts on the end of string, in which case
194 * we just copy, fallthrough, and exit the loop.
196 if ((sz
= strcspn(val
, res
)) > 0) {
197 memcpy(&(*buf
)[pos
], val
, sz
);
202 if (ASCII_HYPH
== *val
) {
206 } else if ('\t' == *val
|| ASCII_NBRSP
== *val
) {
210 } else if ('\\' != *val
)
213 /* Read past the slash. */
219 * Parse the escape sequence and see if it's a
220 * predefined character or special character.
223 esc
= mandoc_escape(&val
, &seq
, &len
);
224 if (ESCAPE_ERROR
== esc
)
228 * XXX - this just does UTF-8, but we need to know
229 * beforehand whether we should do text substitution.
233 case (ESCAPE_SPECIAL
):
234 if (0 != (u
= mchars_spec2cp(mc
, seq
, len
)))
242 * If we have a Unicode codepoint, try to convert that
243 * to a UTF-8 byte string.
247 if (0 == (sz
= norm_utf8(u
, utfbuf
)))
250 /* Copy the rendered glyph into the stream. */
255 *buf
= mandoc_realloc(*buf
, bsz
);
257 memcpy(&(*buf
)[pos
], cpp
, sz
);
265 * Open the filename-index mandoc-db database.
266 * Returns NULL if opening failed.
273 db
= dbopen(MANDOC_IDX
, O_RDONLY
, 0, DB_RECNO
, NULL
);
281 * Safely unpack from an index file record into the structure.
282 * Returns 1 if an entry was unpacked, 0 if the database is insane.
285 index_read(const DBT
*key
, const DBT
*val
,
286 const struct mchars
*mc
, struct rec
*rec
)
291 #define INDEX_BREAD(_dst) \
293 if (NULL == (np = memchr(cp, '\0', left))) \
295 norm_string(cp, mc, &(_dst)); \
296 left -= (np - cp) + 1; \
298 } while (/* CONSTCOND */ 0)
301 cp
= (char *)val
->data
;
303 rec
->rec
= *(recno_t
*)key
->data
;
305 INDEX_BREAD(rec
->file
);
306 INDEX_BREAD(rec
->cat
);
307 INDEX_BREAD(rec
->title
);
308 INDEX_BREAD(rec
->arch
);
309 INDEX_BREAD(rec
->desc
);
314 * Search the mandocdb database for the expression "expr".
315 * Filter the matches by the category and architecture in "opts".
316 * Call "res" with the results, of which there may be none.
319 apropos_search(const struct opts
*opts
, const struct expr
*expr
,
320 void *arg
, void (*res
)(struct rec
*, size_t, void *))
322 int i
, len
, root
, leaf
;
341 memset(&srec
, 0, sizeof(struct rec
));
343 /* XXX: error out with bad regexp? */
347 /* XXX: return fact that we've errored? */
349 if (NULL
== (btree
= btree_open()))
351 if (NULL
== (idx
= index_open()))
354 while (0 == (ch
= (*btree
->seq
)(btree
, &key
, &val
, R_NEXT
))) {
356 * Low-water mark for key and value.
357 * The key must have something in it, and the value must
358 * have the correct tags/recno mix.
360 if (key
.size
< 2 || 8 != val
.size
)
362 if ( ! btree_read(&key
, mc
, &buf
))
365 if ( ! exprexec(expr
, buf
, *(int *)val
.data
))
368 memcpy(&rec
, val
.data
+ 4, sizeof(recno_t
));
371 * Binary-tree scan for prior records, O(log n) while the tree
372 * stays balanced. Since a record number is unbounded, this has
373 * decent performance over a complex hash function.
376 for (leaf
= root
; leaf
>= 0; )
377 if (rec
> recs
[leaf
].rec
&& recs
[leaf
].rhs
>= 0)
378 leaf
= recs
[leaf
].rhs
;
379 else if (rec
< recs
[leaf
].rec
&& recs
[leaf
].lhs
>= 0)
380 leaf
= recs
[leaf
].lhs
;
384 if (leaf
>= 0 && recs
[leaf
].rec
== rec
)
388 * Now we actually extract the manpage's metadata from
389 * the index database.
393 key
.size
= sizeof(recno_t
);
395 if (0 != (*idx
->get
)(idx
, &key
, &val
, 0))
398 srec
.lhs
= srec
.rhs
= -1;
399 if ( ! index_read(&key
, &val
, mc
, &srec
))
402 if (opts
->cat
&& strcasecmp(opts
->cat
, srec
.cat
))
404 if (opts
->arch
&& strcasecmp(opts
->arch
, srec
.arch
))
407 recs
= mandoc_realloc
408 (recs
, (len
+ 1) * sizeof(struct rec
));
410 memcpy(&recs
[len
], &srec
, sizeof(struct rec
));
412 /* Append to our tree. */
415 if (rec
> recs
[leaf
].rec
)
416 recs
[leaf
].rhs
= len
;
418 recs
[leaf
].lhs
= len
;
422 memset(&srec
, 0, sizeof(struct rec
));
427 (*res
)(recs
, len
, arg
);
429 /* XXX: else? corrupt database error? */
431 for (i
= 0; i
< len
; i
++) {
448 (*btree
->close
)(btree
);
457 exprcomp(int argc
, char *argv
[])
468 * Choose regex or substring match.
471 if (NULL
== (e
.v
= strpbrk(*argv
, "=~"))) {
475 e
.regex
= '~' == *e
.v
;
480 * Determine the record types to search for.
486 while (NULL
!= (key
= strsep(argv
, ","))) {
487 if ('i' == key
[0] && '\0' == key
[1]) {
492 while (types
[i
].mask
&&
493 strcmp(types
[i
].name
, key
))
495 e
.mask
|= types
[i
].mask
;
499 e
.mask
= TYPE_Nm
| TYPE_Nd
;
502 regcomp(&e
.re
, e
.v
, REG_EXTENDED
| REG_NOSUB
| icase
))
505 e
.v
= mandoc_strdup(e
.v
);
507 p
= mandoc_calloc(1, sizeof(struct expr
));
508 memcpy(p
, &e
, sizeof(struct expr
));
513 exprfree(struct expr
*p
)
527 exprexec(const struct expr
*p
, char *cp
, int mask
)
530 if ( ! (mask
& p
->mask
))
534 return(0 == regexec(&p
->re
, cp
, 0, NULL
, 0));
536 return(NULL
!= strcasestr(cp
, p
->v
));