]>
git.cameronkatri.com Git - mandoc.git/blob - apropos_db.c
0ac526241e812f1cb4b1bf2f98bd28d134f2816f
1 /* $Id: apropos_db.c,v 1.9 2011/11/20 15:45:37 kristaps Exp $ */
3 * Copyright (c) 2011 Kristaps Dzonsons <kristaps@bsd.lv>
4 * Copyright (c) 2011 Ingo Schwarze <schwarze@openbsd.org>
6 * Permission to use, copy, modify, and distribute this software for any
7 * purpose with or without fee is hereby granted, provided that the above
8 * copyright notice and this permission notice appear in all copies.
10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
34 #include "apropos_db.h"
38 struct res res
; /* resulting record info */
40 * Maintain a binary tree for checking the uniqueness of `rec'
41 * when adding elements to the results array.
42 * Since the results array is dynamic, use offset in the array
43 * instead of a pointer to the structure.
47 int matched
; /* expression is true */
48 int *matches
; /* partial truth evaluations */
52 int regex
; /* is regex? */
53 int index
; /* index in match array */
54 uint64_t mask
; /* type-mask */
55 int cs
; /* is case-sensitive? */
56 int and; /* is rhs of logical AND? */
57 char *v
; /* search value */
58 regex_t re
; /* compiled re, if regex */
59 struct expr
*next
; /* next in sequence */
69 struct rec
*node
; /* record array for dir tree */
70 int len
; /* length of record array */
73 static const struct type types
[] = {
117 static DB
*btree_open(void);
118 static int btree_read(const DBT
*,
119 const struct mchars
*, char **);
120 static int expreval(const struct expr
*, int *);
121 static void exprexec(const struct expr
*,
122 const char *, uint64_t, struct rec
*);
123 static int exprmark(const struct expr
*,
124 const char *, uint64_t, int *);
125 static struct expr
*exprexpr(int, char *[], int *, int *, size_t *);
126 static struct expr
*exprterm(char *, int);
127 static DB
*index_open(void);
128 static int index_read(const DBT
*, const DBT
*,
129 const struct mchars
*, struct rec
*);
130 static void norm_string(const char *,
131 const struct mchars
*, char **);
132 static size_t norm_utf8(unsigned int, char[7]);
133 static void recfree(struct rec
*);
134 static int single_search(struct rectree
*, const struct opts
*,
135 const struct expr
*, size_t terms
,
139 * Open the keyword mandoc-db database.
147 memset(&info
, 0, sizeof(BTREEINFO
));
150 db
= dbopen(MANDOC_DB
, O_RDONLY
, 0, DB_BTREE
, &info
);
158 * Read a keyword from the database and normalise it.
159 * Return 0 if the database is insane, else 1.
162 btree_read(const DBT
*v
, const struct mchars
*mc
, char **buf
)
165 /* Sanity: are we nil-terminated? */
169 if ('\0' != ((char *)v
->data
)[(int)v
->size
- 1])
172 norm_string((char *)v
->data
, mc
, buf
);
177 * Take a Unicode codepoint and produce its UTF-8 encoding.
178 * This isn't the best way to do this, but it works.
179 * The magic numbers are from the UTF-8 packaging.
180 * They're not as scary as they seem: read the UTF-8 spec for details.
183 norm_utf8(unsigned int cp
, char out
[7])
189 if (cp
<= 0x0000007F) {
192 } else if (cp
<= 0x000007FF) {
194 out
[0] = (cp
>> 6 & 31) | 192;
195 out
[1] = (cp
& 63) | 128;
196 } else if (cp
<= 0x0000FFFF) {
198 out
[0] = (cp
>> 12 & 15) | 224;
199 out
[1] = (cp
>> 6 & 63) | 128;
200 out
[2] = (cp
& 63) | 128;
201 } else if (cp
<= 0x001FFFFF) {
203 out
[0] = (cp
>> 18 & 7) | 240;
204 out
[1] = (cp
>> 12 & 63) | 128;
205 out
[2] = (cp
>> 6 & 63) | 128;
206 out
[3] = (cp
& 63) | 128;
207 } else if (cp
<= 0x03FFFFFF) {
209 out
[0] = (cp
>> 24 & 3) | 248;
210 out
[1] = (cp
>> 18 & 63) | 128;
211 out
[2] = (cp
>> 12 & 63) | 128;
212 out
[3] = (cp
>> 6 & 63) | 128;
213 out
[4] = (cp
& 63) | 128;
214 } else if (cp
<= 0x7FFFFFFF) {
216 out
[0] = (cp
>> 30 & 1) | 252;
217 out
[1] = (cp
>> 24 & 63) | 128;
218 out
[2] = (cp
>> 18 & 63) | 128;
219 out
[3] = (cp
>> 12 & 63) | 128;
220 out
[4] = (cp
>> 6 & 63) | 128;
221 out
[5] = (cp
& 63) | 128;
230 * Normalise strings from the index and database.
231 * These strings are escaped as defined by mandoc_char(7) along with
232 * other goop in mandoc.h (e.g., soft hyphens).
233 * This function normalises these into a nice UTF-8 string.
234 * Returns 0 if the database is fucked.
237 norm_string(const char *val
, const struct mchars
*mc
, char **buf
)
241 const char *seq
, *cpp
;
244 static const char res
[] = { '\\', '\t',
245 ASCII_NBRSP
, ASCII_HYPH
, '\0' };
247 /* Pre-allocate by the length of the input */
249 bsz
= strlen(val
) + 1;
250 *buf
= mandoc_realloc(*buf
, bsz
);
253 while ('\0' != *val
) {
255 * Halt on the first escape sequence.
256 * This also halts on the end of string, in which case
257 * we just copy, fallthrough, and exit the loop.
259 if ((sz
= strcspn(val
, res
)) > 0) {
260 memcpy(&(*buf
)[pos
], val
, sz
);
265 if (ASCII_HYPH
== *val
) {
269 } else if ('\t' == *val
|| ASCII_NBRSP
== *val
) {
273 } else if ('\\' != *val
)
276 /* Read past the slash. */
282 * Parse the escape sequence and see if it's a
283 * predefined character or special character.
286 esc
= mandoc_escape(&val
, &seq
, &len
);
287 if (ESCAPE_ERROR
== esc
)
291 * XXX - this just does UTF-8, but we need to know
292 * beforehand whether we should do text substitution.
296 case (ESCAPE_SPECIAL
):
297 if (0 != (u
= mchars_spec2cp(mc
, seq
, len
)))
305 * If we have a Unicode codepoint, try to convert that
306 * to a UTF-8 byte string.
310 if (0 == (sz
= norm_utf8(u
, utfbuf
)))
313 /* Copy the rendered glyph into the stream. */
318 *buf
= mandoc_realloc(*buf
, bsz
);
320 memcpy(&(*buf
)[pos
], cpp
, sz
);
328 * Open the filename-index mandoc-db database.
329 * Returns NULL if opening failed.
336 db
= dbopen(MANDOC_IDX
, O_RDONLY
, 0, DB_RECNO
, NULL
);
344 * Safely unpack from an index file record into the structure.
345 * Returns 1 if an entry was unpacked, 0 if the database is insane.
348 index_read(const DBT
*key
, const DBT
*val
,
349 const struct mchars
*mc
, struct rec
*rec
)
354 #define INDEX_BREAD(_dst) \
356 if (NULL == (np = memchr(cp, '\0', left))) \
358 norm_string(cp, mc, &(_dst)); \
359 left -= (np - cp) + 1; \
361 } while (/* CONSTCOND */ 0)
364 cp
= (char *)val
->data
;
366 rec
->res
.rec
= *(recno_t
*)key
->data
;
368 INDEX_BREAD(rec
->res
.file
);
369 INDEX_BREAD(rec
->res
.cat
);
370 INDEX_BREAD(rec
->res
.title
);
371 INDEX_BREAD(rec
->res
.arch
);
372 INDEX_BREAD(rec
->res
.desc
);
377 * Search mandocdb databases in argv (size argc) for the expression
379 * Filter out by "opts".
380 * Call "res" with the results, which may be zero.
381 * Return 0 if there was a database error, else return 1.
384 apropos_search(int argc
, char *argv
[], const struct opts
*opts
,
385 const struct expr
*expr
, size_t terms
, void *arg
,
386 void (*res
)(struct res
*, size_t, void *))
393 memset(&tree
, 0, sizeof(struct rectree
));
397 for (rc
= 1, i
= 0; rc
&& i
< argc
; i
++) {
398 /* FIXME: ugly warning: we shouldn't get here! */
401 rc
= single_search(&tree
, opts
, expr
, terms
, mc
);
402 /* FIXME: warn and continue... ? */
406 * Count the matching files
407 * and feed them to the output handler.
410 for (mlen
= i
= 0; i
< tree
.len
; i
++)
411 if (tree
.node
[i
].matched
)
414 ress
= mandoc_malloc(mlen
* sizeof(struct res
));
416 for (mlen
= i
= 0; i
< tree
.len
; i
++)
417 if (tree
.node
[i
].matched
)
418 memcpy(&ress
[mlen
++], &tree
.node
[i
].res
,
421 (*res
)(ress
, mlen
, arg
);
424 for (i
= 0; i
< tree
.len
; i
++)
425 recfree(&tree
.node
[i
]);
433 single_search(struct rectree
*tree
, const struct opts
*opts
,
434 const struct expr
*expr
, size_t terms
,
454 memset(&r
, 0, sizeof(struct rec
));
456 if (NULL
== (btree
= btree_open()))
459 if (NULL
== (idx
= index_open())) {
460 (*btree
->close
)(btree
);
464 while (0 == (ch
= (*btree
->seq
)(btree
, &key
, &val
, R_NEXT
))) {
465 if (key
.size
< 2 || sizeof(struct db_val
) != val
.size
)
467 if ( ! btree_read(&key
, mc
, &buf
))
475 * See if this keyword record matches any of the
476 * expressions we have stored.
478 if ( ! exprmark(expr
, buf
, mask
, NULL
))
482 * O(log n) scan for prior records. Since a record
483 * number is unbounded, this has decent performance over
484 * a complex hash function.
487 for (leaf
= root
; leaf
>= 0; )
488 if (rec
> rs
[leaf
].res
.rec
&&
491 else if (rec
< rs
[leaf
].res
.rec
&&
498 * If we find a record, see if it has already evaluated
499 * to true. If it has, great, just keep going. If not,
500 * try to evaluate it now and continue anyway.
503 if (leaf
>= 0 && rs
[leaf
].res
.rec
== rec
) {
504 if (0 == rs
[leaf
].matched
)
505 exprexec(expr
, buf
, mask
, &rs
[leaf
]);
510 * We have a new file to examine.
511 * Extract the manpage's metadata from the index
512 * database, then begin partial evaluation.
516 key
.size
= sizeof(recno_t
);
518 if (0 != (*idx
->get
)(idx
, &key
, &val
, 0))
522 if ( ! index_read(&key
, &val
, mc
, &r
))
525 /* XXX: this should be elsewhere, I guess? */
527 if (opts
->cat
&& strcasecmp(opts
->cat
, r
.res
.cat
))
529 if (opts
->arch
&& strcasecmp(opts
->arch
, r
.res
.arch
))
532 tree
->node
= rs
= mandoc_realloc
533 (rs
, (tree
->len
+ 1) * sizeof(struct rec
));
535 memcpy(&rs
[tree
->len
], &r
, sizeof(struct rec
));
536 rs
[tree
->len
].matches
=
537 mandoc_calloc(terms
, sizeof(int));
539 exprexec(expr
, buf
, mask
, &rs
[tree
->len
]);
540 /* Append to our tree. */
543 if (rec
> rs
[leaf
].res
.rec
)
544 rs
[leaf
].rhs
= tree
->len
;
546 rs
[leaf
].lhs
= tree
->len
;
550 memset(&r
, 0, sizeof(struct rec
));
554 (*btree
->close
)(btree
);
562 recfree(struct rec
*rec
)
567 free(rec
->res
.title
);
575 exprcomp(int argc
, char *argv
[], size_t *tt
)
583 e
= exprexpr(argc
, argv
, &pos
, &lvl
, tt
);
585 if (0 == lvl
&& pos
>= argc
)
593 * Compile an array of tokens into an expression.
594 * An informal expression grammar is defined in apropos(1).
595 * Return NULL if we fail doing so. All memory will be cleaned up.
596 * Return the root of the expression sequence if alright.
599 exprexpr(int argc
, char *argv
[], int *pos
, int *lvl
, size_t *tt
)
601 struct expr
*e
, *first
, *next
;
606 for ( ; *pos
< argc
; (*pos
)++) {
610 * Close out a subexpression.
613 if (NULL
!= e
&& 0 == strcmp(")", argv
[*pos
])) {
620 * Small note: if we're just starting, don't let "-a"
621 * and "-o" be considered logical operators: they're
622 * just tokens unless pairwise joining, in which case we
623 * record their existence (or assume "OR").
627 if (NULL
!= e
&& 0 == strcmp("-a", argv
[*pos
]))
629 else if (NULL
!= e
&& 0 == strcmp("-o", argv
[*pos
]))
632 if (log
> 0 && ++(*pos
) >= argc
)
636 * Now we parse the term part. This can begin with
637 * "-i", in which case the expression is case
641 if (0 == strcmp("(", argv
[*pos
])) {
644 next
= mandoc_calloc(1, sizeof(struct expr
));
646 next
->subexpr
= exprexpr(argc
, argv
, pos
, lvl
, tt
);
647 if (NULL
== next
->subexpr
) {
651 } else if (0 == strcmp("-i", argv
[*pos
])) {
652 if (++(*pos
) >= argc
)
654 next
= exprterm(argv
[*pos
], 0);
656 next
= exprterm(argv
[*pos
], 1);
661 next
->and = log
== 1;
662 next
->index
= (int)(*tt
)++;
664 /* Append to our chain of expressions. */
682 * Parse a terminal expression with the grammar as defined in
684 * Return NULL if we fail the parse.
687 exprterm(char *buf
, int cs
)
694 memset(&e
, 0, sizeof(struct expr
));
698 /* Choose regex or substring match. */
700 if (NULL
== (e
.v
= strpbrk(buf
, "=~"))) {
704 e
.regex
= '~' == *e
.v
;
708 /* Determine the record types to search for. */
712 while (NULL
!= (key
= strsep(&buf
, ","))) {
714 while (types
[i
].mask
&&
715 strcmp(types
[i
].name
, key
))
717 e
.mask
|= types
[i
].mask
;
721 e
.mask
= TYPE_Nm
| TYPE_Nd
;
724 i
= REG_EXTENDED
| REG_NOSUB
| cs
? 0 : REG_ICASE
;
725 if (regcomp(&e
.re
, e
.v
, i
))
729 e
.v
= mandoc_strdup(e
.v
);
731 p
= mandoc_calloc(1, sizeof(struct expr
));
732 memcpy(p
, &e
, sizeof(struct expr
));
737 exprfree(struct expr
*p
)
743 exprfree(p
->subexpr
);
754 exprmark(const struct expr
*p
, const char *cp
,
755 uint64_t mask
, int *ms
)
758 for ( ; p
; p
= p
->next
) {
760 if (exprmark(p
->subexpr
, cp
, mask
, ms
))
763 } else if ( ! (mask
& p
->mask
))
767 if (regexec(&p
->re
, cp
, 0, NULL
, 0))
770 if (NULL
== strstr(cp
, p
->v
))
773 if (NULL
== strcasestr(cp
, p
->v
))
787 expreval(const struct expr
*p
, int *ms
)
792 * AND has precedence over OR. Analysis is left-right, though
793 * it doesn't matter because there are no side-effects.
794 * Thus, step through pairwise ANDs and accumulate their Boolean
795 * evaluation. If we encounter a single true AND collection or
796 * standalone term, the whole expression is true (by definition
800 for (match
= 0; p
&& ! match
; p
= p
->next
) {
801 /* Evaluate a subexpression, if applicable. */
802 if (p
->subexpr
&& ! ms
[p
->index
])
803 ms
[p
->index
] = expreval(p
->subexpr
, ms
);
805 match
= ms
[p
->index
];
806 for ( ; p
->next
&& p
->next
->and; p
= p
->next
) {
807 /* Evaluate a subexpression, if applicable. */
808 if (p
->next
->subexpr
&& ! ms
[p
->next
->index
])
810 expreval(p
->next
->subexpr
, ms
);
811 match
= match
&& ms
[p
->next
->index
];
819 * First, update the array of terms for which this expression evaluates
821 * Second, logically evaluate all terms over the updated array of truth
823 * If this evaluates to true, mark the expression as satisfied.
826 exprexec(const struct expr
*p
, const char *cp
,
827 uint64_t mask
, struct rec
*r
)
830 assert(0 == r
->matched
);
831 exprmark(p
, cp
, mask
, r
->matches
);
832 r
->matched
= expreval(p
, r
->matches
);