Import apropos from mandoc-tools after inlining all source files

(originally including extern.h, state.c, and sort.c). The apropos utility interfaces with the databases of mandocdb to provide semantic searching capabilities. It Works For Me, but will need lots of cleanup in the coming months.
author: Kristaps Dzonsons <kristaps@bsd.lv> 2011-10-06 23:00:54 +0000
committer: Kristaps Dzonsons <kristaps@bsd.lv> 2011-10-06 23:00:54 +0000
commit: 686c2b46e18ab428a740ea36f788ce921093fe65 (patch)
tree: 084d3a7e28dffb068873b21c0d2533195a97e7a9 /apropos.c
parent: 04d794422a19264dc0a985a9b145b3748fc69862 (diff)
download: mandoc-686c2b46e18ab428a740ea36f788ce921093fe65.tar.gz
mandoc-686c2b46e18ab428a740ea36f788ce921093fe65.tar.zst
mandoc-686c2b46e18ab428a740ea36f788ce921093fe65.zip
1 files changed, 689 insertions, 0 deletions
diff --git a/apropos.c b/apropos.c
new file mode 100644
index 00000000..409ef8e2
--- /dev/null
+++ b/apropos.c
@@ -0,0 +1,689 @@
+/*	$Id: apropos.c,v 1.1 2011/10/06 23:00:54 kristaps Exp $ */
+/*
+ * Copyright (c) 2011 Kristaps Dzonsons <kristaps@bsd.lv>
+ *
+ * Permission to use, copy, modify, and distribute this software for any
+ * purpose with or without fee is hereby granted, provided that the above
+ * copyright notice and this permission notice appear in all copies.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
+ * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
+ * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
+ * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
+ * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
+ * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
+ * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ */
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include <sys/types.h>
+
+#include <assert.h>
+#include <errno.h>
+#include <fcntl.h>
+#include <getopt.h>
+#include <limits.h>
+#include <regex.h>
+#include <stdarg.h>
+#include <stdint.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <unistd.h>
+
+#ifdef __linux__
+# include <db_185.h>
+#else
+# include <db.h>
+#endif
+
+#include "mandoc.h"
+
+#define	MAXRESULTS	 100
+
+#define TYPE_NAME	0x01
+#define TYPE_FUNCTION	0x02
+#define TYPE_UTILITY	0x04
+#define TYPE_INCLUDES	0x08
+#define TYPE_VARIABLE	0x10
+#define TYPE_STANDARD	0x20
+#define TYPE_AUTHOR	0x40
+#define TYPE_CONFIG	0x80
+#define TYPE_DESC	0x100
+#define TYPE_XREF	0x200
+#define TYPE_PATH	0x400
+#define	TYPE_ENV	0x800
+#define	TYPE_ERR	0x1000
+
+enum	match {
+	MATCH_SUBSTR = 0,
+	MATCH_REGEX,
+	MATCH_EXACT
+};
+
+enum	sort {
+	SORT_TITLE = 0,
+	SORT_CAT,
+	SORT__MAX
+};
+
+struct	opts {
+	enum sort	 sort; /* output sorting */
+	const char	*arch; /* restrict to architecture */
+	const char	*cat; /* restrict to category */
+	int		 types; /* only types in bitmask */
+	int		 insens; /* case-insensitive match */
+	enum match	 match; /* match type */
+};
+
+struct	type {
+	int		 mask;
+	const char	*name;
+};
+
+struct	rec {
+	char		*file; /* first string of the index value */
+	char		*cat; /* category (manual section) */
+	char		*title; /* manual title */
+	char		*arch; /* architecture; may be the empty string */
+	char		*desc; /* free-form description */
+	recno_t		 rec; /* record number used for the lookup */
+};
+
+struct	res {
+	char		*arch; /* architecture */
+	char		*desc; /* free-form description */
+	char		*keyword; /* matched keyword */
+	int	 	 types; /* bitmask of field selectors */
+	char		*cat; /* manual section */
+	char		*title; /* manual title */
+	char		*uri; /* record number formatted as a decimal string */
+	recno_t		 rec; /* unique id of underlying manual */
+};
+
+struct	state {
+	DB		 *db; /* keyword database (opened as a btree) */
+	DB		 *idx; /* index (opened as a recno file) */
+	const char	 *dbf; /* database name */
+	const char	 *idxf; /* index name */
+	void		(*err)(const char *); /* report a failed call on a file */
+	void		(*errx)(const char *, ...); /* printf-style message */
+};
+
+static	const char * const sorts[SORT__MAX] = {
+	"title", /* SORT_TITLE: position must equal the enum sort value */
+	"cat", /* SORT_CAT */
+};
+
+static	const struct type types[] = {
+	{ TYPE_NAME, "name" },
+	{ TYPE_FUNCTION, "func" },
+	{ TYPE_UTILITY, "utility" },
+	{ TYPE_INCLUDES, "incl" },
+	{ TYPE_VARIABLE, "var" },
+	{ TYPE_STANDARD, "stand" },
+	{ TYPE_AUTHOR, "auth" },
+	{ TYPE_CONFIG, "conf" },
+	{ TYPE_DESC, "desc" },
+	{ TYPE_XREF, "xref" },
+	{ TYPE_PATH, "path" },
+	{ TYPE_ENV, "env" },
+	{ TYPE_ERR, "err" },
+	{ INT_MAX, "all" },
+	{ 0, NULL }
+};
+
+static	void	 buf_alloc(char **, size_t *, size_t);
+static	void	 buf_dup(struct mchars *, char **, const char *);
+static	void	 buf_redup(struct mchars *, char **, 
+			size_t *, const char *);
+static	void	 error(const char *, ...);
+static	int	 sort_cat(const void *, const void *);
+static	int	 sort_title(const void *, const void *);
+static	void	 state_destroy(struct state *);
+static	int	 state_getrecord(struct state *, recno_t, struct rec *);
+static	int	 state_init(struct state *, 
+			const char *, const char *,
+			void (*err)(const char *),
+			void (*errx)(const char *, ...));
+static	void	 state_output(const struct res *, int);
+static	void	 state_search(struct state *, 
+			const struct opts *, char *);
+
+static	void	 usage(void);
+
+static	const char	 *progname;
+
+int
+main(int argc, char *argv[])
+{
+	int		 ch, i;
+	const char	*dbf, *idxf;
+	struct state	 state;
+	char		*q, *v;
+	struct opts	 opts;
+	extern int	 optind;
+	extern char	*optarg;
+
+	memset(&opts, 0, sizeof(struct opts));
+
+	dbf = "mandoc.db"; /* keyword database, relative to the cwd */
+	idxf = "mandoc.index"; /* record index, relative to the cwd */
+	q = NULL;
+
+	progname = strrchr(argv[0], '/'); /* basename, printed by usage() */
+	if (progname == NULL)
+		progname = argv[0];
+	else
+		++progname;
+
+	opts.match = MATCH_SUBSTR; /* default unless -e or -r is given */
+
+	while (-1 != (ch = getopt(argc, argv, "a:c:eIrs:t:"))) 
+		switch (ch) {
+		case ('a'):
+			opts.arch = optarg;
+			break;
+		case ('c'):
+			opts.cat = optarg;
+			break;
+		case ('e'):
+			opts.match = MATCH_EXACT;
+			break;
+		case ('I'):
+			opts.insens = 1;
+			break;
+		case ('r'):
+			opts.match = MATCH_REGEX;
+			break;
+		case ('s'):
+			/* The matching index into sorts[] becomes the enum. */
+			for (i = 0; i < SORT__MAX; i++) {
+				if (strcmp(optarg, sorts[i])) 
+					continue;
+				opts.sort = (enum sort)i;
+				break;
+			}
+
+			if (i < SORT__MAX)
+				break;
+
+			error("%s: Bad sort\n", optarg);
+			return(EXIT_FAILURE);
+		case ('t'):
+			/* Comma-separated type names; empty items are skipped. */
+			while (NULL != (v = strsep(&optarg, ","))) {
+				if ('\0' == *v)
+					continue;
+				for (i = 0; types[i].mask; i++) {
+					if (strcmp(types[i].name, v))
+						continue;
+					break;
+				}
+				if (0 == types[i].mask)
+					break; /* unknown name: v stays non-NULL */
+				opts.types |= types[i].mask;
+			}
+			if (NULL == v)
+				break; /* every name was recognised */
+			
+			error("%s: Bad type\n", v);
+			return(EXIT_FAILURE);
+		default:
+			usage();
+			return(EXIT_FAILURE);
+		}
+
+	argc -= optind;
+	argv += optind;
+
+	/* Exactly the first operand is used as the search key. */
+	if (0 == argc || '\0' == **argv) {
+		usage();
+		return(EXIT_FAILURE);
+	} else
+		q = *argv;
+
+	if (0 == opts.types)
+		opts.types = TYPE_NAME | TYPE_DESC; /* default without -t */
+
+	if ( ! state_init(&state, dbf, idxf, perror, error)) {
+		state_destroy(&state);
+		return(EXIT_FAILURE);
+	}
+
+	state_search(&state, &opts, q);
+	state_destroy(&state);
+
+	return(EXIT_SUCCESS);
+}
+
+/*
+ * Scan the keyword database for "q" as directed by "opts", then sort and
+ * print up to MAXRESULTS distinct manuals.  Database errors are fatal.
+ */
+static void
+state_search(struct state *p, const struct opts *opts, char *q)
+{
+	int		 i, len, ch, rflags, dflag;
+	struct mchars	*mc;
+	char		*buf;
+	size_t		 bufsz;
+	recno_t		 rec;
+	uint32_t	 fl;
+	DBT		 key, val;
+	struct res	 res[MAXRESULTS];
+	regex_t		 reg, *regp;
+	char		 filebuf[11]; /* ten digits of a uint32_t and a nil */
+	struct rec	 record;
+
+	len = ch = 0;
+	buf = NULL;
+	bufsz = 0;
+	regp = NULL;
+
+	switch (opts->match) {
+	case (MATCH_REGEX):
+		rflags = REG_EXTENDED | REG_NOSUB | 
+			(opts->insens ? REG_ICASE : 0);
+
+		if (0 != regcomp(&reg, q, rflags)) {
+			error("%s: Bad pattern\n", q);
+			return;
+		}
+
+		regp = &reg;
+		dflag = R_FIRST;
+		break;
+	case (MATCH_EXACT):
+		key.data = q;
+		key.size = strlen(q) + 1;
+		dflag = R_CURSOR;
+		break;
+	default:
+		dflag = R_FIRST;
+		break;
+	}
+
+	if (NULL == (mc = mchars_alloc())) {
+		perror(NULL);
+		exit(EXIT_FAILURE);
+	}
+
+	/*
+	 * Walk the keyword database.  Each key is normalised, filtered by
+	 * type and pattern, and on a match joined with its index record.
+	 */
+
+	while (len < MAXRESULTS) {
+		if ((ch = (*p->db->seq)(p->db, &key, &val, dflag)))
+			break;
+
+		dflag = R_NEXT;
+
+		/* 
+		 * Keys must be sized as such: the keyword must be
+		 * non-empty (nil terminator plus one character) and the
+		 * value must be 8 (recno_t---uint32_t---index reference
+		 * and a uint32_t flag field).
+		 */
+
+		if (key.size < 2 || 8 != val.size) {
+			error("%s: Corrupt database\n", p->dbf);
+			exit(EXIT_FAILURE);
+		}
+
+		buf_redup(mc, &buf, &bufsz, (char *)key.data);
+
+		memcpy(&fl, val.data, sizeof(uint32_t)); /* may be unaligned */
+
+		if ( ! (fl & opts->types))
+			continue;
+
+		switch (opts->match) {
+		case (MATCH_REGEX):
+			if (regexec(regp, buf, 0, NULL, 0))
+				continue;
+			break;
+		case (MATCH_EXACT):
+			/* The first non-matching key ends the whole scan. */
+			if (opts->insens && strcasecmp(buf, q))
+				goto send;
+			if ( ! opts->insens && strcmp(buf, q))
+				goto send;
+			break;
+		default:
+			if (opts->insens && NULL == strcasestr(buf, q))
+				continue;
+			if ( ! opts->insens && NULL == strstr(buf, q))
+				continue;
+			break;
+		}
+
+		/*
+		 * Now look up the file itself in our index.  The file's
+		 * indexed by its recno for fast lookups.
+		 */
+
+		memcpy(&rec, (char *)val.data + 4, sizeof(recno_t));
+
+		if ( ! state_getrecord(p, rec, &record))
+			exit(EXIT_FAILURE);
+
+		/* If we're in a different section, skip... */
+
+		if (opts->cat && strcasecmp(opts->cat, record.cat))
+			continue;
+		if (opts->arch && strcasecmp(opts->arch, record.arch))
+			continue;
+
+		/* FIXME: this needs to be changed.  Ugh.  Linear. */
+
+		for (i = 0; i < len; i++)
+			if (res[i].rec == record.rec)
+				break;
+
+		if (i < len)
+			continue;
+
+		/*
+		 * Now we have our filename, keywords, types, and all
+		 * other necessary information.  
+		 * Process it and add it to our list of results.
+		 */
+
+		/* The decimal record number serves as the result's uri. */
+		i = snprintf(filebuf, sizeof(filebuf), "%u", record.rec);
+		assert(i > 0 && (size_t)i < sizeof(filebuf));
+
+		res[len].rec = record.rec;
+		res[len].types = fl;
+
+		buf_dup(mc, &res[len].keyword, buf);
+		buf_dup(mc, &res[len].uri, filebuf);
+		buf_dup(mc, &res[len].cat, record.cat);
+		buf_dup(mc, &res[len].arch, record.arch);
+		buf_dup(mc, &res[len].title, record.title);
+		buf_dup(mc, &res[len].desc, record.desc);
+		len++;
+	}
+
+send:
+	if (ch < 0) {
+		perror(p->dbf);
+		exit(EXIT_FAILURE);
+	} 
+
+	switch (opts->sort) {
+	case (SORT_CAT):
+		qsort(res, len, sizeof(struct res), sort_cat);
+		break;
+	default:
+		qsort(res, len, sizeof(struct res), sort_title);
+		break;
+	}
+
+	state_output(res, len);
+
+	for (len-- ; len >= 0; len--) {
+		free(res[len].keyword);
+		free(res[len].title);
+		free(res[len].cat);
+		free(res[len].arch);
+		free(res[len].desc);
+		free(res[len].uri);
+	}
+
+	free(buf);
+	mchars_free(mc);
+
+	if (regp)
+		regfree(regp);
+}
+
+/*
+ * Grow *buf to sz + 1024 bytes unless *bufsz already exceeds sz.
+ * A failed allocation is reported with perror(3) and is fatal.
+ */
+static inline void
+buf_alloc(char **buf, size_t *bufsz, size_t sz)
+{
+	if (*bufsz <= sz) {
+		*bufsz = sz + 1024;
+		*buf = realloc(*buf, *bufsz);
+		if (NULL == *buf) {
+			perror(NULL);
+			exit(EXIT_FAILURE);
+		}
+	}
+}
+
+/*
+ * Normalise "val" into a freshly allocated string stored in *buf.
+ * Wraps buf_redup() with an empty buffer and discards the capacity.
+ */
+static void
+buf_dup(struct mchars *mc, char **buf, const char *val)
+{
+	size_t		 cap = 0;
+
+	*buf = NULL;
+	buf_redup(mc, buf, &cap, val);
+}
+
+/*
+ * Normalise "val" (from the index or database) into *buf, growing it
+ * via buf_alloc(): mandoc_char(7) escapes are rendered through "mc",
+ * ASCII_HYPH becomes '-' and ASCII_NBRSP becomes ' '.
+ */
+static void
+buf_redup(struct mchars *mc, char **buf, 
+		size_t *bufsz, const char *val)
+{
+	size_t		 sz;
+	const char	*seq, *cpp;
+	int		 len, pos;
+	enum mandoc_esc	 esc;
+	const char	 rsv[] = { '\\', ASCII_NBRSP, ASCII_HYPH, '\0' };
+
+	/* Pre-allocate by the length of the input */
+
+	buf_alloc(buf, bufsz, strlen(val) + 1);
+
+	pos = 0;
+
+	while ('\0' != *val) {
+		/*
+		 * Halt on the first escape sequence.
+		 * This also halts on the end of string, in which case
+		 * we just copy, fallthrough, and exit the loop.
+		 */
+		if ((sz = strcspn(val, rsv)) > 0) {
+			memcpy(&(*buf)[pos], val, sz);
+			pos += (int)sz;
+			val += (int)sz;
+		}
+
+		if (ASCII_HYPH == *val) {
+			(*buf)[pos++] = '-';
+			val++;
+			continue;
+		} else if (ASCII_NBRSP == *val) {
+			(*buf)[pos++] = ' ';
+			val++;
+			continue;
+		} else if ('\\' != *val)
+			break;
+
+		/* Read past the slash. */
+
+		val++;
+
+		/*
+		 * Parse the escape sequence and see if it's a
+		 * predefined character or special character.
+		 */
+
+		esc = mandoc_escape(&val, &seq, &len);
+		if (ESCAPE_ERROR == esc)
+			break; /* malformed escape: keep what we have so far */
+
+		cpp = ESCAPE_SPECIAL == esc ? 
+			mchars_spec2str(mc, seq, len, &sz) : NULL;
+
+		if (NULL == cpp)
+			continue; /* nothing to render: drop the escape */
+
+		/* Make room for the glyph and the unparsed tail, then copy. */
+
+		buf_alloc(buf, bufsz, (size_t)pos + sz + strlen(val) + 1);
+
+		memcpy(&(*buf)[pos], cpp, sz);
+		pos += (int)sz;
+	}
+
+	(*buf)[pos] = '\0';
+}
+
+static void
+error(const char *fmt, ...)
+{
+	va_list		 ap;
+
+	va_start(ap, fmt);
+	vfprintf(stderr, fmt, ap); /* printf-style; no newline is appended */
+	va_end(ap);
+}
+
+static void
+state_output(const struct res *res, int sz)
+{
+	const struct res *end;
+
+	/* Prints "title(cat[/arch]) - desc"; "/arch" only when arch is set. */
+	for (end = res + (sz > 0 ? sz : 0); res < end; res++)
+		printf("%s(%s%s%s) - %s\n", res->title, res->cat,
+				*res->arch ? "/" : "",
+				*res->arch ? res->arch : "",
+				res->desc);
+}
+
+/*
+ * Print the command-line synopsis to stderr.
+ */
+static void
+usage(void)
+{
+	static const char synopsis[] =
+		"[-eIr] [-a arch] [-c cat] [-s sort] [-t type[,...]] key";
+
+	/* progname is the basename of argv[0], set in main(). */
+	fprintf(stderr, "usage: %s %s\n", progname, synopsis);
+}
+
+/*
+ * Open both databases read-only; returns 1 on success, 0 on failure.
+ */
+static int
+state_init(struct state *p, const char *dbf, const char *idxf,
+		void (*err)(const char *), void (*errx)(const char *, ...))
+{
+	BTREEINFO	 info;
+
+	memset(p, 0, sizeof(struct state));
+	memset(&info, 0, sizeof(BTREEINFO));
+	info.flags = R_DUP;
+
+	p->dbf = dbf;
+	p->idxf = idxf;
+	p->err = err;
+	p->errx = errx; /* state_getrecord() reports corruption through it */
+
+	p->db = dbopen(p->dbf, O_RDONLY, 0, DB_BTREE, &info);
+	if (NULL == p->db) {
+		(*err)(p->dbf);
+		return(0);
+	}
+
+	p->idx = dbopen(p->idxf, O_RDONLY, 0, DB_RECNO, NULL);
+	if (NULL != p->idx)
+		return(1);
+
+	(*err)(p->idxf);
+	return(0);
+}
+
+static void
+state_destroy(struct state *p)
+{
+	/* A handle is NULL if state_init() failed before opening it. */
+	if (p->db)
+		(*p->db->close)(p->db);
+	if (p->idx)
+		(*p->idx->close)(p->idx);
+}
+
+/*
+ * Fetch index record "rec" and split its value into the nil-terminated
+ * strings file, cat, title, arch and desc, which "rp" then points into.
+ * Returns 1 on success, 0 after reporting an error or a corrupt record.
+ */
+static int
+state_getrecord(struct state *p, recno_t rec, struct rec *rp)
+{
+	DBT		 key, val;
+	char		**fields[5], *cp, *end, *nil;
+	size_t		 i;
+	int		 rc;
+
+	key.data = &rec;
+	key.size = sizeof(recno_t);
+
+	rc = (*p->idx->get)(p->idx, &key, &val, 0);
+	if (rc < 0) {
+		(*p->err)(p->idxf);
+		return(0);
+	} else if (rc > 0) {
+		(*p->errx)("%s: Corrupt index\n", p->idxf);
+		return(0);
+	}
+
+	fields[0] = &rp->file;
+	fields[1] = &rp->cat;
+	fields[2] = &rp->title;
+	fields[3] = &rp->arch;
+	fields[4] = &rp->desc;
+
+	/* Bound every scan by val.size: a corrupt value may lack its nils. */
+	cp = (char *)val.data;
+	end = cp + val.size;
+	for (i = 0; i < 5; i++) {
+		nil = cp < end ? memchr(cp, '\0', (size_t)(end - cp)) : NULL;
+		if (NULL == nil) {
+			(*p->errx)("%s: Corrupt index\n", p->idxf);
+			return(0);
+		}
+		*fields[i] = cp;
+		cp = nil + 1;
+	}
+
+	rp->rec = rec;
+	return(1);
+}
+
+static int
+sort_title(const void *p1, const void *p2)
+{
+	const struct res *lhs = p1, *rhs = p2;
+
+	return(strcmp(lhs->title, rhs->title)); /* byte-wise title order */
+}
+
+static int
+sort_cat(const void *p1, const void *p2)
+{
+	const struct res *lhs = p1, *rhs = p2;
+	int		 rc;
+
+	/* Order by category first; equal categories fall back to title. */
+	rc = strcmp(lhs->cat, rhs->cat);
+	return(0 != rc ? rc : sort_title(p1, p2));
+}
author	Kristaps Dzonsons <kristaps@bsd.lv>	2011-10-06 23:00:54 +0000
committer	Kristaps Dzonsons <kristaps@bsd.lv>	2011-10-06 23:00:54 +0000
commit	686c2b46e18ab428a740ea36f788ce921093fe65 (patch)
tree	084d3a7e28dffb068873b21c0d2533195a97e7a9 /apropos.c
parent	04d794422a19264dc0a985a9b145b3748fc69862 (diff)
download	mandoc-686c2b46e18ab428a740ea36f788ce921093fe65.tar.gz mandoc-686c2b46e18ab428a740ea36f788ce921093fe65.tar.zst mandoc-686c2b46e18ab428a740ea36f788ce921093fe65.zip