From ff1018946028fdff5987992f5ccc7e26a7ba7d55 Mon Sep 17 00:00:00 2001
From: Kristaps Dzonsons <kristaps@bsd.lv>
Date: Fri, 27 Feb 2009 08:20:15 +0000
Subject: More character-encoding.

---
 Makefile   |   7 ++-
 mdoc.3     |  45 ++++++++++++++--
 mdocterm.1 |  11 +---
 mdocterm.c | 170 +++++++++++++++++++++++++++++++++++++++++++++++++++++++------
 strings.c  |  18 +++++--
 term.c     |   9 ++--
 6 files changed, 221 insertions(+), 39 deletions(-)

diff --git a/Makefile b/Makefile
index ae89552e..556ca5c5 100644
--- a/Makefile
+++ b/Makefile
@@ -87,7 +87,9 @@ FAIL	= regress/test.empty \
 	  regress/test.escape.06 \
 	  regress/test.escape.07 \
 	  regress/test.escape.08 \
-	  regress/test.escape.09
+	  regress/test.escape.09 \
+	  regress/test.escape.11 \
+	  regress/test.escape.12
 
 SUCCEED	= regress/test.prologue.05 \
 	  regress/test.prologue.07 \
@@ -114,7 +116,8 @@ SUCCEED	= regress/test.prologue.05 \
 	  regress/test.sh.01 \
 	  regress/test.sh.02 \
 	  regress/test.escape.00 \
-	  regress/test.escape.05
+	  regress/test.escape.05 \
+	  regress/test.escape.10
 
 REGRESS	= $(FAIL) $(SUCCEED)
 
diff --git a/mdoc.3 b/mdoc.3
index 5b5f70e8..e7ee3ffe 100644
--- a/mdoc.3
+++ b/mdoc.3
@@ -1,4 +1,4 @@
-.\" $Id: mdoc.3,v 1.11 2009/02/25 17:02:47 kristaps Exp $
+.\" $Id: mdoc.3,v 1.12 2009/02/27 08:20:15 kristaps Exp $
 .\"
 .\" Copyright (c) 2009 Kristaps Dzonsons <kristaps@kth.se>
 .\"
@@ -16,7 +16,7 @@
 .\" TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
 .\" PERFORMANCE OF THIS SOFTWARE.
 .\" 
-.Dd $Mdocdate: February 25 2009 $
+.Dd $Mdocdate: February 27 2009 $
 .Dt mdoc 3
 .Os
 .\" SECTION
@@ -184,7 +184,8 @@ The
 .Xr mdoc 3
 library accepts only printable ASCII characters as defined by
 .Xr isprint 3 .
-Non-ASCII character sequences are escaped with an escape character 
+Non-ASCII character sequences are delimited in various ways.  All are
+preceded by an escape character
 .Sq \\
 and followed by either an open-parenthesis 
 .Sq \&(
@@ -192,7 +193,43 @@ for two-character sequences; an open-bracket
 .Sq \&[
 for n-character sequences (terminated at a close-bracket
 .Sq \&] ) ;
-or one of a small set of single characters for other escapes.
+an asterisk and open-parenthesis
+.Sq \&*(
+for two-character sequences;
+an asterisk and non-open-parenthesis 
+.Sq \&*
+for single-character sequences; or one of a small set of standalone
+single characters for other escapes.
+.Pp
+Examples:
+.Pp
+.Bl -tag -width "XXXXXXXX" -offset "XXXX" -compact
+.\" LIST-ITEM
+.It \\*(<=
+prints 
+.Dq \*(<=
+.Pq less-equal
+.\" LIST-ITEM
+.It \\(<-
+prints
+.Dq \(<-
+.Pq left-arrow
+.\" LIST-ITEM
+.It \\[<-]
+also prints
+.Dq \(<-
+.Pq left-arrow
+.\" LIST-ITEM
+.It \\*(Ba
+prints
+.Dq \*(Ba
+.Pq bar
+.\" LIST-ITEM
+.It \\*q
+prints
+.Dq \*q
+.Pq double-quote
+.El
 .\" SUBSECTION
 .Ss Abstract Syntax Tree
 The 
diff --git a/mdocterm.1 b/mdocterm.1
index c7033d41..6c0c5737 100644
--- a/mdocterm.1
+++ b/mdocterm.1
@@ -1,4 +1,4 @@
-.\" $Id: mdocterm.1,v 1.5 2009/02/25 15:12:26 kristaps Exp $
+.\" $Id: mdocterm.1,v 1.6 2009/02/27 08:20:15 kristaps Exp $
 .\"
 .\" Copyright (c) 2009 Kristaps Dzonsons <kristaps@kth.se>
 .\"
@@ -16,7 +16,7 @@
 .\" TORTIOUS ACTION, ARISING OUT OF OR IN CONNECTION WITH THE USE OR
 .\" PERFORMANCE OF THIS SOFTWARE.
 .\"
-.Dd $Mdocdate: February 25 2009 $
+.Dd $Mdocdate: February 27 2009 $
 .Dt mdocmterm 1
 .Os
 .\" SECTION
@@ -119,10 +119,3 @@ See
 .Xr mdoc 3
 for a list of bugs, caveats, and incomplete macros regarding the
 document parse.
-.Pp
-For front-end formatting, the
-.Sq -hang ,
-.Sq -inset
-and
-.Sq -column
-list types aren't yet supported.
diff --git a/mdocterm.c b/mdocterm.c
index 9b08c4e2..5e5e751d 100644
--- a/mdocterm.c
+++ b/mdocterm.c
@@ -1,4 +1,4 @@
-/* $Id: mdocterm.c,v 1.15 2009/02/26 17:11:38 kristaps Exp $ */
+/* $Id: mdocterm.c,v 1.16 2009/02/27 08:20:15 kristaps Exp $ */
 /*
  * Copyright (c) 2008 Kristaps Dzonsons <kristaps@kth.se>
  *
@@ -31,6 +31,32 @@
 #include "mmain.h"
 #include "term.h"
 
+#define	TERMSYM_RBRACK		"]"
+#define	TERMSYM_LBRACK		"["
+#define	TERMSYM_LARROW		"<-"
+#define	TERMSYM_RARROW		"->"
+#define	TERMSYM_UARROW		"^"
+#define	TERMSYM_LSQUOTE		"`"
+#define	TERMSYM_RSQUOTE		"\'"
+#define	TERMSYM_SQUOTE		"\'"
+#define	TERMSYM_LDQUOTE		"``"
+#define	TERMSYM_RDQUOTE		"\'\'"
+#define	TERMSYM_DQUOTE		"\""
+#define	TERMSYM_LT		"<"
+#define	TERMSYM_GT		">"
+#define	TERMSYM_LE		"<="
+#define	TERMSYM_GE		">="
+#define	TERMSYM_EQ		"=="
+#define	TERMSYM_NEQ		"!="
+#define	TERMSYM_ACUTE		"\'"
+#define	TERMSYM_GRAVE		"`"
+#define	TERMSYM_PI		"pi"
+#define	TERMSYM_PLUSMINUS	"+="
+#define	TERMSYM_INFINITY	"infinity"
+#define	TERMSYM_NAN		"NaN"
+#define	TERMSYM_BAR		"|"
+#define	TERMSYM_BULLET		"o"
+
 #ifdef __NetBSD__
 #define xisspace(x) isspace((int)(x))
 #else
@@ -133,7 +159,6 @@ flushln(struct termp *p)
 	 * If we're literal, print out verbatim.
 	 */
 	if (p->flags & TERMP_LITERAL) {
-		/* FIXME: count non-printing chars. */
 		for (i = 0; i < p->col; i++)
 			putchar(p->buf[i]);
 		putchar('\n');
@@ -168,8 +193,9 @@ flushln(struct termp *p)
 		 * the line with TERMP_NOBREAK).
 		 */
 
+		/* FIXME: allow selective right-margin breaking. */
+
 		if (vis && vis + vsz > maxvis) {
-			/* FIXME */
 			if (p->flags & TERMP_NOBREAK)
 				errx(1, "word breaks right margin");
 			putchar('\n');
@@ -177,7 +203,6 @@ flushln(struct termp *p)
 				putchar(' ');
 			vis = 0;
 		} else if (vis + vsz > maxvis)
-			/* FIXME */
 			errx(1, "word breaks right margin");
 
 		/* 
@@ -258,9 +283,16 @@ static void
 chara(struct termp *p, char c)
 {
 
-	/* TODO: dynamically expand the buffer. */
-	if (p->col + 1 >= p->maxcols)
-		errx(1, "line overrun");
+	/*
+	 * Insert a single character into the line-buffer.  If the
+	 * buffer's space is exceeded, then allocate more space.
+	 */
+	if (p->col + 1 >= p->maxcols) {
+		p->buf = realloc(p->buf, p->maxcols * 2);
+		if (NULL == p->buf)
+			err(1, "malloc");
+		p->maxcols *= 2;
+	}
 	p->buf[(p->col)++] = c;
 }
 
@@ -297,21 +329,59 @@ nescape(struct termp *p, const char *word, size_t len)
 {
 
 	switch (len) {
+	case (1):
+		if ('q' == word[0])
+			stringa(p, TERMSYM_DQUOTE);
+		break;
 	case (2):
 		if ('r' == word[0] && 'B' == word[1])
-			chara(p, ']');
+			stringa(p, TERMSYM_RBRACK);
 		else if ('l' == word[0] && 'B' == word[1])
-			chara(p, '[');
+			stringa(p, TERMSYM_LBRACK);
 		else if ('<' == word[0] && '-' == word[1])
-			stringa(p, "<-");
+			stringa(p, TERMSYM_LARROW);
 		else if ('-' == word[0] && '>' == word[1])
-			stringa(p, "->");
+			stringa(p, TERMSYM_RARROW);
 		else if ('l' == word[0] && 'q' == word[1])
-			chara(p, '\"');
+			stringa(p, TERMSYM_DQUOTE);
 		else if ('r' == word[0] && 'q' == word[1])
-			chara(p, '\"');
+			stringa(p, TERMSYM_DQUOTE);
 		else if ('b' == word[0] && 'u' == word[1])
-			chara(p, 'o');
+			stringa(p, TERMSYM_BULLET);
+		else if ('L' == word[0] && 'e' == word[1])
+			stringa(p, TERMSYM_LE);
+		else if ('<' == word[0] && '=' == word[1])
+			stringa(p, TERMSYM_LE);
+		else if ('G' == word[0] && 'e' == word[1])
+			stringa(p, TERMSYM_GE);
+		else if ('>' == word[0] && '=' == word[1])
+			stringa(p, TERMSYM_GE);
+		else if ('R' == word[0] && 'q' == word[1])
+			stringa(p, TERMSYM_RDQUOTE);
+		else if ('L' == word[0] && 'q' == word[1])
+			stringa(p, TERMSYM_LDQUOTE);
+		else if ('u' == word[0] && 'a' == word[1])
+			stringa(p, TERMSYM_UARROW);
+		else if ('a' == word[0] && 'a' == word[1])
+			stringa(p, TERMSYM_ACUTE);
+		else if ('g' == word[0] && 'a' == word[1])
+			stringa(p, TERMSYM_GRAVE);
+		else if ('P' == word[0] && 'i' == word[1])
+			stringa(p, TERMSYM_PI);
+		else if ('N' == word[0] && 'e' == word[1])
+			stringa(p, TERMSYM_NEQ);
+		else if ('L' == word[0] && 't' == word[1])
+			stringa(p, TERMSYM_LT);
+		else if ('G' == word[0] && 't' == word[1])
+			stringa(p, TERMSYM_GT);
+		else if ('P' == word[0] && 'm' == word[1])
+			stringa(p, TERMSYM_PLUSMINUS);
+		else if ('I' == word[0] && 'f' == word[1])
+			stringa(p, TERMSYM_INFINITY);
+		else if ('N' == word[0] && 'a' == word[1])
+			stringa(p, TERMSYM_NAN);
+		else if ('B' == word[0] && 'a' == word[1])
+			stringa(p, TERMSYM_BAR);
 		break;
 	default:
 		break;
@@ -327,6 +397,11 @@ pescape(struct termp *p, const char *word, size_t *i, size_t len)
 	(*i)++;
 	assert(*i < len);
 
+	/*
+	 * Handle an escape sequence.  This must manage both groff-style
+	 * escapes and mdoc-style escapes.
+	 */
+
 	if ('(' == word[*i]) {
 		/* Two-character escapes. */
 		(*i)++;
@@ -335,6 +410,22 @@ pescape(struct termp *p, const char *word, size_t *i, size_t len)
 		(*i)++;
 		return;
 
+	} else if ('*' == word[*i]) { 
+		(*i)++;
+		assert(*i < len);
+		switch (word[*i]) {
+		case ('('):
+			(*i)++;
+			assert(*i + 1 < len);
+			nescape(p, &word[*i], 2);
+			(*i)++;
+			return;
+		default:
+			break;
+		}
+		nescape(p, &word[*i], 1);
+		return;
+
 	} else if ('[' != word[*i]) {
 		/* One-character escapes. */
 		switch (word[*i]) {
@@ -371,6 +462,12 @@ pword(struct termp *p, const char *word, size_t len)
 
 	/*assert(len > 0);*/ /* Can be, if literal. */
 
+	/*
+	 * Handle pwords, partial words, which may be either a single
+	 * word or a phrase that cannot be broken down (such as a
+	 * literal string).  This handles word styling.
+	 */
+
 	if ( ! (p->flags & TERMP_NOSPACE) && 
 			! (p->flags & TERMP_LITERAL))
 		chara(p, ' ');
@@ -378,6 +475,11 @@ pword(struct termp *p, const char *word, size_t len)
 	if ( ! (p->flags & TERMP_NONOSPACE))
 		p->flags &= ~TERMP_NOSPACE;
 
+	/* 
+	 * XXX - if literal and underlining, this will underline the
+	 * spaces between literal words. 
+	 */
+
 	if (p->flags & TERMP_BOLD)
 		style(p, STYLE_BOLD);
 	if (p->flags & TERMP_UNDERLINE)
@@ -402,6 +504,13 @@ word(struct termp *p, const char *word)
 {
 	size_t 		 i, j, len;
 
+	/*
+	 * Break apart a word into tokens.  If we're a literal word,
+	 * then don't.  This doesn't handle zero-length words (there
+	 * should be none) and makes sure that pword doesn't get spaces
+	 * or nil words unless literal.
+	 */
+
 	if (p->flags & TERMP_LITERAL) {
 		pword(p, word, strlen(word));
 		return;
@@ -443,6 +552,12 @@ body(struct termp *p, struct termpair *ppair,
 	int		 dochild;
 	struct termpair	 pair;
 
+	/*
+	 * This is the main function for printing out nodes.  It's
+	 * constituted of PRE and POST functions, which correspond to
+	 * prefix and postfix processing.
+	 */
+
 	/* Pre-processing. */
 
 	dochild = 1;
@@ -505,6 +620,13 @@ footer(struct termp *p, const struct mdoc_meta *meta)
 
 	(void)strlcpy(os, meta->os, p->rmargin);
 
+	/*
+	 * This is /slightly/ different from regular groff output
+	 * because we don't have page numbers.  Print the following:
+	 *
+	 * OS                                            MDOCDATE
+	 */
+
 	vspace(p);
 
 	p->flags |= TERMP_NOSPACE | TERMP_NOBREAK;
@@ -530,7 +652,7 @@ footer(struct termp *p, const struct mdoc_meta *meta)
 static void
 header(struct termp *p, const struct mdoc_meta *meta)
 {
-	char		*buf, *title;
+	char		*buf, *title, *bufp;
 	const char	*pp;
 
 	if (NULL == (buf = malloc(p->rmargin)))
@@ -569,8 +691,21 @@ header(struct termp *p, const struct mdoc_meta *meta)
 			break;
 		}
 
+	/*
+	 * The header is strange.  It has three components, which are
+	 * really two with the first duplicated.  It goes like this:
+	 *
+	 * IDENTIFIER              TITLE                   IDENTIFIER
+	 *
+	 * The IDENTIFIER is NAME(SECTION), which is the command-name
+	 * (if given, or "unknown" if not) followed by the manual page
+	 * section.  These are given in `Dt'.  The TITLE is a free-form
+	 * string depending on the manual volume.  If not specified, it
+	 * switches on the manual section.
+	 */
+
 	if (mdoc_arch2a(meta->arch))
-		(void)snprintf(buf, p->rmargin, "%s(%s)",
+		(void)snprintf(buf, p->rmargin, "%s (%s)",
 				pp, mdoc_arch2a(meta->arch));
 	else
 		(void)strlcpy(buf, pp, p->rmargin);
@@ -580,6 +715,9 @@ header(struct termp *p, const struct mdoc_meta *meta)
 	(void)snprintf(title, p->rmargin, "%s(%s)",
 			meta->title, pp ? pp : "");
 
+	for (bufp = title; *bufp; bufp++)
+		*bufp = toupper(*bufp);
+	
 	p->offset = 0;
 	p->rmargin = (p->maxrmargin - strlen(buf)) / 2;
 	p->flags |= TERMP_NOBREAK | TERMP_NOSPACE;
diff --git a/strings.c b/strings.c
index 5d143492..87a9b35e 100644
--- a/strings.c
+++ b/strings.c
@@ -1,4 +1,4 @@
-/* $Id: strings.c,v 1.20 2009/02/26 16:08:11 kristaps Exp $ */
+/* $Id: strings.c,v 1.21 2009/02/27 08:20:15 kristaps Exp $ */
 /*
  * Copyright (c) 2008 Kristaps Dzonsons <kristaps@kth.se>
  *
@@ -61,6 +61,18 @@ mdoc_isescape(const char *p)
 		/* FALLTHROUGH */
 	case ('e'):
 		return(2);
+	case ('*'):
+		if (0 == *++p || ! isgraph((int)*p))
+			return(0);
+		switch (*p) {
+		case ('('):
+			if (0 == *++p || ! isgraph((int)*p))
+				return(0);
+			return(4);
+		default:
+			break;
+		}
+		return(3);
 	case ('('):
 		if (0 == *++p || ! isgraph((int)*p))
 			return(0);
@@ -178,9 +190,9 @@ mdoc_atotime(const char *p)
 
 	(void)memset(&tm, 0, sizeof(struct tm));
 
-	if (xstrcmp(p, "$Mdocdate: February 26 2009 $"))
+	if (xstrcmp(p, "$Mdocdate: February 27 2009 $"))
 		return(time(NULL));
-	if ((pp = strptime(p, "$Mdocdate: February 26 2009 $", &tm)) && 0 == *pp)
+	if ((pp = strptime(p, "$Mdocdate: February 27 2009 $", &tm)) && 0 == *pp)
 		return(mktime(&tm));
 	/* XXX - this matches "June 1999", which is wrong. */
 	if ((pp = strptime(p, "%b %d %Y", &tm)) && 0 == *pp)
diff --git a/term.c b/term.c
index d3b7d07a..6b8a3fe3 100644
--- a/term.c
+++ b/term.c
@@ -1,4 +1,4 @@
-/* $Id: term.c,v 1.25 2009/02/26 16:08:11 kristaps Exp $ */
+/* $Id: term.c,v 1.26 2009/02/27 08:20:15 kristaps Exp $ */
 /*
  * Copyright (c) 2009 Kristaps Dzonsons <kristaps@kth.se>
  *
@@ -25,7 +25,7 @@
 
 #include "term.h"
 
-#define	INDENT		  4
+#define	INDENT		  6
 
 /*
  * Performs actions on nodes of the abstract syntax tree.  Both pre- and
@@ -279,7 +279,6 @@ arg_width(const struct mdoc_arg *arg)
 {
 	size_t		 len, i, v;
 
-	/* TODO */
 	assert(*arg->value);
 	if (0 == strcmp(*arg->value, "indent"))
 		return(INDENT);
@@ -1330,7 +1329,7 @@ termp_bq_pre(DECL_ARGS)
 
 	if (MDOC_BODY != node->type)
 		return(1);
-	word(p, "[");
+	word(p, "\\[");
 	p->flags |= TERMP_NOSPACE;
 	return(1);
 }
@@ -1354,7 +1353,7 @@ termp_pq_pre(DECL_ARGS)
 
 	if (MDOC_BODY != node->type)
 		return(1);
-	word(p, "(");
+	word(p, "\\&(");
 	p->flags |= TERMP_NOSPACE;
 	return(1);
 }
-- 
cgit v1.2.3-56-ge451