Use C99 uint32_t, not BSD-style u_int32_t.

[mandoc.git] / mandocdb.c
diff --git a/mandocdb.c b/mandocdb.c

index 0acd9924dc95d3a58cdfc7bd1a60a0f13e7293c2..28d11e0bb0569e1557e5a6c30b549b724c1ba5b7 100644 (file)
--- a/mandocdb.c
+++ b/mandocdb.c
@@ -1,7 +1,7 @@
-/*     $Id: mandocdb.c,v 1.89 2013/12/27 20:35:51 schwarze Exp $ */
+/*     $Id: mandocdb.c,v 1.96 2014/01/02 22:44:10 schwarze Exp $ */
  /*
   * Copyright (c) 2011, 2012 Kristaps Dzonsons <kristaps@bsd.lv>
- * Copyright (c) 2011, 2012, 2013 Ingo Schwarze <schwarze@openbsd.org>
+ * Copyright (c) 2011, 2012, 2013, 2014 Ingo Schwarze <schwarze@openbsd.org>
   *
   * Permission to use, copy, modify, and distribute this software for any
   * purpose with or without fee is hereby granted, provided that the above
@@ -82,10 +82,10 @@ enum        form {
  };
  
  struct str {
-       char            *utf8; /* key in UTF-8 form */
+       char            *rendered; /* key in UTF-8 or ASCII form */
         const struct mpage *mpage; /* if set, the owning parse */
         uint64_t         mask; /* bitmask in sequence */
-       char             key[]; /* the string itself */
+       char             key[]; /* may contain escape sequences */
  };
  
  struct inodev {
@@ -143,6 +143,7 @@ static      void    *hash_alloc(size_t, void *);
  static void     hash_free(void *, size_t, void *);
  static void    *hash_halloc(size_t, void *);
  static void     mlink_add(struct mlink *, const struct stat *);
+static int      mlink_check(struct mpage *, struct mlink *);
  static void     mlink_free(struct mlink *);
  static void     mlinks_undupe(struct mpage *);
  static void     mpages_free(void);
@@ -154,11 +155,9 @@ static     int      parse_mdoc_body(struct mpage *, const struct mdoc_node *);
  static int      parse_mdoc_head(struct mpage *, const struct mdoc_node *);
  static int      parse_mdoc_Fd(struct mpage *, const struct mdoc_node *);
  static int      parse_mdoc_Fn(struct mpage *, const struct mdoc_node *);
-static int      parse_mdoc_In(struct mpage *, const struct mdoc_node *);
  static int      parse_mdoc_Nd(struct mpage *, const struct mdoc_node *);
  static int      parse_mdoc_Nm(struct mpage *, const struct mdoc_node *);
  static int      parse_mdoc_Sh(struct mpage *, const struct mdoc_node *);
-static int      parse_mdoc_St(struct mpage *, const struct mdoc_node *);
  static int      parse_mdoc_Xr(struct mpage *, const struct mdoc_node *);
  static void     putkey(const struct mpage *,
                         const char *, uint64_t);
@@ -166,17 +165,18 @@ static    void     putkeys(const struct mpage *,
                         const char *, size_t, uint64_t);
  static void     putmdockey(const struct mpage *,
                         const struct mdoc_node *, uint64_t);
+static void     render_key(struct mchars *, struct str *);
  static void     say(const char *, const char *, ...);
  static int      set_basedir(const char *);
  static int      treescan(void);
  static size_t   utf8(unsigned int, char [7]);
-static void     utf8key(struct mchars *, struct str *);
  
  static char            *progname;
  static int              use_all; /* use all found files */
  static int              nodb; /* no database changes */
  static int              verb; /* print what we're doing */
  static int              warnings; /* warn about crap */
+static int              write_utf8; /* write UTF-8 output; else ASCII */
  static int              exitcode; /* to be returned by main */
  static enum op          op; /* operational mode */
  static char             basedir[PATH_MAX]; /* current base directory */
@@ -216,7 +216,7 @@ static      const struct mdoc_handler mdocs[MDOC_MAX] = {
         { parse_mdoc_Fn, 0 },  /* Fn */
         { NULL, TYPE_Ft },  /* Ft */
         { NULL, TYPE_Ic },  /* Ic */
-       { parse_mdoc_In, TYPE_In },  /* In */
+       { NULL, TYPE_In },  /* In */
         { NULL, TYPE_Li },  /* Li */
         { parse_mdoc_Nd, TYPE_Nd },  /* Nd */
         { parse_mdoc_Nm, TYPE_Nm },  /* Nm */
@@ -224,7 +224,7 @@ static      const struct mdoc_handler mdocs[MDOC_MAX] = {
         { NULL, 0 },  /* Ot */
         { NULL, TYPE_Pa },  /* Pa */
         { NULL, 0 },  /* Rv */
-       { parse_mdoc_St, 0 },  /* St */
+       { NULL, TYPE_St },  /* St */
         { NULL, TYPE_Va },  /* Va */
         { parse_mdoc_body, TYPE_Va },  /* Vt */
         { parse_mdoc_Xr, 0 },  /* Xr */
@@ -352,7 +352,7 @@ main(int argc, char *argv[])
         path_arg = NULL;
         op = OP_DEFAULT;
  
-       while (-1 != (ch = getopt(argc, argv, "aC:d:ntu:vW")))
+       while (-1 != (ch = getopt(argc, argv, "aC:d:nT:tu:vW")))
                 switch (ch) {
                 case ('a'):
                         use_all = 1;
@@ -370,6 +370,14 @@ main(int argc, char *argv[])
                 case ('n'):
                         nodb = 1;
                         break;
+               case ('T'):
+                       if (strcmp(optarg, "utf8")) {
+                               fprintf(stderr, "-T%s: Unsupported "
+                                   "output format\n", optarg);
+                               goto usage;
+                       }
+                       write_utf8 = 1;
+                       break;
                 case ('t'):
                         CHECKOP(op, ch);
                         dup2(STDOUT_FILENO, STDERR_FILENO);
@@ -491,9 +499,9 @@ out:
         ohash_delete(&mlinks);
         return(exitcode);
  usage:
-       fprintf(stderr, "usage: %s [-anvW] [-C file]\n"
-                       "       %s [-anvW] dir ...\n"
-                       "       %s [-nvW] -d dir [file ...]\n"
+       fprintf(stderr, "usage: %s [-anvW] [-C file] [-Tutf8]\n"
+                       "       %s [-anvW] [-Tutf8] dir ...\n"
+                       "       %s [-nvW] [-Tutf8] -d dir [file ...]\n"
                         "       %s [-nvW] -u dir [file ...]\n"
                         "       %s -t file ...\n",
                        progname, progname, progname, 
@@ -523,8 +531,8 @@ treescan(void)
         FTSENT          *ff;
         struct mlink    *mlink;
         int              dform;
-       char            *fsec;
-       const char      *dsec, *arch, *cp, *path;
+       char            *dsec, *arch, *fsec, *cp;
+       const char      *path;
         const char      *argv[2];
  
         argv[0] = ".";
@@ -589,16 +597,14 @@ treescan(void)
                                 continue;
                         } else
                                 fsec[-1] = '\0';
+
                         mlink = mandoc_calloc(1, sizeof(struct mlink));
                         strlcpy(mlink->file, path, sizeof(mlink->file));
                         mlink->dform = dform;
-                       if (NULL != dsec)
-                               mlink->dsec = mandoc_strdup(dsec);
-                       if (NULL != arch)
-                               mlink->arch = mandoc_strdup(arch);
-                       mlink->name = mandoc_strdup(ff->fts_name);
-                       if (NULL != fsec)
-                               mlink->fsec = mandoc_strdup(fsec);
+                       mlink->dsec = dsec;
+                       mlink->arch = arch;
+                       mlink->name = ff->fts_name;
+                       mlink->fsec = fsec;
                         mlink_add(mlink, ff->fts_statp);
                         continue;
                 } else if (FTS_D != ff->fts_info &&
@@ -618,8 +624,6 @@ treescan(void)
                          * Try to infer this from the name.
                          * If we're not in use_all, enforce it.
                          */
-                       dsec = NULL;
-                       dform = FORM_NONE;
                         cp = ff->fts_name;
                         if (FTS_DP == ff->fts_info)
                                 break;
@@ -630,6 +634,9 @@ treescan(void)
                         } else if (0 == strncmp(cp, "cat", 3)) {
                                 dform = FORM_CAT;
                                 dsec = cp + 3;
+                       } else {
+                               dform = FORM_NONE;
+                               dsec = NULL;
                         }
  
                         if (NULL != dsec || use_all) 
@@ -644,9 +651,10 @@ treescan(void)
                          * Possibly our architecture.
                          * If we're descending, keep tabs on it.
                          */
-                       arch = NULL;
                         if (FTS_DP != ff->fts_info && NULL != dsec)
                                 arch = ff->fts_name;
+                       else
+                               arch = NULL;
                         break;
                 default:
                         if (FTS_DP == ff->fts_info || use_all)
@@ -720,16 +728,16 @@ filescan(const char *file)
                 *p++ = '\0';
                 if (0 == strncmp(start, "man", 3)) {
                         mlink->dform = FORM_SRC;
-                       mlink->dsec = mandoc_strdup(start + 3);
+                       mlink->dsec = start + 3;
                 } else if (0 == strncmp(start, "cat", 3)) {
                         mlink->dform = FORM_CAT;
-                       mlink->dsec = mandoc_strdup(start + 3);
+                       mlink->dsec = start + 3;
                 }
  
                 start = p;
                 if (NULL != mlink->dsec && NULL != (p = strchr(start, '/'))) {
                         *p++ = '\0';
-                       mlink->arch = mandoc_strdup(start);
+                       mlink->arch = start;
                         start = p;
                 }
         }
@@ -744,7 +752,7 @@ filescan(const char *file)
  
         if ('.' == *p) {
                 *p++ = '\0';
-               mlink->fsec = mandoc_strdup(p);
+               mlink->fsec = p;
         }
  
         /*
@@ -756,8 +764,6 @@ filescan(const char *file)
                 mlink->name = p + 1;
                 *p = '\0';
         }
-       mlink->name = mandoc_strdup(mlink->name);
-
         mlink_add(mlink, &st);
  }
  
@@ -770,14 +776,10 @@ mlink_add(struct mlink *mlink, const struct stat *st)
  
         assert(NULL != mlink->file);
  
-       if (NULL == mlink->dsec)
-               mlink->dsec = mandoc_strdup("");
-       if (NULL == mlink->arch)
-               mlink->arch = mandoc_strdup("");
-       if (NULL == mlink->name)
-               mlink->name = mandoc_strdup("");
-       if (NULL == mlink->fsec)
-               mlink->fsec = mandoc_strdup("");
+       mlink->dsec = mandoc_strdup(mlink->dsec ? mlink->dsec : "");
+       mlink->arch = mandoc_strdup(mlink->arch ? mlink->arch : "");
+       mlink->name = mandoc_strdup(mlink->name ? mlink->name : "");
+       mlink->fsec = mandoc_strdup(mlink->fsec ? mlink->fsec : "");
  
         if ('0' == *mlink->fsec) {
                 free(mlink->fsec);
@@ -855,16 +857,16 @@ mlinks_undupe(struct mpage *mpage)
         char             *bufp;
  
         mpage->form = FORM_CAT;
-       for(prev = &mpage->mlinks; *prev; prev = &(*prev)->next) {
-               mlink = *prev;
+       prev = &mpage->mlinks;
+       while (NULL != (mlink = *prev)) {
                 if (FORM_CAT != mlink->dform) {
                         mpage->form = FORM_NONE;
-                       continue;
+                       goto nextlink;
                 }
                 if (strlcpy(buf, mlink->file, PATH_MAX) >= PATH_MAX) {
                         if (warnings)
                                 say(mlink->file, "Filename too long");
-                       continue;
+                       goto nextlink;
                 }
                 bufp = strstr(buf, "cat");
                 assert(NULL != bufp);
@@ -874,15 +876,65 @@ mlinks_undupe(struct mpage *mpage)
                 strlcat(buf, mlink->dsec, PATH_MAX);
                 if (NULL == ohash_find(&mlinks,
                                 ohash_qlookup(&mlinks, buf)))
-                       continue;
+                       goto nextlink;
                 if (warnings)
                         say(mlink->file, "Man source exists: %s", buf);
                 if (use_all)
-                       continue;
+                       goto nextlink;
                 *prev = mlink->next;
                 mlink_free(mlink);
-               mlink = *prev;
+               continue;
+nextlink:
+               prev = &(*prev)->next;
+       }
+}
+
+static int
+mlink_check(struct mpage *mpage, struct mlink *mlink)
+{
+       int      match;
+
+       match = 1;
+
+       /*
+        * Check whether the manual section given in a file
+        * agrees with the directory where the file is located.
+        * Some manuals have suffixes like (3p) on their
+        * section number either inside the file or in the
+        * directory name, some are linked into more than one
+        * section, like encrypt(1) = makekey(8).
+        */
+
+       if (FORM_SRC == mpage->form &&
+           strcasecmp(mpage->sec, mlink->dsec)) {
+               match = 0;
+               say(mlink->file, "Section \"%s\" manual in %s directory",
+                   mpage->sec, mlink->dsec);
         }
+
+       /*
+        * Manual page directories exist for each kernel
+        * architecture as returned by machine(1).
+        * However, many manuals only depend on the
+        * application architecture as returned by arch(1).
+        * For example, some (2/ARM) manuals are shared
+        * across the "armish" and "zaurus" kernel
+        * architectures.
+        * A few manuals are even shared across completely
+        * different architectures, for example fdformat(1)
+        * on amd64, i386, sparc, and sparc64.
+        */
+
+       if (strcasecmp(mpage->arch, mlink->arch)) {
+               match = 0;
+               say(mlink->file, "Architecture \"%s\" manual in "
+                   "\"%s\" directory", mpage->arch, mlink->arch);
+       }
+
+       if (strcasecmp(mpage->title, mlink->name))
+               match = 0;
+
+       return(match);
  }
  
  /*
@@ -898,6 +950,7 @@ mpages_merge(struct mchars *mc, struct mparse *mp, int check_reachable)
         struct ohash             title_table;
         struct ohash_info        title_info, str_info;
         struct mpage            *mpage;
+       struct mlink            *mlink;
         struct mdoc             *mdoc;
         struct man              *man;
         struct title            *title_entry;
@@ -932,7 +985,6 @@ mpages_merge(struct mchars *mc, struct mparse *mp, int check_reachable)
                 mparse_reset(mp);
                 mdoc = NULL;
                 man = NULL;
-               match = 1;
  
                 /*
                  * Try interpreting the file as mdoc(7) or man(7)
@@ -973,49 +1025,17 @@ mpages_merge(struct mchars *mc, struct mparse *mp, int check_reachable)
                             mandoc_strdup(mpage->mlinks->name);
                 }
  
-               /*
-                * Check whether the manual section given in a file
-                * agrees with the directory where the file is located.
-                * Some manuals have suffixes like (3p) on their
-                * section number either inside the file or in the
-                * directory name, some are linked into more than one
-                * section, like encrypt(1) = makekey(8).  Do not skip
-                * manuals for such reasons.
-                */
-               if (warnings && !use_all && FORM_SRC == mpage->form &&
-                   strcasecmp(mpage->sec, mpage->mlinks->dsec)) {
-                       match = 0;
-                       say(mpage->mlinks->file, "Section \"%s\" "
-                               "manual in %s directory",
-                               mpage->sec, mpage->mlinks->dsec);
-               }
+               for (mlink = mpage->mlinks; mlink; mlink = mlink->next)
+                       putkey(mpage, mlink->name, TYPE_Nm);
  
-               /*
-                * Manual page directories exist for each kernel
-                * architecture as returned by machine(1).
-                * However, many manuals only depend on the
-                * application architecture as returned by arch(1).
-                * For example, some (2/ARM) manuals are shared
-                * across the "armish" and "zaurus" kernel
-                * architectures.
-                * A few manuals are even shared across completely
-                * different architectures, for example fdformat(1)
-                * on amd64, i386, sparc, and sparc64.
-                * Thus, warn about architecture mismatches,
-                * but don't skip manuals for this reason.
-                */
-               if (warnings && !use_all &&
-                   strcasecmp(mpage->arch, mpage->mlinks->arch)) {
+               if (warnings && !use_all) {
                         match = 0;
-                       say(mpage->mlinks->file, "Architecture \"%s\" "
-                               "manual in \"%s\" directory",
-                               mpage->arch, mpage->mlinks->arch);
-               }
-               if (warnings && !use_all &&
-                   strcasecmp(mpage->title, mpage->mlinks->name))
-                       match = 0;
-
-               putkey(mpage, mpage->mlinks->name, TYPE_Nm);
+                       for (mlink = mpage->mlinks; mlink;
+                            mlink = mlink->next)
+                               if (mlink_check(mpage, mlink))
+                                       match = 1;
+               } else
+                       match = 1;
  
                 if (NULL != mdoc) {
                         if (NULL != (cp = mdoc_meta(mdoc)->name))
@@ -1439,18 +1459,7 @@ parse_mdoc_Fd(struct mpage *mpage, const struct mdoc_node *n)
  
         if (end > start)
                 putkeys(mpage, start, end - start + 1, TYPE_In);
-       return(1);
-}
-
-static int
-parse_mdoc_In(struct mpage *mpage, const struct mdoc_node *n)
-{
-
-       if (NULL != n->child && MDOC_TEXT == n->child->type)
-               return(0);
-
-       putkey(mpage, n->child->string, TYPE_In);
-       return(1);
+       return(0);
  }
  
  static int
@@ -1486,17 +1495,6 @@ parse_mdoc_Fn(struct mpage *mpage, const struct mdoc_node *n)
         return(0);
  }
  
-static int
-parse_mdoc_St(struct mpage *mpage, const struct mdoc_node *n)
-{
-
-       if (NULL == n->child || MDOC_TEXT != n->child->type)
-               return(0);
-
-       putkey(mpage, n->child->string, TYPE_St);
-       return(1);
-}
-
  static int
  parse_mdoc_Xr(struct mpage *mpage, const struct mdoc_node *n)
  {
@@ -1554,12 +1552,8 @@ static int
  parse_mdoc_Nm(struct mpage *mpage, const struct mdoc_node *n)
  {
  
-       if (SEC_NAME == n->sec)
-               return(1);
-       else if (SEC_SYNOPSIS != n->sec || MDOC_HEAD != n->type)
-               return(0);
-
-       return(1);
+       return(SEC_NAME == n->sec ||
+           (SEC_SYNOPSIS == n->sec && MDOC_HEAD == n->type));
  }
  
  static int
@@ -1668,11 +1662,11 @@ utf8(unsigned int cp, char out[7])
  }
  
  /*
- * Store the UTF-8 version of a key, or alias the pointer if the key has
- * no UTF-8 transcription marks in it.
+ * Store the rendered version of a key, or alias the pointer
+ * if the key contains no escape sequences.
   */
  static void
-utf8key(struct mchars *mc, struct str *key)
+render_key(struct mchars *mc, struct str *key)
  {
         size_t           sz, bsz, pos;
         char             utfbuf[7], res[5];
@@ -1681,7 +1675,7 @@ utf8key(struct mchars *mc, struct str *key)
         int              len, u;
         enum mandoc_esc  esc;
  
-       assert(NULL == key->utf8);
+       assert(NULL == key->rendered);
  
         res[0] = '\\';
         res[1] = '\t';
@@ -1697,7 +1691,7 @@ utf8key(struct mchars *mc, struct str *key)
          * pointer as ourselves and get out of here.
          */
         if (strcspn(val, res) == bsz) {
-               key->utf8 = key->key;
+               key->rendered = key->key;
                 return;
         } 
  
@@ -1732,50 +1726,58 @@ utf8key(struct mchars *mc, struct str *key)
                 /* Read past the slash. */
  
                 val++;
-               u = 0;
  
                 /*
                  * Parse the escape sequence and see if it's a
                  * predefined character or special character.
                  */
+
                 esc = mandoc_escape
                         ((const char **)&val, &seq, &len);
                 if (ESCAPE_ERROR == esc)
                         break;
-
                 if (ESCAPE_SPECIAL != esc)
                         continue;
-               if (0 == (u = mchars_spec2cp(mc, seq, len)))
-                       continue;
  
                 /*
-                * If we have a Unicode codepoint, try to convert that
-                * to a UTF-8 byte string.
+                * Render the special character
+                * as either UTF-8 or ASCII.
                  */
-               cpp = utfbuf;
-               if (0 == (sz = utf8(u, utfbuf)))
-                       continue;
+
+               if (write_utf8) {
+                       if (0 == (u = mchars_spec2cp(mc, seq, len)))
+                               continue;
+                       cpp = utfbuf;
+                       if (0 == (sz = utf8(u, utfbuf)))
+                               continue;
+                       sz = strlen(cpp);
+               } else {
+                       cpp = mchars_spec2str(mc, seq, len, &sz);
+                       if (NULL == cpp)
+                               continue;
+                       if (ASCII_NBRSP == *cpp) {
+                               cpp = " ";
+                               sz = 1;
+                       }
+               }
  
                 /* Copy the rendered glyph into the stream. */
  
-               sz = strlen(cpp);
                 bsz += sz;
-
                 buf = mandoc_realloc(buf, bsz);
-
                 memcpy(&buf[pos], cpp, sz);
                 pos += sz;
         }
  
         buf[pos] = '\0';
-       key->utf8 = buf;
+       key->rendered = buf;
  }
  
  /*
   * Flush the current page's terms (and their bits) into the database.
   * Wrap the entire set of additions in a transaction to make sqlite be a
   * little faster.
- * Also, UTF-8-encode the description at the last possible moment.
+ * Also, handle escape sequences at the last possible moment.
   */
  static void
  dbindex(const struct mpage *mpage, struct mchars *mc)
@@ -1798,9 +1800,9 @@ dbindex(const struct mpage *mpage, struct mchars *mc)
                 key = ohash_find(&strings,
                         ohash_qlookup(&strings, mpage->desc));
                 assert(NULL != key);
-               if (NULL == key->utf8)
-                       utf8key(mc, key);
-               desc = key->utf8;
+               if (NULL == key->rendered)
+                       render_key(mc, key);
+               desc = key->rendered;
         }
  
         SQL_EXEC("BEGIN TRANSACTION");
@@ -1834,16 +1836,16 @@ dbindex(const struct mpage *mpage, struct mchars *mc)
         for (key = ohash_first(&strings, &slot); NULL != key;
              key = ohash_next(&strings, &slot)) {
                 assert(key->mpage == mpage);
-               if (NULL == key->utf8)
-                       utf8key(mc, key);
+               if (NULL == key->rendered)
+                       render_key(mc, key);
                 i = 1;
                 SQL_BIND_INT64(stmts[STMT_INSERT_KEY], i, key->mask);
-               SQL_BIND_TEXT(stmts[STMT_INSERT_KEY], i, key->utf8);
+               SQL_BIND_TEXT(stmts[STMT_INSERT_KEY], i, key->rendered);
                 SQL_BIND_INT64(stmts[STMT_INSERT_KEY], i, recno);
                 SQL_STEP(stmts[STMT_INSERT_KEY]);
                 sqlite3_reset(stmts[STMT_INSERT_KEY]);
-               if (key->utf8 != key->key)
-                       free(key->utf8);
+               if (key->rendered != key->key)
+                       free(key->rendered);
                 free(key);
         }