more info on man(7) .Xr hyperlinking

[mandoc.git] / preconv.c
diff --git a/preconv.c b/preconv.c

index 0c6076ecb425e932df8ec871d668439bf25fe9fc..1fc137a9640debca69e4d58312735b2f7dd8f6d9 100644 (file)
--- a/preconv.c
+++ b/preconv.c
@@ -1,4 +1,4 @@
-/*     $Id: preconv.c,v 1.12 2014/11/14 04:24:04 schwarze Exp $ */
+/*     $Id: preconv.c,v 1.15 2015/10/06 18:32:19 schwarze Exp $ */
  /*
   * Copyright (c) 2011 Kristaps Dzonsons <kristaps@bsd.lv>
   * Copyright (c) 2014 Ingo Schwarze <schwarze@openbsd.org>
@@ -19,6 +19,7 @@
  
  #include <sys/types.h>
  
+#include <assert.h>
  #include <stdio.h>
  #include <string.h>
  #include "mandoc.h"
@@ -28,98 +29,80 @@ int
  preconv_encode(struct buf *ib, size_t *ii, struct buf *ob, size_t *oi,
      int *filenc)
  {
-       size_t           i;
-       int              state;
+       unsigned char   *cu;
+       int              nby;
         unsigned int     accum;
-       unsigned char    cu;
+
+       cu = (unsigned char *)ib->buf + *ii;
+       assert(*cu & 0x80);
  
         if ( ! (*filenc & MPARSE_UTF8))
                 goto latin;
  
-       state = 0;
-       accum = 0U;
-
-       for (i = *ii; i < ib->sz; i++) {
-               cu = ib->buf[i];
-               if (state) {
-                       if ( ! (cu & 128) || (cu & 64)) {
-                               /* Bad sequence header. */
-                               break;
-                       }
-
-                       /* Accept only legitimate bit patterns. */
-
-                       if (cu > 191 || cu < 128) {
-                               /* Bad in-sequence bits. */
-                               break;
-                       }
-
-                       accum |= (cu & 63) << --state * 6;
-
-                       if (state)
-                               continue;
-
-                       if (accum < 0x80)
-                               ob->buf[(*oi)++] = accum;
-                       else
-                               *oi += snprintf(ob->buf + *oi,
-                                   11, "\\[u%.4X]", accum);
-                       *ii = i + 1;
-                       *filenc &= ~MPARSE_LATIN1;
-                       return(1);
-               } else {
-                       /*
-                        * Entering a UTF-8 state:  if we encounter a
-                        * UTF-8 bitmask, calculate the expected UTF-8
-                        * state from it.
-                        */
-                       for (state = 0; state < 7; state++)
-                               if ( ! (cu & (1 << (7 - state))))
-                                       break;
-
-                       /* Accept only legitimate bit patterns. */
-
-                       switch (state--) {
-                       case (4):
-                               if (cu <= 244 && cu >= 240) {
-                                       accum = (cu & 7) << 18;
-                                       continue;
-                               }
-                               /* Bad 4-sequence start bits. */
-                               break;
-                       case (3):
-                               if (cu <= 239 && cu >= 224) {
-                                       accum = (cu & 15) << 12;
-                                       continue;
-                               }
-                               /* Bad 3-sequence start bits. */
-                               break;
-                       case (2):
-                               if (cu <= 223 && cu >= 194) {
-                                       accum = (cu & 31) << 6;
-                                       continue;
-                               }
-                               /* Bad 2-sequence start bits. */
-                               break;
-                       default:
-                               /* Bad sequence bit mask. */
-                               break;
-                       }
-                       break;
-               }
+       nby = 1;
+       while (nby < 5 && *cu & (1 << (7 - nby)))
+               nby++;
+
+       switch (nby) {
+       case 2:
+               accum = *cu & 0x1f;
+               if (accum < 0x02)  /* Obfuscated ASCII. */
+                       goto latin;
+               break;
+       case 3:
+               accum = *cu & 0x0f;
+               break;
+       case 4:
+               accum = *cu & 0x07;
+               if (accum > 0x04) /* Beyond Unicode. */
+                       goto latin;
+               break;
+       default:  /* Bad sequence header. */
+               goto latin;
         }
  
-       /* FALLTHROUGH: Invalid or incomplete UTF-8 sequence. */
+       cu++;
+       switch (nby) {
+       case 3:
+               if ((accum == 0x00 && ! (*cu & 0x20)) ||  /* Use 2-byte. */
+                   (accum == 0x0d && *cu & 0x20))  /* Surrogates. */
+                       goto latin;
+               break;
+       case 4:
+               if ((accum == 0x00 && ! (*cu & 0x30)) ||  /* Use 3-byte. */
+                   (accum == 0x04 && *cu & 0x30))  /* Beyond Unicode. */
+                       goto latin;
+               break;
+       default:
+               break;
+       }
+
+       while (--nby) {
+               if ((*cu & 0xc0) != 0x80)  /* Invalid continuation. */
+                       goto latin;
+               accum <<= 6;
+               accum += *cu & 0x3f;
+               cu++;
+       }
+
+       assert(accum > 0x7f);
+       assert(accum < 0x110000);
+       assert(accum < 0xd800 || accum > 0xdfff);
+
+       *oi += snprintf(ob->buf + *oi, 11, "\\[u%.4X]", accum);
+       *ii = (char *)cu - ib->buf;
+       *filenc &= ~MPARSE_LATIN1;
+       return 1;
  
  latin:
         if ( ! (*filenc & MPARSE_LATIN1))
-               return(0);
+               return 0;
  
         *oi += snprintf(ob->buf + *oi, 11,
             "\\[u%.4X]", (unsigned char)ib->buf[(*ii)++]);
  
         *filenc &= ~MPARSE_UTF8;
-       return(1);
+       return 1;
  }
  
  int
@@ -140,7 +123,7 @@ preconv_cue(const struct buf *b, size_t offset)
  
         if ((sz = (size_t)(eoln - ln)) < 10 ||
             memcmp(ln, ".\\\" -*-", 7) || memcmp(eoln - 3, "-*-", 3))
-               return(MPARSE_UTF8 | MPARSE_LATIN1);
+               return MPARSE_UTF8 | MPARSE_LATIN1;
  
         /* Move after the header and adjust for the trailer. */
  
@@ -179,15 +162,15 @@ preconv_cue(const struct buf *b, size_t offset)
                         sz--;
                 }
                 if (0 == sz)
-                       return(0);
+                       return 0;
  
                 /* Check us against known encodings. */
  
                 if (phsz > 4 && !strncasecmp(ln, "utf-8", 5))
-                       return(MPARSE_UTF8);
+                       return MPARSE_UTF8;
                 if (phsz > 10 && !strncasecmp(ln, "iso-latin-1", 11))
-                       return(MPARSE_LATIN1);
-               return(0);
+                       return MPARSE_LATIN1;
+               return 0;
         }
-       return(MPARSE_UTF8 | MPARSE_LATIN1);
+       return MPARSE_UTF8 | MPARSE_LATIN1;
  }