]> git.cameronkatri.com Git - mandoc.git/blobdiff - preconv.c
If the layout or data of an individual cell in a tbl(7) contains
[mandoc.git] / preconv.c
index 0c6076ecb425e932df8ec871d668439bf25fe9fc..9ed627d4467d5a320f39c456f8b21c3c9280f0e7 100644 (file)
--- a/preconv.c
+++ b/preconv.c
@@ -1,4 +1,4 @@
-/*     $Id: preconv.c,v 1.12 2014/11/14 04:24:04 schwarze Exp $ */
+/*     $Id: preconv.c,v 1.17 2018/12/13 11:55:47 schwarze Exp $ */
 /*
  * Copyright (c) 2011 Kristaps Dzonsons <kristaps@bsd.lv>
  * Copyright (c) 2014 Ingo Schwarze <schwarze@openbsd.org>
 
 #include <sys/types.h>
 
+#include <assert.h>
 #include <stdio.h>
 #include <string.h>
+
 #include "mandoc.h"
+#include "roff.h"
+#include "mandoc_parse.h"
 #include "libmandoc.h"
 
 int
-preconv_encode(struct buf *ib, size_t *ii, struct buf *ob, size_t *oi,
+preconv_encode(const struct buf *ib, size_t *ii, struct buf *ob, size_t *oi,  /* Convert one non-ASCII character at ib->buf[*ii] into \[uXXXX] at ob->buf[*oi]; returns 1 if converted, 0 if not. */
     int *filenc)
 {
-       size_t           i;
-       int              state;
-       unsigned int     accum;
-       unsigned char    cu;
+       const unsigned char     *cu;  /* Read cursor into ib->buf. */
+       int                      nby;  /* Sequence length announced by the first byte. */
+       unsigned int             accum;  /* Code point assembled so far. */
+
+       cu = (const unsigned char *)ib->buf + *ii;  /* The sequence to convert starts at offset *ii. */
+       assert(*cu & 0x80);  /* Only bytes with the high bit set are handled here. */
 
        if ( ! (*filenc & MPARSE_UTF8))
                goto latin;
 
-       state = 0;
-       accum = 0U;
-
-       for (i = *ii; i < ib->sz; i++) {
-               cu = ib->buf[i];
-               if (state) {
-                       if ( ! (cu & 128) || (cu & 64)) {
-                               /* Bad sequence header. */
-                               break;
-                       }
-
-                       /* Accept only legitimate bit patterns. */
-
-                       if (cu > 191 || cu < 128) {
-                               /* Bad in-sequence bits. */
-                               break;
-                       }
-
-                       accum |= (cu & 63) << --state * 6;
-
-                       if (state)
-                               continue;
-
-                       if (accum < 0x80)
-                               ob->buf[(*oi)++] = accum;
-                       else
-                               *oi += snprintf(ob->buf + *oi,
-                                   11, "\\[u%.4X]", accum);
-                       *ii = i + 1;
-                       *filenc &= ~MPARSE_LATIN1;
-                       return(1);
-               } else {
-                       /*
-                        * Entering a UTF-8 state:  if we encounter a
-                        * UTF-8 bitmask, calculate the expected UTF-8
-                        * state from it.
-                        */
-                       for (state = 0; state < 7; state++)
-                               if ( ! (cu & (1 << (7 - state))))
-                                       break;
-
-                       /* Accept only legitimate bit patterns. */
-
-                       switch (state--) {
-                       case (4):
-                               if (cu <= 244 && cu >= 240) {
-                                       accum = (cu & 7) << 18;
-                                       continue;
-                               }
-                               /* Bad 4-sequence start bits. */
-                               break;
-                       case (3):
-                               if (cu <= 239 && cu >= 224) {
-                                       accum = (cu & 15) << 12;
-                                       continue;
-                               }
-                               /* Bad 3-sequence start bits. */
-                               break;
-                       case (2):
-                               if (cu <= 223 && cu >= 194) {
-                                       accum = (cu & 31) << 6;
-                                       continue;
-                               }
-                               /* Bad 2-sequence start bits. */
-                               break;
-                       default:
-                               /* Bad sequence bit mask. */
-                               break;
-                       }
-                       break;
-               }
+       nby = 1;  /* Bit 7 is known to be set; count the leading 1 bits, capped at 5. */
+       while (nby < 5 && *cu & (1 << (7 - nby)))
+               nby++;
+
+       switch (nby) {  /* Extract the payload bits of the first byte. */
+       case 2:
+               accum = *cu & 0x1f;
+               if (accum < 0x02)  /* Obfuscated ASCII: the value would fit in 7 bits. */
+                       goto latin;
+               break;
+       case 3:
+               accum = *cu & 0x0f;
+               break;
+       case 4:
+               accum = *cu & 0x07;
+               if (accum > 0x04) /* Beyond Unicode: first byte above 0xf4. */
+                       goto latin;
+               break;
+       default:  /* Bad sequence header: nby is 1 or 5 here. */
+               goto latin;
        }
 
-       /* FALLTHROUGH: Invalid or incomplete UTF-8 sequence. */
+       cu++;  /* NOTE(review): bytes after the first are read without a check against ib->sz - confirm the buffer is terminated. */
+       switch (nby) {  /* Range checks that depend on the second byte. */
+       case 3:
+               if ((accum == 0x00 && ! (*cu & 0x20)) ||  /* Use 2-byte. */
+                   (accum == 0x0d && *cu & 0x20))  /* Surrogates. */
+                       goto latin;
+               break;
+       case 4:
+               if ((accum == 0x00 && ! (*cu & 0x30)) ||  /* Use 3-byte. */
+                   (accum == 0x04 && *cu & 0x30))  /* Beyond Unicode. */
+                       goto latin;
+               break;
+       default:
+               break;
+       }
+
+       while (--nby) {  /* Consume the nby - 1 continuation bytes. */
+               if ((*cu & 0xc0) != 0x80)  /* Invalid continuation. */
+                       goto latin;
+               accum <<= 6;  /* Each continuation byte contributes 6 bits. */
+               accum += *cu & 0x3f;
+               cu++;
+       }
+
+       assert(accum > 0x7f);  /* Not ASCII. */
+       assert(accum < 0x110000);  /* At most 0x10ffff. */
+       assert(accum < 0xd800 || accum > 0xdfff);  /* Not a surrogate. */
+
+       *oi += snprintf(ob->buf + *oi, 11, "\\[u%.4X]", accum);  /* Limit 11: three bytes "\[u", up to 6 hex digits, "]" and NUL. */
+       *ii = (const char *)cu - ib->buf;  /* Input offset now points just past the sequence. */
+       *filenc &= ~MPARSE_LATIN1;  /* Valid UTF-8 was seen: drop the Latin-1 option. */
+       return 1;
 
 latin:
        if ( ! (*filenc & MPARSE_LATIN1))
-               return(0);
+               return 0;  /* Latin-1 is not enabled either: nothing consumed or written. */
 
        *oi += snprintf(ob->buf + *oi, 11,
            "\\[u%.4X]", (unsigned char)ib->buf[(*ii)++]);
 
        *filenc &= ~MPARSE_UTF8;
-       return(1);
+       return 1;  /* One byte was converted as Latin-1. */
 }
 
 int
@@ -140,7 +126,7 @@ preconv_cue(const struct buf *b, size_t offset)
 
        if ((sz = (size_t)(eoln - ln)) < 10 ||
            memcmp(ln, ".\\\" -*-", 7) || memcmp(eoln - 3, "-*-", 3))
-               return(MPARSE_UTF8 | MPARSE_LATIN1);
+               return MPARSE_UTF8 | MPARSE_LATIN1;
 
        /* Move after the header and adjust for the trailer. */
 
@@ -179,15 +165,15 @@ preconv_cue(const struct buf *b, size_t offset)
                        sz--;
                }
                if (0 == sz)
-                       return(0);
+                       return 0;
 
                /* Check us against known encodings. */
 
                if (phsz > 4 && !strncasecmp(ln, "utf-8", 5))
-                       return(MPARSE_UTF8);
+                       return MPARSE_UTF8;
                if (phsz > 10 && !strncasecmp(ln, "iso-latin-1", 11))
-                       return(MPARSE_LATIN1);
-               return(0);
+                       return MPARSE_LATIN1;
+               return 0;
        }
-       return(MPARSE_UTF8 | MPARSE_LATIN1);
+       return MPARSE_UTF8 | MPARSE_LATIN1;
 }