git.cameronkatri.com Git - mandoc.git/blobdiff - preconv.c
Tedu support for the -xsh4.2 argument to the mdoc(7) .St macro
[mandoc.git] / preconv.c
index ad2bf5e20727c32dd5a8a97470c4370091cb1906..9ed627d4467d5a320f39c456f8b21c3c9280f0e7 100644 (file)
--- a/preconv.c
+++ b/preconv.c
@@ -1,4 +1,4 @@
-/*     $Id: preconv.c,v 1.9 2014/10/25 01:03:52 schwarze Exp $ */
+/*     $Id: preconv.c,v 1.17 2018/12/13 11:55:47 schwarze Exp $ */
 /*
  * Copyright (c) 2011 Kristaps Dzonsons <kristaps@bsd.lv>
  * Copyright (c) 2014 Ingo Schwarze <schwarze@openbsd.org>
 
 #include <sys/types.h>
 
+#include <assert.h>
 #include <stdio.h>
 #include <string.h>
+
 #include "mandoc.h"
+#include "roff.h"
+#include "mandoc_parse.h"
 #include "libmandoc.h"
 
 /*
  * Convert the non-ASCII input sequence starting at ib->buf[*ii]
  * into an escape sequence of the form \[uXXXX], written at
  * ob->buf[*oi].
  * *filenc is a bit mask of MPARSE_UTF8 and MPARSE_LATIN1 naming
  * the input encodings still considered possible.  If MPARSE_UTF8
  * is set, strict UTF-8 decoding is tried first; if that is not
  * allowed or the sequence is malformed, the first byte alone is
  * emitted as the code point of equal value, provided that
  * MPARSE_LATIN1 is set.
  * On success, advance *ii past the consumed input and *oi past
  * the generated output, clear the flag of the encoding that was
  * not used, and return 1.  Return 0 without producing output or
  * consuming input if neither encoding is applicable.
  * NOTE(review): neither ib->sz nor the room left in ob is checked
  * here; presumably the caller guarantees a terminated input buffer
  * and at least 11 bytes of output space -- confirm.
  */
 int
-preconv_encode(struct buf *ib, struct buf *ob, int *filenc)
+preconv_encode(const struct buf *ib, size_t *ii, struct buf *ob, size_t *oi,
+    int *filenc)
 {
-       int              state, be;
-       unsigned int     accum;
-       size_t           i;
-       unsigned char    cu;
-       const long       one = 1L;
+       const unsigned char     *cu;  /* Current input byte. */
+       int                      nby;  /* Length of the sequence in bytes. */
+       unsigned int             accum;  /* Code point decoded so far. */
+
+       cu = (const unsigned char *)ib->buf + *ii;
+       assert(*cu & 0x80);  /* Must be entered on a byte with the high bit set. */
 
        if ( ! (*filenc & MPARSE_UTF8))
                goto latin;
 
-       state = 0;
-       accum = 0U;
-       be = 0;
-
-       /* Quick test for big-endian value. */
-
-       if ( ! (*((const char *)(&one))))
-               be = 1;
-
-       for (i = ib->offs; i < ib->sz; i++) {
-               cu = ib->buf[i];
-               if (state) {
-                       if ( ! (cu & 128) || (cu & 64)) {
-                               /* Bad sequence header. */
-                               break;
-                       }
-
-                       /* Accept only legitimate bit patterns. */
-
-                       if (cu > 191 || cu < 128) {
-                               /* Bad in-sequence bits. */
-                               break;
-                       }
-
-                       accum |= (cu & 63) << --state * 6;
-
-                       if (state)
-                               continue;
-
-                       /*
-                        * Accum is held in little-endian order as
-                        * stipulated by the UTF-8 sequence coding.  We
-                        * need to convert to a native big-endian if our
-                        * architecture requires it.
-                        */
-
-                       if (be)
-                               accum = (accum >> 24) | 
-                                       ((accum << 8) & 0x00FF0000) |
-                                       ((accum >> 8) & 0x0000FF00) |
-                                       (accum << 24);
-
-                       if (accum < 0x80)
-                               ob->buf[ob->offs++] = accum;
-                       else
-                               ob->offs += snprintf(ob->buf + ob->offs,
-                                   11, "\\[u%.4X]", accum);
-                       ib->offs = i + 1;
-                       *filenc &= ~MPARSE_LATIN1;
-                       return(1);
-               } else {
-                       /*
-                        * Entering a UTF-8 state:  if we encounter a
-                        * UTF-8 bitmask, calculate the expected UTF-8
-                        * state from it.
-                        */
-                       for (state = 0; state < 7; state++) 
-                               if ( ! (cu & (1 << (7 - state))))
-                                       break;
-
-                       /* Accept only legitimate bit patterns. */
-
-                       switch (state--) {
-                       case (4):
-                               if (cu <= 244 && cu >= 240) {
-                                       accum = (cu & 7) << 18;
-                                       continue;
-                               }
-                               /* Bad 4-sequence start bits. */
-                               break;
-                       case (3):
-                               if (cu <= 239 && cu >= 224) {
-                                       accum = (cu & 15) << 12;
-                                       continue;
-                               }
-                               /* Bad 3-sequence start bits. */
-                               break;
-                       case (2):
-                               if (cu <= 223 && cu >= 194) {
-                                       accum = (cu & 31) << 6;
-                                       continue;
-                               }
-                               /* Bad 2-sequence start bits. */
-                               break;
-                       default:
-                               /* Bad sequence bit mask. */
-                               break;
-                       }
-                       break;
-               }
+       nby = 1;  /* Count the leading 1 bits of the first byte, capped at 5. */
+       while (nby < 5 && *cu & (1 << (7 - nby)))
+               nby++;
+
+       switch (nby) {  /* Extract the payload bits of the first byte. */
+       case 2:
+               accum = *cu & 0x1f;
+               if (accum < 0x02)  /* Obfuscated ASCII: overlong form of a 7-bit value. */
+                       goto latin;
+               break;
+       case 3:
+               accum = *cu & 0x0f;
+               break;
+       case 4:
+               accum = *cu & 0x07;
+               if (accum > 0x04) /* Beyond Unicode: first byte above 0xf4. */
+                       goto latin;
+               break;
+       default:  /* Bad sequence header: lone continuation byte or 5+ leading 1 bits. */
+               goto latin;
        }
 
-       /* FALLTHROUGH: Invalid or incomplete UTF-8 sequence. */
+       cu++;  /* Step to the second byte. */
+       switch (nby) {  /* Range checks that depend on the second byte. */
+       case 3:
+               if ((accum == 0x00 && ! (*cu & 0x20)) ||  /* Overlong: use 2-byte. */
+                   (accum == 0x0d && *cu & 0x20))  /* Surrogates, U+D800 to U+DFFF. */
+                       goto latin;
+               break;
+       case 4:
+               if ((accum == 0x00 && ! (*cu & 0x30)) ||  /* Overlong: use 3-byte. */
+                   (accum == 0x04 && *cu & 0x30))  /* Beyond Unicode: above U+10FFFF. */
+                       goto latin;
+               break;
+       default:
+               break;
+       }
+
+       while (--nby) {  /* Fold in the remaining nby - 1 continuation bytes. */
+               if ((*cu & 0xc0) != 0x80)  /* Invalid continuation: not of the form 10xxxxxx. */
+                       goto latin;
+               accum <<= 6;  /* Each continuation byte contributes six bits. */
+               accum += *cu & 0x3f;
+               cu++;
+       }
+
+       assert(accum > 0x7f);  /* The checks above are meant to guarantee all three. */
+       assert(accum < 0x110000);
+       assert(accum < 0xd800 || accum > 0xdfff);
+
+       *oi += snprintf(ob->buf + *oi, 11, "\\[u%.4X]", accum);  /* At most 10 characters plus NUL. */
+       *ii = (const char *)cu - ib->buf;  /* Consume the whole sequence. */
+       *filenc &= ~MPARSE_LATIN1;  /* Valid UTF-8 was seen: drop the Latin-1 fallback. */
+       return 1;
 
 latin:
        if ( ! (*filenc & MPARSE_LATIN1))
-               return(0);
+               return 0;  /* Neither encoding is applicable. */
 
-       ob->offs += snprintf(ob->buf + ob->offs, 11,
-           "\\[u%.4X]", (unsigned char)ib->buf[ib->offs++]);
+       *oi += snprintf(ob->buf + *oi, 11,  /* One input byte becomes one code point. */
+           "\\[u%.4X]", (unsigned char)ib->buf[(*ii)++]);
 
        *filenc &= ~MPARSE_UTF8;
-       return(1);
+       return 1;
 }
 
 int
-preconv_cue(const struct buf *b)
+preconv_cue(const struct buf *b, size_t offset)
 {
        const char      *ln, *eoln, *eoph;
        size_t           sz, phsz;
 
-       ln = b->buf + b->offs;
-       sz = b->sz - b->offs;
+       ln = b->buf + offset;
+       sz = b->sz - offset;
 
        /* Look for the end-of-line. */
 
@@ -157,10 +124,9 @@ preconv_cue(const struct buf *b)
 
        /* Check if we have the correct header/trailer. */
 
-       if ((sz = (size_t)(eoln - ln)) < 10 || 
-                       memcmp(ln, ".\\\" -*-", 7) ||
-                       memcmp(eoln - 3, "-*-", 3))
-               return(MPARSE_UTF8 | MPARSE_LATIN1);
+       if ((sz = (size_t)(eoln - ln)) < 10 ||
+           memcmp(ln, ".\\\" -*-", 7) || memcmp(eoln - 3, "-*-", 3))
+               return MPARSE_UTF8 | MPARSE_LATIN1;
 
        /* Move after the header and adjust for the trailer. */
 
@@ -189,7 +155,7 @@ preconv_cue(const struct buf *b)
                        sz -= phsz;
                        ln += phsz;
                        continue;
-               } 
+               }
 
                sz -= 7;
                ln += 7;
@@ -199,15 +165,15 @@ preconv_cue(const struct buf *b)
                        sz--;
                }
                if (0 == sz)
-                       return(0);
+                       return 0;
 
                /* Check us against known encodings. */
 
                if (phsz > 4 && !strncasecmp(ln, "utf-8", 5))
-                       return(MPARSE_UTF8);
+                       return MPARSE_UTF8;
                if (phsz > 10 && !strncasecmp(ln, "iso-latin-1", 11))
-                       return(MPARSE_LATIN1);
-               return(0);
+                       return MPARSE_LATIN1;
+               return 0;
        }
-       return(MPARSE_UTF8 | MPARSE_LATIN1);
+       return MPARSE_UTF8 | MPARSE_LATIN1;
 }