preconv.c

   1 /*      $Id: preconv.c,v 1.8 2014/08/16 19:00:01 schwarze Exp $ */
   2 /*
   3  * Copyright (c) 2011 Kristaps Dzonsons <kristaps@bsd.lv>
   4  *
   5  * Permission to use, copy, modify, and distribute this software for any
   6  * purpose with or without fee is hereby granted, provided that the above
   7  * copyright notice and this permission notice appear in all copies.
   8  *
   9  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
  10  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
  11  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
  12  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
  13  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
  14  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
  15  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  16  */
  17 #include "config.h"
  18
  19 #include <sys/types.h>
  20 #if HAVE_MMAP
  21 #include <sys/stat.h>
  22 #include <sys/mman.h>
  23 #endif
  24
  25 #include <assert.h>
  26 #include <fcntl.h>
  27 #include <stdio.h>
  28 #include <stdlib.h>
  29 #include <string.h>
  30 #include <unistd.h>
  31
  32 /*
  33  * The read_whole_file() and resize_buf() functions are copied from
  34  * read.c, including all dependency code.
  35  */
  36
  37 enum    enc {
  38         ENC_UTF_8, /* UTF-8 */
  39         ENC_US_ASCII, /* US-ASCII */
  40         ENC_LATIN_1, /* Latin-1 */
  41         ENC__MAX
  42 };
  43
  44 struct  buf {
  45         char             *buf; /* binary input buffer */
  46         size_t            sz; /* size of binary buffer */
  47         size_t            offs; /* starting buffer offset */
  48 };
  49
  50 struct  encode {
  51         const char       *name;
  52         int             (*conv)(const struct buf *);
  53 };
  54
  55 static  int      cue_enc(const struct buf *, size_t *, enum enc *);
  56 static  int      conv_latin_1(const struct buf *);
  57 static  int      conv_us_ascii(const struct buf *);
  58 static  int      conv_utf_8(const struct buf *);
  59 static  int      read_whole_file(const char *, int,
  60                         struct buf *, int *);
  61 static  void     resize_buf(struct buf *, size_t);
  62 static  void     usage(void);
  63
  64 static  const struct encode encs[ENC__MAX] = {
  65         { "utf-8", conv_utf_8 }, /* ENC_UTF_8 */
  66         { "us-ascii", conv_us_ascii }, /* ENC_US_ASCII */
  67         { "latin-1", conv_latin_1 }, /* ENC_LATIN_1 */
  68 };
  69
  70 static  const char       *progname;
  71
  72 static void
  73 usage(void)
  74 {
  75
  76         fprintf(stderr, "usage: %s "
  77                         "[-D enc] "
  78                         "[-e ENC] "
  79                         "[file]\n", progname);
  80 }
  81
  82 static int
  83 conv_latin_1(const struct buf *b)
  84 {
  85         size_t           i;
  86         unsigned char    cu;
  87         const char      *cp;
  88
  89         cp = b->buf + (int)b->offs;
  90
  91         /*
  92          * Latin-1 falls into the first 256 code-points of Unicode, so
  93          * there's no need for any sort of translation.  Just make the
  94          * 8-bit characters use the Unicode escape.
  95          * Note that binary values 128 < v < 160 are passed through
  96          * unmodified to mandoc.
  97          */
  98
  99         for (i = b->offs; i < b->sz; i++) {
 100                 cu = (unsigned char)*cp++;
 101                 cu < 160U ? putchar(cu) : printf("\\[u%.4X]", cu);
 102         }
 103
 104         return(1);
 105 }
 106
 107 static int
 108 conv_us_ascii(const struct buf *b)
 109 {
 110
 111         /*
 112          * US-ASCII has no conversion since it falls into the first 128
 113          * bytes of Unicode.
 114          */
 115
 116         fwrite(b->buf, 1, b->sz, stdout);
 117         return(1);
 118 }
 119
 120 static int
 121 conv_utf_8(const struct buf *b)
 122 {
 123         int              state, be;
 124         unsigned int     accum;
 125         size_t           i;
 126         unsigned char    cu;
 127         const char      *cp;
 128         const long       one = 1L;
 129
 130         cp = b->buf + (int)b->offs;
 131         state = 0;
 132         accum = 0U;
 133         be = 0;
 134
 135         /* Quick test for big-endian value. */
 136
 137         if ( ! (*((const char *)(&one))))
 138                 be = 1;
 139
 140         for (i = b->offs; i < b->sz; i++) {
 141                 cu = (unsigned char)*cp++;
 142                 if (state) {
 143                         if ( ! (cu & 128) || (cu & 64)) {
 144                                 /* Bad sequence header. */
 145                                 return(0);
 146                         }
 147
 148                         /* Accept only legitimate bit patterns. */
 149
 150                         if (cu > 191 || cu < 128) {
 151                                 /* Bad in-sequence bits. */
 152                                 return(0);
 153                         }
 154
 155                         accum |= (cu & 63) << --state * 6;
 156
 157                         /*
 158                          * Accum is held in little-endian order as
 159                          * stipulated by the UTF-8 sequence coding.  We
 160                          * need to convert to a native big-endian if our
 161                          * architecture requires it.
 162                          */
 163
 164                         if (0 == state && be)
 165                                 accum = (accum >> 24) |
 166                                         ((accum << 8) & 0x00FF0000) |
 167                                         ((accum >> 8) & 0x0000FF00) |
 168                                         (accum << 24);
 169
 170                         if (0 == state) {
 171                                 accum < 128U ? putchar(accum) :
 172                                         printf("\\[u%.4X]", accum);
 173                                 accum = 0U;
 174                         }
 175                 } else if (cu & (1 << 7)) {
 176                         /*
 177                          * Entering a UTF-8 state:  if we encounter a
 178                          * UTF-8 bitmask, calculate the expected UTF-8
 179                          * state from it.
 180                          */
 181                         for (state = 0; state < 7; state++)
 182                                 if ( ! (cu & (1 << (7 - state))))
 183                                         break;
 184
 185                         /* Accept only legitimate bit patterns. */
 186
 187                         switch (state) {
 188                         case (4):
 189                                 if (cu <= 244 && cu >= 240) {
 190                                         accum = (cu & 7) << 18;
 191                                         break;
 192                                 }
 193                                 /* Bad 4-sequence start bits. */
 194                                 return(0);
 195                         case (3):
 196                                 if (cu <= 239 && cu >= 224) {
 197                                         accum = (cu & 15) << 12;
 198                                         break;
 199                                 }
 200                                 /* Bad 3-sequence start bits. */
 201                                 return(0);
 202                         case (2):
 203                                 if (cu <= 223 && cu >= 194) {
 204                                         accum = (cu & 31) << 6;
 205                                         break;
 206                                 }
 207                                 /* Bad 2-sequence start bits. */
 208                                 return(0);
 209                         default:
 210                                 /* Bad sequence bit mask. */
 211                                 return(0);
 212                         }
 213                         state--;
 214                 } else
 215                         putchar(cu);
 216         }
 217
 218         if (0 != state) {
 219                 /* Bad trailing bits. */
 220                 return(0);
 221         }
 222
 223         return(1);
 224 }
 225
 226 static void
 227 resize_buf(struct buf *buf, size_t initial)
 228 {
 229
 230         buf->sz = buf->sz > initial / 2 ?
 231                 2 * buf->sz : initial;
 232
 233         buf->buf = realloc(buf->buf, buf->sz);
 234         if (NULL == buf->buf) {
 235                 perror(NULL);
 236                 exit(EXIT_FAILURE);
 237         }
 238 }
 239
 240 static int
 241 read_whole_file(const char *f, int fd,
 242                 struct buf *fb, int *with_mmap)
 243 {
 244         size_t           off;
 245         ssize_t          ssz;
 246
 247 #if HAVE_MMAP
 248         struct stat      st;
 249         if (-1 == fstat(fd, &st)) {
 250                 perror(f);
 251                 return(0);
 252         }
 253
 254         /*
 255          * If we're a regular file, try just reading in the whole entry
 256          * via mmap().  This is faster than reading it into blocks, and
 257          * since each file is only a few bytes to begin with, I'm not
 258          * concerned that this is going to tank any machines.
 259          */
 260
 261         if (S_ISREG(st.st_mode) && st.st_size >= (1U << 31)) {
 262                 fprintf(stderr, "%s: input too large\n", f);
 263                 return(0);
 264         }
 265
 266         if (S_ISREG(st.st_mode)) {
 267                 *with_mmap = 1;
 268                 fb->sz = (size_t)st.st_size;
 269                 fb->buf = mmap(NULL, fb->sz, PROT_READ, MAP_SHARED, fd, 0);
 270                 if (fb->buf != MAP_FAILED)
 271                         return(1);
 272         }
 273 #endif
 274
 275         /*
 276          * If this isn't a regular file (like, say, stdin), then we must
 277          * go the old way and just read things in bit by bit.
 278          */
 279
 280         *with_mmap = 0;
 281         off = 0;
 282         fb->sz = 0;
 283         fb->buf = NULL;
 284         for (;;) {
 285                 if (off == fb->sz && fb->sz == (1U << 31)) {
 286                         fprintf(stderr, "%s: input too large\n", f);
 287                         break;
 288                 }
 289
 290                 if (off == fb->sz)
 291                         resize_buf(fb, 65536);
 292
 293                 ssz = read(fd, fb->buf + (int)off, fb->sz - off);
 294                 if (ssz == 0) {
 295                         fb->sz = off;
 296                         return(1);
 297                 }
 298                 if (ssz == -1) {
 299                         perror(f);
 300                         break;
 301                 }
 302                 off += (size_t)ssz;
 303         }
 304
 305         free(fb->buf);
 306         fb->buf = NULL;
 307         return(0);
 308 }
 309
 310 static int
 311 cue_enc(const struct buf *b, size_t *offs, enum enc *enc)
 312 {
 313         const char      *ln, *eoln, *eoph;
 314         size_t           sz, phsz, nsz;
 315         int              i;
 316
 317         ln = b->buf + (int)*offs;
 318         sz = b->sz - *offs;
 319
 320         /* Look for the end-of-line. */
 321
 322         if (NULL == (eoln = memchr(ln, '\n', sz)))
 323                 return(-1);
 324
 325         /* Set next-line marker. */
 326
 327         *offs = (size_t)((eoln + 1) - b->buf);
 328
 329         /* Check if we have the correct header/trailer. */
 330
 331         if ((sz = (size_t)(eoln - ln)) < 10 ||
 332                         memcmp(ln, ".\\\" -*-", 7) ||
 333                         memcmp(eoln - 3, "-*-", 3))
 334                 return(0);
 335
 336         /* Move after the header and adjust for the trailer. */
 337
 338         ln += 7;
 339         sz -= 10;
 340
 341         while (sz > 0) {
 342                 while (sz > 0 && ' ' == *ln) {
 343                         ln++;
 344                         sz--;
 345                 }
 346                 if (0 == sz)
 347                         break;
 348
 349                 /* Find the end-of-phrase marker (or eoln). */
 350
 351                 if (NULL == (eoph = memchr(ln, ';', sz)))
 352                         eoph = eoln - 3;
 353                 else
 354                         eoph++;
 355
 356                 /* Only account for the "coding" phrase. */
 357
 358                 if ((phsz = (size_t)(eoph - ln)) < 7 ||
 359                                 strncasecmp(ln, "coding:", 7)) {
 360                         sz -= phsz;
 361                         ln += phsz;
 362                         continue;
 363                 }
 364
 365                 sz -= 7;
 366                 ln += 7;
 367
 368                 while (sz > 0 && ' ' == *ln) {
 369                         ln++;
 370                         sz--;
 371                 }
 372                 if (0 == sz)
 373                         break;
 374
 375                 /* Check us against known encodings. */
 376
 377                 for (i = 0; i < (int)ENC__MAX; i++) {
 378                         nsz = strlen(encs[i].name);
 379                         if (phsz < nsz)
 380                                 continue;
 381                         if (strncasecmp(ln, encs[i].name, nsz))
 382                                 continue;
 383
 384                         *enc = (enum enc)i;
 385                         return(1);
 386                 }
 387
 388                 /* Unknown encoding. */
 389
 390                 *enc = ENC__MAX;
 391                 return(1);
 392         }
 393
 394         return(0);
 395 }
 396
 397 int
 398 main(int argc, char *argv[])
 399 {
 400         int              i, ch, map, fd, rc;
 401         struct buf       b;
 402         const char      *fn;
 403         enum enc         enc, def;
 404         unsigned char    bom[3] = { 0xEF, 0xBB, 0xBF };
 405         size_t           offs;
 406         extern int       optind;
 407         extern char     *optarg;
 408
 409         progname = strrchr(argv[0], '/');
 410         if (progname == NULL)
 411                 progname = argv[0];
 412         else
 413                 ++progname;
 414
 415         fn = "<stdin>";
 416         fd = STDIN_FILENO;
 417         rc = EXIT_FAILURE;
 418         enc = def = ENC__MAX;
 419         map = 0;
 420
 421         memset(&b, 0, sizeof(struct buf));
 422
 423         while (-1 != (ch = getopt(argc, argv, "D:e:rdvh")))
 424                 switch (ch) {
 425                 case ('D'):
 426                         /* FALLTHROUGH */
 427                 case ('e'):
 428                         for (i = 0; i < (int)ENC__MAX; i++) {
 429                                 if (strcasecmp(optarg, encs[i].name))
 430                                         continue;
 431                                 break;
 432                         }
 433                         if (i < (int)ENC__MAX) {
 434                                 if ('D' == ch)
 435                                         def = (enum enc)i;
 436                                 else
 437                                         enc = (enum enc)i;
 438                                 break;
 439                         }
 440
 441                         fprintf(stderr, "%s: Bad encoding\n", optarg);
 442                         return(EXIT_FAILURE);
 443                 case ('r'):
 444                         /* FALLTHROUGH */
 445                 case ('d'):
 446                         /* FALLTHROUGH */
 447                 case ('v'):
 448                         /* Compatibility with GNU preconv. */
 449                         break;
 450                 case ('h'):
 451                         /* Compatibility with GNU preconv. */
 452                         /* FALLTHROUGH */
 453                 default:
 454                         usage();
 455                         return(EXIT_FAILURE);
 456                 }
 457
 458         argc -= optind;
 459         argv += optind;
 460
 461         /*
 462          * Open and read the first argument on the command-line.
 463          * If we don't have one, we default to stdin.
 464          */
 465
 466         if (argc > 0) {
 467                 fn = *argv;
 468                 fd = open(fn, O_RDONLY, 0);
 469                 if (-1 == fd) {
 470                         perror(fn);
 471                         return(EXIT_FAILURE);
 472                 }
 473         }
 474
 475         if ( ! read_whole_file(fn, fd, &b, &map))
 476                 goto out;
 477
 478         /* Try to read the UTF-8 BOM. */
 479
 480         if (ENC__MAX == enc)
 481                 if (b.sz > 3 && 0 == memcmp(b.buf, bom, 3)) {
 482                         b.offs = 3;
 483                         enc = ENC_UTF_8;
 484                 }
 485
 486         /* Try reading from the "-*-" cue. */
 487
 488         if (ENC__MAX == enc) {
 489                 offs = b.offs;
 490                 ch = cue_enc(&b, &offs, &enc);
 491                 if (0 == ch)
 492                         ch = cue_enc(&b, &offs, &enc);
 493         }
 494
 495         /*
 496          * No encoding has been detected.
 497          * Thus, we either fall into our default encoder, if specified,
 498          * or use Latin-1 if all else fails.
 499          */
 500
 501         if (ENC__MAX == enc)
 502                 enc = ENC__MAX == def ? ENC_LATIN_1 : def;
 503
 504         if ( ! (*encs[(int)enc].conv)(&b)) {
 505                 fprintf(stderr, "%s: Bad encoding\n", fn);
 506                 goto out;
 507         }
 508
 509         rc = EXIT_SUCCESS;
 510 out:
 511 #if HAVE_MMAP
 512         if (map)
 513                 munmap(b.buf, b.sz);
 514         else
 515 #endif
 516                 free(b.buf);
 517
 518         if (fd > STDIN_FILENO)
 519                 close(fd);
 520
 521         return(rc);
 522 }