preconv.c

   1 /*      $Id: preconv.c,v 1.4 2011/05/26 21:13:07 kristaps Exp $ */
   2 /*
   3  * Copyright (c) 2011 Kristaps Dzonsons <kristaps@bsd.lv>
   4  *
   5  * Permission to use, copy, modify, and distribute this software for any
   6  * purpose with or without fee is hereby granted, provided that the above
   7  * copyright notice and this permission notice appear in all copies.
   8  *
   9  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
  10  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
  11  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
  12  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
  13  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
  14  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
  15  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  16  */
  17 #ifdef HAVE_CONFIG_H
  18 #include "config.h"
  19 #endif
  20
  21 #include <sys/stat.h>
  22 #include <sys/mman.h>
  23
  24 #include <assert.h>
  25 #include <fcntl.h>
  26 #include <stdio.h>
  27 #include <stdlib.h>
  28 #include <string.h>
  29 #include <unistd.h>
  30
  31 /*
  32  * The read_whole_file() and resize_buf() functions are copied from
  33  * read.c, including all dependency code (MAP_FILE, etc.).
  34  */
  35
  36 #ifndef MAP_FILE
  37 #define MAP_FILE        0
  38 #endif
  39
  40 enum    enc {
  41         ENC_UTF_8, /* UTF-8 */
  42         ENC_US_ASCII, /* US-ASCII */
  43         ENC_LATIN_1, /* Latin-1 */
  44         ENC__MAX
  45 };
  46
  47 struct  buf {
  48         char             *buf; /* binary input buffer */
  49         size_t            sz; /* size of binary buffer */
  50         size_t            offs; /* starting buffer offset */
  51 };
  52
  53 struct  encode {
  54         const char       *name;
  55         int             (*conv)(const struct buf *);
  56 };
  57
  58 static  int      cue_enc(const struct buf *, size_t *, enum enc *);
  59 static  int      conv_latin_1(const struct buf *);
  60 static  int      conv_us_ascii(const struct buf *);
  61 static  int      conv_utf_8(const struct buf *);
  62 static  int      read_whole_file(const char *, int,
  63                         struct buf *, int *);
  64 static  void     resize_buf(struct buf *, size_t);
  65 static  void     usage(void);
  66
  67 static  const struct encode encs[ENC__MAX] = {
  68         { "utf-8", conv_utf_8 }, /* ENC_UTF_8 */
  69         { "us-ascii", conv_us_ascii }, /* ENC_US_ASCII */
  70         { "latin-1", conv_latin_1 }, /* ENC_LATIN_1 */
  71 };
  72
  73 static  const char       *progname;
  74
  75 static void
  76 usage(void)
  77 {
  78
  79         fprintf(stderr, "usage: %s "
  80                         "[-D enc] "
  81                         "[-e ENC] "
  82                         "[file]\n", progname);
  83 }
  84
  85 static int
  86 conv_latin_1(const struct buf *b)
  87 {
  88         size_t           i;
  89         unsigned char    cu;
  90         const char      *cp;
  91
  92         cp = b->buf + (int)b->offs;
  93
  94         /*
  95          * Latin-1 falls into the first 256 code-points of Unicode, so
  96          * there's no need for any sort of translation.  Just make the
  97          * 8-bit characters use the Unicode escape.
  98          * Note that binary values 128 < v < 160 are passed through
  99          * unmodified to mandoc.
 100          */
 101
 102         for (i = b->offs; i < b->sz; i++) {
 103                 cu = (unsigned char)*cp++;
 104                 cu < 160U ? putchar(cu) : printf("\\[u%.4X]", cu);
 105         }
 106
 107         return(1);
 108 }
 109
 110 static int
 111 conv_us_ascii(const struct buf *b)
 112 {
 113
 114         /*
 115          * US-ASCII has no conversion since it falls into the first 128
 116          * bytes of Unicode.
 117          */
 118
 119         fwrite(b->buf, 1, b->sz, stdout);
 120         return(1);
 121 }
 122
 123 static int
 124 conv_utf_8(const struct buf *b)
 125 {
 126         int              state, be;
 127         unsigned int     accum;
 128         size_t           i;
 129         unsigned char    cu;
 130         const char      *cp;
 131         const long       one = 1L;
 132
 133         cp = b->buf + (int)b->offs;
 134         state = 0;
 135         accum = 0U;
 136         be = 0;
 137
 138         /* Quick test for big-endian value. */
 139
 140         if ( ! (*((const char *)(&one))))
 141                 be = 1;
 142
 143         for (i = b->offs; i < b->sz; i++) {
 144                 cu = (unsigned char)*cp++;
 145                 if (state) {
 146                         if ( ! (cu & 128) || (cu & 64)) {
 147                                 /* Bad sequence header. */
 148                                 return(0);
 149                         }
 150
 151                         /* Accept only legitimate bit patterns. */
 152
 153                         if (cu > 191 || cu < 128) {
 154                                 /* Bad in-sequence bits. */
 155                                 return(0);
 156                         }
 157
 158                         accum |= (cu & 63) << --state * 6;
 159
 160                         /*
 161                          * Accum is held in little-endian order as
 162                          * stipulated by the UTF-8 sequence coding.  We
 163                          * need to convert to a native big-endian if our
 164                          * architecture requires it.
 165                          */
 166
 167                         if (0 == state && be)
 168                                 accum = (accum >> 24) |
 169                                         ((accum << 8) & 0x00FF0000) |
 170                                         ((accum >> 8) & 0x0000FF00) |
 171                                         (accum << 24);
 172
 173                         if (0 == state) {
 174                                 accum < 128U ? putchar(accum) :
 175                                         printf("\\[u%.4X]", accum);
 176                                 accum = 0U;
 177                         }
 178                 } else if (cu & (1 << 7)) {
 179                         /*
 180                          * Entering a UTF-8 state:  if we encounter a
 181                          * UTF-8 bitmask, calculate the expected UTF-8
 182                          * state from it.
 183                          */
 184                         for (state = 0; state < 7; state++)
 185                                 if ( ! (cu & (1 << (7 - state))))
 186                                         break;
 187
 188                         /* Accept only legitimate bit patterns. */
 189
 190                         switch (state) {
 191                         case (4):
 192                                 if (cu <= 244 && cu >= 240) {
 193                                         accum = (cu & 7) << 18;
 194                                         break;
 195                                 }
 196                                 /* Bad 4-sequence start bits. */
 197                                 return(0);
 198                         case (3):
 199                                 if (cu <= 239 && cu >= 224) {
 200                                         accum = (cu & 15) << 12;
 201                                         break;
 202                                 }
 203                                 /* Bad 3-sequence start bits. */
 204                                 return(0);
 205                         case (2):
 206                                 if (cu <= 223 && cu >= 194) {
 207                                         accum = (cu & 31) << 6;
 208                                         break;
 209                                 }
 210                                 /* Bad 2-sequence start bits. */
 211                                 return(0);
 212                         default:
 213                                 /* Bad sequence bit mask. */
 214                                 return(0);
 215                         }
 216                         state--;
 217                 } else
 218                         putchar(cu);
 219         }
 220
 221         if (0 != state) {
 222                 /* Bad trailing bits. */
 223                 return(0);
 224         }
 225
 226         return(1);
 227 }
 228
 229 static void
 230 resize_buf(struct buf *buf, size_t initial)
 231 {
 232
 233         buf->sz = buf->sz > initial / 2 ?
 234                 2 * buf->sz : initial;
 235
 236         buf->buf = realloc(buf->buf, buf->sz);
 237         if (NULL == buf->buf) {
 238                 perror(NULL);
 239                 exit(EXIT_FAILURE);
 240         }
 241 }
 242
 243 static int
 244 read_whole_file(const char *f, int fd,
 245                 struct buf *fb, int *with_mmap)
 246 {
 247         struct stat      st;
 248         size_t           off;
 249         ssize_t          ssz;
 250
 251         if (-1 == fstat(fd, &st)) {
 252                 perror(f);
 253                 return(0);
 254         }
 255
 256         /*
 257          * If we're a regular file, try just reading in the whole entry
 258          * via mmap().  This is faster than reading it into blocks, and
 259          * since each file is only a few bytes to begin with, I'm not
 260          * concerned that this is going to tank any machines.
 261          */
 262
 263         if (S_ISREG(st.st_mode) && st.st_size >= (1U << 31)) {
 264                 fprintf(stderr, "%s: input too large\n", f);
 265                 return(0);
 266         }
 267
 268         if (S_ISREG(st.st_mode)) {
 269                 *with_mmap = 1;
 270                 fb->sz = (size_t)st.st_size;
 271                 fb->buf = mmap(NULL, fb->sz, PROT_READ,
 272                                 MAP_FILE|MAP_SHARED, fd, 0);
 273                 if (fb->buf != MAP_FAILED)
 274                         return(1);
 275         }
 276
 277         /*
 278          * If this isn't a regular file (like, say, stdin), then we must
 279          * go the old way and just read things in bit by bit.
 280          */
 281
 282         *with_mmap = 0;
 283         off = 0;
 284         fb->sz = 0;
 285         fb->buf = NULL;
 286         for (;;) {
 287                 if (off == fb->sz && fb->sz == (1U << 31)) {
 288                         fprintf(stderr, "%s: input too large\n", f);
 289                         break;
 290                 }
 291
 292                 if (off == fb->sz)
 293                         resize_buf(fb, 65536);
 294
 295                 ssz = read(fd, fb->buf + (int)off, fb->sz - off);
 296                 if (ssz == 0) {
 297                         fb->sz = off;
 298                         return(1);
 299                 }
 300                 if (ssz == -1) {
 301                         perror(f);
 302                         break;
 303                 }
 304                 off += (size_t)ssz;
 305         }
 306
 307         free(fb->buf);
 308         fb->buf = NULL;
 309         return(0);
 310 }
 311
 312 static int
 313 cue_enc(const struct buf *b, size_t *offs, enum enc *enc)
 314 {
 315         const char      *ln, *eoln, *eoph;
 316         size_t           sz, phsz, nsz;
 317         int              i;
 318
 319         ln = b->buf + (int)*offs;
 320         sz = b->sz - *offs;
 321
 322         /* Look for the end-of-line. */
 323
 324         if (NULL == (eoln = memchr(ln, '\n', sz)))
 325                 return(-1);
 326
 327         /* Set next-line marker. */
 328
 329         *offs = (size_t)((eoln + 1) - b->buf);
 330
 331         /* Check if we have the correct header/trailer. */
 332
 333         if ((sz = (size_t)(eoln - ln)) < 10 ||
 334                         memcmp(ln, ".\\\" -*-", 7) ||
 335                         memcmp(eoln - 3, "-*-", 3))
 336                 return(0);
 337
 338         /* Move after the header and adjust for the trailer. */
 339
 340         ln += 7;
 341         sz -= 10;
 342
 343         while (sz > 0) {
 344                 while (sz > 0 && ' ' == *ln) {
 345                         ln++;
 346                         sz--;
 347                 }
 348                 if (0 == sz)
 349                         break;
 350
 351                 /* Find the end-of-phrase marker (or eoln). */
 352
 353                 if (NULL == (eoph = memchr(ln, ';', sz)))
 354                         eoph = eoln - 3;
 355                 else
 356                         eoph++;
 357
 358                 /* Only account for the "coding" phrase. */
 359
 360                 if ((phsz = (size_t)(eoph - ln)) < 7 ||
 361                                 strncasecmp(ln, "coding:", 7)) {
 362                         sz -= phsz;
 363                         ln += phsz;
 364                         continue;
 365                 }
 366
 367                 sz -= 7;
 368                 ln += 7;
 369
 370                 while (sz > 0 && ' ' == *ln) {
 371                         ln++;
 372                         sz--;
 373                 }
 374                 if (0 == sz)
 375                         break;
 376
 377                 /* Check us against known encodings. */
 378
 379                 for (i = 0; i < (int)ENC__MAX; i++) {
 380                         nsz = strlen(encs[i].name);
 381                         if (phsz < nsz)
 382                                 continue;
 383                         if (strncasecmp(ln, encs[i].name, nsz))
 384                                 continue;
 385
 386                         *enc = (enum enc)i;
 387                         return(1);
 388                 }
 389
 390                 /* Unknown encoding. */
 391
 392                 *enc = ENC__MAX;
 393                 return(1);
 394         }
 395
 396         return(0);
 397 }
 398
 399 int
 400 main(int argc, char *argv[])
 401 {
 402         int              i, ch, map, fd, rc;
 403         struct buf       b;
 404         const char      *fn;
 405         enum enc         enc, def;
 406         unsigned char    bom[3] = { 0xEF, 0xBB, 0xBF };
 407         size_t           offs;
 408         extern int       optind;
 409         extern char     *optarg;
 410
 411         progname = strrchr(argv[0], '/');
 412         if (progname == NULL)
 413                 progname = argv[0];
 414         else
 415                 ++progname;
 416
 417         fn = "<stdin>";
 418         fd = STDIN_FILENO;
 419         rc = EXIT_FAILURE;
 420         enc = def = ENC__MAX;
 421         map = 0;
 422
 423         memset(&b, 0, sizeof(struct buf));
 424
 425         while (-1 != (ch = getopt(argc, argv, "D:e:rdvh")))
 426                 switch (ch) {
 427                 case ('D'):
 428                         /* FALLTHROUGH */
 429                 case ('e'):
 430                         for (i = 0; i < (int)ENC__MAX; i++) {
 431                                 if (strcasecmp(optarg, encs[i].name))
 432                                         continue;
 433                                 break;
 434                         }
 435                         if (i < (int)ENC__MAX) {
 436                                 if ('D' == ch)
 437                                         def = (enum enc)i;
 438                                 else
 439                                         enc = (enum enc)i;
 440                                 break;
 441                         }
 442
 443                         fprintf(stderr, "%s: Bad encoding\n", optarg);
 444                         return(EXIT_FAILURE);
 445                 case ('r'):
 446                         /* FALLTHROUGH */
 447                 case ('d'):
 448                         /* FALLTHROUGH */
 449                 case ('v'):
 450                         /* Compatibility with GNU preconv. */
 451                         break;
 452                 case ('h'):
 453                         /* Compatibility with GNU preconv. */
 454                         /* FALLTHROUGH */
 455                 default:
 456                         usage();
 457                         return(EXIT_FAILURE);
 458                 }
 459
 460         argc -= optind;
 461         argv += optind;
 462
 463         /*
 464          * Open and read the first argument on the command-line.
 465          * If we don't have one, we default to stdin.
 466          */
 467
 468         if (argc > 0) {
 469                 fn = *argv;
 470                 fd = open(fn, O_RDONLY, 0);
 471                 if (-1 == fd) {
 472                         perror(fn);
 473                         return(EXIT_FAILURE);
 474                 }
 475         }
 476
 477         if ( ! read_whole_file(fn, fd, &b, &map))
 478                 goto out;
 479
 480         /* Try to read the UTF-8 BOM. */
 481
 482         if (ENC__MAX == enc)
 483                 if (b.sz > 3 && 0 == memcmp(b.buf, bom, 3)) {
 484                         b.offs = 3;
 485                         enc = ENC_UTF_8;
 486                 }
 487
 488         /* Try reading from the "-*-" cue. */
 489
 490         if (ENC__MAX == enc) {
 491                 offs = b.offs;
 492                 ch = cue_enc(&b, &offs, &enc);
 493                 if (0 == ch)
 494                         ch = cue_enc(&b, &offs, &enc);
 495         }
 496
 497         /*
 498          * No encoding has been detected.
 499          * Thus, we either fall into our default encoder, if specified,
 500          * or use Latin-1 if all else fails.
 501          */
 502
 503         if (ENC__MAX == enc)
 504                 enc = ENC__MAX == def ? ENC_LATIN_1 : def;
 505
 506         if ( ! (*encs[(int)enc].conv)(&b)) {
 507                 fprintf(stderr, "%s: Bad encoding\n", fn);
 508                 goto out;
 509         }
 510
 511         rc = EXIT_SUCCESS;
 512 out:
 513         if (map)
 514                 munmap(b.buf, b.sz);
 515         else
 516                 free(b.buf);
 517
 518         if (fd > STDIN_FILENO)
 519                 close(fd);
 520
 521         return(rc);
 522 }