]> git.cameronkatri.com Git - mandoc.git/blob - preconv.c
Add support for 1/2, 1/4, and 3/4 (needed by eqn).
[mandoc.git] / preconv.c
1 /* $Id: preconv.c,v 1.4 2011/05/26 21:13:07 kristaps Exp $ */
2 /*
3 * Copyright (c) 2011 Kristaps Dzonsons <kristaps@bsd.lv>
4 *
5 * Permission to use, copy, modify, and distribute this software for any
6 * purpose with or without fee is hereby granted, provided that the above
7 * copyright notice and this permission notice appear in all copies.
8 *
9 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16 */
17 #ifdef HAVE_CONFIG_H
18 #include "config.h"
19 #endif
20
21 #include <sys/stat.h>
22 #include <sys/mman.h>
23
24 #include <assert.h>
25 #include <fcntl.h>
26 #include <stdio.h>
27 #include <stdlib.h>
28 #include <string.h>
29 #include <unistd.h>
30
31 /*
32 * The read_whole_file() and resize_buf() functions are copied from
33 * read.c, including all dependency code (MAP_FILE, etc.).
34 */
35
36 #ifndef MAP_FILE
37 #define MAP_FILE 0
38 #endif
39
/*
 * Input encodings known to the converter.
 * The values are used as indices into the encs[] table, so they must
 * stay dense and start at zero.  ENC__MAX is the number of real
 * encodings and is also used as the "not determined" marker.
 */
enum enc {
	ENC_UTF_8 = 0,	/* UTF-8 */
	ENC_US_ASCII,	/* US-ASCII */
	ENC_LATIN_1,	/* Latin-1 */
	ENC__MAX	/* count of encodings / "none selected" */
};
46
/*
 * The complete input held in memory, together with the offset at
 * which the converters start reading.
 */
struct buf {
	char		*buf;	/* binary input buffer */
	size_t		 sz;	/* size of binary buffer, in bytes */
	size_t		 offs;	/* starting buffer offset for conversion */
};
52
/*
 * One entry of the encoding table: the name an encoding is selected
 * by and the routine that converts a buffer in that encoding.
 */
struct encode {
	const char	 *name;	/* encoding name, compared without case */
	int		(*conv)(const struct buf *); /* zero means bad input */
};
57
/* Forward declarations of the file-local functions. */
static	int	 cue_enc(const struct buf *b, size_t *offs, enum enc *enc);
static	int	 conv_latin_1(const struct buf *b);
static	int	 conv_us_ascii(const struct buf *b);
static	int	 conv_utf_8(const struct buf *b);
static	int	 read_whole_file(const char *f, int fd,
			struct buf *fb, int *with_mmap);
static	void	 resize_buf(struct buf *buf, size_t initial);
static	void	 usage(void);
66
67 static const struct encode encs[ENC__MAX] = {
68 { "utf-8", conv_utf_8 }, /* ENC_UTF_8 */
69 { "us-ascii", conv_us_ascii }, /* ENC_US_ASCII */
70 { "latin-1", conv_latin_1 }, /* ENC_LATIN_1 */
71 };
72
/* Program name as printed by usage(); main() sets it from argv[0]. */
static	const char	*progname;
74
75 static void
76 usage(void)
77 {
78
79 fprintf(stderr, "usage: %s "
80 "[-D enc] "
81 "[-e ENC] "
82 "[file]\n", progname);
83 }
84
85 static int
86 conv_latin_1(const struct buf *b)
87 {
88 size_t i;
89 unsigned char cu;
90 const char *cp;
91
92 cp = b->buf + (int)b->offs;
93
94 /*
95 * Latin-1 falls into the first 256 code-points of Unicode, so
96 * there's no need for any sort of translation. Just make the
97 * 8-bit characters use the Unicode escape.
98 * Note that binary values 128 < v < 160 are passed through
99 * unmodified to mandoc.
100 */
101
102 for (i = b->offs; i < b->sz; i++) {
103 cu = (unsigned char)*cp++;
104 cu < 160U ? putchar(cu) : printf("\\[u%.4X]", cu);
105 }
106
107 return(1);
108 }
109
110 static int
111 conv_us_ascii(const struct buf *b)
112 {
113
114 /*
115 * US-ASCII has no conversion since it falls into the first 128
116 * bytes of Unicode.
117 */
118
119 fwrite(b->buf, 1, b->sz, stdout);
120 return(1);
121 }
122
123 static int
124 conv_utf_8(const struct buf *b)
125 {
126 int state, be;
127 unsigned int accum;
128 size_t i;
129 unsigned char cu;
130 const char *cp;
131 const long one = 1L;
132
133 cp = b->buf + (int)b->offs;
134 state = 0;
135 accum = 0U;
136 be = 0;
137
138 /* Quick test for big-endian value. */
139
140 if ( ! (*((const char *)(&one))))
141 be = 1;
142
143 for (i = b->offs; i < b->sz; i++) {
144 cu = (unsigned char)*cp++;
145 if (state) {
146 if ( ! (cu & 128) || (cu & 64)) {
147 /* Bad sequence header. */
148 return(0);
149 }
150
151 /* Accept only legitimate bit patterns. */
152
153 if (cu > 191 || cu < 128) {
154 /* Bad in-sequence bits. */
155 return(0);
156 }
157
158 accum |= (cu & 63) << --state * 6;
159
160 /*
161 * Accum is held in little-endian order as
162 * stipulated by the UTF-8 sequence coding. We
163 * need to convert to a native big-endian if our
164 * architecture requires it.
165 */
166
167 if (0 == state && be)
168 accum = (accum >> 24) |
169 ((accum << 8) & 0x00FF0000) |
170 ((accum >> 8) & 0x0000FF00) |
171 (accum << 24);
172
173 if (0 == state) {
174 accum < 128U ? putchar(accum) :
175 printf("\\[u%.4X]", accum);
176 accum = 0U;
177 }
178 } else if (cu & (1 << 7)) {
179 /*
180 * Entering a UTF-8 state: if we encounter a
181 * UTF-8 bitmask, calculate the expected UTF-8
182 * state from it.
183 */
184 for (state = 0; state < 7; state++)
185 if ( ! (cu & (1 << (7 - state))))
186 break;
187
188 /* Accept only legitimate bit patterns. */
189
190 switch (state) {
191 case (4):
192 if (cu <= 244 && cu >= 240) {
193 accum = (cu & 7) << 18;
194 break;
195 }
196 /* Bad 4-sequence start bits. */
197 return(0);
198 case (3):
199 if (cu <= 239 && cu >= 224) {
200 accum = (cu & 15) << 12;
201 break;
202 }
203 /* Bad 3-sequence start bits. */
204 return(0);
205 case (2):
206 if (cu <= 223 && cu >= 194) {
207 accum = (cu & 31) << 6;
208 break;
209 }
210 /* Bad 2-sequence start bits. */
211 return(0);
212 default:
213 /* Bad sequence bit mask. */
214 return(0);
215 }
216 state--;
217 } else
218 putchar(cu);
219 }
220
221 if (0 != state) {
222 /* Bad trailing bits. */
223 return(0);
224 }
225
226 return(1);
227 }
228
229 static void
230 resize_buf(struct buf *buf, size_t initial)
231 {
232
233 buf->sz = buf->sz > initial / 2 ?
234 2 * buf->sz : initial;
235
236 buf->buf = realloc(buf->buf, buf->sz);
237 if (NULL == buf->buf) {
238 perror(NULL);
239 exit(EXIT_FAILURE);
240 }
241 }
242
243 static int
244 read_whole_file(const char *f, int fd,
245 struct buf *fb, int *with_mmap)
246 {
247 struct stat st;
248 size_t off;
249 ssize_t ssz;
250
251 if (-1 == fstat(fd, &st)) {
252 perror(f);
253 return(0);
254 }
255
256 /*
257 * If we're a regular file, try just reading in the whole entry
258 * via mmap(). This is faster than reading it into blocks, and
259 * since each file is only a few bytes to begin with, I'm not
260 * concerned that this is going to tank any machines.
261 */
262
263 if (S_ISREG(st.st_mode) && st.st_size >= (1U << 31)) {
264 fprintf(stderr, "%s: input too large\n", f);
265 return(0);
266 }
267
268 if (S_ISREG(st.st_mode)) {
269 *with_mmap = 1;
270 fb->sz = (size_t)st.st_size;
271 fb->buf = mmap(NULL, fb->sz, PROT_READ,
272 MAP_FILE|MAP_SHARED, fd, 0);
273 if (fb->buf != MAP_FAILED)
274 return(1);
275 }
276
277 /*
278 * If this isn't a regular file (like, say, stdin), then we must
279 * go the old way and just read things in bit by bit.
280 */
281
282 *with_mmap = 0;
283 off = 0;
284 fb->sz = 0;
285 fb->buf = NULL;
286 for (;;) {
287 if (off == fb->sz && fb->sz == (1U << 31)) {
288 fprintf(stderr, "%s: input too large\n", f);
289 break;
290 }
291
292 if (off == fb->sz)
293 resize_buf(fb, 65536);
294
295 ssz = read(fd, fb->buf + (int)off, fb->sz - off);
296 if (ssz == 0) {
297 fb->sz = off;
298 return(1);
299 }
300 if (ssz == -1) {
301 perror(f);
302 break;
303 }
304 off += (size_t)ssz;
305 }
306
307 free(fb->buf);
308 fb->buf = NULL;
309 return(0);
310 }
311
312 static int
313 cue_enc(const struct buf *b, size_t *offs, enum enc *enc)
314 {
315 const char *ln, *eoln, *eoph;
316 size_t sz, phsz, nsz;
317 int i;
318
319 ln = b->buf + (int)*offs;
320 sz = b->sz - *offs;
321
322 /* Look for the end-of-line. */
323
324 if (NULL == (eoln = memchr(ln, '\n', sz)))
325 return(-1);
326
327 /* Set next-line marker. */
328
329 *offs = (size_t)((eoln + 1) - b->buf);
330
331 /* Check if we have the correct header/trailer. */
332
333 if ((sz = (size_t)(eoln - ln)) < 10 ||
334 memcmp(ln, ".\\\" -*-", 7) ||
335 memcmp(eoln - 3, "-*-", 3))
336 return(0);
337
338 /* Move after the header and adjust for the trailer. */
339
340 ln += 7;
341 sz -= 10;
342
343 while (sz > 0) {
344 while (sz > 0 && ' ' == *ln) {
345 ln++;
346 sz--;
347 }
348 if (0 == sz)
349 break;
350
351 /* Find the end-of-phrase marker (or eoln). */
352
353 if (NULL == (eoph = memchr(ln, ';', sz)))
354 eoph = eoln - 3;
355 else
356 eoph++;
357
358 /* Only account for the "coding" phrase. */
359
360 if ((phsz = (size_t)(eoph - ln)) < 7 ||
361 strncasecmp(ln, "coding:", 7)) {
362 sz -= phsz;
363 ln += phsz;
364 continue;
365 }
366
367 sz -= 7;
368 ln += 7;
369
370 while (sz > 0 && ' ' == *ln) {
371 ln++;
372 sz--;
373 }
374 if (0 == sz)
375 break;
376
377 /* Check us against known encodings. */
378
379 for (i = 0; i < (int)ENC__MAX; i++) {
380 nsz = strlen(encs[i].name);
381 if (phsz < nsz)
382 continue;
383 if (strncasecmp(ln, encs[i].name, nsz))
384 continue;
385
386 *enc = (enum enc)i;
387 return(1);
388 }
389
390 /* Unknown encoding. */
391
392 *enc = ENC__MAX;
393 return(1);
394 }
395
396 return(0);
397 }
398
399 int
400 main(int argc, char *argv[])
401 {
402 int i, ch, map, fd, rc;
403 struct buf b;
404 const char *fn;
405 enum enc enc, def;
406 unsigned char bom[3] = { 0xEF, 0xBB, 0xBF };
407 size_t offs;
408 extern int optind;
409 extern char *optarg;
410
411 progname = strrchr(argv[0], '/');
412 if (progname == NULL)
413 progname = argv[0];
414 else
415 ++progname;
416
417 fn = "<stdin>";
418 fd = STDIN_FILENO;
419 rc = EXIT_FAILURE;
420 enc = def = ENC__MAX;
421 map = 0;
422
423 memset(&b, 0, sizeof(struct buf));
424
425 while (-1 != (ch = getopt(argc, argv, "D:e:rdvh")))
426 switch (ch) {
427 case ('D'):
428 /* FALLTHROUGH */
429 case ('e'):
430 for (i = 0; i < (int)ENC__MAX; i++) {
431 if (strcasecmp(optarg, encs[i].name))
432 continue;
433 break;
434 }
435 if (i < (int)ENC__MAX) {
436 if ('D' == ch)
437 def = (enum enc)i;
438 else
439 enc = (enum enc)i;
440 break;
441 }
442
443 fprintf(stderr, "%s: Bad encoding\n", optarg);
444 return(EXIT_FAILURE);
445 case ('r'):
446 /* FALLTHROUGH */
447 case ('d'):
448 /* FALLTHROUGH */
449 case ('v'):
450 /* Compatibility with GNU preconv. */
451 break;
452 case ('h'):
453 /* Compatibility with GNU preconv. */
454 /* FALLTHROUGH */
455 default:
456 usage();
457 return(EXIT_FAILURE);
458 }
459
460 argc -= optind;
461 argv += optind;
462
463 /*
464 * Open and read the first argument on the command-line.
465 * If we don't have one, we default to stdin.
466 */
467
468 if (argc > 0) {
469 fn = *argv;
470 fd = open(fn, O_RDONLY, 0);
471 if (-1 == fd) {
472 perror(fn);
473 return(EXIT_FAILURE);
474 }
475 }
476
477 if ( ! read_whole_file(fn, fd, &b, &map))
478 goto out;
479
480 /* Try to read the UTF-8 BOM. */
481
482 if (ENC__MAX == enc)
483 if (b.sz > 3 && 0 == memcmp(b.buf, bom, 3)) {
484 b.offs = 3;
485 enc = ENC_UTF_8;
486 }
487
488 /* Try reading from the "-*-" cue. */
489
490 if (ENC__MAX == enc) {
491 offs = b.offs;
492 ch = cue_enc(&b, &offs, &enc);
493 if (0 == ch)
494 ch = cue_enc(&b, &offs, &enc);
495 }
496
497 /*
498 * No encoding has been detected.
499 * Thus, we either fall into our default encoder, if specified,
500 * or use Latin-1 if all else fails.
501 */
502
503 if (ENC__MAX == enc)
504 enc = ENC__MAX == def ? ENC_LATIN_1 : def;
505
506 if ( ! (*encs[(int)enc].conv)(&b)) {
507 fprintf(stderr, "%s: Bad encoding\n", fn);
508 goto out;
509 }
510
511 rc = EXIT_SUCCESS;
512 out:
513 if (map)
514 munmap(b.buf, b.sz);
515 else
516 free(b.buf);
517
518 if (fd > STDIN_FILENO)
519 close(fd);
520
521 return(rc);
522 }