]> git.cameronkatri.com Git - mandoc.git/blob - preconv.c
da5af9b4f5ead994acf16f12e0772c6749ccc55b
[mandoc.git] / preconv.c
1 /* $Id: preconv.c,v 1.2 2011/05/26 12:01:14 kristaps Exp $ */
2 /*
3 * Copyright (c) 2011 Kristaps Dzonsons <kristaps@bsd.lv>
4 *
5 * Permission to use, copy, modify, and distribute this software for any
6 * purpose with or without fee is hereby granted, provided that the above
7 * copyright notice and this permission notice appear in all copies.
8 *
9 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16 */
17 #ifdef HAVE_CONFIG_H
18 #include "config.h"
19 #endif
20
21 #include <sys/stat.h>
22 #include <sys/mman.h>
23
24 #include <assert.h>
25 #include <fcntl.h>
26 #include <stdio.h>
27 #include <stdlib.h>
28 #include <string.h>
29 #include <unistd.h>
30
31 /*
32 * The read_whole_file() and resize_buf() functions are copied from
33 * read.c, including all dependency code (MAP_FILE, etc.).
34 */
35
/*
 * Fallback for systems whose headers do not define MAP_FILE: make it
 * zero so that OR-ing it into the mmap() flags is a no-op.
 */
#if !defined(MAP_FILE)
# define MAP_FILE 0
#endif
39
/*
 * Input encodings this utility can convert.  The values index the
 * encs[] conversion table; ENC__MAX is the number of real encodings and
 * also serves as the "no encoding chosen" marker.
 */
enum enc {
	ENC_UTF_8 = 0,	/* UTF-8 */
	ENC_US_ASCII,	/* US-ASCII */
	ENC_LATIN_1,	/* Latin-1 */
	ENC__MAX	/* count of the above; not an encoding */
};
46
/*
 * The complete input held in memory, plus the position at which
 * conversion should begin.
 */
struct buf {
	char	*buf;	/* binary input buffer */
	size_t	 sz;	/* size of binary buffer, in bytes */
	size_t	 offs;	/* offset of the first byte to convert */
};
52
/*
 * One row of the conversion table: the encoding's command-line name and
 * the routine that converts input in that encoding.
 */
struct encode {
	const char	 *name;	/* name accepted by -D and -e */
	int		(*conv)(const struct buf *);	/* converter; 0 means failure */
};
57
/* Forward declarations of the file-local functions defined below. */
static	int	 conv_latin_1(const struct buf *b);
static	int	 conv_us_ascii(const struct buf *b);
static	int	 conv_utf_8(const struct buf *b);
static	int	 read_whole_file(const char *f, int fd,
			struct buf *fb, int *with_mmap);
static	void	 resize_buf(struct buf *buf, size_t initial);
static	void	 usage(void);
65
66 static const struct encode encs[ENC__MAX] = {
67 { "utf-8", conv_utf_8 }, /* ENC_UTF_8 */
68 { "us-ascii", conv_us_ascii }, /* ENC_US_ASCII */
69 { "latin-1", conv_latin_1 }, /* ENC_LATIN_1 */
70 };
71
72 static const char *progname;
73
74 static void
75 usage(void)
76 {
77
78 fprintf(stderr, "usage: %s "
79 "[-D enc] "
80 "[-e ENC] "
81 "[file]\n", progname);
82 }
83
84 static int
85 conv_latin_1(const struct buf *b)
86 {
87 size_t i;
88 unsigned char cu;
89 const char *cp;
90
91 cp = b->buf + (int)b->offs;
92
93 /*
94 * Latin-1 falls into the first 256 code-points of Unicode, so
95 * there's no need for any sort of translation. Just make the
96 * 8-bit characters use the Unicode escape.
97 */
98
99 for (i = b->offs; i < b->sz; i++) {
100 cu = (unsigned char)*cp++;
101 cu < 128U ? putchar(cu) : printf("\\[u%.4X]", cu);
102 }
103
104 return(1);
105 }
106
107 static int
108 conv_us_ascii(const struct buf *b)
109 {
110
111 /*
112 * US-ASCII has no conversion since it falls into the first 128
113 * bytes of Unicode.
114 */
115
116 fwrite(b->buf, 1, b->sz, stdout);
117 return(1);
118 }
119
120 static int
121 conv_utf_8(const struct buf *b)
122 {
123 int state, be;
124 unsigned int accum;
125 size_t i;
126 unsigned char cu;
127 const char *cp;
128 const long one = 1L;
129
130 cp = b->buf + (int)b->offs;
131 state = 0;
132 accum = 0U;
133 be = 0;
134
135 /* Quick test for big-endian value. */
136
137 if ( ! (*((char *)(&one))))
138 be = 1;
139
140 for (i = b->offs; i < b->sz; i++) {
141 cu = (unsigned char)*cp++;
142 if (state) {
143 if ( ! (cu & 128) || (cu & 64)) {
144 /* Bad sequence header. */
145 return(0);
146 }
147
148 /* Accept only legitimate bit patterns. */
149
150 if (cu > 191 || cu < 128) {
151 /* Bad in-sequence bits. */
152 return(0);
153 }
154
155 accum |= (cu & 63) << --state * 6;
156
157 /*
158 * Accum is held in little-endian order as
159 * stipulated by the UTF-8 sequence coding. We
160 * need to convert to a native big-endian if our
161 * architecture requires it.
162 */
163
164 if (0 == state && be)
165 accum = (accum >> 24) |
166 ((accum << 8) & 0x00FF0000) |
167 ((accum >> 8) & 0x0000FF00) |
168 (accum << 24);
169
170 if (0 == state) {
171 accum < 128U ? putchar(accum) :
172 printf("\\[u%.4X]", accum);
173 accum = 0U;
174 }
175 } else if (cu & (1 << 7)) {
176 /*
177 * Entering a UTF-8 state: if we encounter a
178 * UTF-8 bitmask, calculate the expected UTF-8
179 * state from it.
180 */
181 for (state = 0; state < 7; state++)
182 if ( ! (cu & (1 << (7 - state))))
183 break;
184
185 /* Accept only legitimate bit patterns. */
186
187 switch (state) {
188 case (4):
189 if (cu <= 244 && cu >= 240) {
190 accum = (cu & 7) << 18;
191 break;
192 }
193 /* Bad 4-sequence start bits. */
194 return(0);
195 case (3):
196 if (cu <= 239 && cu >= 224) {
197 accum = (cu & 15) << 12;
198 break;
199 }
200 /* Bad 3-sequence start bits. */
201 return(0);
202 case (2):
203 if (cu <= 223 && cu >= 194) {
204 accum = (cu & 31) << 6;
205 break;
206 }
207 /* Bad 2-sequence start bits. */
208 return(0);
209 default:
210 /* Bad sequence bit mask. */
211 return(0);
212 }
213 state--;
214 } else
215 putchar(cu);
216 }
217
218 if (0 != state) {
219 /* Bad trailing bits. */
220 return(0);
221 }
222
223 return(1);
224 }
225
226 static void
227 resize_buf(struct buf *buf, size_t initial)
228 {
229
230 buf->sz = buf->sz > initial / 2 ?
231 2 * buf->sz : initial;
232
233 buf->buf = realloc(buf->buf, buf->sz);
234 if (NULL == buf->buf) {
235 perror(NULL);
236 exit(EXIT_FAILURE);
237 }
238 }
239
240 static int
241 read_whole_file(const char *f, int fd,
242 struct buf *fb, int *with_mmap)
243 {
244 struct stat st;
245 size_t off;
246 ssize_t ssz;
247
248 if (-1 == fstat(fd, &st)) {
249 perror(f);
250 return(0);
251 }
252
253 /*
254 * If we're a regular file, try just reading in the whole entry
255 * via mmap(). This is faster than reading it into blocks, and
256 * since each file is only a few bytes to begin with, I'm not
257 * concerned that this is going to tank any machines.
258 */
259
260 if (S_ISREG(st.st_mode) && st.st_size >= (1U << 31)) {
261 fprintf(stderr, "%s: input too large\n", f);
262 return(0);
263 }
264
265 if (S_ISREG(st.st_mode)) {
266 *with_mmap = 1;
267 fb->sz = (size_t)st.st_size;
268 fb->buf = mmap(NULL, fb->sz, PROT_READ,
269 MAP_FILE|MAP_SHARED, fd, 0);
270 if (fb->buf != MAP_FAILED)
271 return(1);
272 }
273
274 /*
275 * If this isn't a regular file (like, say, stdin), then we must
276 * go the old way and just read things in bit by bit.
277 */
278
279 *with_mmap = 0;
280 off = 0;
281 fb->sz = 0;
282 fb->buf = NULL;
283 for (;;) {
284 if (off == fb->sz && fb->sz == (1U << 31)) {
285 fprintf(stderr, "%s: input too large\n", f);
286 break;
287 }
288
289 if (off == fb->sz)
290 resize_buf(fb, 65536);
291
292 ssz = read(fd, fb->buf + (int)off, fb->sz - off);
293 if (ssz == 0) {
294 fb->sz = off;
295 return(1);
296 }
297 if (ssz == -1) {
298 perror(f);
299 break;
300 }
301 off += (size_t)ssz;
302 }
303
304 free(fb->buf);
305 fb->buf = NULL;
306 return(0);
307 }
308
309 int
310 main(int argc, char *argv[])
311 {
312 int i, ch, map, fd, rc;
313 struct buf b;
314 const char *fn;
315 enum enc enc, def;
316 const char bom[3] = { 0xEF, 0xBB, 0xBF };
317 extern int optind;
318 extern char *optarg;
319
320 progname = strrchr(argv[0], '/');
321 if (progname == NULL)
322 progname = argv[0];
323 else
324 ++progname;
325
326 fn = "<stdin>";
327 fd = STDIN_FILENO;
328 rc = EXIT_FAILURE;
329 enc = def = ENC__MAX;
330 map = 0;
331
332 memset(&b, 0, sizeof(struct buf));
333
334 while (-1 != (ch = getopt(argc, argv, "D:e:rdvh")))
335 switch (ch) {
336 case ('D'):
337 /* FALLTHROUGH */
338 case ('e'):
339 for (i = 0; i < ENC__MAX; i++) {
340 if (strcasecmp(optarg, encs[i].name))
341 continue;
342 break;
343 }
344 if (i < ENC__MAX) {
345 if ('D' == ch)
346 def = (enum enc)i;
347 else
348 enc = (enum enc)i;
349 break;
350 }
351
352 fprintf(stderr, "%s: Bad encoding\n", optarg);
353 return(EXIT_FAILURE);
354 case ('r'):
355 /* FALLTHROUGH */
356 case ('d'):
357 /* FALLTHROUGH */
358 case ('v'):
359 /* Compatibility with GNU preconv. */
360 break;
361 case ('h'):
362 /* Compatibility with GNU preconv. */
363 /* FALLTHROUGH */
364 default:
365 usage();
366 return(EXIT_FAILURE);
367 }
368
369 argc -= optind;
370 argv += optind;
371
372 /*
373 * Open and read the first argument on the command-line.
374 * If we don't have one, we default to stdin.
375 */
376
377 if (argc > 0) {
378 fn = *argv;
379 fd = open(fn, O_RDONLY, 0);
380 if (-1 == fd) {
381 perror(fn);
382 return(EXIT_FAILURE);
383 }
384 }
385
386 if ( ! read_whole_file(fn, fd, &b, &map))
387 goto out;
388
389 /* Try to read the UTF-8 BOM. */
390
391 if (ENC__MAX == enc)
392 if (b.sz > 3 && 0 == memcmp(b.buf, bom, 3)) {
393 b.offs = 3;
394 enc = ENC_UTF_8;
395 }
396
397 /*
398 * No encoding has been detected.
399 * Thus, we either fall into our default encoder, if specified,
400 * or use Latin-1 if all else fails.
401 */
402
403 if (ENC__MAX == enc)
404 enc = ENC__MAX == def ? ENC_LATIN_1 : def;
405
406 if ( ! (*encs[(int)enc].conv)(&b))
407 goto out;
408
409 rc = EXIT_SUCCESS;
410 out:
411 if (map)
412 munmap(b.buf, b.sz);
413 else
414 free(b.buf);
415
416 if (fd > STDIN_FILENO)
417 close(fd);
418
419 return(rc);
420 }