]> git.cameronkatri.com Git - mandoc.git/blob - preconv.c
document new SEE ALSO .Xr warnings
[mandoc.git] / preconv.c
1 /* $Id: preconv.c,v 1.8 2014/08/16 19:00:01 schwarze Exp $ */
2 /*
3 * Copyright (c) 2011 Kristaps Dzonsons <kristaps@bsd.lv>
4 *
5 * Permission to use, copy, modify, and distribute this software for any
6 * purpose with or without fee is hereby granted, provided that the above
7 * copyright notice and this permission notice appear in all copies.
8 *
9 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16 */
17 #include "config.h"
18
19 #include <sys/types.h>
20 #if HAVE_MMAP
21 #include <sys/stat.h>
22 #include <sys/mman.h>
23 #endif
24
25 #include <assert.h>
26 #include <fcntl.h>
27 #include <stdio.h>
28 #include <stdlib.h>
29 #include <string.h>
30 #include <unistd.h>
31
32 /*
33 * The read_whole_file() and resize_buf() functions are copied from
34 * read.c, including all dependency code.
35 */
36
/*
 * Input encodings known to the converter.
 * The enumerators are used as indices into the encs[] table, and
 * ENC__MAX is used both as the table size and as the marker for
 * "no encoding selected".
 */
enum enc {
	ENC_UTF_8 = 0,	/* UTF-8 */
	ENC_US_ASCII,	/* US-ASCII */
	ENC_LATIN_1,	/* Latin-1 */
	ENC__MAX	/* number of encodings / none selected */
};
43
/*
 * An input file held in memory, together with the position at which
 * conversion is supposed to start.
 */
struct buf {
	char	*buf;	/* raw input bytes */
	size_t	 sz;	/* number of bytes in the buffer */
	size_t	 offs;	/* offset of the first byte to convert */
};
49
/*
 * One supported encoding: its name and the routine converting a
 * buffer in that encoding to standard output.
 */
struct encode {
	const char	 *name;	/* compared case-insensitively */
	int		(*conv)(const struct buf *); /* zero means failure */
};
54
/* Forward declarations of the file-local helpers. */
static	int	 cue_enc(const struct buf *b, size_t *offs, enum enc *enc);
static	int	 conv_latin_1(const struct buf *b);
static	int	 conv_us_ascii(const struct buf *b);
static	int	 conv_utf_8(const struct buf *b);
static	int	 read_whole_file(const char *f, int fd,
			struct buf *fb, int *with_mmap);
static	void	 resize_buf(struct buf *buf, size_t initial);
static	void	 usage(void);
63
64 static const struct encode encs[ENC__MAX] = {
65 { "utf-8", conv_utf_8 }, /* ENC_UTF_8 */
66 { "us-ascii", conv_us_ascii }, /* ENC_US_ASCII */
67 { "latin-1", conv_latin_1 }, /* ENC_LATIN_1 */
68 };
69
/* Last path component of argv[0]; set by main(), printed by usage(). */
static const char	*progname;
71
72 static void
73 usage(void)
74 {
75
76 fprintf(stderr, "usage: %s "
77 "[-D enc] "
78 "[-e ENC] "
79 "[file]\n", progname);
80 }
81
82 static int
83 conv_latin_1(const struct buf *b)
84 {
85 size_t i;
86 unsigned char cu;
87 const char *cp;
88
89 cp = b->buf + (int)b->offs;
90
91 /*
92 * Latin-1 falls into the first 256 code-points of Unicode, so
93 * there's no need for any sort of translation. Just make the
94 * 8-bit characters use the Unicode escape.
95 * Note that binary values 128 < v < 160 are passed through
96 * unmodified to mandoc.
97 */
98
99 for (i = b->offs; i < b->sz; i++) {
100 cu = (unsigned char)*cp++;
101 cu < 160U ? putchar(cu) : printf("\\[u%.4X]", cu);
102 }
103
104 return(1);
105 }
106
107 static int
108 conv_us_ascii(const struct buf *b)
109 {
110
111 /*
112 * US-ASCII has no conversion since it falls into the first 128
113 * bytes of Unicode.
114 */
115
116 fwrite(b->buf, 1, b->sz, stdout);
117 return(1);
118 }
119
120 static int
121 conv_utf_8(const struct buf *b)
122 {
123 int state, be;
124 unsigned int accum;
125 size_t i;
126 unsigned char cu;
127 const char *cp;
128 const long one = 1L;
129
130 cp = b->buf + (int)b->offs;
131 state = 0;
132 accum = 0U;
133 be = 0;
134
135 /* Quick test for big-endian value. */
136
137 if ( ! (*((const char *)(&one))))
138 be = 1;
139
140 for (i = b->offs; i < b->sz; i++) {
141 cu = (unsigned char)*cp++;
142 if (state) {
143 if ( ! (cu & 128) || (cu & 64)) {
144 /* Bad sequence header. */
145 return(0);
146 }
147
148 /* Accept only legitimate bit patterns. */
149
150 if (cu > 191 || cu < 128) {
151 /* Bad in-sequence bits. */
152 return(0);
153 }
154
155 accum |= (cu & 63) << --state * 6;
156
157 /*
158 * Accum is held in little-endian order as
159 * stipulated by the UTF-8 sequence coding. We
160 * need to convert to a native big-endian if our
161 * architecture requires it.
162 */
163
164 if (0 == state && be)
165 accum = (accum >> 24) |
166 ((accum << 8) & 0x00FF0000) |
167 ((accum >> 8) & 0x0000FF00) |
168 (accum << 24);
169
170 if (0 == state) {
171 accum < 128U ? putchar(accum) :
172 printf("\\[u%.4X]", accum);
173 accum = 0U;
174 }
175 } else if (cu & (1 << 7)) {
176 /*
177 * Entering a UTF-8 state: if we encounter a
178 * UTF-8 bitmask, calculate the expected UTF-8
179 * state from it.
180 */
181 for (state = 0; state < 7; state++)
182 if ( ! (cu & (1 << (7 - state))))
183 break;
184
185 /* Accept only legitimate bit patterns. */
186
187 switch (state) {
188 case (4):
189 if (cu <= 244 && cu >= 240) {
190 accum = (cu & 7) << 18;
191 break;
192 }
193 /* Bad 4-sequence start bits. */
194 return(0);
195 case (3):
196 if (cu <= 239 && cu >= 224) {
197 accum = (cu & 15) << 12;
198 break;
199 }
200 /* Bad 3-sequence start bits. */
201 return(0);
202 case (2):
203 if (cu <= 223 && cu >= 194) {
204 accum = (cu & 31) << 6;
205 break;
206 }
207 /* Bad 2-sequence start bits. */
208 return(0);
209 default:
210 /* Bad sequence bit mask. */
211 return(0);
212 }
213 state--;
214 } else
215 putchar(cu);
216 }
217
218 if (0 != state) {
219 /* Bad trailing bits. */
220 return(0);
221 }
222
223 return(1);
224 }
225
226 static void
227 resize_buf(struct buf *buf, size_t initial)
228 {
229
230 buf->sz = buf->sz > initial / 2 ?
231 2 * buf->sz : initial;
232
233 buf->buf = realloc(buf->buf, buf->sz);
234 if (NULL == buf->buf) {
235 perror(NULL);
236 exit(EXIT_FAILURE);
237 }
238 }
239
240 static int
241 read_whole_file(const char *f, int fd,
242 struct buf *fb, int *with_mmap)
243 {
244 size_t off;
245 ssize_t ssz;
246
247 #if HAVE_MMAP
248 struct stat st;
249 if (-1 == fstat(fd, &st)) {
250 perror(f);
251 return(0);
252 }
253
254 /*
255 * If we're a regular file, try just reading in the whole entry
256 * via mmap(). This is faster than reading it into blocks, and
257 * since each file is only a few bytes to begin with, I'm not
258 * concerned that this is going to tank any machines.
259 */
260
261 if (S_ISREG(st.st_mode) && st.st_size >= (1U << 31)) {
262 fprintf(stderr, "%s: input too large\n", f);
263 return(0);
264 }
265
266 if (S_ISREG(st.st_mode)) {
267 *with_mmap = 1;
268 fb->sz = (size_t)st.st_size;
269 fb->buf = mmap(NULL, fb->sz, PROT_READ, MAP_SHARED, fd, 0);
270 if (fb->buf != MAP_FAILED)
271 return(1);
272 }
273 #endif
274
275 /*
276 * If this isn't a regular file (like, say, stdin), then we must
277 * go the old way and just read things in bit by bit.
278 */
279
280 *with_mmap = 0;
281 off = 0;
282 fb->sz = 0;
283 fb->buf = NULL;
284 for (;;) {
285 if (off == fb->sz && fb->sz == (1U << 31)) {
286 fprintf(stderr, "%s: input too large\n", f);
287 break;
288 }
289
290 if (off == fb->sz)
291 resize_buf(fb, 65536);
292
293 ssz = read(fd, fb->buf + (int)off, fb->sz - off);
294 if (ssz == 0) {
295 fb->sz = off;
296 return(1);
297 }
298 if (ssz == -1) {
299 perror(f);
300 break;
301 }
302 off += (size_t)ssz;
303 }
304
305 free(fb->buf);
306 fb->buf = NULL;
307 return(0);
308 }
309
310 static int
311 cue_enc(const struct buf *b, size_t *offs, enum enc *enc)
312 {
313 const char *ln, *eoln, *eoph;
314 size_t sz, phsz, nsz;
315 int i;
316
317 ln = b->buf + (int)*offs;
318 sz = b->sz - *offs;
319
320 /* Look for the end-of-line. */
321
322 if (NULL == (eoln = memchr(ln, '\n', sz)))
323 return(-1);
324
325 /* Set next-line marker. */
326
327 *offs = (size_t)((eoln + 1) - b->buf);
328
329 /* Check if we have the correct header/trailer. */
330
331 if ((sz = (size_t)(eoln - ln)) < 10 ||
332 memcmp(ln, ".\\\" -*-", 7) ||
333 memcmp(eoln - 3, "-*-", 3))
334 return(0);
335
336 /* Move after the header and adjust for the trailer. */
337
338 ln += 7;
339 sz -= 10;
340
341 while (sz > 0) {
342 while (sz > 0 && ' ' == *ln) {
343 ln++;
344 sz--;
345 }
346 if (0 == sz)
347 break;
348
349 /* Find the end-of-phrase marker (or eoln). */
350
351 if (NULL == (eoph = memchr(ln, ';', sz)))
352 eoph = eoln - 3;
353 else
354 eoph++;
355
356 /* Only account for the "coding" phrase. */
357
358 if ((phsz = (size_t)(eoph - ln)) < 7 ||
359 strncasecmp(ln, "coding:", 7)) {
360 sz -= phsz;
361 ln += phsz;
362 continue;
363 }
364
365 sz -= 7;
366 ln += 7;
367
368 while (sz > 0 && ' ' == *ln) {
369 ln++;
370 sz--;
371 }
372 if (0 == sz)
373 break;
374
375 /* Check us against known encodings. */
376
377 for (i = 0; i < (int)ENC__MAX; i++) {
378 nsz = strlen(encs[i].name);
379 if (phsz < nsz)
380 continue;
381 if (strncasecmp(ln, encs[i].name, nsz))
382 continue;
383
384 *enc = (enum enc)i;
385 return(1);
386 }
387
388 /* Unknown encoding. */
389
390 *enc = ENC__MAX;
391 return(1);
392 }
393
394 return(0);
395 }
396
397 int
398 main(int argc, char *argv[])
399 {
400 int i, ch, map, fd, rc;
401 struct buf b;
402 const char *fn;
403 enum enc enc, def;
404 unsigned char bom[3] = { 0xEF, 0xBB, 0xBF };
405 size_t offs;
406 extern int optind;
407 extern char *optarg;
408
409 progname = strrchr(argv[0], '/');
410 if (progname == NULL)
411 progname = argv[0];
412 else
413 ++progname;
414
415 fn = "<stdin>";
416 fd = STDIN_FILENO;
417 rc = EXIT_FAILURE;
418 enc = def = ENC__MAX;
419 map = 0;
420
421 memset(&b, 0, sizeof(struct buf));
422
423 while (-1 != (ch = getopt(argc, argv, "D:e:rdvh")))
424 switch (ch) {
425 case ('D'):
426 /* FALLTHROUGH */
427 case ('e'):
428 for (i = 0; i < (int)ENC__MAX; i++) {
429 if (strcasecmp(optarg, encs[i].name))
430 continue;
431 break;
432 }
433 if (i < (int)ENC__MAX) {
434 if ('D' == ch)
435 def = (enum enc)i;
436 else
437 enc = (enum enc)i;
438 break;
439 }
440
441 fprintf(stderr, "%s: Bad encoding\n", optarg);
442 return(EXIT_FAILURE);
443 case ('r'):
444 /* FALLTHROUGH */
445 case ('d'):
446 /* FALLTHROUGH */
447 case ('v'):
448 /* Compatibility with GNU preconv. */
449 break;
450 case ('h'):
451 /* Compatibility with GNU preconv. */
452 /* FALLTHROUGH */
453 default:
454 usage();
455 return(EXIT_FAILURE);
456 }
457
458 argc -= optind;
459 argv += optind;
460
461 /*
462 * Open and read the first argument on the command-line.
463 * If we don't have one, we default to stdin.
464 */
465
466 if (argc > 0) {
467 fn = *argv;
468 fd = open(fn, O_RDONLY, 0);
469 if (-1 == fd) {
470 perror(fn);
471 return(EXIT_FAILURE);
472 }
473 }
474
475 if ( ! read_whole_file(fn, fd, &b, &map))
476 goto out;
477
478 /* Try to read the UTF-8 BOM. */
479
480 if (ENC__MAX == enc)
481 if (b.sz > 3 && 0 == memcmp(b.buf, bom, 3)) {
482 b.offs = 3;
483 enc = ENC_UTF_8;
484 }
485
486 /* Try reading from the "-*-" cue. */
487
488 if (ENC__MAX == enc) {
489 offs = b.offs;
490 ch = cue_enc(&b, &offs, &enc);
491 if (0 == ch)
492 ch = cue_enc(&b, &offs, &enc);
493 }
494
495 /*
496 * No encoding has been detected.
497 * Thus, we either fall into our default encoder, if specified,
498 * or use Latin-1 if all else fails.
499 */
500
501 if (ENC__MAX == enc)
502 enc = ENC__MAX == def ? ENC_LATIN_1 : def;
503
504 if ( ! (*encs[(int)enc].conv)(&b)) {
505 fprintf(stderr, "%s: Bad encoding\n", fn);
506 goto out;
507 }
508
509 rc = EXIT_SUCCESS;
510 out:
511 #if HAVE_MMAP
512 if (map)
513 munmap(b.buf, b.sz);
514 else
515 #endif
516 free(b.buf);
517
518 if (fd > STDIN_FILENO)
519 close(fd);
520
521 return(rc);
522 }