]> git.cameronkatri.com Git - mandoc.git/blob - preconv.c
ce091ec88afc126ed13cff45786f392b361f5c24
[mandoc.git] / preconv.c
1 /* $Id: preconv.c,v 1.1 2011/05/26 00:30:11 kristaps Exp $ */
2 /*
3 * Copyright (c) 2011 Kristaps Dzonsons <kristaps@bsd.lv>
4 *
5 * Permission to use, copy, modify, and distribute this software for any
6 * purpose with or without fee is hereby granted, provided that the above
7 * copyright notice and this permission notice appear in all copies.
8 *
9 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
10 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
11 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
12 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
13 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
14 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
15 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
16 */
17 #ifdef HAVE_CONFIG_H
18 #include "config.h"
19 #endif
20
#include <sys/stat.h>
#include <sys/mman.h>

#include <assert.h>
#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <strings.h>
#include <unistd.h>
30
31 /*
32 * The read_whole_file() and resize_buf() functions are copied from
33 * read.c, including all dependency code (MAP_FILE, etc.).
34 */
35
/*
 * If the system headers did not supply MAP_FILE, define it as 0 so
 * that it can be OR-ed into the mmap() flags without any effect.
 */
#if !defined(MAP_FILE)
# define MAP_FILE 0
#endif
39
/*
 * Input encodings known to this utility.
 * The enumerators are used as indices into the encs[] table.
 */
enum enc {
	ENC_UTF_8 = 0,	/* UTF-8 */
	ENC_US_ASCII,	/* US-ASCII */
	ENC_LATIN_1,	/* Latin-1 */
	ENC__MAX	/* number of encodings; also used as "not set" */
};
46
/*
 * The raw input together with the position at which conversion
 * is to begin.
 */
struct buf {
	char	*buf;	/* raw (binary) input bytes */
	size_t	 sz;	/* number of bytes held in buf */
	size_t	 offs;	/* offset of the first byte to convert */
};
52
/*
 * One supported encoding: the name matched against the -D/-e
 * arguments and the routine that converts a buffer in that encoding.
 */
struct encode {
	const char	 *name;	/* encoding name (compared case-insensitively) */
	int		(*conv)(const struct buf *b); /* returns non-zero on success */
};
57
/* Forward declarations of the file-local functions. */
static int	conv_latin_1(const struct buf *b);
static int	conv_us_ascii(const struct buf *b);
static int	conv_utf_8(const struct buf *b);
static int	read_whole_file(const char *f, int fd,
			struct buf *fb, int *with_mmap);
static void	resize_buf(struct buf *buf, size_t initial);
static void	usage(void);
65
66 static const struct encode encs[ENC__MAX] = {
67 { "utf-8", conv_utf_8 }, /* ENC_UTF_8 */
68 { "us-ascii", conv_us_ascii }, /* ENC_US_ASCII */
69 { "latin-1", conv_latin_1 }, /* ENC_LATIN_1 */
70 };
71
/* Name this program was invoked as (basename of argv[0]); set in main(). */
static const char *progname = NULL;
73
74 static void
75 usage(void)
76 {
77
78 fprintf(stderr, "usage: %s "
79 "[-D enc] "
80 "[-e ENC] "
81 "[file]\n", progname);
82 }
83
84 static int
85 conv_latin_1(const struct buf *b)
86 {
87 size_t i;
88 unsigned char c;
89 const char *cp;
90
91 cp = b->buf + (int)b->offs;
92
93 /*
94 * Latin-1 falls into the first 256 code-points of Unicode, so
95 * there's no need for any sort of translation. Just make the
96 * 8-bit characters use the Unicode escape.
97 */
98
99 for (i = b->offs; i < b->sz; i++) {
100 c = (unsigned char)*cp++;
101 c < 128 ? putchar(c) : printf("\\[u%.4X]", c);
102 }
103
104 return(1);
105 }
106
107 static int
108 conv_us_ascii(const struct buf *b)
109 {
110
111 /*
112 * US-ASCII has no conversion since it falls into the first 128
113 * bytes of Unicode.
114 */
115
116 fwrite(b->buf, 1, b->sz, stdout);
117 return(1);
118 }
119
/*
 * Converter registered for the "utf-8" encoding.
 *
 * NOTE(review): this is a placeholder.  It does not look at the
 * buffer, writes nothing to stdout, and always reports success.
 */
static int
conv_utf_8(const struct buf *b)
{

	(void)b;	/* the input is not examined */
	return(1);
}
126
127 static void
128 resize_buf(struct buf *buf, size_t initial)
129 {
130
131 buf->sz = buf->sz > initial / 2 ?
132 2 * buf->sz : initial;
133
134 buf->buf = realloc(buf->buf, buf->sz);
135 if (NULL == buf->buf) {
136 perror(NULL);
137 exit(EXIT_FAILURE);
138 }
139 }
140
141 static int
142 read_whole_file(const char *f, int fd,
143 struct buf *fb, int *with_mmap)
144 {
145 struct stat st;
146 size_t off;
147 ssize_t ssz;
148
149 if (-1 == fstat(fd, &st)) {
150 perror(f);
151 return(0);
152 }
153
154 /*
155 * If we're a regular file, try just reading in the whole entry
156 * via mmap(). This is faster than reading it into blocks, and
157 * since each file is only a few bytes to begin with, I'm not
158 * concerned that this is going to tank any machines.
159 */
160
161 if (S_ISREG(st.st_mode) && st.st_size >= (1U << 31)) {
162 fprintf(stderr, "%s: input too large\n", f);
163 return(0);
164 }
165
166 if (S_ISREG(st.st_mode)) {
167 *with_mmap = 1;
168 fb->sz = (size_t)st.st_size;
169 fb->buf = mmap(NULL, fb->sz, PROT_READ,
170 MAP_FILE|MAP_SHARED, fd, 0);
171 if (fb->buf != MAP_FAILED)
172 return(1);
173 }
174
175 /*
176 * If this isn't a regular file (like, say, stdin), then we must
177 * go the old way and just read things in bit by bit.
178 */
179
180 *with_mmap = 0;
181 off = 0;
182 fb->sz = 0;
183 fb->buf = NULL;
184 for (;;) {
185 if (off == fb->sz && fb->sz == (1U << 31)) {
186 fprintf(stderr, "%s: input too large\n", f);
187 break;
188 }
189
190 if (off == fb->sz)
191 resize_buf(fb, 65536);
192
193 ssz = read(fd, fb->buf + (int)off, fb->sz - off);
194 if (ssz == 0) {
195 fb->sz = off;
196 return(1);
197 }
198 if (ssz == -1) {
199 perror(f);
200 break;
201 }
202 off += (size_t)ssz;
203 }
204
205 free(fb->buf);
206 fb->buf = NULL;
207 return(0);
208 }
209
210 int
211 main(int argc, char *argv[])
212 {
213 int i, ch, map, fd, rc;
214 struct buf buf;
215 const char *fn;
216 enum enc enc, def;
217 extern int optind;
218 extern char *optarg;
219
220 progname = strrchr(argv[0], '/');
221 if (progname == NULL)
222 progname = argv[0];
223 else
224 ++progname;
225
226 fn = "<stdin>";
227 fd = STDIN_FILENO;
228 rc = EXIT_FAILURE;
229 enc = def = ENC__MAX;
230 map = 0;
231
232 memset(&buf, 0, sizeof(struct buf));
233
234 while (-1 != (ch = getopt(argc, argv, "D:e:rdvh")))
235 switch (ch) {
236 case ('D'):
237 /* FALLTHROUGH */
238 case ('e'):
239 for (i = 0; i < ENC__MAX; i++) {
240 if (strcasecmp(optarg, encs[i].name))
241 continue;
242 break;
243 }
244 if (i < ENC__MAX) {
245 if ('D' == ch)
246 def = (enum enc)i;
247 else
248 enc = (enum enc)i;
249 break;
250 }
251
252 fprintf(stderr, "%s: Bad encoding\n", optarg);
253 return(EXIT_FAILURE);
254 case ('r'):
255 /* FALLTHROUGH */
256 case ('d'):
257 /* FALLTHROUGH */
258 case ('v'):
259 /* Compatibility with GNU preconv. */
260 break;
261 case ('h'):
262 /* Compatibility with GNU preconv. */
263 /* FALLTHROUGH */
264 default:
265 usage();
266 return(EXIT_FAILURE);
267 }
268
269 argc -= optind;
270 argv += optind;
271
272 /*
273 * Open and read the first argument on the command-line.
274 * If we don't have one, we default to stdin.
275 */
276
277 if (argc > 0) {
278 fn = *argv;
279 fd = open(fn, O_RDONLY, 0);
280 if (-1 == fd) {
281 perror(fn);
282 return(EXIT_FAILURE);
283 }
284 }
285
286 if ( ! read_whole_file(fn, fd, &buf, &map))
287 goto out;
288
289 if (ENC__MAX == enc) {
290 /* TODO: search for BOM. */
291 }
292
293 /*
294 * No encoding has been detected.
295 * Thus, we either fall into our default encoder, if specified,
296 * or use Latin-1 if all else fails.
297 */
298
299 if (ENC__MAX == enc)
300 enc = ENC__MAX == def ? ENC_LATIN_1 : def;
301
302 if ( ! (*encs[(int)enc].conv)(&buf))
303 goto out;
304
305 rc = EXIT_SUCCESS;
306 out:
307 if (map)
308 munmap(buf.buf, buf.sz);
309 else
310 free(buf.buf);
311
312 if (fd > STDIN_FILENO)
313 close(fd);
314
315 return(rc);
316 }