]> git.cameronkatri.com Git - mandoc.git/blob - html.c
First steps in HTML5: use UTF8 meta-charset and HTML5 doctype identifier.
[mandoc.git] / html.c
1 /* $Id: html.c,v 1.164 2014/09/27 08:54:34 kristaps Exp $ */
2 /*
3 * Copyright (c) 2008, 2009, 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv>
4 * Copyright (c) 2011, 2012, 2013, 2014 Ingo Schwarze <schwarze@openbsd.org>
5 *
6 * Permission to use, copy, modify, and distribute this software for any
7 * purpose with or without fee is hereby granted, provided that the above
8 * copyright notice and this permission notice appear in all copies.
9 *
10 * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11 * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12 * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13 * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14 * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15 * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16 * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17 */
18 #include "config.h"
19
20 #include <sys/types.h>
21
22 #include <assert.h>
23 #include <ctype.h>
24 #include <stdarg.h>
25 #include <stdio.h>
26 #include <stdint.h>
27 #include <stdlib.h>
28 #include <string.h>
29 #include <unistd.h>
30
31 #include "mandoc.h"
32 #include "mandoc_aux.h"
33 #include "libmandoc.h"
34 #include "out.h"
35 #include "html.h"
36 #include "main.h"
37
/*
 * Static description of one HTML element: the name printed inside the
 * angle brackets plus a bit mask of the HTML_* flags defined below.
 */
struct	htmldata {
	const char	 *name;		/* element name as written to output */
	int		  flags;	/* bitwise OR of the flags below */
#define	HTML_CLRLINE	 0x01	/* a newline is printed after the tag */
#define	HTML_NOSTACK	 0x02	/* never pushed onto the open-tag stack */
#define	HTML_AUTOCLOSE	 0x04	/* Tag has auto-closure. */
};
45
46 static const struct htmldata htmltags[TAG_MAX] = {
47 {"html", HTML_CLRLINE}, /* TAG_HTML */
48 {"head", HTML_CLRLINE}, /* TAG_HEAD */
49 {"body", HTML_CLRLINE}, /* TAG_BODY */
50 {"meta", HTML_CLRLINE | HTML_NOSTACK | HTML_AUTOCLOSE}, /* TAG_META */
51 {"title", HTML_CLRLINE}, /* TAG_TITLE */
52 {"div", HTML_CLRLINE}, /* TAG_DIV */
53 {"h1", 0}, /* TAG_H1 */
54 {"h2", 0}, /* TAG_H2 */
55 {"span", 0}, /* TAG_SPAN */
56 {"link", HTML_CLRLINE | HTML_NOSTACK | HTML_AUTOCLOSE}, /* TAG_LINK */
57 {"br", HTML_CLRLINE | HTML_NOSTACK | HTML_AUTOCLOSE}, /* TAG_BR */
58 {"a", 0}, /* TAG_A */
59 {"table", HTML_CLRLINE}, /* TAG_TABLE */
60 {"tbody", HTML_CLRLINE}, /* TAG_TBODY */
61 {"col", HTML_CLRLINE | HTML_NOSTACK | HTML_AUTOCLOSE}, /* TAG_COL */
62 {"tr", HTML_CLRLINE}, /* TAG_TR */
63 {"td", HTML_CLRLINE}, /* TAG_TD */
64 {"li", HTML_CLRLINE}, /* TAG_LI */
65 {"ul", HTML_CLRLINE}, /* TAG_UL */
66 {"ol", HTML_CLRLINE}, /* TAG_OL */
67 {"dl", HTML_CLRLINE}, /* TAG_DL */
68 {"dt", HTML_CLRLINE}, /* TAG_DT */
69 {"dd", HTML_CLRLINE}, /* TAG_DD */
70 {"blockquote", HTML_CLRLINE}, /* TAG_BLOCKQUOTE */
71 {"p", HTML_CLRLINE | HTML_NOSTACK | HTML_AUTOCLOSE}, /* TAG_P */
72 {"pre", HTML_CLRLINE }, /* TAG_PRE */
73 {"b", 0 }, /* TAG_B */
74 {"i", 0 }, /* TAG_I */
75 {"code", 0 }, /* TAG_CODE */
76 {"small", 0 }, /* TAG_SMALL */
77 };
78
79 static const char *const htmlattrs[ATTR_MAX] = {
80 "http-equiv", /* ATTR_HTTPEQUIV */
81 "content", /* ATTR_CONTENT */
82 "name", /* ATTR_NAME */
83 "rel", /* ATTR_REL */
84 "href", /* ATTR_HREF */
85 "type", /* ATTR_TYPE */
86 "media", /* ATTR_MEDIA */
87 "class", /* ATTR_CLASS */
88 "style", /* ATTR_STYLE */
89 "width", /* ATTR_WIDTH */
90 "id", /* ATTR_ID */
91 "summary", /* ATTR_SUMMARY */
92 "align", /* ATTR_ALIGN */
93 "colspan", /* ATTR_COLSPAN */
94 "charset", /* ATTR_CHARSET */
95 };
96
97 static const char *const roffscales[SCALE_MAX] = {
98 "cm", /* SCALE_CM */
99 "in", /* SCALE_IN */
100 "pc", /* SCALE_PC */
101 "pt", /* SCALE_PT */
102 "em", /* SCALE_EM */
103 "em", /* SCALE_MM */
104 "ex", /* SCALE_EN */
105 "ex", /* SCALE_BU */
106 "em", /* SCALE_VS */
107 "ex", /* SCALE_FS */
108 };
109
/* File-local helpers, declared here so definition order is free. */
static	void	*ml_alloc(char *outopts, enum htmltype type);
static	void	 bufncat(struct html *h, const char *p, size_t sz);
static	void	 print_attr(struct html *h, const char *key,
			const char *val);
static	void	 print_ctag(struct html *h, enum htmltag tag);
static	int	 print_encode(struct html *h, const char *p,
			int norecurse);
static	int	 print_escape(char c);
static	void	 print_metaf(struct html *h, enum mandoc_esc deco);
117
118
119 static void *
120 ml_alloc(char *outopts, enum htmltype type)
121 {
122 struct html *h;
123 const char *toks[5];
124 char *v;
125
126 toks[0] = "style";
127 toks[1] = "man";
128 toks[2] = "includes";
129 toks[3] = "fragment";
130 toks[4] = NULL;
131
132 h = mandoc_calloc(1, sizeof(struct html));
133
134 h->type = type;
135 h->tags.head = NULL;
136 h->symtab = mchars_alloc();
137
138 while (outopts && *outopts)
139 switch (getsubopt(&outopts, UNCONST(toks), &v)) {
140 case 0:
141 h->style = v;
142 break;
143 case 1:
144 h->base_man = v;
145 break;
146 case 2:
147 h->base_includes = v;
148 break;
149 case 3:
150 h->oflags |= HTML_FRAGMENT;
151 break;
152 default:
153 break;
154 }
155
156 return(h);
157 }
158
159 void *
160 html_alloc(char *outopts)
161 {
162
163 return(ml_alloc(outopts, HTML_HTML_4_01_STRICT));
164 }
165
166 void *
167 xhtml_alloc(char *outopts)
168 {
169
170 return(ml_alloc(outopts, HTML_XHTML_1_0_STRICT));
171 }
172
173 void
174 html_free(void *p)
175 {
176 struct tag *tag;
177 struct html *h;
178
179 h = (struct html *)p;
180
181 while ((tag = h->tags.head) != NULL) {
182 h->tags.head = tag->next;
183 free(tag);
184 }
185
186 if (h->symtab)
187 mchars_free(h->symtab);
188
189 free(h);
190 }
191
192 void
193 print_gen_head(struct html *h)
194 {
195 struct htmlpair tag[4];
196
197 tag[0].key = ATTR_CHARSET;
198 tag[0].val = "utf-8";
199 print_otag(h, TAG_META, 1, tag);
200
201 if (h->style) {
202 tag[0].key = ATTR_REL;
203 tag[0].val = "stylesheet";
204 tag[1].key = ATTR_HREF;
205 tag[1].val = h->style;
206 tag[2].key = ATTR_TYPE;
207 tag[2].val = "text/css";
208 tag[3].key = ATTR_MEDIA;
209 tag[3].val = "all";
210 print_otag(h, TAG_LINK, 4, tag);
211 }
212 }
213
214 static void
215 print_metaf(struct html *h, enum mandoc_esc deco)
216 {
217 enum htmlfont font;
218
219 switch (deco) {
220 case ESCAPE_FONTPREV:
221 font = h->metal;
222 break;
223 case ESCAPE_FONTITALIC:
224 font = HTMLFONT_ITALIC;
225 break;
226 case ESCAPE_FONTBOLD:
227 font = HTMLFONT_BOLD;
228 break;
229 case ESCAPE_FONTBI:
230 font = HTMLFONT_BI;
231 break;
232 case ESCAPE_FONT:
233 /* FALLTHROUGH */
234 case ESCAPE_FONTROMAN:
235 font = HTMLFONT_NONE;
236 break;
237 default:
238 abort();
239 /* NOTREACHED */
240 }
241
242 if (h->metaf) {
243 print_tagq(h, h->metaf);
244 h->metaf = NULL;
245 }
246
247 h->metal = h->metac;
248 h->metac = font;
249
250 switch (font) {
251 case HTMLFONT_ITALIC:
252 h->metaf = print_otag(h, TAG_I, 0, NULL);
253 break;
254 case HTMLFONT_BOLD:
255 h->metaf = print_otag(h, TAG_B, 0, NULL);
256 break;
257 case HTMLFONT_BI:
258 h->metaf = print_otag(h, TAG_B, 0, NULL);
259 print_otag(h, TAG_I, 0, NULL);
260 break;
261 default:
262 break;
263 }
264 }
265
266 int
267 html_strlen(const char *cp)
268 {
269 size_t rsz;
270 int skip, sz;
271
272 /*
273 * Account for escaped sequences within string length
274 * calculations. This follows the logic in term_strlen() as we
275 * must calculate the width of produced strings.
276 * Assume that characters are always width of "1". This is
277 * hacky, but it gets the job done for approximation of widths.
278 */
279
280 sz = 0;
281 skip = 0;
282 while (1) {
283 rsz = strcspn(cp, "\\");
284 if (rsz) {
285 cp += rsz;
286 if (skip) {
287 skip = 0;
288 rsz--;
289 }
290 sz += rsz;
291 }
292 if ('\0' == *cp)
293 break;
294 cp++;
295 switch (mandoc_escape(&cp, NULL, NULL)) {
296 case ESCAPE_ERROR:
297 return(sz);
298 case ESCAPE_UNICODE:
299 /* FALLTHROUGH */
300 case ESCAPE_NUMBERED:
301 /* FALLTHROUGH */
302 case ESCAPE_SPECIAL:
303 if (skip)
304 skip = 0;
305 else
306 sz++;
307 break;
308 case ESCAPE_SKIPCHAR:
309 skip = 1;
310 break;
311 default:
312 break;
313 }
314 }
315 return(sz);
316 }
317
318 static int
319 print_escape(char c)
320 {
321
322 switch (c) {
323 case '<':
324 printf("&lt;");
325 break;
326 case '>':
327 printf("&gt;");
328 break;
329 case '&':
330 printf("&amp;");
331 break;
332 case '"':
333 printf("&quot;");
334 break;
335 case ASCII_NBRSP:
336 putchar('-');
337 break;
338 case ASCII_HYPH:
339 putchar('-');
340 /* FALLTHROUGH */
341 case ASCII_BREAK:
342 break;
343 default:
344 return(0);
345 }
346 return(1);
347 }
348
349 static int
350 print_encode(struct html *h, const char *p, int norecurse)
351 {
352 size_t sz;
353 int c, len, nospace;
354 const char *seq;
355 enum mandoc_esc esc;
356 static const char rejs[9] = { '\\', '<', '>', '&', '"',
357 ASCII_NBRSP, ASCII_HYPH, ASCII_BREAK, '\0' };
358
359 nospace = 0;
360
361 while ('\0' != *p) {
362 if (HTML_SKIPCHAR & h->flags && '\\' != *p) {
363 h->flags &= ~HTML_SKIPCHAR;
364 p++;
365 continue;
366 }
367
368 sz = strcspn(p, rejs);
369
370 fwrite(p, 1, sz, stdout);
371 p += (int)sz;
372
373 if ('\0' == *p)
374 break;
375
376 if (print_escape(*p++))
377 continue;
378
379 esc = mandoc_escape(&p, &seq, &len);
380 if (ESCAPE_ERROR == esc)
381 break;
382
383 switch (esc) {
384 case ESCAPE_FONT:
385 /* FALLTHROUGH */
386 case ESCAPE_FONTPREV:
387 /* FALLTHROUGH */
388 case ESCAPE_FONTBOLD:
389 /* FALLTHROUGH */
390 case ESCAPE_FONTITALIC:
391 /* FALLTHROUGH */
392 case ESCAPE_FONTBI:
393 /* FALLTHROUGH */
394 case ESCAPE_FONTROMAN:
395 if (0 == norecurse)
396 print_metaf(h, esc);
397 continue;
398 case ESCAPE_SKIPCHAR:
399 h->flags |= HTML_SKIPCHAR;
400 continue;
401 default:
402 break;
403 }
404
405 if (h->flags & HTML_SKIPCHAR) {
406 h->flags &= ~HTML_SKIPCHAR;
407 continue;
408 }
409
410 switch (esc) {
411 case ESCAPE_UNICODE:
412 /* Skip past "u" header. */
413 c = mchars_num2uc(seq + 1, len - 1);
414 if ('\0' != c)
415 printf("&#x%x;", c);
416 break;
417 case ESCAPE_NUMBERED:
418 c = mchars_num2char(seq, len);
419 if ( ! ('\0' == c || print_escape(c)))
420 putchar(c);
421 break;
422 case ESCAPE_SPECIAL:
423 c = mchars_spec2cp(h->symtab, seq, len);
424 if (c > 0)
425 printf("&#%d;", c);
426 else if (-1 == c && 1 == len &&
427 !print_escape(*seq))
428 putchar((int)*seq);
429 break;
430 case ESCAPE_NOSPACE:
431 if ('\0' == *p)
432 nospace = 1;
433 break;
434 default:
435 break;
436 }
437 }
438
439 return(nospace);
440 }
441
/*
 * Print one attribute as  key="value"  preceded by a blank.  The
 * value goes through print_encode() with font escapes disabled.
 */
static void
print_attr(struct html *h, const char *key, const char *val)
{

	printf(" %s=\"", key);
	(void)print_encode(h, val, 1);
	putchar('"');
}
449
450 struct tag *
451 print_otag(struct html *h, enum htmltag tag,
452 int sz, const struct htmlpair *p)
453 {
454 int i;
455 struct tag *t;
456
457 /* Push this tags onto the stack of open scopes. */
458
459 if ( ! (HTML_NOSTACK & htmltags[tag].flags)) {
460 t = mandoc_malloc(sizeof(struct tag));
461 t->tag = tag;
462 t->next = h->tags.head;
463 h->tags.head = t;
464 } else
465 t = NULL;
466
467 if ( ! (HTML_NOSPACE & h->flags))
468 if ( ! (HTML_CLRLINE & htmltags[tag].flags)) {
469 /* Manage keeps! */
470 if ( ! (HTML_KEEP & h->flags)) {
471 if (HTML_PREKEEP & h->flags)
472 h->flags |= HTML_KEEP;
473 putchar(' ');
474 } else
475 printf("&#160;");
476 }
477
478 if ( ! (h->flags & HTML_NONOSPACE))
479 h->flags &= ~HTML_NOSPACE;
480 else
481 h->flags |= HTML_NOSPACE;
482
483 /* Print out the tag name and attributes. */
484
485 printf("<%s", htmltags[tag].name);
486 for (i = 0; i < sz; i++)
487 print_attr(h, htmlattrs[p[i].key], p[i].val);
488
489 /* Accommodate for XML "well-formed" singleton escaping. */
490
491 if (HTML_AUTOCLOSE & htmltags[tag].flags)
492 switch (h->type) {
493 case HTML_XHTML_1_0_STRICT:
494 putchar('/');
495 break;
496 default:
497 break;
498 }
499
500 putchar('>');
501
502 h->flags |= HTML_NOSPACE;
503
504 if ((HTML_AUTOCLOSE | HTML_CLRLINE) & htmltags[tag].flags)
505 putchar('\n');
506
507 return(t);
508 }
509
510 static void
511 print_ctag(struct html *h, enum htmltag tag)
512 {
513
514 printf("</%s>", htmltags[tag].name);
515 if (HTML_CLRLINE & htmltags[tag].flags) {
516 h->flags |= HTML_NOSPACE;
517 putchar('\n');
518 }
519 }
520
/*
 * Print the document type declaration on a line of its own.
 * The formatter state is not consulted.
 */
void
print_gen_decls(struct html *h)
{

	(void)h;
	puts("<!DOCTYPE html>");
}
527
528 void
529 print_text(struct html *h, const char *word)
530 {
531
532 if ( ! (HTML_NOSPACE & h->flags)) {
533 /* Manage keeps! */
534 if ( ! (HTML_KEEP & h->flags)) {
535 if (HTML_PREKEEP & h->flags)
536 h->flags |= HTML_KEEP;
537 putchar(' ');
538 } else
539 printf("&#160;");
540 }
541
542 assert(NULL == h->metaf);
543 switch (h->metac) {
544 case HTMLFONT_ITALIC:
545 h->metaf = print_otag(h, TAG_I, 0, NULL);
546 break;
547 case HTMLFONT_BOLD:
548 h->metaf = print_otag(h, TAG_B, 0, NULL);
549 break;
550 case HTMLFONT_BI:
551 h->metaf = print_otag(h, TAG_B, 0, NULL);
552 print_otag(h, TAG_I, 0, NULL);
553 break;
554 default:
555 break;
556 }
557
558 assert(word);
559 if ( ! print_encode(h, word, 0)) {
560 if ( ! (h->flags & HTML_NONOSPACE))
561 h->flags &= ~HTML_NOSPACE;
562 } else
563 h->flags |= HTML_NOSPACE;
564
565 if (h->metaf) {
566 print_tagq(h, h->metaf);
567 h->metaf = NULL;
568 }
569
570 h->flags &= ~HTML_IGNDELIM;
571 }
572
573 void
574 print_tagq(struct html *h, const struct tag *until)
575 {
576 struct tag *tag;
577
578 while ((tag = h->tags.head) != NULL) {
579 /*
580 * Remember to close out and nullify the current
581 * meta-font and table, if applicable.
582 */
583 if (tag == h->metaf)
584 h->metaf = NULL;
585 if (tag == h->tblt)
586 h->tblt = NULL;
587 print_ctag(h, tag->tag);
588 h->tags.head = tag->next;
589 free(tag);
590 if (until && tag == until)
591 return;
592 }
593 }
594
595 void
596 print_stagq(struct html *h, const struct tag *suntil)
597 {
598 struct tag *tag;
599
600 while ((tag = h->tags.head) != NULL) {
601 if (suntil && tag == suntil)
602 return;
603 /*
604 * Remember to close out and nullify the current
605 * meta-font and table, if applicable.
606 */
607 if (tag == h->metaf)
608 h->metaf = NULL;
609 if (tag == h->tblt)
610 h->tblt = NULL;
611 print_ctag(h, tag->tag);
612 h->tags.head = tag->next;
613 free(tag);
614 }
615 }
616
617 void
618 bufinit(struct html *h)
619 {
620
621 h->buf[0] = '\0';
622 h->buflen = 0;
623 }
624
/*
 * Append one style declaration of the form  key:val;  to the
 * scratch buffer.
 */
void
bufcat_style(struct html *h, const char *key, const char *val)
{
	const char	*piece[4];
	int		 i;

	piece[0] = key;
	piece[1] = ":";
	piece[2] = val;
	piece[3] = ";";

	for (i = 0; i < 4; i++)
		bufcat(h, piece[i]);
}
634
635 void
636 bufcat(struct html *h, const char *p)
637 {
638
639 /*
640 * XXX This is broken and not easy to fix.
641 * When using the -Oincludes option, buffmt_includes()
642 * may pass in strings overrunning BUFSIZ, causing a crash.
643 */
644
645 h->buflen = strlcat(h->buf, p, BUFSIZ);
646 assert(h->buflen < BUFSIZ);
647 }
648
649 void
650 bufcat_fmt(struct html *h, const char *fmt, ...)
651 {
652 va_list ap;
653
654 va_start(ap, fmt);
655 (void)vsnprintf(h->buf + (int)h->buflen,
656 BUFSIZ - h->buflen - 1, fmt, ap);
657 va_end(ap);
658 h->buflen = strlen(h->buf);
659 }
660
661 static void
662 bufncat(struct html *h, const char *p, size_t sz)
663 {
664
665 assert(h->buflen + sz + 1 < BUFSIZ);
666 strncat(h->buf, p, sz);
667 h->buflen += sz;
668 }
669
670 void
671 buffmt_includes(struct html *h, const char *name)
672 {
673 const char *p, *pp;
674
675 pp = h->base_includes;
676
677 bufinit(h);
678 while (NULL != (p = strchr(pp, '%'))) {
679 bufncat(h, pp, (size_t)(p - pp));
680 switch (*(p + 1)) {
681 case'I':
682 bufcat(h, name);
683 break;
684 default:
685 bufncat(h, p, 2);
686 break;
687 }
688 pp = p + 2;
689 }
690 if (pp)
691 bufcat(h, pp);
692 }
693
694 void
695 buffmt_man(struct html *h, const char *name, const char *sec)
696 {
697 const char *p, *pp;
698
699 pp = h->base_man;
700
701 bufinit(h);
702 while (NULL != (p = strchr(pp, '%'))) {
703 bufncat(h, pp, (size_t)(p - pp));
704 switch (*(p + 1)) {
705 case 'S':
706 bufcat(h, sec ? sec : "1");
707 break;
708 case 'N':
709 bufcat_fmt(h, "%s", name);
710 break;
711 default:
712 bufncat(h, p, 2);
713 break;
714 }
715 pp = p + 2;
716 }
717 if (pp)
718 bufcat(h, pp);
719 }
720
721 void
722 bufcat_su(struct html *h, const char *p, const struct roffsu *su)
723 {
724 double v;
725
726 v = su->scale;
727 if (SCALE_MM == su->unit && 0.0 == (v /= 100.0))
728 v = 1.0;
729 else if (SCALE_BU == su->unit)
730 v /= 24.0;
731
732 bufcat_fmt(h, "%s: %.2f%s;", p, v, roffscales[su->unit]);
733 }
734
/*
 * Append src to the scratch buffer as an identifier, writing every
 * byte as two lower-case hexadecimal digits.
 *
 * Each byte is converted to unsigned char first: where plain char is
 * signed, a byte with the high bit set would otherwise be promoted
 * to a negative int and print as eight digits instead of two.
 */
void
bufcat_id(struct html *h, const char *src)
{

	/* Cf. <http://www.w3.org/TR/html4/types.html#h-6.2>. */

	while ('\0' != *src)
		bufcat_fmt(h, "%.2x",
		    (unsigned int)(unsigned char)*src++);
}