html.c

   1 /* $Id: html.c,v 1.271 2020/10/16 17:22:43 schwarze Exp $ */
   2 /*
   3  * Copyright (c) 2011-2015, 2017-2020 Ingo Schwarze <schwarze@openbsd.org>
   4  * Copyright (c) 2008-2011, 2014 Kristaps Dzonsons <kristaps@bsd.lv>
   5  *
   6  * Permission to use, copy, modify, and distribute this software for any
   7  * purpose with or without fee is hereby granted, provided that the above
   8  * copyright notice and this permission notice appear in all copies.
   9  *
  10  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
  11  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
  12  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
  13  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
  14  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
  15  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
  16  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  17  *
  18  * Common functions for mandoc(1) HTML formatters.
  19  * For use by individual formatters and by the main program.
  20  */
  21 #include "config.h"
  22
  23 #include <sys/types.h>
  24 #include <sys/stat.h>
  25
  26 #include <assert.h>
  27 #include <ctype.h>
  28 #include <stdarg.h>
  29 #include <stddef.h>
  30 #include <stdio.h>
  31 #include <stdint.h>
  32 #include <stdlib.h>
  33 #include <string.h>
  34 #include <unistd.h>
  35
  36 #include "mandoc_aux.h"
  37 #include "mandoc_ohash.h"
  38 #include "mandoc.h"
  39 #include "roff.h"
  40 #include "out.h"
  41 #include "html.h"
  42 #include "manconf.h"
  43 #include "main.h"
  44
  45 struct  htmldata {
  46         const char       *name;
  47         int               flags;
  48 #define HTML_INPHRASE    (1 << 0)  /* Can appear in phrasing context. */
  49 #define HTML_TOPHRASE    (1 << 1)  /* Establishes phrasing context. */
  50 #define HTML_NOSTACK     (1 << 2)  /* Does not have an end tag. */
  51 #define HTML_NLBEFORE    (1 << 3)  /* Output line break before opening. */
  52 #define HTML_NLBEGIN     (1 << 4)  /* Output line break after opening. */
  53 #define HTML_NLEND       (1 << 5)  /* Output line break before closing. */
  54 #define HTML_NLAFTER     (1 << 6)  /* Output line break after closing. */
  55 #define HTML_NLAROUND    (HTML_NLBEFORE | HTML_NLAFTER)
  56 #define HTML_NLINSIDE    (HTML_NLBEGIN | HTML_NLEND)
  57 #define HTML_NLALL       (HTML_NLAROUND | HTML_NLINSIDE)
  58 #define HTML_INDENT      (1 << 7)  /* Indent content by two spaces. */
  59 #define HTML_NOINDENT    (1 << 8)  /* Exception: never indent content. */
  60 };
  61
  62 static  const struct htmldata htmltags[TAG_MAX] = {
  63         {"html",        HTML_NLALL},
  64         {"head",        HTML_NLALL | HTML_INDENT},
  65         {"meta",        HTML_NOSTACK | HTML_NLALL},
  66         {"link",        HTML_NOSTACK | HTML_NLALL},
  67         {"style",       HTML_NLALL | HTML_INDENT},
  68         {"title",       HTML_NLAROUND},
  69         {"body",        HTML_NLALL},
  70         {"div",         HTML_NLAROUND},
  71         {"section",     HTML_NLALL},
  72         {"table",       HTML_NLALL | HTML_INDENT},
  73         {"tr",          HTML_NLALL | HTML_INDENT},
  74         {"td",          HTML_NLAROUND},
  75         {"li",          HTML_NLAROUND | HTML_INDENT},
  76         {"ul",          HTML_NLALL | HTML_INDENT},
  77         {"ol",          HTML_NLALL | HTML_INDENT},
  78         {"dl",          HTML_NLALL | HTML_INDENT},
  79         {"dt",          HTML_NLAROUND},
  80         {"dd",          HTML_NLAROUND | HTML_INDENT},
  81         {"h1",          HTML_TOPHRASE | HTML_NLAROUND},
  82         {"h2",          HTML_TOPHRASE | HTML_NLAROUND},
  83         {"p",           HTML_TOPHRASE | HTML_NLAROUND | HTML_INDENT},
  84         {"pre",         HTML_TOPHRASE | HTML_NLAROUND | HTML_NOINDENT},
  85         {"a",           HTML_INPHRASE | HTML_TOPHRASE},
  86         {"b",           HTML_INPHRASE | HTML_TOPHRASE},
  87         {"cite",        HTML_INPHRASE | HTML_TOPHRASE},
  88         {"code",        HTML_INPHRASE | HTML_TOPHRASE},
  89         {"i",           HTML_INPHRASE | HTML_TOPHRASE},
  90         {"small",       HTML_INPHRASE | HTML_TOPHRASE},
  91         {"span",        HTML_INPHRASE | HTML_TOPHRASE},
  92         {"var",         HTML_INPHRASE | HTML_TOPHRASE},
  93         {"br",          HTML_INPHRASE | HTML_NOSTACK | HTML_NLALL},
  94         {"mark",        HTML_INPHRASE },
  95         {"math",        HTML_INPHRASE | HTML_NLALL | HTML_INDENT},
  96         {"mrow",        0},
  97         {"mi",          0},
  98         {"mn",          0},
  99         {"mo",          0},
 100         {"msup",        0},
 101         {"msub",        0},
 102         {"msubsup",     0},
 103         {"mfrac",       0},
 104         {"msqrt",       0},
 105         {"mfenced",     0},
 106         {"mtable",      0},
 107         {"mtr",         0},
 108         {"mtd",         0},
 109         {"munderover",  0},
 110         {"munder",      0},
 111         {"mover",       0},
 112 };
 113
 114 /* Avoid duplicate HTML id= attributes. */
 115
 116 struct  id_entry {
 117         int      ord;   /* Ordinal number of the latest occurrence. */
 118         char     id[];  /* The id= attribute without any ordinal suffix. */
 119 };
 120 static  struct ohash     id_unique;
 121
 122 static  void     html_reset_internal(struct html *);
 123 static  void     print_byte(struct html *, char);
 124 static  void     print_endword(struct html *);
 125 static  void     print_indent(struct html *);
 126 static  void     print_word(struct html *, const char *);
 127
 128 static  void     print_ctag(struct html *, struct tag *);
 129 static  int      print_escape(struct html *, char);
 130 static  int      print_encode(struct html *, const char *, const char *, int);
 131 static  void     print_href(struct html *, const char *, const char *, int);
 132 static  void     print_metaf(struct html *);
 133
 134
 135 void *
 136 html_alloc(const struct manoutput *outopts)
 137 {
 138         struct html     *h;
 139
 140         h = mandoc_calloc(1, sizeof(struct html));
 141
 142         h->tag = NULL;
 143         h->style = outopts->style;
 144         if ((h->base_man1 = outopts->man) == NULL)
 145                 h->base_man2 = NULL;
 146         else if ((h->base_man2 = strchr(h->base_man1, ';')) != NULL)
 147                 *h->base_man2++ = '\0';
 148         h->base_includes = outopts->includes;
 149         if (outopts->fragment)
 150                 h->oflags |= HTML_FRAGMENT;
 151         if (outopts->toc)
 152                 h->oflags |= HTML_TOC;
 153
 154         mandoc_ohash_init(&id_unique, 4, offsetof(struct id_entry, id));
 155
 156         return h;
 157 }
 158
 159 static void
 160 html_reset_internal(struct html *h)
 161 {
 162         struct tag      *tag;
 163         struct id_entry *entry;
 164         unsigned int     slot;
 165
 166         while ((tag = h->tag) != NULL) {
 167                 h->tag = tag->next;
 168                 free(tag);
 169         }
 170         entry = ohash_first(&id_unique, &slot);
 171         while (entry != NULL) {
 172                 free(entry);
 173                 entry = ohash_next(&id_unique, &slot);
 174         }
 175         ohash_delete(&id_unique);
 176 }
 177
 178 void
 179 html_reset(void *p)
 180 {
 181         html_reset_internal(p);
 182         mandoc_ohash_init(&id_unique, 4, offsetof(struct id_entry, id));
 183 }
 184
 185 void
 186 html_free(void *p)
 187 {
 188         html_reset_internal(p);
 189         free(p);
 190 }
 191
 192 void
 193 print_gen_head(struct html *h)
 194 {
 195         struct tag      *t;
 196
 197         print_otag(h, TAG_META, "?", "charset", "utf-8");
 198         if (h->style != NULL) {
 199                 print_otag(h, TAG_LINK, "?h??", "rel", "stylesheet",
 200                     h->style, "type", "text/css", "media", "all");
 201                 return;
 202         }
 203
 204         /*
 205          * Print a minimal embedded style sheet.
 206          */
 207
 208         t = print_otag(h, TAG_STYLE, "");
 209         print_text(h, "table.head, table.foot { width: 100%; }");
 210         print_endline(h);
 211         print_text(h, "td.head-rtitle, td.foot-os { text-align: right; }");
 212         print_endline(h);
 213         print_text(h, "td.head-vol { text-align: center; }");
 214         print_endline(h);
 215         print_text(h, ".Nd, .Bf, .Op { display: inline; }");
 216         print_endline(h);
 217         print_text(h, ".Pa, .Ad { font-style: italic; }");
 218         print_endline(h);
 219         print_text(h, ".Ms { font-weight: bold; }");
 220         print_endline(h);
 221         print_text(h, ".Bl-diag ");
 222         print_byte(h, '>');
 223         print_text(h, " dt { font-weight: bold; }");
 224         print_endline(h);
 225         print_text(h, "code.Nm, .Fl, .Cm, .Ic, code.In, .Fd, .Fn, .Cd "
 226             "{ font-weight: bold; font-family: inherit; }");
 227         print_tagq(h, t);
 228 }
 229
 230 int
 231 html_setfont(struct html *h, enum mandoc_esc font)
 232 {
 233         switch (font) {
 234         case ESCAPE_FONTPREV:
 235                 font = h->metal;
 236                 break;
 237         case ESCAPE_FONTITALIC:
 238         case ESCAPE_FONTBOLD:
 239         case ESCAPE_FONTBI:
 240         case ESCAPE_FONTCW:
 241         case ESCAPE_FONTROMAN:
 242                 break;
 243         case ESCAPE_FONT:
 244                 font = ESCAPE_FONTROMAN;
 245                 break;
 246         default:
 247                 return 0;
 248         }
 249         h->metal = h->metac;
 250         h->metac = font;
 251         return 1;
 252 }
 253
 254 static void
 255 print_metaf(struct html *h)
 256 {
 257         if (h->metaf) {
 258                 print_tagq(h, h->metaf);
 259                 h->metaf = NULL;
 260         }
 261         switch (h->metac) {
 262         case ESCAPE_FONTITALIC:
 263                 h->metaf = print_otag(h, TAG_I, "");
 264                 break;
 265         case ESCAPE_FONTBOLD:
 266                 h->metaf = print_otag(h, TAG_B, "");
 267                 break;
 268         case ESCAPE_FONTBI:
 269                 h->metaf = print_otag(h, TAG_B, "");
 270                 print_otag(h, TAG_I, "");
 271                 break;
 272         case ESCAPE_FONTCW:
 273                 h->metaf = print_otag(h, TAG_SPAN, "c", "Li");
 274                 break;
 275         default:
 276                 break;
 277         }
 278 }
 279
 280 void
 281 html_close_paragraph(struct html *h)
 282 {
 283         struct tag      *this, *next;
 284         int              flags;
 285
 286         this = h->tag;
 287         for (;;) {
 288                 next = this->next;
 289                 flags = htmltags[this->tag].flags;
 290                 if (flags & (HTML_INPHRASE | HTML_TOPHRASE))
 291                         print_ctag(h, this);
 292                 if ((flags & HTML_INPHRASE) == 0)
 293                         break;
 294                 this = next;
 295         }
 296 }
 297
 298 /*
 299  * ROFF_nf switches to no-fill mode, ROFF_fi to fill mode.
 300  * TOKEN_NONE does not switch.  The old mode is returned.
 301  */
 302 enum roff_tok
 303 html_fillmode(struct html *h, enum roff_tok want)
 304 {
 305         struct tag      *t;
 306         enum roff_tok    had;
 307
 308         for (t = h->tag; t != NULL; t = t->next)
 309                 if (t->tag == TAG_PRE)
 310                         break;
 311
 312         had = t == NULL ? ROFF_fi : ROFF_nf;
 313
 314         if (want != had) {
 315                 switch (want) {
 316                 case ROFF_fi:
 317                         print_tagq(h, t);
 318                         break;
 319                 case ROFF_nf:
 320                         html_close_paragraph(h);
 321                         print_otag(h, TAG_PRE, "");
 322                         break;
 323                 case TOKEN_NONE:
 324                         break;
 325                 default:
 326                         abort();
 327                 }
 328         }
 329         return had;
 330 }
 331
 332 /*
 333  * Allocate a string to be used for the "id=" attribute of an HTML
 334  * element and/or as a segment identifier for a URI in an <a> element.
 335  * The function may fail and return NULL if the node lacks text data
 336  * to create the attribute from.
 337  * The caller is responsible for free(3)ing the returned string.
 338  *
 339  * If the "unique" argument is non-zero, the "id_unique" ohash table
 340  * is used for de-duplication.  If the "unique" argument is 1,
 341  * it is the first time the function is called for this tag and
 342  * location, so if an ordinal suffix is needed, it is incremented.
 343  * If the "unique" argument is 2, it is the second time the function
 344  * is called for this tag and location, so the ordinal suffix
 345  * remains unchanged.
 346  */
 347 char *
 348 html_make_id(const struct roff_node *n, int unique)
 349 {
 350         const struct roff_node  *nch;
 351         struct id_entry         *entry;
 352         char                    *buf, *cp;
 353         size_t                   len;
 354         unsigned int             slot;
 355
 356         if (n->tag != NULL)
 357                 buf = mandoc_strdup(n->tag);
 358         else {
 359                 switch (n->tok) {
 360                 case MDOC_Sh:
 361                 case MDOC_Ss:
 362                 case MDOC_Sx:
 363                 case MAN_SH:
 364                 case MAN_SS:
 365                         for (nch = n->child; nch != NULL; nch = nch->next)
 366                                 if (nch->type != ROFFT_TEXT)
 367                                         return NULL;
 368                         buf = NULL;
 369                         deroff(&buf, n);
 370                         if (buf == NULL)
 371                                 return NULL;
 372                         break;
 373                 default:
 374                         if (n->child == NULL || n->child->type != ROFFT_TEXT)
 375                                 return NULL;
 376                         buf = mandoc_strdup(n->child->string);
 377                         break;
 378                 }
 379         }
 380
 381         /*
 382          * In ID attributes, only use ASCII characters that are
 383          * permitted in URL-fragment strings according to the
 384          * explicit list at:
 385          * https://url.spec.whatwg.org/#url-fragment-string
 386          * In addition, reserve '~' for ordinal suffixes.
 387          */
 388
 389         for (cp = buf; *cp != '\0'; cp++)
 390                 if (isalnum((unsigned char)*cp) == 0 &&
 391                     strchr("!$&'()*+,-./:;=?@_", *cp) == NULL)
 392                         *cp = '_';
 393
 394         if (unique == 0)
 395                 return buf;
 396
 397         /* Avoid duplicate HTML id= attributes. */
 398
 399         slot = ohash_qlookup(&id_unique, buf);
 400         if ((entry = ohash_find(&id_unique, slot)) == NULL) {
 401                 len = strlen(buf) + 1;
 402                 entry = mandoc_malloc(sizeof(*entry) + len);
 403                 entry->ord = 1;
 404                 memcpy(entry->id, buf, len);
 405                 ohash_insert(&id_unique, slot, entry);
 406         } else if (unique == 1)
 407                 entry->ord++;
 408
 409         if (entry->ord > 1) {
 410                 cp = buf;
 411                 mandoc_asprintf(&buf, "%s~%d", cp, entry->ord);
 412                 free(cp);
 413         }
 414         return buf;
 415 }
 416
 417 static int
 418 print_escape(struct html *h, char c)
 419 {
 420
 421         switch (c) {
 422         case '<':
 423                 print_word(h, "&lt;");
 424                 break;
 425         case '>':
 426                 print_word(h, "&gt;");
 427                 break;
 428         case '&':
 429                 print_word(h, "&amp;");
 430                 break;
 431         case '"':
 432                 print_word(h, "&quot;");
 433                 break;
 434         case ASCII_NBRSP:
 435                 print_word(h, "&nbsp;");
 436                 break;
 437         case ASCII_HYPH:
 438                 print_byte(h, '-');
 439                 break;
 440         case ASCII_BREAK:
 441                 break;
 442         default:
 443                 return 0;
 444         }
 445         return 1;
 446 }
 447
 448 static int
 449 print_encode(struct html *h, const char *p, const char *pend, int norecurse)
 450 {
 451         char             numbuf[16];
 452         const char      *seq;
 453         size_t           sz;
 454         int              c, len, breakline, nospace;
 455         enum mandoc_esc  esc;
 456         static const char rejs[10] = { ' ', '\\', '<', '>', '&', '"',
 457                 ASCII_NBRSP, ASCII_HYPH, ASCII_BREAK, '\0' };
 458
 459         if (pend == NULL)
 460                 pend = strchr(p, '\0');
 461
 462         breakline = 0;
 463         nospace = 0;
 464
 465         while (p < pend) {
 466                 if (HTML_SKIPCHAR & h->flags && '\\' != *p) {
 467                         h->flags &= ~HTML_SKIPCHAR;
 468                         p++;
 469                         continue;
 470                 }
 471
 472                 for (sz = strcspn(p, rejs); sz-- && p < pend; p++)
 473                         print_byte(h, *p);
 474
 475                 if (breakline &&
 476                     (p >= pend || *p == ' ' || *p == ASCII_NBRSP)) {
 477                         print_otag(h, TAG_BR, "");
 478                         breakline = 0;
 479                         while (p < pend && (*p == ' ' || *p == ASCII_NBRSP))
 480                                 p++;
 481                         continue;
 482                 }
 483
 484                 if (p >= pend)
 485                         break;
 486
 487                 if (*p == ' ') {
 488                         print_endword(h);
 489                         p++;
 490                         continue;
 491                 }
 492
 493                 if (print_escape(h, *p++))
 494                         continue;
 495
 496                 esc = mandoc_escape(&p, &seq, &len);
 497                 switch (esc) {
 498                 case ESCAPE_FONT:
 499                 case ESCAPE_FONTPREV:
 500                 case ESCAPE_FONTBOLD:
 501                 case ESCAPE_FONTITALIC:
 502                 case ESCAPE_FONTBI:
 503                 case ESCAPE_FONTCW:
 504                 case ESCAPE_FONTROMAN:
 505                         if (0 == norecurse) {
 506                                 h->flags |= HTML_NOSPACE;
 507                                 if (html_setfont(h, esc))
 508                                         print_metaf(h);
 509                                 h->flags &= ~HTML_NOSPACE;
 510                         }
 511                         continue;
 512                 case ESCAPE_SKIPCHAR:
 513                         h->flags |= HTML_SKIPCHAR;
 514                         continue;
 515                 case ESCAPE_ERROR:
 516                         continue;
 517                 default:
 518                         break;
 519                 }
 520
 521                 if (h->flags & HTML_SKIPCHAR) {
 522                         h->flags &= ~HTML_SKIPCHAR;
 523                         continue;
 524                 }
 525
 526                 switch (esc) {
 527                 case ESCAPE_UNICODE:
 528                         /* Skip past "u" header. */
 529                         c = mchars_num2uc(seq + 1, len - 1);
 530                         break;
 531                 case ESCAPE_NUMBERED:
 532                         c = mchars_num2char(seq, len);
 533                         if (c < 0)
 534                                 continue;
 535                         break;
 536                 case ESCAPE_SPECIAL:
 537                         c = mchars_spec2cp(seq, len);
 538                         if (c <= 0)
 539                                 continue;
 540                         break;
 541                 case ESCAPE_UNDEF:
 542                         c = *seq;
 543                         break;
 544                 case ESCAPE_DEVICE:
 545                         print_word(h, "html");
 546                         continue;
 547                 case ESCAPE_BREAK:
 548                         breakline = 1;
 549                         continue;
 550                 case ESCAPE_NOSPACE:
 551                         if ('\0' == *p)
 552                                 nospace = 1;
 553                         continue;
 554                 case ESCAPE_OVERSTRIKE:
 555                         if (len == 0)
 556                                 continue;
 557                         c = seq[len - 1];
 558                         break;
 559                 default:
 560                         continue;
 561                 }
 562                 if ((c < 0x20 && c != 0x09) ||
 563                     (c > 0x7E && c < 0xA0))
 564                         c = 0xFFFD;
 565                 if (c > 0x7E) {
 566                         (void)snprintf(numbuf, sizeof(numbuf), "&#x%.4X;", c);
 567                         print_word(h, numbuf);
 568                 } else if (print_escape(h, c) == 0)
 569                         print_byte(h, c);
 570         }
 571
 572         return nospace;
 573 }
 574
 575 static void
 576 print_href(struct html *h, const char *name, const char *sec, int man)
 577 {
 578         struct stat      sb;
 579         const char      *p, *pp;
 580         char            *filename;
 581
 582         if (man) {
 583                 pp = h->base_man1;
 584                 if (h->base_man2 != NULL) {
 585                         mandoc_asprintf(&filename, "%s.%s", name, sec);
 586                         if (stat(filename, &sb) == -1)
 587                                 pp = h->base_man2;
 588                         free(filename);
 589                 }
 590         } else
 591                 pp = h->base_includes;
 592
 593         while ((p = strchr(pp, '%')) != NULL) {
 594                 print_encode(h, pp, p, 1);
 595                 if (man && p[1] == 'S') {
 596                         if (sec == NULL)
 597                                 print_byte(h, '1');
 598                         else
 599                                 print_encode(h, sec, NULL, 1);
 600                 } else if ((man && p[1] == 'N') ||
 601                     (man == 0 && p[1] == 'I'))
 602                         print_encode(h, name, NULL, 1);
 603                 else
 604                         print_encode(h, p, p + 2, 1);
 605                 pp = p + 2;
 606         }
 607         if (*pp != '\0')
 608                 print_encode(h, pp, NULL, 1);
 609 }
 610
 611 struct tag *
 612 print_otag(struct html *h, enum htmltag tag, const char *fmt, ...)
 613 {
 614         va_list          ap;
 615         struct tag      *t;
 616         const char      *attr;
 617         char            *arg1, *arg2;
 618         int              style_written, tflags;
 619
 620         tflags = htmltags[tag].flags;
 621
 622         /* Flow content is not allowed in phrasing context. */
 623
 624         if ((tflags & HTML_INPHRASE) == 0) {
 625                 for (t = h->tag; t != NULL; t = t->next) {
 626                         if (t->closed)
 627                                 continue;
 628                         assert((htmltags[t->tag].flags & HTML_TOPHRASE) == 0);
 629                         break;
 630                 }
 631
 632         /*
 633          * Always wrap phrasing elements in a paragraph
 634          * unless already contained in some flow container;
 635          * never put them directly into a section.
 636          */
 637
 638         } else if (tflags & HTML_TOPHRASE && h->tag->tag == TAG_SECTION)
 639                 print_otag(h, TAG_P, "c", "Pp");
 640
 641         /* Push this tag onto the stack of open scopes. */
 642
 643         if ((tflags & HTML_NOSTACK) == 0) {
 644                 t = mandoc_malloc(sizeof(struct tag));
 645                 t->tag = tag;
 646                 t->next = h->tag;
 647                 t->refcnt = 0;
 648                 t->closed = 0;
 649                 h->tag = t;
 650         } else
 651                 t = NULL;
 652
 653         if (tflags & HTML_NLBEFORE)
 654                 print_endline(h);
 655         if (h->col == 0)
 656                 print_indent(h);
 657         else if ((h->flags & HTML_NOSPACE) == 0) {
 658                 if (h->flags & HTML_KEEP)
 659                         print_word(h, "&#x00A0;");
 660                 else {
 661                         if (h->flags & HTML_PREKEEP)
 662                                 h->flags |= HTML_KEEP;
 663                         print_endword(h);
 664                 }
 665         }
 666
 667         if ( ! (h->flags & HTML_NONOSPACE))
 668                 h->flags &= ~HTML_NOSPACE;
 669         else
 670                 h->flags |= HTML_NOSPACE;
 671
 672         /* Print out the tag name and attributes. */
 673
 674         print_byte(h, '<');
 675         print_word(h, htmltags[tag].name);
 676
 677         va_start(ap, fmt);
 678
 679         while (*fmt != '\0' && *fmt != 's') {
 680
 681                 /* Parse attributes and arguments. */
 682
 683                 arg1 = va_arg(ap, char *);
 684                 arg2 = NULL;
 685                 switch (*fmt++) {
 686                 case 'c':
 687                         attr = "class";
 688                         break;
 689                 case 'h':
 690                         attr = "href";
 691                         break;
 692                 case 'i':
 693                         attr = "id";
 694                         break;
 695                 case '?':
 696                         attr = arg1;
 697                         arg1 = va_arg(ap, char *);
 698                         break;
 699                 default:
 700                         abort();
 701                 }
 702                 if (*fmt == 'M')
 703                         arg2 = va_arg(ap, char *);
 704                 if (arg1 == NULL)
 705                         continue;
 706
 707                 /* Print the attributes. */
 708
 709                 print_byte(h, ' ');
 710                 print_word(h, attr);
 711                 print_byte(h, '=');
 712                 print_byte(h, '"');
 713                 switch (*fmt) {
 714                 case 'I':
 715                         print_href(h, arg1, NULL, 0);
 716                         fmt++;
 717                         break;
 718                 case 'M':
 719                         print_href(h, arg1, arg2, 1);
 720                         fmt++;
 721                         break;
 722                 case 'R':
 723                         print_byte(h, '#');
 724                         print_encode(h, arg1, NULL, 1);
 725                         fmt++;
 726                         break;
 727                 default:
 728                         print_encode(h, arg1, NULL, 1);
 729                         break;
 730                 }
 731                 print_byte(h, '"');
 732         }
 733
 734         style_written = 0;
 735         while (*fmt++ == 's') {
 736                 arg1 = va_arg(ap, char *);
 737                 arg2 = va_arg(ap, char *);
 738                 if (arg2 == NULL)
 739                         continue;
 740                 print_byte(h, ' ');
 741                 if (style_written == 0) {
 742                         print_word(h, "style=\"");
 743                         style_written = 1;
 744                 }
 745                 print_word(h, arg1);
 746                 print_byte(h, ':');
 747                 print_byte(h, ' ');
 748                 print_word(h, arg2);
 749                 print_byte(h, ';');
 750         }
 751         if (style_written)
 752                 print_byte(h, '"');
 753
 754         va_end(ap);
 755
 756         /* Accommodate for "well-formed" singleton escaping. */
 757
 758         if (htmltags[tag].flags & HTML_NOSTACK)
 759                 print_byte(h, '/');
 760
 761         print_byte(h, '>');
 762
 763         if (tflags & HTML_NLBEGIN)
 764                 print_endline(h);
 765         else
 766                 h->flags |= HTML_NOSPACE;
 767
 768         if (tflags & HTML_INDENT)
 769                 h->indent++;
 770         if (tflags & HTML_NOINDENT)
 771                 h->noindent++;
 772
 773         return t;
 774 }
 775
 776 /*
 777  * Print an element with an optional "id=" attribute.
 778  * If the element has phrasing content and an "id=" attribute,
 779  * also add a permalink: outside if it can be in phrasing context,
 780  * inside otherwise.
 781  */
 782 struct tag *
 783 print_otag_id(struct html *h, enum htmltag elemtype, const char *cattr,
 784     struct roff_node *n)
 785 {
 786         struct roff_node *nch;
 787         struct tag      *ret, *t;
 788         char            *id, *href;
 789
 790         ret = NULL;
 791         id = href = NULL;
 792         if (n->flags & NODE_ID)
 793                 id = html_make_id(n, 1);
 794         if (n->flags & NODE_HREF)
 795                 href = id == NULL ? html_make_id(n, 2) : id;
 796         if (href != NULL && htmltags[elemtype].flags & HTML_INPHRASE)
 797                 ret = print_otag(h, TAG_A, "chR", "permalink", href);
 798         t = print_otag(h, elemtype, "ci", cattr, id);
 799         if (ret == NULL) {
 800                 ret = t;
 801                 if (href != NULL && (nch = n->child) != NULL) {
 802                         /* man(7) is safe, it tags phrasing content only. */
 803                         if (n->tok > MDOC_MAX ||
 804                             htmltags[elemtype].flags & HTML_TOPHRASE)
 805                                 nch = NULL;
 806                         else  /* For mdoc(7), beware of nested blocks. */
 807                                 while (nch != NULL && nch->type == ROFFT_TEXT)
 808                                         nch = nch->next;
 809                         if (nch == NULL)
 810                                 print_otag(h, TAG_A, "chR", "permalink", href);
 811                 }
 812         }
 813         free(id);
 814         if (id == NULL)
 815                 free(href);
 816         return ret;
 817 }
 818
 819 static void
 820 print_ctag(struct html *h, struct tag *tag)
 821 {
 822         int      tflags;
 823
 824         if (tag->closed == 0) {
 825                 tag->closed = 1;
 826                 if (tag == h->metaf)
 827                         h->metaf = NULL;
 828                 if (tag == h->tblt)
 829                         h->tblt = NULL;
 830
 831                 tflags = htmltags[tag->tag].flags;
 832                 if (tflags & HTML_INDENT)
 833                         h->indent--;
 834                 if (tflags & HTML_NOINDENT)
 835                         h->noindent--;
 836                 if (tflags & HTML_NLEND)
 837                         print_endline(h);
 838                 print_indent(h);
 839                 print_byte(h, '<');
 840                 print_byte(h, '/');
 841                 print_word(h, htmltags[tag->tag].name);
 842                 print_byte(h, '>');
 843                 if (tflags & HTML_NLAFTER)
 844                         print_endline(h);
 845         }
 846         if (tag->refcnt == 0) {
 847                 h->tag = tag->next;
 848                 free(tag);
 849         }
 850 }
 851
 852 void
 853 print_gen_decls(struct html *h)
 854 {
 855         print_word(h, "<!DOCTYPE html>");
 856         print_endline(h);
 857 }
 858
 859 void
 860 print_gen_comment(struct html *h, struct roff_node *n)
 861 {
 862         int      wantblank;
 863
 864         print_word(h, "<!-- This is an automatically generated file."
 865             "  Do not edit.");
 866         h->indent = 1;
 867         wantblank = 0;
 868         while (n != NULL && n->type == ROFFT_COMMENT) {
 869                 if (strstr(n->string, "-->") == NULL &&
 870                     (wantblank || *n->string != '\0')) {
 871                         print_endline(h);
 872                         print_indent(h);
 873                         print_word(h, n->string);
 874                         wantblank = *n->string != '\0';
 875                 }
 876                 n = n->next;
 877         }
 878         if (wantblank)
 879                 print_endline(h);
 880         print_word(h, " -->");
 881         print_endline(h);
 882         h->indent = 0;
 883 }
 884
 885 void
 886 print_text(struct html *h, const char *word)
 887 {
 888         print_tagged_text(h, word, NULL);
 889 }
 890
 891 void
 892 print_tagged_text(struct html *h, const char *word, struct roff_node *n)
 893 {
 894         struct tag      *t;
 895         char            *href;
 896
 897         /*
 898          * Always wrap text in a paragraph unless already contained in
 899          * some flow container; never put it directly into a section.
 900          */
 901
 902         if (h->tag->tag == TAG_SECTION)
 903                 print_otag(h, TAG_P, "c", "Pp");
 904
 905         /* Output whitespace before this text? */
 906
 907         if (h->col && (h->flags & HTML_NOSPACE) == 0) {
 908                 if ( ! (HTML_KEEP & h->flags)) {
 909                         if (HTML_PREKEEP & h->flags)
 910                                 h->flags |= HTML_KEEP;
 911                         print_endword(h);
 912                 } else
 913                         print_word(h, "&#x00A0;");
 914         }
 915
 916         /*
 917          * Optionally switch fonts, optionally write a permalink, then
 918          * print the text, optionally surrounded by HTML whitespace.
 919          */
 920
 921         assert(h->metaf == NULL);
 922         print_metaf(h);
 923         print_indent(h);
 924
 925         if (n != NULL && (href = html_make_id(n, 2)) != NULL) {
 926                 t = print_otag(h, TAG_A, "chR", "permalink", href);
 927                 free(href);
 928         } else
 929                 t = NULL;
 930
 931         if ( ! print_encode(h, word, NULL, 0)) {
 932                 if ( ! (h->flags & HTML_NONOSPACE))
 933                         h->flags &= ~HTML_NOSPACE;
 934                 h->flags &= ~HTML_NONEWLINE;
 935         } else
 936                 h->flags |= HTML_NOSPACE | HTML_NONEWLINE;
 937
 938         if (h->metaf != NULL) {
 939                 print_tagq(h, h->metaf);
 940                 h->metaf = NULL;
 941         } else if (t != NULL)
 942                 print_tagq(h, t);
 943
 944         h->flags &= ~HTML_IGNDELIM;
 945 }
 946
 947 void
 948 print_tagq(struct html *h, const struct tag *until)
 949 {
 950         struct tag      *this, *next;
 951
 952         for (this = h->tag; this != NULL; this = next) {
 953                 next = this == until ? NULL : this->next;
 954                 print_ctag(h, this);
 955         }
 956 }
 957
 958 /*
 959  * Close out all open elements up to but excluding suntil.
 960  * Note that a paragraph just inside stays open together with it
 961  * because paragraphs include subsequent phrasing content.
 962  */
 963 void
 964 print_stagq(struct html *h, const struct tag *suntil)
 965 {
 966         struct tag      *this, *next;
 967
 968         for (this = h->tag; this != NULL; this = next) {
 969                 next = this->next;
 970                 if (this == suntil || (next == suntil &&
 971                     (this->tag == TAG_P || this->tag == TAG_PRE)))
 972                         break;
 973                 print_ctag(h, this);
 974         }
 975 }
 976
 977
 978 /***********************************************************************
 979  * Low level output functions.
 980  * They implement line breaking using a short static buffer.
 981  ***********************************************************************/
 982
 983 /*
 984  * Buffer one HTML output byte.
 985  * If the buffer is full, flush and deactivate it and start a new line.
 986  * If the buffer is inactive, print directly.
 987  */
 988 static void
 989 print_byte(struct html *h, char c)
 990 {
 991         if ((h->flags & HTML_BUFFER) == 0) {
 992                 putchar(c);
 993                 h->col++;
 994                 return;
 995         }
 996
 997         if (h->col + h->bufcol < sizeof(h->buf)) {
 998                 h->buf[h->bufcol++] = c;
 999                 return;
1000         }
1001
1002         putchar('\n');
1003         h->col = 0;
1004         print_indent(h);
1005         putchar(' ');
1006         putchar(' ');
1007         fwrite(h->buf, h->bufcol, 1, stdout);
1008         putchar(c);
1009         h->col = (h->indent + 1) * 2 + h->bufcol + 1;
1010         h->bufcol = 0;
1011         h->flags &= ~HTML_BUFFER;
1012 }
1013
1014 /*
1015  * If something was printed on the current output line, end it.
1016  * Not to be called right after print_indent().
1017  */
1018 void
1019 print_endline(struct html *h)
1020 {
1021         if (h->col == 0)
1022                 return;
1023
1024         if (h->bufcol) {
1025                 putchar(' ');
1026                 fwrite(h->buf, h->bufcol, 1, stdout);
1027                 h->bufcol = 0;
1028         }
1029         putchar('\n');
1030         h->col = 0;
1031         h->flags |= HTML_NOSPACE;
1032         h->flags &= ~HTML_BUFFER;
1033 }
1034
1035 /*
1036  * Flush the HTML output buffer.
1037  * If it is inactive, activate it.
1038  */
1039 static void
1040 print_endword(struct html *h)
1041 {
1042         if (h->noindent) {
1043                 print_byte(h, ' ');
1044                 return;
1045         }
1046
1047         if ((h->flags & HTML_BUFFER) == 0) {
1048                 h->col++;
1049                 h->flags |= HTML_BUFFER;
1050         } else if (h->bufcol) {
1051                 putchar(' ');
1052                 fwrite(h->buf, h->bufcol, 1, stdout);
1053                 h->col += h->bufcol + 1;
1054         }
1055         h->bufcol = 0;
1056 }
1057
1058 /*
1059  * If at the beginning of a new output line,
1060  * perform indentation and mark the line as containing output.
1061  * Make sure to really produce some output right afterwards,
1062  * but do not use print_otag() for producing it.
1063  */
1064 static void
1065 print_indent(struct html *h)
1066 {
1067         size_t   i;
1068
1069         if (h->col || h->noindent)
1070                 return;
1071
1072         h->col = h->indent * 2;
1073         for (i = 0; i < h->col; i++)
1074                 putchar(' ');
1075 }
1076
/*
 * Pass a NUL-terminated string to print_byte() one byte at a time,
 * such that it is printed or buffered
 * depending on the current HTML output buffer state.
 */
static void
print_word(struct html *h, const char *cp)
{
	for (; *cp != '\0'; cp++)
		print_byte(h, *cp);
}