mandoc.c

   1 /*      $Id: mandoc.c,v 1.113 2018/12/18 22:00:02 schwarze Exp $ */
   2 /*
   3  * Copyright (c) 2008-2011, 2014 Kristaps Dzonsons <kristaps@bsd.lv>
   4  * Copyright (c) 2011-2015, 2017, 2018 Ingo Schwarze <schwarze@openbsd.org>
   5  *
   6  * Permission to use, copy, modify, and distribute this software for any
   7  * purpose with or without fee is hereby granted, provided that the above
   8  * copyright notice and this permission notice appear in all copies.
   9  *
  10  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
  11  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
  12  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
  13  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
  14  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
  15  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
  16  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  17  */
  18 #include "config.h"
  19
  20 #include <sys/types.h>
  21
  22 #include <assert.h>
  23 #include <ctype.h>
  24 #include <errno.h>
  25 #include <limits.h>
  26 #include <stdlib.h>
  27 #include <stdio.h>
  28 #include <string.h>
  29 #include <time.h>
  30
  31 #include "mandoc_aux.h"
  32 #include "mandoc.h"
  33 #include "roff.h"
  34 #include "libmandoc.h"
  35
  36 static  int      a2time(time_t *, const char *, const char *);
  37 static  char    *time2a(time_t);
  38
  39
  40 enum mandoc_esc
  41 mandoc_font(const char *cp, int sz)
  42 {
  43         switch (sz) {
  44         case 0:
  45                 return ESCAPE_FONTPREV;
  46         case 1:
  47                 switch (cp[0]) {
  48                 case 'B':
  49                 case '3':
  50                         return ESCAPE_FONTBOLD;
  51                 case 'I':
  52                 case '2':
  53                         return ESCAPE_FONTITALIC;
  54                 case 'P':
  55                         return ESCAPE_FONTPREV;
  56                 case 'R':
  57                 case '1':
  58                         return ESCAPE_FONTROMAN;
  59                 case '4':
  60                         return ESCAPE_FONTBI;
  61                 default:
  62                         return ESCAPE_ERROR;
  63                 }
  64         case 2:
  65                 switch (cp[0]) {
  66                 case 'B':
  67                         switch (cp[1]) {
  68                         case 'I':
  69                                 return ESCAPE_FONTBI;
  70                         default:
  71                                 return ESCAPE_ERROR;
  72                         }
  73                 case 'C':
  74                         switch (cp[1]) {
  75                         case 'B':
  76                                 return ESCAPE_FONTBOLD;
  77                         case 'I':
  78                                 return ESCAPE_FONTITALIC;
  79                         case 'R':
  80                         case 'W':
  81                                 return ESCAPE_FONTCW;
  82                         default:
  83                                 return ESCAPE_ERROR;
  84                         }
  85                 default:
  86                         return ESCAPE_ERROR;
  87                 }
  88         default:
  89                 return ESCAPE_ERROR;
  90         }
  91 }
  92
/*
 * Parse one roff escape sequence and classify it.
 *
 * On entry, *end points at the character right after the backslash
 * (the nested-escape loop below steps over the backslash before
 * recursing, and external callers are expected to do the same).
 * While parsing, *end is advanced; on a successful return it points
 * past the parsed sequence.  On ESCAPE_ERROR it is left wherever
 * parsing stopped.
 *
 * If start is not NULL, *start is set to the beginning of the
 * argument substring, and if sz is not NULL, *sz is set to its length
 * in bytes.  Either may be NULL, in which case the value is computed
 * into local storage and discarded.
 *
 * Returns the class of the escape, or ESCAPE_ERROR if the sequence
 * is malformed or truncated.
 */
enum mandoc_esc
mandoc_escape(const char **end, const char **start, int *sz)
{
        const char      *local_start;
        int              local_sz, c, i;
        char             term;
        enum mandoc_esc  gly;

        /*
         * When the caller doesn't provide return storage,
         * use local storage.
         */

        if (NULL == start)
                start = &local_start;
        if (NULL == sz)
                sz = &local_sz;

        /*
         * Treat "\E" just like "\";
         * it only makes a difference in copy mode.
         */

        if (**end == 'E')
                ++*end;

        /*
         * Beyond the backslash, at least one input character
         * is part of the escape sequence.  With one exception
         * (see below), that character won't be returned.
         */

        gly = ESCAPE_ERROR;
        *start = ++*end;
        *sz = 0;
        term = '\0';

        /*
         * Dispatch on the escape name, i.e. the character that was
         * just stepped over.  Each case either returns directly or
         * sets up exactly one of two argument forms for the code
         * after the switch: a fixed length in *sz, or a terminating
         * character in term.
         */

        switch ((*start)[-1]) {
        /*
         * First the glyphs.  There are several different forms of
         * these, but each eventually returns a substring of the glyph
         * name.
         */
        case '(':
                gly = ESCAPE_SPECIAL;
                *sz = 2;
                break;
        case '[':
                /* A blank right after the bracket is rejected. */
                if (**start == ' ') {
                        ++*end;
                        return ESCAPE_ERROR;
                }
                gly = ESCAPE_SPECIAL;
                term = ']';
                break;
        case 'C':
                /* \C requires a quoted name: \C'name'. */
                if ('\'' != **start)
                        return ESCAPE_ERROR;
                *start = ++*end;
                gly = ESCAPE_SPECIAL;
                term = '\'';
                break;

        /*
         * Escapes taking no arguments at all.
         */
        case '!':
        case '?':
                return ESCAPE_UNSUPP;
        case '%':
        case '&':
        case ')':
        case ',':
        case '/':
        case '^':
        case 'a':
        case 'd':
        case 'r':
        case 't':
        case 'u':
        case '{':
        case '|':
        case '}':
                return ESCAPE_IGNORE;
        case 'c':
                return ESCAPE_NOSPACE;
        case 'p':
                return ESCAPE_BREAK;

        /*
         * The \z escape is supposed to output the following
         * character without advancing the cursor position.
         * Since we are mostly dealing with terminal mode,
         * let us just skip the next character.
         */
        case 'z':
                return ESCAPE_SKIPCHAR;

        /*
         * Handle all triggers matching \X(xy, \Xx, and \X[xxxx], where
         * 'X' is the trigger.  These have opaque sub-strings.
         */
        case 'F':
        case 'f':
        case 'g':
        case 'k':
        case 'M':
        case 'm':
        case 'n':
        case 'O':
        case 'V':
        case 'Y':
                /* Only \f is a font escape; the others are ignored. */
                gly = (*start)[-1] == 'f' ? ESCAPE_FONT : ESCAPE_IGNORE;
                switch (**start) {
                case '(':
                        /* The two-character form is invalid for \O. */
                        if ((*start)[-1] == 'O')
                                gly = ESCAPE_ERROR;
                        *start = ++*end;
                        *sz = 2;
                        break;
                case '[':
                        /*
                         * *start still points at the bracket here,
                         * so (*start)[1] is the first argument byte:
                         * \O[5...] is unsupported, any other
                         * bracketed \O is an error.
                         */
                        if ((*start)[-1] == 'O')
                                gly = (*start)[1] == '5' ?
                                    ESCAPE_UNSUPP : ESCAPE_ERROR;
                        *start = ++*end;
                        term = ']';
                        break;
                default:
                        /*
                         * One-character argument.  For \O, 0 is
                         * unsupported, 1 to 4 keep the ignore class
                         * set above, and anything else is an error.
                         */
                        if ((*start)[-1] == 'O') {
                                switch (**start) {
                                case '0':
                                        gly = ESCAPE_UNSUPP;
                                        break;
                                case '1':
                                case '2':
                                case '3':
                                case '4':
                                        break;
                                default:
                                        gly = ESCAPE_ERROR;
                                        break;
                                }
                        }
                        *sz = 1;
                        break;
                }
                break;
        case '*':
                /*
                 * The only string reference accepted here is \*(.T;
                 * anything else terminates the program.
                 * NOTE(review): presumably all other \* references
                 * are expanded before this function sees them --
                 * confirm against the roff string expansion code.
                 */
                if (strncmp(*start, "(.T", 3) != 0)
                        abort();
                gly = ESCAPE_DEVICE;
                *start = ++*end;
                *sz = 2;
                break;

        /*
         * These escapes are of the form \X'Y', where 'X' is the trigger
         * and 'Y' is any string.  These have opaque sub-strings.
         * The \B and \w escapes are handled in roff.c, roff_res().
         */
        case 'A':
        case 'b':
        case 'D':
        case 'R':
        case 'X':
        case 'Z':
                gly = ESCAPE_IGNORE;
                /* FALLTHROUGH */
        case 'o':
                if (**start == '\0')
                        return ESCAPE_ERROR;
                /* Still unset means we entered directly at \o. */
                if (gly == ESCAPE_ERROR)
                        gly = ESCAPE_OVERSTRIKE;
                /* Whatever character follows serves as the delimiter. */
                term = **start;
                *start = ++*end;
                break;

        /*
         * These escapes are of the form \X'N', where 'X' is the trigger
         * and 'N' resolves to a numerical expression.
         */
        case 'h':
        case 'H':
        case 'L':
        case 'l':
        case 'S':
        case 'v':
        case 'x':
                /*
                 * Characters that can occur in a numerical expression
                 * are not accepted as delimiters.  Note that strchr()
                 * also matches the terminating NUL of the list, so
                 * end of input lands in this branch as well; in that
                 * case, *end is not advanced.
                 */
                if (strchr(" %&()*+-./0123456789:<=>", **start)) {
                        if ('\0' != **start)
                                ++*end;
                        return ESCAPE_ERROR;
                }
                switch ((*start)[-1]) {
                case 'h':
                        gly = ESCAPE_HORIZ;
                        break;
                case 'l':
                        gly = ESCAPE_HLINE;
                        break;
                default:
                        gly = ESCAPE_IGNORE;
                        break;
                }
                term = **start;
                *start = ++*end;
                break;

        /*
         * Special handling for the numbered character escape.
         * XXX Do any other escapes need similar handling?
         */
        case 'N':
                if ('\0' == **start)
                        return ESCAPE_ERROR;
                (*end)++;
                /* A digit in place of the opening delimiter: ignore. */
                if (isdigit((unsigned char)**start)) {
                        *sz = 1;
                        return ESCAPE_IGNORE;
                }
                /* Step over the delimiter and collect the digits. */
                (*start)++;
                while (isdigit((unsigned char)**end))
                        (*end)++;
                *sz = *end - *start;
                /* Consume one more byte, normally the closing delimiter. */
                if ('\0' != **end)
                        (*end)++;
                return ESCAPE_NUMBERED;

        /*
         * Sizes get a special category of their own.
         */
        case 's':
                gly = ESCAPE_IGNORE;

                /* A leading +/- (or hyphen) counts as a sign. */
                if ('+' == **end || '-' == **end || ASCII_HYPH == **end)
                        *start = ++*end;

                switch (**end) {
                case '(':
                        *start = ++*end;
                        *sz = 2;
                        break;
                case '[':
                        *start = ++*end;
                        term = ']';
                        break;
                case '\'':
                        *start = ++*end;
                        term = '\'';
                        break;
                case '3':
                case '2':
                case '1':
                        /*
                         * A second digit is only consumed when the
                         * first one directly follows the 's', that
                         * is, when no sign was given.
                         */
                        *sz = (*end)[-1] == 's' &&
                            isdigit((unsigned char)(*end)[1]) ? 2 : 1;
                        break;
                default:
                        *sz = 1;
                        break;
                }

                break;

        /*
         * Several special characters can be encoded as
         * one-byte escape sequences without using \[].
         */
        case ' ':
        case '\'':
        case '-':
        case '.':
        case '0':
        case ':':
        case '_':
        case '`':
        case 'e':
        case '~':
                gly = ESCAPE_SPECIAL;
                /* FALLTHROUGH */
        default:
                if (gly == ESCAPE_ERROR)
                        gly = ESCAPE_UNDEF;
                /*
                 * This is the exception mentioned above: back up by
                 * one so that the escape name character itself is
                 * returned as the one-byte argument.
                 */
                *start = --*end;
                *sz = 1;
                break;
        }

        /*
         * Read up to the terminating character,
         * paying attention to nested escapes.
         */

        if ('\0' != term) {
                while (**end != term) {
                        switch (**end) {
                        case '\0':
                                return ESCAPE_ERROR;
                        case '\\':
                                /*
                                 * Step over the backslash and let a
                                 * recursive call consume the nested
                                 * escape, so a terminator inside it
                                 * does not end the argument early.
                                 */
                                (*end)++;
                                if (ESCAPE_ERROR ==
                                    mandoc_escape(end, NULL, NULL))
                                        return ESCAPE_ERROR;
                                break;
                        default:
                                (*end)++;
                                break;
                        }
                }
                /* The length excludes the terminator; *end steps past it. */
                *sz = (*end)++ - *start;

                /*
                 * The file chars.c only provides one common list
                 * of character names, but \[-] == \- is the only
                 * one of the characters with one-byte names that
                 * allows enclosing the name in brackets.
                 */
                if (gly == ESCAPE_SPECIAL && *sz == 1 && **start != '-')
                        return ESCAPE_ERROR;
        } else {
                /* Fixed-length argument: fail if the input is too short. */
                assert(*sz > 0);
                if ((size_t)*sz > strlen(*start))
                        return ESCAPE_ERROR;
                *end += *sz;
        }

        /* Run post-processors. */

        switch (gly) {
        case ESCAPE_FONT:
                gly = mandoc_font(*start, *sz);
                break;
        case ESCAPE_SPECIAL:
                /*
                 * Names of the form "charNN" or "charNNN" with a
                 * decimal code in the ranges 0x21-0x7e or 0xa0-0xff
                 * are converted to numbered character escapes, with
                 * *start and *sz narrowed to the digits.  Any other
                 * name starting with 'c' remains a special character.
                 */
                if (**start == 'c') {
                        if (*sz < 6 || *sz > 7 ||
                            strncmp(*start, "char", 4) != 0 ||
                            (int)strspn(*start + 4, "0123456789") + 4 < *sz)
                                break;
                        c = 0;
                        for (i = 4; i < *sz; i++)
                                c = 10 * c + ((*start)[i] - '0');
                        if (c < 0x21 || (c > 0x7e && c < 0xa0) || c > 0xff)
                                break;
                        *start += 4;
                        *sz -= 4;
                        gly = ESCAPE_NUMBERED;
                        break;
                }

                /*
                 * Unicode escapes are defined in groff as \[u0000]
                 * to \[u10FFFF], where the contained value must be
                 * a valid Unicode codepoint.  Here, however, only
                 * check the length and range.
                 */
                /* Need 'u' followed by four to six characters. */
                if (**start != 'u' || *sz < 5 || *sz > 7)
                        break;
                /* Six digits are only valid if they start with "10". */
                if (*sz == 7 && ((*start)[1] != '1' || (*start)[2] != '0'))
                        break;
                /* Five digits must not have a leading zero. */
                if (*sz == 6 && (*start)[1] == '0')
                        break;
                /* Reject D800 to DFFF (tested in upper case only). */
                if (*sz == 5 && (*start)[1] == 'D' &&
                    strchr("89ABCDEF", (*start)[2]) != NULL)
                        break;
                /* Finally, everything after the 'u' must be a hex digit. */
                if ((int)strspn(*start + 1, "0123456789ABCDEFabcdef")
                    + 1 == *sz)
                        gly = ESCAPE_UNICODE;
                break;
        default:
                break;
        }

        return gly;
}
 467
 468 static int
 469 a2time(time_t *t, const char *fmt, const char *p)
 470 {
 471         struct tm        tm;
 472         char            *pp;
 473
 474         memset(&tm, 0, sizeof(struct tm));
 475
 476         pp = NULL;
 477 #if HAVE_STRPTIME
 478         pp = strptime(p, fmt, &tm);
 479 #endif
 480         if (NULL != pp && '\0' == *pp) {
 481                 *t = mktime(&tm);
 482                 return 1;
 483         }
 484
 485         return 0;
 486 }
 487
 488 static char *
 489 time2a(time_t t)
 490 {
 491         struct tm       *tm;
 492         char            *buf, *p;
 493         size_t           ssz;
 494         int              isz;
 495
 496         tm = localtime(&t);
 497         if (tm == NULL)
 498                 return NULL;
 499
 500         /*
 501          * Reserve space:
 502          * up to 9 characters for the month (September) + blank
 503          * up to 2 characters for the day + comma + blank
 504          * 4 characters for the year and a terminating '\0'
 505          */
 506
 507         p = buf = mandoc_malloc(10 + 4 + 4 + 1);
 508
 509         if ((ssz = strftime(p, 10 + 1, "%B ", tm)) == 0)
 510                 goto fail;
 511         p += (int)ssz;
 512
 513         /*
 514          * The output format is just "%d" here, not "%2d" or "%02d".
 515          * That's also the reason why we can't just format the
 516          * date as a whole with "%B %e, %Y" or "%B %d, %Y".
 517          * Besides, the present approach is less prone to buffer
 518          * overflows, in case anybody should ever introduce the bug
 519          * of looking at LC_TIME.
 520          */
 521
 522         if ((isz = snprintf(p, 4 + 1, "%d, ", tm->tm_mday)) == -1)
 523                 goto fail;
 524         p += isz;
 525
 526         if (strftime(p, 4 + 1, "%Y", tm) == 0)
 527                 goto fail;
 528         return buf;
 529
 530 fail:
 531         free(buf);
 532         return NULL;
 533 }
 534
 535 char *
 536 mandoc_normdate(struct roff_man *man, char *in, int ln, int pos)
 537 {
 538         char            *cp;
 539         time_t           t;
 540
 541         /* No date specified: use today's date. */
 542
 543         if (in == NULL || *in == '\0' || strcmp(in, "$" "Mdocdate$") == 0) {
 544                 mandoc_msg(MANDOCERR_DATE_MISSING, ln, pos, NULL);
 545                 return time2a(time(NULL));
 546         }
 547
 548         /* Valid mdoc(7) date format. */
 549
 550         if (a2time(&t, "$" "Mdocdate: %b %d %Y $", in) ||
 551             a2time(&t, "%b %d, %Y", in)) {
 552                 cp = time2a(t);
 553                 if (t > time(NULL) + 86400)
 554                         mandoc_msg(MANDOCERR_DATE_FUTURE, ln, pos, "%s", cp);
 555                 else if (*in != '$' && strcmp(in, cp) != 0)
 556                         mandoc_msg(MANDOCERR_DATE_NORM, ln, pos, "%s", cp);
 557                 return cp;
 558         }
 559
 560         /* In man(7), do not warn about the legacy format. */
 561
 562         if (a2time(&t, "%Y-%m-%d", in) == 0)
 563                 mandoc_msg(MANDOCERR_DATE_BAD, ln, pos, "%s", in);
 564         else if (t > time(NULL) + 86400)
 565                 mandoc_msg(MANDOCERR_DATE_FUTURE, ln, pos, "%s", in);
 566         else if (man->macroset == MACROSET_MDOC)
 567                 mandoc_msg(MANDOCERR_DATE_LEGACY, ln, pos, "Dd %s", in);
 568
 569         /* Use any non-mdoc(7) date verbatim. */
 570
 571         return mandoc_strdup(in);
 572 }
 573
 574 int
 575 mandoc_eos(const char *p, size_t sz)
 576 {
 577         const char      *q;
 578         int              enclosed, found;
 579
 580         if (0 == sz)
 581                 return 0;
 582
 583         /*
 584          * End-of-sentence recognition must include situations where
 585          * some symbols, such as `)', allow prior EOS punctuation to
 586          * propagate outward.
 587          */
 588
 589         enclosed = found = 0;
 590         for (q = p + (int)sz - 1; q >= p; q--) {
 591                 switch (*q) {
 592                 case '\"':
 593                 case '\'':
 594                 case ']':
 595                 case ')':
 596                         if (0 == found)
 597                                 enclosed = 1;
 598                         break;
 599                 case '.':
 600                 case '!':
 601                 case '?':
 602                         found = 1;
 603                         break;
 604                 default:
 605                         return found &&
 606                             (!enclosed || isalnum((unsigned char)*q));
 607                 }
 608         }
 609
 610         return found && !enclosed;
 611 }
 612
 613 /*
 614  * Convert a string to a long that may not be <0.
 615  * If the string is invalid, or is less than 0, return -1.
 616  */
 617 int
 618 mandoc_strntoi(const char *p, size_t sz, int base)
 619 {
 620         char             buf[32];
 621         char            *ep;
 622         long             v;
 623
 624         if (sz > 31)
 625                 return -1;
 626
 627         memcpy(buf, p, sz);
 628         buf[(int)sz] = '\0';
 629
 630         errno = 0;
 631         v = strtol(buf, &ep, base);
 632
 633         if (buf[0] == '\0' || *ep != '\0')
 634                 return -1;
 635
 636         if (v > INT_MAX)
 637                 v = INT_MAX;
 638         if (v < INT_MIN)
 639                 v = INT_MIN;
 640
 641         return (int)v;
 642 }