mandoc.c

   1 /*      $Id: mandoc.c,v 1.102 2017/06/14 01:31:26 schwarze Exp $ */
   2 /*
   3  * Copyright (c) 2008-2011, 2014 Kristaps Dzonsons <kristaps@bsd.lv>
   4  * Copyright (c) 2011-2015, 2017 Ingo Schwarze <schwarze@openbsd.org>
   5  *
   6  * Permission to use, copy, modify, and distribute this software for any
   7  * purpose with or without fee is hereby granted, provided that the above
   8  * copyright notice and this permission notice appear in all copies.
   9  *
  10  * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHORS DISCLAIM ALL WARRANTIES
  11  * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
  12  * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR
  13  * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
  14  * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
  15  * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
  16  * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
  17  */
  18 #include "config.h"
  19
  20 #include <sys/types.h>
  21
  22 #include <assert.h>
  23 #include <ctype.h>
  24 #include <errno.h>
  25 #include <limits.h>
  26 #include <stdlib.h>
  27 #include <stdio.h>
  28 #include <string.h>
  29 #include <time.h>
  30
  31 #include "mandoc_aux.h"
  32 #include "mandoc.h"
  33 #include "roff.h"
  34 #include "libmandoc.h"
  35
  36 static  int      a2time(time_t *, const char *, const char *);
  37 static  char    *time2a(time_t);
  38
  39
  40 enum mandoc_esc
  41 mandoc_escape(const char **end, const char **start, int *sz)
  42 {
  43         const char      *local_start;
  44         int              local_sz;
  45         char             term;
  46         enum mandoc_esc  gly;
  47
  48         /*
  49          * When the caller doesn't provide return storage,
  50          * use local storage.
  51          */
  52
  53         if (NULL == start)
  54                 start = &local_start;
  55         if (NULL == sz)
  56                 sz = &local_sz;
  57
  58         /*
  59          * Beyond the backslash, at least one input character
  60          * is part of the escape sequence.  With one exception
  61          * (see below), that character won't be returned.
  62          */
  63
  64         gly = ESCAPE_ERROR;
  65         *start = ++*end;
  66         *sz = 0;
  67         term = '\0';
  68
  69         switch ((*start)[-1]) {
  70         /*
  71          * First the glyphs.  There are several different forms of
  72          * these, but each eventually returns a substring of the glyph
  73          * name.
  74          */
  75         case '(':
  76                 gly = ESCAPE_SPECIAL;
  77                 *sz = 2;
  78                 break;
  79         case '[':
  80                 gly = ESCAPE_SPECIAL;
  81                 term = ']';
  82                 break;
  83         case 'C':
  84                 if ('\'' != **start)
  85                         return ESCAPE_ERROR;
  86                 *start = ++*end;
  87                 gly = ESCAPE_SPECIAL;
  88                 term = '\'';
  89                 break;
  90
  91         /*
  92          * Escapes taking no arguments at all.
  93          */
  94         case 'd':
  95         case 'u':
  96         case ',':
  97         case '/':
  98                 return ESCAPE_IGNORE;
  99         case 'p':
 100                 return ESCAPE_BREAK;
 101
 102         /*
 103          * The \z escape is supposed to output the following
 104          * character without advancing the cursor position.
 105          * Since we are mostly dealing with terminal mode,
 106          * let us just skip the next character.
 107          */
 108         case 'z':
 109                 return ESCAPE_SKIPCHAR;
 110
 111         /*
 112          * Handle all triggers matching \X(xy, \Xx, and \X[xxxx], where
 113          * 'X' is the trigger.  These have opaque sub-strings.
 114          */
 115         case 'F':
 116         case 'g':
 117         case 'k':
 118         case 'M':
 119         case 'm':
 120         case 'n':
 121         case 'V':
 122         case 'Y':
 123                 gly = ESCAPE_IGNORE;
 124                 /* FALLTHROUGH */
 125         case 'f':
 126                 if (ESCAPE_ERROR == gly)
 127                         gly = ESCAPE_FONT;
 128                 switch (**start) {
 129                 case '(':
 130                         *start = ++*end;
 131                         *sz = 2;
 132                         break;
 133                 case '[':
 134                         *start = ++*end;
 135                         term = ']';
 136                         break;
 137                 default:
 138                         *sz = 1;
 139                         break;
 140                 }
 141                 break;
 142
 143         /*
 144          * These escapes are of the form \X'Y', where 'X' is the trigger
 145          * and 'Y' is any string.  These have opaque sub-strings.
 146          * The \B and \w escapes are handled in roff.c, roff_res().
 147          */
 148         case 'A':
 149         case 'b':
 150         case 'D':
 151         case 'R':
 152         case 'X':
 153         case 'Z':
 154                 gly = ESCAPE_IGNORE;
 155                 /* FALLTHROUGH */
 156         case 'o':
 157                 if (**start == '\0')
 158                         return ESCAPE_ERROR;
 159                 if (gly == ESCAPE_ERROR)
 160                         gly = ESCAPE_OVERSTRIKE;
 161                 term = **start;
 162                 *start = ++*end;
 163                 break;
 164
 165         /*
 166          * These escapes are of the form \X'N', where 'X' is the trigger
 167          * and 'N' resolves to a numerical expression.
 168          */
 169         case 'h':
 170         case 'H':
 171         case 'L':
 172         case 'l':
 173         case 'S':
 174         case 'v':
 175         case 'x':
 176                 if (strchr(" %&()*+-./0123456789:<=>", **start)) {
 177                         if ('\0' != **start)
 178                                 ++*end;
 179                         return ESCAPE_ERROR;
 180                 }
 181                 switch ((*start)[-1]) {
 182                 case 'h':
 183                         gly = ESCAPE_HORIZ;
 184                         break;
 185                 case 'l':
 186                         gly = ESCAPE_HLINE;
 187                         break;
 188                 default:
 189                         gly = ESCAPE_IGNORE;
 190                         break;
 191                 }
 192                 term = **start;
 193                 *start = ++*end;
 194                 break;
 195
 196         /*
 197          * Special handling for the numbered character escape.
 198          * XXX Do any other escapes need similar handling?
 199          */
 200         case 'N':
 201                 if ('\0' == **start)
 202                         return ESCAPE_ERROR;
 203                 (*end)++;
 204                 if (isdigit((unsigned char)**start)) {
 205                         *sz = 1;
 206                         return ESCAPE_IGNORE;
 207                 }
 208                 (*start)++;
 209                 while (isdigit((unsigned char)**end))
 210                         (*end)++;
 211                 *sz = *end - *start;
 212                 if ('\0' != **end)
 213                         (*end)++;
 214                 return ESCAPE_NUMBERED;
 215
 216         /*
 217          * Sizes get a special category of their own.
 218          */
 219         case 's':
 220                 gly = ESCAPE_IGNORE;
 221
 222                 /* See +/- counts as a sign. */
 223                 if ('+' == **end || '-' == **end || ASCII_HYPH == **end)
 224                         *start = ++*end;
 225
 226                 switch (**end) {
 227                 case '(':
 228                         *start = ++*end;
 229                         *sz = 2;
 230                         break;
 231                 case '[':
 232                         *start = ++*end;
 233                         term = ']';
 234                         break;
 235                 case '\'':
 236                         *start = ++*end;
 237                         term = '\'';
 238                         break;
 239                 case '3':
 240                 case '2':
 241                 case '1':
 242                         *sz = (*end)[-1] == 's' &&
 243                             isdigit((unsigned char)(*end)[1]) ? 2 : 1;
 244                         break;
 245                 default:
 246                         *sz = 1;
 247                         break;
 248                 }
 249
 250                 break;
 251
 252         /*
 253          * Anything else is assumed to be a glyph.
 254          * In this case, pass back the character after the backslash.
 255          */
 256         default:
 257                 gly = ESCAPE_SPECIAL;
 258                 *start = --*end;
 259                 *sz = 1;
 260                 break;
 261         }
 262
 263         assert(ESCAPE_ERROR != gly);
 264
 265         /*
 266          * Read up to the terminating character,
 267          * paying attention to nested escapes.
 268          */
 269
 270         if ('\0' != term) {
 271                 while (**end != term) {
 272                         switch (**end) {
 273                         case '\0':
 274                                 return ESCAPE_ERROR;
 275                         case '\\':
 276                                 (*end)++;
 277                                 if (ESCAPE_ERROR ==
 278                                     mandoc_escape(end, NULL, NULL))
 279                                         return ESCAPE_ERROR;
 280                                 break;
 281                         default:
 282                                 (*end)++;
 283                                 break;
 284                         }
 285                 }
 286                 *sz = (*end)++ - *start;
 287         } else {
 288                 assert(*sz > 0);
 289                 if ((size_t)*sz > strlen(*start))
 290                         return ESCAPE_ERROR;
 291                 *end += *sz;
 292         }
 293
 294         /* Run post-processors. */
 295
 296         switch (gly) {
 297         case ESCAPE_FONT:
 298                 if (2 == *sz) {
 299                         if ('C' == **start) {
 300                                 /*
 301                                  * Treat constant-width font modes
 302                                  * just like regular font modes.
 303                                  */
 304                                 (*start)++;
 305                                 (*sz)--;
 306                         } else {
 307                                 if ('B' == (*start)[0] && 'I' == (*start)[1])
 308                                         gly = ESCAPE_FONTBI;
 309                                 break;
 310                         }
 311                 } else if (1 != *sz)
 312                         break;
 313
 314                 switch (**start) {
 315                 case '3':
 316                 case 'B':
 317                         gly = ESCAPE_FONTBOLD;
 318                         break;
 319                 case '2':
 320                 case 'I':
 321                         gly = ESCAPE_FONTITALIC;
 322                         break;
 323                 case 'P':
 324                         gly = ESCAPE_FONTPREV;
 325                         break;
 326                 case '1':
 327                 case 'R':
 328                         gly = ESCAPE_FONTROMAN;
 329                         break;
 330                 }
 331                 break;
 332         case ESCAPE_SPECIAL:
 333                 if (1 == *sz && 'c' == **start)
 334                         gly = ESCAPE_NOSPACE;
 335                 /*
 336                  * Unicode escapes are defined in groff as \[u0000]
 337                  * to \[u10FFFF], where the contained value must be
 338                  * a valid Unicode codepoint.  Here, however, only
 339                  * check the length and range.
 340                  */
 341                 if (**start != 'u' || *sz < 5 || *sz > 7)
 342                         break;
 343                 if (*sz == 7 && ((*start)[1] != '1' || (*start)[2] != '0'))
 344                         break;
 345                 if (*sz == 6 && (*start)[1] == '0')
 346                         break;
 347                 if (*sz == 5 && (*start)[1] == 'D' &&
 348                     strchr("89ABCDEF", (*start)[2]) != NULL)
 349                         break;
 350                 if ((int)strspn(*start + 1, "0123456789ABCDEFabcdef")
 351                     + 1 == *sz)
 352                         gly = ESCAPE_UNICODE;
 353                 break;
 354         default:
 355                 break;
 356         }
 357
 358         return gly;
 359 }
 360
 361 /*
 362  * Parse a quoted or unquoted roff-style request or macro argument.
 363  * Return a pointer to the parsed argument, which is either the original
 364  * pointer or advanced by one byte in case the argument is quoted.
 365  * NUL-terminate the argument in place.
 366  * Collapse pairs of quotes inside quoted arguments.
 367  * Advance the argument pointer to the next argument,
 368  * or to the NUL byte terminating the argument line.
 369  */
 370 char *
 371 mandoc_getarg(struct mparse *parse, char **cpp, int ln, int *pos)
 372 {
 373         char     *start, *cp;
 374         int       quoted, pairs, white;
 375
 376         /* Quoting can only start with a new word. */
 377         start = *cpp;
 378         quoted = 0;
 379         if ('"' == *start) {
 380                 quoted = 1;
 381                 start++;
 382         }
 383
 384         pairs = 0;
 385         white = 0;
 386         for (cp = start; '\0' != *cp; cp++) {
 387
 388                 /*
 389                  * Move the following text left
 390                  * after quoted quotes and after "\\" and "\t".
 391                  */
 392                 if (pairs)
 393                         cp[-pairs] = cp[0];
 394
 395                 if ('\\' == cp[0]) {
 396                         /*
 397                          * In copy mode, translate double to single
 398                          * backslashes and backslash-t to literal tabs.
 399                          */
 400                         switch (cp[1]) {
 401                         case 't':
 402                                 cp[0] = '\t';
 403                                 /* FALLTHROUGH */
 404                         case '\\':
 405                                 pairs++;
 406                                 cp++;
 407                                 break;
 408                         case ' ':
 409                                 /* Skip escaped blanks. */
 410                                 if (0 == quoted)
 411                                         cp++;
 412                                 break;
 413                         default:
 414                                 break;
 415                         }
 416                 } else if (0 == quoted) {
 417                         if (' ' == cp[0]) {
 418                                 /* Unescaped blanks end unquoted args. */
 419                                 white = 1;
 420                                 break;
 421                         }
 422                 } else if ('"' == cp[0]) {
 423                         if ('"' == cp[1]) {
 424                                 /* Quoted quotes collapse. */
 425                                 pairs++;
 426                                 cp++;
 427                         } else {
 428                                 /* Unquoted quotes end quoted args. */
 429                                 quoted = 2;
 430                                 break;
 431                         }
 432                 }
 433         }
 434
 435         /* Quoted argument without a closing quote. */
 436         if (1 == quoted)
 437                 mandoc_msg(MANDOCERR_ARG_QUOTE, parse, ln, *pos, NULL);
 438
 439         /* NUL-terminate this argument and move to the next one. */
 440         if (pairs)
 441                 cp[-pairs] = '\0';
 442         if ('\0' != *cp) {
 443                 *cp++ = '\0';
 444                 while (' ' == *cp)
 445                         cp++;
 446         }
 447         *pos += (int)(cp - start) + (quoted ? 1 : 0);
 448         *cpp = cp;
 449
 450         if ('\0' == *cp && (white || ' ' == cp[-1]))
 451                 mandoc_msg(MANDOCERR_SPACE_EOL, parse, ln, *pos, NULL);
 452
 453         return start;
 454 }
 455
 456 static int
 457 a2time(time_t *t, const char *fmt, const char *p)
 458 {
 459         struct tm        tm;
 460         char            *pp;
 461
 462         memset(&tm, 0, sizeof(struct tm));
 463
 464         pp = NULL;
 465 #if HAVE_STRPTIME
 466         pp = strptime(p, fmt, &tm);
 467 #endif
 468         if (NULL != pp && '\0' == *pp) {
 469                 *t = mktime(&tm);
 470                 return 1;
 471         }
 472
 473         return 0;
 474 }
 475
 476 static char *
 477 time2a(time_t t)
 478 {
 479         struct tm       *tm;
 480         char            *buf, *p;
 481         size_t           ssz;
 482         int              isz;
 483
 484         tm = localtime(&t);
 485         if (tm == NULL)
 486                 return NULL;
 487
 488         /*
 489          * Reserve space:
 490          * up to 9 characters for the month (September) + blank
 491          * up to 2 characters for the day + comma + blank
 492          * 4 characters for the year and a terminating '\0'
 493          */
 494
 495         p = buf = mandoc_malloc(10 + 4 + 4 + 1);
 496
 497         if ((ssz = strftime(p, 10 + 1, "%B ", tm)) == 0)
 498                 goto fail;
 499         p += (int)ssz;
 500
 501         /*
 502          * The output format is just "%d" here, not "%2d" or "%02d".
 503          * That's also the reason why we can't just format the
 504          * date as a whole with "%B %e, %Y" or "%B %d, %Y".
 505          * Besides, the present approach is less prone to buffer
 506          * overflows, in case anybody should ever introduce the bug
 507          * of looking at LC_TIME.
 508          */
 509
 510         if ((isz = snprintf(p, 4 + 1, "%d, ", tm->tm_mday)) == -1)
 511                 goto fail;
 512         p += isz;
 513
 514         if (strftime(p, 4 + 1, "%Y", tm) == 0)
 515                 goto fail;
 516         return buf;
 517
 518 fail:
 519         free(buf);
 520         return NULL;
 521 }
 522
 523 char *
 524 mandoc_normdate(struct roff_man *man, char *in, int ln, int pos)
 525 {
 526         time_t           t;
 527
 528         /* No date specified: use today's date. */
 529
 530         if (in == NULL || *in == '\0' || strcmp(in, "$" "Mdocdate$") == 0) {
 531                 mandoc_msg(MANDOCERR_DATE_MISSING, man->parse, ln, pos, NULL);
 532                 return time2a(time(NULL));
 533         }
 534
 535         /* Valid mdoc(7) date format. */
 536
 537         if (a2time(&t, "$" "Mdocdate: %b %d %Y $", in) ||
 538             a2time(&t, "%b %d, %Y", in))
 539                 return time2a(t);
 540
 541         /* In man(7), do not warn about the legacy format. */
 542
 543         if (a2time(&t, "%Y-%m-%d", in) == 0)
 544                 mandoc_msg(MANDOCERR_DATE_BAD, man->parse, ln, pos, in);
 545         else if (man->macroset == MACROSET_MDOC)
 546                 mandoc_vmsg(MANDOCERR_DATE_LEGACY, man->parse,
 547                     ln, pos, "Dd %s", in);
 548
 549         /* Use any non-mdoc(7) date verbatim. */
 550
 551         return mandoc_strdup(in);
 552 }
 553
 554 int
 555 mandoc_eos(const char *p, size_t sz)
 556 {
 557         const char      *q;
 558         int              enclosed, found;
 559
 560         if (0 == sz)
 561                 return 0;
 562
 563         /*
 564          * End-of-sentence recognition must include situations where
 565          * some symbols, such as `)', allow prior EOS punctuation to
 566          * propagate outward.
 567          */
 568
 569         enclosed = found = 0;
 570         for (q = p + (int)sz - 1; q >= p; q--) {
 571                 switch (*q) {
 572                 case '\"':
 573                 case '\'':
 574                 case ']':
 575                 case ')':
 576                         if (0 == found)
 577                                 enclosed = 1;
 578                         break;
 579                 case '.':
 580                 case '!':
 581                 case '?':
 582                         found = 1;
 583                         break;
 584                 default:
 585                         return found &&
 586                             (!enclosed || isalnum((unsigned char)*q));
 587                 }
 588         }
 589
 590         return found && !enclosed;
 591 }
 592
 593 /*
 594  * Convert a string to a long that may not be <0.
 595  * If the string is invalid, or is less than 0, return -1.
 596  */
 597 int
 598 mandoc_strntoi(const char *p, size_t sz, int base)
 599 {
 600         char             buf[32];
 601         char            *ep;
 602         long             v;
 603
 604         if (sz > 31)
 605                 return -1;
 606
 607         memcpy(buf, p, sz);
 608         buf[(int)sz] = '\0';
 609
 610         errno = 0;
 611         v = strtol(buf, &ep, base);
 612
 613         if (buf[0] == '\0' || *ep != '\0')
 614                 return -1;
 615
 616         if (v > INT_MAX)
 617                 v = INT_MAX;
 618         if (v < INT_MIN)
 619                 v = INT_MIN;
 620
 621         return (int)v;
 622 }