]> git.cameronkatri.com Git - mandoc.git/blob - mandoc.3
Security fix to prevent XSS attacks:
[mandoc.git] / mandoc.3
1 .\" $Id: mandoc.3,v 1.24 2014/03/23 11:25:26 schwarze Exp $
2 .\"
3 .\" Copyright (c) 2009, 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv>
4 .\" Copyright (c) 2010 Ingo Schwarze <schwarze@openbsd.org>
5 .\"
6 .\" Permission to use, copy, modify, and distribute this software for any
7 .\" purpose with or without fee is hereby granted, provided that the above
8 .\" copyright notice and this permission notice appear in all copies.
9 .\"
10 .\" THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11 .\" WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12 .\" MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13 .\" ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14 .\" WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15 .\" ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16 .\" OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17 .\"
18 .Dd $Mdocdate: March 23 2014 $
19 .Dt MANDOC 3
20 .Os
21 .Sh NAME
22 .Nm mandoc ,
23 .Nm mandoc_calloc ,
24 .Nm mandoc_escape ,
25 .Nm mandoc_malloc ,
26 .Nm mandoc_realloc ,
27 .Nm mandoc_strdup ,
28 .Nm mandoc_strndup ,
29 .Nm man_meta ,
30 .Nm man_mparse ,
31 .Nm man_node ,
32 .Nm mchars_alloc ,
33 .Nm mchars_free ,
34 .Nm mchars_num2char ,
35 .Nm mchars_num2uc ,
36 .Nm mchars_spec2cp ,
37 .Nm mchars_spec2str ,
38 .Nm mdoc_meta ,
39 .Nm mdoc_node ,
40 .Nm mparse_alloc ,
41 .Nm mparse_free ,
42 .Nm mparse_getkeep ,
43 .Nm mparse_keep ,
44 .Nm mparse_readfd ,
45 .Nm mparse_reset ,
46 .Nm mparse_result ,
47 .Nm mparse_strerror ,
48 .Nm mparse_strlevel
49 .Nd mandoc macro compiler library
50 .Sh LIBRARY
51 .Lb libmandoc
52 .Sh SYNOPSIS
53 .In mandoc.h
54 .Fd "#define ASCII_NBRSP"
55 .Fd "#define ASCII_HYPH"
56 .Fd "#define ASCII_BREAK"
57 .Ft "void *"
58 .Fo mandoc_calloc
59 .Fa "size_t nmemb"
60 .Fa "size_t size"
61 .Fc
62 .Ft "enum mandoc_esc"
63 .Fo mandoc_escape
64 .Fa "const char **end"
65 .Fa "const char **start"
66 .Fa "int *sz"
67 .Fc
68 .Ft "void *"
69 .Fn mandoc_malloc "size_t size"
70 .Ft "void *"
71 .Fo mandoc_realloc
72 .Fa "void *ptr"
73 .Fa "size_t size"
74 .Fc
75 .Ft "char *"
76 .Fn mandoc_strdup "const char *ptr"
.Ft "struct mchars *"
77 .Fn mchars_alloc "void"
78 .Ft void
79 .Fn mchars_free "struct mchars *p"
80 .Ft char
81 .Fn mchars_num2char "const char *cp" "size_t sz"
82 .Ft int
83 .Fn mchars_num2uc "const char *cp" "size_t sz"
84 .Ft "const char *"
85 .Fo mchars_spec2str
86 .Fa "const struct mchars *p"
87 .Fa "const char *cp"
88 .Fa "size_t sz"
89 .Fa "size_t *rsz"
90 .Fc
91 .Ft int
92 .Fo mchars_spec2cp
93 .Fa "const struct mchars *p"
94 .Fa "const char *cp"
95 .Fa "size_t sz"
96 .Fc
97 .Ft "struct mparse *"
98 .Fo mparse_alloc
99 .Fa "enum mparset inttype"
100 .Fa "enum mandoclevel wlevel"
101 .Fa "mandocmsg mmsg"
102 .Fa "char *defos"
103 .Fa "int quick"
104 .Fc
105 .Ft void
106 .Fo (*mandocmsg)
107 .Fa "enum mandocerr errtype"
108 .Fa "enum mandoclevel level"
109 .Fa "const char *file"
110 .Fa "int line"
111 .Fa "int col"
112 .Fa "const char *msg"
113 .Fc
114 .Ft void
115 .Fo mparse_free
116 .Fa "struct mparse *parse"
117 .Fc
118 .Ft const char *
119 .Fo mparse_getkeep
120 .Fa "const struct mparse *parse"
121 .Fc
122 .Ft void
123 .Fo mparse_keep
124 .Fa "struct mparse *parse"
125 .Fc
126 .Ft "enum mandoclevel"
127 .Fo mparse_readfd
128 .Fa "struct mparse *parse"
129 .Fa "int fd"
130 .Fa "const char *fname"
131 .Fc
132 .Ft void
133 .Fo mparse_reset
134 .Fa "struct mparse *parse"
135 .Fc
136 .Ft void
137 .Fo mparse_result
138 .Fa "struct mparse *parse"
139 .Fa "struct mdoc **mdoc"
140 .Fa "struct man **man"
141 .Fc
142 .Ft "const char *"
143 .Fo mparse_strerror
144 .Fa "enum mandocerr"
145 .Fc
146 .Ft "const char *"
147 .Fo mparse_strlevel
148 .Fa "enum mandoclevel"
149 .Fc
150 .In mandoc.h
151 .In mdoc.h
152 .Ft "const struct mdoc_meta *"
153 .Fo mdoc_meta
154 .Fa "const struct mdoc *mdoc"
155 .Fc
156 .Ft "const struct mdoc_node *"
157 .Fo mdoc_node
158 .Fa "const struct mdoc *mdoc"
159 .Fc
160 .Vt extern const char * const * mdoc_argnames;
161 .Vt extern const char * const * mdoc_macronames;
162 .In mandoc.h
163 .In man.h
164 .Ft "const struct man_meta *"
165 .Fo man_meta
166 .Fa "const struct man *man"
167 .Fc
168 .Ft "const struct mparse *"
169 .Fo man_mparse
170 .Fa "const struct man *man"
171 .Fc
172 .Ft "const struct man_node *"
173 .Fo man_node
174 .Fa "const struct man *man"
175 .Fc
176 .Vt extern const char * const * man_macronames;
177 .Sh DESCRIPTION
178 The
179 .Nm mandoc
180 library parses a
181 .Ux
182 manual into an abstract syntax tree (AST).
183 .Ux
184 manuals are composed of
185 .Xr mdoc 7
186 or
187 .Xr man 7 ,
188 and may be mixed with
189 .Xr roff 7 ,
190 .Xr tbl 7 ,
191 and
192 .Xr eqn 7
193 invocations.
194 .Pp
195 The following describes a general parse sequence:
196 .Bl -enum
197 .It
198 initiate a parsing sequence with
199 .Fn mparse_alloc ;
200 .It
201 parse files or file descriptors with
202 .Fn mparse_readfd ;
203 .It
204 retrieve a parsed syntax tree, if the parse was successful, with
205 .Fn mparse_result ;
206 .It
207 iterate over parse nodes with
208 .Fn mdoc_node
209 or
210 .Fn man_node ;
211 .It
212 free all allocated memory with
213 .Fn mparse_free ,
214 or invoke
215 .Fn mparse_reset
216 and parse new files.
217 .El
218 .Pp
219 The
220 .Nm
221 library also contains routines for translating character strings into glyphs
222 .Pq see Fn mchars_alloc
223 and parsing escape sequences from strings
224 .Pq see Fn mandoc_escape .
225 .Sh REFERENCE
226 This section documents the functions, types, and variables available
227 via
228 .In mandoc.h .
229 .Ss Types
230 .Bl -ohang
231 .It Vt "enum mandoc_esc"
232 An escape sequence classification.
233 .It Vt "enum mandocerr"
234 A fatal error, error, or warning message during parsing.
235 .It Vt "enum mandoclevel"
236 A classification of an
237 .Vt "enum mandocerr"
238 as regards system operation.
239 .It Vt "struct mchars"
240 An opaque pointer to an object allowing for translation between
241 character strings and glyphs.
242 See
243 .Fn mchars_alloc .
244 .It Vt "enum mparset"
245 The type of parser when reading input.
246 This should usually be
247 .Dv MPARSE_AUTO
248 for auto-detection.
249 .It Vt "struct mparse"
250 An opaque pointer to a running parse sequence.
251 Created with
252 .Fn mparse_alloc
253 and freed with
254 .Fn mparse_free .
255 This may be used across parsed input if
256 .Fn mparse_reset
257 is called between parses.
258 .It Vt "mandocmsg"
259 A prototype for a function to handle fatal error, error, and warning
260 messages emitted by the parser.
261 .El
262 .Ss Functions
263 .Bl -ohang
264 .It Fn mandoc_escape
265 Scan an escape sequence, i.e., a character string beginning with
266 .Sq \e .
267 Pass a pointer to the character after the
268 .Sq \e
269 as
270 .Va end ;
271 it will be set to the supremum of the parsed escape sequence unless
272 returning
273 .Dv ESCAPE_ERROR ,
274 in which case the string is bogus and should be
275 thrown away.
276 If not
277 .Dv ESCAPE_ERROR
278 or
279 .Dv ESCAPE_IGNORE ,
280 .Va start
281 is set to the first relevant character of the substring (font, glyph,
282 whatever) of length
283 .Va sz .
284 Both
285 .Va start
286 and
287 .Va sz
288 may be
289 .Dv NULL .
290 Declared in
291 .In mandoc.h ,
292 implemented in
293 .Pa mandoc.c .
294 .It Fn man_meta
295 Obtain the meta-data of a successful parse.
296 This may only be used on a pointer returned by
297 .Fn mparse_result .
298 Declared in
299 .In man.h ,
300 implemented in
301 .Pa man.c .
302 .It Fn man_mparse
303 Get the parser used for the current output.
304 Declared in
305 .In man.h ,
306 implemented in
307 .Pa man.c .
308 .It Fn man_node
309 Obtain the root node of a successful parse.
310 This may only be used on a pointer returned by
311 .Fn mparse_result .
312 Declared in
313 .In man.h ,
314 implemented in
315 .Pa man.c .
316 .It Fn mchars_alloc
317 Allocate a
318 .Vt "struct mchars *"
319 object for translating special characters into glyphs.
320 See
321 .Xr mandoc_char 7
322 for an overview of special characters.
323 The object must be freed with
324 .Fn mchars_free .
325 Declared in
326 .In mandoc.h ,
327 implemented in
328 .Pa chars.c .
329 .It Fn mchars_free
330 Free an object created with
331 .Fn mchars_alloc .
332 Declared in
333 .In mandoc.h ,
334 implemented in
335 .Pa chars.c .
336 .It Fn mchars_num2char
337 Convert a character index (e.g., the \eN\(aq\(aq escape) into a
338 printable ASCII character.
339 Returns \e0 (the nil character) if the input sequence is malformed.
340 Declared in
341 .In mandoc.h ,
342 implemented in
343 .Pa chars.c .
344 .It Fn mchars_num2uc
345 Convert a hexadecimal character index (e.g., the \e[uNNNN] escape) into
346 a Unicode codepoint.
347 Returns \e0 (the nil character) if the input sequence is malformed.
348 Declared in
349 .In mandoc.h ,
350 implemented in
351 .Pa chars.c .
352 .It Fn mchars_spec2cp
353 Convert a special character into a valid Unicode codepoint.
354 Returns \-1 on failure or a non-zero Unicode codepoint on success.
355 Declared in
356 .In mandoc.h ,
357 implemented in
358 .Pa chars.c .
359 .It Fn mchars_spec2str
360 Convert a special character into an ASCII string.
361 Returns
362 .Dv NULL
363 on failure.
364 Declared in
365 .In mandoc.h ,
366 implemented in
367 .Pa chars.c .
368 .It Fn mdoc_meta
369 Obtain the meta-data of a successful parse.
370 This may only be used on a pointer returned by
371 .Fn mparse_result .
372 Declared in
373 .In mdoc.h ,
374 implemented in
375 .Pa mdoc.c .
376 .It Fn mdoc_node
377 Obtain the root node of a successful parse.
378 This may only be used on a pointer returned by
379 .Fn mparse_result .
380 Declared in
381 .In mdoc.h ,
382 implemented in
383 .Pa mdoc.c .
384 .It Fn mparse_alloc
385 Allocate a parser.
386 The arguments have the following effect:
387 .Bl -tag -offset 5n -width inttype
388 .It Ar inttype
389 When set to
390 .Dv MPARSE_MDOC
391 or
392 .Dv MPARSE_MAN ,
393 only that parser will be used.
394 With
395 .Dv MPARSE_AUTO ,
396 the document type will be automatically detected.
397 .It Ar wlevel
398 Can be set to
399 .Dv MANDOCLEVEL_FATAL ,
400 .Dv MANDOCLEVEL_ERROR ,
401 or
402 .Dv MANDOCLEVEL_WARNING .
403 Messages below the selected level will be suppressed.
404 .It Ar mmsg
405 A callback function to handle errors and warnings.
406 See
407 .Pa main.c
408 for an example.
409 .It Ar defos
410 A default string for the
411 .Xr mdoc 7
412 .Sq \&Os
413 macro, overriding the
414 .Dv OSNAME
415 preprocessor definition and the results of
416 .Xr uname 3 .
417 .It Ar quick
418 When set, parsing is aborted after the NAME section.
419 This is for example useful to quickly build minimal databases.
420 .El
421 .Pp
422 The same parser may be used for multiple files so long as
423 .Fn mparse_reset
424 is called between parses.
425 .Fn mparse_free
426 must be called to free the memory allocated by this function.
427 Declared in
428 .In mandoc.h ,
429 implemented in
430 .Pa read.c .
431 .It Fn mparse_free
432 Free all memory allocated by
433 .Fn mparse_alloc .
434 Declared in
435 .In mandoc.h ,
436 implemented in
437 .Pa read.c .
438 .It Fn mparse_getkeep
439 Acquire the keep buffer.
440 Must follow a call of
441 .Fn mparse_keep .
442 Declared in
443 .In mandoc.h ,
444 implemented in
445 .Pa read.c .
446 .It Fn mparse_keep
447 Instruct the parser to retain a copy of its parsed input.
448 This can be acquired with subsequent
449 .Fn mparse_getkeep
450 calls.
451 Declared in
452 .In mandoc.h ,
453 implemented in
454 .Pa read.c .
455 .It Fn mparse_readfd
456 Parse a file or file descriptor.
457 If
458 .Va fd
459 is \-1,
460 .Va fname
461 is opened for reading.
462 Otherwise,
463 .Va fname
464 is assumed to be the name associated with
465 .Va fd .
466 This may be called multiple times with different parameters; however,
467 .Fn mparse_reset
468 should be invoked between parses.
469 Declared in
470 .In mandoc.h ,
471 implemented in
472 .Pa read.c .
473 .It Fn mparse_reset
474 Reset a parser so that
475 .Fn mparse_readfd
476 may be used again.
477 Declared in
478 .In mandoc.h ,
479 implemented in
480 .Pa read.c .
481 .It Fn mparse_result
482 Obtain the result of a parse.
483 Only successful parses
484 .Po
485 i.e., those where
486 .Fn mparse_readfd
487 returned less than MANDOCLEVEL_FATAL
488 .Pc
489 should invoke this function, in which case one of the two pointers will
490 be filled in.
491 Declared in
492 .In mandoc.h ,
493 implemented in
494 .Pa read.c .
495 .It Fn mparse_strerror
496 Return a statically-allocated string representation of an error code.
497 Declared in
498 .In mandoc.h ,
499 implemented in
500 .Pa read.c .
501 .It Fn mparse_strlevel
502 Return a statically-allocated string representation of a level code.
503 Declared in
504 .In mandoc.h ,
505 implemented in
506 .Pa read.c .
507 .El
508 .Ss Variables
509 .Bl -ohang
510 .It Va man_macronames
511 The string representation of a man macro as indexed by
512 .Vt "enum mant" .
513 .It Va mdoc_argnames
514 The string representation of a mdoc macro argument as indexed by
515 .Vt "enum mdocargt" .
516 .It Va mdoc_macronames
517 The string representation of a mdoc macro as indexed by
518 .Vt "enum mdoct" .
519 .El
520 .Sh IMPLEMENTATION NOTES
521 This section consists of structural documentation for
522 .Xr mdoc 7
523 and
524 .Xr man 7
525 syntax trees and strings.
526 .Ss Man and Mdoc Strings
527 Strings may be extracted from mdoc and man meta-data, or from text
528 nodes (MDOC_TEXT and MAN_TEXT, respectively).
529 These strings have special non-printing formatting cues embedded in the
530 text itself, as well as
531 .Xr roff 7
532 escapes preserved from input.
533 Implementing systems will need to handle both situations to produce
534 human-readable text.
535 In general, strings may be assumed to consist of 7-bit ASCII characters.
536 .Pp
537 The following non-printing characters may be embedded in text strings:
538 .Bl -tag -width Ds
539 .It Dv ASCII_NBRSP
540 A non-breaking space character.
541 .It Dv ASCII_HYPH
542 A soft hyphen.
543 .El
544 .Pp
545 Escape characters are also passed verbatim into text strings.
546 An escape character is a sequence of characters beginning with the
547 backslash
548 .Pq Sq \e .
549 To construct human-readable text, these should be intercepted with
550 .Fn mandoc_escape
551 and converted with one of
552 .Fn mchars_num2char ,
553 .Fn mchars_spec2str ,
554 and so on.
555 .Ss Man Abstract Syntax Tree
556 This AST is governed by the ontological rules dictated in
557 .Xr man 7
558 and derives its terminology accordingly.
559 .Pp
560 The AST is composed of
561 .Vt struct man_node
562 nodes with element, root and text types as declared by the
563 .Va type
564 field.
565 Each node also provides its parse point (the
566 .Va line ,
567 .Va sec ,
568 and
569 .Va pos
570 fields), its position in the tree (the
571 .Va parent ,
572 .Va child ,
573 .Va next
574 and
575 .Va prev
576 fields) and some type-specific data.
577 .Pp
578 The tree itself is arranged according to the following normal form,
579 where capitalised non-terminals represent nodes.
580 .Pp
581 .Bl -tag -width "ELEMENTXX" -compact
582 .It ROOT
583 \(<- mnode+
584 .It mnode
585 \(<- ELEMENT | TEXT | BLOCK
586 .It BLOCK
587 \(<- HEAD BODY
588 .It HEAD
589 \(<- mnode*
590 .It BODY
591 \(<- mnode*
592 .It ELEMENT
593 \(<- ELEMENT | TEXT*
594 .It TEXT
595 \(<- [[:ascii:]]*
596 .El
597 .Pp
598 The only elements capable of nesting other elements are those with
599 next-line scope as documented in
600 .Xr man 7 .
601 .Ss Mdoc Abstract Syntax Tree
602 This AST is governed by the ontological
603 rules dictated in
604 .Xr mdoc 7
605 and derives its terminology accordingly.
606 .Qq In-line
607 elements described in
608 .Xr mdoc 7
609 are described simply as
610 .Qq elements .
611 .Pp
612 The AST is composed of
613 .Vt struct mdoc_node
614 nodes with block, head, body, element, root and text types as declared
615 by the
616 .Va type
617 field.
618 Each node also provides its parse point (the
619 .Va line ,
620 .Va sec ,
621 and
622 .Va pos
623 fields), its position in the tree (the
624 .Va parent ,
625 .Va child ,
626 .Va nchild ,
627 .Va next
628 and
629 .Va prev
630 fields) and some type-specific data, in particular, for nodes generated
631 from macros, the generating macro in the
632 .Va tok
633 field.
634 .Pp
635 The tree itself is arranged according to the following normal form,
636 where capitalised non-terminals represent nodes.
637 .Pp
638 .Bl -tag -width "ELEMENTXX" -compact
639 .It ROOT
640 \(<- mnode+
641 .It mnode
642 \(<- BLOCK | ELEMENT | TEXT
643 .It BLOCK
644 \(<- HEAD [TEXT] (BODY [TEXT])+ [TAIL [TEXT]]
645 .It ELEMENT
646 \(<- TEXT*
647 .It HEAD
648 \(<- mnode*
649 .It BODY
650 \(<- mnode* [ENDBODY mnode*]
651 .It TAIL
652 \(<- mnode*
653 .It TEXT
654 \(<- [[:ascii:]]*
655 .El
656 .Pp
657 Of note are the TEXT nodes following the HEAD, BODY and TAIL nodes of
658 the BLOCK production: these refer to punctuation marks.
659 Furthermore, although a TEXT node will generally have a non-zero-length
660 string, in the specific case of
661 .Sq \&.Bd \-literal ,
662 an empty line will produce a zero-length string.
663 Multiple body parts are only found in invocations of
664 .Sq \&Bl \-column ,
665 where a new body introduces a new phrase.
666 .Pp
667 The
668 .Xr mdoc 7
669 syntax tree accommodates broken block structures as well.
670 The ENDBODY node is available to end the formatting associated
671 with a given block before the physical end of that block.
672 It has a non-null
673 .Va end
674 field, is of the BODY
675 .Va type ,
676 has the same
677 .Va tok
678 as the BLOCK it is ending, and has a
679 .Va pending
680 field pointing to that BLOCK's BODY node.
681 It is an indirect child of that BODY node
682 and has no children of its own.
683 .Pp
684 An ENDBODY node is generated when a block ends while one of its child
685 blocks is still open, like in the following example:
686 .Bd -literal -offset indent
687 \&.Ao ao
688 \&.Bo bo ac
689 \&.Ac bc
690 \&.Bc end
691 .Ed
692 .Pp
693 This example results in the following block structure:
694 .Bd -literal -offset indent
695 BLOCK Ao
696     HEAD Ao
697     BODY Ao
698         TEXT ao
699         BLOCK Bo, pending -> Ao
700             HEAD Bo
701             BODY Bo
702                 TEXT bo
703                 TEXT ac
704                 ENDBODY Ao, pending -> Ao
705                 TEXT bc
706 TEXT end
707 .Ed
708 .Pp
709 Here, the formatting of the
710 .Sq \&Ao
711 block extends from TEXT ao to TEXT ac,
712 while the formatting of the
713 .Sq \&Bo
714 block extends from TEXT bo to TEXT bc.
715 It renders as follows in
716 .Fl T Ns Cm ascii
717 mode:
718 .Pp
719 .Dl <ao [bo ac> bc] end
720 .Pp
721 Support for badly-nested blocks is only provided for backward
722 compatibility with some older
723 .Xr mdoc 7
724 implementations.
725 Using badly-nested blocks is
726 .Em strongly discouraged ;
727 for example, the
728 .Fl T Ns Cm html
729 and
730 .Fl T Ns Cm xhtml
731 front-ends to
732 .Xr mandoc 1
733 are unable to render them in any meaningful way.
734 Furthermore, behaviour when encountering badly-nested blocks is not
735 consistent across troff implementations, especially when using multiple
736 levels of badly-nested blocks.
737 .Sh SEE ALSO
738 .Xr mandoc 1 ,
739 .Xr eqn 7 ,
740 .Xr man 7 ,
741 .Xr mandoc_char 7 ,
742 .Xr mdoc 7 ,
743 .Xr roff 7 ,
744 .Xr tbl 7
745 .Sh AUTHORS
746 The
747 .Nm
748 library was written by
749 .An Kristaps Dzonsons Aq Mt kristaps@bsd.lv .