]> git.cameronkatri.com Git - mandoc.git/blob - mandoc.3
Reject the escape sequences \[uD800] to \[uDFFF] in the parser.
[mandoc.git] / mandoc.3
1 .\" $Id: mandoc.3,v 1.33 2015/10/13 22:59:54 schwarze Exp $
2 .\"
3 .\" Copyright (c) 2009, 2010, 2011 Kristaps Dzonsons <kristaps@bsd.lv>
4 .\" Copyright (c) 2010, 2013, 2014, 2015 Ingo Schwarze <schwarze@openbsd.org>
5 .\"
6 .\" Permission to use, copy, modify, and distribute this software for any
7 .\" purpose with or without fee is hereby granted, provided that the above
8 .\" copyright notice and this permission notice appear in all copies.
9 .\"
10 .\" THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES
11 .\" WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF
12 .\" MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR
13 .\" ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES
14 .\" WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN
15 .\" ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF
16 .\" OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
17 .\"
18 .Dd $Mdocdate: October 13 2015 $
19 .Dt MANDOC 3
20 .Os
21 .Sh NAME
22 .Nm mandoc ,
23 .Nm man_deroff ,
24 .Nm man_meta ,
25 .Nm man_mparse ,
26 .Nm man_node ,
27 .Nm mdoc_deroff ,
28 .Nm mdoc_meta ,
29 .Nm mdoc_node ,
30 .Nm mparse_alloc ,
31 .Nm mparse_free ,
32 .Nm mparse_getkeep ,
33 .Nm mparse_keep ,
34 .Nm mparse_open ,
35 .Nm mparse_readfd ,
36 .Nm mparse_reset ,
37 .Nm mparse_result ,
38 .Nm mparse_strerror ,
39 .Nm mparse_strlevel
40 .Nd mandoc macro compiler library
41 .Sh SYNOPSIS
42 .In sys/types.h
43 .In mandoc.h
44 .Pp
45 .Fd "#define ASCII_NBRSP"
46 .Fd "#define ASCII_HYPH"
47 .Fd "#define ASCII_BREAK"
48 .Ft struct mparse *
49 .Fo mparse_alloc
50 .Fa "int options"
51 .Fa "enum mandoclevel wlevel"
52 .Fa "mandocmsg mmsg"
53 .Fa "char *defos"
54 .Fc
55 .Ft void
56 .Fo (*mandocmsg)
57 .Fa "enum mandocerr errtype"
58 .Fa "enum mandoclevel level"
59 .Fa "const char *file"
60 .Fa "int line"
61 .Fa "int col"
62 .Fa "const char *msg"
63 .Fc
64 .Ft void
65 .Fo mparse_free
66 .Fa "struct mparse *parse"
67 .Fc
68 .Ft const char *
69 .Fo mparse_getkeep
70 .Fa "const struct mparse *parse"
71 .Fc
72 .Ft void
73 .Fo mparse_keep
74 .Fa "struct mparse *parse"
75 .Fc
76 .Ft "enum mandoclevel"
77 .Fo mparse_open
78 .Fa "struct mparse *parse"
79 .Fa "int *fd"
80 .Fa "const char *fname"
81 .Fc
82 .Ft "enum mandoclevel"
83 .Fo mparse_readfd
84 .Fa "struct mparse *parse"
85 .Fa "int fd"
86 .Fa "const char *fname"
87 .Fc
88 .Ft void
89 .Fo mparse_reset
90 .Fa "struct mparse *parse"
91 .Fc
92 .Ft void
93 .Fo mparse_result
94 .Fa "struct mparse *parse"
95 .Fa "struct mdoc **mdoc"
96 .Fa "struct man **man"
97 .Fa "char **sodest"
98 .Fc
99 .Ft "const char *"
100 .Fo mparse_strerror
101 .Fa "enum mandocerr"
102 .Fc
103 .Ft "const char *"
104 .Fo mparse_strlevel
105 .Fa "enum mandoclevel"
106 .Fc
107 .In sys/types.h
108 .In mandoc.h
109 .In mdoc.h
110 .Ft void
111 .Fo mdoc_deroff
112 .Fa "char **dest"
113 .Fa "const struct mdoc_node *node"
114 .Fc
115 .Ft "const struct mdoc_meta *"
116 .Fo mdoc_meta
117 .Fa "const struct mdoc *mdoc"
118 .Fc
119 .Ft "const struct mdoc_node *"
120 .Fo mdoc_node
121 .Fa "const struct mdoc *mdoc"
122 .Fc
123 .Vt extern const char * const * mdoc_argnames;
124 .Vt extern const char * const * mdoc_macronames;
125 .In sys/types.h
126 .In mandoc.h
127 .In man.h
128 .Ft void
129 .Fo man_deroff
130 .Fa "char **dest"
131 .Fa "const struct man_node *node"
132 .Fc
133 .Ft "const struct man_meta *"
134 .Fo man_meta
135 .Fa "const struct man *man"
136 .Fc
137 .Ft "const struct mparse *"
138 .Fo man_mparse
139 .Fa "const struct man *man"
140 .Fc
141 .Ft "const struct man_node *"
142 .Fo man_node
143 .Fa "const struct man *man"
144 .Fc
145 .Vt extern const char * const * man_macronames;
146 .Sh DESCRIPTION
147 The
148 .Nm mandoc
149 library parses a
150 .Ux
151 manual into an abstract syntax tree (AST).
152 .Ux
153 manuals are composed of
154 .Xr mdoc 7
155 or
156 .Xr man 7 ,
157 and may be mixed with
158 .Xr roff 7 ,
159 .Xr tbl 7 ,
160 and
161 .Xr eqn 7
162 invocations.
163 .Pp
164 The following describes a general parse sequence:
165 .Bl -enum
166 .It
167 initiate a parsing sequence with
168 .Xr mchars_alloc 3
169 and
170 .Fn mparse_alloc ;
171 .It
172 open a file with
173 .Xr open 2
174 or
175 .Fn mparse_open ;
176 .It
177 parse it with
178 .Fn mparse_readfd ;
179 .It
180 retrieve the syntax tree with
181 .Fn mparse_result ;
182 .It
183 iterate over parse nodes with
184 .Fn mdoc_node
185 or
186 .Fn man_node ;
187 .It
188 free all allocated memory with
189 .Fn mparse_free
190 and
191 .Xr mchars_free 3 ,
192 or invoke
193 .Fn mparse_reset
194 and parse new files.
195 .El
196 .Sh REFERENCE
197 This section documents the functions, types, and variables available
198 via
199 .In mandoc.h ,
200 with the exception of those documented in
201 .Xr mandoc_escape 3
202 and
203 .Xr mchars_alloc 3 .
204 .Ss Types
205 .Bl -ohang
206 .It Vt "enum mandocerr"
207 An error or warning message during parsing.
208 .It Vt "enum mandoclevel"
209 A classification of an
210 .Vt "enum mandocerr"
211 as regards system operation.
212 .It Vt "struct mparse"
213 An opaque pointer to a running parse sequence.
214 Created with
215 .Fn mparse_alloc
216 and freed with
217 .Fn mparse_free .
218 This may be used across parsed input if
219 .Fn mparse_reset
220 is called between parses.
221 .It Vt "mandocmsg"
222 A prototype for a function to handle error and warning
223 messages emitted by the parser.
224 .El
225 .Ss Functions
226 .Bl -ohang
227 .It Fn man_deroff
228 Obtain a text-only representation of a
229 .Vt struct man_node ,
230 including text contained in its child nodes.
231 To be used on children of the pointer returned from
232 .Fn man_node .
233 When it is no longer needed, the pointer stored in *dest by
234 .Fn man_deroff
235 can be passed to
236 .Xr free 3 .
237 .It Fn man_meta
238 Obtain the meta-data of a successful
239 .Xr man 7
240 parse.
241 This may only be used on a pointer returned by
242 .Fn mparse_result .
243 Declared in
244 .In man.h ,
245 implemented in
246 .Pa man.c .
247 .It Fn man_mparse
248 Get the parser used for the current output.
249 Declared in
250 .In man.h ,
251 implemented in
252 .Pa man.c .
253 .It Fn man_node
254 Obtain the root node of a successful
255 .Xr man 7
256 parse.
257 This may only be used on a pointer returned by
258 .Fn mparse_result .
259 Declared in
260 .In man.h ,
261 implemented in
262 .Pa man.c .
263 .It Fn mdoc_deroff
264 Obtain a text-only representation of a
265 .Vt struct mdoc_node ,
266 including text contained in its child nodes.
267 To be used on children of the pointer returned from
268 .Fn mdoc_node .
269 When it is no longer needed, the pointer stored in *dest by
270 .Fn mdoc_deroff
271 can be passed to
272 .Xr free 3 .
273 .It Fn mdoc_meta
274 Obtain the meta-data of a successful
275 .Xr mdoc 7
276 parse.
277 This may only be used on a pointer returned by
278 .Fn mparse_result .
279 Declared in
280 .In mdoc.h ,
281 implemented in
282 .Pa mdoc.c .
283 .It Fn mdoc_node
284 Obtain the root node of a successful
285 .Xr mdoc 7
286 parse.
287 This may only be used on a pointer returned by
288 .Fn mparse_result .
289 Declared in
290 .In mdoc.h ,
291 implemented in
292 .Pa mdoc.c .
293 .It Fn mparse_alloc
294 Allocate a parser.
295 The arguments have the following effect:
296 .Bl -tag -offset 5n -width inttype
297 .It Ar options
298 When the
299 .Dv MPARSE_MDOC
300 or
301 .Dv MPARSE_MAN
302 bit is set, only that parser is used.
303 Otherwise, the document type is automatically detected.
304 .Pp
305 When the
306 .Dv MPARSE_SO
307 bit is set,
308 .Xr roff 7
309 .Ic \&so
310 file inclusion requests are always honoured.
311 Otherwise, if the request is the only content in an input file,
312 only the file name is remembered, to be returned in the
313 .Fa sodest
314 argument of
315 .Fn mparse_result .
316 .Pp
317 When the
318 .Dv MPARSE_QUICK
319 bit is set, parsing is aborted after the NAME section.
320 This is for example useful in
321 .Xr makewhatis 8
322 .Fl Q
323 to quickly build minimal databases.
324 .It Ar wlevel
325 Can be set to
326 .Dv MANDOCLEVEL_BADARG ,
327 .Dv MANDOCLEVEL_ERROR ,
328 or
329 .Dv MANDOCLEVEL_WARNING .
330 Messages below the selected level will be suppressed.
331 .It Ar mmsg
332 A callback function to handle errors and warnings.
333 See
334 .Pa main.c
335 for an example.
336 .It Ar defos
337 A default string for the
338 .Xr mdoc 7
339 .Sq \&Os
340 macro, overriding the
341 .Dv OSNAME
342 preprocessor definition and the results of
343 .Xr uname 3 .
344 .El
345 .Pp
346 The same parser may be used for multiple files so long as
347 .Fn mparse_reset
348 is called between parses.
349 .Fn mparse_free
350 must be called to free the memory allocated by this function.
351 Declared in
352 .In mandoc.h ,
353 implemented in
354 .Pa read.c .
355 .It Fn mparse_free
356 Free all memory allocated by
357 .Fn mparse_alloc .
358 Declared in
359 .In mandoc.h ,
360 implemented in
361 .Pa read.c .
362 .It Fn mparse_getkeep
363 Acquire the keep buffer.
364 Must follow a call of
365 .Fn mparse_keep .
366 Declared in
367 .In mandoc.h ,
368 implemented in
369 .Pa read.c .
370 .It Fn mparse_keep
371 Instruct the parser to retain a copy of its parsed input.
372 This can be acquired with subsequent
373 .Fn mparse_getkeep
374 calls.
375 Declared in
376 .In mandoc.h ,
377 implemented in
378 .Pa read.c .
379 .It Fn mparse_open
380 Open the file for reading.
381 If that fails and
382 .Fa fname
383 does not already end in
384 .Ql .gz ,
385 try again after appending
386 .Ql .gz .
387 Remember whether the file is compressed or not.
388 Return a file descriptor open for reading in
389 .Fa fd ,
390 or -1 on failure.
391 It can be passed to
392 .Fn mparse_readfd
393 or used directly.
394 Declared in
395 .In mandoc.h ,
396 implemented in
397 .Pa read.c .
398 .It Fn mparse_readfd
399 Parse a file descriptor opened with
400 .Xr open 2
401 or
402 .Fn mparse_open .
403 Pass the associated filename in
404 .Va fname .
405 This function may be called multiple times with different parameters; however,
406 .Fn mparse_reset
407 should be invoked between parses.
408 Declared in
409 .In mandoc.h ,
410 implemented in
411 .Pa read.c .
412 .It Fn mparse_reset
413 Reset a parser so that
414 .Fn mparse_readfd
415 may be used again.
416 Declared in
417 .In mandoc.h ,
418 implemented in
419 .Pa read.c .
420 .It Fn mparse_result
421 Obtain the result of a parse.
422 One of the three pointers will be filled in.
423 Declared in
424 .In mandoc.h ,
425 implemented in
426 .Pa read.c .
427 .It Fn mparse_strerror
428 Return a statically-allocated string representation of an error code.
429 Declared in
430 .In mandoc.h ,
431 implemented in
432 .Pa read.c .
433 .It Fn mparse_strlevel
434 Return a statically-allocated string representation of a level code.
435 Declared in
436 .In mandoc.h ,
437 implemented in
438 .Pa read.c .
439 .El
440 .Ss Variables
441 .Bl -ohang
442 .It Va man_macronames
443 The string representation of a man macro as indexed by
444 .Vt "enum mant" .
445 .It Va mdoc_argnames
446 The string representation of a mdoc macro argument as indexed by
447 .Vt "enum mdocargt" .
448 .It Va mdoc_macronames
449 The string representation of a mdoc macro as indexed by
450 .Vt "enum mdoct" .
451 .El
452 .Sh IMPLEMENTATION NOTES
453 This section consists of structural documentation for
454 .Xr mdoc 7
455 and
456 .Xr man 7
457 syntax trees and strings.
458 .Ss Man and Mdoc Strings
459 Strings may be extracted from mdoc and man meta-data, or from text
460 nodes (MDOC_TEXT and MAN_TEXT, respectively).
461 These strings have special non-printing formatting cues embedded in the
462 text itself, as well as
463 .Xr roff 7
464 escapes preserved from input.
465 Implementing systems will need to handle both situations to produce
466 human-readable text.
467 In general, strings may be assumed to consist of 7-bit ASCII characters.
468 .Pp
469 The following non-printing characters may be embedded in text strings:
470 .Bl -tag -width Ds
471 .It Dv ASCII_NBRSP
472 A non-breaking space character.
473 .It Dv ASCII_HYPH
474 A soft hyphen.
475 .It Dv ASCII_BREAK
476 A breakable zero-width space.
477 .El
478 .Pp
479 Escape characters are also passed verbatim into text strings.
480 An escape character is a sequence of characters beginning with the
481 backslash
482 .Pq Sq \e .
483 To construct human-readable text, these should be intercepted with
484 .Xr mandoc_escape 3
485 and converted with one of the functions described in
486 .Xr mchars_alloc 3 .
487 .Ss Man Abstract Syntax Tree
488 This AST is governed by the ontological rules dictated in
489 .Xr man 7
490 and derives its terminology accordingly.
491 .Pp
492 The AST is composed of
493 .Vt struct man_node
494 nodes with element, root and text types as declared by the
495 .Va type
496 field.
497 Each node also provides its parse point (the
498 .Va line ,
499 .Va sec ,
500 and
501 .Va pos
502 fields), its position in the tree (the
503 .Va parent ,
504 .Va child ,
505 .Va next
506 and
507 .Va prev
508 fields) and some type-specific data.
509 .Pp
510 The tree itself is arranged according to the following normal form,
511 where capitalised non-terminals represent nodes.
512 .Pp
513 .Bl -tag -width "ELEMENTXX" -compact
514 .It ROOT
515 \(<- mnode+
516 .It mnode
517 \(<- ELEMENT | TEXT | BLOCK
518 .It BLOCK
519 \(<- HEAD BODY
520 .It HEAD
521 \(<- mnode*
522 .It BODY
523 \(<- mnode*
524 .It ELEMENT
525 \(<- ELEMENT | TEXT*
526 .It TEXT
527 \(<- [[:ascii:]]*
528 .El
529 .Pp
530 The only elements capable of nesting other elements are those with
531 next-line scope as documented in
532 .Xr man 7 .
533 .Ss Mdoc Abstract Syntax Tree
534 This AST is governed by the ontological
535 rules dictated in
536 .Xr mdoc 7
537 and derives its terminology accordingly.
538 .Qq In-line
539 elements described in
540 .Xr mdoc 7
541 are described simply as
542 .Qq elements .
543 .Pp
544 The AST is composed of
545 .Vt struct mdoc_node
546 nodes with block, head, body, element, root and text types as declared
547 by the
548 .Va type
549 field.
550 Each node also provides its parse point (the
551 .Va line ,
552 .Va sec ,
553 and
554 .Va pos
555 fields), its position in the tree (the
556 .Va parent ,
557 .Va child ,
558 .Va nchild ,
559 .Va next
560 and
561 .Va prev
562 fields) and some type-specific data, in particular, for nodes generated
563 from macros, the generating macro in the
564 .Va tok
565 field.
566 .Pp
567 The tree itself is arranged according to the following normal form,
568 where capitalised non-terminals represent nodes.
569 .Pp
570 .Bl -tag -width "ELEMENTXX" -compact
571 .It ROOT
572 \(<- mnode+
573 .It mnode
574 \(<- BLOCK | ELEMENT | TEXT
575 .It BLOCK
576 \(<- HEAD [TEXT] (BODY [TEXT])+ [TAIL [TEXT]]
577 .It ELEMENT
578 \(<- TEXT*
579 .It HEAD
580 \(<- mnode*
581 .It BODY
582 \(<- mnode* [ENDBODY mnode*]
583 .It TAIL
584 \(<- mnode*
585 .It TEXT
586 \(<- [[:ascii:]]*
587 .El
588 .Pp
589 Of note are the TEXT nodes following the HEAD, BODY and TAIL nodes of
590 the BLOCK production: these refer to punctuation marks.
591 Furthermore, although a TEXT node will generally have a non-zero-length
592 string, in the specific case of
593 .Sq \&.Bd \-literal ,
594 an empty line will produce a zero-length string.
595 Multiple body parts are only found in invocations of
596 .Sq \&Bl \-column ,
597 where a new body introduces a new phrase.
598 .Pp
599 The
600 .Xr mdoc 7
601 syntax tree accommodates broken block structures as well.
602 The ENDBODY node is available to end the formatting associated
603 with a given block before the physical end of that block.
604 It has a non-null
605 .Va end
606 field, is of the BODY
607 .Va type ,
608 has the same
609 .Va tok
610 as the BLOCK it is ending, and has a
611 .Va pending
612 field pointing to that BLOCK's BODY node.
613 It is an indirect child of that BODY node
614 and has no children of its own.
615 .Pp
616 An ENDBODY node is generated when a block ends while one of its child
617 blocks is still open, as in the following example:
618 .Bd -literal -offset indent
619 \&.Ao ao
620 \&.Bo bo ac
621 \&.Ac bc
622 \&.Bc end
623 .Ed
624 .Pp
625 This example results in the following block structure:
626 .Bd -literal -offset indent
627 BLOCK Ao
628     HEAD Ao
629     BODY Ao
630         TEXT ao
631         BLOCK Bo, pending -> Ao
632             HEAD Bo
633             BODY Bo
634                 TEXT bo
635                 TEXT ac
636                 ENDBODY Ao, pending -> Ao
637                 TEXT bc
638 TEXT end
639 .Ed
640 .Pp
641 Here, the formatting of the
642 .Sq \&Ao
643 block extends from TEXT ao to TEXT ac,
644 while the formatting of the
645 .Sq \&Bo
646 block extends from TEXT bo to TEXT bc.
647 It renders as follows in
648 .Fl T Ns Cm ascii
649 mode:
650 .Pp
651 .Dl <ao [bo ac> bc] end
652 .Pp
653 Support for badly-nested blocks is only provided for backward
654 compatibility with some older
655 .Xr mdoc 7
656 implementations.
657 Using badly-nested blocks is
658 .Em strongly discouraged ;
659 for example, the
660 .Fl T Ns Cm html
661 and
662 .Fl T Ns Cm xhtml
663 front-ends to
664 .Xr mandoc 1
665 are unable to render them in any meaningful way.
666 Furthermore, behaviour when encountering badly-nested blocks is not
667 consistent across troff implementations, especially when using multiple
668 levels of badly-nested blocks.
669 .Sh SEE ALSO
670 .Xr mandoc 1 ,
671 .Xr mandoc_escape 3 ,
672 .Xr mandoc_malloc 3 ,
673 .Xr mchars_alloc 3 ,
674 .Xr eqn 7 ,
675 .Xr man 7 ,
676 .Xr mandoc_char 7 ,
677 .Xr mdoc 7 ,
678 .Xr roff 7 ,
679 .Xr tbl 7
680 .Sh AUTHORS
681 The
682 .Nm
683 library was written by
684 .An Kristaps Dzonsons Aq Mt kristaps@bsd.lv .