]> git.cameronkatri.com Git - cgit.git/blob - filters/html-converters/resources/markdown.pl
ui-shared: fix resource leak: free allocation from cgit_hosturl
[cgit.git] / filters / html-converters / resources / markdown.pl
1 #!/usr/bin/perl
2
3 #
4 # Markdown -- A text-to-HTML conversion tool for web writers
5 #
6 # Copyright (c) 2004 John Gruber
7 # <http://daringfireball.net/projects/markdown/>
8 #
9
10
11 package Markdown;
12 require 5.006_000;
13 use strict;
14 use warnings;
15
16 use Digest::MD5 qw(md5_hex);
17 use vars qw($VERSION);
18 $VERSION = '1.0.1';
19 # Tue 14 Dec 2004
20
21
22 #
23 # Global default settings:
24 #
25 my $g_empty_element_suffix = " />"; # Change to ">" for HTML output
26 my $g_tab_width = 4;
27
28
29 #
30 # Globals:
31 #
32
33 # Regex to match balanced [brackets]. See Friedl's
34 # "Mastering Regular Expressions", 2nd Ed., pp. 328-331.
35 my $g_nested_brackets;
36 $g_nested_brackets = qr{
37 (?> # Atomic matching
38 [^\[\]]+ # Anything other than brackets
39 |
40 \[
41 (??{ $g_nested_brackets }) # Recursive set of nested brackets
42 \]
43 )*
44 }x;
45
46
47 # Table of hash values for escaped characters:
48 my %g_escape_table;
49 foreach my $char (split //, '\\`*_{}[]()>#+-.!') {
50 $g_escape_table{$char} = md5_hex($char);
51 }
52
53
54 # Global hashes, used by various utility routines
55 my %g_urls;
56 my %g_titles;
57 my %g_html_blocks;
58
59 # Used to track when we're inside an ordered or unordered list
60 # (see _ProcessListItems() for details):
61 my $g_list_level = 0;
62
63
64 #### Blosxom plug-in interface ##########################################
65
66 # Set $g_blosxom_use_meta to 1 to use Blosxom's meta plug-in to determine
67 # which posts Markdown should process, using a "meta-markup: markdown"
68 # header. If it's set to 0 (the default), Markdown will process all
69 # entries.
70 my $g_blosxom_use_meta = 0;
71
72 sub start { 1; }
73 sub story {
74 my($pkg, $path, $filename, $story_ref, $title_ref, $body_ref) = @_;
75
76 if ( (! $g_blosxom_use_meta) or
77 (defined($meta::markup) and ($meta::markup =~ /^\s*markdown\s*$/i))
78 ){
79 $$body_ref = Markdown($$body_ref);
80 }
81 1;
82 }
83
84
85 #### Movable Type plug-in interface #####################################
86 eval {require MT}; # Test to see if we're running in MT.
87 unless ($@) {
88 require MT;
89 import MT;
90 require MT::Template::Context;
91 import MT::Template::Context;
92
93 eval {require MT::Plugin}; # Test to see if we're running >= MT 3.0.
94 unless ($@) {
95 require MT::Plugin;
96 import MT::Plugin;
97 my $plugin = new MT::Plugin({
98 name => "Markdown",
99 description => "A plain-text-to-HTML formatting plugin. (Version: $VERSION)",
100 doc_link => 'http://daringfireball.net/projects/markdown/'
101 });
102 MT->add_plugin( $plugin );
103 }
104
105 MT::Template::Context->add_container_tag(MarkdownOptions => sub {
106 my $ctx = shift;
107 my $args = shift;
108 my $builder = $ctx->stash('builder');
109 my $tokens = $ctx->stash('tokens');
110
111 if (defined ($args->{'output'}) ) {
112 $ctx->stash('markdown_output', lc $args->{'output'});
113 }
114
115 defined (my $str = $builder->build($ctx, $tokens) )
116 or return $ctx->error($builder->errstr);
117 $str; # return value
118 });
119
120 MT->add_text_filter('markdown' => {
121 label => 'Markdown',
122 docs => 'http://daringfireball.net/projects/markdown/',
123 on_format => sub {
124 my $text = shift;
125 my $ctx = shift;
126 my $raw = 0;
127 if (defined $ctx) {
128 my $output = $ctx->stash('markdown_output');
129 if (defined $output && $output =~ m/^html/i) {
130 $g_empty_element_suffix = ">";
131 $ctx->stash('markdown_output', '');
132 }
133 elsif (defined $output && $output eq 'raw') {
134 $raw = 1;
135 $ctx->stash('markdown_output', '');
136 }
137 else {
138 $raw = 0;
139 $g_empty_element_suffix = " />";
140 }
141 }
142 $text = $raw ? $text : Markdown($text);
143 $text;
144 },
145 });
146
147 # If SmartyPants is loaded, add a combo Markdown/SmartyPants text filter:
148 my $smartypants;
149
150 {
151 no warnings "once";
152 $smartypants = $MT::Template::Context::Global_filters{'smarty_pants'};
153 }
154
155 if ($smartypants) {
156 MT->add_text_filter('markdown_with_smartypants' => {
157 label => 'Markdown With SmartyPants',
158 docs => 'http://daringfireball.net/projects/markdown/',
159 on_format => sub {
160 my $text = shift;
161 my $ctx = shift;
162 if (defined $ctx) {
163 my $output = $ctx->stash('markdown_output');
164 if (defined $output && $output eq 'html') {
165 $g_empty_element_suffix = ">";
166 }
167 else {
168 $g_empty_element_suffix = " />";
169 }
170 }
171 $text = Markdown($text);
172 $text = $smartypants->($text, '1');
173 },
174 });
175 }
176 }
177 else {
178 #### BBEdit/command-line text filter interface ##########################
179 # Needs to be hidden from MT (and Blosxom when running in static mode).
180
181 # We're only using $blosxom::version once; tell Perl not to warn us:
182 no warnings 'once';
183 unless ( defined($blosxom::version) ) {
184 use warnings;
185
186 #### Check for command-line switches: #################
187 my %cli_opts;
188 use Getopt::Long;
189 Getopt::Long::Configure('pass_through');
190 GetOptions(\%cli_opts,
191 'version',
192 'shortversion',
193 'html4tags',
194 );
195 if ($cli_opts{'version'}) { # Version info
196 print "\nThis is Markdown, version $VERSION.\n";
197 print "Copyright 2004 John Gruber\n";
198 print "http://daringfireball.net/projects/markdown/\n\n";
199 exit 0;
200 }
201 if ($cli_opts{'shortversion'}) { # Just the version number string.
202 print $VERSION;
203 exit 0;
204 }
205 if ($cli_opts{'html4tags'}) { # Use HTML tag style instead of XHTML
206 $g_empty_element_suffix = ">";
207 }
208
209
210 #### Process incoming text: ###########################
211 my $text;
212 {
213 local $/; # Slurp the whole file
214 $text = <>;
215 }
216 print <<'EOT';
217 <style>
218 .markdown-body {
219 font-size: 14px;
220 line-height: 1.6;
221 overflow: hidden;
222 }
223 .markdown-body>*:first-child {
224 margin-top: 0 !important;
225 }
226 .markdown-body>*:last-child {
227 margin-bottom: 0 !important;
228 }
229 .markdown-body a.absent {
230 color: #c00;
231 }
232 .markdown-body a.anchor {
233 display: block;
234 padding-left: 30px;
235 margin-left: -30px;
236 cursor: pointer;
237 position: absolute;
238 top: 0;
239 left: 0;
240 bottom: 0;
241 }
242 .markdown-body h1, .markdown-body h2, .markdown-body h3, .markdown-body h4, .markdown-body h5, .markdown-body h6 {
243 margin: 20px 0 10px;
244 padding: 0;
245 font-weight: bold;
246 -webkit-font-smoothing: antialiased;
247 cursor: text;
248 position: relative;
249 }
250 .markdown-body h1 .mini-icon-link, .markdown-body h2 .mini-icon-link, .markdown-body h3 .mini-icon-link, .markdown-body h4 .mini-icon-link, .markdown-body h5 .mini-icon-link, .markdown-body h6 .mini-icon-link {
251 display: none;
252 color: #000;
253 }
254 .markdown-body h1:hover a.anchor, .markdown-body h2:hover a.anchor, .markdown-body h3:hover a.anchor, .markdown-body h4:hover a.anchor, .markdown-body h5:hover a.anchor, .markdown-body h6:hover a.anchor {
255 text-decoration: none;
256 line-height: 1;
257 padding-left: 0;
258 margin-left: -22px;
259 top: 15%}
260 .markdown-body h1:hover a.anchor .mini-icon-link, .markdown-body h2:hover a.anchor .mini-icon-link, .markdown-body h3:hover a.anchor .mini-icon-link, .markdown-body h4:hover a.anchor .mini-icon-link, .markdown-body h5:hover a.anchor .mini-icon-link, .markdown-body h6:hover a.anchor .mini-icon-link {
261 display: inline-block;
262 }
263 .markdown-body h1 tt, .markdown-body h1 code, .markdown-body h2 tt, .markdown-body h2 code, .markdown-body h3 tt, .markdown-body h3 code, .markdown-body h4 tt, .markdown-body h4 code, .markdown-body h5 tt, .markdown-body h5 code, .markdown-body h6 tt, .markdown-body h6 code {
264 font-size: inherit;
265 }
266 .markdown-body h1 {
267 font-size: 28px;
268 color: #000;
269 }
270 .markdown-body h2 {
271 font-size: 24px;
272 border-bottom: 1px solid #ccc;
273 color: #000;
274 }
275 .markdown-body h3 {
276 font-size: 18px;
277 }
278 .markdown-body h4 {
279 font-size: 16px;
280 }
281 .markdown-body h5 {
282 font-size: 14px;
283 }
284 .markdown-body h6 {
285 color: #777;
286 font-size: 14px;
287 }
288 .markdown-body p, .markdown-body blockquote, .markdown-body ul, .markdown-body ol, .markdown-body dl, .markdown-body table, .markdown-body pre {
289 margin: 15px 0;
290 }
291 .markdown-body hr {
292 background: transparent url("/dirty-shade.png") repeat-x 0 0;
293 border: 0 none;
294 color: #ccc;
295 height: 4px;
296 padding: 0;
297 }
298 .markdown-body>h2:first-child, .markdown-body>h1:first-child, .markdown-body>h1:first-child+h2, .markdown-body>h3:first-child, .markdown-body>h4:first-child, .markdown-body>h5:first-child, .markdown-body>h6:first-child {
299 margin-top: 0;
300 padding-top: 0;
301 }
302 .markdown-body a:first-child h1, .markdown-body a:first-child h2, .markdown-body a:first-child h3, .markdown-body a:first-child h4, .markdown-body a:first-child h5, .markdown-body a:first-child h6 {
303 margin-top: 0;
304 padding-top: 0;
305 }
306 .markdown-body h1+p, .markdown-body h2+p, .markdown-body h3+p, .markdown-body h4+p, .markdown-body h5+p, .markdown-body h6+p {
307 margin-top: 0;
308 }
309 .markdown-body li p.first {
310 display: inline-block;
311 }
312 .markdown-body ul, .markdown-body ol {
313 padding-left: 30px;
314 }
315 .markdown-body ul.no-list, .markdown-body ol.no-list {
316 list-style-type: none;
317 padding: 0;
318 }
319 .markdown-body ul li>:first-child, .markdown-body ul li ul:first-of-type, .markdown-body ul li ol:first-of-type, .markdown-body ol li>:first-child, .markdown-body ol li ul:first-of-type, .markdown-body ol li ol:first-of-type {
320 margin-top: 0px;
321 }
322 .markdown-body ul li p:last-of-type, .markdown-body ol li p:last-of-type {
323 margin-bottom: 0;
324 }
325 .markdown-body ul ul, .markdown-body ul ol, .markdown-body ol ol, .markdown-body ol ul {
326 margin-bottom: 0;
327 }
328 .markdown-body dl {
329 padding: 0;
330 }
331 .markdown-body dl dt {
332 font-size: 14px;
333 font-weight: bold;
334 font-style: italic;
335 padding: 0;
336 margin: 15px 0 5px;
337 }
338 .markdown-body dl dt:first-child {
339 padding: 0;
340 }
341 .markdown-body dl dt>:first-child {
342 margin-top: 0px;
343 }
344 .markdown-body dl dt>:last-child {
345 margin-bottom: 0px;
346 }
347 .markdown-body dl dd {
348 margin: 0 0 15px;
349 padding: 0 15px;
350 }
351 .markdown-body dl dd>:first-child {
352 margin-top: 0px;
353 }
354 .markdown-body dl dd>:last-child {
355 margin-bottom: 0px;
356 }
357 .markdown-body blockquote {
358 border-left: 4px solid #DDD;
359 padding: 0 15px;
360 color: #777;
361 }
362 .markdown-body blockquote>:first-child {
363 margin-top: 0px;
364 }
365 .markdown-body blockquote>:last-child {
366 margin-bottom: 0px;
367 }
368 .markdown-body table th {
369 font-weight: bold;
370 }
371 .markdown-body table th, .markdown-body table td {
372 border: 1px solid #ccc;
373 padding: 6px 13px;
374 }
375 .markdown-body table tr {
376 border-top: 1px solid #ccc;
377 background-color: #fff;
378 }
379 .markdown-body table tr:nth-child(2n) {
380 background-color: #f8f8f8;
381 }
382 .markdown-body img {
383 max-width: 100%;
384 -moz-box-sizing: border-box;
385 box-sizing: border-box;
386 }
387 .markdown-body span.frame {
388 display: block;
389 overflow: hidden;
390 }
391 .markdown-body span.frame>span {
392 border: 1px solid #ddd;
393 display: block;
394 float: left;
395 overflow: hidden;
396 margin: 13px 0 0;
397 padding: 7px;
398 width: auto;
399 }
400 .markdown-body span.frame span img {
401 display: block;
402 float: left;
403 }
404 .markdown-body span.frame span span {
405 clear: both;
406 color: #333;
407 display: block;
408 padding: 5px 0 0;
409 }
410 .markdown-body span.align-center {
411 display: block;
412 overflow: hidden;
413 clear: both;
414 }
415 .markdown-body span.align-center>span {
416 display: block;
417 overflow: hidden;
418 margin: 13px auto 0;
419 text-align: center;
420 }
421 .markdown-body span.align-center span img {
422 margin: 0 auto;
423 text-align: center;
424 }
425 .markdown-body span.align-right {
426 display: block;
427 overflow: hidden;
428 clear: both;
429 }
430 .markdown-body span.align-right>span {
431 display: block;
432 overflow: hidden;
433 margin: 13px 0 0;
434 text-align: right;
435 }
436 .markdown-body span.align-right span img {
437 margin: 0;
438 text-align: right;
439 }
440 .markdown-body span.float-left {
441 display: block;
442 margin-right: 13px;
443 overflow: hidden;
444 float: left;
445 }
446 .markdown-body span.float-left span {
447 margin: 13px 0 0;
448 }
449 .markdown-body span.float-right {
450 display: block;
451 margin-left: 13px;
452 overflow: hidden;
453 float: right;
454 }
455 .markdown-body span.float-right>span {
456 display: block;
457 overflow: hidden;
458 margin: 13px auto 0;
459 text-align: right;
460 }
461 .markdown-body code, .markdown-body tt {
462 margin: 0 2px;
463 padding: 0px 5px;
464 border: 1px solid #eaeaea;
465 background-color: #f8f8f8;
466 border-radius: 3px;
467 }
468 .markdown-body code {
469 white-space: nowrap;
470 }
471 .markdown-body pre>code {
472 margin: 0;
473 padding: 0;
474 white-space: pre;
475 border: none;
476 background: transparent;
477 }
478 .markdown-body .highlight pre, .markdown-body pre {
479 background-color: #f8f8f8;
480 border: 1px solid #ccc;
481 font-size: 13px;
482 line-height: 19px;
483 overflow: auto;
484 padding: 6px 10px;
485 border-radius: 3px;
486 }
487 .markdown-body pre code, .markdown-body pre tt {
488 margin: 0;
489 padding: 0;
490 background-color: transparent;
491 border: none;
492 }
493 </style>
494 EOT
495 print "<div class='markdown-body'>";
496 print Markdown($text);
497 print "</div>";
498 }
499 }
500
501
502
503 sub Markdown {
504 #
505 # Main function. The order in which other subs are called here is
506 # essential. Link and image substitutions need to happen before
507 # _EscapeSpecialChars(), so that any *'s or _'s in the <a>
508 # and <img> tags get encoded.
509 #
510 my $text = shift;
511
512 # Clear the global hashes. If we don't clear these, you get conflicts
513 # from other articles when generating a page which contains more than
514 # one article (e.g. an index page that shows the N most recent
515 # articles):
516 %g_urls = ();
517 %g_titles = ();
518 %g_html_blocks = ();
519
520
521 # Standardize line endings:
522 $text =~ s{\r\n}{\n}g; # DOS to Unix
523 $text =~ s{\r}{\n}g; # Mac to Unix
524
525 # Make sure $text ends with a couple of newlines:
526 $text .= "\n\n";
527
528 # Convert all tabs to spaces.
529 $text = _Detab($text);
530
531 # Strip any lines consisting only of spaces and tabs.
532 # This makes subsequent regexen easier to write, because we can
533 # match consecutive blank lines with /\n+/ instead of something
534 # contorted like /[ \t]*\n+/ .
535 $text =~ s/^[ \t]+$//mg;
536
537 # Turn block-level HTML blocks into hash entries
538 $text = _HashHTMLBlocks($text);
539
540 # Strip link definitions, store in hashes.
541 $text = _StripLinkDefinitions($text);
542
543 $text = _RunBlockGamut($text);
544
545 $text = _UnescapeSpecialChars($text);
546
547 return $text . "\n";
548 }
549
550
551 sub _StripLinkDefinitions {
552 #
553 # Strips link definitions from text, stores the URLs and titles in
554 # hash references.
555 #
556 my $text = shift;
557 my $less_than_tab = $g_tab_width - 1;
558
559 # Link defs are in the form: ^[id]: url "optional title"
560 while ($text =~ s{
561 ^[ ]{0,$less_than_tab}\[(.+)\]: # id = $1
562 [ \t]*
563 \n? # maybe *one* newline
564 [ \t]*
565 <?(\S+?)>? # url = $2
566 [ \t]*
567 \n? # maybe one newline
568 [ \t]*
569 (?:
570 (?<=\s) # lookbehind for whitespace
571 ["(]
572 (.+?) # title = $3
573 [")]
574 [ \t]*
575 )? # title is optional
576 (?:\n+|\Z)
577 }
578 {}mx) {
579 $g_urls{lc $1} = _EncodeAmpsAndAngles( $2 ); # Link IDs are case-insensitive
580 if ($3) {
581 $g_titles{lc $1} = $3;
582 $g_titles{lc $1} =~ s/"/&quot;/g;
583 }
584 }
585
586 return $text;
587 }
588
589
590 sub _HashHTMLBlocks {
591 my $text = shift;
592 my $less_than_tab = $g_tab_width - 1;
593
594 # Hashify HTML blocks:
595 # We only want to do this for block-level HTML tags, such as headers,
596 # lists, and tables. That's because we still want to wrap <p>s around
597 # "paragraphs" that are wrapped in non-block-level tags, such as anchors,
598 # phrase emphasis, and spans. The list of tags we're looking for is
599 # hard-coded:
600 my $block_tags_a = qr/p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math|ins|del/;
601 my $block_tags_b = qr/p|div|h[1-6]|blockquote|pre|table|dl|ol|ul|script|noscript|form|fieldset|iframe|math/;
602
603 # First, look for nested blocks, e.g.:
604 # <div>
605 # <div>
606 # tags for inner block must be indented.
607 # </div>
608 # </div>
609 #
610 # The outermost tags must start at the left margin for this to match, and
611 # the inner nested divs must be indented.
612 # We need to do this before the next, more liberal match, because the next
613 # match will start at the first `<div>` and stop at the first `</div>`.
614 $text =~ s{
615 ( # save in $1
616 ^ # start of line (with /m)
617 <($block_tags_a) # start tag = $2
618 \b # word break
619 (.*\n)*? # any number of lines, minimally matching
620 </\2> # the matching end tag
621 [ \t]* # trailing spaces/tabs
622 (?=\n+|\Z) # followed by a newline or end of document
623 )
624 }{
625 my $key = md5_hex($1);
626 $g_html_blocks{$key} = $1;
627 "\n\n" . $key . "\n\n";
628 }egmx;
629
630
631 #
632 # Now match more liberally, simply from `\n<tag>` to `</tag>\n`
633 #
634 $text =~ s{
635 ( # save in $1
636 ^ # start of line (with /m)
637 <($block_tags_b) # start tag = $2
638 \b # word break
639 (.*\n)*? # any number of lines, minimally matching
640 .*</\2> # the matching end tag
641 [ \t]* # trailing spaces/tabs
642 (?=\n+|\Z) # followed by a newline or end of document
643 )
644 }{
645 my $key = md5_hex($1);
646 $g_html_blocks{$key} = $1;
647 "\n\n" . $key . "\n\n";
648 }egmx;
649 # Special case just for <hr />. It was easier to make a special case than
650 # to make the other regex more complicated.
651 $text =~ s{
652 (?:
653 (?<=\n\n) # Starting after a blank line
654 | # or
655 \A\n? # the beginning of the doc
656 )
657 ( # save in $1
658 [ ]{0,$less_than_tab}
659 <(hr) # start tag = $2
660 \b # word break
661 ([^<>])*? #
662 /?> # the matching end tag
663 [ \t]*
664 (?=\n{2,}|\Z) # followed by a blank line or end of document
665 )
666 }{
667 my $key = md5_hex($1);
668 $g_html_blocks{$key} = $1;
669 "\n\n" . $key . "\n\n";
670 }egx;
671
672 # Special case for standalone HTML comments:
673 $text =~ s{
674 (?:
675 (?<=\n\n) # Starting after a blank line
676 | # or
677 \A\n? # the beginning of the doc
678 )
679 ( # save in $1
680 [ ]{0,$less_than_tab}
681 (?s:
682 <!
683 (--.*?--\s*)+
684 >
685 )
686 [ \t]*
687 (?=\n{2,}|\Z) # followed by a blank line or end of document
688 )
689 }{
690 my $key = md5_hex($1);
691 $g_html_blocks{$key} = $1;
692 "\n\n" . $key . "\n\n";
693 }egx;
694
695
696 return $text;
697 }
698
699
700 sub _RunBlockGamut {
701 #
702 # These are all the transformations that form block-level
703 # tags like paragraphs, headers, and list items.
704 #
705 my $text = shift;
706
707 $text = _DoHeaders($text);
708
709 # Do Horizontal Rules:
710 $text =~ s{^[ ]{0,2}([ ]?\*[ ]?){3,}[ \t]*$}{\n<hr$g_empty_element_suffix\n}gmx;
711 $text =~ s{^[ ]{0,2}([ ]? -[ ]?){3,}[ \t]*$}{\n<hr$g_empty_element_suffix\n}gmx;
712 $text =~ s{^[ ]{0,2}([ ]? _[ ]?){3,}[ \t]*$}{\n<hr$g_empty_element_suffix\n}gmx;
713
714 $text = _DoLists($text);
715
716 $text = _DoCodeBlocks($text);
717
718 $text = _DoBlockQuotes($text);
719
720 # We already ran _HashHTMLBlocks() before, in Markdown(), but that
721 # was to escape raw HTML in the original Markdown source. This time,
722 # we're escaping the markup we've just created, so that we don't wrap
723 # <p> tags around block-level tags.
724 $text = _HashHTMLBlocks($text);
725
726 $text = _FormParagraphs($text);
727
728 return $text;
729 }
730
731
732 sub _RunSpanGamut {
733 #
734 # These are all the transformations that occur *within* block-level
735 # tags like paragraphs, headers, and list items.
736 #
737 my $text = shift;
738
739 $text = _DoCodeSpans($text);
740
741 $text = _EscapeSpecialChars($text);
742
743 # Process anchor and image tags. Images must come first,
744 # because ![foo][f] looks like an anchor.
745 $text = _DoImages($text);
746 $text = _DoAnchors($text);
747
748 # Make links out of things like `<http://example.com/>`
749 # Must come after _DoAnchors(), because you can use < and >
750 # delimiters in inline links like [this](<url>).
751 $text = _DoAutoLinks($text);
752
753 $text = _EncodeAmpsAndAngles($text);
754
755 $text = _DoItalicsAndBold($text);
756
757 # Do hard breaks:
758 $text =~ s/ {2,}\n/ <br$g_empty_element_suffix\n/g;
759
760 return $text;
761 }
762
763
764 sub _EscapeSpecialChars {
765 my $text = shift;
766 my $tokens ||= _TokenizeHTML($text);
767
768 $text = ''; # rebuild $text from the tokens
769 # my $in_pre = 0; # Keep track of when we're inside <pre> or <code> tags.
770 # my $tags_to_skip = qr!<(/?)(?:pre|code|kbd|script|math)[\s>]!;
771
772 foreach my $cur_token (@$tokens) {
773 if ($cur_token->[0] eq "tag") {
774 # Within tags, encode * and _ so they don't conflict
775 # with their use in Markdown for italics and strong.
776 # We're replacing each such character with its
777 # corresponding MD5 checksum value; this is likely
778 # overkill, but it should prevent us from colliding
779 # with the escape values by accident.
780 $cur_token->[1] =~ s! \* !$g_escape_table{'*'}!gx;
781 $cur_token->[1] =~ s! _ !$g_escape_table{'_'}!gx;
782 $text .= $cur_token->[1];
783 } else {
784 my $t = $cur_token->[1];
785 $t = _EncodeBackslashEscapes($t);
786 $text .= $t;
787 }
788 }
789 return $text;
790 }
791
792
793 sub _DoAnchors {
794 #
795 # Turn Markdown link shortcuts into XHTML <a> tags.
796 #
797 my $text = shift;
798
799 #
800 # First, handle reference-style links: [link text] [id]
801 #
802 $text =~ s{
803 ( # wrap whole match in $1
804 \[
805 ($g_nested_brackets) # link text = $2
806 \]
807
808 [ ]? # one optional space
809 (?:\n[ ]*)? # one optional newline followed by spaces
810
811 \[
812 (.*?) # id = $3
813 \]
814 )
815 }{
816 my $result;
817 my $whole_match = $1;
818 my $link_text = $2;
819 my $link_id = lc $3;
820
821 if ($link_id eq "") {
822 $link_id = lc $link_text; # for shortcut links like [this][].
823 }
824
825 if (defined $g_urls{$link_id}) {
826 my $url = $g_urls{$link_id};
827 $url =~ s! \* !$g_escape_table{'*'}!gx; # We've got to encode these to avoid
828 $url =~ s! _ !$g_escape_table{'_'}!gx; # conflicting with italics/bold.
829 $result = "<a href=\"$url\"";
830 if ( defined $g_titles{$link_id} ) {
831 my $title = $g_titles{$link_id};
832 $title =~ s! \* !$g_escape_table{'*'}!gx;
833 $title =~ s! _ !$g_escape_table{'_'}!gx;
834 $result .= " title=\"$title\"";
835 }
836 $result .= ">$link_text</a>";
837 }
838 else {
839 $result = $whole_match;
840 }
841 $result;
842 }xsge;
843
844 #
845 # Next, inline-style links: [link text](url "optional title")
846 #
847 $text =~ s{
848 ( # wrap whole match in $1
849 \[
850 ($g_nested_brackets) # link text = $2
851 \]
852 \( # literal paren
853 [ \t]*
854 <?(.*?)>? # href = $3
855 [ \t]*
856 ( # $4
857 (['"]) # quote char = $5
858 (.*?) # Title = $6
859 \5 # matching quote
860 )? # title is optional
861 \)
862 )
863 }{
864 my $result;
865 my $whole_match = $1;
866 my $link_text = $2;
867 my $url = $3;
868 my $title = $6;
869
870 $url =~ s! \* !$g_escape_table{'*'}!gx; # We've got to encode these to avoid
871 $url =~ s! _ !$g_escape_table{'_'}!gx; # conflicting with italics/bold.
872 $result = "<a href=\"$url\"";
873
874 if (defined $title) {
875 $title =~ s/"/&quot;/g;
876 $title =~ s! \* !$g_escape_table{'*'}!gx;
877 $title =~ s! _ !$g_escape_table{'_'}!gx;
878 $result .= " title=\"$title\"";
879 }
880
881 $result .= ">$link_text</a>";
882
883 $result;
884 }xsge;
885
886 return $text;
887 }
888
889
890 sub _DoImages {
891 #
892 # Turn Markdown image shortcuts into <img> tags.
893 #
894 my $text = shift;
895
896 #
897 # First, handle reference-style labeled images: ![alt text][id]
898 #
899 $text =~ s{
900 ( # wrap whole match in $1
901 !\[
902 (.*?) # alt text = $2
903 \]
904
905 [ ]? # one optional space
906 (?:\n[ ]*)? # one optional newline followed by spaces
907
908 \[
909 (.*?) # id = $3
910 \]
911
912 )
913 }{
914 my $result;
915 my $whole_match = $1;
916 my $alt_text = $2;
917 my $link_id = lc $3;
918
919 if ($link_id eq "") {
920 $link_id = lc $alt_text; # for shortcut links like ![this][].
921 }
922
923 $alt_text =~ s/"/&quot;/g;
924 if (defined $g_urls{$link_id}) {
925 my $url = $g_urls{$link_id};
926 $url =~ s! \* !$g_escape_table{'*'}!gx; # We've got to encode these to avoid
927 $url =~ s! _ !$g_escape_table{'_'}!gx; # conflicting with italics/bold.
928 $result = "<img src=\"$url\" alt=\"$alt_text\"";
929 if (defined $g_titles{$link_id}) {
930 my $title = $g_titles{$link_id};
931 $title =~ s! \* !$g_escape_table{'*'}!gx;
932 $title =~ s! _ !$g_escape_table{'_'}!gx;
933 $result .= " title=\"$title\"";
934 }
935 $result .= $g_empty_element_suffix;
936 }
937 else {
938 # If there's no such link ID, leave intact:
939 $result = $whole_match;
940 }
941
942 $result;
943 }xsge;
944
945 #
946 # Next, handle inline images: ![alt text](url "optional title")
947 # Don't forget: encode * and _
948
949 $text =~ s{
950 ( # wrap whole match in $1
951 !\[
952 (.*?) # alt text = $2
953 \]
954 \( # literal paren
955 [ \t]*
956 <?(\S+?)>? # src url = $3
957 [ \t]*
958 ( # $4
959 (['"]) # quote char = $5
960 (.*?) # title = $6
961 \5 # matching quote
962 [ \t]*
963 )? # title is optional
964 \)
965 )
966 }{
967 my $result;
968 my $whole_match = $1;
969 my $alt_text = $2;
970 my $url = $3;
971 my $title = '';
972 if (defined($6)) {
973 $title = $6;
974 }
975
976 $alt_text =~ s/"/&quot;/g;
977 $title =~ s/"/&quot;/g;
978 $url =~ s! \* !$g_escape_table{'*'}!gx; # We've got to encode these to avoid
979 $url =~ s! _ !$g_escape_table{'_'}!gx; # conflicting with italics/bold.
980 $result = "<img src=\"$url\" alt=\"$alt_text\"";
981 if (defined $title) {
982 $title =~ s! \* !$g_escape_table{'*'}!gx;
983 $title =~ s! _ !$g_escape_table{'_'}!gx;
984 $result .= " title=\"$title\"";
985 }
986 $result .= $g_empty_element_suffix;
987
988 $result;
989 }xsge;
990
991 return $text;
992 }
993
994
995 sub _DoHeaders {
996 my $text = shift;
997
998 # Setext-style headers:
999 # Header 1
1000 # ========
1001 #
1002 # Header 2
1003 # --------
1004 #
1005 $text =~ s{ ^(.+)[ \t]*\n=+[ \t]*\n+ }{
1006 "<h1>" . _RunSpanGamut($1) . "</h1>\n\n";
1007 }egmx;
1008
1009 $text =~ s{ ^(.+)[ \t]*\n-+[ \t]*\n+ }{
1010 "<h2>" . _RunSpanGamut($1) . "</h2>\n\n";
1011 }egmx;
1012
1013
1014 # atx-style headers:
1015 # # Header 1
1016 # ## Header 2
1017 # ## Header 2 with closing hashes ##
1018 # ...
1019 # ###### Header 6
1020 #
1021 $text =~ s{
1022 ^(\#{1,6}) # $1 = string of #'s
1023 [ \t]*
1024 (.+?) # $2 = Header text
1025 [ \t]*
1026 \#* # optional closing #'s (not counted)
1027 \n+
1028 }{
1029 my $h_level = length($1);
1030 "<h$h_level>" . _RunSpanGamut($2) . "</h$h_level>\n\n";
1031 }egmx;
1032
1033 return $text;
1034 }
1035
1036
1037 sub _DoLists {
1038 #
1039 # Form HTML ordered (numbered) and unordered (bulleted) lists.
1040 #
1041 my $text = shift;
1042 my $less_than_tab = $g_tab_width - 1;
1043
1044 # Re-usable patterns to match list item bullets and number markers:
1045 my $marker_ul = qr/[*+-]/;
1046 my $marker_ol = qr/\d+[.]/;
1047 my $marker_any = qr/(?:$marker_ul|$marker_ol)/;
1048
1049 # Re-usable pattern to match any entirel ul or ol list:
1050 my $whole_list = qr{
1051 ( # $1 = whole list
1052 ( # $2
1053 [ ]{0,$less_than_tab}
1054 (${marker_any}) # $3 = first list item marker
1055 [ \t]+
1056 )
1057 (?s:.+?)
1058 ( # $4
1059 \z
1060 |
1061 \n{2,}
1062 (?=\S)
1063 (?! # Negative lookahead for another list item marker
1064 [ \t]*
1065 ${marker_any}[ \t]+
1066 )
1067 )
1068 )
1069 }mx;
1070
1071 # We use a different prefix before nested lists than top-level lists.
1072 # See extended comment in _ProcessListItems().
1073 #
1074 # Note: There's a bit of duplication here. My original implementation
1075 # created a scalar regex pattern as the conditional result of the test on
1076 # $g_list_level, and then only ran the $text =~ s{...}{...}egmx
1077 # substitution once, using the scalar as the pattern. This worked,
1078 # everywhere except when running under MT on my hosting account at Pair
1079 # Networks. There, this caused all rebuilds to be killed by the reaper (or
1080 # perhaps they crashed, but that seems incredibly unlikely given that the
1081 # same script on the same server ran fine *except* under MT. I've spent
1082 # more time trying to figure out why this is happening than I'd like to
1083 # admit. My only guess, backed up by the fact that this workaround works,
1084 # is that Perl optimizes the substition when it can figure out that the
1085 # pattern will never change, and when this optimization isn't on, we run
1086 # afoul of the reaper. Thus, the slightly redundant code to that uses two
1087 # static s/// patterns rather than one conditional pattern.
1088
1089 if ($g_list_level) {
1090 $text =~ s{
1091 ^
1092 $whole_list
1093 }{
1094 my $list = $1;
1095 my $list_type = ($3 =~ m/$marker_ul/) ? "ul" : "ol";
1096 # Turn double returns into triple returns, so that we can make a
1097 # paragraph for the last item in a list, if necessary:
1098 $list =~ s/\n{2,}/\n\n\n/g;
1099 my $result = _ProcessListItems($list, $marker_any);
1100 $result = "<$list_type>\n" . $result . "</$list_type>\n";
1101 $result;
1102 }egmx;
1103 }
1104 else {
1105 $text =~ s{
1106 (?:(?<=\n\n)|\A\n?)
1107 $whole_list
1108 }{
1109 my $list = $1;
1110 my $list_type = ($3 =~ m/$marker_ul/) ? "ul" : "ol";
1111 # Turn double returns into triple returns, so that we can make a
1112 # paragraph for the last item in a list, if necessary:
1113 $list =~ s/\n{2,}/\n\n\n/g;
1114 my $result = _ProcessListItems($list, $marker_any);
1115 $result = "<$list_type>\n" . $result . "</$list_type>\n";
1116 $result;
1117 }egmx;
1118 }
1119
1120
1121 return $text;
1122 }
1123
1124
1125 sub _ProcessListItems {
1126 #
1127 # Process the contents of a single ordered or unordered list, splitting it
1128 # into individual list items.
1129 #
1130
1131 my $list_str = shift;
1132 my $marker_any = shift;
1133
1134
1135 # The $g_list_level global keeps track of when we're inside a list.
1136 # Each time we enter a list, we increment it; when we leave a list,
1137 # we decrement. If it's zero, we're not in a list anymore.
1138 #
1139 # We do this because when we're not inside a list, we want to treat
1140 # something like this:
1141 #
1142 # I recommend upgrading to version
1143 # 8. Oops, now this line is treated
1144 # as a sub-list.
1145 #
1146 # As a single paragraph, despite the fact that the second line starts
1147 # with a digit-period-space sequence.
1148 #
1149 # Whereas when we're inside a list (or sub-list), that line will be
1150 # treated as the start of a sub-list. What a kludge, huh? This is
1151 # an aspect of Markdown's syntax that's hard to parse perfectly
1152 # without resorting to mind-reading. Perhaps the solution is to
1153 # change the syntax rules such that sub-lists must start with a
1154 # starting cardinal number; e.g. "1." or "a.".
1155
1156 $g_list_level++;
1157
1158 # trim trailing blank lines:
1159 $list_str =~ s/\n{2,}\z/\n/;
1160
1161
1162 $list_str =~ s{
1163 (\n)? # leading line = $1
1164 (^[ \t]*) # leading whitespace = $2
1165 ($marker_any) [ \t]+ # list marker = $3
1166 ((?s:.+?) # list item text = $4
1167 (\n{1,2}))
1168 (?= \n* (\z | \2 ($marker_any) [ \t]+))
1169 }{
1170 my $item = $4;
1171 my $leading_line = $1;
1172 my $leading_space = $2;
1173
1174 if ($leading_line or ($item =~ m/\n{2,}/)) {
1175 $item = _RunBlockGamut(_Outdent($item));
1176 }
1177 else {
1178 # Recursion for sub-lists:
1179 $item = _DoLists(_Outdent($item));
1180 chomp $item;
1181 $item = _RunSpanGamut($item);
1182 }
1183
1184 "<li>" . $item . "</li>\n";
1185 }egmx;
1186
1187 $g_list_level--;
1188 return $list_str;
1189 }
1190
1191
1192
1193 sub _DoCodeBlocks {
1194 #
1195 # Process Markdown `<pre><code>` blocks.
1196 #
1197
1198 my $text = shift;
1199
1200 $text =~ s{
1201 (?:\n\n|\A)
1202 ( # $1 = the code block -- one or more lines, starting with a space/tab
1203 (?:
1204 (?:[ ]{$g_tab_width} | \t) # Lines must start with a tab or a tab-width of spaces
1205 .*\n+
1206 )+
1207 )
1208 ((?=^[ ]{0,$g_tab_width}\S)|\Z) # Lookahead for non-space at line-start, or end of doc
1209 }{
1210 my $codeblock = $1;
1211 my $result; # return value
1212
1213 $codeblock = _EncodeCode(_Outdent($codeblock));
1214 $codeblock = _Detab($codeblock);
1215 $codeblock =~ s/\A\n+//; # trim leading newlines
1216 $codeblock =~ s/\s+\z//; # trim trailing whitespace
1217
1218 $result = "\n\n<pre><code>" . $codeblock . "\n</code></pre>\n\n";
1219
1220 $result;
1221 }egmx;
1222
1223 return $text;
1224 }
1225
1226
1227 sub _DoCodeSpans {
1228 #
1229 # * Backtick quotes are used for <code></code> spans.
1230 #
1231 # * You can use multiple backticks as the delimiters if you want to
1232 # include literal backticks in the code span. So, this input:
1233 #
1234 # Just type ``foo `bar` baz`` at the prompt.
1235 #
1236 # Will translate to:
1237 #
1238 # <p>Just type <code>foo `bar` baz</code> at the prompt.</p>
1239 #
1240 # There's no arbitrary limit to the number of backticks you
1241 # can use as delimters. If you need three consecutive backticks
1242 # in your code, use four for delimiters, etc.
1243 #
1244 # * You can use spaces to get literal backticks at the edges:
1245 #
1246 # ... type `` `bar` `` ...
1247 #
1248 # Turns to:
1249 #
1250 # ... type <code>`bar`</code> ...
1251 #
1252
1253 my $text = shift;
1254
1255 $text =~ s@
1256 (`+) # $1 = Opening run of `
1257 (.+?) # $2 = The code block
1258 (?<!`)
1259 \1 # Matching closer
1260 (?!`)
1261 @
1262 my $c = "$2";
1263 $c =~ s/^[ \t]*//g; # leading whitespace
1264 $c =~ s/[ \t]*$//g; # trailing whitespace
1265 $c = _EncodeCode($c);
1266 "<code>$c</code>";
1267 @egsx;
1268
1269 return $text;
1270 }
1271
1272
1273 sub _EncodeCode {
1274 #
1275 # Encode/escape certain characters inside Markdown code runs.
1276 # The point is that in code, these characters are literals,
1277 # and lose their special Markdown meanings.
1278 #
1279 local $_ = shift;
1280
1281 # Encode all ampersands; HTML entities are not
1282 # entities within a Markdown code span.
1283 s/&/&amp;/g;
1284
1285 # Encode $'s, but only if we're running under Blosxom.
1286 # (Blosxom interpolates Perl variables in article bodies.)
1287 {
1288 no warnings 'once';
1289 if (defined($blosxom::version)) {
1290 s/\$/&#036;/g;
1291 }
1292 }
1293
1294
1295 # Do the angle bracket song and dance:
1296 s! < !&lt;!gx;
1297 s! > !&gt;!gx;
1298
1299 # Now, escape characters that are magic in Markdown:
1300 s! \* !$g_escape_table{'*'}!gx;
1301 s! _ !$g_escape_table{'_'}!gx;
1302 s! { !$g_escape_table{'{'}!gx;
1303 s! } !$g_escape_table{'}'}!gx;
1304 s! \[ !$g_escape_table{'['}!gx;
1305 s! \] !$g_escape_table{']'}!gx;
1306 s! \\ !$g_escape_table{'\\'}!gx;
1307
1308 return $_;
1309 }
1310
1311
1312 sub _DoItalicsAndBold {
1313 my $text = shift;
1314
1315 # <strong> must go first:
1316 $text =~ s{ (\*\*|__) (?=\S) (.+?[*_]*) (?<=\S) \1 }
1317 {<strong>$2</strong>}gsx;
1318
1319 $text =~ s{ (\*|_) (?=\S) (.+?) (?<=\S) \1 }
1320 {<em>$2</em>}gsx;
1321
1322 return $text;
1323 }
1324
1325
1326 sub _DoBlockQuotes {
1327 my $text = shift;
1328
1329 $text =~ s{
1330 ( # Wrap whole match in $1
1331 (
1332 ^[ \t]*>[ \t]? # '>' at the start of a line
1333 .+\n # rest of the first line
1334 (.+\n)* # subsequent consecutive lines
1335 \n* # blanks
1336 )+
1337 )
1338 }{
1339 my $bq = $1;
1340 $bq =~ s/^[ \t]*>[ \t]?//gm; # trim one level of quoting
1341 $bq =~ s/^[ \t]+$//mg; # trim whitespace-only lines
1342 $bq = _RunBlockGamut($bq); # recurse
1343
1344 $bq =~ s/^/ /g;
1345 # These leading spaces screw with <pre> content, so we need to fix that:
1346 $bq =~ s{
1347 (\s*<pre>.+?</pre>)
1348 }{
1349 my $pre = $1;
1350 $pre =~ s/^ //mg;
1351 $pre;
1352 }egsx;
1353
1354 "<blockquote>\n$bq\n</blockquote>\n\n";
1355 }egmx;
1356
1357
1358 return $text;
1359 }
1360
1361
1362 sub _FormParagraphs {
1363 #
1364 # Params:
1365 # $text - string to process with html <p> tags
1366 #
1367 my $text = shift;
1368
1369 # Strip leading and trailing lines:
1370 $text =~ s/\A\n+//;
1371 $text =~ s/\n+\z//;
1372
1373 my @grafs = split(/\n{2,}/, $text);
1374
1375 #
1376 # Wrap <p> tags.
1377 #
1378 foreach (@grafs) {
1379 unless (defined( $g_html_blocks{$_} )) {
1380 $_ = _RunSpanGamut($_);
1381 s/^([ \t]*)/<p>/;
1382 $_ .= "</p>";
1383 }
1384 }
1385
1386 #
1387 # Unhashify HTML blocks
1388 #
1389 foreach (@grafs) {
1390 if (defined( $g_html_blocks{$_} )) {
1391 $_ = $g_html_blocks{$_};
1392 }
1393 }
1394
1395 return join "\n\n", @grafs;
1396 }
1397
1398
1399 sub _EncodeAmpsAndAngles {
1400 # Smart processing for ampersands and angle brackets that need to be encoded.
1401
1402 my $text = shift;
1403
1404 # Ampersand-encoding based entirely on Nat Irons's Amputator MT plugin:
1405 # http://bumppo.net/projects/amputator/
1406 $text =~ s/&(?!#?[xX]?(?:[0-9a-fA-F]+|\w+);)/&amp;/g;
1407
1408 # Encode naked <'s
1409 $text =~ s{<(?![a-z/?\$!])}{&lt;}gi;
1410
1411 return $text;
1412 }
1413
1414
1415 sub _EncodeBackslashEscapes {
1416 #
1417 # Parameter: String.
1418 # Returns: The string, with after processing the following backslash
1419 # escape sequences.
1420 #
1421 local $_ = shift;
1422
1423 s! \\\\ !$g_escape_table{'\\'}!gx; # Must process escaped backslashes first.
1424 s! \\` !$g_escape_table{'`'}!gx;
1425 s! \\\* !$g_escape_table{'*'}!gx;
1426 s! \\_ !$g_escape_table{'_'}!gx;
1427 s! \\\{ !$g_escape_table{'{'}!gx;
1428 s! \\\} !$g_escape_table{'}'}!gx;
1429 s! \\\[ !$g_escape_table{'['}!gx;
1430 s! \\\] !$g_escape_table{']'}!gx;
1431 s! \\\( !$g_escape_table{'('}!gx;
1432 s! \\\) !$g_escape_table{')'}!gx;
1433 s! \\> !$g_escape_table{'>'}!gx;
1434 s! \\\# !$g_escape_table{'#'}!gx;
1435 s! \\\+ !$g_escape_table{'+'}!gx;
1436 s! \\\- !$g_escape_table{'-'}!gx;
1437 s! \\\. !$g_escape_table{'.'}!gx;
1438 s{ \\! }{$g_escape_table{'!'}}gx;
1439
1440 return $_;
1441 }
1442
1443
1444 sub _DoAutoLinks {
1445 my $text = shift;
1446
1447 $text =~ s{<((https?|ftp):[^'">\s]+)>}{<a href="$1">$1</a>}gi;
1448
1449 # Email addresses: <address@domain.foo>
1450 $text =~ s{
1451 <
1452 (?:mailto:)?
1453 (
1454 [-.\w]+
1455 \@
1456 [-a-z0-9]+(\.[-a-z0-9]+)*\.[a-z]+
1457 )
1458 >
1459 }{
1460 _EncodeEmailAddress( _UnescapeSpecialChars($1) );
1461 }egix;
1462
1463 return $text;
1464 }
1465
1466
1467 sub _EncodeEmailAddress {
1468 #
1469 # Input: an email address, e.g. "foo@example.com"
1470 #
1471 # Output: the email address as a mailto link, with each character
1472 # of the address encoded as either a decimal or hex entity, in
1473 # the hopes of foiling most address harvesting spam bots. E.g.:
1474 #
1475 # <a href="&#x6D;&#97;&#105;&#108;&#x74;&#111;:&#102;&#111;&#111;&#64;&#101;
1476 # x&#x61;&#109;&#x70;&#108;&#x65;&#x2E;&#99;&#111;&#109;">&#102;&#111;&#111;
1477 # &#64;&#101;x&#x61;&#109;&#x70;&#108;&#x65;&#x2E;&#99;&#111;&#109;</a>
1478 #
1479 # Based on a filter by Matthew Wickline, posted to the BBEdit-Talk
1480 # mailing list: <http://tinyurl.com/yu7ue>
1481 #
1482
1483 my $addr = shift;
1484
1485 srand;
1486 my @encode = (
1487 sub { '&#' . ord(shift) . ';' },
1488 sub { '&#x' . sprintf( "%X", ord(shift) ) . ';' },
1489 sub { shift },
1490 );
1491
1492 $addr = "mailto:" . $addr;
1493
1494 $addr =~ s{(.)}{
1495 my $char = $1;
1496 if ( $char eq '@' ) {
1497 # this *must* be encoded. I insist.
1498 $char = $encode[int rand 1]->($char);
1499 } elsif ( $char ne ':' ) {
1500 # leave ':' alone (to spot mailto: later)
1501 my $r = rand;
1502 # roughly 10% raw, 45% hex, 45% dec
1503 $char = (
1504 $r > .9 ? $encode[2]->($char) :
1505 $r < .45 ? $encode[1]->($char) :
1506 $encode[0]->($char)
1507 );
1508 }
1509 $char;
1510 }gex;
1511
1512 $addr = qq{<a href="$addr">$addr</a>};
1513 $addr =~ s{">.+?:}{">}; # strip the mailto: from the visible part
1514
1515 return $addr;
1516 }
1517
1518
1519 sub _UnescapeSpecialChars {
1520 #
1521 # Swap back in all the special characters we've hidden.
1522 #
1523 my $text = shift;
1524
1525 while( my($char, $hash) = each(%g_escape_table) ) {
1526 $text =~ s/$hash/$char/g;
1527 }
1528 return $text;
1529 }
1530
1531
1532 sub _TokenizeHTML {
1533 #
1534 # Parameter: String containing HTML markup.
1535 # Returns: Reference to an array of the tokens comprising the input
1536 # string. Each token is either a tag (possibly with nested,
1537 # tags contained therein, such as <a href="<MTFoo>">, or a
1538 # run of text between tags. Each element of the array is a
1539 # two-element array; the first is either 'tag' or 'text';
1540 # the second is the actual value.
1541 #
1542 #
1543 # Derived from the _tokenize() subroutine from Brad Choate's MTRegex plugin.
1544 # <http://www.bradchoate.com/past/mtregex.php>
1545 #
1546
1547 my $str = shift;
1548 my $pos = 0;
1549 my $len = length $str;
1550 my @tokens;
1551
1552 my $depth = 6;
1553 my $nested_tags = join('|', ('(?:<[a-z/!$](?:[^<>]') x $depth) . (')*>)' x $depth);
1554 my $match = qr/(?s: <! ( -- .*? -- \s* )+ > ) | # comment
1555 (?s: <\? .*? \?> ) | # processing instruction
1556 $nested_tags/ix; # nested tags
1557
1558 while ($str =~ m/($match)/g) {
1559 my $whole_tag = $1;
1560 my $sec_start = pos $str;
1561 my $tag_start = $sec_start - length $whole_tag;
1562 if ($pos < $tag_start) {
1563 push @tokens, ['text', substr($str, $pos, $tag_start - $pos)];
1564 }
1565 push @tokens, ['tag', $whole_tag];
1566 $pos = pos $str;
1567 }
1568 push @tokens, ['text', substr($str, $pos, $len - $pos)] if $pos < $len;
1569 \@tokens;
1570 }
1571
1572
1573 sub _Outdent {
1574 #
1575 # Remove one level of line-leading tabs or spaces
1576 #
1577 my $text = shift;
1578
1579 $text =~ s/^(\t|[ ]{1,$g_tab_width})//gm;
1580 return $text;
1581 }
1582
1583
1584 sub _Detab {
1585 #
1586 # Cribbed from a post by Bart Lateur:
1587 # <http://www.nntp.perl.org/group/perl.macperl.anyperl/154>
1588 #
1589 my $text = shift;
1590
1591 $text =~ s{(.*?)\t}{$1.(' ' x ($g_tab_width - length($1) % $g_tab_width))}ge;
1592 return $text;
1593 }
1594
1595
1596 1;
1597
1598 __END__
1599
1600
1601 =pod
1602
1603 =head1 NAME
1604
1605 B<Markdown>
1606
1607
1608 =head1 SYNOPSIS
1609
1610 B<Markdown.pl> [ B<--html4tags> ] [ B<--version> ] [ B<-shortversion> ]
1611 [ I<file> ... ]
1612
1613
1614 =head1 DESCRIPTION
1615
1616 Markdown is a text-to-HTML filter; it translates an easy-to-read /
1617 easy-to-write structured text format into HTML. Markdown's text format
1618 is most similar to that of plain text email, and supports features such
1619 as headers, *emphasis*, code blocks, blockquotes, and links.
1620
1621 Markdown's syntax is designed not as a generic markup language, but
1622 specifically to serve as a front-end to (X)HTML. You can use span-level
1623 HTML tags anywhere in a Markdown document, and you can use block level
1624 HTML tags (like <div> and <table> as well).
1625
1626 For more information about Markdown's syntax, see:
1627
1628 http://daringfireball.net/projects/markdown/
1629
1630
1631 =head1 OPTIONS
1632
1633 Use "--" to end switch parsing. For example, to open a file named "-z", use:
1634
1635 Markdown.pl -- -z
1636
1637 =over 4
1638
1639
1640 =item B<--html4tags>
1641
1642 Use HTML 4 style for empty element tags, e.g.:
1643
1644 <br>
1645
1646 instead of Markdown's default XHTML style tags, e.g.:
1647
1648 <br />
1649
1650
1651 =item B<-v>, B<--version>
1652
1653 Display Markdown's version number and copyright information.
1654
1655
1656 =item B<-s>, B<--shortversion>
1657
1658 Display the short-form version number.
1659
1660
1661 =back
1662
1663
1664
1665 =head1 BUGS
1666
1667 To file bug reports or feature requests (other than topics listed in the
1668 Caveats section above) please send email to:
1669
1670 support@daringfireball.net
1671
1672 Please include with your report: (1) the example input; (2) the output
1673 you expected; (3) the output Markdown actually produced.
1674
1675
1676 =head1 VERSION HISTORY
1677
1678 See the readme file for detailed release notes for this version.
1679
1680 1.0.1 - 14 Dec 2004
1681
1682 1.0 - 28 Aug 2004
1683
1684
1685 =head1 AUTHOR
1686
1687 John Gruber
1688 http://daringfireball.net
1689
1690 PHP port and other contributions by Michel Fortin
1691 http://michelf.com
1692
1693
1694 =head1 COPYRIGHT AND LICENSE
1695
1696 Copyright (c) 2003-2004 John Gruber
1697 <http://daringfireball.net/>
1698 All rights reserved.
1699
1700 Redistribution and use in source and binary forms, with or without
1701 modification, are permitted provided that the following conditions are
1702 met:
1703
1704 * Redistributions of source code must retain the above copyright notice,
1705 this list of conditions and the following disclaimer.
1706
1707 * Redistributions in binary form must reproduce the above copyright
1708 notice, this list of conditions and the following disclaimer in the
1709 documentation and/or other materials provided with the distribution.
1710
1711 * Neither the name "Markdown" nor the names of its contributors may
1712 be used to endorse or promote products derived from this software
1713 without specific prior written permission.
1714
1715 This software is provided by the copyright holders and contributors "as
1716 is" and any express or implied warranties, including, but not limited
1717 to, the implied warranties of merchantability and fitness for a
1718 particular purpose are disclaimed. In no event shall the copyright owner
1719 or contributors be liable for any direct, indirect, incidental, special,
1720 exemplary, or consequential damages (including, but not limited to,
1721 procurement of substitute goods or services; loss of use, data, or
1722 profits; or business interruption) however caused and on any theory of
1723 liability, whether in contract, strict liability, or tort (including
1724 negligence or otherwise) arising in any way out of the use of this
1725 software, even if advised of the possibility of such damage.
1726
1727 =cut