Last active
December 15, 2022 12:37
-
-
Save taufik-nurrohman/f349cf63d644eec04fe9 to your computer and use it in GitHub Desktop.
The Simplest PHP Markdown Parser
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
<?php
/*!
 * =======================================================
 *  Author  : Taufik Nurrohman
 *  URL     : https://github.com/taufik-nurrohman
 *  License : MIT
 * =======================================================
 *
 * -- CODE: ----------------------------------------------
 *
 *    echo MD('this is a **bold** text');
 *
 * -------------------------------------------------------
 *
 */
// escape | |
/**
 * Backslash-escape every occurrence in $str of any character listed in $x.
 *
 * @param string $str Text to escape.
 * @param string $x   Characters to escape; each one found in $str gets a
 *                    single backslash put in front of it.
 * @return string The escaped text ($str unchanged when $x is empty).
 */
function __MDE($str, $x) {
    // An empty set would build the invalid pattern `#([])#`, which makes
    // preg_replace() emit a warning and return null: nothing to escape.
    if ((string) $x === '') {
        return $str;
    }
    // In the replacement, `\\` is one literal backslash, `$1` the matched character.
    return preg_replace('#([' . preg_quote($x, '#') . '])#', '\\\\$1', $str);
}
// un-escape | |
/**
 * Remove the backslash in front of every character listed in $x.
 *
 * Backslashes followed by a character that is not in $x are left alone.
 *
 * @param string $str Text that may contain backslash-escaped characters.
 * @param string $x   Characters whose leading backslash is removed.
 * @return string The un-escaped text ($str unchanged when $x is empty).
 */
function __MDD($str, $x) {
    // An empty set would build an invalid character class, which makes
    // preg_replace() emit a warning and return null: nothing to un-escape.
    if ((string) $x === '') {
        return $str;
    }
    return preg_replace('#\\\\([' . preg_quote($x, '#') . '])#', '$1', $str);
}
// main function | |
/**
 * Convert a Markdown string to HTML.
 *
 * The conversion is a fixed sequence of regular-expression passes over the
 * whole text: fenced/indented code, inline code, images and links, headers,
 * horizontal rules, emphasis, lists, block quotes, tables, paragraphs and
 * finally typography. Inline tags are emitted with a leading backslash
 * (`\<em>`) so the paragraph pass can tell them apart from block-level tags
 * (which start with a bare `<`); characters that must survive later passes
 * are backslash-escaped with __MDE() and restored by __MDD() at the end.
 *
 * NOTE(review): link/image URLs and link text are emitted without HTML
 * escaping; sanitize the result if $content is untrusted.
 *
 * @param string $content Markdown source.
 * @return string Generated HTML, right-trimmed.
 */
function MD($content) {
    // character(s) to escape
    $x = '`~!#^*()-_+={}[]:\'"<>.';
    // URL pattern
    $url = '(?:ht|f)tps?:\/\/[a-z0-9\-._~!$&\'()*+,;=:\/?\#\[\]@%]+';
    // empty element suffix
    $suffix = '>';
    // normalize white-space
    $content = trim(str_replace(array("\r\n", "\r"), "\n", $content));
    // parse fenced code block to indented code block
    // (the optional language name is carried along between two NUL bytes)
    $content = preg_replace_callback('#(^|\n)([`~]{3,})(?: *\.?([a-zA-Z0-9\-.]+))?\n+([\s\S]+?)\n+\2(\n|$)#', function($m) {
        $s = "\0" . str_replace('.', ' ', $m[3]) . "\0\n";
        // indent by four spaces: that is what the code block pass below looks for
        return $m[1] . $s . '    ' . str_replace("\n", "\n    ", $m[4]) . $m[5];
    }, $content);
    // parse code block (one `<pre><code>` per line; merged again further down)
    $content = preg_replace_callback('#^(?:\0(.*?)\0\n)?( {4}|\t)(.*?)$#m', function($m) use($x) {
        $s1 = ! empty($m[1]) ? ' class="' . $m[1] . '"' : "";
        // NOTE(review): a tab inside the line becomes a single space -- confirm the intended width
        $s3 = str_replace("\t", ' ', htmlentities($m[3], ENT_NOQUOTES));
        return '<pre><code' . $s1 . '>' . __MDE($s3, $x) . '</code></pre>';
    }, $content);
    // parse code inline
    $content = preg_replace_callback('#(?<!\\\)`([^\n]+?)`#', function($m) use($x) {
        $s = htmlentities($m[1], ENT_NOQUOTES);
        return '\\<code>' . __MDE($s, $x) . '</code>';
    }, $content);
    // parse image and link
    $content = preg_replace_callback('#(!)?\[(.*?)\]\((.*?)( +([\'"])(.*?)\5)?\)#', function($m) use($x, $suffix) {
        $s2 = $m[2];
        $s3 = __MDE($m[3], $x);
        $s6 = ! empty($m[4]) && ! empty($m[6]) ? __MDE($m[6], $x) : "";
        $s6 = $s6 ? ' title="' . htmlentities($s6) . '"' : "";
        if( ! empty($m[1])) {
            return '\\<img alt="' . htmlentities($s2) . '" src="' . $s3 . '"' . $s6 . $suffix;
        }
        return '\\<a href="' . $s3 . '"' . $s6 . '>' . $s2 . '</a>';
    }, $content);
    // parse link
    $content = preg_replace_callback('#<(' . $url . ')>#', function($m) use($x) {
        return '\\<a href="' . __MDE($m[1], $x) . '">' . $m[1] . '</a>';
    }, $content);
    // parse ATX header(s)
    $content = preg_replace_callback('#^(\#{1,6})\s*([^\#]+?)\s*\#*$#m', function($m) {
        $i = strlen($m[1]);
        return '<h' . $i . '>' . $m[2] . '</h' . $i . '>';
    }, $content);
    $content = preg_replace(
        array(
            // parse SEText header(s)
            '#^(.+?)\n={2,}$#m',
            '#^(.+?)\n-{2,}$#m',
            // parse horizontal rule
            '#^ {0,3}([*\-+] *){3,}$#m',
            // parse bold-italic text
            '#(?<!\\\)([*_]{2})([*_])([^\n]+?)\2\1#',
            // parse bold text
            '#(?<!\\\)([*_]{2})([^\n]+?)\1#',
            // parse italic text
            '#(?<!\\\)([*_])([^\n]+?)\1#',
            // parse strike text
            // '#(?<!\\\)(~{2})([^\n]+?)\1#',
            // parse unordered-list
            '#^ *[*\-+] +(.*?)$#m',
            // parse ordered-list
            '#^ *\d+\. +(.*?)$#m',
            // parse quote block (raw `>` or an already encoded `&gt;`)
            '#^(?:&gt;|>) +(.*?)$#m',
            // clean-up list ...
            '#\s*<\/(ol|ul)>\n<\1>\s*#',
            // clean-up quote block ...
            '#\s*<\/blockquote>\n<blockquote>\s*#',
            // parse two or more white-space(s) at the end of text into a line-break
            '#(\S) {2,}\n#'
        ),
        array(
            '<h1>$1</h1>',
            '<h2>$1</h2>',
            '<hr' . $suffix,
            '\\<strong><em>$3</em></strong>',
            '\\<strong>$2</strong>',
            '\\<em>$2</em>',
            // '\\<del>$2</del>',
            "<ul>\n <li>$1</li>\n</ul>",
            "<ol>\n <li>$1</li>\n</ol>",
            "<blockquote>\n <p>$1</p>\n</blockquote>",
            "\n ",
            "\n ",
            "$1<br" . $suffix . "\n"
        ),
    $content);
    // parse table
    $content = preg_replace_callback('#((?:\|[^|]+?\|[^|]+?)+)\|?\n((?:\| *(?:\-+|:\-+|\-+:|:\-+:) *)+\|?)((?:\n(?:\|[^|]+?)+\|?)+)$#m', function($m) {
        // $a starts as the raw separator cells and is overwritten, column by
        // column, with the matching ` align="..."` attribute (or "")
        $a = explode('|', trim($m[2], '|'));
        $str = "<table border=\"1\">\n";
        $str .= " <thead>\n";
        $str .= " <tr>\n";
        foreach(explode('|', trim($m[1], '|')) as $k => $v) {
            // pad with spaces so a leading/trailing colon can be told apart
            $aa = isset($a[$k]) ? ' ' . trim($a[$k]) . ' ' : "";
            if(strpos($aa, ' :') !== false && strpos($aa, ': ') !== false) {
                $a[$k] = ' align="center"';
            } else if(strpos($aa, ' :') !== false) {
                $a[$k] = ' align="left"';
            } else if(strpos($aa, ': ') !== false) {
                $a[$k] = ' align="right"';
            } else if(strpos($aa, ':') === false) {
                $a[$k] = "";
            }
            $str .= " <th" . $a[$k] . ">" . trim($v) . "</th>\n";
        }
        $str .= " </tr>\n </thead>\n";
        $str .= " <tbody>\n";
        foreach(explode("\n", trim($m[3], "\n")) as $v) {
            $str .= " <tr>\n";
            foreach(explode('|', trim($v, '|')) as $kk => $vv) {
                // a body row may have more cells than the header row
                $align = isset($a[$kk]) ? $a[$kk] : "";
                $str .= " <td" . $align . ">" . trim($vv) . "</td>\n";
            }
            $str .= " </tr>\n";
        }
        return $str . " </tbody>\n</table>\n";
    }, $content);
    // parse new-line to paragraph
    // (iterate over the variable itself: a by-reference loop over an
    // assignment expression works on a temporary copy in PHP 7+)
    $content = explode("\n\n", $content);
    foreach($content as &$line) {
        if(
            $line !== "" // not empty
            && strpos($line, '    ') !== 0 // not a code block
            && strpos($line, "\t") !== 0 // --ibid
            && strpos($line, '<') !== 0 // not a HTML tag
        ) {
            $line = '<p>' . trim($line) . '</p>';
        }
    }
    unset($line); // drop the dangling reference
    $content = implode("\n\n", $content);
    // clean-up code block ...
    $content = preg_replace('#<\/code><\/pre>\n<pre><code(>| .*?>)#', "\n", $content);
    // typography (anything outside the HTML tag)
    // NOTE(review): the closing tag of one match is consumed, so the text
    // that directly follows it is not visited by this pass -- confirm whether
    // that is intended
    $content = preg_replace_callback('#(^|<\/?[a-z]+[^>\n]*?>)(.*?)(<\/?[a-z]+[^>\n]*?>|$)#', function($m) {
        $s = str_replace(
            array(
                '&',
                '<',
                '>',
                '---',
                '--',
                '...'
            ),
            array(
                '&amp;',
                '&lt;',
                '&gt;',
                '—',
                '–',
                '…'
            ),
        $m[2]);
        return $m[1] . preg_replace(
            array(
                '#\'([^\'"]*?)\'#',
                '#"([^"]*?)"#',
                '#\b\'#',
                '#\'\b#',
                '#&amp;([a-z0-9]+|\#[0-9]+|\#x[a-f0-9]+);#' // restore the encoded html entity
            ),
            array(
                '‘$1’',
                '“$1”',
                '’',
                '‘',
                '&$1;'
            ),
        $s) . $m[3];
    }, $content);
    // un-escape character(s)
    $content = __MDD($content, $x);
    // output the result
    return rtrim($content);
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Test Input
http://phpfiddle.org/main/code/mf1i-8ct6