*
* @author Michel Fortin
*
* @author Paul M. Jones
*
* @license http://opensource.org/licenses/bsd-license.php BSD
*
* @version $Id$
*
* @todo How to configure plugins at the start?
*
* @todo How to send a plugin object in the config?
*
*/
class Solar_Markdown extends Solar_Base
{
/**
*
* Default configuration values.
*
* @config int tab_width Number of spaces per tab. Default 4.
*
* @config bool|array tidy If empty/false/null, do not use Tidy to post-process
* the transformed output. If true or a non-empty array, is a set of
* config options to pass to Tidy when rendering output.
* See also . Default false.
*
* @config array plugins An array of plugins for the parser to use, in order.
*
* @var array
*
*/
protected $_Solar_Markdown = array(
'tab_width' => 4,
'tidy' => false,
'plugins' => array(
// pre-processing on the source as a whole
'Solar_Markdown_Plugin_Prefilter',
'Solar_Markdown_Plugin_StripLinkDefs',
// blocks
'Solar_Markdown_Plugin_Header',
'Solar_Markdown_Plugin_HorizRule',
'Solar_Markdown_Plugin_List',
'Solar_Markdown_Plugin_CodeBlock',
'Solar_Markdown_Plugin_BlockQuote',
'Solar_Markdown_Plugin_Html',
'Solar_Markdown_Plugin_Paragraph',
// spans
'Solar_Markdown_Plugin_CodeSpan',
'Solar_Markdown_Plugin_Image',
'Solar_Markdown_Plugin_Link',
'Solar_Markdown_Plugin_Uri',
'Solar_Markdown_Plugin_Encode',
'Solar_Markdown_Plugin_AmpsAngles',
'Solar_Markdown_Plugin_EmStrong',
'Solar_Markdown_Plugin_Break',
),
);
/**
*
* Left-delimiter for HTML tokens.
*
* @var string
*
*/
protected $_html_ldelim = "\x0E";
/**
*
* Right-delimiter for HTML tokens.
*
* @var string
*
*/
protected $_html_rdelim = "\x0F";
/**
*
* Left-delimiter for encoded Markdown characters.
*
* @var string
*
*/
protected $_char_ldelim = "\x02";
/**
*
* Right-delimiter for encoded Markdown characters.
*
* @var string
*
*/
protected $_char_rdelim = "\x03";
/**
*
* Array of HTML blocks represented by delimited token numbers.
*
* Format is token => html.
*
* @var array
*
*/
protected $_html = array();
/**
*
* Running count of $this->_html elements so we don't have to
* count($this->_html) each time we add an HTML token.
*
* @var int
*
*/
protected $_count = 0;
/**
*
* List of defined link references keyed on the link name.
*
* Format is "link-name" => array('href' => ..., 'title' => ...).
*
* Generally populated via the StripLinkDefs plugin.
*
* @var array
*
*/
protected $_link = array();
/**
*
* Escape table for special Markdown characters.
*
* Format is "$char" => "\x1B$char\x1B".
*
* @var array
*
*/
protected $_esc = array();
/**
*
* Escape table for backslashed special Markdown characters.
*
* Format is "\\$char" => "\x1B$char\x1B".
*
* Note that the backslash escape table and the normal escape table
* map to identical escape sequences.
*
* @var array
*
*/
protected $_bs_esc = array();
/**
*
* Array of all plugin objects.
*
* Format is class name => object instance.
*
* @var array
*
*/
protected $_plugin = array();
/**
*
* Array of all block-type plugin class names.
*
* Each plugin reports if it is a span or a block.
*
* @var array
*
*/
protected $_block_class = array();
/**
*
* Array of all span-type plugin class names.
*
* Each plugin reports if it is a span or a block.
*
* @var array
*
*/
protected $_span_class = array();
/**
*
* Array of all plugin classes that need to prepare the text before
* processing.
*
* @var array
*
*/
protected $_prepare_class = array();
/**
*
* Array of all plugin classes that need to clean up the text after
* processing.
*
* @var array
*
*/
protected $_cleanup_class = array();
/**
*
* Special characters that should be encoded by Markdown.
*
* This list will grow as plugins are added; they each report their
* own list of special characters to be encoded.
*
* @var array
*
*/
protected $_chars = '.{}\\';
/**
*
* Post-construction tasks to complete object construction.
*
* Loads the plugins, builds a list of characters to encode, and
* builds the list of block-type and span-type plugin classes.
*
* @return void
*
*/
protected function _postConstruct()
{
parent::_postConstruct();
// use tidy?
if ($this->_config['tidy'] && ! extension_loaded('tidy')) {
// tidy requested but not loaded
throw $this->_exception('ERR_EXTENSION_NOT_LOADED', array(
'extension' => 'tidy',
));
}
// load each plugin object
foreach ($this->_config['plugins'] as $spec) {
if (is_object($spec)) {
$class = get_class($spec);
$plugin = $spec;
} else {
$class = $spec;
$plugin = Solar::factory($class);
}
// save the plugin
$this->_plugin[$class] = $plugin;
$plugin->setMarkdown($this);
// does it need to prepare the text?
if ($plugin->isPrepare()) {
$this->_prepare_class[] = $class;
}
// is it a block plugin?
if ($plugin->isBlock()) {
$this->_block_class[] = $class;
}
// is it a span plugin?
if ($plugin->isSpan()) {
$this->_span_class[] = $class;
}
// does it need to clean up the text?
if ($plugin->isCleanup()) {
$this->_cleanup_class[] = $class;
}
// find out what characters this plugin thinks should be
// encoded as "special" Markdown characters.
$this->_chars .= $plugin->getChars();
}
// build the character escape tables
$k = strlen($this->_chars);
for ($i = 0; $i < $k; ++ $i) {
$char = $this->_chars[$i];
$delim = $this->_char_ldelim . $i. $this->_char_rdelim;
$this->_esc[$char] = $delim;
$this->_bs_esc["\\$char"] = $delim;
}
}
/**
*
* Returns an internal Markdown plugin object for direct manipulation
* and inspection.
*
* @param string $class The plugin class name.
*
* @return object The requested Markdown plugins.
*
*/
public function getPlugin($class)
{
return $this->_plugin[$class];
}
/**
*
* One-step transformation of source text using plugins.
*
* Calls reset(), prepare(), processBlocks(), cleanup(), and
* render() using the source text.
*
* @param string $text The source text.
*
* @return string The transformed text after all processing and
* rendering.
*
*/
public function transform($text)
{
$this->reset();
$text = $this->prepare($text);
$text = $this->processBlocks($text);
$text = $this->cleanup($text);
return $this->render($text);
}
/**
*
* Resets Markdown and all its plugins for a new transformation.
*
* @return void
*
*/
public function reset()
{
// reset from previous transformations
$this->_count = 0;
$this->_html = array();
$this->_link = array();
// reset the plugins
foreach ($this->_plugin as $plugin) {
$plugin->reset();
}
}
/**
*
* Prepares the text for processing by running the prepare()
* method of each plugin, in order, on the source text.
*
* Also calls the plugin's reset() method, and resets internal
* counters and arrays. This is to support work with multiple
* separate text sources.
*
* @param string $text The source text.
*
* @return string The source text after processing.
*
*/
public function prepare($text)
{
// let each plugin prepare the source text for parsing
foreach ($this->_prepare_class as $class) {
$text = $this->_plugin[$class]->prepare($text);
}
return $text;
}
/**
*
* Runs the source text through all block-type plugins.
*
* @param string $text The source text.
*
* @return string The source text after processing.
*
*/
public function processBlocks($text)
{
foreach ($this->_block_class as $class) {
$text = $this->_plugin[$class]->parse($text);
}
return $text;
}
/**
*
* Runs the processed text through each plugin's cleanup() method.
*
* This is so that the plugins can "clean up" after all main
* has been completed.
*
* @param string $text The processed text.
*
* @return string The processed text after cleanup.
*
*/
public function cleanup($text)
{
// let each plugin clean up the rendered source
foreach ($this->_cleanup_class as $class) {
$text = $this->_plugin[$class]->cleanup($text);
}
return $text;
}
/**
*
* Returns a final rendering of the processed text.
*
* This replaces any remaining HTML tokens, un-encodes special
* Markdown characters, and optionally runs the text through
* [Tidy][].
*
* [Tidy]: http://php.net/tidy
*
* @param string $text The processed and cleaned text.
*
* @return string The final rendering of the text.
*
*/
public function render($text)
{
// replace any remaining HTML tokens
$text = $this->unHtmlToken($text);
// replace all special chars in the text.
$text = $this->unEncode($text);
if (! $this->_config['tidy']) {
// tidy explicitly disabled
return $text;
}
// tidy up the text
$tidy = new tidy;
$opts = (array) $this->_config['tidy'];
$tidy->parseString($text, $opts, 'utf8');
$tidy->cleanRepair();
// get only the body portion
$body = trim(tidy_get_body($tidy)->value);
// remove and
return substr($body, 6, -7);
}
// -----------------------------------------------------------------
//
// General support methods for plugins.
//
// -----------------------------------------------------------------
/**
*
* Runs the source text through all span-type plugins.
*
* Generally this is called by block-type plugins, not by the
* Markdown engine directly.
*
* @param string $text The source text.
*
* @return string The source text after processing.
*
*/
public function processSpans($text)
{
foreach ($this->_span_class as $class) {
$text = $this->_plugin[$class]->parse($text);
}
return $text;
}
/**
*
* Returns the number of spaces per tab.
*
* @return int
*
*/
public function getTabWidth()
{
return (int) $this->_config['tab_width'];
}
// -----------------------------------------------------------------
//
// HTML token processing support methods for plugins.
//
// -----------------------------------------------------------------
/**
*
* Saves a pieces of text as HTML and returns a delimited token.
*
* @param string $text The text to retain as HTML.
*
* @return An HTML token.
*
*/
public function toHtmlToken($text)
{
$key = $this->_html_ldelim
. $this->_count
. $this->_html_rdelim;
$this->_html[$this->_count ++] = $text;
return $key;
}
/**
*
* Is a piece of text a delimited HTML token?
*
* @param string $text The text to check.
*
* @return bool True if a token, false if not.
*
*/
public function isHtmlToken($text)
{
return preg_match(
"/^{$this->_html_ldelim}.*?{$this->_html_rdelim}$/",
$text
);
}
/**
*
* Replaces all HTML tokens in source text with saved HTML.
*
* @param string $text The text to do replacements in.
*
* @return string The source text with HTML in place of tokens.
*
*/
public function unHtmlToken($text)
{
$regex = "/{$this->_html_ldelim}(.*?){$this->_html_rdelim}/";
while (preg_match_all($regex, $text, $matches, PREG_SET_ORDER)) {
foreach ($matches as $val) {
$text = str_replace(
$val[0],
$this->_html[$val[1]],
$text
);
}
}
return $text;
}
// -----------------------------------------------------------------
//
// Escaping for HTML and encoding for special Markdown characters.
//
// -----------------------------------------------------------------
/**
*
* Escapes HTML in source text.
*
* Uses htmlspecialchars() with UTF-8.
*
* @param string $text Source text.
*
* @param int $quotes How to escape quotes; default ENT_COMPAT.
*
* @return string The escaped text.
*
*/
public function escape($text, $quotes = ENT_COMPAT)
{
return htmlspecialchars($text, $quotes, 'UTF-8');
}
/**
*
* Encodes special Markdown characters so they are not recognized
* when parsing.
*
* @param string $text The source text.
*
* @param bool $only_backslash Only encode backslashed characters.
*
* @return string The encoded text.
*
*/
public function encode($text, $only_backslash = false)
{
$list = $this->_explodeTags($text);
// reset text and rebuild from the list
$text = '';
if ($only_backslash) {
foreach ($list as $item) {
$text .= $this->_encode($item[1]);
}
} else {
foreach ($list as $item) {
if ($item[0] == 'tag') {
$text .= $this->_encode($item[1], true);
} else {
$text .= $this->_encode($item[1]);
}
}
}
return $text;
}
/**
*
* Un-encodes special Markdown characters.
*
* @param string $text The text with encocded characters.
*
* @return string The un-encoded text.
*
*/
public function unEncode($text)
{
// because the bs_esc table uses the same values (just
// different keys), this will catch both regular and
// backslashes chars.
$chars = array_flip($this->_esc);
$text = str_replace(
array_keys($chars),
array_values($chars),
$text
);
return $text;
}
/**
*
* Support method for encode().
*
* @param string $text The source text.
*
* @param bool $in_tag Escaping inside a tag?
*
* @return string The encoded text.
*
*/
protected function _encode($text, $in_tag = false)
{
if ($in_tag) {
// inside a tag
$chars = $this->_esc;
} else {
// outside a tag, or between tags
$chars = $this->_bs_esc;
}
return str_replace(
array_keys($chars),
array_values($chars),
$text
);
}
/**
*
* Explodes source text into tags and text
*
* Regular expression derived from the _tokenize() subroutine in
* Brad Choate's [MTRegex][] plugin.
*
* [MTRegex]: http://www.bradchoate.com/past/mtregex.php
*
* From the original notes ...
*
* > Returns an array of the tokens comprising the input string.
* > Each token is either a tag (possibly with nested, tags
* > contained therein, such as , or a run of
* > text between tags. Each element of the array is a
* > two-element array; the first is either 'tag' or 'text'; the
* > second is the actual value.
*
* @param string $str A string of HTML.
*
* @return array The string exploded into tag and non-tag portions.
*
*/
protected function _explodeTags($str)
{
$index = 0;
$list = array();
$match = '(?s:)|'. # comment
'(?s:<\?.*?\?>)|'. # processing instruction
# regular tags
'(?:<[/!$]?[-a-zA-Z0-9:]+\b(?>[^"\'>]+|"[^"]*"|\'[^\']*\')*>)';
$parts = preg_split("{($match)}", $str, -1, PREG_SPLIT_DELIM_CAPTURE);
foreach ($parts as $part) {
if (++$index % 2 && $part != '') {
$list[] = array('text', $part);
} else {
$list[] = array('tag', $part);
}
}
return $list;
}
// -----------------------------------------------------------------
//
// Support methods for plugins, related to named link references.
//
// -----------------------------------------------------------------
/**
*
* Sets the value of a named link reference.
*
* @param string $name The link reference name.
*
* @param string $href The URI this link points to.
*
* @param string $title Alternate title for the link.
*
* @return void
*
*/
public function setLink($name, $href, $title = null)
{
$this->_link[$name] = array(
'href' => $href,
'title' => $title,
);
}
/**
*
* Returns the href and title of a named link reference.
*
* @param string $name The link reference name.
*
* @return bool|array If the link reference exists, returns an
* array with keys 'href' and 'title'. If not, returns false.
*
*/
public function getLink($name)
{
if (! empty($this->_link[$name])) {
return $this->_link[$name];
} else {
return false;
}
}
/**
*
* Returns an array of all defined link references.
*
* @return array All links keyed by name, where each element is an
* array with keys 'href' and 'title'.
*
*/
public function getLinks()
{
return $this->_link;
}
}