*
 * @author Michel Fortin
 *
 * @author Paul M. Jones
 *
 * @license http://opensource.org/licenses/bsd-license.php BSD
 *
 * @version $Id$
 *
 * @todo How to configure plugins at the start?
 *
 * @todo How to send a plugin object in the config?
 *
 */
class Solar_Markdown extends Solar_Base
{
    /**
     *
     * Default configuration values.
     *
     * @config int tab_width Number of spaces per tab. Default 4.
     *
     * @config bool|array tidy If empty/false/null, do not use Tidy to
     * post-process the transformed output. If true or a non-empty array,
     * it is cast to an array and passed to Tidy as its config options when
     * rendering output. See also http://php.net/tidy. Default false.
     *
     * @config array plugins An array of plugins for the parser to use, in
     * order. Each element is either a plugin class name or an already
     * constructed plugin object.
     *
     * @var array
     *
     */
    protected $_Solar_Markdown = array(
        'tab_width' => 4,
        'tidy'      => false,
        'plugins'   => array(

            // pre-processing on the source as a whole
            'Solar_Markdown_Plugin_Prefilter',
            'Solar_Markdown_Plugin_StripLinkDefs',

            // blocks
            'Solar_Markdown_Plugin_Header',
            'Solar_Markdown_Plugin_HorizRule',
            'Solar_Markdown_Plugin_List',
            'Solar_Markdown_Plugin_CodeBlock',
            'Solar_Markdown_Plugin_BlockQuote',
            'Solar_Markdown_Plugin_Html',
            'Solar_Markdown_Plugin_Paragraph',

            // spans
            'Solar_Markdown_Plugin_CodeSpan',
            'Solar_Markdown_Plugin_Image',
            'Solar_Markdown_Plugin_Link',
            'Solar_Markdown_Plugin_Uri',
            'Solar_Markdown_Plugin_Encode',
            'Solar_Markdown_Plugin_AmpsAngles',
            'Solar_Markdown_Plugin_EmStrong',
            'Solar_Markdown_Plugin_Break',
        ),
    );

    /**
     *
     * Left-delimiter for HTML tokens.
     *
     * @var string
     *
     */
    protected $_html_ldelim = "\x0E";

    /**
     *
     * Right-delimiter for HTML tokens.
     *
     * @var string
     *
     */
    protected $_html_rdelim = "\x0F";

    /**
     *
     * Left-delimiter for encoded Markdown characters.
     *
     * @var string
     *
     */
    protected $_char_ldelim = "\x02";

    /**
     *
     * Right-delimiter for encoded Markdown characters.
     *
     * @var string
     *
     */
    protected $_char_rdelim = "\x03";

    /**
     *
     * Array of HTML blocks represented by delimited token numbers.
     *
     * Format is token number => html.
     *
     * @var array
     *
     */
    protected $_html = array();

    /**
     *
     * Running count of $this->_html elements so we don't have to
     * count($this->_html) each time we add an HTML token.
     *
     * @var int
     *
     */
    protected $_count = 0;

    /**
     *
     * List of defined link references keyed on the link name.
     *
     * Format is "link-name" => array('href' => ..., 'title' => ...).
     *
     * Generally populated via the StripLinkDefs plugin.
     *
     * @var array
     *
     */
    protected $_link = array();

    /**
     *
     * Escape table for special Markdown characters.
     *
     * Format is "$char" => "\x02$index\x03", where $index is the position
     * of the character in $this->_chars and the delimiters are
     * $this->_char_ldelim and $this->_char_rdelim.
     *
     * @var array
     *
     */
    protected $_esc = array();

    /**
     *
     * Escape table for backslashed special Markdown characters.
     *
     * Format is "\\$char" => "\x02$index\x03".
     *
     * Note that the backslash escape table and the normal escape table
     * map to identical escape sequences.
     *
     * @var array
     *
     */
    protected $_bs_esc = array();

    /**
     *
     * Array of all plugin objects.
     *
     * Format is class name => object instance.
     *
     * @var array
     *
     */
    protected $_plugin = array();

    /**
     *
     * Array of all block-type plugin class names.
     *
     * Each plugin reports if it is a span or a block.
     *
     * @var array
     *
     */
    protected $_block_class = array();

    /**
     *
     * Array of all span-type plugin class names.
     *
     * Each plugin reports if it is a span or a block.
     *
     * @var array
     *
     */
    protected $_span_class = array();

    /**
     *
     * Array of all plugin classes that need to prepare the text before
     * processing.
     *
     * @var array
     *
     */
    protected $_prepare_class = array();

    /**
     *
     * Array of all plugin classes that need to clean up the text after
     * processing.
     *
     * @var array
     *
     */
    protected $_cleanup_class = array();

    /**
     *
     * Special characters that should be encoded by Markdown.
     *
     * This is a string, one special character per byte. It grows as
     * plugins are loaded; they each report their own list of special
     * characters to be encoded.
     *
     * @var string
     *
     */
    protected $_chars = '.{}\\';

    /**
     *
     * Post-construction tasks to complete object construction.
     *
     * Loads the plugins, builds a list of characters to encode, and
     * builds the list of block-type and span-type plugin classes.
     *
     * @return void
     *
     * @throws Exception The result of $this->_exception() with code
     * 'ERR_EXTENSION_NOT_LOADED' when Tidy is requested in the config but
     * the tidy extension is not loaded.
     *
     */
    protected function _postConstruct()
    {
        parent::_postConstruct();

        // use tidy?
        if ($this->_config['tidy'] && ! extension_loaded('tidy')) {
            // tidy requested but not loaded
            throw $this->_exception('ERR_EXTENSION_NOT_LOADED', array(
                'extension' => 'tidy',
            ));
        }

        // load each plugin object
        foreach ($this->_config['plugins'] as $spec) {

            // a plugin may be given as an object or as a class name
            if (is_object($spec)) {
                $class  = get_class($spec);
                $plugin = $spec;
            } else {
                $class  = $spec;
                $plugin = Solar::factory($class);
            }

            // save the plugin
            $this->_plugin[$class] = $plugin;
            $plugin->setMarkdown($this);

            // does it need to prepare the text?
            if ($plugin->isPrepare()) {
                $this->_prepare_class[] = $class;
            }

            // is it a block plugin?
            if ($plugin->isBlock()) {
                $this->_block_class[] = $class;
            }

            // is it a span plugin?
            if ($plugin->isSpan()) {
                $this->_span_class[] = $class;
            }

            // does it need to clean up the text?
            if ($plugin->isCleanup()) {
                $this->_cleanup_class[] = $class;
            }

            // find out what characters this plugin thinks should be
            // encoded as "special" Markdown characters.
            $this->_chars .= $plugin->getChars();
        }

        // build the character escape tables; the token for a character
        // is its position in the list, wrapped in the char delimiters.
        $k = strlen($this->_chars);
        for ($i = 0; $i < $k; $i++) {
            $char  = $this->_chars[$i];
            $delim = $this->_char_ldelim . $i . $this->_char_rdelim;
            $this->_esc[$char]       = $delim;
            $this->_bs_esc["\\$char"] = $delim;
        }
    }

    /**
     *
     * Returns an internal Markdown plugin object for direct manipulation
     * and inspection.
     *
     * @param string $class The plugin class name.
     *
     * @return object The requested Markdown plugin.
     *
     */
    public function getPlugin($class)
    {
        return $this->_plugin[$class];
    }

    /**
     *
     * One-step transformation of source text using plugins.
     *
     * Calls reset(), prepare(), processBlocks(), cleanup(), and
     * render() using the source text.
     *
     * @param string $text The source text.
     *
     * @return string The transformed text after all processing and
     * rendering.
     *
     */
    public function transform($text)
    {
        $this->reset();
        $text = $this->prepare($text);
        $text = $this->processBlocks($text);
        $text = $this->cleanup($text);
        return $this->render($text);
    }

    /**
     *
     * Resets Markdown and all its plugins for a new transformation.
     *
     * Clears the HTML token table, its counter, and the named link
     * references, then calls reset() on every loaded plugin. This is to
     * support work with multiple separate text sources.
     *
     * @return void
     *
     */
    public function reset()
    {
        // reset from previous transformations
        $this->_count = 0;
        $this->_html  = array();
        $this->_link  = array();

        // reset the plugins
        foreach ($this->_plugin as $plugin) {
            $plugin->reset();
        }
    }

    /**
     *
     * Prepares the text for processing by running the prepare()
     * method of each "prepare" plugin, in order, on the source text.
     *
     * This does not reset anything; call reset() first when starting on
     * a new text source.
     *
     * @param string $text The source text.
     *
     * @return string The source text after processing.
     *
     */
    public function prepare($text)
    {
        // let each plugin prepare the source text for parsing
        foreach ($this->_prepare_class as $class) {
            $text = $this->_plugin[$class]->prepare($text);
        }
        return $text;
    }

    /**
     *
     * Runs the source text through all block-type plugins.
     *
     * @param string $text The source text.
     *
     * @return string The source text after processing.
     *
     */
    public function processBlocks($text)
    {
        foreach ($this->_block_class as $class) {
            $text = $this->_plugin[$class]->parse($text);
        }
        return $text;
    }

    /**
     *
     * Runs the processed text through each plugin's cleanup() method.
     *
     * This is so that the plugins can "clean up" after all main
     * processing has been completed.
     *
     * @param string $text The processed text.
     *
     * @return string The processed text after cleanup.
     *
     */
    public function cleanup($text)
    {
        // let each plugin clean up the rendered source
        foreach ($this->_cleanup_class as $class) {
            $text = $this->_plugin[$class]->cleanup($text);
        }
        return $text;
    }

    /**
     *
     * Returns a final rendering of the processed text.
     *
     * This replaces any remaining HTML tokens, un-encodes special
     * Markdown characters, and optionally runs the text through
     * [Tidy][].
     *
     * [Tidy]: http://php.net/tidy
     *
     * @param string $text The processed and cleaned text.
     *
     * @return string The final rendering of the text.
     *
     */
    public function render($text)
    {
        // replace any remaining HTML tokens
        $text = $this->unHtmlToken($text);

        // replace all special chars in the text.
        $text = $this->unEncode($text);

        if (! $this->_config['tidy']) {
            // tidy explicitly disabled
            return $text;
        }

        // tidy up the text
        $tidy = new tidy;
        $opts = (array) $this->_config['tidy'];
        $tidy->parseString($text, $opts, 'utf8');
        $tidy->cleanRepair();

        // get only the body portion
        $body = trim(tidy_get_body($tidy)->value);

        // remove <body> and </body>: 6 leading and 7 trailing characters
        return substr($body, 6, -7);
    }

    // -----------------------------------------------------------------
    //
    // General support methods for plugins.
    //
    // -----------------------------------------------------------------

    /**
     *
     * Runs the source text through all span-type plugins.
     *
     * Generally this is called by block-type plugins, not by the
     * Markdown engine directly.
     *
     * @param string $text The source text.
     *
     * @return string The source text after processing.
     *
     */
    public function processSpans($text)
    {
        foreach ($this->_span_class as $class) {
            $text = $this->_plugin[$class]->parse($text);
        }
        return $text;
    }

    /**
     *
     * Returns the number of spaces per tab.
     *
     * @return int
     *
     */
    public function getTabWidth()
    {
        return (int) $this->_config['tab_width'];
    }

    // -----------------------------------------------------------------
    //
    // HTML token processing support methods for plugins.
    //
    // -----------------------------------------------------------------

    /**
     *
     * Saves a piece of text as HTML and returns a delimited token.
     *
     * @param string $text The text to retain as HTML.
     *
     * @return string An HTML token: the current token count wrapped in
     * the HTML token delimiters.
     *
     */
    public function toHtmlToken($text)
    {
        $key = $this->_html_ldelim
             . $this->_count
             . $this->_html_rdelim;

        $this->_html[$this->_count ++] = $text;
        return $key;
    }

    /**
     *
     * Is a piece of text a delimited HTML token?
     *
     * @param string $text The text to check.
     *
     * @return bool True if a token, false if not.
     *
     */
    public function isHtmlToken($text)
    {
        // preg_match() returns 1, 0, or false; report a real boolean
        // as documented.
        $regex = "/^{$this->_html_ldelim}.*?{$this->_html_rdelim}$/";
        return preg_match($regex, $text) === 1;
    }

    /**
     *
     * Replaces all HTML tokens in source text with saved HTML.
     *
     * Repeats until no tokens remain, so that tokens contained in the
     * saved HTML itself are replaced too. A token with no saved HTML is
     * replaced with an empty string.
     *
     * @param string $text The text to do replacements in.
     *
     * @return string The source text with HTML in place of tokens.
     *
     */
    public function unHtmlToken($text)
    {
        $regex = "/{$this->_html_ldelim}(.*?){$this->_html_rdelim}/";
        while (preg_match_all($regex, $text, $matches, PREG_SET_ORDER)) {
            foreach ($matches as $val) {
                // $val[0] is the whole token, $val[1] the token number
                $html = isset($this->_html[$val[1]])
                      ? $this->_html[$val[1]]
                      : '';
                $text = str_replace($val[0], $html, $text);
            }
        }
        return $text;
    }

    // -----------------------------------------------------------------
    //
    // Escaping for HTML and encoding for special Markdown characters.
    //
    // -----------------------------------------------------------------

    /**
     *
     * Escapes HTML in source text.
     *
     * Uses htmlspecialchars() with UTF-8.
     *
     * @param string $text Source text.
     *
     * @param int $quotes How to escape quotes; default ENT_COMPAT.
     *
     * @return string The escaped text.
     *
     */
    public function escape($text, $quotes = ENT_COMPAT)
    {
        return htmlspecialchars($text, $quotes, 'UTF-8');
    }

    /**
     *
     * Encodes special Markdown characters so they are not recognized
     * when parsing.
     *
     * The text is split into tags and the text between them. By default,
     * every special character inside a tag is encoded, while between
     * tags only backslashed special characters are encoded.
     *
     * @param string $text The source text.
     *
     * @param bool $only_backslash Only encode backslashed characters,
     * inside tags as well as between them.
     *
     * @return string The encoded text.
     *
     */
    public function encode($text, $only_backslash = false)
    {
        $out = '';
        foreach ($this->_explodeTags($text) as $item) {
            // use the full escape table only inside tags, and only when
            // not restricted to backslashed characters.
            $in_tag = ! $only_backslash && $item[0] === 'tag';
            $out .= $this->_encode($item[1], $in_tag);
        }
        return $out;
    }

    /**
     *
     * Un-encodes special Markdown characters.
     *
     * @param string $text The text with encoded characters.
     *
     * @return string The un-encoded text.
     *
     */
    public function unEncode($text)
    {
        // because the bs_esc table uses the same values (just
        // different keys), this will catch both regular and
        // backslashed chars.
        return strtr($text, array_flip($this->_esc));
    }

    /**
     *
     * Support method for encode().
     *
     * @param string $text The source text.
     *
     * @param bool $in_tag Escaping inside a tag? If so, all special
     * characters are encoded; if not, only backslashed ones.
     *
     * @return string The encoded text.
     *
     */
    protected function _encode($text, $in_tag = false)
    {
        if ($in_tag) {
            // inside a tag
            $chars = $this->_esc;
        } else {
            // outside a tag, or between tags
            $chars = $this->_bs_esc;
        }

        // strtr() scans the text once from left to right, so an escaped
        // backslash followed by a special character ("\\.") is consumed
        // as "\\" first. str_replace() applies the pairs in table order
        // and would treat the second backslash as escaping the dot.
        return strtr($text, $chars);
    }

    /**
     *
     * Explodes source text into tags and text.
     *
     * Regular expression derived from the _tokenize() subroutine in
     * Brad Choate's [MTRegex][] plugin.
     *
     * [MTRegex]: http://www.bradchoate.com/past/mtregex.php
     *
     * From the original notes ...
     *
     * > Returns an array of the tokens comprising the input string.
     * > Each token is either a tag (possibly with nested, tags
     * > contained therein, such as <a href="<MTFoo>">, or a run of
     * > text between tags. Each element of the array is a
     * > two-element array; the first is either 'tag' or 'text'; the
     * > second is the actual value.
     *
     * Empty runs of text (for example between two adjacent tags) are
     * not included in the result.
     *
     * @param string $str A string of HTML.
     *
     * @return array The string exploded into tag and non-tag portions.
     *
     */
    protected function _explodeTags($str)
    {
        $match = '(?s:<!(?:--.*?--\s*)+>)|'   // comment
               . '(?s:<\?.*?\?>)|'            // processing instruction
               // regular tags
               . '(?:<[/!$]?[-a-zA-Z0-9:]+\b(?>[^"\'>]+|"[^"]*"|\'[^\']*\')*>)';

        $parts = preg_split(
            '{(' . $match . ')}',
            $str,
            -1,
            PREG_SPLIT_DELIM_CAPTURE
        );

        $list = array();

        if ($parts === false) {
            // the regex engine failed; keep the input as plain text
            // rather than silently dropping it.
            if ((string) $str !== '') {
                $list[] = array('text', $str);
            }
            return $list;
        }

        // with the delimiters captured, the parts alternate between
        // text (1st, 3rd, ...) and tag (2nd, 4th, ...).
        $index = 0;
        foreach ($parts as $part) {
            $is_text = (++$index % 2) === 1;
            if (! $is_text) {
                $list[] = array('tag', $part);
            } elseif ($part != '') {
                $list[] = array('text', $part);
            }
        }

        return $list;
    }

    // -----------------------------------------------------------------
    //
    // Support methods for plugins, related to named link references.
    //
    // -----------------------------------------------------------------

    /**
     *
     * Sets the value of a named link reference.
     *
     * @param string $name The link reference name.
     *
     * @param string $href The URI this link points to.
     *
     * @param string $title Alternate title for the link.
     *
     * @return void
     *
     */
    public function setLink($name, $href, $title = null)
    {
        $this->_link[$name] = array(
            'href'  => $href,
            'title' => $title,
        );
    }

    /**
     *
     * Returns the href and title of a named link reference.
     *
     * @param string $name The link reference name.
     *
     * @return bool|array If the link reference exists, returns an
     * array with keys 'href' and 'title'. If not, returns false.
     *
     */
    public function getLink($name)
    {
        if (! empty($this->_link[$name])) {
            return $this->_link[$name];
        } else {
            return false;
        }
    }

    /**
     *
     * Returns an array of all defined link references.
     *
     * @return array All links keyed by name, where each element is an
     * array with keys 'href' and 'title'.
     *
     */
    public function getLinks()
    {
        return $this->_link;
    }
}