diff options
author | Andreas Gohr <gohr@cosmocode.de> | 2019-10-10 09:55:14 +0200 |
---|---|---|
committer | Andreas Gohr <gohr@cosmocode.de> | 2019-10-10 09:55:14 +0200 |
commit | 31a58aba4c24b34c34ad5764d1a35b7c398c3a2c (patch) | |
tree | 7f4d1546fbb69863a7d366fc1ff647f784853b68 /inc/Parsing/Lexer/Lexer.php | |
parent | af7ba5aa0bd10fc0ad9ef983006305b4c5a8ed42 (diff) | |
parent | c0c77cd20b23921c9e893bb70b99f38be153875a (diff) | |
download | dokuwiki-31a58aba4c24b34c34ad5764d1a35b7c398c3a2c.tar.gz dokuwiki-31a58aba4c24b34c34ad5764d1a35b7c398c3a2c.zip |
Merge branch 'psr2'
* psr2: (160 commits)
fixed merge error
Moved parts of the Asian word handling to its own class
ignore snake_case error of substr_replace
fixed some line length errors
ignore PSR2 in the old form class
fix PSR2 error in switch statement
replaced deprecated utf8 functions
formatting cleanup
mark old utf8 functions deprecated
some more PSR2 cleanup
Some cleanup for the UTF-8 stuff
Moved all utf8 methods to their own namespaced classes
Create separate table files for UTF-8 handling
Ignore mixed concerns in loader
Use type safe comparisons in loader
Remove obsolete include
adjust phpcs exclude patterns for new plugin classes
🚚 Move Subscription class to deprecated.php
♻️ Split up ChangesSubscriptionSender into multiple classes
Minor optimizations in PluginController
...
Diffstat (limited to 'inc/Parsing/Lexer/Lexer.php')
-rw-r--r-- | inc/Parsing/Lexer/Lexer.php | 347 |
1 files changed, 347 insertions, 0 deletions
<?php
/**
 * Lexer adapted from Simple Test: http://sourceforge.net/projects/simpletest/
 * For an intro to the Lexer see:
 * https://web.archive.org/web/20120125041816/http://www.phppatterns.com/docs/develop/simple_test_lexer_notes
 *
 * @author Marcus Baker http://www.lastcraft.com
 */

namespace dokuwiki\Parsing\Lexer;

// FIXME move elsewhere

// Token states passed to the handler as its second argument. Each define is
// guarded so that loading this file when the constants already exist does not
// raise a redefinition notice/warning; the resulting values are the same.
defined('DOKU_LEXER_ENTER') || define('DOKU_LEXER_ENTER', 1);         // an entry pattern matched
defined('DOKU_LEXER_MATCHED') || define('DOKU_LEXER_MATCHED', 2);     // a plain pattern matched
defined('DOKU_LEXER_UNMATCHED') || define('DOKU_LEXER_UNMATCHED', 3); // text between matches
defined('DOKU_LEXER_EXIT') || define('DOKU_LEXER_EXIT', 4);           // an exit pattern matched
defined('DOKU_LEXER_SPECIAL') || define('DOKU_LEXER_SPECIAL', 5);     // a special pattern matched

/**
 * Accepts text and breaks it into tokens.
 *
 * Some optimisation to make sure the content is only scanned by the PHP regex
 * parser once. Lexer modes must not start with leading underscores, because a
 * leading underscore is used internally to mark special (single token) modes
 * and "__exit" marks the end of a mode.
 */
class Lexer
{
    /** @var ParallelRegex[] pattern collections, indexed by mode name */
    protected $regexes;
    /** @var \Doku_Handler */
    protected $handler;
    /** @var StateStack */
    protected $modeStack;
    /** @var array mode "rewrites": mode name => handler method name */
    protected $mode_handlers;
    /** @var bool case sensitive? */
    protected $case;

    /**
     * Sets up the lexer in case insensitive matching by default.
     *
     * @param \Doku_Handler $handler Handling strategy; its methods are called for each token.
     * @param string $start Starting mode.
     * @param boolean $case True for case sensitive.
     */
    public function __construct($handler, $start = "accept", $case = false)
    {
        $this->case = $case;
        $this->regexes = array();
        $this->handler = $handler;
        $this->modeStack = new StateStack($start);
        $this->mode_handlers = array();
    }

    /**
     * Adds a token search pattern for a particular parsing mode.
     *
     * The pattern does not change the current mode.
     *
     * @param string $pattern Perl style regex, but ( and ) lose the usual meaning.
     * @param string $mode Should only apply this pattern when dealing with this type of input.
     */
    public function addPattern($pattern, $mode = "accept")
    {
        $this->regexFor($mode)->addPattern($pattern);
    }

    /**
     * Adds a pattern that will enter a new parsing mode.
     *
     * Useful for entering parenthesis, strings, tags, etc.
     *
     * @param string $pattern Perl style regex, but ( and ) lose the usual meaning.
     * @param string $mode Should only apply this pattern when dealing with this type of input.
     * @param string $new_mode Change parsing to this new nested mode.
     */
    public function addEntryPattern($pattern, $mode, $new_mode)
    {
        $this->regexFor($mode)->addPattern($pattern, $new_mode);
    }

    /**
     * Adds a pattern that will exit the current mode and re-enter the previous one.
     *
     * @param string $pattern Perl style regex, but ( and ) lose the usual meaning.
     * @param string $mode Mode to leave.
     */
    public function addExitPattern($pattern, $mode)
    {
        $this->regexFor($mode)->addPattern($pattern, "__exit");
    }

    /**
     * Adds a pattern that has a special mode.
     *
     * Acts as an entry and exit pattern in one go, effectively calling a special
     * parser handler for this token only.
     *
     * @param string $pattern Perl style regex, but ( and ) lose the usual meaning.
     * @param string $mode Should only apply this pattern when dealing with this type of input.
     * @param string $special Use this mode for this one token.
     */
    public function addSpecialPattern($pattern, $mode, $special)
    {
        // the leading underscore is the marker tested by isSpecialMode()
        $this->regexFor($mode)->addPattern($pattern, "_$special");
    }

    /**
     * Adds a mapping from a mode to another handler.
     *
     * @param string $mode Mode to be remapped.
     * @param string $handler New target handler.
     */
    public function mapHandler($mode, $handler)
    {
        $this->mode_handlers[$mode] = $handler;
    }

    /**
     * Splits the page text into tokens.
     *
     * Will fail if the handlers report an error or if no content is consumed. If successful then each
     * unparsed and parsed token invokes a call to the held listener.
     *
     * @param string $raw Raw text to tokenise.
     * @return boolean True on success, else false.
     */
    public function parse($raw)
    {
        if (!isset($this->handler)) {
            return false;
        }

        $initialLength = strlen($raw);
        $length = $initialLength;
        $pos = 0;

        // reduce() eats the leading part of $raw on every successful match
        while (is_array($parsed = $this->reduce($raw))) {
            list($unmatched, $matched, $mode) = $parsed;
            $currentLength = strlen($raw);
            // byte offset at which the matched token starts in the original text
            $matchPos = $initialLength - $currentLength - strlen($matched);

            if (!$this->dispatchTokens($unmatched, $matched, $mode, $pos, $matchPos)) {
                return false;
            }
            if ($currentLength === $length) {
                // nothing was consumed in this round; bail out instead of looping forever
                return false;
            }

            $length = $currentLength;
            $pos = $initialLength - $currentLength;
        }

        // reduce() returned a boolean: false signals a parsing error
        if (!$parsed) {
            return false;
        }

        // whatever is left over did not match any pattern
        return $this->invokeHandler($raw, DOKU_LEXER_UNMATCHED, $pos);
    }

    /**
     * Sends the matched token and any leading unmatched
     * text to the parser changing the lexer to a new
     * mode if one is listed.
     *
     * @param string $unmatched Unmatched leading portion.
     * @param string $matched Actual token match.
     * @param bool|string $mode Mode after match. A non-string mode causes no change.
     * @param int $initialPos Byte index in the raw document at which $unmatched starts.
     * @param int $matchPos Byte index in the raw document at which $matched starts.
     * @return boolean False if there was any error from the parser.
     */
    protected function dispatchTokens($unmatched, $matched, $mode, $initialPos, $matchPos)
    {
        if (!$this->invokeHandler($unmatched, DOKU_LEXER_UNMATCHED, $initialPos)) {
            return false;
        }

        // exit pattern: report the token in the mode being left, then pop that mode
        if ($this->isModeEnd($mode)) {
            if (!$this->invokeHandler($matched, DOKU_LEXER_EXIT, $matchPos)) {
                return false;
            }
            return $this->modeStack->leave();
        }

        // special pattern: enter the mode for this single token only
        if ($this->isSpecialMode($mode)) {
            $this->modeStack->enter($this->decodeSpecial($mode));
            if (!$this->invokeHandler($matched, DOKU_LEXER_SPECIAL, $matchPos)) {
                return false;
            }
            return $this->modeStack->leave();
        }

        // entry pattern: push the new mode before reporting the token
        if (is_string($mode)) {
            $this->modeStack->enter($mode);
            return $this->invokeHandler($matched, DOKU_LEXER_ENTER, $matchPos);
        }

        // plain pattern: the mode stays as it is
        return $this->invokeHandler($matched, DOKU_LEXER_MATCHED, $matchPos);
    }

    /**
     * Tests to see if the new mode is actually to leave the current mode and pop an item from the matching
     * mode stack.
     *
     * @param bool|string $mode Mode to test.
     * @return boolean True if this is the exit mode.
     */
    protected function isModeEnd($mode)
    {
        return $mode === "__exit";
    }

    /**
     * Test to see if the mode is one where this mode is entered for this token only and automatically
     * leaves immediately afterwards.
     *
     * Only the leading underscore is checked, so the exit marker has to be ruled out with
     * isModeEnd() before calling this.
     *
     * @param bool|string $mode Mode to test.
     * @return boolean True if this is a special (single token) mode.
     */
    protected function isSpecialMode($mode)
    {
        // plain patterns arrive here as a non-string mode; answer explicitly
        // instead of relying on an implicit cast to string
        return is_string($mode) && strncmp($mode, "_", 1) === 0;
    }

    /**
     * Strips the magic underscore marking single token modes.
     *
     * @param string $mode Mode to decode.
     * @return string Underlying mode name.
     */
    protected function decodeSpecial($mode)
    {
        return substr($mode, 1);
    }

    /**
     * Calls the parser method named after the current mode.
     *
     * Empty content will be ignored. The lexer has a parser handler for each mode in the lexer.
     *
     * @param string|false $content Text parsed.
     * @param int $is_match One of the DOKU_LEXER_* state constants describing the token.
     * @param int $pos Current byte index location in the raw document that is being parsed.
     * @return bool Result of the handler method, or true when the content was empty.
     */
    protected function invokeHandler($content, $is_match, $pos)
    {
        if ($content === "" || $content === false) {
            return true;
        }

        $handler = $this->modeStack->getCurrent();
        if (isset($this->mode_handlers[$handler])) {
            $handler = $this->mode_handlers[$handler];
        }

        // modes starting with plugin_ are all handled by the same
        // handler but with an additional parameter
        if (strncmp($handler, 'plugin_', 7) === 0) {
            // "plugin_foo_bar" becomes method "plugin" and extra argument "foo_bar"
            list($handler, $plugin) = explode('_', $handler, 2);
            return $this->handler->$handler($content, $is_match, $pos, $plugin);
        }

        return $this->handler->$handler($content, $is_match, $pos);
    }

    /**
     * Tries to match a chunk of text and if successful removes the recognised chunk and any leading
     * unparsed data. Empty strings will not be matched.
     *
     * @param string $raw The subject to parse. This is the content that will be eaten.
     * @return array|bool Three item list of unparsed content followed by the
     *                    recognised token and finally the action the parser is to take.
     *                    True if no match, false if there is a parsing error.
     */
    protected function reduce(&$raw)
    {
        $current = $this->modeStack->getCurrent();

        // a mode without any registered pattern is a parsing error
        if (!isset($this->regexes[$current])) {
            return false;
        }
        if ($raw === "") {
            return true;
        }

        $action = $this->regexes[$current]->split($raw, $split);
        if (!$action) {
            return true;
        }

        // the third element is the remaining text; it replaces the caller's $raw
        list($unparsed, $match, $raw) = $split;
        return array($unparsed, $match, $action);
    }

    /**
     * Escapes regex characters other than (, ) and /
     *
     * Every occurrence of \ . + * ? [ ^ ] $ { } = ! < > | : is prefixed with a backslash.
     *
     * @param string $str
     * @return string
     */
    public static function escape($str)
    {
        // one pass over the subject; "\\\\" in the replacement is a literal
        // backslash and $0 is the matched character itself
        return preg_replace('/[\\\\.+*?\[\^\]${}=!<>|:]/', '\\\\$0', $str);
    }

    /**
     * Returns the pattern collection for a mode, creating it on first use.
     *
     * @param string $mode Mode whose patterns are requested.
     * @return ParallelRegex
     */
    private function regexFor($mode)
    {
        if (!isset($this->regexes[$mode])) {
            $this->regexes[$mode] = new ParallelRegex($this->case);
        }
        return $this->regexes[$mode];
    }
}