inc/Parsing/Lexer/Lexer.php


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349

<?php

/**
 * Lexer adapted from Simple Test: http://sourceforge.net/projects/simpletest/
 * For an intro to the Lexer see:
 * https://web.archive.org/web/20120125041816/http://www.phppatterns.com/docs/develop/simple_test_lexer_notes
 *
 * @author Marcus Baker http://www.lastcraft.com
 */

namespace dokuwiki\Parsing\Lexer;

/**
 * Accepts text and breaks it into tokens.
 *
 * Patterns are registered per parsing mode; all patterns of one mode are collected in a single
 * ParallelRegex so that the content only has to be scanned by the PHP regex engine once per token.
 *
 * Labels beginning with an underscore are used internally ("__exit" to leave a mode, "_<mode>" for
 * single-token special modes), therefore lexer modes must not start with leading underscores.
 */
class Lexer
{
    /** @var ParallelRegex[] one combined regex per mode, keyed by mode name */
    protected $regexes = [];
    /** @var \Doku_Handler receiver of all token callbacks */
    protected $handler;
    /** @var StateStack stack of the currently nested modes */
    protected $modeStack;
    /** @var array mode "rewrites": mode name => name of the handler method to call instead */
    protected $mode_handlers = [];
    /** @var bool case sensitive? */
    protected $case;

    /**
     * Sets up the lexer. Matching is case insensitive unless $case is set to true.
     *
     * @param \Doku_Handler $handler  Handling strategy; its methods are called for each token.
     * @param string $start            Starting mode (also the name of the first handler method).
     * @param boolean $case            True for case sensitive.
     */
    public function __construct($handler, $start = "accept", $case = false)
    {
        $this->case = $case;
        $this->handler = $handler;
        $this->modeStack = new StateStack($start);
    }

    /**
     * Adds a token search pattern for a particular parsing mode.
     *
     * The pattern does not change the current mode.
     *
     * @param string $pattern      Perl style regex, but ( and )
     *                             lose the usual meaning.
     * @param string $mode         Should only apply this
     *                             pattern when dealing with
     *                             this type of input.
     */
    public function addPattern($pattern, $mode = "accept")
    {
        $this->regexForMode($mode)->addPattern($pattern);
    }

    /**
     * Adds a pattern that will enter a new parsing mode.
     *
     * Useful for entering parenthesis, strings, tags, etc.
     *
     * @param string $pattern      Perl style regex, but ( and ) lose the usual meaning.
     * @param string $mode         Should only apply this pattern when dealing with this type of input.
     * @param string $new_mode     Change parsing to this new nested mode.
     */
    public function addEntryPattern($pattern, $mode, $new_mode)
    {
        $this->regexForMode($mode)->addPattern($pattern, $new_mode);
    }

    /**
     * Adds a pattern that will exit the current mode and re-enter the previous one.
     *
     * @param string $pattern      Perl style regex, but ( and ) lose the usual meaning.
     * @param string $mode         Mode to leave.
     */
    public function addExitPattern($pattern, $mode)
    {
        // "__exit" is the magic label recognised by isModeEnd()
        $this->regexForMode($mode)->addPattern($pattern, "__exit");
    }

    /**
     * Adds a pattern that has a special mode.
     *
     * Acts as an entry and exit pattern in one go, effectively calling a special
     * parser handler for this token only.
     *
     * @param string $pattern      Perl style regex, but ( and ) lose the usual meaning.
     * @param string $mode         Should only apply this pattern when dealing with this type of input.
     * @param string $special      Use this mode for this one token.
     */
    public function addSpecialPattern($pattern, $mode, $special)
    {
        // the leading underscore marks the label as a single-token mode, see isSpecialMode()
        $this->regexForMode($mode)->addPattern($pattern, "_$special");
    }

    /**
     * Adds a mapping from a mode to another handler.
     *
     * @param string $mode        Mode to be remapped.
     * @param string $handler     New target handler.
     */
    public function mapHandler($mode, $handler)
    {
        $this->mode_handlers[$mode] = $handler;
    }

    /**
     * Splits the given text into tokens.
     *
     * Will fail if the handlers report an error or if no content is consumed. If successful then each
     * unparsed and parsed token invokes a call to the held handler.
     *
     * @param string $raw        Raw text to tokenise.
     * @return boolean           True on success, else false.
     */
    public function parse($raw)
    {
        if (! isset($this->handler)) {
            return false;
        }
        $initialLength = strlen($raw);
        $length = $initialLength;
        $pos = 0;
        // reduce() eats the consumed part off the front of $raw on every successful match
        while (is_array($parsed = $this->reduce($raw))) {
            [$unmatched, $matched, $mode] = $parsed;
            $currentLength = strlen($raw);
            // byte offset of the match = everything consumed so far minus the match itself
            $matchPos = $initialLength - $currentLength - strlen($matched);
            if (! $this->dispatchTokens($unmatched, $matched, $mode, $pos, $matchPos)) {
                return false;
            }
            // nothing was consumed in this round: bail out instead of looping forever
            if ($currentLength === $length) {
                return false;
            }
            $length = $currentLength;
            $pos = $initialLength - $currentLength;
        }
        // reduce() returned false: the current mode has no patterns registered
        if (!$parsed) {
            return false;
        }
        // whatever is left over did not match any pattern and is passed on as unmatched text
        return $this->invokeHandler($raw, DOKU_LEXER_UNMATCHED, $pos);
    }

    /**
     * Gives plugins access to the mode stack
     *
     * @return StateStack
     */
    public function getModeStack()
    {
        return $this->modeStack;
    }

    /**
     * Sends the matched token and any leading unmatched
     * text to the parser changing the lexer to a new
     * mode if one is listed.
     *
     * @param string $unmatched Unmatched leading portion.
     * @param string $matched Actual token match.
     * @param bool|string $mode Label of the matched pattern: "__exit" leaves the current mode, a label
     *                          with a leading underscore is a single-token special mode, any other
     *                          string is the mode to enter. A non-string value causes no mode change.
     * @param int $initialPos Byte index in the raw document where the unmatched portion starts
     * @param int $matchPos Byte index in the raw document where the matched token starts
     * @return boolean             False if there was any error from the parser.
     */
    protected function dispatchTokens($unmatched, $matched, $mode, $initialPos, $matchPos)
    {
        if (! $this->invokeHandler($unmatched, DOKU_LEXER_UNMATCHED, $initialPos)) {
            return false;
        }
        // the exit check has to come first, "__exit" would also qualify as a special mode label
        if ($this->isModeEnd($mode)) {
            if (! $this->invokeHandler($matched, DOKU_LEXER_EXIT, $matchPos)) {
                return false;
            }
            return $this->modeStack->leave();
        }
        if ($this->isSpecialMode($mode)) {
            // enter the mode for this single token only and leave it again right away
            $this->modeStack->enter($this->decodeSpecial($mode));
            if (! $this->invokeHandler($matched, DOKU_LEXER_SPECIAL, $matchPos)) {
                return false;
            }
            return $this->modeStack->leave();
        }
        if (is_string($mode)) {
            $this->modeStack->enter($mode);
            return $this->invokeHandler($matched, DOKU_LEXER_ENTER, $matchPos);
        }
        return $this->invokeHandler($matched, DOKU_LEXER_MATCHED, $matchPos);
    }

    /**
     * Tests to see if the new mode is actually to leave the current mode and pop an item from the matching
     * mode stack.
     *
     * @param bool|string $mode    Mode to test.
     * @return boolean             True if this is the exit mode.
     */
    protected function isModeEnd($mode)
    {
        return ($mode === "__exit");
    }

    /**
     * Test to see if the mode is one where this mode is entered for this token only and automatically
     * leaves immediately afterwards.
     *
     * @param bool|string $mode    Mode to test.
     * @return boolean             True if this is a single-token (special) mode.
     */
    protected function isSpecialMode($mode)
    {
        // dispatchTokens() calls this before its own is_string() check, so $mode may be a boolean
        // here; test the type explicitly instead of relying on implicit string coercion
        return is_string($mode) && str_starts_with($mode, '_');
    }

    /**
     * Strips the magic underscore marking single token modes.
     *
     * @param string $mode    Mode to decode.
     * @return string         Underlying mode name.
     */
    protected function decodeSpecial($mode)
    {
        return substr($mode, 1);
    }

    /**
     * Calls the parser method named after the current mode.
     *
     * Empty content will be ignored. The lexer has a parser handler for each mode in the lexer.
     *
     * @param string $content Text parsed.
     * @param int|bool $is_match Lexer state of the token; the callers in this class pass one of the
     *                           DOKU_LEXER_* constants.
     * @param int $pos Current byte index location in raw doc
     *                             that is being parsed
     * @return bool
     */
    protected function invokeHandler($content, $is_match, $pos)
    {
        if (($content === "") || ($content === false)) {
            return true;
        }
        $handler = $this->modeStack->getCurrent();
        // apply a rewrite registered through mapHandler(), if any
        if (isset($this->mode_handlers[$handler])) {
            $handler = $this->mode_handlers[$handler];
        }

        // modes starting with plugin_ are all handled by the same
        // handler but with an additional parameter
        if (str_starts_with($handler, 'plugin_')) {
            [$handler, $plugin] = sexplode('_', $handler, 2, '');
            return $this->handler->$handler($content, $is_match, $pos, $plugin);
        }

        return $this->handler->$handler($content, $is_match, $pos);
    }

    /**
     * Tries to match a chunk of text and if successful removes the recognised chunk and any leading
     * unparsed data. Empty strings will not be matched.
     *
     * @param string $raw         The subject to parse. This is the content that will be eaten.
     * @return array|bool         Three item list of unparsed content followed by the
     *                            recognised token and finally the action the parser is to take.
     *                            True if no match, false if there is a parsing error.
     */
    protected function reduce(&$raw)
    {
        $current = $this->modeStack->getCurrent();
        // a mode without any registered pattern can not be parsed
        if (! isset($this->regexes[$current])) {
            return false;
        }
        if ($raw === "") {
            return true;
        }
        if ($action = $this->regexes[$current]->split($raw, $split)) {
            // the third part is the remaining text, it replaces the caller's $raw
            [$unparsed, $match, $raw] = $split;
            return [$unparsed, $match, $action];
        }
        return true;
    }

    /**
     * Escapes regex characters other than (, ) and /
     *
     * Each of the characters \ . + * ? [ ^ ] $ { } = ! < > | : is prefixed with a backslash.
     *
     * @param string $str
     * @return string
     */
    public static function escape($str)
    {
        // single pass: "\\\\" in the replacement is one literal backslash, $0 is the matched character
        return preg_replace('/[\\\\.+*?\[^\]${}=!<>|:]/', '\\\\$0', $str);
    }

    /**
     * Returns the combined regex of the given mode, creating it on first use.
     *
     * @param string $mode     Mode the patterns belong to.
     * @return ParallelRegex
     */
    private function regexForMode($mode)
    {
        if (! isset($this->regexes[$mode])) {
            $this->regexes[$mode] = new ParallelRegex($this->case);
        }
        return $this->regexes[$mode];
    }
}