core/modules/search/search.module


1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347

<?php

/**
 * @file
 */

use Drupal\Component\Utility\Html;
use Drupal\Component\Utility\Unicode;
use Drupal\search\SearchTextProcessorInterface;

/**
 * Implements hook_theme_suggestions_HOOK().
 */
function search_theme_suggestions_search_result(array $variables): array {
  return ['search_result__' . $variables['plugin_id']];
}

/**
 * Implements hook_preprocess_HOOK() for block templates.
 */
function search_preprocess_block(&$variables): void {
  if ($variables['plugin_id'] == 'search_form_block') {
    $variables['attributes']['role'] = 'search';
  }
}

/**
 * @defgroup search Search interface
 * @{
 * The Drupal search interface manages a global search mechanism.
 *
 * Modules may plug into this system to provide searches of different types of
 * data. Most of the system is handled by the Search module, so this must be
 * enabled for all of the search features to work.
 *
 * There are two ways to interact with the search system:
 * - Specifically for searching nodes, you can implement
 *   hook_node_update_index() and hook_node_search_result(). However, note that
 *   the search system already indexes all visible output of a node; i.e.,
 *   everything displayed normally during node viewing. This is
 *   usually sufficient. You should only use this mechanism if you want
 *   additional, non-visible data to be indexed.
 * - Define a plugin implementing \Drupal\search\Plugin\SearchInterface and
 *   annotated as \Drupal\search\Annotation\SearchPlugin. This will create a
 *   search page type that users can use to set up one or more search pages.
 *   Each of these corresponds to a tab on the /search page, which can be
 *   used to perform searches. You will also need to implement the execute()
 *   method from the interface to perform the search. A base class is provided
 *   in \Drupal\search\Plugin\SearchPluginBase. For more information about
 *   plugins, see the @link plugin_api Plugin API topic. @endlink
 *
 * If your module needs to provide a more complicated search form, then you need
 * to implement it yourself. In that case, you may wish to define it as a local
 * task (tab) under the /search page (e.g. /search/my_module) so that users can
 * easily find it.
 *
 * @see plugin_api
 * @see annotation
 */

/**
 * Returns snippets from a piece of text, with search keywords highlighted.
 *
 * Used for formatting search results. All HTML tags will be stripped from
 * $text.
 *
 * @param string $keys
 *   A string containing a search query.
 * @param string $text
 *   The text to extract fragments from.
 * @param string|null $langcode
 *   Language code for the language of $text, if known.
 *
 * @return array
 *   A render array containing HTML for the excerpt.
 */
function search_excerpt($keys, $text, $langcode = NULL): array {
  // We highlight around non-indexable or CJK characters.
  $boundary_character = '[' . Unicode::PREG_CLASS_WORD_BOUNDARY . SearchTextProcessorInterface::PREG_CLASS_CJK . ']';
  $preceded_by_boundary = '(?<=' . $boundary_character . ')';
  $followed_by_boundary = '(?=' . $boundary_character . ')';

  // Extract positive keywords and phrases.
  preg_match_all('/ ("([^"]+)"|(?!OR)([^" ]+))/', ' ' . $keys, $matches);
  $keys = array_merge($matches[2], $matches[3]);

  // Prepare text by stripping HTML tags and decoding HTML entities.
  $text = strip_tags(str_replace(['<', '>'], [' <', '> '], $text));
  $text = Html::decodeEntities($text);
  $text_length = strlen($text);

  // Make a list of unique keywords that are actually found in the text,
  // which could be items in $keys or replacements that are equivalent through
  // \Drupal\search\SearchTextProcessorInterface::analyze().
  $temp_keys = [];
  foreach ($keys as $key) {
    $key = _search_find_match_with_simplify($key, $text, $boundary_character, $langcode);
    if (isset($key)) {
      // Quote slashes so they can be used in regular expressions.
      $temp_keys[] = preg_quote($key, '/');
    }
  }
  // Several keywords could have simplified down to the same thing, so pick
  // out the unique ones.
  $keys = array_unique($temp_keys);

  // Extract fragments of about 60 characters around keywords, bounded by word
  // boundary characters. Try to reach 256 characters, using second occurrences
  // if necessary.
  $ranges = [];
  $length = 0;
  $look_start = [];
  $remaining_keys = $keys;

  while ($length < 256 && !empty($remaining_keys)) {
    $found_keys = [];
    foreach ($remaining_keys as $key) {
      if ($length >= 256) {
        break;
      }

      // Remember where we last found $key, in case we are coming through a
      // second time.
      if (!isset($look_start[$key])) {
        $look_start[$key] = 0;
      }

      // See if we can find $key after where we found it the last time. Since
      // we are requiring a match on a word boundary, make sure $text starts
      // and ends with a space.
      $matches = [];
      if (preg_match('/' . $preceded_by_boundary . $key . $followed_by_boundary . '/iu', ' ' . $text . ' ', $matches, PREG_OFFSET_CAPTURE, $look_start[$key])) {
        $found_position = $matches[0][1];
        $look_start[$key] = $found_position + 1;
        // Keep track of which keys we found this time, in case we need to
        // pass through again to find more text.
        $found_keys[] = $key;

        // Locate a space before and after this match, leaving about 60
        // characters of context on each end.
        $before = strpos(' ' . $text, ' ', max(0, $found_position - 61));
        if ($before !== FALSE && $before <= $found_position) {
          if ($text_length > $found_position + 60) {
            $after = strrpos(substr($text, 0, $found_position + 60), ' ', $found_position);
          }
          else {
            $after = $text_length;
          }
          if ($after !== FALSE && $after > $found_position) {
            // Account for the spaces we added.
            $before = max($before - 1, 0);
            if ($before < $after) {
              // Save this range.
              $ranges[$before] = $after;
              $length += $after - $before;
            }
          }
        }
      }
    }
    // Next time through this loop, only look for keys we found this time,
    // if any.
    $remaining_keys = $found_keys;
  }

  if (empty($ranges)) {
    // We didn't find any keyword matches, so just return the first part of the
    // text. We also need to re-encode any HTML special characters that we
    // entity-decoded above.
    return [
      '#plain_text' => Unicode::truncate($text, 256, TRUE, TRUE),
    ];
  }

  // Sort the text ranges by starting position.
  ksort($ranges);

  // Collapse overlapping text ranges into one. The sorting makes it O(n).
  $new_ranges = [];
  $max_end = 0;
  foreach ($ranges as $this_from => $this_to) {
    $max_end = max($max_end, $this_to);
    if (!isset($working_from)) {
      // This is the first time through this loop: initialize.
      $working_from = $this_from;
      $working_to = $this_to;
      continue;
    }
    if ($this_from <= $working_to) {
      // The ranges overlap: combine them.
      $working_to = max($working_to, $this_to);
    }
    else {
      // The ranges do not overlap: save the working range and start a new one.
      $new_ranges[$working_from] = $working_to;
      $working_from = $this_from;
      $working_to = $this_to;
    }
  }
  // Save the remaining working range.
  $new_ranges[$working_from] = $working_to;

  // Fetch text within the combined ranges we found.
  $out = [];
  foreach ($new_ranges as $from => $to) {
    $out[] = substr($text, $from, $to - $from);
  }

  // Combine the text chunks with "…" separators. The "…" needs to be
  // translated. Let translators have the … separator text as one chunk.
  $ellipses = explode('@excerpt', t('… @excerpt … @excerpt …'));
  $text = (isset($new_ranges[0]) ? '' : $ellipses[0]) . implode($ellipses[1], $out) . (($max_end < strlen($text) - 1) ? $ellipses[2] : '');
  $text = Html::escape($text);

  // Highlight keywords. Must be done at once to prevent conflicts ('strong'
  // and '<strong>').
  $text = trim(preg_replace('/' . $preceded_by_boundary . '(?:' . implode('|', $keys) . ')' . $followed_by_boundary . '/iu', '<strong>\0</strong>', ' ' . $text . ' '));
  return [
    '#markup' => $text,
    '#allowed_tags' => ['strong'],
  ];
}

/**
 * @} End of "defgroup search".
 */

/**
 * Finds an appropriate keyword in text.
 *
 * @param string $key
 *   The keyword to find.
 * @param string $text
 *   The text to search for the keyword.
 * @param string $boundary
 *   Regular expression for the boundary character class (characters that
 *   indicate spaces between words).
 * @param string|null $langcode
 *   Language code for the language of $text, if known.
 *
 * @return string|null
 *   A segment of $text that is between word boundary characters that either
 *   matches $key directly, or matches $key when both this text segment and
 *   $key are processed by
 *   \Drupal\search\SearchTextProcessorInterface::analyze(). If a matching text
 *   segment is not located, NULL is returned.
 */
function _search_find_match_with_simplify($key, $text, $boundary, $langcode = NULL) {
  $preceded_by_boundary = '(?<=' . $boundary . ')';
  $followed_by_boundary = '(?=' . $boundary . ')';

  // See if $key appears as-is. When testing, make sure $text starts/ends with
  // a space, because we require $key to be surrounded by word boundary
  // characters.
  $temp = trim($key);
  if ($temp == '') {
    return NULL;
  }
  if (preg_match('/' . $preceded_by_boundary . preg_quote($temp, '/') . $followed_by_boundary . '/iu', ' ' . $text . ' ')) {
    return $temp;
  }

  // See if there is a match after lower-casing and removing diacritics in
  // both, which should preserve the string length.
  $new_text = mb_strtolower($text);
  $new_text = \Drupal::service('transliteration')->removeDiacritics($new_text);
  $new_key = mb_strtolower($temp);
  $new_key = \Drupal::service('transliteration')->removeDiacritics($new_key);
  if (preg_match('/' . $preceded_by_boundary . preg_quote($new_key, '/') . $followed_by_boundary . '/u', ' ' . $new_text . ' ')) {
    $position = mb_strpos($new_text, $new_key);
    return mb_substr($text, $position, mb_strlen($new_key));
  }

  // Run both text and key through text processor.
  /** @var \Drupal\search\SearchTextProcessorInterface $text_processor */
  $text_processor = \Drupal::service('search.text_processor');
  $simplified_key = trim($text_processor->analyze($key, $langcode));
  $simplified_text = trim($text_processor->analyze($text, $langcode));
  if ($simplified_key == '' || $simplified_text == '' || !str_contains($simplified_text, $simplified_key)) {
    // The simplified keyword and text do not match at all, or are empty.
    return NULL;
  }

  // Split $text into words, keeping track of where the word boundaries are.
  $words = preg_split('/' . $boundary . '+/u', $text, -1, PREG_SPLIT_OFFSET_CAPTURE);
  // Add an entry pointing to the end of the string, for the loop below.
  $words[] = ['', strlen($text)];

  // Using a binary search, find the earliest possible ending position in
  // $text where it will still match the keyword after applying
  // \Drupal\search\SearchTextProcessorInterface::analyze().
  $start_index = 0;
  $start_pos = $words[$start_index][1];
  $min_end_index = 1;
  $max_end_index = count($words) - 1;
  while ($max_end_index > $min_end_index) {
    // Check the index half way between min and max. See if we ended there,
    // if we would still have a match.
    $proposed_end_index = floor(($max_end_index + $min_end_index) / 2);
    $proposed_end_pos = $words[$proposed_end_index][1];
    // Since the split was done with preg_split(), the positions are byte counts
    // not character counts, so use substr() not mb_substr() here.
    $trial_text = trim($text_processor->analyze(substr($text, $start_pos, $proposed_end_pos - $start_pos), $langcode));
    if (str_contains($trial_text, $simplified_key)) {
      // The proposed endpoint is fine, text still matches.
      $max_end_index = $proposed_end_index;
    }
    else {
      // The proposed endpoint index is too early, so the earliest possible
      // OK ending point would be the next index.
      $min_end_index = $proposed_end_index + 1;
    }
  }

  // Now do the same for the starting position: using a binary search, find the
  // latest possible starting position in $text where it will still match the
  // keyword after applying
  // \Drupal\search\SearchTextProcessorInterface::analyze().
  $end_index = $min_end_index;
  $end_pos = $words[$end_index][1];
  $min_start_index = 0;
  $max_start_index = $end_index - 1;
  while ($max_start_index > $min_start_index) {
    // Check the index half way between min and max. See if we started there,
    // if we would still have a match.
    $proposed_start_index = ceil(($max_start_index + $min_start_index) / 2);
    $proposed_start_pos = $words[$proposed_start_index][1];
    // Since the split was done with preg_split(), the positions are byte counts
    // not character counts, so use substr() not mb_substr() here.
    $trial_text = trim($text_processor->analyze(substr($text, $proposed_start_pos, $end_pos - $proposed_start_pos), $langcode));
    if (str_contains($trial_text, $simplified_key)) {
      // The proposed start point is fine, text still matches.
      $min_start_index = $proposed_start_index;
    }
    else {
      // The proposed start point index is too late, so the latest possible
      // OK starting point would be the previous index.
      $max_start_index = $proposed_start_index - 1;
    }
  }
  $start_index = $max_start_index;

  // Return the matching text. We need to use substr() here and not the
  // mb_substr() function, because the indices in $words came from preg_split(),
  // so they are Unicode-safe byte positions, not character positions.
  return trim(substr($text, $words[$start_index][1], $words[$end_index][1] - $words[$start_index][1]));
}