about summary refs log tree commit diff stats homepage
path: root/inc/Utf8
diff options
context:
space:
mode:
Diffstat (limited to 'inc/Utf8')
-rw-r--r-- inc/Utf8/Asian.php 3
-rw-r--r-- inc/Utf8/Clean.php 1
-rw-r--r-- inc/Utf8/Conversion.php 14
-rw-r--r-- inc/Utf8/PhpString.php 15
-rw-r--r-- inc/Utf8/Sort.php 6
-rw-r--r-- inc/Utf8/Table.php 1
-rw-r--r-- inc/Utf8/Unicode.php 157
-rw-r--r-- inc/Utf8/tables/case.php 2
-rw-r--r-- inc/Utf8/tables/loweraccents.php 2
-rw-r--r-- inc/Utf8/tables/romanization.php 33
-rw-r--r-- inc/Utf8/tables/specials.php 2
-rw-r--r-- inc/Utf8/tables/upperaccents.php 2
12 files changed, 85 insertions, 153 deletions
diff --git a/inc/Utf8/Asian.php b/inc/Utf8/Asian.php
index c7baa3029..75406594a 100644
--- a/inc/Utf8/Asian.php
+++ b/inc/Utf8/Asian.php
@@ -11,13 +11,12 @@ namespace dokuwiki\Utf8;
*/
class Asian
{
-
/**
* This defines a non-capturing group for the use in regular expressions to match any asian character that
* needs to be treated as a word. Uses the Unicode-Ranges for Asian characters taken from
* http://en.wikipedia.org/wiki/Unicode_block
*/
- const REGEXP =
+ public const REGEXP =
'(?:' .
'[\x{0E00}-\x{0E7F}]' . // Thai
diff --git a/inc/Utf8/Clean.php b/inc/Utf8/Clean.php
index 0975ff559..434da7043 100644
--- a/inc/Utf8/Clean.php
+++ b/inc/Utf8/Clean.php
@@ -200,5 +200,4 @@ class Clean
return $i;
}
-
}
diff --git a/inc/Utf8/Conversion.php b/inc/Utf8/Conversion.php
index fad9cd0b1..acde3fb2f 100644
--- a/inc/Utf8/Conversion.php
+++ b/inc/Utf8/Conversion.php
@@ -7,7 +7,6 @@ namespace dokuwiki\Utf8;
*/
class Conversion
{
-
/**
* Encodes UTF-8 characters to HTML entities
*
@@ -58,14 +57,14 @@ class Conversion
if (!$entities) {
return preg_replace_callback(
'/(&#([Xx])?([0-9A-Za-z]+);)/m',
- [__CLASS__, 'decodeNumericEntity'],
+ [self::class, 'decodeNumericEntity'],
$str
);
}
return preg_replace_callback(
'/&(#)?([Xx])?([0-9A-Za-z]+);/m',
- [__CLASS__, 'decodeAnyEntity'],
+ [self::class, 'decodeAnyEntity'],
$str
);
}
@@ -84,9 +83,7 @@ class Conversion
$table = get_html_translation_table(HTML_ENTITIES);
$table = array_flip($table);
$table = array_map(
- static function ($c) {
- return Unicode::toUtf8(array(ord($c)));
- },
+ static fn($c) => Unicode::toUtf8([ord($c)]),
$table
);
}
@@ -116,10 +113,10 @@ class Conversion
$cp = hexdec($ent[3]);
break;
default:
- $cp = intval($ent[3]);
+ $cp = (int) $ent[3];
break;
}
- return Unicode::toUtf8(array($cp));
+ return Unicode::toUtf8([$cp]);
}
/**
@@ -158,5 +155,4 @@ class Conversion
$uni = unpack('n*', $str);
return Unicode::toUtf8($uni);
}
-
}
diff --git a/inc/Utf8/PhpString.php b/inc/Utf8/PhpString.php
index d382f14aa..6d9a8d547 100644
--- a/inc/Utf8/PhpString.php
+++ b/inc/Utf8/PhpString.php
@@ -7,7 +7,6 @@ namespace dokuwiki\Utf8;
*/
class PhpString
{
-
/**
* A locale independent basename() implementation
*
@@ -138,29 +137,21 @@ class PhpString
if ($length === null) {
$length_pattern = '(.*)$'; // the rest of the string
} else {
-
if (!isset($strlen)) $strlen = self::strlen($str); // see notes
if ($offset > $strlen) return ''; // another trivial case
if ($length > 0) {
-
// reduce any length that would go past the end of the string
$length = min($strlen - $offset, $length);
-
$Lx = (int)($length / 65535);
$Ly = $length % 65535;
-
// +ve length requires ... a captured group of length characters
if ($Lx) $length_pattern = '(?:.{65535}){' . $Lx . '}';
$length_pattern = '(' . $length_pattern . '.{' . $Ly . '})';
-
- } else if ($length < 0) {
-
+ } elseif ($length < 0) {
if ($length < ($offset - $strlen)) return '';
-
$Lx = (int)((-$length) / 65535);
$Ly = (-$length) % 65535;
-
// -ve length requires ... capture everything except a group of -length characters
// anchored at the tail-end of the string
if ($Lx) $length_pattern = '(?:.{65535}){' . $Lx . '}';
@@ -268,7 +259,7 @@ class PhpString
*/
public static function strtolower($string)
{
- if($string === null) return ''; // pre-8.1 behaviour
+ if ($string === null) return ''; // pre-8.1 behaviour
if (UTF8_MBSTRING) {
if (class_exists('Normalizer', $autoload = false)) {
return \Normalizer::normalize(mb_strtolower($string, 'utf-8'));
@@ -379,6 +370,4 @@ class PhpString
return $length;
}
-
-
}
diff --git a/inc/Utf8/Sort.php b/inc/Utf8/Sort.php
index decb058a7..9f066b050 100644
--- a/inc/Utf8/Sort.php
+++ b/inc/Utf8/Sort.php
@@ -55,7 +55,9 @@ class Sort
'Collator created with locale "' . $lc . '": numeric collation on, ' .
'valid locale "' . $collator->getLocale(\Locale::VALID_LOCALE) . '", ' .
'actual locale "' . $collator->getLocale(\Locale::ACTUAL_LOCALE) . '"',
- null, __FILE__, __LINE__
+ null,
+ __FILE__,
+ __LINE__
);
self::$collators[$lc] = $collator;
}
@@ -128,7 +130,7 @@ class Sort
{
$collator = self::getCollator();
if (isset($collator)) {
- return uksort($array, array($collator, 'compare'));
+ return uksort($array, [$collator, 'compare']);
} else {
return ksort($array, SORT_NATURAL | SORT_FLAG_CASE);
}
diff --git a/inc/Utf8/Table.php b/inc/Utf8/Table.php
index 8683c9238..f618c69ea 100644
--- a/inc/Utf8/Table.php
+++ b/inc/Utf8/Table.php
@@ -9,7 +9,6 @@ namespace dokuwiki\Utf8;
*/
class Table
{
-
/**
* Get the upper to lower case conversion table
*
diff --git a/inc/Utf8/Unicode.php b/inc/Utf8/Unicode.php
index 4b6426533..31faf95d7 100644
--- a/inc/Utf8/Unicode.php
+++ b/inc/Utf8/Unicode.php
@@ -7,7 +7,6 @@ namespace dokuwiki\Utf8;
*/
class Unicode
{
-
/**
* Takes an UTF-8 string and returns an array of ints representing the
* Unicode characters. Astral planes are supported ie. the ints in the
@@ -40,45 +39,39 @@ class Unicode
$mUcs4 = 0; // cached Unicode character
$mBytes = 1; // cached expected number of octets in the current sequence
- $out = array();
+ $out = [];
$len = strlen($str);
for ($i = 0; $i < $len; $i++) {
-
$in = ord($str[$i]);
if ($mState === 0) {
-
// When mState is zero we expect either a US-ASCII character or a
// multi-octet sequence.
if (0 === (0x80 & $in)) {
// US-ASCII, pass straight through.
$out[] = $in;
$mBytes = 1;
-
- } else if (0xC0 === (0xE0 & $in)) {
+ } elseif (0xC0 === (0xE0 & $in)) {
// First octet of 2 octet sequence
$mUcs4 = $in;
$mUcs4 = ($mUcs4 & 0x1F) << 6;
$mState = 1;
$mBytes = 2;
-
- } else if (0xE0 === (0xF0 & $in)) {
+ } elseif (0xE0 === (0xF0 & $in)) {
// First octet of 3 octet sequence
$mUcs4 = $in;
$mUcs4 = ($mUcs4 & 0x0F) << 12;
$mState = 2;
$mBytes = 3;
-
- } else if (0xF0 === (0xF8 & $in)) {
+ } elseif (0xF0 === (0xF8 & $in)) {
// First octet of 4 octet sequence
$mUcs4 = $in;
$mUcs4 = ($mUcs4 & 0x07) << 18;
$mState = 3;
$mBytes = 4;
-
- } else if (0xF8 === (0xFC & $in)) {
+ } elseif (0xF8 === (0xFC & $in)) {
/* First octet of 5 octet sequence.
*
* This is illegal because the encoded codepoint must be either
@@ -91,14 +84,12 @@ class Unicode
$mUcs4 = ($mUcs4 & 0x03) << 24;
$mState = 4;
$mBytes = 5;
-
- } else if (0xFC === (0xFE & $in)) {
+ } elseif (0xFC === (0xFE & $in)) {
// First octet of 6 octet sequence, see comments for 5 octet sequence.
$mUcs4 = $in;
$mUcs4 = ($mUcs4 & 1) << 30;
$mState = 5;
$mBytes = 6;
-
} elseif ($strict) {
/* Current octet is neither in the US-ASCII range nor a legal first
* octet of a multi-octet sequence.
@@ -109,76 +100,67 @@ class Unicode
E_USER_WARNING
);
return false;
-
}
-
- } else {
-
+ } elseif (0x80 === (0xC0 & $in)) {
// When mState is non-zero, we expect a continuation of the multi-octet
// sequence
- if (0x80 === (0xC0 & $in)) {
-
- // Legal continuation.
- $shift = ($mState - 1) * 6;
- $tmp = $in;
- $tmp = ($tmp & 0x0000003F) << $shift;
- $mUcs4 |= $tmp;
-
- /**
- * End of the multi-octet sequence. mUcs4 now contains the final
- * Unicode codepoint to be output
+ // Legal continuation.
+ $shift = ($mState - 1) * 6;
+ $tmp = $in;
+ $tmp = ($tmp & 0x0000003F) << $shift;
+ $mUcs4 |= $tmp;
+ /**
+ * End of the multi-octet sequence. mUcs4 now contains the final
+ * Unicode codepoint to be output
+ */
+ if (0 === --$mState) {
+ /*
+ * Check for illegal sequences and codepoints.
*/
- if (0 === --$mState) {
-
- /*
- * Check for illegal sequences and codepoints.
- */
- // From Unicode 3.1, non-shortest form is illegal
- if (((2 === $mBytes) && ($mUcs4 < 0x0080)) ||
- ((3 === $mBytes) && ($mUcs4 < 0x0800)) ||
- ((4 === $mBytes) && ($mUcs4 < 0x10000)) ||
- (4 < $mBytes) ||
- // From Unicode 3.2, surrogate characters are illegal
- (($mUcs4 & 0xFFFFF800) === 0xD800) ||
- // Codepoints outside the Unicode range are illegal
- ($mUcs4 > 0x10FFFF)) {
-
- if ($strict) {
- trigger_error(
- 'utf8_to_unicode: Illegal sequence or codepoint ' .
- 'in UTF-8 at byte ' . $i,
- E_USER_WARNING
- );
-
- return false;
- }
-
+ // From Unicode 3.1, non-shortest form is illegal
+ if (
+ ((2 === $mBytes) && ($mUcs4 < 0x0080)) ||
+ ((3 === $mBytes) && ($mUcs4 < 0x0800)) ||
+ ((4 === $mBytes) && ($mUcs4 < 0x10000)) ||
+ (4 < $mBytes) ||
+ // From Unicode 3.2, surrogate characters are illegal
+ (($mUcs4 & 0xFFFFF800) === 0xD800) ||
+ // Codepoints outside the Unicode range are illegal
+ ($mUcs4 > 0x10FFFF)
+ ) {
+ if ($strict) {
+ trigger_error(
+ 'utf8_to_unicode: Illegal sequence or codepoint ' .
+ 'in UTF-8 at byte ' . $i,
+ E_USER_WARNING
+ );
+
+ return false;
}
-
- if (0xFEFF !== $mUcs4) {
- // BOM is legal but we don't want to output it
- $out[] = $mUcs4;
- }
-
- //initialize UTF8 cache
- $mState = 0;
- $mUcs4 = 0;
- $mBytes = 1;
}
- } elseif ($strict) {
- /**
- *((0xC0 & (*in) != 0x80) && (mState != 0))
- * Incomplete multi-octet sequence.
- */
- trigger_error(
- 'utf8_to_unicode: Incomplete multi-octet ' .
- ' sequence in UTF-8 at byte ' . $i,
- E_USER_WARNING
- );
+ if (0xFEFF !== $mUcs4) {
+ // BOM is legal but we don't want to output it
+ $out[] = $mUcs4;
+ }
- return false;
+ //initialize UTF8 cache
+ $mState = 0;
+ $mUcs4 = 0;
+ $mBytes = 1;
}
+ } elseif ($strict) {
+ /**
+ *((0xC0 & (*in) != 0x80) && (mState != 0))
+ * Incomplete multi-octet sequence.
+ */
+ trigger_error(
+ 'utf8_to_unicode: Incomplete multi-octet ' .
+ ' sequence in UTF-8 at byte ' . $i,
+ E_USER_WARNING
+ );
+
+ return false;
}
}
return $out;
@@ -215,25 +197,18 @@ class Unicode
ob_start();
foreach (array_keys($arr) as $k) {
-
if (($arr[$k] >= 0) && ($arr[$k] <= 0x007f)) {
# ASCII range (including control chars)
-
echo chr($arr[$k]);
-
- } else if ($arr[$k] <= 0x07ff) {
+ } elseif ($arr[$k] <= 0x07ff) {
# 2 byte sequence
-
echo chr(0xc0 | ($arr[$k] >> 6));
echo chr(0x80 | ($arr[$k] & 0x003f));
-
- } else if ($arr[$k] == 0xFEFF) {
+ } elseif ($arr[$k] == 0xFEFF) {
# Byte order mark (skip)
// nop -- zap the BOM
-
- } else if ($arr[$k] >= 0xD800 && $arr[$k] <= 0xDFFF) {
+ } elseif ($arr[$k] >= 0xD800 && $arr[$k] <= 0xDFFF) {
# Test for illegal surrogates
-
// found a surrogate
if ($strict) {
trigger_error(
@@ -243,24 +218,18 @@ class Unicode
);
return false;
}
-
- } else if ($arr[$k] <= 0xffff) {
+ } elseif ($arr[$k] <= 0xffff) {
# 3 byte sequence
-
echo chr(0xe0 | ($arr[$k] >> 12));
echo chr(0x80 | (($arr[$k] >> 6) & 0x003f));
echo chr(0x80 | ($arr[$k] & 0x003f));
-
- } else if ($arr[$k] <= 0x10ffff) {
+ } elseif ($arr[$k] <= 0x10ffff) {
# 4 byte sequence
-
echo chr(0xf0 | ($arr[$k] >> 18));
echo chr(0x80 | (($arr[$k] >> 12) & 0x3f));
echo chr(0x80 | (($arr[$k] >> 6) & 0x3f));
echo chr(0x80 | ($arr[$k] & 0x3f));
-
} elseif ($strict) {
-
trigger_error(
'unicode_to_utf8: Codepoint out of Unicode range ' .
'at index: ' . $k . ', value: ' . $arr[$k],
diff --git a/inc/Utf8/tables/case.php b/inc/Utf8/tables/case.php
index 6c41b5808..ca5b48078 100644
--- a/inc/Utf8/tables/case.php
+++ b/inc/Utf8/tables/case.php
@@ -1,4 +1,5 @@
<?php
+
/**
* UTF-8 Case lookup table
*
@@ -7,6 +8,7 @@
*
* @author Andreas Gohr <andi@splitbrain.org>
*/
+
return [
'A' => 'a',
'B' => 'b',
diff --git a/inc/Utf8/tables/loweraccents.php b/inc/Utf8/tables/loweraccents.php
index 789379b64..74cbff321 100644
--- a/inc/Utf8/tables/loweraccents.php
+++ b/inc/Utf8/tables/loweraccents.php
@@ -1,4 +1,5 @@
<?php
+
/**
* UTF-8 lookup table for lower case accented letters
*
@@ -8,6 +9,7 @@
* @author Andreas Gohr <andi@splitbrain.org>
* @see \dokuwiki\Utf8\Clean::deaccent()
*/
+
return [
'á' => 'a',
'à' => 'a',
diff --git a/inc/Utf8/tables/romanization.php b/inc/Utf8/tables/romanization.php
index b15a9baa9..006a70598 100644
--- a/inc/Utf8/tables/romanization.php
+++ b/inc/Utf8/tables/romanization.php
@@ -1,4 +1,5 @@
<?php
+
/**
* Romanization lookup table
*
@@ -24,6 +25,7 @@
* @link http://en.wikipedia.org/wiki/Royal_Thai_General_System_of_Transcription
* @link http://www.btranslations.com/resources/romanization/korean.asp
*/
+
return [
// scandinavian - differs from what we do in deaccent
'å' => 'a',
@@ -298,9 +300,6 @@ return [
// 3 character syllables, っ doubles the consonant after
'っちゃ' => 'ccha',
- 'っちぇ' => 'cche',
- 'っちょ' => 'ccho',
- 'っちゅ' => 'cchu',
'っびゃ' => 'bbya',
'っびぇ' => 'bbye',
'っびぃ' => 'bbyi',
@@ -313,7 +312,6 @@ return [
'っぴゅ' => 'ppyu',
'っちゃ' => 'ccha',
'っちぇ' => 'cche',
- 'っち' => 'cchi',
'っちょ' => 'ccho',
'っちゅ' => 'cchu',
// 'っひゃ'=>'hya',
@@ -348,7 +346,6 @@ return [
'っりゅ' => 'rryu',
'っしゃ' => 'ssha',
'っしぇ' => 'sshe',
- 'っし' => 'sshi',
'っしょ' => 'ssho',
'っしゅ' => 'sshu',
@@ -370,7 +367,6 @@ return [
'ふぉ' => 'fo',
'ちゃ' => 'cha',
'ちぇ' => 'che',
- 'ち' => 'chi',
'ちょ' => 'cho',
'ちゅ' => 'chu',
'ひゃ' => 'hya',
@@ -415,7 +411,6 @@ return [
'りゅ' => 'ryu',
'しゃ' => 'sha',
'しぇ' => 'she',
- 'し' => 'shi',
'しょ' => 'sho',
'しゅ' => 'shu',
'じゃ' => 'ja',
@@ -633,12 +628,10 @@ return [
'ッリュー' => 'rryuu',
'ッシャー' => 'sshaa',
'ッシェー' => 'sshee',
- 'ッシー' => 'sshii',
'ッショー' => 'sshoo',
'ッシュー' => 'sshuu',
'ッチャー' => 'cchaa',
'ッチェー' => 'cchee',
- 'ッチー' => 'cchii',
'ッチョー' => 'cchoo',
'ッチュー' => 'cchuu',
'ッティー' => 'ttii',
@@ -646,8 +639,6 @@ return [
// 3 character syllables - doubled vowels
'ファー' => 'faa',
- 'フェー' => 'fee',
- 'フィー' => 'fii',
'フォー' => 'foo',
'フャー' => 'fyaa',
'フェー' => 'fyee',
@@ -696,12 +687,10 @@ return [
'リュー' => 'ryuu',
'シャー' => 'shaa',
'シェー' => 'shee',
- 'シー' => 'shii',
'ショー' => 'shoo',
'シュー' => 'shuu',
'ジャー' => 'jaa',
'ジェー' => 'jee',
- 'ジー' => 'jii',
'ジョー' => 'joo',
'ジュー' => 'juu',
'スァー' => 'swaa',
@@ -716,19 +705,16 @@ return [
'デゥー' => 'duu',
'チャー' => 'chaa',
'チェー' => 'chee',
- 'チー' => 'chii',
'チョー' => 'choo',
'チュー' => 'chuu',
'ヂャー' => 'dyaa',
'ヂェー' => 'dyee',
- 'ヂィー' => 'dyii',
'ヂョー' => 'dyoo',
'ヂュー' => 'dyuu',
'ツャー' => 'tsaa',
'ツェー' => 'tsee',
'ツィー' => 'tsii',
'ツョー' => 'tsoo',
- 'ツー' => 'tsuu',
'トァー' => 'twaa',
'トェー' => 'twee',
'トィー' => 'twii',
@@ -740,13 +726,9 @@ return [
'ドォー' => 'dwoo',
'ドゥー' => 'dwuu',
'ウァー' => 'whaa',
- 'ウェー' => 'whee',
- 'ウィー' => 'whii',
'ウォー' => 'whoo',
'ウゥー' => 'whuu',
'ヴャー' => 'vyaa',
- 'ヴェー' => 'vyee',
- 'ヴィー' => 'vyii',
'ヴョー' => 'vyoo',
'ヴュー' => 'vyuu',
'ヴァー' => 'vaa',
@@ -798,12 +780,10 @@ return [
'ッリュ' => 'rryu',
'ッシャ' => 'ssha',
'ッシェ' => 'sshe',
- 'ッシ' => 'sshi',
'ッショ' => 'ssho',
'ッシュ' => 'sshu',
'ッチャ' => 'ccha',
'ッチェ' => 'cche',
- 'ッチ' => 'cchi',
'ッチョ' => 'ccho',
'ッチュ' => 'cchu',
'ッティ' => 'tti',
@@ -868,8 +848,6 @@ return [
// 2 character syllables - normal
'ファ' => 'fa',
- 'フェ' => 'fe',
- 'フィ' => 'fi',
'フォ' => 'fo',
'フゥ' => 'fu',
// 'フャ'=>'fya',
@@ -942,7 +920,6 @@ return [
'デゥ' => 'du',
'チャ' => 'cha',
'チェ' => 'che',
- 'チ' => 'chi',
'チョ' => 'cho',
'チュ' => 'chu',
// 'ヂャ'=>'dya',
@@ -954,7 +931,6 @@ return [
'ツェ' => 'tse',
'ツィ' => 'tsi',
'ツョ' => 'tso',
- 'ツ' => 'tsu',
'トァ' => 'twa',
'トェ' => 'twe',
'トィ' => 'twi',
@@ -966,13 +942,9 @@ return [
'ドォ' => 'dwo',
'ドゥ' => 'dwu',
'ウァ' => 'wha',
- 'ウェ' => 'whe',
- 'ウィ' => 'whi',
'ウォ' => 'who',
'ウゥ' => 'whu',
'ヴャ' => 'vya',
- 'ヴェ' => 'vye',
- 'ヴィ' => 'vyi',
'ヴョ' => 'vyo',
'ヴュ' => 'vyu',
'ヴァ' => 'va',
@@ -1414,7 +1386,6 @@ return [
'ำ' => 'am',
'ํา' => 'am',
'ิ' => 'i',
- 'ี' => 'i',
'ึ' => 'ue',
'ี' => 'ue',
'ุ' => 'u',
diff --git a/inc/Utf8/tables/specials.php b/inc/Utf8/tables/specials.php
index f6243bccd..2f9c14619 100644
--- a/inc/Utf8/tables/specials.php
+++ b/inc/Utf8/tables/specials.php
@@ -1,4 +1,5 @@
<?php
+
/**
* UTF-8 array of common special characters
*
@@ -13,6 +14,7 @@
* @author Andreas Gohr <andi@splitbrain.org>
* @see \dokuwiki\Utf8\Clean::stripspecials()
*/
+
return [
0x1a, // 
0x1b, // 
diff --git a/inc/Utf8/tables/upperaccents.php b/inc/Utf8/tables/upperaccents.php
index e6e48de2c..facda0c30 100644
--- a/inc/Utf8/tables/upperaccents.php
+++ b/inc/Utf8/tables/upperaccents.php
@@ -1,4 +1,5 @@
<?php
+
/**
* UTF-8 lookup table for upper case accented letters
*
@@ -8,6 +9,7 @@
* @author Andreas Gohr <andi@splitbrain.org>
* @see \dokuwiki\Utf8\Clean::deaccent()
*/
+
return [
'Á' => 'A',
'À' => 'A',