diff options
Diffstat (limited to 'inc/Utf8')
-rw-r--r-- | inc/Utf8/Asian.php | 3 | ||||
-rw-r--r-- | inc/Utf8/Clean.php | 1 | ||||
-rw-r--r-- | inc/Utf8/Conversion.php | 14 | ||||
-rw-r--r-- | inc/Utf8/PhpString.php | 15 | ||||
-rw-r--r-- | inc/Utf8/Sort.php | 6 | ||||
-rw-r--r-- | inc/Utf8/Table.php | 1 | ||||
-rw-r--r-- | inc/Utf8/Unicode.php | 157 | ||||
-rw-r--r-- | inc/Utf8/tables/case.php | 2 | ||||
-rw-r--r-- | inc/Utf8/tables/loweraccents.php | 2 | ||||
-rw-r--r-- | inc/Utf8/tables/romanization.php | 33 | ||||
-rw-r--r-- | inc/Utf8/tables/specials.php | 2 | ||||
-rw-r--r-- | inc/Utf8/tables/upperaccents.php | 2 |
12 files changed, 85 insertions, 153 deletions
diff --git a/inc/Utf8/Asian.php b/inc/Utf8/Asian.php index c7baa3029..75406594a 100644 --- a/inc/Utf8/Asian.php +++ b/inc/Utf8/Asian.php @@ -11,13 +11,12 @@ namespace dokuwiki\Utf8; */ class Asian { - /** * This defines a non-capturing group for the use in regular expressions to match any asian character that * needs to be treated as a word. Uses the Unicode-Ranges for Asian characters taken from * http://en.wikipedia.org/wiki/Unicode_block */ - const REGEXP = + public const REGEXP = '(?:' . '[\x{0E00}-\x{0E7F}]' . // Thai diff --git a/inc/Utf8/Clean.php b/inc/Utf8/Clean.php index 0975ff559..434da7043 100644 --- a/inc/Utf8/Clean.php +++ b/inc/Utf8/Clean.php @@ -200,5 +200,4 @@ class Clean return $i; } - } diff --git a/inc/Utf8/Conversion.php b/inc/Utf8/Conversion.php index fad9cd0b1..acde3fb2f 100644 --- a/inc/Utf8/Conversion.php +++ b/inc/Utf8/Conversion.php @@ -7,7 +7,6 @@ namespace dokuwiki\Utf8; */ class Conversion { - /** * Encodes UTF-8 characters to HTML entities * @@ -58,14 +57,14 @@ class Conversion if (!$entities) { return preg_replace_callback( '/(&#([Xx])?([0-9A-Za-z]+);)/m', - [__CLASS__, 'decodeNumericEntity'], + [self::class, 'decodeNumericEntity'], $str ); } return preg_replace_callback( '/&(#)?([Xx])?([0-9A-Za-z]+);/m', - [__CLASS__, 'decodeAnyEntity'], + [self::class, 'decodeAnyEntity'], $str ); } @@ -84,9 +83,7 @@ class Conversion $table = get_html_translation_table(HTML_ENTITIES); $table = array_flip($table); $table = array_map( - static function ($c) { - return Unicode::toUtf8(array(ord($c))); - }, + static fn($c) => Unicode::toUtf8([ord($c)]), $table ); } @@ -116,10 +113,10 @@ class Conversion $cp = hexdec($ent[3]); break; default: - $cp = intval($ent[3]); + $cp = (int) $ent[3]; break; } - return Unicode::toUtf8(array($cp)); + return Unicode::toUtf8([$cp]); } /** @@ -158,5 +155,4 @@ class Conversion $uni = unpack('n*', $str); return Unicode::toUtf8($uni); } - } diff --git a/inc/Utf8/PhpString.php b/inc/Utf8/PhpString.php index d382f14aa..6d9a8d547 100644 --- a/inc/Utf8/PhpString.php +++ b/inc/Utf8/PhpString.php @@ -7,7 +7,6 @@ namespace dokuwiki\Utf8; */ class PhpString { - /** * A locale independent basename() implementation * @@ -138,29 +137,21 @@ class PhpString if ($length === null) { $length_pattern = '(.*)$'; // the rest of the string } else { - if (!isset($strlen)) $strlen = self::strlen($str); // see notes if ($offset > $strlen) return ''; // another trivial case if ($length > 0) { - // reduce any length that would go past the end of the string $length = min($strlen - $offset, $length); - $Lx = (int)($length / 65535); $Ly = $length % 65535; - // +ve length requires ... a captured group of length characters if ($Lx) $length_pattern = '(?:.{65535}){' . $Lx . '}'; $length_pattern = '(' . $length_pattern . '.{' . $Ly . '})'; - - } else if ($length < 0) { - + } elseif ($length < 0) { if ($length < ($offset - $strlen)) return ''; - $Lx = (int)((-$length) / 65535); $Ly = (-$length) % 65535; - // -ve length requires ... capture everything except a group of -length characters // anchored at the tail-end of the string if ($Lx) $length_pattern = '(?:.{65535}){' . $Lx . '}'; @@ -268,7 +259,7 @@ class PhpString */ public static function strtolower($string) { - if($string === null) return ''; // pre-8.1 behaviour + if ($string === null) return ''; // pre-8.1 behaviour if (UTF8_MBSTRING) { if (class_exists('Normalizer', $autoload = false)) { return \Normalizer::normalize(mb_strtolower($string, 'utf-8')); @@ -379,6 +370,4 @@ class PhpString return $length; } - - } diff --git a/inc/Utf8/Sort.php b/inc/Utf8/Sort.php index decb058a7..9f066b050 100644 --- a/inc/Utf8/Sort.php +++ b/inc/Utf8/Sort.php @@ -55,7 +55,9 @@ class Sort 'Collator created with locale "' . $lc . '": numeric collation on, ' . 'valid locale "' . $collator->getLocale(\Locale::VALID_LOCALE) . '", ' . 'actual locale "' . $collator->getLocale(\Locale::ACTUAL_LOCALE) . '"', - null, __FILE__, __LINE__ + null, + __FILE__, + __LINE__ ); self::$collators[$lc] = $collator; } @@ -128,7 +130,7 @@ class Sort { $collator = self::getCollator(); if (isset($collator)) { - return uksort($array, array($collator, 'compare')); + return uksort($array, [$collator, 'compare']); } else { return ksort($array, SORT_NATURAL | SORT_FLAG_CASE); } diff --git a/inc/Utf8/Table.php b/inc/Utf8/Table.php index 8683c9238..f618c69ea 100644 --- a/inc/Utf8/Table.php +++ b/inc/Utf8/Table.php @@ -9,7 +9,6 @@ namespace dokuwiki\Utf8; */ class Table { - /** * Get the upper to lower case conversion table * diff --git a/inc/Utf8/Unicode.php b/inc/Utf8/Unicode.php index 4b6426533..31faf95d7 100644 --- a/inc/Utf8/Unicode.php +++ b/inc/Utf8/Unicode.php @@ -7,7 +7,6 @@ namespace dokuwiki\Utf8; */ class Unicode { - /** * Takes an UTF-8 string and returns an array of ints representing the * Unicode characters. Astral planes are supported ie. the ints in the @@ -40,45 +39,39 @@ class Unicode $mUcs4 = 0; // cached Unicode character $mBytes = 1; // cached expected number of octets in the current sequence - $out = array(); + $out = []; $len = strlen($str); for ($i = 0; $i < $len; $i++) { - $in = ord($str[$i]); if ($mState === 0) { - // When mState is zero we expect either a US-ASCII character or a // multi-octet sequence. if (0 === (0x80 & $in)) { // US-ASCII, pass straight through. $out[] = $in; $mBytes = 1; - - } else if (0xC0 === (0xE0 & $in)) { + } elseif (0xC0 === (0xE0 & $in)) { // First octet of 2 octet sequence $mUcs4 = $in; $mUcs4 = ($mUcs4 & 0x1F) << 6; $mState = 1; $mBytes = 2; - - } else if (0xE0 === (0xF0 & $in)) { + } elseif (0xE0 === (0xF0 & $in)) { // First octet of 3 octet sequence $mUcs4 = $in; $mUcs4 = ($mUcs4 & 0x0F) << 12; $mState = 2; $mBytes = 3; - - } else if (0xF0 === (0xF8 & $in)) { + } elseif (0xF0 === (0xF8 & $in)) { // First octet of 4 octet sequence $mUcs4 = $in; $mUcs4 = ($mUcs4 & 0x07) << 18; $mState = 3; $mBytes = 4; - - } else if (0xF8 === (0xFC & $in)) { + } elseif (0xF8 === (0xFC & $in)) { /* First octet of 5 octet sequence. * * This is illegal because the encoded codepoint must be either @@ -91,14 +84,12 @@ class Unicode $mUcs4 = ($mUcs4 & 0x03) << 24; $mState = 4; $mBytes = 5; - - } else if (0xFC === (0xFE & $in)) { + } elseif (0xFC === (0xFE & $in)) { // First octet of 6 octet sequence, see comments for 5 octet sequence. $mUcs4 = $in; $mUcs4 = ($mUcs4 & 1) << 30; $mState = 5; $mBytes = 6; - } elseif ($strict) { /* Current octet is neither in the US-ASCII range nor a legal first * octet of a multi-octet sequence. @@ -109,76 +100,67 @@ class Unicode E_USER_WARNING ); return false; - } - - } else { - + } elseif (0x80 === (0xC0 & $in)) { // When mState is non-zero, we expect a continuation of the multi-octet // sequence - if (0x80 === (0xC0 & $in)) { - - // Legal continuation. - $shift = ($mState - 1) * 6; - $tmp = $in; - $tmp = ($tmp & 0x0000003F) << $shift; - $mUcs4 |= $tmp; - - /** - * End of the multi-octet sequence. mUcs4 now contains the final - * Unicode codepoint to be output + // Legal continuation. + $shift = ($mState - 1) * 6; + $tmp = $in; + $tmp = ($tmp & 0x0000003F) << $shift; + $mUcs4 |= $tmp; + /** + * End of the multi-octet sequence. mUcs4 now contains the final + * Unicode codepoint to be output + */ + if (0 === --$mState) { + /* + * Check for illegal sequences and codepoints. */ - if (0 === --$mState) { - - /* - * Check for illegal sequences and codepoints. - */ - // From Unicode 3.1, non-shortest form is illegal - if (((2 === $mBytes) && ($mUcs4 < 0x0080)) || - ((3 === $mBytes) && ($mUcs4 < 0x0800)) || - ((4 === $mBytes) && ($mUcs4 < 0x10000)) || - (4 < $mBytes) || - // From Unicode 3.2, surrogate characters are illegal - (($mUcs4 & 0xFFFFF800) === 0xD800) || - // Codepoints outside the Unicode range are illegal - ($mUcs4 > 0x10FFFF)) { - - if ($strict) { - trigger_error( - 'utf8_to_unicode: Illegal sequence or codepoint ' . - 'in UTF-8 at byte ' . $i, - E_USER_WARNING - ); - - return false; - } - + // From Unicode 3.1, non-shortest form is illegal + if ( + ((2 === $mBytes) && ($mUcs4 < 0x0080)) || + ((3 === $mBytes) && ($mUcs4 < 0x0800)) || + ((4 === $mBytes) && ($mUcs4 < 0x10000)) || + (4 < $mBytes) || + // From Unicode 3.2, surrogate characters are illegal + (($mUcs4 & 0xFFFFF800) === 0xD800) || + // Codepoints outside the Unicode range are illegal + ($mUcs4 > 0x10FFFF) + ) { + if ($strict) { + trigger_error( + 'utf8_to_unicode: Illegal sequence or codepoint ' . + 'in UTF-8 at byte ' . $i, + E_USER_WARNING + ); + + return false; } - - if (0xFEFF !== $mUcs4) { - // BOM is legal but we don't want to output it - $out[] = $mUcs4; - } - - //initialize UTF8 cache - $mState = 0; - $mUcs4 = 0; - $mBytes = 1; } - } elseif ($strict) { - /** - *((0xC0 & (*in) != 0x80) && (mState != 0)) - * Incomplete multi-octet sequence. - */ - trigger_error( - 'utf8_to_unicode: Incomplete multi-octet ' . - ' sequence in UTF-8 at byte ' . $i, - E_USER_WARNING - ); + if (0xFEFF !== $mUcs4) { + // BOM is legal but we don't want to output it + $out[] = $mUcs4; + } - return false; + //initialize UTF8 cache + $mState = 0; + $mUcs4 = 0; + $mBytes = 1; } + } elseif ($strict) { + /** + *((0xC0 & (*in) != 0x80) && (mState != 0)) + * Incomplete multi-octet sequence. + */ + trigger_error( + 'utf8_to_unicode: Incomplete multi-octet ' . + ' sequence in UTF-8 at byte ' . $i, + E_USER_WARNING + ); + + return false; } } return $out; @@ -215,25 +197,18 @@ class Unicode ob_start(); foreach (array_keys($arr) as $k) { - if (($arr[$k] >= 0) && ($arr[$k] <= 0x007f)) { # ASCII range (including control chars) - echo chr($arr[$k]); - - } else if ($arr[$k] <= 0x07ff) { + } elseif ($arr[$k] <= 0x07ff) { # 2 byte sequence - echo chr(0xc0 | ($arr[$k] >> 6)); echo chr(0x80 | ($arr[$k] & 0x003f)); - - } else if ($arr[$k] == 0xFEFF) { + } elseif ($arr[$k] == 0xFEFF) { # Byte order mark (skip) // nop -- zap the BOM - - } else if ($arr[$k] >= 0xD800 && $arr[$k] <= 0xDFFF) { + } elseif ($arr[$k] >= 0xD800 && $arr[$k] <= 0xDFFF) { # Test for illegal surrogates - // found a surrogate if ($strict) { trigger_error( @@ -243,24 +218,18 @@ class Unicode ); return false; } - - } else if ($arr[$k] <= 0xffff) { + } elseif ($arr[$k] <= 0xffff) { # 3 byte sequence - echo chr(0xe0 | ($arr[$k] >> 12)); echo chr(0x80 | (($arr[$k] >> 6) & 0x003f)); echo chr(0x80 | ($arr[$k] & 0x003f)); - - } else if ($arr[$k] <= 0x10ffff) { + } elseif ($arr[$k] <= 0x10ffff) { # 4 byte sequence - echo chr(0xf0 | ($arr[$k] >> 18)); echo chr(0x80 | (($arr[$k] >> 12) & 0x3f)); echo chr(0x80 | (($arr[$k] >> 6) & 0x3f)); echo chr(0x80 | ($arr[$k] & 0x3f)); - } elseif ($strict) { - trigger_error( 'unicode_to_utf8: Codepoint out of Unicode range ' . 'at index: ' . $k . ', value: ' . $arr[$k], diff --git a/inc/Utf8/tables/case.php b/inc/Utf8/tables/case.php index 6c41b5808..ca5b48078 100644 --- a/inc/Utf8/tables/case.php +++ b/inc/Utf8/tables/case.php @@ -1,4 +1,5 @@ <?php + /** * UTF-8 Case lookup table * @@ -7,6 +8,7 @@ * * @author Andreas Gohr <andi@splitbrain.org> */ + return [ 'A' => 'a', 'B' => 'b', diff --git a/inc/Utf8/tables/loweraccents.php b/inc/Utf8/tables/loweraccents.php index 789379b64..74cbff321 100644 --- a/inc/Utf8/tables/loweraccents.php +++ b/inc/Utf8/tables/loweraccents.php @@ -1,4 +1,5 @@ <?php + /** * UTF-8 lookup table for lower case accented letters * @@ -8,6 +9,7 @@ * @author Andreas Gohr <andi@splitbrain.org> * @see \dokuwiki\Utf8\Clean::deaccent() */ + return [ 'á' => 'a', 'à' => 'a', diff --git a/inc/Utf8/tables/romanization.php b/inc/Utf8/tables/romanization.php index b15a9baa9..006a70598 100644 --- a/inc/Utf8/tables/romanization.php +++ b/inc/Utf8/tables/romanization.php @@ -1,4 +1,5 @@ <?php + /** * Romanization lookup table * @@ -24,6 +25,7 @@ * @link http://en.wikipedia.org/wiki/Royal_Thai_General_System_of_Transcription * @link http://www.btranslations.com/resources/romanization/korean.asp */ + return [ // scandinavian - differs from what we do in deaccent 'å' => 'a', @@ -298,9 +300,6 @@ return [ // 3 character syllables, っ doubles the consonant after 'っちゃ' => 'ccha', - 'っちぇ' => 'cche', - 'っちょ' => 'ccho', - 'っちゅ' => 'cchu', 'っびゃ' => 'bbya', 'っびぇ' => 'bbye', 'っびぃ' => 'bbyi', @@ -313,7 +312,6 @@ return [ 'っぴゅ' => 'ppyu', 'っちゃ' => 'ccha', 'っちぇ' => 'cche', - 'っち' => 'cchi', 'っちょ' => 'ccho', 'っちゅ' => 'cchu', // 'っひゃ'=>'hya', @@ -348,7 +346,6 @@ return [ 'っりゅ' => 'rryu', 'っしゃ' => 'ssha', 'っしぇ' => 'sshe', - 'っし' => 'sshi', 'っしょ' => 'ssho', 'っしゅ' => 'sshu', @@ -370,7 +367,6 @@ return [ 'ふぉ' => 'fo', 'ちゃ' => 'cha', 'ちぇ' => 'che', - 'ち' => 'chi', 'ちょ' => 'cho', 'ちゅ' => 'chu', 'ひゃ' => 'hya', @@ -415,7 +411,6 @@ return [ 'りゅ' => 'ryu', 'しゃ' => 'sha', 'しぇ' => 'she', - 'し' => 'shi', 'しょ' => 'sho', 'しゅ' => 'shu', 'じゃ' => 'ja', @@ -633,12 +628,10 @@ return [ 'ッリュー' => 'rryuu', 'ッシャー' => 'sshaa', 'ッシェー' => 'sshee', - 'ッシー' => 'sshii', 'ッショー' => 'sshoo', 'ッシュー' => 'sshuu', 'ッチャー' => 'cchaa', 'ッチェー' => 'cchee', - 'ッチー' => 'cchii', 'ッチョー' => 'cchoo', 'ッチュー' => 'cchuu', 'ッティー' => 'ttii', @@ -646,8 +639,6 @@ return [ // 3 character syllables - doubled vowels 'ファー' => 'faa', - 'フェー' => 'fee', - 'フィー' => 'fii', 'フォー' => 'foo', 'フャー' => 'fyaa', 'フェー' => 'fyee', @@ -696,12 +687,10 @@ return [ 'リュー' => 'ryuu', 'シャー' => 'shaa', 'シェー' => 'shee', - 'シー' => 'shii', 'ショー' => 'shoo', 'シュー' => 'shuu', 'ジャー' => 'jaa', 'ジェー' => 'jee', - 'ジー' => 'jii', 'ジョー' => 'joo', 'ジュー' => 'juu', 'スァー' => 'swaa', @@ -716,19 +705,16 @@ return [ 'デゥー' => 'duu', 'チャー' => 'chaa', 'チェー' => 'chee', - 'チー' => 'chii', 'チョー' => 'choo', 'チュー' => 'chuu', 'ヂャー' => 'dyaa', 'ヂェー' => 'dyee', - 'ヂィー' => 'dyii', 'ヂョー' => 'dyoo', 'ヂュー' => 'dyuu', 'ツャー' => 'tsaa', 'ツェー' => 'tsee', 'ツィー' => 'tsii', 'ツョー' => 'tsoo', - 'ツー' => 'tsuu', 'トァー' => 'twaa', 'トェー' => 'twee', 'トィー' => 'twii', @@ -740,13 +726,9 @@ return [ 'ドォー' => 'dwoo', 'ドゥー' => 'dwuu', 'ウァー' => 'whaa', - 'ウェー' => 'whee', - 'ウィー' => 'whii', 'ウォー' => 'whoo', 'ウゥー' => 'whuu', 'ヴャー' => 'vyaa', - 'ヴェー' => 'vyee', - 'ヴィー' => 'vyii', 'ヴョー' => 'vyoo', 'ヴュー' => 'vyuu', 'ヴァー' => 'vaa', @@ -798,12 +780,10 @@ return [ 'ッリュ' => 'rryu', 'ッシャ' => 'ssha', 'ッシェ' => 'sshe', - 'ッシ' => 'sshi', 'ッショ' => 'ssho', 'ッシュ' => 'sshu', 'ッチャ' => 'ccha', 'ッチェ' => 'cche', - 'ッチ' => 'cchi', 'ッチョ' => 'ccho', 'ッチュ' => 'cchu', 'ッティ' => 'tti', @@ -868,8 +848,6 @@ return [ // 2 character syllables - normal 'ファ' => 'fa', - 'フェ' => 'fe', - 'フィ' => 'fi', 'フォ' => 'fo', 'フゥ' => 'fu', // 'フャ'=>'fya', @@ -942,7 +920,6 @@ return [ 'デゥ' => 'du', 'チャ' => 'cha', 'チェ' => 'che', - 'チ' => 'chi', 'チョ' => 'cho', 'チュ' => 'chu', // 'ヂャ'=>'dya', @@ -954,7 +931,6 @@ return [ 'ツェ' => 'tse', 'ツィ' => 'tsi', 'ツョ' => 'tso', - 'ツ' => 'tsu', 'トァ' => 'twa', 'トェ' => 'twe', 'トィ' => 'twi', @@ -966,13 +942,9 @@ return [ 'ドォ' => 'dwo', 'ドゥ' => 'dwu', 'ウァ' => 'wha', - 'ウェ' => 'whe', - 'ウィ' => 'whi', 'ウォ' => 'who', 'ウゥ' => 'whu', 'ヴャ' => 'vya', - 'ヴェ' => 'vye', - 'ヴィ' => 'vyi', 'ヴョ' => 'vyo', 'ヴュ' => 'vyu', 'ヴァ' => 'va', @@ -1414,7 +1386,6 @@ return [ 'ำ' => 'am', 'ํา' => 'am', 'ิ' => 'i', - 'ี' => 'i', 'ึ' => 'ue', 'ี' => 'ue', 'ุ' => 'u', diff --git a/inc/Utf8/tables/specials.php b/inc/Utf8/tables/specials.php index f6243bccd..2f9c14619 100644 --- a/inc/Utf8/tables/specials.php +++ b/inc/Utf8/tables/specials.php @@ -1,4 +1,5 @@ <?php + /** * UTF-8 array of common special characters * @@ -13,6 +14,7 @@ * @author Andreas Gohr <andi@splitbrain.org> * @see \dokuwiki\Utf8\Clean::stripspecials() */ + return [ 0x1a, // 0x1b, // diff --git a/inc/Utf8/tables/upperaccents.php b/inc/Utf8/tables/upperaccents.php index e6e48de2c..facda0c30 100644 --- a/inc/Utf8/tables/upperaccents.php +++ b/inc/Utf8/tables/upperaccents.php @@ -1,4 +1,5 @@ <?php + /** * UTF-8 lookup table for upper case accented letters * @@ -8,6 +9,7 @@ * @author Andreas Gohr <andi@splitbrain.org> * @see \dokuwiki\Utf8\Clean::deaccent() */ + return [ 'Á' => 'A', 'À' => 'A', |