<?php
/* ===========================================================================
 * Copyright 2018-2021 Zindex Software
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *    http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 * ============================================================================ */

namespace Opis\String;

use RuntimeException;
use OutOfBoundsException;
use Countable, ArrayAccess;
use JsonSerializable;
use Opis\String\Exception\{
    UnicodeException,
    InvalidStringException,
    InvalidCodePointException
};

/**
 * A string represented as a list of Unicode code points.
 *
 * Instances are created through the static factories {@see UnicodeString::from()}
 * and {@see UnicodeString::fromCodePoints()}. Most operations return a new
 * instance; the only in-place mutation is array-style assignment
 * ({@see UnicodeString::offsetSet()}).
 */
class UnicodeString implements Countable, ArrayAccess, JsonSerializable
{
    /** Mapping mode: leave code points untouched */
    const KEEP_CASE = 0;

    /** Mapping mode: apply the "lower" map */
    const LOWER_CASE = 1;

    /** Mapping mode: apply the "upper" map */
    const UPPER_CASE = 2;

    /** Mapping mode: apply the "fold" map (used for case-insensitive comparisons) */
    const FOLD_CASE = 3;

    /** Mapping mode: apply the "ascii" map */
    const ASCII_CONV = 4;

    /**
     * Code points of this string
     * @var int[]
     */
    private array $codes;

    /**
     * Lazily built list of UTF-8 encoded chars (one entry per code point)
     * @var string[]|null
     */
    private ?array $chars = null;

    /**
     * Number of code points
     */
    private int $length;

    /**
     * Lazily built UTF-8 representation
     */
    private ?string $str = null;

    /**
     * Per-instance cache: keys "i{mode}" hold bool flags, keys "m{mode}" hold mapped code points
     */
    private ?array $cache = null;

    /**
     * Loaded mapping tables, indexed by mode
     * @var int[][]
     */
    private static array $maps = [];

    /**
     * @param int[] $codes
     */
    private function __construct(array $codes = [])
    {
        $this->codes = $codes;
        $this->length = count($codes);
    }

    /**
     * Returns the code points of this string
     * @return int[]
     */
    public function codePoints(): array
    {
        return $this->codes;
    }

    /**
     * Returns the chars of this string (computed once, then cached)
     * @return string[]
     */
    public function chars(): array
    {
        if ($this->chars === null) {
            $this->chars = self::getCharsFromCodePoints($this->codes);
        }
        return $this->chars;
    }

    /**
     * Returns the number of code points
     * @return int
     */
    public function length(): int
    {
        return $this->length;
    }

    /**
     * Checks if the string has no code points
     * @return bool
     */
    public function isEmpty(): bool
    {
        return $this->length === 0;
    }

    /**
     * Checks if this string is equal to the given text
     * @param string|self|int[]|string[] $text
     * @param bool $ignoreCase
     * @return bool
     */
    public function equals($text, bool $ignoreCase = false): bool
    {
        return $this->compareTo($text, $ignoreCase) === 0;
    }

    /**
     * Compares this string with the given text.
     * Texts of different lengths are ordered by length; otherwise the
     * (optionally case-folded) code point arrays are compared.
     * @param string|self|int[]|string[] $text
     * @param bool $ignoreCase
     * @return int Negative, zero or positive
     */
    public function compareTo($text, bool $ignoreCase = false): int
    {
        $mode = $ignoreCase ? self::FOLD_CASE : self::KEEP_CASE;

        $text = self::resolveCodePoints($text, $mode);

        $length = count($text);

        if ($length !== $this->length) {
            return $this->length <=> $length;
        }

        return $this->getMappedCodes($mode) <=> $text;
    }

    /**
     * Checks if the given text occurs in this string
     * @param string|self|int[]|string[] $text
     * @param bool $ignoreCase
     * @return bool
     */
    public function contains($text, bool $ignoreCase = false): bool
    {
        return $this->indexOf($text, 0, $ignoreCase) !== -1;
    }

    /**
     * Checks if this string starts with the given text.
     * An empty text never matches.
     * @param string|self|int[]|string[] $text
     * @param bool $ignoreCase
     * @return bool
     */
    public function startsWith($text, bool $ignoreCase = false): bool
    {
        $mode = $ignoreCase ? self::FOLD_CASE : self::KEEP_CASE;

        $text = self::resolveCodePoints($text, $mode);

        $len = count($text);

        if ($len === 0 || $len > $this->length) {
            return false;
        }

        return array_slice($this->getMappedCodes($mode), 0, $len) === $text;
    }

    /**
     * Checks if this string ends with the given text.
     * An empty text never matches.
     * @param string|self|int[]|string[] $text
     * @param bool $ignoreCase
     * @return bool
     */
    public function endsWith($text, bool $ignoreCase = false): bool
    {
        $mode = $ignoreCase ? self::FOLD_CASE : self::KEEP_CASE;

        $text = self::resolveCodePoints($text, $mode);

        if (empty($text)) {
            return false;
        }

        $codes = $this->getMappedCodes($mode);

        $offset = $this->length - count($text);

        if ($offset < 0) {
            return false;
        }

        return array_slice($codes, $offset) === $text;
    }

    /**
     * Finds the first occurrence of the given text
     * @param string|self|int[]|string[] $text
     * @param int $offset Where to start searching; a negative value is counted from the end
     * @param bool $ignoreCase
     * @return int The index of the match, or -1 if not found
     */
    public function indexOf($text, int $offset = 0, bool $ignoreCase = false): int
    {
        if ($offset < 0) {
            $offset += $this->length;
        }

        if ($offset < 0 || $offset >= $this->length) {
            return -1;
        }

        $mode = $ignoreCase ? self::FOLD_CASE : self::KEEP_CASE;

        $text = self::resolveCodePoints($text, $mode);

        $len = count($text);

        if ($len === 0 || $offset + $len > $this->length) {
            return -1;
        }

        return $this->doIndexOf($this->getMappedCodes($mode), $text, $offset);
    }

    /**
     * Finds the last occurrence of the given text
     * @param string|self|int[]|string[] $text
     * @param int $offset If non-negative, matches starting before this index are ignored;
     *                    if negative, the search starts that many code points from the end
     * @param bool $ignoreCase
     * @return int The index of the match, or -1 if not found
     */
    public function lastIndexOf($text, int $offset = 0, bool $ignoreCase = false): int
    {
        if ($offset < 0) {
            $start = $this->length + $offset;
            if ($start < 0) {
                return -1;
            }
            $last = 0;
        } else {
            if ($offset >= $this->length) {
                return -1;
            }
            $start = $this->length - 1;
            $last = $offset;
        }

        $mode = $ignoreCase ? self::FOLD_CASE : self::KEEP_CASE;

        $text = self::resolveCodePoints($text, $mode);

        $len = count($text);

        if ($len === 0) {
            return -1;
        }

        if ($offset < 0) {
            if ($len > $this->length) {
                return -1;
            }
            $start = min($start, $this->length - $len);
        } elseif ($offset + $len > $this->length) {
            return -1;
        } else {
            // A match cannot begin after (length - len); starting there
            // avoids reading code points past the end of the array
            $start = $this->length - $len;
        }

        $codes = $this->getMappedCodes($mode);

        for ($i = $start; $i >= $last; $i--) {
            $match = true;

            for ($j = 0; $j < $len; $j++) {
                if ($codes[$i + $j] !== $text[$j]) {
                    $match = false;
                    break;
                }
            }

            if ($match) {
                return $i;
            }
        }

        return -1;
    }

    /**
     * Returns a string that is guaranteed to start with the given text.
     * Only the missing leading part of the text is prepended when the start of
     * this string already overlaps with the end of the text.
     * @param string|self|int[]|string[] $text
     * @param bool $ignoreCase
     * @param bool $allowPrefixOnly If true the result can contain only the prefix
     * @return static
     */
    public function ensurePrefix($text, bool $ignoreCase = false, bool $allowPrefixOnly = true): self
    {
        $text = self::resolveCodePoints($text);

        $len = count($text);

        if ($len === 0) {
            return clone $this;
        }

        if ($this->length === 0) {
            return new static($text);
        }

        if ($ignoreCase) {
            $prefix = self::getMappedCodePoints($text, self::FOLD_CASE);
        } else {
            $prefix = &$text;
        }

        if ($this->length === $len) {
            $part = $this->getMappedCodes($ignoreCase ? self::FOLD_CASE : self::KEEP_CASE);
            if ($allowPrefixOnly && $part === $prefix) {
                return clone $this;
            }
            // Remove last element to avoid double check
            array_pop($part);
        } elseif ($this->length < $len) {
            $part = $this->getMappedCodes($ignoreCase ? self::FOLD_CASE : self::KEEP_CASE);
            // Checks if this can be a suffix
            if ($allowPrefixOnly && (array_slice($prefix, 0, $this->length) === $part)) {
                $text = array_slice($text, $this->length);
                return new static(array_merge($this->codes, $text));
            }
        } else {
            $part = array_slice($this->codes, 0, $len);
            if ($ignoreCase) {
                $part = self::getMappedCodePoints($part, self::FOLD_CASE);
            }
            if ($part === $prefix) {
                return clone $this;
            }
            // Remove last element to avoid double check
            array_pop($part);
        }

        // Number of leading code points of $text that must be prepended
        $copy = $len;

        $part_len = count($part);

        // Look for the longest start of this string that is also an end of the prefix
        while ($part_len) {
            if ($part === array_slice($prefix, -$part_len)) {
                $copy = $len - $part_len;
                break;
            }
            array_pop($part);
            $part_len--;
        }

        if ($copy === 0) {
            return clone $this;
        }

        if ($copy < $len) {
            $text = array_slice($text, 0, $copy);
        }

        return new static(array_merge($text, $this->codes));
    }

    /**
     * Returns a string that is guaranteed to end with the given text.
     * Only the missing trailing part of the text is appended when the end of
     * this string already overlaps with the start of the text.
     * @param string|self|int[]|string[] $text
     * @param bool $ignoreCase
     * @param bool $allowSuffixOnly If true the result can contain only the suffix
     * @return static
     */
    public function ensureSuffix($text, bool $ignoreCase = false, bool $allowSuffixOnly = true): self
    {
        $text = self::resolveCodePoints($text);

        $len = count($text);

        if ($len === 0) {
            return clone $this;
        }

        if ($this->length === 0) {
            return new static($text);
        }

        if ($ignoreCase) {
            $suffix = self::getMappedCodePoints($text, self::FOLD_CASE);
        } else {
            $suffix = &$text;
        }

        if ($this->length === $len) {
            $part = $this->getMappedCodes($ignoreCase ? self::FOLD_CASE : self::KEEP_CASE);
            if ($allowSuffixOnly && $part === $suffix) {
                return clone $this;
            }
            // Remove first element to avoid double check
            array_shift($part);
        } elseif ($this->length < $len) {
            $part = $this->getMappedCodes($ignoreCase ? self::FOLD_CASE : self::KEEP_CASE);
            // Checks if this can be a prefix
            if ($allowSuffixOnly && (array_slice($suffix, -$this->length) === $part)) {
                $text = array_slice($text, 0, $len - $this->length);
                return new static(array_merge($text, $this->codes));
            }
        } else {
            $part = array_slice($this->codes, -$len);
            if ($ignoreCase) {
                $part = self::getMappedCodePoints($part, self::FOLD_CASE);
            }
            if ($part === $suffix) {
                return clone $this;
            }
            // Remove first element to avoid double check
            array_shift($part);
        }

        // Number of leading code points of $text that are already present at the end
        $skip = 0;

        $part_len = count($part);

        // Look for the longest end of this string that is also a start of the suffix
        while ($part_len) {
            if ($part === array_slice($suffix, 0, $part_len)) {
                $skip = $part_len;
                break;
            }
            array_shift($part);
            $part_len--;
        }

        if ($skip === $len) {
            return clone $this;
        }

        if ($skip) {
            array_splice($text, 0, $skip);
        }

        return new static(array_merge($this->codes, $text));
    }

    /**
     * Returns a new string with the given text added at the end
     * @param string|self|int[]|string[] $text
     * @param int $mode Mapping mode applied to the added text
     * @return static
     */
    public function append($text, int $mode = self::KEEP_CASE): self
    {
        return new static(array_merge($this->codes, self::resolveCodePoints($text, $mode)));
    }

    /**
     * Returns a new string with the given text added at the beginning
     * @param string|self|int[]|string[] $text
     * @param int $mode Mapping mode applied to the added text
     * @return static
     */
    public function prepend($text, int $mode = self::KEEP_CASE): self
    {
        return new static(array_merge(self::resolveCodePoints($text, $mode), $this->codes));
    }

    /**
     * Returns a new string with the given text inserted at the given offset
     * @param string|self|int[]|string[] $text
     * @param int $offset Passed to array_splice() as-is
     * @param int $mode Mapping mode applied to the inserted text
     * @return static
     */
    public function insert($text, int $offset, int $mode = self::KEEP_CASE): self
    {
        $codes = $this->codes;

        array_splice($codes, $offset, 0, self::resolveCodePoints($text, $mode));

        return new static($codes);
    }

    /**
     * Returns a new string with a range of code points removed
     * @param int $offset Passed to array_splice() as-is
     * @param int|null $length Number of code points to remove; null removes everything from offset
     * @return static
     */
    public function remove(int $offset, ?int $length = null): self
    {
        $codes = $this->codes;

        if ($length === null) {
            array_splice($codes, $offset);
        } else {
            array_splice($codes, $offset, $length);
        }

        return new static($codes);
    }

    /**
     * Removes the code points found in the mask from both ends
     * @param string|self|int[]|string[] $mask
     * @return static
     */
    public function trim($mask = " \t\n\r\0\x0B"): self
    {
        return $this->doTrim($mask, true, true);
    }

    /**
     * Removes the code points found in the mask from the beginning
     * @param string|self|int[]|string[] $mask
     * @return static
     */
    public function trimLeft($mask = " \t\n\r\0\x0B"): self
    {
        return $this->doTrim($mask, true, false);
    }

    /**
     * Removes the code points found in the mask from the end
     * @param string|self|int[]|string[] $mask
     * @return static
     */
    public function trimRight($mask = " \t\n\r\0\x0B"): self
    {
        return $this->doTrim($mask, false, true);
    }

    /**
     * Returns a new string with the code points in reverse order
     * @return static
     */
    public function reverse(): self
    {
        return new static(array_reverse($this->codes));
    }

    /**
     * Returns this string repeated the given number of times
     * @param int $times Values lower than or equal to 1 yield a copy of this string
     * @return static
     */
    public function repeat(int $times = 1): self
    {
        if ($times <= 1) {
            return clone $this;
        }

        $codes = [];

        while ($times--) {
            $codes = array_merge($codes, $this->codes);
        }

        return new static($codes);
    }

    /**
     * Replaces the first occurrence of the subject found at or after the offset
     * @param string|self|int[]|string[] $subject
     * @param string|self|int[]|string[] $replace
     * @param int $offset A negative value is counted from the end
     * @param bool $ignoreCase
     * @return static
     */
    public function replace($subject, $replace, int $offset = 0, bool $ignoreCase = false): self
    {
        if ($offset < 0) {
            $offset += $this->length;
        }

        if ($offset < 0 || $offset >= $this->length) {
            return clone $this;
        }

        $mode = $ignoreCase ? self::FOLD_CASE : self::KEEP_CASE;

        $subject = self::resolveCodePoints($subject, $mode);

        $len = count($subject);

        if ($len === 0 || $offset + $len > $this->length) {
            return clone $this;
        }

        $offset = $this->doIndexOf($this->getMappedCodes($mode), $subject, $offset);

        if ($offset === -1) {
            return clone $this;
        }

        $codes = $this->codes;

        array_splice($codes, $offset, count($subject), self::resolveCodePoints($replace));

        return new static($codes);
    }

    /**
     * Replaces every occurrence of the subject found at or after the offset
     * @param string|self|int[]|string[] $subject
     * @param string|self|int[]|string[] $replace
     * @param int $offset A negative value is counted from the end
     * @param bool $ignoreCase
     * @return static
     */
    public function replaceAll($subject, $replace, int $offset = 0, bool $ignoreCase = false): self
    {
        if ($offset < 0) {
            $offset += $this->length;
        }

        if ($offset < 0 || $offset >= $this->length) {
            return clone $this;
        }

        $mode = $ignoreCase ? self::FOLD_CASE : self::KEEP_CASE;

        $subject = self::resolveCodePoints($subject, $mode);

        $len = count($subject);

        if ($len === 0 || $offset + $len > $this->length) {
            return clone $this;
        }

        $replace = self::resolveCodePoints($replace);

        // Searching is done on the mapped codes, replacing on a copy of the real ones
        $codes = $this->getMappedCodes($mode);

        $copy = $this->codes;

        // Each replacement shifts later positions in $copy by this amount
        $fix = count($replace) - $len;

        $t = 0;

        while (($pos = $this->doIndexOf($codes, $subject, $offset)) >= 0) {
            array_splice($copy, $pos + $t * $fix, $len, $replace);
            $offset = $pos + $len;
            $t++;
        }

        return new static($copy);
    }

    /**
     * Splits the string by the given delimiter.
     * An empty delimiter produces one string per code point.
     * @param string|self|int[]|string[] $delimiter
     * @param bool $ignoreCase
     * @return array List of instances
     */
    public function split($delimiter = '', bool $ignoreCase = false): array
    {
        $mode = $ignoreCase ? self::FOLD_CASE : self::KEEP_CASE;

        $delimiter = self::resolveCodePoints($delimiter, $mode);

        $len = count($delimiter);

        $ret = [];

        if ($len === 0) {
            foreach ($this->codes as $code) {
                $ret[] = new static([$code]);
            }
        } else {
            $codes = $this->getMappedCodes($mode);

            $offset = 0;

            while (($pos = $this->doIndexOf($codes, $delimiter, $offset)) >= 0) {
                $ret[] = new static(array_slice($this->codes, $offset, $pos - $offset));
                $offset = $pos + $len;
            }

            $ret[] = new static(array_slice($this->codes, $offset));
        }

        return $ret;
    }

    /**
     * Returns a part of this string; arguments are passed to array_slice() as-is
     * @param int $start
     * @param int|null $length
     * @return static
     */
    public function substring(int $start, ?int $length = null): self
    {
        return new static(array_slice($this->codes, $start, $length));
    }

    /**
     * Pads the string to the given size
     * @param int $size If negative then pad left otherwise pad right
     * @param self|string|int $char A char or a code point (space is used if it cannot be resolved)
     * @return static
     */
    public function pad(int $size, $char = 0x20): self
    {
        return new static(array_pad($this->codes, $size, self::resolveFirstCodePoint($char, 0x20)));
    }

    /**
     * Pads the string on the left, regardless of the sign of the size
     * @param int $size
     * @param self|string|int $char
     * @return static
     */
    public function padLeft(int $size, $char = 0x20): self
    {
        if ($size > 0) {
            $size = -$size;
        }

        return $this->pad($size, $char);
    }

    /**
     * Pads the string on the right, regardless of the sign of the size
     * @param int $size
     * @param self|string|int $char
     * @return static
     */
    public function padRight(int $size, $char = 0x20): self
    {
        if ($size < 0) {
            $size = -$size;
        }

        return $this->pad($size, $char);
    }

    /**
     * Checks that no code point has an entry in the lower-case map
     * @return bool
     */
    public function isLowerCase(): bool
    {
        return $this->isCase(self::LOWER_CASE);
    }

    /**
     * Checks that no code point has an entry in the upper-case map
     * @return bool
     */
    public function isUpperCase(): bool
    {
        return $this->isCase(self::UPPER_CASE);
    }

    /**
     * Checks that every code point is lower than 0x80 (result is cached)
     * @return bool
     */
    public function isAscii(): bool
    {
        $key = 'i' . self::ASCII_CONV;

        if (!isset($this->cache[$key])) {
            $ok = true;

            foreach ($this->codes as $code) {
                if ($code >= 0x80) {
                    $ok = false;
                    break;
                }
            }

            $this->cache[$key] = $ok;
        }

        return $this->cache[$key];
    }

    /**
     * Convert all chars to lower case (where possible)
     * @return static
     */
    public function toLower(): self
    {
        // Already known to be lower case
        if ($this->cache['i' . self::LOWER_CASE] ?? false) {
            return clone $this;
        }
        return new static($this->getMappedCodes(self::LOWER_CASE));
    }

    /**
     * Convert all chars to upper case (where possible)
     * @return static
     */
    public function toUpper(): self
    {
        // Already known to be upper case
        if ($this->cache['i' . self::UPPER_CASE] ?? false) {
            return clone $this;
        }
        return new static($this->getMappedCodes(self::UPPER_CASE));
    }

    /**
     * Converts all chars to their ASCII equivalent (if any)
     * @return static
     */
    public function toAscii(): self
    {
        // Already known to be ASCII
        if ($this->cache['i' . self::ASCII_CONV] ?? false) {
            return clone $this;
        }
        return new static($this->getMappedCodes(self::ASCII_CONV));
    }

    /**
     * Returns the char at the given index
     * @param int $index A negative value is counted from the end
     * @return string The char, or an empty string if the index is out of range
     */
    public function charAt(int $index): string
    {
        // Allow negative index
        if ($index < 0 && $index + $this->length >= 0) {
            $index += $this->length;
        }

        if ($index < 0 || $index >= $this->length) {
            return '';
        }

        return $this->chars()[$index];
    }

    /**
     * Returns the code point at the given index
     * @param int $index A negative value is counted from the end
     * @return int The code point, or -1 if the index is out of range
     */
    public function codePointAt(int $index): int
    {
        // Allow negative index
        if ($index < 0 && $index + $this->length >= 0) {
            $index += $this->length;
        }

        if ($index < 0 || $index >= $this->length) {
            return -1;
        }

        return $this->codes[$index];
    }

    /**
     * Returns the code point at the given offset
     * @param int $offset A negative value is counted from the end
     * @return int
     * @throws OutOfBoundsException If the offset is out of range
     */
    public function __invoke(int $offset): int
    {
        if ($offset < 0) {
            if ($offset + $this->length < 0) {
                throw new OutOfBoundsException("Undefined offset: {$offset}");
            }
            $offset += $this->length;
        } elseif ($offset >= $this->length) {
            throw new OutOfBoundsException("Undefined offset: {$offset}");
        }

        return $this->codes[$offset];
    }

    /**
     * Checks if a code point exists at the given offset
     * @inheritDoc
     */
    public function offsetExists($offset): bool
    {
        // Allow negative index
        if ($offset < 0) {
            $offset += $this->length;
        }

        return isset($this->codes[$offset]);
    }

    /**
     * Returns the char at the given offset
     * @inheritDoc
     * @throws OutOfBoundsException If the offset is out of range
     */
    public function offsetGet($offset): string
    {
        if ($offset < 0) {
            if ($offset + $this->length < 0) {
                throw new OutOfBoundsException("Undefined offset: {$offset}");
            }
            $offset += $this->length;
        } elseif ($offset >= $this->length) {
            throw new OutOfBoundsException("Undefined offset: {$offset}");
        }

        return $this->chars()[$offset];
    }

    /**
     * Replaces the code point at an existing offset with the first code point
     * of the given value. Unknown offsets and unresolvable values are ignored.
     * @inheritDoc
     */
    #[\ReturnTypeWillChange]
    public function offsetSet($offset, $value)
    {
        // Allow negative index
        if ($offset < 0) {
            $offset += $this->length;
        }

        if (!isset($this->codes[$offset])) {
            return;
        }

        $value = self::resolveFirstCodePoint($value);
        if ($value === -1) {
            return;
        }

        if ($value === $this->codes[$offset]) {
            // Same value, nothing to do
            return;
        }

        $this->codes[$offset] = $value;

        // Clear cache
        $this->str = null;
        $this->cache = null;
        if ($this->chars) {
            $this->chars[$offset] = self::getCharFromCodePoint($value);
        }
    }

    /**
     * Not supported, always throws
     * @inheritDoc
     * @throws RuntimeException
     */
    #[\ReturnTypeWillChange]
    public function offsetUnset($offset)
    {
        throw new RuntimeException("Invalid operation");
    }

    /**
     * Returns the number of code points
     * @inheritDoc
     */
    public function count(): int
    {
        return $this->length;
    }

    /**
     * Returns the UTF-8 representation (computed once, then cached)
     * @return string
     */
    public function __toString(): string
    {
        if ($this->str === null) {
            $this->str = self::getStringFromCodePoints($this->codes);
        }

        return $this->str;
    }

    /**
     * Serializes the instance as its string representation
     * @inheritDoc
     */
    public function jsonSerialize(): string
    {
        return $this->__toString();
    }

    /**
     * @return array The string representation stored under the "value" key
     */
    public function __serialize(): array
    {
        return [
            'value' => $this->__toString(),
        ];
    }

    /**
     * Restores the instance from the string stored under the "value" key
     * @param array $data
     */
    public function __unserialize(array $data): void
    {
        $this->str = $data['value'];
        $this->codes = self::getCodePointsFromString($this->str);
        $this->length = count($this->codes);
    }

    /**
     * Creates an unicode string instance from raw string
     * @param string $string
     * @param string|null $encoding Defaults to UTF-8
     * @param int $mode
     * @return static
     * @throws UnicodeException If the string cannot be converted to UTF-8
     * @throws InvalidStringException
     */
    public static function from(string $string, ?string $encoding = null, int $mode = self::KEEP_CASE): self
    {
        if ($encoding !== null && strcasecmp($encoding, 'UTF-8') !== 0) {
            if (false === $string = @iconv($encoding, 'UTF-8', $string)) {
                throw new UnicodeException("Could not convert string from '$encoding' encoding to UTF-8 encoding");
            }
        }

        $instance = new static(self::getCodePointsFromString($string, $mode));
        if ($mode === self::KEEP_CASE) {
            // No mapping was applied, so the input can be reused as the cached string
            $instance->str = $string;
        }
        return $instance;
    }

    /**
     * Creates an unicode string instance from code points
     * @param int[] $codes
     * @param int $mode
     * @return static
     * @throws InvalidCodePointException If an element is not an integer or not a valid code point
     */
    public static function fromCodePoints(array $codes, int $mode = self::KEEP_CASE): self
    {
        $map = self::getMapByMode($mode);

        foreach ($codes as &$code) {
            // Validate each element (not the array itself) before mapping it
            if (!is_int($code) || !self::isValidCodePoint($code)) {
                throw new InvalidCodePointException($code);
            } else {
                $code = $map[$code] ?? $code;
            }
        }
        unset($code);

        return new static(array_values($codes));
    }

    /**
     * Converts the code point to corresponding char
     * @param int $code
     * @return string The char or an empty string if code point is invalid
     */
    public static function getCharFromCodePoint(int $code): string
    {
        if ($code < 0) {
            return '';
        }

        if ($code < 0x80) {
            return chr($code);
        }

        if ($code < 0x800) {
            return chr(($code >> 6) + 0xC0) . chr(($code & 0x3F) + 0x80);
        }

        if ($code >= 0xD800 && $code <= 0xDFFF) {
            /*
             The definition of UTF-8 prohibits encoding character numbers between
             U+D800 and U+DFFF, which are reserved for use with the UTF-16
             encoding form (as surrogate pairs) and do not directly represent
             characters.
             */
            return '';
        }

        if ($code <= 0xFFFF) {
            return
                chr(($code >> 12) + 0xE0) .
                chr((($code >> 6) & 0x3F) + 0x80) .
                chr(($code & 0x3F) + 0x80);
        }

        if ($code <= 0x10FFFF) {
            return
                chr(($code >> 18) + 0xF0) .
                chr((($code >> 12) & 0x3F) + 0x80) .
                chr((($code >> 6) & 0x3F) + 0x80) .
                chr(($code & 0x3F) + 0x80);
        }

        /*
         Restricted the range of characters to 0000-10FFFF (the UTF-16
         accessible range).
         */
        return '';
    }

    /**
     * Convert a string to a code point array
     * @param string $str
     * @param int $mode
     * @return array
     * @throws InvalidStringException
     */
    public static function getCodePointsFromString(string $str, int $mode = self::KEEP_CASE): array
    {
        // Accepted byte sequences:
        // 0x00-0x7F
        // 0xC2-0xDF 0x80-0xBF
        // 0xE0-0xE0 0xA0-0xBF 0x80-0xBF
        // 0xE1-0xEC 0x80-0xBF 0x80-0xBF
        // 0xED-0xED 0x80-0x9F 0x80-0xBF
        // 0xEE-0xEF 0x80-0xBF 0x80-0xBF
        // 0xF0-0xF0 0x90-0xBF 0x80-0xBF 0x80-0xBF
        // 0xF1-0xF3 0x80-0xBF 0x80-0xBF 0x80-0xBF
        // 0xF4-0xF4 0x80-0x8F 0x80-0xBF 0x80-0xBF

        $codes = [];
        $length = strlen($str);
        // From here on $mode holds the mapping table, not the mode constant
        $mode = self::getMapByMode($mode);

        $i = 0;
        while ($i < $length) {
            $ord0 = ord($str[$i++]);

            // One byte sequence
            if ($ord0 < 0x80) {
                $codes[] = $mode[$ord0] ?? $ord0;
                continue;
            }

            if ($i === $length || $ord0 < 0xC2 || $ord0 > 0xF4) {
                throw new InvalidStringException($str, $i - 1);
            }

            $ord1 = ord($str[$i++]);

            // Two bytes sequence
            if ($ord0 < 0xE0) {
                if ($ord1 < 0x80 || $ord1 >= 0xC0) {
                    throw new InvalidStringException($str, $i - 1);
                }

                $ord1 = ($ord0 - 0xC0) * 64 + $ord1 - 0x80;
                $codes[] = $mode[$ord1] ?? $ord1;

                continue;
            }

            if ($i === $length) {
                throw new InvalidStringException($str, $i - 1);
            }

            $ord2 = ord($str[$i++]);

            // Three bytes sequence
            if ($ord0 < 0xF0) {
                if ($ord0 === 0xE0) {
                    if ($ord1 < 0xA0 || $ord1 >= 0xC0) {
                        throw new InvalidStringException($str, $i - 2);
                    }
                } elseif ($ord0 === 0xED) {
                    if ($ord1 < 0x80 || $ord1 >= 0xA0) {
                        throw new InvalidStringException($str, $i - 2);
                    }
                } elseif ($ord1 < 0x80 || $ord1 >= 0xC0) {
                    throw new InvalidStringException($str, $i - 2);
                }

                if ($ord2 < 0x80 || $ord2 >= 0xC0) {
                    throw new InvalidStringException($str, $i - 1);
                }

                $ord2 = ($ord0 - 0xE0) * 0x1000 + ($ord1 - 0x80) * 64 + $ord2 - 0x80;
                $codes[] = $mode[$ord2] ?? $ord2;

                continue;
            }

            if ($i === $length) {
                throw new InvalidStringException($str, $i - 1);
            }

            $ord3 = ord($str[$i++]);

            // Four bytes sequence
            if ($ord0 < 0xF5) {
                if ($ord0 === 0xF0) {
                    if ($ord1 < 0x90 || $ord1 >= 0xC0) {
                        throw new InvalidStringException($str, $i - 3);
                    }
                } elseif ($ord0 === 0xF4) {
                    if ($ord1 < 0x80 || $ord1 >= 0x90) {
                        throw new InvalidStringException($str, $i - 3);
                    }
                } elseif ($ord1 < 0x80 || $ord1 >= 0xC0) {
                    throw new InvalidStringException($str, $i - 3);
                }

                if ($ord2 < 0x80 || $ord2 >= 0xC0) {
                    throw new InvalidStringException($str, $i - 2);
                }

                if ($ord3 < 0x80 || $ord3 >= 0xC0) {
                    throw new InvalidStringException($str, $i - 1);
                }

                $ord3 = ($ord0 - 0xF0) * 0x40000 + ($ord1 - 0x80) * 0x1000 + ($ord2 - 0x80) * 64 + $ord3 - 0x80;
                $codes[] = $mode[$ord3] ?? $ord3;

                continue;
            }

            throw new InvalidStringException($str, $i - 1);
        }

        return $codes;
    }

    /**
     * Lazily decodes a string, one code point at a time
     *
     * @param string $str
     * @return iterable
     *
     * The key represents the current char index
     * Value is a two element array
     *  - first element is an integer representing the code point
     *  - second element is an array of integers (length 1 to 4) representing bytes
     *
     * @throws InvalidStringException
     */
    public static function walkString(string $str): iterable
    {
        $i = 0;
        $length = strlen($str);

        while ($i < $length) {
            $index = $i;

            $ord0 = ord($str[$i++]);

            // One byte sequence
            if ($ord0 < 0x80) {
                yield $index => [
                    $ord0,
                    [$ord0]
                ];
                continue;
            }

            if ($i === $length || $ord0 < 0xC2 || $ord0 > 0xF4) {
                throw new InvalidStringException($str, $i - 1);
            }

            $ord1 = ord($str[$i++]);

            // Two bytes sequence
            if ($ord0 < 0xE0) {
                if ($ord1 < 0x80 || $ord1 >= 0xC0) {
                    throw new InvalidStringException($str, $i - 1);
                }

                yield $index => [
                    ($ord0 - 0xC0) * 64 + $ord1 - 0x80,
                    [$ord0, $ord1]
                ];

                continue;
            }

            if ($i === $length) {
                throw new InvalidStringException($str, $i - 1);
            }

            $ord2 = ord($str[$i++]);

            // Three bytes sequence
            if ($ord0 < 0xF0) {
                if ($ord0 === 0xE0) {
                    if ($ord1 < 0xA0 || $ord1 >= 0xC0) {
                        throw new InvalidStringException($str, $i - 2);
                    }
                } elseif ($ord0 === 0xED) {
                    if ($ord1 < 0x80 || $ord1 >= 0xA0) {
                        throw new InvalidStringException($str, $i - 2);
                    }
                } elseif ($ord1 < 0x80 || $ord1 >= 0xC0) {
                    throw new InvalidStringException($str, $i - 2);
                }

                if ($ord2 < 0x80 || $ord2 >= 0xC0) {
                    throw new InvalidStringException($str, $i - 1);
                }

                yield $index => [
                    ($ord0 - 0xE0) * 0x1000 + ($ord1 - 0x80) * 64 + $ord2 - 0x80,
                    [$ord0, $ord1, $ord2]
                ];

                continue;
            }

            if ($i === $length) {
                throw new InvalidStringException($str, $i - 1);
            }

            $ord3 = ord($str[$i++]);

            // Four bytes sequence
            if ($ord0 < 0xF5) {
                if ($ord0 === 0xF0) {
                    if ($ord1 < 0x90 || $ord1 >= 0xC0) {
                        throw new InvalidStringException($str, $i - 3);
                    }
                } elseif ($ord0 === 0xF4) {
                    if ($ord1 < 0x80 || $ord1 >= 0x90) {
                        throw new InvalidStringException($str, $i - 3);
                    }
                } elseif ($ord1 < 0x80 || $ord1 >= 0xC0) {
                    throw new InvalidStringException($str, $i - 3);
                }

                if ($ord2 < 0x80 || $ord2 >= 0xC0) {
                    throw new InvalidStringException($str, $i - 2);
                }

                if ($ord3 < 0x80 || $ord3 >= 0xC0) {
                    throw new InvalidStringException($str, $i - 1);
                }

                yield $index => [
                    ($ord0 - 0xF0) * 0x40000 + ($ord1 - 0x80) * 0x1000 + ($ord2 - 0x80) * 64 + $ord3 - 0x80,
                    [$ord0, $ord1, $ord2, $ord3]
                ];

                continue;
            }

            throw new InvalidStringException($str, $i - 1);
        }
    }

    /**
     * Converts each code point to a char
     * @param array $codes
     * @param int $mode
     * @return array
     * @throws InvalidCodePointException
     */
    public static function getCharsFromCodePoints(array $codes, int $mode = self::KEEP_CASE): array
    {
        // From here on $mode holds the mapping table, not the mode constant
        $mode = self::getMapByMode($mode);

        foreach ($codes as &$code) {
            $char = self::getCharFromCodePoint($mode[$code] ?? $code);
            if ($char === '') {
                throw new InvalidCodePointException($code);
            } else {
                $code = $char;
            }
        }
        unset($code);

        return $codes;
    }

    /**
     * Converts a string to a list of chars
     * @param string $str
     * @param int $mode
     * @return string[]
     */
    public static function getCharsFromString(string $str, int $mode = self::KEEP_CASE): array
    {
        return self::getCharsFromCodePoints(self::getCodePointsFromString($str), $mode);
    }

    /**
     * Converts all code points to chars and returns the string
     * Invalid code points are ignored
     * @param array $codes
     * @param int $mode
     * @return string
     */
    public static function getStringFromCodePoints(array $codes, int $mode = self::KEEP_CASE): string
    {
        $str = '';

        // From here on $mode holds the mapping table, not the mode constant
        $mode = self::getMapByMode($mode);

        foreach ($codes as $code) {
            if (isset($mode[$code])) {
                $code = $mode[$code];
            }

            if ($code < 0x80) {
                $str .= chr($code);
                continue;
            }

            if ($code < 0x800) {
                $str .= chr(($code >> 6) + 0xC0) . chr(($code & 0x3F) + 0x80);
                continue;
            }

            // Surrogates are skipped
            if ($code >= 0xD800 && $code <= 0xDFFF) {
                continue;
            }

            if ($code <= 0xFFFF) {
                $str .=
                    chr(($code >> 12) + 0xE0) .
                    chr((($code >> 6) & 0x3F) + 0x80) .
                    chr(($code & 0x3F) + 0x80);
                continue;
            }

            if ($code <= 0x10FFFF) {
                $str .=
                    chr(($code >> 18) + 0xF0) .
                    chr((($code >> 12) & 0x3F) + 0x80) .
                    chr((($code >> 6) & 0x3F) + 0x80) .
                    chr(($code & 0x3F) + 0x80);
            }
        }

        return $str;
    }

    /**
     * Applies the mapping table of the given mode to every code point
     * @param array $codes
     * @param int $mode
     * @return array
     */
    public static function getMappedCodePoints(array $codes, int $mode): array
    {
        if ($mode === self::KEEP_CASE) {
            return $codes;
        }

        // From here on $mode holds the mapping table, not the mode constant
        $mode = self::getMapByMode($mode);

        if (empty($mode)) {
            return $codes;
        }

        foreach ($codes as &$code) {
            $code = $mode[$code] ?? $code;
        }
        unset($code);

        return $codes;
    }

    /**
     * Checks if a code point is valid
     * @param int $code
     * @return bool
     */
    public static function isValidCodePoint(int $code): bool
    {
        if ($code < 0 || $code > 0x10FFFF) {
            return false;
        }

        // Surrogates are not valid code points
        return $code < 0xD800 || $code > 0xDFFF;
    }

    /**
     * Returns the code points of this string mapped by the given mode (cached per mode)
     * @param int $mode
     * @return int[]
     */
    private function getMappedCodes(int $mode): array
    {
        // Nothing to map, or the string is already known to be in the requested form
        if ($mode === self::KEEP_CASE || ($this->cache['i' . $mode] ?? false)) {
            return $this->codes;
        }

        $key = 'm' . $mode;

        if (!isset($this->cache[$key])) {
            $this->cache[$key] = self::getMappedCodePoints($this->codes, $mode);
        }

        return $this->cache[$key];
    }

    /**
     * Checks that no code point has an entry in the map of the given mode (result is cached)
     * @param int $mode
     * @return bool
     */
    private function isCase(int $mode): bool
    {
        $key = 'i' . $mode;

        if (!isset($this->cache[$key])) {
            $list = self::getMapByMode($mode);
            foreach ($this->codes as $code) {
                if (isset($list[$code])) {
                    return $this->cache[$key] = false;
                }
            }

            return $this->cache[$key] = true;
        }

        return $this->cache[$key];
    }

    /**
     * Finds the first position, starting at offset, where $text occurs in $codes
     * @param int[] $codes
     * @param int[] $text
     * @param int $offset
     * @return int The position, or -1 if not found
     */
    private function doIndexOf(array $codes, array $text, int $offset = 0): int
    {
        $len = count($text);

        for ($i = $offset, $last = count($codes) - $len; $i <= $last; $i++) {
            $match = true;

            for ($j = 0; $j < $len; $j++) {
                if ($codes[$i + $j] !== $text[$j]) {
                    $match = false;
                    break;
                }
            }

            if ($match) {
                return $i;
            }
        }

        return -1;
    }

    /**
     * Removes the code points found in the mask from the selected ends
     * @param string|self|int[]|string[] $mask
     * @param bool $left
     * @param bool $right
     * @return static
     */
    private function doTrim($mask, bool $left, bool $right): self
    {
        if ($this->length === 0) {
            return clone $this;
        }

        $mask = self::resolveCodePoints($mask);

        if (empty($mask)) {
            return clone $this;
        }

        $codes = $this->codes;

        if ($left) {
            while (in_array($codes[0], $mask, true)) {
                array_shift($codes);
                if (empty($codes)) {
                    return new static();
                }
            }
        }

        if ($right) {
            $last = count($codes) - 1;
            while (in_array($codes[$last], $mask, true)) {
                array_pop($codes);
                if (--$last < 0) {
                    return new static();
                }
            }
        }

        return new static($codes);
    }

    /**
     * Converts the supported text representations to a code point array.
     * Anything that is not an instance, a string or an array whose first
     * element (index 0) is an integer resolves to an empty array.
     * @param string|self|int[]|string[] $text
     * @param int $mode
     * @return array
     */
    private static function resolveCodePoints($text, int $mode = self::KEEP_CASE): array
    {
        if ($text instanceof self) {
            return $text->getMappedCodes($mode);
        }

        if (is_string($text)) {
            return self::getCodePointsFromString($text, $mode);
        }

        if ($text && is_array($text) && is_int($text[0])) {
            // assume code point array
            return self::getMappedCodePoints($text, $mode);
        }

        return [];
    }

    /**
     * Resolves the first code point of the given value
     * @param self|string|int|string[]|int[] $text
     * @param int $invalid Value returned when no code point can be resolved
     * @return int
     */
    private static function resolveFirstCodePoint($text, int $invalid = -1): int
    {
        if ($text instanceof self) {
            return $text->length === 0 ? $invalid : $text->codes[0];
        }

        if (is_array($text)) {
            if (empty($text)) {
                return $invalid;
            }
            $text = reset($text);
        }

        if (is_string($text)) {
            // Keep at most 4 bytes, the maximum length of a sequence
            if (isset($text[4])) {
                $text = substr($text, 0, 4);
            }
            return self::getCodePointsFromString($text)[0] ?? $invalid;
        }

        if (is_int($text)) {
            return self::isValidCodePoint($text) ? $text : $invalid;
        }

        return $invalid;
    }

    /**
     * Returns the mapping table of the given mode.
     * Tables are loaded from the res directory on first use and kept in memory;
     * unknown modes (including KEEP_CASE) resolve to an empty table.
     * @param int $mode
     * @return int[]
     */
    private static function getMapByMode(int $mode): array
    {
        if (isset(self::$maps[$mode])) {
            return self::$maps[$mode];
        }

        switch ($mode) {
            case self::LOWER_CASE:
                $file = 'lower';
                break;
            case self::UPPER_CASE:
                $file = 'upper';
                break;
            case self::ASCII_CONV:
                $file = 'ascii';
                break;
            case self::FOLD_CASE:
                $file = 'fold';
                break;
            default:
                return [];
        }

        /** @noinspection PhpIncludeInspection */
        return self::$maps[$mode] = include(__DIR__ . "/../res/{$file}.php");
    }
}