File "utf8.php"
Full Path: /home/sasslijg/public_html/wp-content/plugins/wordpress-importer/parsers/xml-processor/utf8.php
File size: 12.76 KB
MIME-type: text/x-php
Charset: utf-8
<?php
namespace WordPress\Encoding;
/*
* UTF-8 decoding pipeline by Dennis Snell (@dmsnell), originally
* proposed in https://github.com/WordPress/wordpress-develop/pull/6883.
*
* It enables parsing XML documents with incomplete UTF-8 byte sequences
* without crashing or depending on the mbstring extension.
*/
if ( ! defined( 'UTF8_DECODER_ACCEPT' ) ) {
define( 'UTF8_DECODER_ACCEPT', 0 );
}
if ( ! defined( 'UTF8_DECODER_REJECT' ) ) {
define( 'UTF8_DECODER_REJECT', 1 );
}
/**
* Indicates if a given byte stream represents valid UTF-8.
*
* Note that unpaired surrogate halves are not valid UTF-8 and will be rejected.
*
* Example:
*
* true === utf8_is_valid_byte_stream( 'Hello, World! 🌎' );
*
* false === utf8_is_valid_byte_stream( "Latin1 is n\xF6t valid UTF-8.", 0, $error_at );
* 12 === $error_at;
*
* false === utf8_is_valid_byte_stream( "Surrogate halves like '\xDE\xFF\x80' are not permitted.", 0, $error_at );
* 23 === $error_at;
*
* false === utf8_is_valid_byte_stream( "Broken stream: \xC2\xC2", 0, $error_at );
* 15 === $error_at;
*
* @param string $bytes Text to validate as UTF-8 bytes.
* @param int $starting_byte Byte offset in string where decoding should begin.
* @param int|null $first_error_byte_at Optional. If provided and byte stream fails to validate,
* will be set to the byte offset where the first invalid
* byte appeared. Otherwise, will not be set.
*
* @return bool Whether the given byte stream represents valid UTF-8.
* @since {WP_VERSION}
*
*/
function utf8_is_valid_byte_stream( $bytes, $starting_byte = 0, &$first_error_byte_at = null ) {
$state = UTF8_DECODER_ACCEPT;
$last_start_at = $starting_byte;
for ( $at = $starting_byte, $end = strlen( $bytes ); $at < $end && UTF8_DECODER_REJECT !== $state; $at++ ) {
if ( UTF8_DECODER_ACCEPT === $state ) {
$last_start_at = $at;
}
$state = utf8_decoder_apply_byte( $bytes[ $at ], $state );
}
if ( UTF8_DECODER_ACCEPT === $state ) {
return true;
} else {
$first_error_byte_at = $last_start_at;
return false;
}
}
/**
* Returns number of code points found within a UTF-8 string, similar to `strlen()`.
*
* If the byte stream fails to properly decode as UTF-8 this function will set the
* byte index of the first error byte and report the number of decoded code points.
*
* @param string $bytes Text for which to count code points.
* @param int|null $first_error_byte_at Optional. If provided, will be set upon finding
* the first invalid byte.
*
* @return int How many code points were decoded in the given byte stream before an error
* or before reaching the end of the string.
* @since {WP_VERSION}
*
*/
function utf8_codepoint_count( $bytes, &$first_error_byte_at = null ) {
$state = UTF8_DECODER_ACCEPT;
$last_start_at = 0;
$count = 0;
$codepoint = 0;
for ( $at = 0, $end = strlen( $bytes ); $at < $end && UTF8_DECODER_REJECT !== $state; $at++ ) {
if ( UTF8_DECODER_ACCEPT === $state ) {
$last_start_at = $at;
}
$state = utf8_decoder_apply_byte( $bytes[ $at ], $state, $codepoint );
if ( UTF8_DECODER_ACCEPT === $state ) {
++$count;
}
}
if ( UTF8_DECODER_ACCEPT !== $state ) {
$first_error_byte_at = $last_start_at;
}
return $count;
}
/**
* Inner loop for a number of UTF-8 decoding-related functions.
*
* You probably don't need this! This is highly-specific and optimized
* code for UTF-8 operations used in other functions.
*
* @see http://bjoern.hoehrmann.de/utf-8/decoder/dfa/
*
* @since {WP_VERSION}
*
* @access private
*
* @param string $byte Next byte to be applied in UTF-8 decoding or validation.
* @param int $state UTF-8 decoding state, one of the following values:<br><ul>
* <li>`UTF8_DECODER_ACCEPT`: Decoder is ready for a new code point.<br>
* <li>`UTF8_DECODER_REJECT`: An error has occurred.<br>
* Any other positive value: Decoder is waiting for additional bytes.
* @param int|null $codepoint Optional. If provided, will accumulate the decoded code point as
* each byte is processed. If not provided or unable to decode, will
* not be set, or will be set to invalid and unusable data.
*
* @return int Next decoder state after processing the current byte.
*/
function utf8_decoder_apply_byte( $byte, $state, &$codepoint = 0 ) {
/**
* State classification and transition table for UTF-8 validation.
*
* > The first part of the table maps bytes to character classes that
* > to reduce the size of the transition table and create bitmasks.
* >
* > The second part is a transition table that maps a combination
* > of a state of the automaton and a character class to a state.
*
* @see http://bjoern.hoehrmann.de/utf-8/decoder/dfa/
*/
static $state_table = (
"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" .
"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" .
"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" .
"\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00\x00" .
"\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x09\x09\x09\x09\x09\x09\x09\x09\x09\x09\x09\x09\x09\x09\x09\x09" .
"\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07\x07" .
"\x08\x08\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02\x02" .
"\x10\x03\x03\x03\x03\x03\x03\x03\x03\x03\x03\x03\x03\x04\x03\x03" .
"\x11\x06\x06\x06\x05\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08\x08" .
"\x00\x01\x02\x03\x05\x08\x07\x01\x01\x01\x04\x06\x01\x01\x01\x01" .
"\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x00\x01\x01\x01\x01\x01\x00\x01\x00\x01\x01\x01\x01\x01\x01" .
"\x01\x02\x01\x01\x01\x01\x01\x02\x01\x02\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x02\x01\x01\x01\x01\x01\x01\x01\x01" .
"\x01\x02\x01\x01\x01\x01\x01\x01\x01\x02\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x03\x01\x03\x01\x01\x01\x01\x01\x01" .
"\x01\x03\x01\x01\x01\x01\x01\x03\x01\x03\x01\x01\x01\x01\x01\x01\x01\x03\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01\x01"
);
$byte = ord( $byte );
$type = ord( $state_table[ $byte ] );
$codepoint = ( UTF8_DECODER_ACCEPT === $state )
? ( ( 0xFF >> $type ) & $byte )
: ( ( $byte & 0x3F ) | ( $codepoint << 6 ) );
return ord( $state_table[ 256 + ( $state * 16 ) + $type ] );
}
/**
* Extract a slice of a text by code point, where invalid byte sequences count
* as a single code point, U+FFFD (the Unicode replacement character `�`).
*
* This function does not permit passing negative indices and will return
* the original string if such are provide.
*
* @param string $text Input text from which to extract.
* @param int $from Start extracting after this many code-points.
* @param int $length Extract this many code points.
*
* @return string Extracted slice of input string.
*/
function utf8_substr( $text, $from = 0, $length = null ) {
if ( $from < 0 || ( isset( $length ) && $length < 0 ) ) {
return $text;
}
$position_in_input = 0;
$codepoint_at = 0;
$end_byte = strlen( $text );
$buffer = '';
$seen_codepoints = 0;
$sliced_codepoints = 0;
$decoder_state = UTF8_DECODER_ACCEPT;
// Get to the start of the string.
while ( $position_in_input < $end_byte && $seen_codepoints < $length ) {
$decoder_state = utf8_decoder_apply_byte( $text[ $position_in_input ], $decoder_state );
if ( UTF8_DECODER_ACCEPT === $decoder_state ) {
++$position_in_input;
if ( $seen_codepoints >= $from ) {
++$sliced_codepoints;
$buffer .= substr( $text, $codepoint_at, $position_in_input - $codepoint_at );
}
++$seen_codepoints;
$codepoint_at = $position_in_input;
} elseif ( UTF8_DECODER_REJECT === $decoder_state ) {
// "\u{FFFD}" is not supported in PHP 5.6.
$buffer .= "\xEF\xBF\xBD";
// Skip to the start of the next code point.
while ( UTF8_DECODER_REJECT === $decoder_state && $position_in_input < $end_byte ) {
$decoder_state = utf8_decoder_apply_byte( $text[ ++$position_in_input ], UTF8_DECODER_ACCEPT );
}
++$seen_codepoints;
$codepoint_at = $position_in_input;
$decoder_state = UTF8_DECODER_ACCEPT;
} else {
++$position_in_input;
}
}
return $buffer;
}
/**
* Extract a unicode codepoint from a specific offset in text.
* Invalid byte sequences count as a single code point, U+FFFD
* (the Unicode replacement character ``).
*
* This function does not permit passing negative indices and will return
* null if such are provided.
*
* @param string $text Input text from which to extract.
* @param int $byte_offset Start at this byte offset in the input text.
* @param int $matched_bytes How many bytes were matched to produce the codepoint.
*
* @return int Unicode codepoint.
*/
function utf8_codepoint_at( $text, $byte_offset = 0, &$matched_bytes = 0 ) {
if ( $byte_offset < 0 ) {
return null;
}
$position_in_input = $byte_offset;
$codepoint_at = $byte_offset;
$end_byte = strlen( $text );
$codepoint = null;
$decoder_state = UTF8_DECODER_ACCEPT;
// Get to the start of the string.
while ( $position_in_input < $end_byte ) {
$decoder_state = utf8_decoder_apply_byte( $text[ $position_in_input ], $decoder_state );
if ( UTF8_DECODER_ACCEPT === $decoder_state ) {
++$position_in_input;
$codepoint = utf8_ord( substr( $text, $codepoint_at, $position_in_input - $codepoint_at ) );
break;
} elseif ( UTF8_DECODER_REJECT === $decoder_state ) {
// "\u{FFFD}" is not supported in PHP 5.6.
$codepoint = utf8_ord( "\xEF\xBF\xBD" );
break;
} else {
++$position_in_input;
}
}
$matched_bytes = $position_in_input - $byte_offset;
return $codepoint;
}
/**
* Convert a UTF-8 byte sequence to its Unicode codepoint.
*
* @param string $character UTF-8 encoded byte sequence representing a single Unicode character.
*
* @return int Unicode codepoint.
*/
function utf8_ord( $character ) {
// Convert the byte sequence to its binary representation
$bytes = unpack( 'C*', $character );
// Initialize the codepoint
$codepoint = 0;
// Calculate the codepoint based on the number of bytes
if ( count( $bytes ) === 1 ) {
$codepoint = $bytes[1];
} elseif ( count( $bytes ) === 2 ) {
$codepoint = ( ( $bytes[1] & 0x1F ) << 6 ) | ( $bytes[2] & 0x3F );
} elseif ( count( $bytes ) === 3 ) {
$codepoint = ( ( $bytes[1] & 0x0F ) << 12 ) | ( ( $bytes[2] & 0x3F ) << 6 ) | ( $bytes[3] & 0x3F );
} elseif ( count( $bytes ) === 4 ) {
$codepoint = ( ( $bytes[1] & 0x07 ) << 18 ) | ( ( $bytes[2] & 0x3F ) << 12 ) | ( ( $bytes[3] & 0x3F ) << 6 ) | ( $bytes[4] & 0x3F );
}
return $codepoint;
}
/**
* Encode a code point number into the UTF-8 encoding.
*
* This encoder implements the UTF-8 encoding algorithm for converting
* a code point into a byte sequence. If it receives an invalid code
* point it will return the Unicode Replacement Character U+FFFD `�`.
*
* Example:
*
* '🅰' === WP_HTML_Decoder::codepoint_to_utf8_bytes( 0x1f170 );
*
* // Half of a surrogate pair is an invalid code point.
* '�' === WP_HTML_Decoder::codepoint_to_utf8_bytes( 0xd83c );
*
* @since 6.6.0
*
* @see https://www.rfc-editor.org/rfc/rfc3629 For the UTF-8 standard.
*
* @param int $codepoint Which code point to convert.
* @return string Converted code point, or `�` if invalid.
*/
function codepoint_to_utf8_bytes( $codepoint ) {
// Pre-check to ensure a valid code point.
if (
$codepoint <= 0 ||
( $codepoint >= 0xD800 && $codepoint <= 0xDFFF ) ||
$codepoint > 0x10FFFF
) {
return '�';
}
if ( $codepoint <= 0x7F ) {
return chr( $codepoint );
}
if ( $codepoint <= 0x7FF ) {
$byte1 = chr( ( 0xC0 | ( ( $codepoint >> 6 ) & 0x1F ) ) );
$byte2 = chr( $codepoint & 0x3F | 0x80 );
return "{$byte1}{$byte2}";
}
if ( $codepoint <= 0xFFFF ) {
$byte1 = chr( ( $codepoint >> 12 ) | 0xE0 );
$byte2 = chr( ( $codepoint >> 6 ) & 0x3F | 0x80 );
$byte3 = chr( $codepoint & 0x3F | 0x80 );
return "{$byte1}{$byte2}{$byte3}";
}
// Any values above U+10FFFF are eliminated above in the pre-check.
$byte1 = chr( ( $codepoint >> 18 ) | 0xF0 );
$byte2 = chr( ( $codepoint >> 12 ) & 0x3F | 0x80 );
$byte3 = chr( ( $codepoint >> 6 ) & 0x3F | 0x80 );
$byte4 = chr( $codepoint & 0x3F | 0x80 );
return "{$byte1}{$byte2}{$byte3}{$byte4}";
}