| [ Index ] |
PHP Cross Reference of DokuWiki |
[Summary view] [Print] [Text view]
<?php
/**
 * UTF8 helper functions
 *
 * @license    LGPL (http://www.gnu.org/copyleft/lesser.html)
 * @author     Andreas Gohr <andi@splitbrain.org>
 */

/**
 * check for mb_string support
 *
 * UTF8_MBSTRING ends up as 1 when the mbstring extension is usable and has
 * not been disabled by defining UTF8_NOMBSTRING, and as 0 otherwise. A value
 * defined before this file is loaded is left untouched.
 */
if(!defined('UTF8_MBSTRING')){
    if(function_exists('mb_substr') && !defined('UTF8_NOMBSTRING')){
        define('UTF8_MBSTRING',1);
    }else{
        define('UTF8_MBSTRING',0);
    }
}

if(UTF8_MBSTRING){ mb_internal_encoding('UTF-8'); }


/**
 * URL-Encode a filename to allow unicode characters
 *
 * Slashes are not encoded
 *
 * When the second parameter is true the string will
 * be encoded only if non ASCII characters are detected -
 * This makes it safe to run it multiple times on the
 * same string (default is true)
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    urlencode
 * @param  string  $file the filename to encode
 * @param  boolean $safe skip encoding when the name only holds URL safe chars
 * @return string  the encoded filename
 */
function utf8_encodeFN($file,$safe=true){
    // The D modifier keeps '$' from matching in front of a trailing newline,
    // otherwise "name\n" would be treated as safe and returned unencoded.
    if($safe && preg_match('#^[a-zA-Z0-9/_\-.%]+$#D',$file)){
        return $file;
    }
    $file = urlencode($file);
    $file = str_replace('%2F','/',$file); // keep directory separators readable
    return $file;
}

/**
 * URL-Decode a filename
 *
 * This is just a wrapper around urldecode
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    urldecode
 * @param  string $file the encoded filename
 * @return string the decoded filename
 */
function utf8_decodeFN($file){
    return urldecode($file);
}

/**
 * Checks if a string contains 7bit ASCII only
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $str the string to inspect
 * @return boolean true when no byte above 0x7F is present (also for '')
 */
function utf8_isASCII($str){
    // byte-wise match (no /u modifier): any byte with the high bit set
    return preg_match('/[\x80-\xFF]/',(string)$str) !== 1;
}

/**
 * Strips all highbyte chars
 *
 * Returns a pure ASCII7 string
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $str the string to clean
 * @return string the input with every byte above 0x7F removed
 */
function utf8_strip($str){
    // byte-wise replace (no /u modifier) so multi-byte sequences vanish entirely
    return preg_replace('/[\x80-\xFF]/','',(string)$str);
}

/**
 * Tries to detect if a string is in Unicode encoding
 *
 * @author <bmorel@ssi.fr>
 * @link
http://www.php.net/manual/en/function.utf8-encode.php
 */
function utf8_check($Str) {
    // Structural check only: lead bytes announcing up to 5 continuation bytes
    // are accepted and the decoded values are not range checked.
    $len = strlen($Str);
    for ($i=0; $i<$len; $i++) {
        $b = ord($Str[$i]);
        if ($b < 0x80) continue; # 0bbbbbbb
        elseif (($b & 0xE0) == 0xC0) $n=1; # 110bbbbb
        elseif (($b & 0xF0) == 0xE0) $n=2; # 1110bbbb
        elseif (($b & 0xF8) == 0xF0) $n=3; # 11110bbb
        elseif (($b & 0xFC) == 0xF8) $n=4; # 111110bb
        elseif (($b & 0xFE) == 0xFC) $n=5; # 1111110b
        else return false; # Does not match any model
        for ($j=0; $j<$n; $j++) { # n bytes matching 10bbbbbb follow ?
            if ((++$i == $len) || ((ord($Str[$i]) & 0xC0) != 0x80))
                return false;
        }
    }
    return true;
}

/**
 * Unicode aware replacement for strlen()
 *
 * utf8_decode() converts characters that are not in ISO-8859-1
 * to '?', which, for the purpose of counting, is alright - It's
 * even faster than mb_strlen.
 *
 * @author <chernyshevsky at hotmail dot com>
 * @see    strlen()
 * @see    utf8_decode()
 * @param  string $string UTF-8 encoded string
 * @return integer number of characters
 */
function utf8_strlen($string){
    return strlen(utf8_decode($string));
}

/**
 * UTF-8 aware alternative to substr
 *
 * Return part of a string given character offset (and optionally length)
 *
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @author Chris Smith <chris@jalakai.co.uk>
 * @param string
 * @param integer number of UTF-8 characters offset (from left)
 * @param integer (optional) length in UTF-8 characters from offset
 * @return mixed string or false if failure
 */
function utf8_substr($str, $offset, $length = null) {
    if(UTF8_MBSTRING){
        if( $length === null ){
            return mb_substr($str, $offset);
        }else{
            return mb_substr($str, $offset, $length);
        }
    }

    /*
     * Notes:
     *
     * no mb string support, so we'll use pcre regex's with 'u' flag
     * pcre only supports repetitions of less than 65536, in order to accept up to MAXINT values for
     * offset and length, we'll repeat a group of 65535 characters when needed (ok, up to MAXINT-65536)
     *
     * substr documentation states false can be returned in some cases (e.g. offset > string length)
     * mb_substr never returns false, it will return an empty string instead.
     *
     * calculating the number of characters in the string is a relatively expensive operation, so
     * we only carry it out when necessary. It isn't necessary for +ve offsets and no specified length
     */

    // cast parameters to appropriate types to avoid multiple notices/warnings
    $str = (string)$str;                          // generates E_NOTICE for PHP4 objects, but not PHP5 objects
    $offset = (int)$offset;
    if (!is_null($length)) $length = (int)$length;

    // handle trivial cases
    if ($length === 0) return '';
    if ($offset < 0 && $length < 0 && $length < $offset) return '';

    $offset_pattern = '';
    $length_pattern = '';

    // normalise -ve offsets (we could use a tail anchored pattern, but they are horribly slow!)
    if ($offset < 0) {
        $strlen = strlen(utf8_decode($str));        // see notes
        $offset = $strlen + $offset;
        if ($offset < 0) $offset = 0;
    }

    // establish a pattern for offset, a non-captured group equal in length to offset
    if ($offset > 0) {
        $Ox = (int)($offset/65535);
        $Oy = $offset%65535;

        if ($Ox) $offset_pattern = '(?:.{65535}){'.$Ox.'}';
        $offset_pattern = '^(?:'.$offset_pattern.'.{'.$Oy.'})';
    } else {
        $offset_pattern = '^';                      // offset == 0; just anchor the pattern
    }

    // establish a pattern for length
    if (is_null($length)) {
        $length_pattern = '(.*)$';                  // the rest of the string
    } else {

        if (!isset($strlen)) $strlen = strlen(utf8_decode($str));    // see notes
        if ($offset > $strlen) return '';           // another trivial case

        if ($length > 0) {

            $length = min($strlen-$offset, $length);  // reduce any length that would go passed the end of the string

            $Lx = (int)($length/65535);
            $Ly = $length%65535;

            // +ve length requires ... a captured group of length characters
            if ($Lx) $length_pattern = '(?:.{65535}){'.$Lx.'}';
            $length_pattern = '('.$length_pattern.'.{'.$Ly.'})';

        } else if ($length < 0) {

            if ($length < ($offset - $strlen)) return '';

            $Lx = (int)((-$length)/65535);
            $Ly = (-$length)%65535;

            // -ve length requires ... capture everything except a group of -length characters
            //                         anchored at the tail-end of the string
            if ($Lx) $length_pattern = '(?:.{65535}){'.$Lx.'}';
            $length_pattern = '(.*)(?:'.$length_pattern.'.{'.$Ly.'})$';
        }
    }

    if (!preg_match('#'.$offset_pattern.$length_pattern.'#us',$str,$match)) return '';
    return $match[1];
}

/**
 * Unicode aware replacement for substr_replace()
 *
 * The result is built from the first $start characters (only when $start
 * is positive), the replacement, and everything from character
 * $start+$length onwards.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    substr_replace()
 * @param  string  $string      UTF-8 encoded input
 * @param  string  $replacement text to insert
 * @param  integer $start       character offset where the replacement begins
 * @param  integer $length      number of characters to replace (default 0)
 * @return string
 */
function utf8_substr_replace($string, $replacement, $start , $length=0 ){
    $ret = '';
    if($start>0) $ret .= utf8_substr($string, 0, $start);
    $ret .= $replacement;
    $ret .= utf8_substr($string, $start+$length);
    return $ret;
}

/**
 * Unicode aware replacement for ltrim()
 *
 * With an empty $charlist this delegates to ltrim(). Otherwise every
 * character of $charlist is taken literally (no ranges) and stripped from
 * the beginning of the string.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    ltrim()
 * @param  string $str      UTF-8 encoded string
 * @param  string $charlist UTF-8 characters to strip
 * @return string
 */
function utf8_ltrim($str,$charlist=''){
    if($charlist == '') return ltrim($str);

    // Quote charlist for use in a character class: backslash, hyphen, both
    // brackets and caret are special there, '/' is the pattern delimiter.
    // The replacement is a literal backslash followed by the matched char.
    $charlist = preg_replace('!([\\\\\\-\\]\\[/^])!','\\\\$1',$charlist);

    return preg_replace('/^['.$charlist.']+/u','',$str);
}

/**
 * Unicode aware replacement for rtrim()
 *
 * With an empty $charlist this delegates to rtrim(). Otherwise every
 * character of $charlist is taken literally (no ranges) and stripped from
 * the end of the string.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    rtrim()
 * @param  string $str      UTF-8 encoded string
 * @param  string $charlist UTF-8 characters to strip
 * @return string
 */
function utf8_rtrim($str,$charlist=''){
    if($charlist == '') return rtrim($str);

    // Quote charlist for use in a character class (same rules as utf8_ltrim)
    $charlist = preg_replace('!([\\\\\\-\\]\\[/^])!','\\\\$1',$charlist);

    return preg_replace('/['.$charlist.']+$/u','',$str);
}

/**
 * Unicode aware replacement for trim()
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @see    trim()
 * @param  string $str      UTF-8 encoded string
 * @param  string $charlist UTF-8 characters to strip from both ends
 * @return string
 */
function utf8_trim($str,$charlist='') {
    if($charlist == '') return trim($str);

    return utf8_ltrim(utf8_rtrim($str,$charlist),$charlist);
}


/**
 * This is a unicode aware replacement for strtolower()
 *
 * Uses mb_string extension if available, otherwise the
 * $UTF8_UPPER_TO_LOWER translation table is applied with strtr()
 *
 * @author Leo Feyer <leo@typolight.org>
 * @see    strtolower()
 * @see    utf8_strtoupper()
 * @param  string $string UTF-8 encoded string
 * @return string
 */
function utf8_strtolower($string){
    if(UTF8_MBSTRING) return mb_strtolower($string,'utf-8');

    global $UTF8_UPPER_TO_LOWER;
    return strtr($string,$UTF8_UPPER_TO_LOWER);
}

/**
 * This is a unicode aware replacement for strtoupper()
 *
 * Uses mb_string extension if available, otherwise the
 * $UTF8_LOWER_TO_UPPER translation table is applied with strtr()
 *
 * @author Leo Feyer <leo@typolight.org>
 * @see    strtoupper()
 * @see    utf8_strtolower()
 * @param  string $string UTF-8 encoded string
 * @return string
 */
function utf8_strtoupper($string){
    if(UTF8_MBSTRING) return mb_strtoupper($string,'utf-8');

    global $UTF8_LOWER_TO_UPPER;
    return strtr($string,$UTF8_LOWER_TO_UPPER);
}

/**
 * Replace accented UTF-8 characters by unaccented ASCII-7 equivalents
 *
 * Use the optional parameter to just deaccent lower ($case = -1) or upper ($case = 1)
 * letters.
Default is to deaccent both cases ($case = 0)
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 */
function utf8_deaccent($string, $case = 0)
{
    // -1 (lower only) and 0 (both) take the lower case table
    if ($case <= 0) {
        global $UTF8_LOWER_ACCENTS;
        $string = strtr($string, $UTF8_LOWER_ACCENTS);
    }

    // 1 (upper only) and 0 (both) take the upper case table
    if ($case >= 0) {
        global $UTF8_UPPER_ACCENTS;
        $string = strtr($string, $UTF8_UPPER_ACCENTS);
    }

    return $string;
}

/**
 * Romanize a non-latin string
 *
 * Pure ASCII input is handed back untouched; anything else is run through
 * strtr() with the global $UTF8_ROMANIZATION table.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $string the string to romanize
 * @return string
 */
function utf8_romanize($string)
{
    if (!utf8_isASCII($string)) {
        global $UTF8_ROMANIZATION;
        return strtr($string, $UTF8_ROMANIZATION);
    }

    // nothing to do
    return $string;
}

/**
 * Removes special characters (nonalphanumeric) from a UTF-8 string
 *
 * On top of the characters listed in $UTF8_SPECIAL_CHARS2 the byte range
 * 0x00 to 0x19 is stripped as well. The quoted character list is computed
 * on the first call and cached in a static variable afterwards.
 *
 * @author Andreas Gohr <andi@splitbrain.org>
 * @param  string $string     The UTF8 string to strip of special chars
 * @param  string $repl       Replace special with this string
 * @param  string $additional Additional chars to strip (used in regexp char class)
 * @return string
 */
function utf8_stripspecials($string, $repl = '', $additional = '')
{
    global $UTF8_SPECIAL_CHARS;
    global $UTF8_SPECIAL_CHARS2;

    static $specials = null;
    if ($specials === null) {
        // an earlier variant derived this from unicode_to_utf8($UTF8_SPECIAL_CHARS)
        $specials = preg_quote($UTF8_SPECIAL_CHARS2, '/');
    }

    $pattern = '/[' . $additional . '\x00-\x19' . $specials . ']/u';

    return preg_replace($pattern, $repl, $string);
}

/**
 * This is an Unicode aware replacement for strpos
 *
 * The search itself runs on bytes via strpos(); each byte position found is
 * converted to a character position. As long as that character position is
 * still in front of $offset, the byte offset of the next search is pushed
 * forward by the number of surplus bytes seen so far.
 *
 * @author Leo Feyer <leo@typolight.org>
 * @see    strpos()
 * @param  string  $haystack UTF-8 encoded string to search in
 * @param  string  $needle   UTF-8 encoded string to search for
 * @param  integer $offset   character offset to start searching from
 * @return integer character position of the match, or false if not found
 */
function utf8_strpos($haystack, $needle, $offset = 0)
{
    $surplusBytes = 0;

    do {
        $bytePos = strpos($haystack, $needle, $offset + $surplusBytes);
        if ($bytePos === false) {
            return false;
        }

        $charPos = utf8_strlen(substr($haystack, 0, $bytePos));
        if ($charPos < $offset) {
            $surplusBytes = $bytePos - $charPos;
        }
    } while ($charPos < $offset);

    return $charPos;
}


/**
 * Encodes UTF-8 characters to HTML entities
 *
 * Codepoints below 0x80 are emitted as plain characters, codepoints below
 * 0x100 as decimal entities and everything else as hexadecimal entities.
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 * @author <vpribish at shopping dot com>
 * @link   http://www.php.net/manual/en/function.utf8-decode.php
 * @param  string $str UTF-8 encoded string
 * @return string
 */
function utf8_tohtml ($str)
{
    $html = '';

    foreach (utf8_to_unicode($str) as $codepoint) {
        if ($codepoint >= 0x100) {
            $html .= '&#x' . dechex($codepoint) . ';';
        } elseif ($codepoint >= 0x80) {
            $html .= '&#' . $codepoint . ';';
        } else {
            $html .= chr($codepoint);
        }
    }

    return $html;
}

/**
 * Decodes HTML entities to UTF-8 characters
 *
 * Convert any &#..; entity to a codepoint,
 * The entities flag defaults to only decoding numeric entities.
 * Pass HTML_ENTITIES and named entities, including & < etc.
 * are handled as well. Avoids the problem that would occur if you
 * had to decode "&#38;&amp;#38;"
 *
 * unhtmlspecialchars(utf8_unhtml($s)) -> "&&"
 * utf8_unhtml(unhtmlspecialchars($s)) -> "&&#38;"
 * what it should be -> "&&#38;"
 *
 * @author Tom N Harris <tnharris@whoopdedo.org>
 * @param string $str UTF-8 encoded string
 * @param boolean $entities Flag controlling decoding of named entities.
 * @return UTF-8 encoded string with numeric (and named) entities replaced.
*/
function utf8_unhtml($str, $entities=null) {
    // Numeric entities only (the default): no lookup table is needed.
    // The outer capture group keeps the match indexes aligned with the
    // pattern below, so $ent[2] is the hex marker and $ent[3] the digits.
    if (is_null($entities)) {
        return preg_replace_callback('/(&#([Xx])?([0-9A-Za-z]+);)/m',
                                     'utf8_decode_numeric', $str);
    }

    // Named entities requested: build the lookup table once and reuse it.
    static $decoder = null;
    if (is_null($decoder)) {
        $decoder = new utf8_entity_decoder();
    }
    return preg_replace_callback('/&(#)?([Xx])?([0-9A-Za-z]+);/m',
                                 array($decoder, 'decode'), $str);
}

/**
 * Callback for utf8_unhtml(): decodes one numeric entity
 *
 * Malformed references (decimal entities containing letters, hex entities
 * containing non-hex characters) are returned unchanged instead of being
 * decoded to a bogus codepoint.
 *
 * @param  array $ent match array: [0] whole entity, [2] 'x'/'X' for hex
 *                    entities or '' otherwise, [3] the digits
 * @return string the UTF-8 character, or the untouched entity text
 */
function utf8_decode_numeric($ent) {
    if ($ent[2] === 'x' || $ent[2] === 'X') {
        if (!preg_match('/^[0-9A-Fa-f]+$/D', $ent[3])) return $ent[0];
        $cp = hexdec($ent[3]);
    } else {
        if (!preg_match('/^[0-9]+$/D', $ent[3])) return $ent[0];
        $cp = intval($ent[3]);
    }
    return unicode_to_utf8(array($cp));
}

/**
 * Lookup helper for utf8_unhtml(): maps named HTML entities to UTF-8
 */
class utf8_entity_decoder {
    // entity (e.g. '&auml;') => UTF-8 encoded character
    var $table;

    /**
     * Build the entity table
     *
     * The table is requested explicitly in UTF-8 so the values can be used
     * as they are, whatever the configured default charset is.
     */
    function __construct() {
        $table = get_html_translation_table(HTML_ENTITIES, ENT_COMPAT, 'UTF-8');
        $this->table = array_flip($table);
    }

    /**
     * Old style constructor name, kept so existing explicit calls still work
     */
    function utf8_entity_decoder() {
        $this->__construct();
    }

    /**
     * Convert one translation table value to UTF-8
     *
     * No longer used internally, kept for backward compatibility. Values
     * longer than one byte are taken to be UTF-8 already and are returned
     * as they are; single bytes are treated as a codepoint.
     *
     * @param  string $c table value
     * @return string UTF-8 encoded character
     */
    function makeutf8($c) {
        if (strlen($c) > 1) return $c;
        return unicode_to_utf8(array(ord($c)));
    }

    /**
     * Callback for preg_replace_callback(): decode one entity
     *
     * @param  array $ent match array: [0] whole entity, [1] '#' for numeric ones
     * @return string decoded character, or the entity text when it is unknown
     */
    function decode($ent) {
        if ($ent[1] == '#') {
            return utf8_decode_numeric($ent);
        }
        if (isset($this->table[$ent[0]])) {
            return $this->table[$ent[0]];
        }
        return $ent[0];
    }
}

/**
 * Takes an UTF-8 string and returns an array of ints representing the
 * Unicode characters. Astral planes are supported ie. the ints in the
 * output can be > 0xFFFF. Occurrances of the BOM are ignored. Surrogates
 * are not allowed.
 *
 * If $strict is set to true the function returns false if the input
 * string isn't a valid UTF-8 octet sequence and raises a PHP error at
 * level E_USER_WARNING
 *
 * Note: this function has been modified slightly in this library to
 * trigger errors on encountering bad bytes
 *
 * @author <hsivonen@iki.fi>
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @param string UTF-8 encoded string
 * @param boolean Check for invalid sequences?
505 * @return mixed array of unicode code points or false if UTF-8 invalid 506 * @see unicode_to_utf8 507 * @link http://hsivonen.iki.fi/php-utf8/ 508 * @link http://sourceforge.net/projects/phputf8/ 509 */ 510 function utf8_to_unicode($str,$strict=false) { 511 $mState = 0; // cached expected number of octets after the current octet 512 // until the beginning of the next UTF8 character sequence 513 $mUcs4 = 0; // cached Unicode character 514 $mBytes = 1; // cached expected number of octets in the current sequence 515 516 $out = array(); 517 518 $len = strlen($str); 519 520 for($i = 0; $i < $len; $i++) { 521 522 $in = ord($str{$i}); 523 524 if ( $mState == 0) { 525 526 // When mState is zero we expect either a US-ASCII character or a 527 // multi-octet sequence. 528 if (0 == (0x80 & ($in))) { 529 // US-ASCII, pass straight through. 530 $out[] = $in; 531 $mBytes = 1; 532 533 } else if (0xC0 == (0xE0 & ($in))) { 534 // First octet of 2 octet sequence 535 $mUcs4 = ($in); 536 $mUcs4 = ($mUcs4 & 0x1F) << 6; 537 $mState = 1; 538 $mBytes = 2; 539 540 } else if (0xE0 == (0xF0 & ($in))) { 541 // First octet of 3 octet sequence 542 $mUcs4 = ($in); 543 $mUcs4 = ($mUcs4 & 0x0F) << 12; 544 $mState = 2; 545 $mBytes = 3; 546 547 } else if (0xF0 == (0xF8 & ($in))) { 548 // First octet of 4 octet sequence 549 $mUcs4 = ($in); 550 $mUcs4 = ($mUcs4 & 0x07) << 18; 551 $mState = 3; 552 $mBytes = 4; 553 554 } else if (0xF8 == (0xFC & ($in))) { 555 /* First octet of 5 octet sequence. 556 * 557 * This is illegal because the encoded codepoint must be either 558 * (a) not the shortest form or 559 * (b) outside the Unicode range of 0-0x10FFFF. 560 * Rather than trying to resynchronize, we will carry on until the end 561 * of the sequence and let the later error handling code catch it. 
562 */ 563 $mUcs4 = ($in); 564 $mUcs4 = ($mUcs4 & 0x03) << 24; 565 $mState = 4; 566 $mBytes = 5; 567 568 } else if (0xFC == (0xFE & ($in))) { 569 // First octet of 6 octet sequence, see comments for 5 octet sequence. 570 $mUcs4 = ($in); 571 $mUcs4 = ($mUcs4 & 1) << 30; 572 $mState = 5; 573 $mBytes = 6; 574 575 } elseif($strict) { 576 /* Current octet is neither in the US-ASCII range nor a legal first 577 * octet of a multi-octet sequence. 578 */ 579 trigger_error( 580 'utf8_to_unicode: Illegal sequence identifier '. 581 'in UTF-8 at byte '.$i, 582 E_USER_WARNING 583 ); 584 return false; 585 586 } 587 588 } else { 589 590 // When mState is non-zero, we expect a continuation of the multi-octet 591 // sequence 592 if (0x80 == (0xC0 & ($in))) { 593 594 // Legal continuation. 595 $shift = ($mState - 1) * 6; 596 $tmp = $in; 597 $tmp = ($tmp & 0x0000003F) << $shift; 598 $mUcs4 |= $tmp; 599 600 /** 601 * End of the multi-octet sequence. mUcs4 now contains the final 602 * Unicode codepoint to be output 603 */ 604 if (0 == --$mState) { 605 606 /* 607 * Check for illegal sequences and codepoints. 608 */ 609 // From Unicode 3.1, non-shortest form is illegal 610 if (((2 == $mBytes) && ($mUcs4 < 0x0080)) || 611 ((3 == $mBytes) && ($mUcs4 < 0x0800)) || 612 ((4 == $mBytes) && ($mUcs4 < 0x10000)) || 613 (4 < $mBytes) || 614 // From Unicode 3.2, surrogate characters are illegal 615 (($mUcs4 & 0xFFFFF800) == 0xD800) || 616 // Codepoints outside the Unicode range are illegal 617 ($mUcs4 > 0x10FFFF)) { 618 619 if($strict){ 620