[ Index ]

PHP Cross Reference of DokuWiki

title

Body

[close]

/inc/ -> utf8.php (source)

   1  <?php
   2  /**
   3   * UTF8 helper functions
   4   *
   5   * @license    LGPL (http://www.gnu.org/copyleft/lesser.html)
   6   * @author     Andreas Gohr <andi@splitbrain.org>
   7   */
   8  
   9  /**
  10   * check for mb_string support
  11   */
  12  if(!defined('UTF8_MBSTRING')){
  13    if(function_exists('mb_substr') && !defined('UTF8_NOMBSTRING')){
  14      define('UTF8_MBSTRING',1);
  15    }else{
  16      define('UTF8_MBSTRING',0);
  17    }
  18  }
  19  
  20  if(UTF8_MBSTRING){ mb_internal_encoding('UTF-8'); }
  21  
  22  
  23  /**
  24   * URL-Encode a filename to allow unicodecharacters
  25   *
  26   * Slashes are not encoded
  27   *
  28   * When the second parameter is true the string will
  29   * be encoded only if non ASCII characters are detected -
  30   * This makes it safe to run it multiple times on the
  31   * same string (default is true)
  32   *
  33   * @author Andreas Gohr <andi@splitbrain.org>
  34   * @see    urlencode
  35   */
  36  function utf8_encodeFN($file,$safe=true){
  37    if($safe && preg_match('#^[a-zA-Z0-9/_\-.%]+$#',$file)){
  38      return $file;
  39    }
  40    $file = urlencode($file);
  41    $file = str_replace('%2F','/',$file);
  42    return $file;
  43  }
  44  
  45  /**
  46   * URL-Decode a filename
  47   *
  48   * This is just a wrapper around urldecode
  49   *
  50   * @author Andreas Gohr <andi@splitbrain.org>
  51   * @see    urldecode
  52   */
  53  function utf8_decodeFN($file){
  54    $file = urldecode($file);
  55    return $file;
  56  }
  57  
  58  /**
  59   * Checks if a string contains 7bit ASCII only
  60   *
  61   * @author Andreas Gohr <andi@splitbrain.org>
  62   */
  63  function utf8_isASCII($str){
  64    for($i=0; $i<strlen($str); $i++){
  65      if(ord($str{$i}) >127) return false;
  66    }
  67    return true;
  68  }
  69  
  70  /**
  71   * Strips all highbyte chars
  72   *
  73   * Returns a pure ASCII7 string
  74   *
  75   * @author Andreas Gohr <andi@splitbrain.org>
  76   */
  77  function utf8_strip($str){
  78    $ascii = '';
  79    for($i=0; $i<strlen($str); $i++){
  80      if(ord($str{$i}) <128){
  81        $ascii .= $str{$i};
  82      }
  83    }
  84    return $ascii;
  85  }
  86  
  87  /**
  88   * Tries to detect if a string is in Unicode encoding
  89   *
  90   * @author <bmorel@ssi.fr>
  91   * @link   http://www.php.net/manual/en/function.utf8-encode.php
  92   */
  93  function utf8_check($Str) {
  94   for ($i=0; $i<strlen($Str); $i++) {
  95    $b = ord($Str[$i]);
  96    if ($b < 0x80) continue; # 0bbbbbbb
  97    elseif (($b & 0xE0) == 0xC0) $n=1; # 110bbbbb
  98    elseif (($b & 0xF0) == 0xE0) $n=2; # 1110bbbb
  99    elseif (($b & 0xF8) == 0xF0) $n=3; # 11110bbb
 100    elseif (($b & 0xFC) == 0xF8) $n=4; # 111110bb
 101    elseif (($b & 0xFE) == 0xFC) $n=5; # 1111110b
 102    else return false; # Does not match any model
 103    for ($j=0; $j<$n; $j++) { # n bytes matching 10bbbbbb follow ?
 104     if ((++$i == strlen($Str)) || ((ord($Str[$i]) & 0xC0) != 0x80))
 105     return false;
 106    }
 107   }
 108   return true;
 109  }
 110  
 111  /**
 112   * Unicode aware replacement for strlen()
 113   *
 114   * utf8_decode() converts characters that are not in ISO-8859-1
 115   * to '?', which, for the purpose of counting, is alright - It's
 116   * even faster than mb_strlen.
 117   *
 118   * @author <chernyshevsky at hotmail dot com>
 119   * @see    strlen()
 120   * @see    utf8_decode()
 121   */
 122  function utf8_strlen($string){
 123    return strlen(utf8_decode($string));
 124  }
 125  
/**
 * UTF-8 aware alternative to substr
 *
 * Return part of a string given character offset (and optionally length).
 *
 * When UTF8_MBSTRING is set the work is handed to mb_substr(). Otherwise a
 * regular expression is built that skips $offset characters and captures
 * the requested part; that fallback returns an empty string whenever
 * nothing can be matched.
 *
 * @author Harry Fuecks <hfuecks@gmail.com>
 * @author Chris Smith <chris@jalakai.co.uk>
 * @param string  $str    the string to cut from
 * @param integer $offset number of UTF-8 characters offset (from left), negative counts from the end
 * @param integer $length (optional) length in UTF-8 characters from offset, negative leaves that many characters off the end
 * @return string the requested part (the regex fallback never returns false)
 */
function utf8_substr($str, $offset, $length = null) {
    // mbstring available: let it do the work
    if(UTF8_MBSTRING){
        if( $length === null ){
            return mb_substr($str, $offset);
        }else{
            return mb_substr($str, $offset, $length);
        }
    }

    /*
     * Notes:
     *
     * no mb string support, so we'll use pcre regex's with 'u' flag
     * pcre only supports repetitions of less than 65536, in order to accept up to MAXINT values for
     * offset and length, we'll repeat a group of 65535 characters when needed (ok, up to MAXINT-65536)
     *
     * substr documentation states false can be returned in some cases (e.g. offset > string length)
     * mb_substr never returns false, it will return an empty string instead.
     *
     * calculating the number of characters in the string is a relatively expensive operation, so
     * we only carry it out when necessary. It isn't necessary for +ve offsets and no specified length
     */

    // cast parameters to appropriate types to avoid multiple notices/warnings
    $str = (string)$str;                          // generates E_NOTICE for PHP4 objects, but not PHP5 objects
    $offset = (int)$offset;
    if (!is_null($length)) $length = (int)$length;

    // handle trivial cases
    if ($length === 0) return '';
    // both negative and the end point lies in front of the start point
    if ($offset < 0 && $length < 0 && $length < $offset) return '';

    $offset_pattern = '';
    $length_pattern = '';

    // normalise -ve offsets (we could use a tail anchored pattern, but they are horribly slow!)
    if ($offset < 0) {
      $strlen = strlen(utf8_decode($str));        // see notes
      $offset = $strlen + $offset;
      if ($offset < 0) $offset = 0;               // reaches back beyond the start: begin at the first character
    }

    // establish a pattern for offset, a non-captured group equal in length to offset
    if ($offset > 0) {
      // $Ox full chunks of 65535 characters plus $Oy remaining characters
      $Ox = (int)($offset/65535);
      $Oy = $offset%65535;

      if ($Ox) $offset_pattern = '(?:.{65535}){'.$Ox.'}';
      $offset_pattern = '^(?:'.$offset_pattern.'.{'.$Oy.'})';
    } else {
      $offset_pattern = '^';                      // offset == 0; just anchor the pattern
    }

    // establish a pattern for length
    if (is_null($length)) {
      $length_pattern = '(.*)$';                  // the rest of the string
    } else {

      // $strlen is only set already when a negative offset had to be normalised
      if (!isset($strlen)) $strlen = strlen(utf8_decode($str));    // see notes
      if ($offset > $strlen) return '';           // another trivial case

      if ($length > 0) {

        $length = min($strlen-$offset, $length);  // reduce any length that would go past the end of the string

        // same chunking as for the offset
        $Lx = (int)($length/65535);
        $Ly = $length%65535;

        // +ve length requires ... a captured group of length characters
        if ($Lx) $length_pattern = '(?:.{65535}){'.$Lx.'}';
        $length_pattern = '('.$length_pattern.'.{'.$Ly.'})';

      } else if ($length < 0) {

        // more characters to leave off than there are behind the offset
        if ($length < ($offset - $strlen)) return '';

        $Lx = (int)((-$length)/65535);
        $Ly = (-$length)%65535;

        // -ve length requires ... capture everything except a group of -length characters
        //                         anchored at the tail-end of the string
        if ($Lx) $length_pattern = '(?:.{65535}){'.$Lx.'}';
        $length_pattern = '(.*)(?:'.$length_pattern.'.{'.$Ly.'})$';
      }
    }

    // 'u' = treat pattern and subject as UTF-8, 's' = let '.' match newlines too;
    // the wanted part is always the first (and only) captured group
    if (!preg_match('#'.$offset_pattern.$length_pattern.'#us',$str,$match)) return '';
    return $match[1];
}
 227  
 228  /**
 229   * Unicode aware replacement for substr_replace()
 230   *
 231   * @author Andreas Gohr <andi@splitbrain.org>
 232   * @see    substr_replace()
 233   */
 234  function utf8_substr_replace($string, $replacement, $start , $length=0 ){
 235    $ret = '';
 236    if($start>0) $ret .= utf8_substr($string, 0, $start);
 237    $ret .= $replacement;
 238    $ret .= utf8_substr($string, $start+$length);
 239    return $ret;
 240  }
 241  
 242  /**
 243   * Unicode aware replacement for ltrim()
 244   *
 245   * @author Andreas Gohr <andi@splitbrain.org>
 246   * @see    ltrim()
 247   * @return string
 248   */
 249  function utf8_ltrim($str,$charlist=''){
 250    if($charlist == '') return ltrim($str);
 251  
 252    //quote charlist for use in a characterclass
 253    $charlist = preg_replace('!([\\\\\\-\\]\\[/])!','\\\$1}',$charlist);
 254  
 255    return preg_replace('/^['.$charlist.']+/u','',$str);
 256  }
 257  
 258  /**
 259   * Unicode aware replacement for rtrim()
 260   *
 261   * @author Andreas Gohr <andi@splitbrain.org>
 262   * @see    rtrim()
 263   * @return string
 264   */
 265  function  utf8_rtrim($str,$charlist=''){
 266    if($charlist == '') return rtrim($str);
 267  
 268    //quote charlist for use in a characterclass
 269    $charlist = preg_replace('!([\\\\\\-\\]\\[/])!','\\\$1}',$charlist);
 270  
 271    return preg_replace('/['.$charlist.']+$/u','',$str);
 272  }
 273  
 274  /**
 275   * Unicode aware replacement for trim()
 276   *
 277   * @author Andreas Gohr <andi@splitbrain.org>
 278   * @see    trim()
 279   * @return string
 280   */
 281  function  utf8_trim($str,$charlist='') {
 282    if($charlist == '') return trim($str);
 283  
 284    return utf8_ltrim(utf8_rtrim($str,$charlist),$charlist);
 285  }
 286  
 287  
 288  /**
 289   * This is a unicode aware replacement for strtolower()
 290   *
 291   * Uses mb_string extension if available
 292   *
 293   * @author Leo Feyer <leo@typolight.org>
 294   * @see    strtolower()
 295   * @see    utf8_strtoupper()
 296   */
 297  function utf8_strtolower($string){
 298    if(UTF8_MBSTRING) return mb_strtolower($string,'utf-8');
 299  
 300    global $UTF8_UPPER_TO_LOWER;
 301    return strtr($string,$UTF8_UPPER_TO_LOWER);
 302  }
 303  
 304  /**
 305   * This is a unicode aware replacement for strtoupper()
 306   *
 307   * Uses mb_string extension if available
 308   *
 309   * @author Leo Feyer <leo@typolight.org>
 310   * @see    strtoupper()
 311   * @see    utf8_strtoupper()
 312   */
 313  function utf8_strtoupper($string){
 314    if(UTF8_MBSTRING) return mb_strtoupper($string,'utf-8');
 315  
 316    global $UTF8_LOWER_TO_UPPER;
 317    return strtr($string,$UTF8_LOWER_TO_UPPER);
 318  }
 319  
 320  /**
 321   * Replace accented UTF-8 characters by unaccented ASCII-7 equivalents
 322   *
 323   * Use the optional parameter to just deaccent lower ($case = -1) or upper ($case = 1)
 324   * letters. Default is to deaccent both cases ($case = 0)
 325   *
 326   * @author Andreas Gohr <andi@splitbrain.org>
 327   */
 328  function utf8_deaccent($string,$case=0){
 329    if($case <= 0){
 330      global $UTF8_LOWER_ACCENTS;
 331      $string = strtr($string,$UTF8_LOWER_ACCENTS);
 332    }
 333    if($case >= 0){
 334      global $UTF8_UPPER_ACCENTS;
 335      $string = strtr($string,$UTF8_UPPER_ACCENTS);
 336    }
 337    return $string;
 338  }
 339  
 340  /**
 341   * Romanize a non-latin string
 342   *
 343   * @author Andreas Gohr <andi@splitbrain.org>
 344   */
 345  function utf8_romanize($string){
 346    if(utf8_isASCII($string)) return $string; //nothing to do
 347  
 348    global $UTF8_ROMANIZATION;
 349    return strtr($string,$UTF8_ROMANIZATION);
 350  }
 351  
 352  /**
 353   * Removes special characters (nonalphanumeric) from a UTF-8 string
 354   *
 355   * This function adds the controlchars 0x00 to 0x19 to the array of
 356   * stripped chars (they are not included in $UTF8_SPECIAL_CHARS)
 357   *
 358   * @author Andreas Gohr <andi@splitbrain.org>
 359   * @param  string $string     The UTF8 string to strip of special chars
 360   * @param  string $repl       Replace special with this string
 361   * @param  string $additional Additional chars to strip (used in regexp char class)
 362   */
 363  function utf8_stripspecials($string,$repl='',$additional=''){
 364    global $UTF8_SPECIAL_CHARS;
 365    global $UTF8_SPECIAL_CHARS2;
 366  
 367    static $specials = null;
 368    if(is_null($specials)){
 369  #    $specials = preg_quote(unicode_to_utf8($UTF8_SPECIAL_CHARS), '/');
 370      $specials = preg_quote($UTF8_SPECIAL_CHARS2, '/');
 371    }
 372  
 373    return preg_replace('/['.$additional.'\x00-\x19'.$specials.']/u',$repl,$string);
 374  }
 375  
 376  /**
 377   * This is an Unicode aware replacement for strpos
 378   *
 379   * @author Leo Feyer <leo@typolight.org>
 380   * @see    strpos()
 381   * @param  string
 382   * @param  string
 383   * @param  integer
 384   * @return integer
 385   */
 386  function utf8_strpos($haystack, $needle, $offset=0){
 387      $comp = 0;
 388      $length = null;
 389  
 390      while (is_null($length) || $length < $offset) {
 391          $pos = strpos($haystack, $needle, $offset + $comp);
 392  
 393          if ($pos === false)
 394              return false;
 395  
 396          $length = utf8_strlen(substr($haystack, 0, $pos));
 397  
 398          if ($length < $offset)
 399              $comp = $pos - $length;
 400      }
 401  
 402      return $length;
 403  }
 404  
 405  
 406  /**
 407   * Encodes UTF-8 characters to HTML entities
 408   *
 409   * @author Tom N Harris <tnharris@whoopdedo.org>
 410   * @author <vpribish at shopping dot com>
 411   * @link   http://www.php.net/manual/en/function.utf8-decode.php
 412   */
 413  function utf8_tohtml ($str) {
 414      $ret = '';
 415      foreach (utf8_to_unicode($str) as $cp) {
 416          if ($cp < 0x80)
 417              $ret .= chr($cp);
 418          elseif ($cp < 0x100)
 419              $ret .= "&#$cp;";
 420          else
 421              $ret .= '&#x'.dechex($cp).';';
 422      }
 423      return $ret;
 424  }
 425  
 426  /**
 427   * Decodes HTML entities to UTF-8 characters
 428   *
 429   * Convert any &#..; entity to a codepoint,
 430   * The entities flag defaults to only decoding numeric entities.
 431   * Pass HTML_ENTITIES and named entities, including &amp; &lt; etc.
 432   * are handled as well. Avoids the problem that would occur if you
 433   * had to decode "&amp;#38;&#38;amp;#38;"
 434   *
 435   * unhtmlspecialchars(utf8_unhtml($s)) -> "&#38;&#38;"
 436   * utf8_unhtml(unhtmlspecialchars($s)) -> "&&amp#38;"
 437   * what it should be                   -> "&#38;&amp#38;"
 438   *
 439   * @author Tom N Harris <tnharris@whoopdedo.org>
 440   * @param  string  $str      UTF-8 encoded string
 441   * @param  boolean $entities Flag controlling decoding of named entities.
 442   * @return UTF-8 encoded string with numeric (and named) entities replaced.
 443   */
 444  function utf8_unhtml($str, $entities=null) {
 445      static $decoder = null;
 446      if (is_null($decoder))
 447        $decoder = new utf8_entity_decoder();
 448      if (is_null($entities))
 449          return preg_replace_callback('/(&#([Xx])?([0-9A-Za-z]+);)/m',
 450                                       'utf8_decode_numeric', $str);
 451      else
 452          return preg_replace_callback('/&(#)?([Xx])?([0-9A-Za-z]+);/m',
 453                                       array(&$decoder, 'decode'), $str);
 454  }
 455  function utf8_decode_numeric($ent) {
 456      switch ($ent[2]) {
 457        case 'X':
 458        case 'x':
 459            $cp = hexdec($ent[3]);
 460            break;
 461        default:
 462            $cp = intval($ent[3]);
 463            break;
 464      }
 465      return unicode_to_utf8(array($cp));
 466  }
 467  class utf8_entity_decoder {
 468      var $table;
 469      function utf8_entity_decoder() {
 470          $table = get_html_translation_table(HTML_ENTITIES);
 471          $table = array_flip($table);
 472          $this->table = array_map(array(&$this,'makeutf8'), $table);
 473      }
 474      function makeutf8($c) {
 475          return unicode_to_utf8(array(ord($c)));
 476      }
 477      function decode($ent) {
 478          if ($ent[1] == '#') {
 479              return utf8_decode_numeric($ent);
 480          } elseif (array_key_exists($ent[0],$this->table)) {
 481              return $this->table[$ent[0]];
 482          } else {
 483              return $ent[0];
 484          }
 485      }
 486  }
 487  
 488  /**
 489   * Takes an UTF-8 string and returns an array of ints representing the
 490   * Unicode characters. Astral planes are supported ie. the ints in the
 491   * output can be > 0xFFFF. Occurrances of the BOM are ignored. Surrogates
 492   * are not allowed.
 493   *
 494   * If $strict is set to true the function returns false if the input
 495   * string isn't a valid UTF-8 octet sequence and raises a PHP error at
 496   * level E_USER_WARNING
 497   *
 498   * Note: this function has been modified slightly in this library to
 499   * trigger errors on encountering bad bytes
 500   *
 501   * @author <hsivonen@iki.fi>
 502   * @author Harry Fuecks <hfuecks@gmail.com>
 503   * @param  string  UTF-8 encoded string
 504   * @param  boolean Check for invalid sequences?
 505   * @return mixed array of unicode code points or false if UTF-8 invalid
 506   * @see    unicode_to_utf8
 507   * @link   http://hsivonen.iki.fi/php-utf8/
 508   * @link   http://sourceforge.net/projects/phputf8/
 509   */
 510  function utf8_to_unicode($str,$strict=false) {
 511      $mState = 0;     // cached expected number of octets after the current octet
 512                       // until the beginning of the next UTF8 character sequence
 513      $mUcs4  = 0;     // cached Unicode character
 514      $mBytes = 1;     // cached expected number of octets in the current sequence
 515  
 516      $out = array();
 517  
 518      $len = strlen($str);
 519  
 520      for($i = 0; $i < $len; $i++) {
 521  
 522          $in = ord($str{$i});
 523  
 524          if ( $mState == 0) {
 525  
 526              // When mState is zero we expect either a US-ASCII character or a
 527              // multi-octet sequence.
 528              if (0 == (0x80 & ($in))) {
 529                  // US-ASCII, pass straight through.
 530                  $out[] = $in;
 531                  $mBytes = 1;
 532  
 533              } else if (0xC0 == (0xE0 & ($in))) {
 534                  // First octet of 2 octet sequence
 535                  $mUcs4 = ($in);
 536                  $mUcs4 = ($mUcs4 & 0x1F) << 6;
 537                  $mState = 1;
 538                  $mBytes = 2;
 539  
 540              } else if (0xE0 == (0xF0 & ($in))) {
 541                  // First octet of 3 octet sequence
 542                  $mUcs4 = ($in);
 543                  $mUcs4 = ($mUcs4 & 0x0F) << 12;
 544                  $mState = 2;
 545                  $mBytes = 3;
 546  
 547              } else if (0xF0 == (0xF8 & ($in))) {
 548                  // First octet of 4 octet sequence
 549                  $mUcs4 = ($in);
 550                  $mUcs4 = ($mUcs4 & 0x07) << 18;
 551                  $mState = 3;
 552                  $mBytes = 4;
 553  
 554              } else if (0xF8 == (0xFC & ($in))) {
 555                  /* First octet of 5 octet sequence.
 556                   *
 557                   * This is illegal because the encoded codepoint must be either
 558                   * (a) not the shortest form or
 559                   * (b) outside the Unicode range of 0-0x10FFFF.
 560                   * Rather than trying to resynchronize, we will carry on until the end
 561                   * of the sequence and let the later error handling code catch it.
 562                   */
 563                  $mUcs4 = ($in);
 564                  $mUcs4 = ($mUcs4 & 0x03) << 24;
 565                  $mState = 4;
 566                  $mBytes = 5;
 567  
 568              } else if (0xFC == (0xFE & ($in))) {
 569                  // First octet of 6 octet sequence, see comments for 5 octet sequence.
 570                  $mUcs4 = ($in);
 571                  $mUcs4 = ($mUcs4 & 1) << 30;
 572                  $mState = 5;
 573                  $mBytes = 6;
 574  
 575              } elseif($strict) {
 576                  /* Current octet is neither in the US-ASCII range nor a legal first
 577                   * octet of a multi-octet sequence.
 578                   */
 579                  trigger_error(
 580                          'utf8_to_unicode: Illegal sequence identifier '.
 581                              'in UTF-8 at byte '.$i,
 582                          E_USER_WARNING
 583                      );
 584                  return false;
 585  
 586              }
 587  
 588          } else {
 589  
 590              // When mState is non-zero, we expect a continuation of the multi-octet
 591              // sequence
 592              if (0x80 == (0xC0 & ($in))) {
 593  
 594                  // Legal continuation.
 595                  $shift = ($mState - 1) * 6;
 596                  $tmp = $in;
 597                  $tmp = ($tmp & 0x0000003F) << $shift;
 598                  $mUcs4 |= $tmp;
 599  
 600                  /**
 601                   * End of the multi-octet sequence. mUcs4 now contains the final
 602                   * Unicode codepoint to be output
 603                   */
 604                  if (0 == --$mState) {
 605  
 606                      /*
 607                       * Check for illegal sequences and codepoints.
 608                       */
 609                      // From Unicode 3.1, non-shortest form is illegal
 610                      if (((2 == $mBytes) && ($mUcs4 < 0x0080)) ||
 611                          ((3 == $mBytes) && ($mUcs4 < 0x0800)) ||
 612                          ((4 == $mBytes) && ($mUcs4 < 0x10000)) ||
 613                          (4 < $mBytes) ||
 614                          // From Unicode 3.2, surrogate characters are illegal
 615                          (($mUcs4 & 0xFFFFF800) == 0xD800) ||
 616                          // Codepoints outside the Unicode range are illegal
 617                          ($mUcs4 > 0x10FFFF)) {
 618  
 619                          if($strict){
 620