[ Index ]

PHP Cross Reference of DokuWiki

title

Body

[close]

/inc/ -> indexer.php (source)

   1  <?php
   2  /**
   3   * Common DokuWiki functions
   4   *
   5   * @license    GPL 2 (http://www.gnu.org/licenses/gpl.html)
   6   * @author     Andreas Gohr <andi@splitbrain.org>
   7   */
   8  
   9    if(!defined('DOKU_INC')) define('DOKU_INC',fullpath(dirname(__FILE__).'/../').'/');
  10    require_once(DOKU_CONF.'dokuwiki.php');
  11    require_once (DOKU_INC.'inc/io.php');
  12    require_once (DOKU_INC.'inc/utf8.php');
  13    require_once (DOKU_INC.'inc/parserutils.php');
  14  
  15  // Asian characters are handled as words. The following regexp defines the
  16  // Unicode-Ranges for Asian characters
  17  // Ranges taken from http://en.wikipedia.org/wiki/Unicode_block
  18  // I'm no language expert. If you think some ranges are wrongly chosen or
  19  // a range is missing, please contact me
  20  define('IDX_ASIAN1','[\x{0E00}-\x{0E7F}]'); // Thai
  21  define('IDX_ASIAN2','['.
  22                     '\x{2E80}-\x{3040}'.  // CJK -> Hangul
  23                     '\x{309D}-\x{30A0}'.
  24                     '\x{30FD}-\x{31EF}\x{3200}-\x{D7AF}'.
  25                     '\x{F900}-\x{FAFF}'.  // CJK Compatibility Ideographs
  26                     '\x{FE30}-\x{FE4F}'.  // CJK Compatibility Forms
  27                     ']');
  28  define('IDX_ASIAN3','['.                // Hiragana/Katakana (can be two characters)
  29                     '\x{3042}\x{3044}\x{3046}\x{3048}'.
  30                     '\x{304A}-\x{3062}\x{3064}-\x{3082}'.
  31                     '\x{3084}\x{3086}\x{3088}-\x{308D}'.
  32                     '\x{308F}-\x{3094}'.
  33                     '\x{30A2}\x{30A4}\x{30A6}\x{30A8}'.
  34                     '\x{30AA}-\x{30C2}\x{30C4}-\x{30E2}'.
  35                     '\x{30E4}\x{30E6}\x{30E8}-\x{30ED}'.
  36                     '\x{30EF}-\x{30F4}\x{30F7}-\x{30FA}'.
  37                     ']['.
  38                     '\x{3041}\x{3043}\x{3045}\x{3047}\x{3049}'.
  39                     '\x{3063}\x{3083}\x{3085}\x{3087}\x{308E}\x{3095}-\x{309C}'.
  40                     '\x{30A1}\x{30A3}\x{30A5}\x{30A7}\x{30A9}'.
  41                     '\x{30C3}\x{30E3}\x{30E5}\x{30E7}\x{30EE}\x{30F5}\x{30F6}\x{30FB}\x{30FC}'.
  42                     '\x{31F0}-\x{31FF}'.
  43                     ']?');
  44  define('IDX_ASIAN', '(?:'.IDX_ASIAN1.'|'.IDX_ASIAN2.'|'.IDX_ASIAN3.')');
  45  
  46  /**
  47   * Measure the length of a string.
  48   * Differs from strlen in handling of asian characters.
  49   *
  50   * @author Tom N Harris <tnharris@whoopdedo.org>
  51   */
  52  function wordlen($w){
  53      $l = strlen($w);
  54      // If left alone, all chinese "words" will get put into w3.idx
  55      // So the "length" of a "word" is faked
  56      if(preg_match('/'.IDX_ASIAN2.'/u',$w))
  57          $l += ord($w) - 0xE1;  // Lead bytes from 0xE2-0xEF
  58      return $l;
  59  }
  60  
  61  /**
  62   * Write a list of strings to an index file.
  63   *
  64   * @author Tom N Harris <tnharris@whoopdedo.org>
  65   */
  66  function idx_saveIndex($pre, $wlen, &$idx){
  67      global $conf;
  68      $fn = $conf['indexdir'].'/'.$pre.$wlen;
  69      $fh = @fopen($fn.'.tmp','w');
  70      if(!$fh) return false;
  71      foreach ($idx as $line) {
  72          fwrite($fh,$line);
  73      }
  74      fclose($fh);
  75      if($conf['fperm']) chmod($fn.'.tmp', $conf['fperm']);
  76      io_rename($fn.'.tmp', $fn.'.idx');
  77      return true;
  78  }
  79  
  80  /**
  81   * Read the list of words in an index (if it exists).
  82   *
  83   * @author Tom N Harris <tnharris@whoopdedo.org>
  84   */
  85  function idx_getIndex($pre, $wlen){
  86      global $conf;
  87      $fn = $conf['indexdir'].'/'.$pre.$wlen.'.idx';
  88      if(!@file_exists($fn)) return array();
  89      return file($fn);
  90  }
  91  
  92  /**
  93   * Create an empty index file if it doesn't exist yet.
  94   *
  95   * FIXME: This function isn't currently used. It will probably be removed soon.
  96   *
  97   * @author Tom N Harris <tnharris@whoopdedo.org>
  98   */
  99  function idx_touchIndex($pre, $wlen){
 100      global $conf;
 101      $fn = $conf['indexdir'].'/'.$pre.$wlen.'.idx';
 102      if(!@file_exists($fn)){
 103          touch($fn);
 104          if($conf['fperm']) chmod($fn, $conf['fperm']);
 105      }
 106  }
 107  
 108  /**
 109   * Read a line ending with \n.
 110   * Returns false on EOF.
 111   *
 112   * @author Tom N Harris <tnharris@whoopdedo.org>
 113   */
 114  function _freadline($fh) {
 115      if (feof($fh)) return false;
 116      $ln = '';
 117      while (($buf = fgets($fh,4096)) !== false) {
 118          $ln .= $buf;
 119          if (substr($buf,-1) == "\n") break;
 120      }
 121      if ($ln === '') return false;
 122      if (substr($ln,-1) != "\n") $ln .= "\n";
 123      return $ln;
 124  }
 125  
 126  /**
 127   * Write a line to an index file.
 128   *
 129   * @author Tom N Harris <tnharris@whoopdedo.org>
 130   */
 131  function idx_saveIndexLine($pre, $wlen, $idx, $line){
 132      global $conf;
 133      if(substr($line,-1) != "\n") $line .= "\n";
 134      $fn = $conf['indexdir'].'/'.$pre.$wlen;
 135      $fh = @fopen($fn.'.tmp','w');
 136      if(!$fh) return false;
 137      $ih = @fopen($fn.'.idx','r');
 138      if ($ih) {
 139          $ln = -1;
 140          while (($curline = _freadline($ih)) !== false) {
 141              if (++$ln == $idx) {
 142                  fwrite($fh, $line);
 143              } else {
 144                  fwrite($fh, $curline);
 145              }
 146          }
 147          if ($idx > $ln) {
 148              fwrite($fh,$line);
 149          }
 150          fclose($ih);
 151      } else {
 152          fwrite($fh,$line);
 153      }
 154      fclose($fh);
 155      if($conf['fperm']) chmod($fn.'.tmp', $conf['fperm']);
 156      io_rename($fn.'.tmp', $fn.'.idx');
 157      return true;
 158  }
 159  
 160  /**
 161   * Read a single line from an index (if it exists).
 162   *
 163   * @author Tom N Harris <tnharris@whoopdedo.org>
 164   */
 165  function idx_getIndexLine($pre, $wlen, $idx){
 166      global $conf;
 167      $fn = $conf['indexdir'].'/'.$pre.$wlen.'.idx';
 168      if(!@file_exists($fn)) return '';
 169      $fh = @fopen($fn,'r');
 170      if(!$fh) return '';
 171      $ln = -1;
 172      while (($line = _freadline($fh)) !== false) {
 173          if (++$ln == $idx) break;
 174      }
 175      fclose($fh);
 176      return "$line";
 177  }
 178  
 179  /**
 180   * Split a page into words
 181   *
 182   * Returns an array of word counts, false if an error occurred.
 183   * Array is keyed on the word length, then the word index.
 184   *
 185   * @author Andreas Gohr <andi@splitbrain.org>
 186   * @author Christopher Smith <chris@jalakai.co.uk>
 187   */
 188  function idx_getPageWords($page){
 189      global $conf;
 190      $swfile   = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';
 191      if(@file_exists($swfile)){
 192          $stopwords = file($swfile);
 193      }else{
 194          $stopwords = array();
 195      }
 196  
 197      $body = '';
 198      $data = array($page, $body);
 199      $evt = new Doku_Event('INDEXER_PAGE_ADD', $data);
 200      if ($evt->advise_before()) $data[1] .= rawWiki($page);
 201      $evt->advise_after();
 202      unset($evt);
 203  
 204      list($page,$body) = $data;
 205      
 206      $body   = strtr($body, "\r\n\t", '   ');
 207      $tokens = explode(' ', $body);
 208      $tokens = array_count_values($tokens);   // count the frequency of each token
 209  
 210      // ensure the deaccented or romanised page names of internal links are added to the token array
 211      // (this is necessary for the backlink function -- there maybe a better way!)
 212      if ($conf['deaccent']) {
 213        $links = p_get_metadata($page,'relation references');
 214  
 215        if (!empty($links)) {
 216          $tmp = join(' ',array_keys($links));                // make a single string
 217          $tmp = strtr($tmp, ':', ' ');                       // replace namespace separator with a space
 218          $link_tokens = array_unique(explode(' ', $tmp));    // break into tokens
 219  
 220          foreach ($link_tokens as $link_token) {
 221            if (isset($tokens[$link_token])) continue;
 222            $tokens[$link_token] = 1;
 223          }
 224        }
 225      }
 226  
 227      $words = array();
 228      foreach ($tokens as $word => $count) {
 229          $arr = idx_tokenizer($word,$stopwords);
 230          $arr = array_count_values($arr);
 231          foreach ($arr as $w => $c) {
 232              $l = wordlen($w);
 233              if(isset($words[$l])){
 234                  $words[$l][$w] = $c * $count + (isset($words[$l][$w]) ? $words[$l][$w] : 0);
 235              }else{
 236                  $words[$l] = array($w => $c * $count);
 237              }
 238          }
 239      }
 240  
 241      // arrive here with $words = array(wordlen => array(word => frequency))
 242  
 243      $index = array(); //resulting index
 244      foreach (array_keys($words) as $wlen){
 245          $word_idx = idx_getIndex('w',$wlen);
 246          foreach ($words[$wlen] as $word => $freq) {
 247              $wid = array_search("$word\n",$word_idx);
 248              if(!is_int($wid)){
 249                  $wid = count($word_idx);
 250                  $word_idx[] = "$word\n";
 251              }
 252              if(!isset($index[$wlen]))
 253                  $index[$wlen] = array();
 254              $index[$wlen][$wid] = $freq;
 255          }
 256  
 257          // save back word index
 258          if(!idx_saveIndex('w',$wlen,$word_idx)){
 259              trigger_error("Failed to write word index", E_USER_ERROR);
 260              return false;
 261          }
 262      }
 263  
 264      return $index;
 265  }
 266  
 267  /**
 268   * Adds/updates the search for the given page
 269   *
 270   * This is the core function of the indexer which does most
 271   * of the work. This function needs to be called with proper
 272   * locking!
 273   *
 274   * @author Andreas Gohr <andi@splitbrain.org>
 275   */
 276  function idx_addPage($page){
 277      global $conf;
 278  
 279      // load known documents
 280      $page_idx = idx_getIndex('page','');
 281  
 282      // get page id (this is the linenumber in page.idx)
 283      $pid = array_search("$page\n",$page_idx);
 284      if(!is_int($pid)){
 285          $page_idx[] = "$page\n";
 286          $pid = count($page_idx)-1;
 287          // page was new - write back
 288          if (!idx_saveIndex('page','',$page_idx)){
 289              trigger_error("Failed to write page index", E_USER_ERROR);
 290              return false;
 291          }
 292      }
 293  
 294      $pagewords = array();
 295      // get word usage in page
 296      $words = idx_getPageWords($page);
 297      if($words === false) return false;
 298  
 299      if(!empty($words)) {
 300          foreach(array_keys($words) as $wlen){
 301              $index = idx_getIndex('i',$wlen);
 302              foreach($words[$wlen] as $wid => $freq){
 303                  if($wid<count($index)){
 304                      $index[$wid] = idx_updateIndexLine($index[$wid],$pid,$freq);
 305                  }else{
 306                      // New words **should** have been added in increasing order
 307                      // starting with the first unassigned index.
 308                      // If someone can show how this isn't true, then I'll need to sort
 309                      // or do something special.
 310                      $index[$wid] = idx_updateIndexLine('',$pid,$freq);
 311                  }
 312                  $pagewords[] = "$wlen*$wid";
 313              }
 314              // save back word index
 315              if(!idx_saveIndex('i',$wlen,$index)){
 316                  trigger_error("Failed to write index", E_USER_ERROR);
 317                  return false;
 318              }
 319          }
 320      }
 321      
 322      // Remove obsolete index entries
 323      $pageword_idx = trim(idx_getIndexLine('pageword','',$pid));
 324      if ($pageword_idx !== '') {
 325          $oldwords = explode(':',$pageword_idx);
 326          $delwords = array_diff($oldwords, $pagewords);
 327          $upwords = array();
 328          foreach ($delwords as $word) {
 329              if($word=='') continue;
 330              list($wlen,$wid) = explode('*',$word);
 331              $wid = (int)$wid;
 332              $upwords[$wlen][] = $wid;
 333          }
 334          foreach ($upwords as $wlen => $widx) {
 335              $index = idx_getIndex('i',$wlen);
 336              foreach ($widx as $wid) {
 337                  $index[$wid] = idx_updateIndexLine($index[$wid],$pid,0);
 338              }
 339              idx_saveIndex('i',$wlen,$index);
 340          }
 341      }
 342      // Save the reverse index
 343      $pageword_idx = join(':',$pagewords)."\n";
 344      if(!idx_saveIndexLine('pageword','',$pid,$pageword_idx)){
 345          trigger_error("Failed to write word index", E_USER_ERROR);
 346          return false;
 347      }
 348  
 349      return true;
 350  }
 351  
 352  /**
 353   * Write a new index line to the filehandle
 354   *
 355   * This function writes an line for the index file to the
 356   * given filehandle. It removes the given document from
 357   * the given line and readds it when $count is >0.
 358   *
 359   * @deprecated - see idx_updateIndexLine
 360   * @author Andreas Gohr <andi@splitbrain.org>
 361   */
 362  function idx_writeIndexLine($fh,$line,$pid,$count){
 363      fwrite($fh,idx_updateIndexLine($line,$pid,$count));
 364  }
 365  
 366  /**
 367   * Modify an index line with new information
 368   *
 369   * This returns a line of the index. It removes the
 370   * given document from the line and readds it if
 371   * $count is >0.
 372   *
 373   * @author Tom N Harris <tnharris@whoopdedo.org>
 374   * @author Andreas Gohr <andi@splitbrain.org>
 375   */
 376  function idx_updateIndexLine($line,$pid,$count){
 377      $line = trim($line);
 378      $updated = array();
 379      if($line != ''){
 380          $parts = explode(':',$line);
 381          // remove doc from given line
 382          foreach($parts as $part){
 383              if($part == '') continue;
 384              list($doc,$cnt) = explode('*',$part);
 385              if($doc != $pid){
 386                  $updated[] = $part;
 387              }
 388          }
 389      }
 390  
 391      // add doc
 392      if ($count){
 393          $updated[] = "$pid*$count";
 394      }
 395  
 396      return join(':',$updated)."\n";
 397  }
 398  
 399  /**
 400   * Get the word lengths that have been indexed.
 401   *
 402   * Reads the index directory and returns an array of lengths
 403   * that there are indices for.
 404   *
 405   * @author Tom N Harris <tnharris@whoopdedo.org>
 406   */
 407  function idx_indexLengths(&$filter){
 408      global $conf;
 409      $dir = @opendir($conf['indexdir']);
 410      if($dir===false)
 411          return array();
 412      $idx = array();
 413      if(is_array($filter)){
 414          while (($f = readdir($dir)) !== false) {
 415              if (substr($f,0,1) == 'i' && substr($f,-4) == '.idx'){
 416                  $i = substr($f,1,-4);
 417                  if (is_numeric($i) && isset($filter[(int)$i]))
 418                      $idx[] = (int)$i;
 419              }
 420          }
 421      }else{
 422          // Exact match first.
 423          if(@file_exists($conf['indexdir']."/i$filter.idx"))
 424              $idx[] = $filter;
 425          while (($f = readdir($dir)) !== false) {
 426              if (substr($f,0,1) == 'i' && substr($f,-4) == '.idx'){
 427                  $i = substr($f,1,-4);
 428                  if (is_numeric($i) && $i > $filter)
 429                      $idx[] = (int)$i;
 430              }
 431          }
 432      }
 433      closedir($dir);
 434      return $idx;
 435  }
 436  
 437  /**
 438   * Find the index number of each search term.
 439   *
 440   * This will group together words that appear in the same index.
 441   * So it should perform better, because it only opens each index once.
 442   * Actually, it's not that great. (in my experience) Probably because of the disk cache.
 443   * And the sorted function does more work, making it slightly slower in some cases.
 444   *
 445   * @param array    $words   The query terms. Words should only contain valid characters,
 446   *                          with a '*' at either the beginning or end of the word (or both)
 447   * @param arrayref $result  Set to word => array("length*id" ...), use this to merge the
 448   *                          index locations with the appropriate query term.
 449   * @return array            Set to length => array(id ...)
 450   *
 451   * @author Tom N Harris <tnharris@whoopdedo.org>
 452   */
 453  function idx_getIndexWordsSorted($words,&$result){
 454      // parse and sort tokens
 455      $tokens = array();
 456      $tokenlength = array();
 457      $tokenwild = array();
 458      foreach($words as $word){
 459          $result[$word] = array();
 460          $wild = 0;
 461          $xword = $word;
 462          $wlen = wordlen($word);
 463  
 464          // check for wildcards
 465          if(substr($xword,0,1) == '*'){
 466              $xword = substr($xword,1);
 467              $wild |= 1;
 468              $wlen -= 1;
 469          }
 470          if(substr($xword,-1,1) == '*'){
 471              $xword = substr($xword,0,-1);
 472              $wild |= 2;
 473              $wlen -= 1;
 474          }
 475          if ($wlen < 3 && $wild == 0 && !is_numeric($xword)) continue;
 476          if(!isset($tokens[$xword])){
 477              $tokenlength[$wlen][] = $xword;
 478          }
 479          if($wild){
 480              $ptn = preg_quote($xword,'/');
 481              if(($wild&1) == 0) $ptn = '^'.$ptn;
 482              if(($wild&2) == 0) $ptn = $ptn.'$';
 483              $tokens[$xword][] = array($word, '/'.$ptn.'/');
 484              if(!isset($tokenwild[$xword])) $tokenwild[$xword] = $wlen;
 485          }else
 486              $tokens[$xword][] = array($word, null);
 487      }
 488      asort($tokenwild);
 489      // $tokens = array( base word => array( [ query word , grep pattern ] ... ) ... )
 490      // $tokenlength = array( base word length => base word ... )
 491      // $tokenwild = array( base word => base word length ... )
 492  
 493      $length_filter = empty($tokenwild) ? $tokenlength : min(array_keys($tokenlength));
 494      $indexes_known = idx_indexLengths($length_filter);
 495      if(!empty($tokenwild)) sort($indexes_known);
 496      // get word IDs
 497      $wids = array();
 498      foreach(