[ Index ]

PHP Cross Reference of DokuWiki

title

Body

[close]

/inc/ -> fulltext.php (source)

   1  <?php
   2  /**
   3   * DokuWiki fulltextsearch functions using the index
   4   *
   5   * @license    GPL 2 (http://www.gnu.org/licenses/gpl.html)
   6   * @author     Andreas Gohr <andi@splitbrain.org>
   7   */
   8  
   9    if(!defined('DOKU_INC')) define('DOKU_INC',fullpath(dirname(__FILE__).'/../').'/');
  10    require_once (DOKU_INC.'inc/indexer.php');
  11  
  12  
  13  /**
  14   * The fulltext search
  15   *
  16   * Returns a list of matching documents for the given query
  17   *
  18   * refactored into ft_pageSearch(), _ft_pageSearch() and trigger_event()
  19   *
  20   */
  21  function ft_pageSearch($query,&$highlight){
  22  
  23    $data['query'] = $query;
  24    $data['highlight'] =& $highlight;
  25  
  26    return trigger_event('SEARCH_QUERY_FULLPAGE', $data, '_ft_pageSearch');
  27  }
  28  function _ft_pageSearch(&$data){
  29      // split out original parameters
  30      $query = $data['query'];
  31      $highlight =& $data['highlight'];
  32  
  33      $q = ft_queryParser($query);
  34  
  35      $highlight = array();
  36  
  37      // remember for hilighting later
  38      foreach($q['words'] as $wrd){
  39          $highlight[] =  str_replace('*','',$wrd);
  40      }
  41  
  42      // lookup all words found in the query
  43      $words  = array_merge($q['and'],$q['not']);
  44      if(!count($words)) return array();
  45      $result = idx_lookup($words);
  46      if(!count($result)) return array();
  47  
  48      // merge search results with query
  49      foreach($q['and'] as $pos => $w){
  50          $q['and'][$pos] = $result[$w];
  51      }
  52      // create a list of unwanted docs
  53      $not = array();
  54      foreach($q['not'] as $pos => $w){
  55          $not = array_merge($not,array_keys($result[$w]));
  56      }
  57  
  58      // combine and-words
  59      if(count($q['and']) > 1){
  60          $docs = ft_resultCombine($q['and']);
  61      }else{
  62          $docs = $q['and'][0];
  63      }
  64      if(!count($docs)) return array();
  65  
  66      // create a list of hidden pages in the result
  67      $hidden = array();
  68      $hidden = array_filter(array_keys($docs),'isHiddenPage');
  69      $not = array_merge($not,$hidden);
  70  
  71      // filter unmatched namespaces
  72      if(!empty($q['ns'])) {
  73          $pattern = implode('|^',$q['ns']);
  74          foreach($docs as $key => $val) {
  75              if(!preg_match('/^'.$pattern.'/',$key)) {
  76                  unset($docs[$key]);
  77              }
  78          }
  79      }
  80  
  81      // remove negative matches
  82      foreach($not as $n){
  83          unset($docs[$n]);
  84      }
  85  
  86      if(!count($docs)) return array();
  87      // handle phrases
  88      if(count($q['phrases'])){
  89          $q['phrases'] = array_map('utf8_strtolower',$q['phrases']);
  90          // use this for higlighting later:
  91          $highlight = array_merge($highlight,$q['phrases']);
  92          $q['phrases'] = array_map('preg_quote_cb',$q['phrases']);
  93          // check the source of all documents for the exact phrases
  94          foreach(array_keys($docs) as $id){
  95              $text  = utf8_strtolower(rawWiki($id));
  96              foreach($q['phrases'] as $phrase){
  97                  if(!preg_match('/'.$phrase.'/usi',$text)){
  98                      unset($docs[$id]); // no hit - remove
  99                      break;
 100                  }
 101              }
 102          }
 103      }
 104  
 105      if(!count($docs)) return array();
 106  
 107      // check ACL permissions
 108      foreach(array_keys($docs) as $doc){
 109          if(auth_quickaclcheck($doc) < AUTH_READ){
 110              unset($docs[$doc]);
 111          }
 112      }
 113  
 114      if(!count($docs)) return array();
 115  
 116      // if there are any hits left, sort them by count
 117      arsort($docs);
 118  
 119      return $docs;
 120  }
 121  
 122  /**
 123   * Returns the backlinks for a given page
 124   *
 125   * Does a quick lookup with the fulltext index, then
 126   * evaluates the instructions of the found pages
 127   */
 128  function ft_backlinks($id){
 129      global $conf;
 130      $swfile   = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';
 131      $stopwords = @file_exists($swfile) ? file($swfile) : array();
 132  
 133      $result = array();
 134  
 135      // quick lookup of the pagename
 136      $page    = noNS($id);
 137      $matches = idx_lookup(idx_tokenizer($page,$stopwords));  // pagename may contain specials (_ or .)
 138      $docs    = array_keys(ft_resultCombine(array_values($matches)));
 139      $docs    = array_filter($docs,'isVisiblePage'); // discard hidden pages
 140      if(!count($docs)) return $result;
 141      require_once (DOKU_INC.'inc/parserutils.php');
 142  
 143      // check metadata for matching links
 144      foreach($docs as $match){
 145          // metadata relation reference links are already resolved
 146          $links = p_get_metadata($match,'relation references');
 147          if (isset($links[$id])) $result[] = $match;
 148      }
 149  
 150      if(!count($result)) return $result;
 151  
 152      // check ACL permissions
 153      foreach(array_keys($result) as $idx){
 154          if(auth_quickaclcheck($result[$idx]) < AUTH_READ){
 155              unset($result[$idx]);
 156          }
 157      }
 158  
 159      sort($result);
 160      return $result;
 161  }
 162  
 163  /**
 164   * Returns the pages that use a given media file
 165   *
 166   * Does a quick lookup with the fulltext index, then
 167   * evaluates the instructions of the found pages
 168   *
 169   * Aborts after $max found results
 170   */
 171  function ft_mediause($id,$max){
 172      global $conf;
 173      $swfile   = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';
 174      $stopwords = @file_exists($swfile) ? file($swfile) : array();
 175  
 176      if(!$max) $max = 1; // need to find at least one
 177  
 178      $result = array();
 179  
 180      // quick lookup of the mediafile
 181      $media   = noNS($id);
 182      $matches = idx_lookup(idx_tokenizer($media,$stopwords));
 183      $docs    = array_keys(ft_resultCombine(array_values($matches)));
 184      if(!count($docs)) return $result;
 185  
 186      // go through all found pages
 187      $found = 0;
 188      $pcre  = preg_quote($media,'/');
 189      foreach($docs as $doc){
 190          $ns = getNS($doc);
 191          preg_match_all('/\{\{([^|}]*'.$pcre.'[^|}]*)(|[^}]+)?\}\}/i',rawWiki($doc),$matches);
 192          foreach($matches[1] as $img){
 193              $img = trim($img);
 194              if(preg_match('/^https?:\/\//i',$img)) continue; // skip external images
 195              list($img) = explode('?',$img);                  // remove any parameters
 196              resolve_mediaid($ns,$img,$exists);               // resolve the possibly relative img
 197  
 198              if($img == $id){                                 // we have a match
 199                  $result[] = $doc;
 200                  $found++;
 201                  break;
 202              }
 203          }
 204          if($found >= $max) break;
 205      }
 206  
 207      sort($result);
 208      return $result;
 209  }
 210  
 211  
 212  
 213  /**
 214   * Quicksearch for pagenames
 215   *
 216   * By default it only matches the pagename and ignores the
 217   * namespace. This can be changed with the second parameter
 218   *
 219   * refactored into ft_pageLookup(), _ft_pageLookup() and trigger_event()
 220   *
 221   * @author Andreas Gohr <andi@splitbrain.org>
 222   */
 223  function ft_pageLookup($id,$pageonly=true){
 224      $data = array('id' => $id, 'pageonly' => $pageonly);
 225      return trigger_event('SEARCH_QUERY_PAGELOOKUP',$data,'_ft_pageLookup');
 226  }
 227  
 228  function _ft_pageLookup(&$data){
 229      // split out original parameterrs
 230      $id = $data['id'];
 231      $pageonly = $data['pageonly'];
 232  
 233      global $conf;
 234      $id    = preg_quote($id,'/');
 235      $pages = file($conf['indexdir'].'/page.idx');
 236      if($id) $pages = array_values(preg_grep('/'.$id.'/',$pages));
 237  
 238      $cnt = count($pages);
 239      for($i=0; $i<$cnt; $i++){
 240          if($pageonly){
 241              if(!preg_match('/'.$id.'/',noNS($pages[$i]))){
 242                  unset($pages[$i]);
 243                  continue;
 244              }
 245          }
 246          if(!page_exists($pages[$i])){
 247              unset($pages[$i]);
 248              continue;
 249          }
 250      }
 251  
 252      $pages = array_filter($pages,'isVisiblePage'); // discard hidden pages
 253      if(!count($pages)) return array();
 254  
 255      // check ACL permissions
 256      foreach(array_keys($pages) as $idx){
 257          if(auth_quickaclcheck($pages[$idx]) < AUTH_READ){
 258              unset($pages[$idx]);
 259          }
 260      }
 261  
 262      $pages = array_map('trim',$pages);
 263      usort($pages,'ft_pagesorter');
 264      return $pages;
 265  }
 266  
 267  /**
 268   * Sort pages based on their namespace level first, then on their string
 269   * values. This makes higher hierarchy pages rank higher than lower hierarchy
 270   * pages.
 271   */
 272  function ft_pagesorter($a, $b){
 273      $ac = count(explode(':',$a));
 274      $bc = count(explode(':',$b));
 275      if($ac < $bc){
 276          return -1;
 277      }elseif($ac > $bc){
 278          return 1;
 279      }
 280      return strcmp ($a,$b);
 281  }
 282  
 283  /**
 284   * Creates a snippet extract
 285   *
 286   * @author Andreas Gohr <andi@splitbrain.org>
 287   */
 288  function ft_snippet($id,$highlight){
 289      $text     = rawWiki($id);
 290      $match = array();
 291      $snippets = array();
 292      $utf8_offset = $offset = $end = 0;
 293      $len = utf8_strlen($text);
 294  
 295      // build a regexp from the phrases to highlight
 296      $re = join('|',array_map('preg_quote_cb',array_filter((array) $highlight)));
 297  
 298      for ($cnt=3; $cnt--;) {
 299        if (!preg_match('#('.$re.')#iu',$text,$match,PREG_OFFSET_CAPTURE,$offset)) break;
 300  
 301        list($str,$idx) = $match[0];
 302  
 303        // convert $idx (a byte offset) into a utf8 character offset
 304        $utf8_idx = utf8_strlen(substr($text,0,$idx));
 305        $utf8_len = utf8_strlen($str);
 306  
 307        // establish context, 100 bytes surrounding the match string
 308        // first look to see if we can go 100 either side,
 309        // then drop to 50 adding any excess if the other side can't go to 50,
 310        $pre = min($utf8_idx-$utf8_offset,100);
 311        $post = min($len-$utf8_idx-$utf8_len,100);
 312  
 313        if ($pre>50 && $post>50) {
 314          $pre = $post = 50;
 315        } else if ($pre>50) {
 316          $pre = min($pre,100-$post);
 317        } else if ($post>50) {
 318          $post = min($post, 100-$pre);
 319        } else {
 320          // both are less than 50, means the context is the whole string
 321          // make it so and break out of this loop - there is no need for the
 322          // complex snippet calculations
 323          $snippets = array($text);
 324          break;
 325        }
 326  
 327        // establish context start and end points, try to append to previous
 328        // context if possible
 329        $start = $utf8_idx - $pre;
 330        $append = ($start < $end) ? $end : false;  // still the end of the previous context snippet
 331        $end = $utf8_idx + $utf8_len + $post;      // now set it to the end of this context
 332  
 333        if ($append) {
 334          $snippets[count($snippets)-1] .= utf8_substr($text,$append,$end-$append);
 335        } else {
 336          $snippets[] = utf8_substr($text,$start,$end-$start);
 337        }
 338  
 339        // set $offset for next match attempt
 340        //   substract strlen to avoid splitting a potential search success,
 341        //   this is an approximation as the search pattern may match strings
 342        //   of varying length and it will fail if the context snippet
 343        //   boundary breaks a matching string longer than the current match
 344        $utf8_offset = $utf8_idx + $post;
 345        $offset = $idx + strlen(utf8_substr($text,$utf8_idx,$post));
 346        $offset = utf8_correctIdx($text,$offset);
 347      }
 348  
 349      $m = "\1";
 350      $snippets = preg_replace('#('.$re.')#iu',$m.'$1'.$m,$snippets);
 351      $snippet = preg_replace('#'.$m.'([^'.$m.']*?)'.$m.'#iu','<strong class="search_hit">$1</strong>',hsc(join('... ',$snippets)));
 352  
 353      return $snippet;
 354  }
 355  
 356  /**
 357   * Combine found documents and sum up their scores
 358   *
 359   * This function is used to combine searched words with a logical
 360   * AND. Only documents available in all arrays are returned.
 361   *
 362   * based upon PEAR's PHP_Compat function for array_intersect_key()
 363   *
 364   * @param array $args An array of page arrays
 365   */
 366  function ft_resultCombine($args){
 367      $array_count = count($args);
 368      if($array_count == 1){
 369          return $args[0];
 370      }
 371  
 372      $result = array();
 373      if ($array_count > 1) {
 374        foreach ($args[0] as $key => $value) {
 375          $result[$key] = $value;
 376          for ($i = 1; $i !== $array_count; $i++) {
 377              if (!isset($args[$i][$key])) {
 378                  unset($result[$key]);
 379                  break;
 380              }
 381              $result[$key] += $args[$i][$key];
 382          }
 383        }
 384      }
 385      return $result;
 386  }
 387  
 388  /**
 389   * Builds an array of search words from a query
 390   *
 391   * @todo support OR and parenthesises?
 392   */
 393  function ft_queryParser($query){
 394      global $conf;
 395      $swfile   = DOKU_INC.'inc/lang/'.$conf['lang'].'/stopwords.txt';
 396      if(@file_exists($swfile)){
 397          $stopwords = file($swfile);
 398      }else{
 399          $stopwords = array();
 400      }
 401  
 402      $q = array();
 403      $q['query']   = $query;
 404      $q['ns']      = array();
 405      $q['phrases'] = array();
 406      $q['words']   = array();
 407      $q['and']     = array();
 408      $q['not']     = array();
 409  
 410      // strip namespace from query
 411      if(preg_match('/([^@]*)@(.*)/',$query,$match))  {
 412          $query = $match[1];
 413          $q['ns'] = explode('@',preg_replace("/ /",'',$match[2]));
 414      }
 415  
 416      // handle phrase searches
 417      while(preg_match('/"(.*?)"/',$query,$match)){
 418          $q['phrases'][] = $match[1];
 419          $q['and'] = array_merge($q['and'], idx_tokenizer($match[0],$stopwords));
 420          $query = preg_replace('/"(.*?)"/','',$query,1);
 421      }
 422  
 423      $words = explode(' ',$query);
 424      foreach($words as $w){
 425          if($w{0} == '-'){
 426              $token = idx_tokenizer($w,$stopwords,true);
 427              if(count($token)) $q['not'] = array_merge($q['not'],$token);
 428          }else{
 429              // asian "words" need to be searched as phrases
 430              if(@preg_match_all('/(('.IDX_ASIAN.')+)/u',$w,$matches)){
 431                  $q['phrases'] = array_merge($q['phrases'],$matches[1]);
 432  
 433              }
 434              $token = idx_tokenizer($w,$stopwords,true);
 435              if(count($token)){
 436                  $q['and']   = array_merge($q['and'],$token);
 437                  $q['words'] = array_merge($q['words'],$token);
 438              }
 439          }
 440      }
 441  
 442      return $q;
 443  }
 444  
 445  //Setup VIM: ex: et ts=4 enc=utf-8 :


Generated: Tue Dec 2 01:30:01 2008 Cross-referenced by PHPXref 0.7