[ Index ]

PHP Cross Reference of DokuWiki

title

Body

[close]

/inc/parser/ -> lexer.php (source)

   1  <?php
   2  /**
   3  * Author Marcus Baker: http://www.lastcraft.com
   4  * Version adapted from Simple Test: http://sourceforge.net/projects/simpletest/
   5  * For an intro to the Lexer see:
   6  * http://www.phppatterns.com/index.php/article/articleview/106/1/2/
   7  * @author Marcus Baker
   8  * @package Doku
   9  * @subpackage Lexer
  10  * @version $Id: lexer.php,v 1.1 2005/03/23 23:14:09 harryf Exp $
  11  */
  12  
  /**
   * Init path constant.
   *
   * DOKU_INC is only defined here when nothing defined it earlier. Its value
   * is the result of fullpath() for the directory two levels above this
   * file, with a trailing slash appended.
   */
  if (!defined('DOKU_INC')) {
      define('DOKU_INC', fullpath(dirname(__FILE__) . '/../../') . '/');
  }
  17  
  /**#@+
   * Lexer mode constant. One of these is handed to the parser handler
   * together with every piece of text the lexer dispatches.
   */
  define('DOKU_LEXER_ENTER',     1);
  define('DOKU_LEXER_MATCHED',   2);
  define('DOKU_LEXER_UNMATCHED', 3);
  define('DOKU_LEXER_EXIT',      4);
  define('DOKU_LEXER_SPECIAL',   5);
  /**#@-*/
  27  
  /**
   *    Compounded regular expression. Any of
   *    the contained patterns could match and
   *    when one does its label is returned.
   *
   *    The patterns are stored twice: $_patterns keeps them exactly as they
   *    were registered, $_compiled keeps the escaped "(...)" form that is
   *    joined into the compound regex. Keeping the raw form untouched means
   *    the compound regex can be rebuilt safely after further addPattern()
   *    calls.
   *    @package Doku
   *    @subpackage Lexer
   */
  class Doku_LexerParallelRegex {
      /** @var array raw patterns in registration order */
      var $_patterns;
      /** @var array labels, index-aligned with $_patterns */
      var $_labels;
      /** @var array escaped and parenthesised patterns, index-aligned with $_patterns */
      var $_compiled;
      /** @var string|null cached compound regex; null means "rebuild on next use" */
      var $_regex;
      /** @var boolean true for case sensitive matching */
      var $_case;

      /**
       *    Constructor. Starts with no patterns.
       *    @param boolean $case    True for case sensitive, false
       *                            for insensitive.
       *    @access public
       */
      function __construct($case) {
          $this->_case = $case;
          $this->_patterns = array();
          $this->_labels = array();
          $this->_compiled = array();
          $this->_regex = null;
      }

      /**
       *    Old style constructor name. Kept so that code calling
       *    parent::Doku_LexerParallelRegex() keeps working; PHP 8 no longer
       *    treats a method named after the class as the constructor.
       *    @param boolean $case    True for case sensitive, false
       *                            for insensitive.
       *    @access public
       */
      function Doku_LexerParallelRegex($case) {
          $this->__construct($case);
      }

      /**
       *    Adds a pattern with an optional label.
       *    @param mixed $pattern       Perl style regex. Must be UTF-8
       *                                encoded. If its a string, the (, )
       *                                lose their meaning unless they
       *                                form part of a lookahead or
       *                                lookbehind assertion.
       *    @param string $label        Label of regex to be returned
       *                                on a match. Label must be ASCII
       *    @access public
       */
      function addPattern($pattern, $label = true) {
          $count = count($this->_patterns);
          $this->_patterns[$count] = $pattern;
          $this->_labels[$count] = $label;
          // force a rebuild of the compound regex on next use
          $this->_regex = null;
      }

      /**
       *    Attempts to match all patterns at once against
       *    a string.
       *    @param string $subject      String to match against.
       *    @param string $match        First matched portion of
       *                                subject. Set to "" when nothing
       *                                matched; left untouched when no
       *                                patterns are registered.
       *    @return mixed               Label of the matching pattern
       *                                (true for unlabelled patterns),
       *                                false when nothing matched.
       *    @access public
       */
      function match($subject, &$match) {
          if (count($this->_patterns) == 0) {
              return false;
          }
          if (! preg_match($this->_getCompoundedRegex(), $subject, $matches)) {
              $match = "";
              return false;
          }

          $match = $matches[0];
          $size = count($matches);
          for ($i = 1; $i < $size; $i++) {
              // Compare against the empty string instead of testing for
              // truthiness: a matched token of "0" is falsy in PHP and would
              // otherwise lose its label.
              if ($matches[$i] !== '' && isset($this->_labels[$i - 1])) {
                  return $this->_labels[$i - 1];
              }
          }
          return true;
      }

      /**
       *    Attempts to split the string against all patterns at once
       *
       *    @param string $subject      String to match against.
       *    @param array $split         The split result: array containing, pre-match, match & post-match strings.
       *                                Left untouched when no patterns are registered.
       *    @return mixed               Label of the matching pattern (true for
       *                                unlabelled patterns), false when nothing matched.
       *    @access public
       *
       *    @author Christopher Smith <chris@jalakai.co.uk>
       */
      function split($subject, &$split) {
          if (count($this->_patterns) == 0) {
              return false;
          }

          if (! preg_match($this->_getCompoundedRegex(), $subject, $matches)) {
              // preg_match() yields 0 for "no match" and false for an error;
              // report the error cases to the user where PHP can tell us.
              if(function_exists('preg_last_error')){
                  $err = preg_last_error();
                  switch($err){
                      case PREG_BACKTRACK_LIMIT_ERROR:
                          msg('A PCRE backtrack error occured. Try to increase the pcre.backtrack_limit in php.ini',-1);
                          break;
                      case PREG_RECURSION_LIMIT_ERROR:
                          msg('A PCRE recursion error occured. Try to increase the pcre.recursion_limit in php.ini',-1);
                          break;
                      case PREG_BAD_UTF8_ERROR:
                          msg('A PCRE UTF-8 error occured. This might be caused by a faulty plugin',-1);
                          break;
                      case PREG_INTERNAL_ERROR:
                          msg('A PCRE internal error occured. This might be caused by a faulty plugin',-1);
                          break;
                  }
              }

              $split = array($subject, "", "");
              return false;
          }

          // The last entry of $matches belongs to the alternative that
          // matched; entry 0 is the whole match, hence the offset of 2.
          $idx = count($matches)-2;

          // The compiled pattern is wrapped in "(" ... ")", which doubles as
          // the regex delimiter pair here.
          list($pre, $post) = preg_split($this->_compiled[$idx].$this->_getPerlMatchingFlags(), $subject, 2);

          $split = array($pre, $matches[0], $post);
          return isset($this->_labels[$idx]) ? $this->_labels[$idx] : true;
      }

      /**
       *    Compounds the patterns into a single
       *    regular expression separated with the
       *    "or" operator. Caches the regex.
       *    Will automatically escape (, ) and / tokens.
       *
       *    The compiled patterns are always derived from the raw ones, so
       *    rebuilding after a later addPattern() never escapes a pattern twice.
       *    @return string       The compound regex including delimiters and flags.
       *    @access private
       */
      function _getCompoundedRegex() {
          if ($this->_regex == null) {
              $this->_compiled = array();
              $cnt = count($this->_patterns);
              for ($i = 0; $i < $cnt; $i++) {
                  $this->_compiled[$i] = '(' . $this->_escapePattern($this->_patterns[$i]) . ')';
              }
              $this->_regex = "/" . implode("|", $this->_compiled) . "/" . $this->_getPerlMatchingFlags();
          }
          return $this->_regex;
      }

      /**
       *    Escapes a single raw pattern so that it can be embedded in the
       *    compound regex: plain ( and ) become literals, "(?" groups are
       *    kept, and / is escaped outside of character classes.
       *    @param string $raw    Pattern as passed to addPattern().
       *    @return string        Escaped pattern without surrounding parentheses.
       *    @access private
       */
      function _escapePattern($raw) {
          /*
           * decompose the input pattern into "(", "(?", ")",
           * "[...]", "[]..]", "[^]..]", "[...[:...:]..]", "\x"...
           * elements.
           */
          preg_match_all('/\\\\.|' .
                         '\(\?|' .
                         '[()]|' .
                         '\[\^?\]?(?:\\\\.|\[:[^]]*:\]|[^]\\\\])*\]|' .
                         '[^[()\\\\]+/', $raw, $elts);

          $pattern = "";
          $level = 0;   // number of currently open "(?" groups

          foreach ($elts[0] as $elt) {
              /*
               * for "(", ")" remember the nesting level, add "\"
               * only to the non-"(?" ones.
               */
              switch($elt) {
              case '(':
                  $pattern .= '\(';
                  break;
              case ')':
                  if ($level > 0) {
                      $level--; /* closing (? */
                  } else {
                      $pattern .= '\\';
                  }
                  $pattern .= ')';
                  break;
              case '(?':
                  $level++;
                  $pattern .= '(?';
                  break;
              default:
                  if (substr($elt, 0, 1) == '\\') {
                      // escape sequences are passed through untouched
                      $pattern .= $elt;
                  } else {
                      $pattern .= str_replace('/', '\/', $elt);
                  }
              }
          }
          return $pattern;
      }

      /**
       *    Accessor for perl regex mode flags to use.
       *    @return string       Perl regex flags.
       *    @access private
       */
      function _getPerlMatchingFlags() {
          return ($this->_case ? "msS" : "msSi");
      }
  }
 217  
 /**
  *    States for a stack machine. The bottom entry is the start
  *    state and can never be removed.
  *    @package Doku
  *    @subpackage Lexer
  */
 class Doku_LexerStateStack {
     /** @var array list of state names, last entry is the current state */
     var $_stack;

     /**
      *    Constructor. Starts in named state.
      *    @param string $start        Starting state name.
      *    @access public
      */
     function __construct($start) {
         $this->_stack = array($start);
     }

     /**
      *    Old style constructor name. Kept so that code calling
      *    parent::Doku_LexerStateStack() keeps working; PHP 8 no longer
      *    treats a method named after the class as the constructor.
      *    @param string $start        Starting state name.
      *    @access public
      */
     function Doku_LexerStateStack($start) {
         $this->__construct($start);
     }

     /**
      *    Accessor for current state.
      *    @return string       State.
      *    @access public
      */
     function getCurrent() {
         return $this->_stack[count($this->_stack) - 1];
     }

     /**
      *    Adds a state to the stack and sets it
      *    to be the current state.
      *    @param string $state        New state.
      *    @access public
      */
     function enter($state) {
         array_push($this->_stack, $state);
     }

     /**
      *    Leaves the current state and reverts
      *    to the previous one.
      *    @return boolean    False if we drop off
      *                       the bottom of the list.
      *    @access public
      */
     function leave() {
         // the start state always stays on the stack
         if (count($this->_stack) == 1) {
             return false;
         }
         array_pop($this->_stack);
         return true;
     }
 }
 269  
 /**
  *    Accepts text and breaks it into tokens.
  *    Some optimisation to make sure the
  *    content is only scanned by the PHP regex
  *    parser once. Lexer modes must not start
  *    with leading underscores.
  *    @package Doku
  *    @subpackage Lexer
  */
 class Doku_Lexer {
     /** @var array Doku_LexerParallelRegex objects keyed by mode name */
     var $_regexes;
     /** @var object handler object whose methods receive the tokens */
     var $_parser;
     /** @var Doku_LexerStateStack stack of active modes */
     var $_mode;
     /** @var array mode name => handler method name overrides */
     var $_mode_handlers;
     /** @var boolean true for case sensitive matching */
     var $_case;

     /**
      *    Sets up the lexer in case insensitive matching
      *    by default.
      *    @param Doku_Parser $parser  Handling strategy by
      *                                    reference.
      *    @param string $start            Starting handler.
      *    @param boolean $case            True for case sensitive.
      *    @access public
      */
     function __construct(&$parser, $start = "accept", $case = false) {
         $this->_case = $case;
         $this->_regexes = array();
         $this->_parser = &$parser;
         // plain assignment: "=& new" is a parse error since PHP 7
         $this->_mode = new Doku_LexerStateStack($start);
         $this->_mode_handlers = array();
     }

     /**
      *    Old style constructor name. Kept so that code calling
      *    parent::Doku_Lexer() keeps working; PHP 8 no longer treats a
      *    method named after the class as the constructor.
      *    @param Doku_Parser $parser  Handling strategy by
      *                                    reference.
      *    @param string $start            Starting handler.
      *    @param boolean $case            True for case sensitive.
      *    @access public
      */
     function Doku_Lexer(&$parser, $start = "accept", $case = false) {
         $this->__construct($parser, $start, $case);
     }

     /**
      *    Makes sure a parallel regex object exists for the given mode.
      *    @param string $mode         Mode name.
      *    @access private
      */
     function _ensureRegex($mode) {
         if (! isset($this->_regexes[$mode])) {
             $this->_regexes[$mode] = new Doku_LexerParallelRegex($this->_case);
         }
     }

     /**
      *    Adds a token search pattern for a particular
      *    parsing mode. The pattern does not change the
      *    current mode.
      *    @param string $pattern      Perl style regex, but ( and )
      *                                lose the usual meaning.
      *    @param string $mode         Should only apply this
      *                                pattern when dealing with
      *                                this type of input.
      *    @access public
      */
     function addPattern($pattern, $mode = "accept") {
         $this->_ensureRegex($mode);
         $this->_regexes[$mode]->addPattern($pattern);
     }

     /**
      *    Adds a pattern that will enter a new parsing
      *    mode. Useful for entering parenthesis, strings,
      *    tags, etc.
      *    @param string $pattern      Perl style regex, but ( and )
      *                                lose the usual meaning.
      *    @param string $mode         Should only apply this
      *                                pattern when dealing with
      *                                this type of input.
      *    @param string $new_mode     Change parsing to this new
      *                                nested mode.
      *    @access public
      */
     function addEntryPattern($pattern, $mode, $new_mode) {
         $this->_ensureRegex($mode);
         $this->_regexes[$mode]->addPattern($pattern, $new_mode);
     }

     /**
      *    Adds a pattern that will exit the current mode
      *    and re-enter the previous one.
      *    @param string $pattern      Perl style regex, but ( and )
      *                                lose the usual meaning.
      *    @param string $mode         Mode to leave.
      *    @access public
      */
     function addExitPattern($pattern, $mode) {
         $this->_ensureRegex($mode);
         // "__exit" is the magic label recognised by _isModeEnd()
         $this->_regexes[$mode]->addPattern($pattern, "__exit");
     }

     /**
      *    Adds a pattern that has a special mode. Acts as an entry
      *    and exit pattern in one go, effectively calling a special
      *    parser handler for this token only.
      *    @param string $pattern      Perl style regex, but ( and )
      *                                lose the usual meaning.
      *    @param string $mode         Should only apply this
      *                                pattern when dealing with
      *                                this type of input.
      *    @param string $special      Use this mode for this one token.
      *    @access public
      */
     function addSpecialPattern($pattern, $mode, $special) {
         $this->_ensureRegex($mode);
         // the leading underscore marks the label as a single token mode
         $this->_regexes[$mode]->addPattern($pattern, "_$special");
     }

     /**
      *    Adds a mapping from a mode to another handler.
      *    @param string $mode        Mode to be remapped.
      *    @param string $handler     New target handler.
      *    @access public
      */
     function mapHandler($mode, $handler) {
         $this->_mode_handlers[$mode] = $handler;
     }

     /**
      *    Splits the page text into tokens. Will fail
      *    if the handlers report an error or if no
      *    content is consumed. If successful then each
      *    unparsed and parsed token invokes a call to the
      *    held listener.
      *    @param string $raw        Raw text to tokenise.
      *    @return boolean           True on success, else false.
      *    @access public
      */
     function parse($raw) {
         if (! isset($this->_parser)) {
             return false;
         }
         $initialLength = strlen($raw);
         $length = $initialLength;
         $pos = 0;
         // _reduce() eats the leading part of $raw on every successful round
         while (is_array($parsed = $this->_reduce($raw))) {
             list($unmatched, $matched, $mode) = $parsed;
             $currentLength = strlen($raw);
             // byte offset of the matched token within the original text
             $matchPos = $initialLength - $currentLength - strlen($matched);
             if (! $this->_dispatchTokens($unmatched, $matched, $mode, $pos, $matchPos)) {
                 return false;
             }
             // nothing was consumed: bail out instead of looping forever
             if ($currentLength == $length) {
                 return false;
             }
             $length = $currentLength;
             $pos = $initialLength - $currentLength;
         }
         // false from _reduce() signals a parsing error
         if (!$parsed) {
             return false;
         }
         // whatever is left over is plain unmatched text
         return $this->_invokeParser($raw, DOKU_LEXER_UNMATCHED, $pos);
     }

     /**
      *    Sends the matched token and any leading unmatched
      *    text to the parser changing the lexer to a new
      *    mode if one is listed.
      *
      *    $mode used to carry a default of false, but as required
      *    parameters follow it the default could never be used and PHP 8
      *    reports such a signature as deprecated.
      *    @param string $unmatched    Unmatched leading portion.
      *    @param string $matched      Actual token match.
      *    @param string $mode         Mode after match. A non string
      *                                mode causes no change.
      *    @param int $initialPos      Byte index in the raw doc at which
      *                                the unmatched portion starts
      *    @param int $matchPos        Byte index in the raw doc at which
      *                                the matched token starts
      *    @return boolean             False if there was any error
      *                                from the parser.
      *    @access private
      */
     function _dispatchTokens($unmatched, $matched, $mode, $initialPos, $matchPos) {
         if (! $this->_invokeParser($unmatched, DOKU_LEXER_UNMATCHED, $initialPos) ){
             return false;
         }
         if ($this->_isModeEnd($mode)) {
             if (! $this->_invokeParser($matched, DOKU_LEXER_EXIT, $matchPos)) {
                 return false;
             }
             return $this->_mode->leave();
         }
         if ($this->_isSpecialMode($mode)) {
             // enter the special mode for this one token only
             $this->_mode->enter($this->_decodeSpecial($mode));
             if (! $this->_invokeParser($matched, DOKU_LEXER_SPECIAL, $matchPos)) {
                 return false;
             }
             return $this->_mode->leave();
         }
         if (is_string($mode)) {
             $this->_mode->enter($mode);
             return $this->_invokeParser($matched, DOKU_LEXER_ENTER, $matchPos);
         }
         return $this->_invokeParser($matched, DOKU_LEXER_MATCHED, $matchPos);
     }

     /**
      *    Tests to see if the new mode is actually to leave
      *    the current mode and pop an item from the matching
      *    mode stack.
      *    @param string $mode    Mode to test.
      *    @return boolean        True if this is the exit mode.
      *    @access private
      */
     function _isModeEnd($mode) {
         return ($mode === "__exit");
     }

     /**
      *    Test to see if the mode is one where this mode
      *    is entered for this token only and automatically
      *    leaves immediately afterwards.
      *    @param string $mode    Mode to test.
      *    @return boolean        True if this is a single token mode.
      *    @access private
      */
     function _isSpecialMode($mode) {
         // unlabelled patterns hand in boolean true, which is never special
         return (is_string($mode) && strncmp($mode, "_", 1) == 0);
     }

     /**
      *    Strips the magic underscore marking single token
      *    modes.
      *    @param string $mode    Mode to decode.
      *    @return string         Underlying mode name.
      *    @access private
      */
     function _decodeSpecial($mode) {
         return substr($mode, 1);
     }

     /**
      *    Calls the parser method named after the current
      *    mode. Empty content will be ignored. The lexer
      *    has a parser handler for each mode in the lexer.
      *    @param string $content        Text parsed.
      *    @param int $is_match          One of the DOKU_LEXER_* constants
      *                                  describing the kind of token.
      *    @param int $pos         Current byte index location in raw doc
      *                                thats being parsed
      *    @return mixed                 True for ignored empty content,
      *                                  otherwise whatever the handler returns.
      *    @access private
      */
     function _invokeParser($content, $is_match, $pos) {
         if (($content === "") || ($content === false)) {
             return true;
         }
         $handler = $this->_mode->getCurrent();
         if (isset($this->_mode_handlers[$handler])) {
             $handler = $this->_mode_handlers[$handler];
         }

         // modes starting with plugin_ are all handled by the same
         // handler but with an additional parameter
         if (substr($handler, 0, 7) == 'plugin_') {
             // explode() replaces split(), which was removed in PHP 7
             list($handler, $plugin) = explode('_', $handler, 2);
             return $this->_parser->$handler($content, $is_match, $pos, $plugin);
         }

         return $this->_parser->$handler($content, $is_match, $pos);
     }

     /**
      *    Tries to match a chunk of text and if successful
      *    removes the recognised chunk and any leading
      *    unparsed data. Empty strings will not be matched.
      *    @param string $raw         The subject to parse. This is the
      *                               content that will be eaten.
      *    @return array              Three item list of unparsed
      *                               content followed by the
      *                               recognised token and finally the
      *                               action the parser is to take.
      *                               True if no match, false if there
      *                               is a parsing error.
      *    @access private
      */
     function _reduce(&$raw) {
         $current = $this->_mode->getCurrent();
         if (! isset($this->_regexes[$current])) {
             return false;
         }
         if ($raw === "") {
             return true;
         }
         if ($action = $this->_regexes[$current]->split($raw, $split)) {
             // the post-match remainder becomes the new subject
             list($unparsed, $match, $raw) = $split;
             return array($unparsed, $match, $action);
         }
         return true;
     }
 }
 552  
 553  /**
 554  * Escapes regex characters other than (, ) and /
 555  * @TODO
 556  */
 557  function Doku_Lexer_Escape($str) {
 558      //$str = addslashes($str);
 559      $chars = array(
 560          '/\\\\/',
 561          '/\./',
 562