package jlib.util;

import java.util.*;



/**
 * The class <code>RegExp</code> searches a string looking for a 
 * substring that matches a pattern given by a regular expression. A 
 * regular expression is a sequence of <i>tokens</i>. Each token 
 * matches a certain character or a set of characters. The tokens that 
 * does not have a special meaning matches the character iterally.<p>
 * 
 * Bellow is a brief description of each token:
 *
 * <dl>
 * <dt>\b</dt>      <dd>matches a backspace</dd>
 * <dt>\f</dt>      <dd>matches a form feed</dd>
 * <dt>\n</dt>      <dd>matches a line feed</dd>
 * <dt>\r</dt>      <dd>matches a carriage return</dd>
 * <dt>\t</dt>      <dd>matches a tab</dd>
 * <dt>&#92;uhhhh</dt>  <dd>matches a character given by its Unicode 
 *                          value, as four hexadecimal digits 
 *                          (<i>hhhh</i>)</dd>
 * <dt>\x</dt>      <dd>matches the character <i>x</i> (used to match 
 *                      a character that otherwise would have a 
 *                      special meaning)</dd>
 * <dt>^</dt>       <dd>matches the start of the input; this token 
 *                      consider the start of the input the position 
 *                      from where start searching, not the beginning 
 *                      of the string</dd>
 * <dt>$</dt>       <dd>matches the end of the input</dd>
 * <dt>*</dt>       <dd>matches the preceding token 0 or more 
 *                      times</dd>
 * <dt>+</dt>       <dd>matches the preceding token 1 or more 
 *                      times</dd>
 * <dt>?</dt>       <dd>matches the preceding token 0 or 1 time</dd>
 * <dt>.</dt>       <dd>matches any character</dd>
 * <dt>(x)</dt>     <dd>matches the expression <i>x</i> 
 *                      (see grammar)</dd>
 * <dt>x|y</dt>     <dd>matches the expressions <i>x</i> or <i>y</i> 
 *                      (see grammar)</dd>
 * <dt>[xyz]</dt>   <dd>character set; matches any character of the 
 *                      set; a range of characters can be specified as 
 *                      <i>x</i><code>-</code><i>y</i> 
 *                      (see grammar)</dd>
 * <dt>[^xyz]</dt>  <dd>a negated character set; matches any character 
 *                      that is not in the set (see grammar)</dd>
 * <dt>\d</dt>      <dd>matches a digit; equivalent to 
 *                      <code>[0-9]</code></dd>
 * <dt>\D</dt>      <dd>matches any character except a digit; 
 *                      equivalent to <code>[^0-9]</code></dd>
 * <dt>\s</dt>      <dd>matches a space, tab, form feed or line feed; 
 *                      equivalent to <code>[ \f\n\r\t]</code></dd>
 * <dt>\S</dt>      <dd>matches any character except a space, tab, 
 *                      form feed and line feed; equivalent to 
                        <code>[^ \f\n\r\t]</code></dd>
 * <dt>\w</dt>      <dd>matches any alphanumeric character, including 
 *                      underscore; equivalent to 
 *                      <code>[A-Za-z0-9_]</code></dd>
 * <dt>\W</dt>      <dd>matches any character, except an alphanumeric 
 *                      character and underscore; equivalent to 
 *                      <code>[^A-Za-z0-9_]</code></dd>
 * </dl><p>
 * 
 * Bellow is a formal BNF grammar for the regular expression:
 *
 * <pre>
 * reg-exp      ::= exp { '|' exp }
 * exp          ::= token { token }
 * token        ::= rep-token [ '*' | '+' | '?' ] | '^' | '$'
 * rep-token    ::= string | '(' reg-exp ')' | set | '.' | class-match
 * string       ::= schar { schar }
 * set          ::= '[' ['^'] set-spec { set-spec } ']'
 * set-spec     ::= schar | schar '-' schar
 * class-match  ::= '\' ( 'd' | 'D' | 's' | 'S' | 'w' | 'W' )
 * schar        ::= escape | char
 * escape       ::= '\' 'u' hex hex hex hex | '\' char
 * hex          ::= hex digit
 * char         ::= unicode character
 * </pre>
 */
public class RegExp
  {
    private String  expression;    // original regular expression 
                                   // string representation
    private boolean ignoreCase;    // true to ignore case
    private Node    root;          // regular expression root node
    
    private String  input;         // input string
    private int     start;         // position to begin searching

    private int     index;         // current position
    private boolean found;         // true if last search successful, 
                                   // false otherwise
    private int     begin, end;    // last match positions



    //----------------------- inner classes -------------------------
    /**
     * Regular expression node. The regular expression is represented 
     * as a graph of nodes. Each node represents a particular regular 
     * expression pattern. the links represents the possible 
     * alternatives.
     */
    private class Node
      {
        Vector    links = new Vector();

        /**
         * Matches the pattern represented by this node to the input 
         * string current position and updates the current position if 
         * there is a match.
         *
         * @return  true if there is a match, false otherwise
         */
        boolean match()
          {
            return true;
          }
      }



    /**
     * Matches the beggining of the string.
     */
    private class Bos extends Node
      {
        boolean match()
          {
            if ( index == start )
              return true;
            else
              return false;
          }
      }



    /**
     * Matches the end of the string.
     */
    private class Eos extends Node
      {
        boolean match()
          {
            if ( index == input.length() )
              return true;
            else
              return false;
          }
      }



    /**
     * The class Literal matches a character.
     */
    private class Literal extends Node
      {
        char pattern;

        boolean match()
          {
            if ( index >= input.length() )
              return false;
            char c1 = ignoreCase?Character.toUpperCase(pattern):
                                 pattern;
            char c2 = input.charAt(index);
            if ( ignoreCase )
              c2 = Character.toUpperCase(c2);
            if ( c1 == c2 )
              {
                index++;
                return true;
              }
            else
              return false;
          }
      }



    /**
     * The class Set matches any character that is part of a set of 
     * characters, specified as a list of ranges.
     */
    private class Set extends Node
      {
        private char[][]    set;
        private boolean     invert;

        boolean match()
          {
            if ( index >= input.length() )
              return false;
            char c = input.charAt(index);
            if ( ignoreCase )
              c = Character.toUpperCase(c);
            for ( int i=0; i<set.length; i++ )
              {
                char rangeBegin = set[i][0];
                char rangeEnd = set[i][1];
                if ( ignoreCase )
                  {
                    rangeBegin = Character.toUpperCase(rangeBegin);
                    rangeEnd = Character.toUpperCase(rangeEnd);
                  }
                if ( (c >= rangeBegin) && (c <= rangeEnd) )
                  if ( invert )
                    return false;
                  else
                    {
                      index++;
                      return true;
                    }
              }
            if ( invert )
              {
                index++;
                return true;
              }
            return false;
          }
      }



    /**
     * The class Wildcard matches any character.
     */
    private class Wildcard extends Node
      {
        boolean match()
          {
            if ( index >= input.length() )
              return false;
            index++;
            return true;
          }
      }



    //----------------------- internal methods ----------------------
    /**
     * Throws an exception if the end of expression reached.
     *
     * @throws  SyntaxError   if end of expression reached
     */
    private void checkEnd() throws SyntaxError
      {
        if ( index >= expression.length() )
          throw new SyntaxError("unexpected end of input");
      }



    /**
     * Throws an exception if a syntax error found.
     *
     * @param   error   true if syntax error
     *
     * @throws  SyntaxError   if a syntax error found
     */
    private void checkSyntax( boolean error ) throws SyntaxError
      {
        if ( error )
          throw new SyntaxError("syntax error at "+index);
      }



    /**
     * Parses a escape code.
     *
     * @return  the character corresponding to the escape code
     *
     * @throws  SyntaxError   if a syntax error found
     */
    private char parseEscape() throws SyntaxError
      {
        checkEnd();
        char c = expression.charAt(index);
        index++;
        switch ( c )
          {
            case 'b':
            case 'B':
              c = '\b';
              break;

            case 'f':
            case 'F':
              c = '\f';
              break;

            case 'n':
            case 'N':
              c = '\n';
              break;

            case 'r':
            case 'R':
              c = '\r';
              break;

            case 't':
            case 'T':
              c = '\t';
              break;

            case 'u':
            case 'U':
              c = 0;
              char digit;
              for ( int i=0; i<4; i++ )
                {
                  checkEnd();
                  digit = Character.toUpperCase(expression.charAt(index));
                  checkSyntax( (digit < '0') || (digit > 'F') || 
                               ((digit > '9') && (digit < 'A')) );
                  digit -= '0';
                  if ( digit > 9 )
                    digit -= 7;
                  c = (char) (c*16 + digit);
                  index++;
                }
              break;
          }
        return c;
      }


    /**
     * Parses an expression.
     *
     * @return  the expression starting and ending nodes
     *
     * @throws  SyntaxError   if a syntax error found
     */
    private Node[] parseExpression() throws SyntaxError
      {
        Node[] exp = null;
        while ( true )
          {
            Node[] token = parseToken();
            if ( exp == null )
              exp = token;
            else
              exp[1].links.addElement( token[0] );
            exp[1] = token[1];
            if ( index >= expression.length() )
              break;
            char c = expression.charAt(index);
            if ( (c == '|') || (c == ')') )
              break;
          }
        return exp;
      }



    /**
     * Parses a regular expression.
     *
     * @return  the regular expression starting and ending nodes
     *
     * @throws  SyntaxError   if a syntax error found
     */
    private Node[] parseRegularExpression() throws SyntaxError
      {
        Node[] regexp = {new Node(), new Node()};
        while ( true )
          {
            Node[] exp = parseExpression();
            regexp[0].links.addElement( exp[0] );
            exp[1].links.addElement( regexp[1] );
            if ( index >= expression.length() )
              break;
            char c = expression.charAt(index);
            if ( c == ')' )
              break;
            checkSyntax( c != '|' );
            index++;
          }
        return regexp;
      }



    /**
     * Parses a token that accepts a modifier.
     *
     * @return  token starting and ending nodes
     *
     * @throws  SyntaxError   if a syntax error found
     */
    private Node[] parseRepToken() throws SyntaxError
      {
        Node[] token;

        char c = expression.charAt(index);
        index++;
        switch ( c )
          {
            case '(':
              token = parseRegularExpression();
              checkEnd();
              c = expression.charAt(index);
              checkSyntax( c != ')' );
              index++;
              break;

            case '[':
              token = new Node[2];
              token[0] = token[1] = parseSet();
              break;

            case '.':
              token = new Node[2];
              token[0] = token[1] = new Wildcard();
              break;

            case '\\':
              token = new Node[2];
              checkEnd();
              c = expression.charAt(index);
              boolean invert = false;
              switch ( c )
                {
                  case 'D':
                    invert = true;
                  case 'd':
                    token[0] = token[1] = new Set();
                    ((Set)token[0]).set = new char[][] {{'0','9'}};
                    ((Set)token[0]).invert = invert;
                    index++;
                    break;
                    
                  case 'S':
                    invert = true;
                  case 's':
                    token[0] = token[1] = new Set();
                    ((Set)token[0]).set = new char[][] 
                                          {{'\f','\f'},{'\n','\n'},
                                          {'\r','\r'},{'\t','\t'},
                                          {' ',' '}};
                    ((Set)token[0]).invert = invert;
                    index++;
                    break;
                    
                  case 'W':
                    invert = true;
                  case 'w':
                    token[0] = token[1] = new Set();
                    ((Set)token[0]).set = new char[][] 
                                          {{'A','Z'},{'a','z'},
                                          {'0','9'},{'_','_'}};
                    ((Set)token[0]).invert = invert;
                    index++;
                    break;
                    
                  default:
                    token[0] = token[1] = new Literal();
                    ((Literal)token[0]).pattern = parseEscape();
                    break;
                }
              break;

            default:
              token = new Node[2];
              token[0] = token[1] = new Literal();
              ((Literal)token[0]).pattern = c;
              break;
          }

        if ( index < expression.length() )
          {
            c = expression.charAt(index);
            if ( (c == '*') || (c == '+') || (c == '?') )
              {
                Node[] aux = {new Node(), new Node()};
                aux[0].links.addElement( token[0] );
                token[1].links.addElement( aux[1] );
                if ( (c == '*') || (c == '?') )
                  aux[0].links.addElement( aux[1] );
                if ( (c == '*') || (c == '+') )
                  token[1].links.addElement( token[0] );
                token = aux;
                index++;
              }
          }

        return token;
      }



    /**
     * Parses a set.
     *
     * @return  a Set object
     *
     * @throws  SyntaxError   if a syntax error found
     */
    private Set parseSet() throws SyntaxError
      {
        Vector  set     = new Vector();
        boolean invert = false;
        char    c;
        Set     node;
        
        checkEnd();
        c = expression.charAt(index);
        if ( c == '^' )
          {
            invert = true;
            index++;
            checkEnd();
            c = expression.charAt(index);
          }
        checkSyntax( c == ']' );
        while ( true )
          {
            char c1, c2;
            checkEnd();
            c1 = c2 = expression.charAt(index);
            checkSyntax( c1 == '-' );
            index++;
            if ( c1 == ']' )
              break;
            if ( c1 == '\\' )
              c1 = c2 = parseEscape();
            checkEnd();
            c = expression.charAt(index);
            if ( c == '-' )
              {
                index++;
                checkEnd();
                c2 = expression.charAt(index);
                checkSyntax( c2 == ']' );
                int aux = index;
                index++;
                if ( c2 == '\\' )
                  c2 = parseEscape();
                if ( c2 < c1 )
                  {
                    index = aux;
                    checkSyntax( true );
                  }
              }
            set.addElement( new char[] {c1,c2} );
          }
        node = new Set();
        node.set = new char[set.size()][];
        for ( int i=0; i<set.size(); i++ )
          node.set[i] = (char[]) set.elementAt(i);
        node.invert = invert;
        return node;
      }



    /**
     * Parses a token.
     *
     * @return  the token starting and ending nodes
     *
     * @throws  SyntaxError   if a syntax error found
     */
    private Node[] parseToken() throws SyntaxError
      {
        Node[]  token;
        Node    node;

        checkEnd();
        char c = expression.charAt(index);
        switch ( c )
          {
            case '*':
            case '+':
            case '?':
            case ']':
            case '|':
            case ')':
              checkSyntax( true );

            case '^':
              node = new Bos();
              token = new Node[] {node, node};
              index++;
              break;

            case '$':
              node = new Eos();
              token = new Node[] {node, node};
              index++;
              break;

            default:
              token = parseRepToken();
              break;
          }
        return token;
      }



    /**
     * Try every possible path until the last node is reached. Uses a 
     * loop to avoid tail recursion.
     *
     * @param   node    node to visit
     */
    private void visit( Node node )
      {
        int aux = index;
        while ( true )
          {
            if ( ! node.match() )
              break;
            if ( node.links.size() == 0 )
              {
                found = true;
                return;
              }
            for ( int i=0; i<node.links.size()-1; i++ )
              {
                visit( (Node)node.links.elementAt(i) );
                if ( found )
                  return;
              }
            node = (Node) node.links.lastElement();
          }
        index = aux;
      }



    //-------------------------- class API --------------------------
    /**
     * Constructs a case sensitive regular expression, given a string
     * representation of the expression.
     *
     * @param   expression  expression string representation
     *
     * @throws  SyntaxError   if the regular expression is not valid
     */
    public RegExp( String expression ) throws SyntaxError
      {
        this( expression, false );
      }



    /**
     * Constructs a regular expression, given a string representation 
     * of the expression.
     *
     * @param   expression  expression string representation
     * @param   useCase     false to ignore case
     *
     * @throws  SyntaxError   if the regular expression is not valid
     */
    public RegExp( String expression, boolean useCase ) 
           throws SyntaxError
      {
        this.expression = expression;
        this.ignoreCase = !useCase;
        index = 0;
        root = parseRegularExpression()[0];
        checkSyntax( index < expression.length() );
      }



    /**
     * Returns the status of the last search.
     *
     * @return  true if last search successfull, false otherwise
     */
    public boolean found()
      {
        return found;
      }



    /**
     * Returns whether this regular expression is case sensitive or 
     * not.
     *
     * @return  true if this regular is case sensitive, false 
     *          otherwise
     */
    public boolean getCase()
      {
        return !ignoreCase;
      }



    /**
     * Returns the original regular expression string representation.
     *
     * @return  the original expression
     */
    public String getExpression()
      {
        return expression;
      }



    /**
     * Returns the last string searched on.
     *
     * @return  the last string searched
     */
    public String getInput()
      {
        return input;
      }



    /**
     * Returns the substring found in the last search operation.
     *
     * @return  the substring found
     */
    public String match()
      {
        return input.substring(begin,end+1);
      }



    /**
     * Returns the starting position of the last match.
     *
     * @return  the starting position of the last match
     */
    public int matchBegin()
      {
        return begin;
      }



    /**
     * Returns the end position of the last match.
     *
     * @return  the end position of the last match
     */
    public int matchEnd()
      {
        return end;
      }



    /**
     * Searchs the next substring that matches this regular 
     * expression. The method <code>next</code> can only be used after 
     * the <code>search</code> and <code>startSearch</code> methods.
     *
     * @return  true if a match was found, false otherwise
     */
    public boolean next()
      {
        found = false;
        begin = index;
        while ( true )
          {
            index = begin;
            visit( root );
            if ( found )
              break;
            begin++;
            if ( begin >= input.length() )
              break;
          }
        end = index - 1;
        return found;
      }



    /**
     * Given a string, looks for the first substring that matches this 
     * regular expression starting at the beggining of the string. The 
     * results of the operation can be queried through a number of 
     * methods, until a new search is performed.
     *
     * @param   input     input string
     *
     * @return  true if a hit was found, false otherwise
     */
    public boolean search( String input )
      {
        return search(input,0);
      }



    /**
     * Given a string, looks for the first substring that matches this 
     * regular expression starting at a given position. The results of 
     * the operation can be queried through a number of methods, until 
     * a new search is performed.
     *
     * @param   input     input string
     * @param   start     position to begin searching
     *
     * @return  true if a match was found, false otherwise
     */
    public boolean search( String input, int start )
      {
        startSearch( input, start );
        return next();
      }



    /**
     * Sets whether this regular expression is case sensitive or not.
     *
     * @param   useCase   true to set this regular expression case 
     *                    sensitive
     */
    public void setCase( boolean useCase )
      {
        ignoreCase = !useCase;
      }



    /**
     * Initializes this object so that the <code>next</code> method 
     * will return the substrings that match this regular expression 
     * in sequence, starting at the beggining of the input string.
     *
     * @param   input     input string
     */
    public void startSearch( String input )
      {
        startSearch( input, 0 );
      }



    /**
     * Initializes this object so that the <code>next</code> method 
     * will return the substrings that match this regular expression 
     * in sequence, starting at a given position.
     *
     * @param   input     input string
     * @param   start     position to begin searching
     */
    public void startSearch( String input, int start )
      {
        this.input = input;
        this.start = start;
        index = start;
      }
  }
  
