- package com.sun.org.apache.regexp.internal;
-
- /*
- * ====================================================================
- *
- * The Apache Software License, Version 1.1
- *
- * Copyright (c) 1999 The Apache Software Foundation. All rights
- * reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * 3. The end-user documentation included with the redistribution, if
- * any, must include the following acknowlegement:
- * "This product includes software developed by the
- * Apache Software Foundation (http://www.apache.org/)."
- * Alternately, this acknowlegement may appear in the software itself,
- * if and wherever such third-party acknowlegements normally appear.
- *
- * 4. The names "The Jakarta Project", "Jakarta-Regexp", and "Apache Software
- * Foundation" must not be used to endorse or promote products derived
- * from this software without prior written permission. For written
- * permission, please contact apache@apache.org.
- *
- * 5. Products derived from this software may not be called "Apache"
- * nor may "Apache" appear in their names without prior written
- * permission of the Apache Group.
- *
- * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
- * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
- * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
- * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
- * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
- * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- * ====================================================================
- *
- * This software consists of voluntary contributions made by many
- * individuals on behalf of the Apache Software Foundation. For more
- * information on the Apache Software Foundation, please see
- * <http://www.apache.org/>.
- *
- */
-
- import com.sun.org.apache.regexp.internal.RE;
- import java.util.Hashtable;
-
- /**
- * A regular expression compiler class. This class compiles a pattern string into a
- * regular expression program interpretable by the RE evaluator class. The 'recompile'
- * command line tool uses this compiler to pre-compile regular expressions for use
- * with RE. For a description of the syntax accepted by RECompiler and what you can
- * do with regular expressions, see the documentation for the RE matcher class.
- *
- * @see RE
- * @see recompile
- *
- * @author <a href="mailto:jonl@muppetlabs.com">Jonathan Locke</a>
- * @version $Id: RECompiler.java,v 1.2 2000/05/14 21:04:17 jon Exp $
- */
- public class RECompiler
- {
- // The compiled program
- char[] instruction; // The compiled RE 'program' instruction buffer
- int lenInstruction; // The amount of the program buffer currently in use
-
- // Input state for compiling regular expression
- String pattern; // Input string
- int len; // Length of the pattern string
- int idx; // Current input index into ac
- int parens; // Total number of paren pairs
-
- // Node flags
- static final int NODE_NORMAL = 0; // No flags (nothing special)
- static final int NODE_NULLABLE = 1; // True if node is potentially null
- static final int NODE_TOPLEVEL = 2; // True if top level expr
-
- // Special types of 'escapes'
- static final char ESC_MASK = 0xfff0; // Escape complexity mask
- static final char ESC_BACKREF = 0xffff; // Escape is really a backreference
- static final char ESC_COMPLEX = 0xfffe; // Escape isn't really a true character
- static final char ESC_CLASS = 0xfffd; // Escape represents a whole class of characters
-
- // {m,n} stacks
- static final int maxBrackets = 10; // Maximum number of bracket pairs
- static int brackets = 0; // Number of bracket sets
- static int[] bracketStart = null; // Starting point
- static int[] bracketEnd = null; // Ending point
- static int[] bracketMin = null; // Minimum number of matches
- static int[] bracketOpt = null; // Additional optional matches
- static final int bracketUnbounded = -1; // Unbounded value
- static final int bracketFinished = -2; // Unbounded value
-
- // Lookup table for POSIX character class names
- static Hashtable hashPOSIX = new Hashtable();
- static
- {
- hashPOSIX.put("alnum", new Character(RE.POSIX_CLASS_ALNUM));
- hashPOSIX.put("alpha", new Character(RE.POSIX_CLASS_ALPHA));
- hashPOSIX.put("blank", new Character(RE.POSIX_CLASS_BLANK));
- hashPOSIX.put("cntrl", new Character(RE.POSIX_CLASS_CNTRL));
- hashPOSIX.put("digit", new Character(RE.POSIX_CLASS_DIGIT));
- hashPOSIX.put("graph", new Character(RE.POSIX_CLASS_GRAPH));
- hashPOSIX.put("lower", new Character(RE.POSIX_CLASS_LOWER));
- hashPOSIX.put("print", new Character(RE.POSIX_CLASS_PRINT));
- hashPOSIX.put("punct", new Character(RE.POSIX_CLASS_PUNCT));
- hashPOSIX.put("space", new Character(RE.POSIX_CLASS_SPACE));
- hashPOSIX.put("upper", new Character(RE.POSIX_CLASS_UPPER));
- hashPOSIX.put("xdigit", new Character(RE.POSIX_CLASS_XDIGIT));
- hashPOSIX.put("javastart", new Character(RE.POSIX_CLASS_JSTART));
- hashPOSIX.put("javapart", new Character(RE.POSIX_CLASS_JPART));
- }
-
- /**
- * Constructor. Creates (initially empty) storage for a regular expression program.
- */
- public RECompiler()
- {
- // Start off with a generous, yet reasonable, initial size
- instruction = new char[128];
- lenInstruction = 0;
- }
-
- /**
- * Ensures that n more characters can fit in the program buffer.
- * If n more can't fit, then the size is doubled until it can.
- * @param n Number of additional characters to ensure will fit.
- */
- void ensure(int n)
- {
- // Get current program length
- int curlen = instruction.length;
-
- // If the current length + n more is too much
- if (lenInstruction + n >= curlen)
- {
- // Double the size of the program array until n more will fit
- while (lenInstruction + n >= curlen)
- {
- curlen *= 2;
- }
-
- // Allocate new program array and move data into it
- char[] newInstruction = new char[curlen];
- System.arraycopy(instruction, 0, newInstruction, 0, lenInstruction);
- instruction = newInstruction;
- }
- }
-
- /**
- * Emit a single character into the program stream.
- * @param c Character to add
- */
- void emit(char c)
- {
- // Make room for character
- ensure(1);
-
- // Add character
- instruction[lenInstruction++] = c;
- }
-
- /**
- * Inserts a node with a given opcode and opdata at insertAt. The node relative next
- * pointer is initialized to 0.
- * @param opcode Opcode for new node
- * @param opdata Opdata for new node (only the low 16 bits are currently used)
- * @param insertAt Index at which to insert the new node in the program
- */
- void nodeInsert(char opcode, int opdata, int insertAt)
- {
- // Make room for a new node
- ensure(RE.nodeSize);
-
- // Move everything from insertAt to the end down nodeSize elements
- System.arraycopy(instruction, insertAt, instruction, insertAt + RE.nodeSize, lenInstruction - insertAt);
- instruction[insertAt + RE.offsetOpcode] = opcode;
- instruction[insertAt + RE.offsetOpdata] = (char)opdata;
- instruction[insertAt + RE.offsetNext] = 0;
- lenInstruction += RE.nodeSize;
- }
-
- /**
- * Appends a node to the end of a node chain
- * @param node Start of node chain to traverse
- * @param pointTo Node to have the tail of the chain point to
- */
- void setNextOfEnd(int node, int pointTo)
- {
- // Traverse the chain until the next offset is 0
- int next;
- while ((next = instruction[node + RE.offsetNext]) != 0)
- {
- node += next;
- }
-
- // Point the last node in the chain to pointTo.
- instruction[node + RE.offsetNext] = (char)(short)(pointTo - node);
- }
-
- /**
- * Adds a new node
- * @param opcode Opcode for node
- * @param opdata Opdata for node (only the low 16 bits are currently used)
- * @return Index of new node in program
- */
- int node(char opcode, int opdata)
- {
- // Make room for a new node
- ensure(RE.nodeSize);
-
- // Add new node at end
- instruction[lenInstruction + RE.offsetOpcode] = opcode;
- instruction[lenInstruction + RE.offsetOpdata] = (char)opdata;
- instruction[lenInstruction + RE.offsetNext] = 0;
- lenInstruction += RE.nodeSize;
-
- // Return index of new node
- return lenInstruction - RE.nodeSize;
- }
-
-
- /**
- * Throws a new internal error exception
- * @exception Error Thrown in the event of an internal error.
- */
- void internalError() throws Error
- {
- throw new Error("Internal error!");
- }
-
- /**
- * Throws a new syntax error exception
- * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
- */
- void syntaxError(String s) throws RESyntaxException
- {
- throw new RESyntaxException(s);
- }
-
- /**
- * Allocate storage for brackets only as needed
- */
- void allocBrackets()
- {
- // Allocate bracket stacks if not already done
- if (bracketStart == null)
- {
- // Allocate storage
- bracketStart = new int[maxBrackets];
- bracketEnd = new int[maxBrackets];
- bracketMin = new int[maxBrackets];
- bracketOpt = new int[maxBrackets];
-
- // Initialize to invalid values
- for (int i = 0; i < maxBrackets; i++)
- {
- bracketStart[i] = bracketEnd[i] = bracketMin[i] = bracketOpt[i] = -1;
- }
- }
- }
-
- /**
- * Match bracket {m,n} expression put results in bracket member variables
- * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
- */
- void bracket() throws RESyntaxException
- {
- // Current character must be a '{'
- if (idx >= len || pattern.charAt(idx++) != '{')
- {
- internalError();
- }
-
- // Next char must be a digit
- if (idx >= len || !Character.isDigit(pattern.charAt(idx)))
- {
- syntaxError("Expected digit");
- }
-
- // Get min ('m' of {m,n}) number
- StringBuffer number = new StringBuffer();
- while (idx < len && Character.isDigit(pattern.charAt(idx)))
- {
- number.append(pattern.charAt(idx++));
- }
- try
- {
- bracketMin[brackets] = Integer.parseInt(number.toString());
- }
- catch (NumberFormatException e)
- {
- syntaxError("Expected valid number");
- }
-
- // If out of input, fail
- if (idx >= len)
- {
- syntaxError("Expected comma or right bracket");
- }
-
- // If end of expr, optional limit is 0
- if (pattern.charAt(idx) == '}')
- {
- idx++;
- bracketOpt[brackets] = 0;
- return;
- }
-
- // Must have at least {m,} and maybe {m,n}.
- if (idx >= len || pattern.charAt(idx++) != ',')
- {
- syntaxError("Expected comma");
- }
-
- // If out of input, fail
- if (idx >= len)
- {
- syntaxError("Expected comma or right bracket");
- }
-
- // If {m,} max is unlimited
- if (pattern.charAt(idx) == '}')
- {
- idx++;
- bracketOpt[brackets] = bracketUnbounded;
- return;
- }
-
- // Next char must be a digit
- if (idx >= len || !Character.isDigit(pattern.charAt(idx)))
- {
- syntaxError("Expected digit");
- }
-
- // Get max number
- number.setLength(0);
- while (idx < len && Character.isDigit(pattern.charAt(idx)))
- {
- number.append(pattern.charAt(idx++));
- }
- try
- {
- bracketOpt[brackets] = Integer.parseInt(number.toString()) - bracketMin[brackets];
- }
- catch (NumberFormatException e)
- {
- syntaxError("Expected valid number");
- }
-
- // Optional repetitions must be > 0
- if (bracketOpt[brackets] <= 0)
- {
- syntaxError("Bad range");
- }
-
- // Must have close brace
- if (idx >= len || pattern.charAt(idx++) != '}')
- {
- syntaxError("Missing close brace");
- }
- }
-
- /**
- * Match an escape sequence. Handles quoted chars and octal escapes as well
- * as normal escape characters. Always advances the input stream by the
- * right amount. This code "understands" the subtle difference between an
- * octal escape and a backref. You can access the type of ESC_CLASS or
- * ESC_COMPLEX or ESC_BACKREF by looking at pattern[idx - 1].
- * @return ESC_* code or character if simple escape
- * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
- */
- char escape() throws RESyntaxException
- {
- // "Shouldn't" happen
- if (pattern.charAt(idx) != '\\')
- {
- internalError();
- }
-
- // Escape shouldn't occur as last character in string!
- if (idx + 1 == len)
- {
- syntaxError("Escape terminates string");
- }
-
- // Switch on character after backslash
- idx += 2;
- char escapeChar = pattern.charAt(idx - 1);
- switch (escapeChar)
- {
- case RE.E_BOUND:
- case RE.E_NBOUND:
- return ESC_COMPLEX;
-
- case RE.E_ALNUM:
- case RE.E_NALNUM:
- case RE.E_SPACE:
- case RE.E_NSPACE:
- case RE.E_DIGIT:
- case RE.E_NDIGIT:
- return ESC_CLASS;
-
- case 'u':
- case 'x':
- {
- // Exact required hex digits for escape type
- int hexDigits = (escapeChar == 'u' ? 4 : 2);
-
- // Parse up to hexDigits characters from input
- int val = 0;
- for ( ; idx < len && hexDigits-- > 0; idx++)
- {
- // Get char
- char c = pattern.charAt(idx);
-
- // If it's a hexadecimal digit (0-9)
- if (c >= '0' && c <= '9')
- {
- // Compute new value
- val = (val << 4) + c - '0';
- }
- else
- {
- // If it's a hexadecimal letter (a-f)
- c = Character.toLowerCase(c);
- if (c >= 'a' && c <= 'f')
- {
- // Compute new value
- val = (val << 4) + (c - 'a') + 10;
- }
- else
- {
- // If it's not a valid digit or hex letter, the escape must be invalid
- // because hexDigits of input have not been absorbed yet.
- syntaxError("Expected " + hexDigits + " hexadecimal digits after \\" + escapeChar);
- }
- }
- }
- return (char)val;
- }
-
- case 't':
- return '\t';
-
- case 'n':
- return '\n';
-
- case 'r':
- return '\r';
-
- case 'f':
- return '\f';
-
- case '0':
- case '1':
- case '2':
- case '3':
- case '4':
- case '5':
- case '6':
- case '7':
- case '8':
- case '9':
-
- // An octal escape starts with a 0 or has two digits in a row
- if ((idx < len && Character.isDigit(pattern.charAt(idx))) || escapeChar == '0')
- {
- // Handle \nnn octal escapes
- int val = escapeChar - '0';
- if (idx < len && Character.isDigit(pattern.charAt(idx)))
- {
- val = ((val << 3) + (pattern.charAt(idx++) - '0'));
- if (idx < len && Character.isDigit(pattern.charAt(idx)))
- {
- val = ((val << 3) + (pattern.charAt(idx++) - '0'));
- }
- }
- return (char)val;
- }
-
- // It's actually a backreference (\[1-9]), not an escape
- return ESC_BACKREF;
-
- default:
-
- // Simple quoting of a character
- return escapeChar;
- }
- }
-
- /**
- * Compile a character class
- * @return Index of class node
- * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
- */
- int characterClass() throws RESyntaxException
- {
- // Check for bad calling or empty class
- if (pattern.charAt(idx) != '[')
- {
- internalError();
- }
-
- // Check for unterminated or empty class
- if ((idx + 1) >= len || pattern.charAt(++idx) == ']')
- {
- syntaxError("Empty or unterminated class");
- }
-
- // Check for POSIX character class
- if (idx < len && pattern.charAt(idx) == ':')
- {
- // Skip colon
- idx++;
-
- // POSIX character classes are denoted with lowercase ASCII strings
- int idxStart = idx;
- while (idx < len && pattern.charAt(idx) >= 'a' && pattern.charAt(idx) <= 'z')
- {
- idx++;
- }
-
- // Should be a ":]" to terminate the POSIX character class
- if ((idx + 1) < len && pattern.charAt(idx) == ':' && pattern.charAt(idx + 1) == ']')
- {
- // Get character class
- String charClass = pattern.substring(idxStart, idx);
-
- // Select the POSIX class id
- Character i = (Character)hashPOSIX.get(charClass);
- if (i != null)
- {
- // Move past colon and right bracket
- idx += 2;
-
- // Return new POSIX character class node
- return node(RE.OP_POSIXCLASS, i.charValue());
- }
- syntaxError("Invalid POSIX character class '" + charClass + "'");
- }
- syntaxError("Invalid POSIX character class syntax");
- }
-
- // Try to build a class. Create OP_ANYOF node
- int ret = node(RE.OP_ANYOF, 0);
-
- // Parse class declaration
- char CHAR_INVALID = Character.MAX_VALUE;
- char last = CHAR_INVALID;
- char simpleChar = 0;
- boolean include = true;
- boolean definingRange = false;
- int idxFirst = idx;
- char rangeStart = Character.MIN_VALUE;
- char rangeEnd;
- RERange range = new RERange();
- while (idx < len && pattern.charAt(idx) != ']')
- {
-
- switchOnCharacter:
-
- // Switch on character
- switch (pattern.charAt(idx))
- {
- case '^':
- include = !include;
- if (idx == idxFirst)
- {
- range.include(Character.MIN_VALUE, Character.MAX_VALUE, true);
- }
- idx++;
- continue;
-
- case '\\':
- {
- // Escape always advances the stream
- char c;
- switch (c = escape ())
- {
- case ESC_COMPLEX:
- case ESC_BACKREF:
-
- // Word boundaries and backrefs not allowed in a character class!
- syntaxError("Bad character class");
-
- case ESC_CLASS:
-
- // Classes can't be an endpoint of a range
- if (definingRange)
- {
- syntaxError("Bad character class");
- }
-
- // Handle specific type of class (some are ok)
- switch (pattern.charAt(idx - 1))
- {
- case RE.E_NSPACE:
- case RE.E_NDIGIT:
- case RE.E_NALNUM:
- syntaxError("Bad character class");
-
- case RE.E_SPACE:
- range.include('\t', include);
- range.include('\r', include);
- range.include('\f', include);
- range.include('\n', include);
- range.include('\b', include);
- range.include(' ', include);
- break;
-
- case RE.E_ALNUM:
- range.include('a', 'z', include);
- range.include('A', 'Z', include);
- range.include('_', include);
-
- // Fall through!
-
- case RE.E_DIGIT:
- range.include('0', '9', include);
- break;
- }
-
- // Make last char invalid (can't be a range start)
- last = CHAR_INVALID;
- break;
-
- default:
-
- // Escape is simple so treat as a simple char
- simpleChar = c;
- break switchOnCharacter;
- }
- }
- continue;
-
- case '-':
-
- // Start a range if one isn't already started
- if (definingRange)
- {
- syntaxError("Bad class range");
- }
- definingRange = true;
-
- // If no last character, start of range is 0
- rangeStart = (last == CHAR_INVALID ? 0 : last);
-
- // Premature end of range. define up to Character.MAX_VALUE
- if ((idx + 1) < len && pattern.charAt(++idx) == ']')
- {
- simpleChar = Character.MAX_VALUE;
- break;
- }
- continue;
-
- default:
- simpleChar = pattern.charAt(idx++);
- break;
- }
-
- // Handle simple character simpleChar
- if (definingRange)
- {
- // if we are defining a range make it now
- rangeEnd = simpleChar;
-
- // Actually create a range if the range is ok
- if (rangeStart >= rangeEnd)
- {
- syntaxError("Bad character class");
- }
- range.include(rangeStart, rangeEnd, include);
-
- // We are done defining the range
- last = CHAR_INVALID;
- definingRange = false;
- }
- else
- {
- // If simple character and not start of range, include it
- if ((idx + 1) >= len || pattern.charAt(idx + 1) != '-')
- {
- range.include(simpleChar, include);
- }
- last = simpleChar;
- }
- }
-
- // Shouldn't be out of input
- if (idx == len)
- {
- syntaxError("Unterminated character class");
- }
-
- // Absorb the ']' end of class marker
- idx++;
-
- // Emit character class definition
- instruction[ret + RE.offsetOpdata] = (char)range.num;
- for (int i = 0; i < range.num; i++)
- {
- emit((char)range.minRange[i]);
- emit((char)range.maxRange[i]);
- }
- return ret;
- }
-
- /**
- * Absorb an atomic character string. This method is a little tricky because
- * it can un-include the last character of string if a closure operator follows.
- * This is correct because *+? have higher precedence than concatentation (thus
- * ABC* means AB(C*) and NOT (ABC)*).
- * @return Index of new atom node
- * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
- */
- int atom() throws RESyntaxException
- {
- // Create a string node
- int ret = node(RE.OP_ATOM, 0);
-
- // Length of atom
- int lenAtom = 0;
-
- // Loop while we've got input
-
- atomLoop:
-
- while (idx < len)
- {
- // Is there a next char?
- if ((idx + 1) < len)
- {
- char c = pattern.charAt(idx + 1);
-
- // If the next 'char' is an escape, look past the whole escape
- if (pattern.charAt(idx) == '\\')
- {
- int idxEscape = idx;
- escape();
- if (idx < len)
- {
- c = pattern.charAt(idx);
- }
- idx = idxEscape;
- }
-
- // Switch on next char
- switch (c)
- {
- case '{':
- case '?':
- case '*':
- case '+':
-
- // If the next character is a closure operator and our atom is non-empty, the
- // current character should bind to the closure operator rather than the atom
- if (lenAtom != 0)
- {
- break atomLoop;
- }
- }
- }
-
- // Switch on current char
- switch (pattern.charAt(idx))
- {
- case ']':
- case '^':
- case '$':
- case '.':
- case '[':
- case '(':
- case ')':
- case '|':
- break atomLoop;
-
- case '{':
- case '?':
- case '*':
- case '+':
-
- // We should have an atom by now
- if (lenAtom == 0)
- {
- // No atom before closure
- syntaxError("Missing operand to closure");
- }
- break atomLoop;
-
- case '\\':
-
- {
- // Get the escaped character (advances input automatically)
- int idxBeforeEscape = idx;
- char c = escape();
-
- // Check if it's a simple escape (as opposed to, say, a backreference)
- if ((c & ESC_MASK) == ESC_MASK)
- {
- // Not a simple escape, so backup to where we were before the escape.
- idx = idxBeforeEscape;
- break atomLoop;
- }
-
- // Add escaped char to atom
- emit(c);
- lenAtom++;
- }
- break;
-
- default:
-
- // Add normal character to atom
- emit(pattern.charAt(idx++));
- lenAtom++;
- break;
- }
- }
-
- // This "shouldn't" happen
- if (lenAtom == 0)
- {
- internalError();
- }
-
- // Emit the atom length into the program
- instruction[ret + RE.offsetOpdata] = (char)lenAtom;
- return ret;
- }
-
- /**
- * Match a terminal node.
- * @param flags Flags
- * @return Index of terminal node (closeable)
- * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
- */
- int terminal(int[] flags) throws RESyntaxException
- {
- switch (pattern.charAt(idx))
- {
- case RE.OP_EOL:
- case RE.OP_BOL:
- case RE.OP_ANY:
- return node(pattern.charAt(idx++), 0);
-
- case '[':
- return characterClass();
-
- case '(':
- return expr(flags);
-
- case ')':
- syntaxError("Unexpected close paren");
-
- case '|':
- internalError();
-
- case ']':
- syntaxError("Mismatched class");
-
- case 0:
- syntaxError("Unexpected end of input");
-
- case '?':
- case '+':
- case '{':
- case '*':
- syntaxError("Missing operand to closure");
-
- case '\\':
- {
- // Don't forget, escape() advances the input stream!
- int idxBeforeEscape = idx;
-
- // Switch on escaped character
- switch (escape())
- {
- case ESC_CLASS:
- case ESC_COMPLEX:
- flags[0] &= ~NODE_NULLABLE;
- return node(RE.OP_ESCAPE, pattern.charAt(idx - 1));
-
- case ESC_BACKREF:
- {
- char backreference = (char)(pattern.charAt(idx - 1) - '0');
- if (parens <= backreference)
- {
- syntaxError("Bad backreference");
- }
- flags[0] |= NODE_NULLABLE;
- return node(RE.OP_BACKREF, backreference);
- }
-
- default:
-
- // We had a simple escape and we want to have it end up in
- // an atom, so we back up and fall though to the default handling
- idx = idxBeforeEscape;
- flags[0] &= ~NODE_NULLABLE;
- break;
- }
- }
- }
-
- // Everything above either fails or returns.
- // If it wasn't one of the above, it must be the start of an atom.
- flags[0] &= ~NODE_NULLABLE;
- return atom();
- }
-
- /**
- * Compile a possibly closured terminal
- * @param flags Flags passed by reference
- * @return Index of closured node
- * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
- */
- int closure(int[] flags) throws RESyntaxException
- {
- // Before terminal
- int idxBeforeTerminal = idx;
-
- // Values to pass by reference to terminal()
- int[] terminalFlags = { NODE_NORMAL };
-
- // Get terminal symbol
- int ret = terminal(terminalFlags);
-
- // Or in flags from terminal symbol
- flags[0] |= terminalFlags[0];
-
- // Advance input, set NODE_NULLABLE flag and do sanity checks
- if (idx >= len)
- {
- return ret;
- }
- boolean greedy = true;
- char closureType = pattern.charAt(idx);
- switch (closureType)
- {
- case '?':
- case '*':
-
- // The current node can be null
- flags[0] |= NODE_NULLABLE;
-
- case '+':
-
- // Eat closure character
- idx++;
-
- case '{':
-
- // Don't allow blantant stupidity
- int opcode = instruction[ret + RE.offsetOpcode];
- if (opcode == RE.OP_BOL || opcode == RE.OP_EOL)
- {
- syntaxError("Bad closure operand");
- }
- if ((terminalFlags[0] & NODE_NULLABLE) != 0)
- {
- syntaxError("Closure operand can't be nullable");
- }
- break;
- }
-
- // If the next character is a '?', make the closure non-greedy (reluctant)
- if (idx < len && pattern.charAt(idx) == '?')
- {
- idx++;
- greedy = false;
- }
-
- if (greedy)
- {
- // Actually do the closure now
- switch (closureType)
- {
- case '{':
- {
- // We look for our bracket in the list
- boolean found = false;
- int i;
- allocBrackets();
- for (i = 0; i < brackets; i++)
- {
- if (bracketStart[i] == idx)
- {
- found = true;
- break;
- }
- }
-
- // If its not in the list we parse the {m,n}
- if (!found)
- {
- if (brackets >= maxBrackets)
- {
- syntaxError("Too many bracketed closures (limit is 10)");
- }
- bracketStart[brackets] = idx;
- bracket();
- bracketEnd[brackets] = idx;
- i = brackets++;
- }
-
- // If there's a min, rewind stream and reparse
- if (--bracketMin[i] > 0)
- {
- // Rewind stream and run it through again
- idx = idxBeforeTerminal;
- break;
- }
-
- // Do the right thing for maximum ({m,})
- if (bracketOpt[i] == bracketFinished)
- {
- // Drop through now and closure expression.
- // We are done with the {m,} expr, so skip rest
- closureType = '*';
- bracketOpt[i] = 0;
- idx = bracketEnd[i];
- }
- else
- if (bracketOpt[i] == bracketUnbounded)
- {
- idx = idxBeforeTerminal;
- bracketOpt[i] = bracketFinished;
- break;
- }
- else
- if (bracketOpt[i]-- > 0)
- {
- // Drop through to optionally close and then 'play it again sam!'
- idx = idxBeforeTerminal;
- closureType = '?';
- }
- else
- {
- // We are done. skip the rest of {m,n} expr
- idx = bracketEnd[i];
- break;
- }
- }
-
- // Fall through!
-
- case '?':
- case '*':
-
- if (!greedy)
- {
- break;
- }
-
- if (closureType == '?')
- {
- // X? is compiled as (X|)
- nodeInsert(RE.OP_BRANCH, 0, ret); // branch before X
- setNextOfEnd(ret, node (RE.OP_BRANCH, 0)); // inserted branch to option
- int nothing = node (RE.OP_NOTHING, 0); // which is OP_NOTHING
- setNextOfEnd(ret, nothing); // point (second) branch to OP_NOTHING
- setNextOfEnd(ret + RE.nodeSize, nothing); // point the end of X to OP_NOTHING node
- }
-
- if (closureType == '*')
- {
- // X* is compiled as (X{gotoX}|)
- nodeInsert(RE.OP_BRANCH, 0, ret); // branch before X
- setNextOfEnd(ret + RE.nodeSize, node(RE.OP_BRANCH, 0)); // end of X points to an option
- setNextOfEnd(ret + RE.nodeSize, node(RE.OP_GOTO, 0)); // to goto
- setNextOfEnd(ret + RE.nodeSize, ret); // the start again
- setNextOfEnd(ret, node(RE.OP_BRANCH, 0)); // the other option is
- setNextOfEnd(ret, node(RE.OP_NOTHING, 0)); // OP_NOTHING
- }
- break;
-
- case '+':
- {
- // X+ is compiled as X({gotoX}|)
- int branch;
- branch = node(RE.OP_BRANCH, 0); // a new branch
- setNextOfEnd(ret, branch); // is added to the end of X
- setNextOfEnd(node(RE.OP_GOTO, 0), ret); // one option is to go back to the start
- setNextOfEnd(branch, node(RE.OP_BRANCH, 0)); // the other option
- setNextOfEnd(ret, node(RE.OP_NOTHING, 0)); // is OP_NOTHING
- }
- break;
- }
- }
- else
- {
- // Add end after closured subexpr
- setNextOfEnd(ret, node(RE.OP_END, 0));
-
- // Actually do the closure now
- switch (closureType)
- {
- case '?':
- nodeInsert(RE.OP_RELUCTANTMAYBE, 0, ret);
- break;
-
- case '*':
- nodeInsert(RE.OP_RELUCTANTSTAR, 0, ret);
- break;
-
- case '+':
- nodeInsert(RE.OP_RELUCTANTPLUS, 0, ret);
- break;
- }
-
- // Point to the expr after the closure
- setNextOfEnd(ret, lenInstruction);
- }
- return ret;
- }
-
- /**
- * Compile one branch of an or operator (implements concatenation)
- * @param flags Flags passed by reference
- * @return Pointer to branch node
- * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
- */
- int branch(int[] flags) throws RESyntaxException
- {
- // Get each possibly closured piece and concat
- int node;
- int ret = node(RE.OP_BRANCH, 0);
- int chain = -1;
- int[] closureFlags = new int[1];
- boolean nullable = true;
- while (idx < len && pattern.charAt(idx) != '|' && pattern.charAt(idx) != ')')
- {
- // Get new node
- closureFlags[0] = NODE_NORMAL;
- node = closure(closureFlags);
- if (closureFlags[0] == NODE_NORMAL)
- {
- nullable = false;
- }
-
- // If there's a chain, append to the end
- if (chain != -1)
- {
- setNextOfEnd(chain, node);
- }
-
- // Chain starts at current
- chain = node;
- }
-
- // If we don't run loop, make a nothing node
- if (chain == -1)
- {
- node(RE.OP_NOTHING, 0);
- }
-
- // Set nullable flag for this branch
- if (nullable)
- {
- flags[0] |= NODE_NULLABLE;
- }
- return ret;
- }
-
- /**
- * Compile an expression with possible parens around it. Paren matching
- * is done at this level so we can tie the branch tails together.
- * @param flags Flag value passed by reference
- * @return Node index of expression in instruction array
- * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
- */
- int expr(int[] flags) throws RESyntaxException
- {
- // Create open paren node unless we were called from the top level (which has no parens)
- boolean paren = false;
- int ret = -1;
- int closeParens = parens;
- if ((flags[0] & NODE_TOPLEVEL) == 0 && pattern.charAt(idx) == '(')
- {
- idx++;
- paren = true;
- ret = node(RE.OP_OPEN, parens++);
- }
- flags[0] &= ~NODE_TOPLEVEL;
-
- // Create a branch node
- int branch = branch(flags);
- if (ret == -1)
- {
- ret = branch;
- }
- else
- {
- setNextOfEnd(ret, branch);
- }
-
- // Loop through branches
- while (idx < len && pattern.charAt(idx) == '|')
- {
- idx++;
- branch = branch(flags);
- setNextOfEnd(ret, branch);
- }
-
- // Create an ending node (either a close paren or an OP_END)
- int end;
- if (paren)
- {
- if (idx < len && pattern.charAt(idx) == ')')
- {
- idx++;
- }
- else
- {
- syntaxError("Missing close paren");
- }
- end = node(RE.OP_CLOSE, closeParens);
- }
- else
- {
- end = node(RE.OP_END, 0);
- }
-
- // Append the ending node to the ret nodelist
- setNextOfEnd(ret, end);
-
- // Hook the ends of each branch to the end node
- for (int next = -1, i = ret; next != 0; next = instruction[i + RE.offsetNext], i += next)
- {
- // If branch, make the end of the branch's operand chain point to the end node.
- if (instruction[i + RE.offsetOpcode] == RE.OP_BRANCH)
- {
- setNextOfEnd(i + RE.nodeSize, end);
- }
- }
-
- // Return the node list
- return ret;
- }
-
- /**
- * Compiles a regular expression pattern into a program runnable by the pattern
- * matcher class 'RE'.
- * @param pattern Regular expression pattern to compile (see RECompiler class
- * for details).
- * @return A compiled regular expression program.
- * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
- * @see RECompiler
- * @see RE
- */
- public REProgram compile(String pattern) throws RESyntaxException
- {
- // Initialize variables for compilation
- this.pattern = pattern; // Save pattern in instance variable
- len = pattern.length(); // Precompute pattern length for speed
- idx = 0; // Set parsing index to the first character
- lenInstruction = 0; // Set emitted instruction count to zero
- parens = 1; // Set paren level to 1 (the implicit outer parens)
- brackets = 0; // No bracketed closures yet
-
- // Initialize pass by reference flags value
- int[] flags = { NODE_TOPLEVEL };
-
- // Parse expression
- expr(flags);
-
- // Should be at end of input
- if (idx != len)
- {
- if (pattern.charAt(idx) == ')')
- {
- syntaxError("Unmatched close paren");
- }
- syntaxError("Unexpected input remains");
- }
-
- // Return the result
- char[] ins = new char[lenInstruction];
- System.arraycopy(instruction, 0, ins, 0, lenInstruction);
- return new REProgram(ins);
- }
-
- /**
- * Local, nested class for maintaining character ranges for character classes.
- */
- class RERange
- {
- int size = 16; // Capacity of current range arrays
- int[] minRange = new int[size]; // Range minima
- int[] maxRange = new int[size]; // Range maxima
- int num = 0; // Number of range array elements in use
-
- /**
- * Deletes the range at a given index from the range lists
- * @param index Index of range to delete from minRange and maxRange arrays.
- */
- void delete(int index)
- {
- // Return if no elements left or index is out of range
- if (num == 0 || index >= num)
- {
- return;
- }
-
- // Move elements down
- while (index++ < num)
- {
- if (index - 1 >= 0)
- {
- minRange[index-1] = minRange[index];
- maxRange[index-1] = maxRange[index];
- }
- }
-
- // One less element now
- num--;
- }
-
- /**
- * Merges a range into the range list, coalescing ranges if possible.
- * @param min Minimum end of range
- * @param max Maximum end of range
- */
- void merge(int min, int max)
- {
- // Loop through ranges
- for (int i = 0; i < num; i++)
- {
- // Min-max is subsumed by minRange[i]-maxRange[i]
- if (min >= minRange[i] && max <= maxRange[i])
- {
- return;
- }
-
- // Min-max subsumes minRange[i]-maxRange[i]
- else if (min <= minRange[i] && max >= maxRange[i])
- {
- delete(i);
- merge(min, max);
- return;
- }
-
- // Min is in the range, but max is outside
- else if (min >= minRange[i] && min <= maxRange[i])
- {
- delete(i);
- min = minRange[i];
- merge(min, max);
- return;
- }
-
- // Max is in the range, but min is outside
- else if (max >= minRange[i] && max <= maxRange[i])
- {
- delete(i);
- max = maxRange[i];
- merge(min, max);
- return;
- }
- }
-
- // Must not overlap any other ranges
- if (num >= size)
- {
- size *= 2;
- int[] newMin = new int[size];
- int[] newMax = new int[size];
- System.arraycopy(minRange, 0, newMin, 0, num);
- System.arraycopy(maxRange, 0, newMax, 0, num);
- minRange = newMin;
- maxRange = newMax;
- }
- minRange[num] = min;
- maxRange[num] = max;
- num++;
- }
-
- /**
- * Removes a range by deleting or shrinking all other ranges
- * @param min Minimum end of range
- * @param max Maximum end of range
- */
- void remove(int min, int max)
- {
- // Loop through ranges
- for (int i = 0; i < num; i++)
- {
- // minRange[i]-maxRange[i] is subsumed by min-max
- if (minRange[i] >= min && maxRange[i] <= max)
- {
- delete(i);
- i--;
- return;
- }
-
- // min-max is subsumed by minRange[i]-maxRange[i]
- else if (min >= minRange[i] && max <= maxRange[i])
- {
- int minr = minRange[i];
- int maxr = maxRange[i];
- delete(i);
- if (minr < min - 1)
- {
- merge(minr, min - 1);
- }
- if (max + 1 < maxr)
- {
- merge(max + 1, maxr);
- }
- return;
- }
-
- // minRange is in the range, but maxRange is outside
- else if (minRange[i] >= min && minRange[i] <= max)
- {
- minRange[i] = max + 1;
- return;
- }
-
- // maxRange is in the range, but minRange is outside
- else if (maxRange[i] >= min && maxRange[i] <= max)
- {
- maxRange[i] = min - 1;
- return;
- }
- }
- }
-
- /**
- * Includes (or excludes) the range from min to max, inclusive.
- * @param min Minimum end of range
- * @param max Maximum end of range
- * @param include True if range should be included. False otherwise.
- */
- void include(int min, int max, boolean include)
- {
- if (include)
- {
- merge(min, max);
- }
- else
- {
- remove(min, max);
- }
- }
-
- /**
- * Includes a range with the same min and max
- * @param minmax Minimum and maximum end of range (inclusive)
- * @param include True if range should be included. False otherwise.
- */
- void include(char minmax, boolean include)
- {
- include(minmax, minmax, include);
- }
- }
- }