- package com.sun.org.apache.regexp.internal;
-
- /*
- * ====================================================================
- *
- * The Apache Software License, Version 1.1
- *
- * Copyright (c) 1999 The Apache Software Foundation. All rights
- * reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * 3. The end-user documentation included with the redistribution, if
- * any, must include the following acknowlegement:
- * "This product includes software developed by the
- * Apache Software Foundation (http://www.apache.org/)."
- * Alternately, this acknowlegement may appear in the software itself,
- * if and wherever such third-party acknowlegements normally appear.
- *
- * 4. The names "The Jakarta Project", "Jakarta-Regexp", and "Apache Software
- * Foundation" must not be used to endorse or promote products derived
- * from this software without prior written permission. For written
- * permission, please contact apache@apache.org.
- *
- * 5. Products derived from this software may not be called "Apache"
- * nor may "Apache" appear in their names without prior written
- * permission of the Apache Group.
- *
- * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
- * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
- * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
- * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
- * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
- * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- * ====================================================================
- *
- * This software consists of voluntary contributions made by many
- * individuals on behalf of the Apache Software Foundation. For more
- * information on the Apache Software Foundation, please see
- * <http://www.apache.org/>.
- *
- */
-
- import java.util.Vector;
-
- /**
- * RE is an efficient, lightweight regular expression evaluator/matcher class.
- * Regular expressions are pattern descriptions which enable sophisticated matching of
- * strings. In addition to being able to match a string against a pattern, you
- * can also extract parts of the match. This is especially useful in text parsing!
- * Details on the syntax of regular expression patterns are given below.
- *
- * <p>
- *
- * To compile a regular expression (RE), you can simply construct an RE matcher
- * object from the string specification of the pattern, like this:
- *
- * <pre>
- *
- * RE r = new RE("a*b");
- *
- * </pre>
- *
- * <p>
- *
- * Once you have done this, you can call either of the RE.match methods to
- * perform matching on a String. For example:
- *
- * <pre>
- *
- * boolean matched = r.match("aaaab");
- *
- * </pre>
- *
- * will cause the boolean matched to be set to true because the
- * pattern "a*b" matches the string "aaaab".
- *
- * <p>
- * If you were interested in the <i>number</i> of a's which matched the first
- * part of our example expression, you could change the expression to
- * "(a*)b". Then when you compiled the expression and matched it against
- * something like "xaaaab", you would get results like this:
- *
- * <pre>
- *
- * RE r = new RE("(a*)b"); // Compile expression
- * boolean matched = r.match("xaaaab"); // Match against "xaaaab"
- *
- * <br>
- *
- * String wholeExpr = r.getParen(0); // wholeExpr will be 'aaaab'
- * String insideParens = r.getParen(1); // insideParens will be 'aaaa'
- *
- * <br>
- *
- * int startWholeExpr = getParenStart(0); // startWholeExpr will be index 1
- * int endWholeExpr = getParenEnd(0); // endWholeExpr will be index 6
- * int lenWholeExpr = getParenLength(0); // lenWholeExpr will be 5
- *
- * <br>
- *
- * int startInside = getParenStart(1); // startInside will be index 1
- * int endInside = getParenEnd(1); // endInside will be index 5
- * int lenInside = getParenLength(1); // lenInside will be 4
- *
- * </pre>
- *
- * You can also refer to the contents of a parenthesized expression within
- * a regular expression itself. This is called a 'backreference'. The first
- * backreference in a regular expression is denoted by \1, the second by \2
- * and so on. So the expression:
- *
- * <pre>
- *
- * ([0-9]+)=\1
- *
- * </pre>
- *
- * will match any string of the form n=n (like 0=0 or 2=2).
- *
- * <p>
- *
- * The full regular expression syntax accepted by RE is described here:
- *
- * <pre>
- *
- * <br>
- *
- * <b><font face=times roman>Characters</font></b>
- *
- * <br>
- *
- * <i>unicodeChar</i> Matches any identical unicode character
- * \ Used to quote a meta-character (like '*')
- * \\ Matches a single '\' character
- * \0nnn Matches a given octal character
- * \xhh Matches a given 8-bit hexadecimal character
- * \\uhhhh Matches a given 16-bit hexadecimal character
- * \t Matches an ASCII tab character
- * \n Matches an ASCII newline character
- * \r Matches an ASCII return character
- * \f Matches an ASCII form feed character
- *
- * <br>
- *
- * <b><font face=times roman>Character Classes</font></b>
- *
- * <br>
- *
- * [abc] Simple character class
- * [a-zA-Z] Character class with ranges
- * [^abc] Negated character class
- *
- * <br>
- *
- * <b><font face=times roman>Standard POSIX Character Classes</font></b>
- *
- * <br>
- *
- * [:alnum:] Alphanumeric characters.
- * [:alpha:] Alphabetic characters.
- * [:blank:] Space and tab characters.
- * [:cntrl:] Control characters.
- * [:digit:] Numeric characters.
- * [:graph:] Characters that are printable and are also visible. (A space is printable, but not visible, while an `a' is both.)
- * [:lower:] Lower-case alphabetic characters.
- * [:print:] Printable characters (characters that are not control characters.)
- * [:punct:] Punctuation characters (characters that are not letter, digits, control characters, or space characters).
- * [:space:] Space characters (such as space, tab, and formfeed, to name a few).
- * [:upper:] Upper-case alphabetic characters.
- * [:xdigit:] Characters that are hexadecimal digits.
- *
- * <br>
- *
- * <b><font face=times roman>Non-standard POSIX-style Character Classes</font></b>
- *
- * <br>
- *
- * [:javastart:] Start of a Java identifier
- * [:javapart:] Part of a Java identifier
- *
- * <br>
- *
- * <b><font face=times roman>Predefined Classes</font></b>
- *
- * <br>
- *
- * . Matches any character other than newline
- * \w Matches a "word" character (alphanumeric plus "_")
- * \W Matches a non-word character
- * \s Matches a whitespace character
- * \S Matches a non-whitespace character
- * \d Matches a digit character
- * \D Matches a non-digit character
- *
- * <br>
- *
- * <b><font face=times roman>Boundary Matchers</font></b>
- *
- * <br>
- *
- * ^ Matches only at the beginning of a line
- * $ Matches only at the end of a line
- * \b Matches only at a word boundary
- * \B Matches only at a non-word boundary
- *
- * <br>
- *
- * <b><font face=times roman>Greedy Closures</font></b>
- *
- * <br>
- *
- * A* Matches A 0 or more times (greedy)
- * A+ Matches A 1 or more times (greedy)
- * A? Matches A 1 or 0 times (greedy)
- * A{n} Matches A exactly n times (greedy)
- * A{n,} Matches A at least n times (greedy)
- * A{n,m} Matches A at least n but not more than m times (greedy)
- *
- * <br>
- *
- * <b><font face=times roman>Reluctant Closures</font></b>
- *
- * <br>
- *
- * A*? Matches A 0 or more times (reluctant)
- * A+? Matches A 1 or more times (reluctant)
- * A?? Matches A 0 or 1 times (reluctant)
- *
- * <br>
- *
- * <b><font face=times roman>Logical Operators</font></b>
- *
- * <br>
- *
- * AB Matches A followed by B
- * A|B Matches either A or B
- * (A) Used for subexpression grouping
- *
- * <br>
- *
- * <b><font face=times roman>Backreferences</font></b>
- *
- * <br>
- *
- * \1 Backreference to 1st parenthesized subexpression
- * \2 Backreference to 2nd parenthesized subexpression
- * \3 Backreference to 3rd parenthesized subexpression
- * \4 Backreference to 4th parenthesized subexpression
- * \5 Backreference to 5th parenthesized subexpression
- * \6 Backreference to 6th parenthesized subexpression
- * \7 Backreference to 7th parenthesized subexpression
- * \8 Backreference to 8th parenthesized subexpression
- * \9 Backreference to 9th parenthesized subexpression
- *
- * <br>
- *
- * </pre>
- *
- * <p>
- *
- * All closure operators (+, *, ?, {m,n}) are greedy by default, meaning that they
- * match as many elements of the string as possible without causing the overall
- * match to fail. If you want a closure to be reluctant (non-greedy), you can
- * simply follow it with a '?'. A reluctant closure will match as few elements
- * of the string as possible when finding matches. {m,n} closures don't currently
- * support reluctancy.
- *
- * <p>
- *
- * RE runs programs compiled by the RECompiler class. But the RE matcher class
- * does not include the actual regular expression compiler for reasons of
- * efficiency. In fact, if you want to pre-compile one or more regular expressions,
- * the 'recompile' class can be invoked from the command line to produce compiled
- * output like this:
- *
- * <pre>
- *
- * // Pre-compiled regular expression "a*b"
- * char[] re1Instructions =
- * {
- * 0x007c, 0x0000, 0x001a, 0x007c, 0x0000, 0x000d, 0x0041,
- * 0x0001, 0x0004, 0x0061, 0x007c, 0x0000, 0x0003, 0x0047,
- * 0x0000, 0xfff6, 0x007c, 0x0000, 0x0003, 0x004e, 0x0000,
- * 0x0003, 0x0041, 0x0001, 0x0004, 0x0062, 0x0045, 0x0000,
- * 0x0000,
- * };
- *
- * <br>
- *
- * REProgram re1 = new REProgram(re1Instructions);
- *
- * </pre>
- *
- * You can then construct a regular expression matcher (RE) object from the pre-compiled
- * expression re1 and thus avoid the overhead of compiling the expression at runtime.
- * If you require more dynamic regular expressions, you can construct a single RECompiler
- * object and re-use it to compile each expression. Similarly, you can change the
- * program run by a given matcher object at any time. However, RE and RECompiler are
- * not threadsafe (for efficiency reasons, and because requiring thread safety in this
- * class is deemed to be a rare requirement), so you will need to construct a separate
- * compiler or matcher object for each thread (unless you do thread synchronization
- * yourself).
- *
- * </pre>
- * <br><p><br>
- *
- * <font color=red>
- * <i>ISSUES:</i>
- *
- * <ul>
- * <li>com.weusours.util.re is not currently compatible with all standard POSIX regcomp flags
- * <li>com.weusours.util.re does not support POSIX equivalence classes ([=foo=] syntax) (I18N/locale issue)
- * <li>com.weusours.util.re does not support nested POSIX character classes (definitely should, but not completely trivial)
- * <li>com.weusours.util.re Does not support POSIX character collation concepts ([.foo.] syntax) (I18N/locale issue)
- * <li>Should there be different matching styles (simple, POSIX, Perl etc?)
- * <li>Should RE support character iterators (for backwards RE matching!)?
- * <li>Should RE support reluctant {m,n} closures (does anyone care)?
- * <li>Not *all* possibilities are considered for greediness when backreferences
- * are involved (as POSIX suggests should be the case). The POSIX RE
- * "(ac*)c*d[ac]*\1", when matched against "acdacaa" should yield a match
- * of acdacaa where \1 is "a". This is not the case in this RE package,
- * and actually Perl doesn't go to this extent either! Until someone
- * actually complains about this, I'm not sure it's worth "fixing".
- * If it ever is fixed, test #137 in RETest.txt should be updated.
- * </ul>
- *
- * </font>
- *
- * @see recompile
- * @see RECompiler
- *
- * @author <a href="mailto:jonl@muppetlabs.com">Jonathan Locke</a>
- * @version $Id: RE.java,v 1.6 2000/08/22 17:19:38 jon Exp $
- */
- public class RE
- {
- /**
- * Specifies normal, case-sensitive matching behaviour.
- */
- public static final int MATCH_NORMAL = 0x0000;
-
- /**
- * Flag to indicate that matching should be case-independent (folded)
- */
- public static final int MATCH_CASEINDEPENDENT = 0x0001;
-
- /**
- * Newlines should match as BOL/EOL (^ and $)
- */
- public static final int MATCH_MULTILINE = 0x0002;
-
- /**
- * Consider all input a single body of text - newlines are matched by .
- */
- public static final int MATCH_SINGLELINE = 0x0004;
-
- /************************************************
- * *
- * The format of a node in a program is: *
- * *
- * [ OPCODE ] [ OPDATA ] [ OPNEXT ] [ OPERAND ] *
- * *
- * char OPCODE - instruction *
- * char OPDATA - modifying data *
- * char OPNEXT - next node (relative offset) *
- * *
- ************************************************/
-
- // Opcode Char Opdata/Operand Meaning
- // ---------- ---------- --------------- --------------------------------------------------
- static final char OP_END = 'E'; // end of program
- static final char OP_BOL = '^'; // match only if at beginning of line
- static final char OP_EOL = '$'; // match only if at end of line
- static final char OP_ANY = '.'; // match any single character except newline
- static final char OP_ANYOF = '['; // count/ranges match any char in the list of ranges
- static final char OP_BRANCH = '|'; // node match this alternative or the next one
- static final char OP_ATOM = 'A'; // length/string length of string followed by string itself
- static final char OP_STAR = '*'; // node kleene closure
- static final char OP_PLUS = '+'; // node positive closure
- static final char OP_MAYBE = '?'; // node optional closure
- static final char OP_ESCAPE = '\\'; // escape special escape code char class (escape is E_* code)
- static final char OP_OPEN = '('; // number nth opening paren
- static final char OP_CLOSE = ')'; // number nth closing paren
- static final char OP_BACKREF = '#'; // number reference nth already matched parenthesized string
- static final char OP_GOTO = 'G'; // nothing but a (back-)pointer
- static final char OP_NOTHING = 'N'; // match null string such as in '(a|)'
- static final char OP_RELUCTANTSTAR = '8'; // none/expr reluctant '*' (mnemonic for char is unshifted '*')
- static final char OP_RELUCTANTPLUS = '='; // none/expr reluctant '+' (mnemonic for char is unshifted '+')
- static final char OP_RELUCTANTMAYBE = '/'; // none/expr reluctant '?' (mnemonic for char is unshifted '?')
- static final char OP_POSIXCLASS = 'P'; // classid one of the posix character classes
-
- // Escape codes
- static final char E_ALNUM = 'w'; // Alphanumeric
- static final char E_NALNUM = 'W'; // Non-alphanumeric
- static final char E_BOUND = 'b'; // Word boundary
- static final char E_NBOUND = 'B'; // Non-word boundary
- static final char E_SPACE = 's'; // Whitespace
- static final char E_NSPACE = 'S'; // Non-whitespace
- static final char E_DIGIT = 'd'; // Digit
- static final char E_NDIGIT = 'D'; // Non-digit
-
- // Posix character classes
- static final char POSIX_CLASS_ALNUM = 'w'; // Alphanumerics
- static final char POSIX_CLASS_ALPHA = 'a'; // Alphabetics
- static final char POSIX_CLASS_BLANK = 'b'; // Blanks
- static final char POSIX_CLASS_CNTRL = 'c'; // Control characters
- static final char POSIX_CLASS_DIGIT = 'd'; // Digits
- static final char POSIX_CLASS_GRAPH = 'g'; // Graphic characters
- static final char POSIX_CLASS_LOWER = 'l'; // Lowercase characters
- static final char POSIX_CLASS_PRINT = 'p'; // Printable characters
- static final char POSIX_CLASS_PUNCT = '!'; // Punctuation
- static final char POSIX_CLASS_SPACE = 's'; // Spaces
- static final char POSIX_CLASS_UPPER = 'u'; // Uppercase characters
- static final char POSIX_CLASS_XDIGIT = 'x'; // Hexadecimal digits
- static final char POSIX_CLASS_JSTART = 'j'; // Java identifier start
- static final char POSIX_CLASS_JPART = 'k'; // Java identifier part
-
- // Limits
- static final int maxNode = 65536; // Maximum number of nodes in a program
- static final int maxParen = 16; // Number of paren pairs (only 9 can be backrefs)
-
- // Node layout constants
- static final int offsetOpcode = 0; // Opcode offset (first character)
- static final int offsetOpdata = 1; // Opdata offset (second char)
- static final int offsetNext = 2; // Next index offset (third char)
- static final int nodeSize = 3; // Node size (in chars)
-
- /** Line Separator */
- static final String NEWLINE = System.getProperty("line.separator");
-
- // State of current program
- REProgram program; // Compiled regular expression 'program'
- CharacterIterator search; // The string being matched against
- int idx; // Current index in string being searched
- int matchFlags; // Match behaviour flags
-
- // Parenthesized subexpressions
- int parenCount; // Number of subexpressions matched (num open parens + 1)
- int start0; // Cache of start[0]
- int end0; // Cache of start[0]
- int start1; // Cache of start[1]
- int end1; // Cache of start[1]
- int start2; // Cache of start[2]
- int end2; // Cache of start[2]
- int[] startn; // Lazy-alloced array of sub-expression starts
- int[] endn; // Lazy-alloced array of sub-expression ends
-
- // Backreferences
- int[] startBackref; // Lazy-alloced array of backref starts
- int[] endBackref; // Lazy-alloced array of backref ends
-
- /**
- * Constructs a regular expression matcher from a String by compiling it
- * using a new instance of RECompiler. If you will be compiling many
- * expressions, you may prefer to use a single RECompiler object instead.
- * @param pattern The regular expression pattern to compile.
- * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
- * @see RECompiler
- * @see recompile
- */
- public RE(String pattern) throws RESyntaxException
- {
- this(pattern, MATCH_NORMAL);
- }
-
- /**
- * Constructs a regular expression matcher from a String by compiling it
- * using a new instance of RECompiler. If you will be compiling many
- * expressions, you may prefer to use a single RECompiler object instead.
- * @param pattern The regular expression pattern to compile.
- * @param matchFlags The matching style
- * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
- * @see RECompiler
- * @see recompile
- */
- public RE(String pattern, int matchFlags) throws RESyntaxException
- {
- this(new RECompiler().compile(pattern));
- setMatchFlags(matchFlags);
- }
-
- /**
- * Construct a matcher for a pre-compiled regular expression from program
- * (bytecode) data. Permits special flags to be passed in to modify matching
- * behaviour.
- * @param program Compiled regular expression program (see RECompiler and/or recompile)
- * @param matchFlags One or more of the RE match behaviour flags (RE.MATCH_*):
- *
- * <pre>
- *
- * MATCH_NORMAL // Normal (case-sensitive) matching
- * MATCH_CASEINDEPENDENT // Case folded comparisons
- * MATCH_MULTILINE // Newline matches as BOL/EOL
- *
- * </pre>
- *
- * @see RECompiler
- * @see REProgram
- * @see recompile
- */
- public RE(REProgram program, int matchFlags)
- {
- setProgram(program);
- setMatchFlags(matchFlags);
- }
-
- /**
- * Construct a matcher for a pre-compiled regular expression from program
- * (bytecode) data.
- * @param program Compiled regular expression program
- * @see RECompiler
- * @see recompile
- */
- public RE(REProgram program)
- {
- this(program, MATCH_NORMAL);
- }
-
- /**
- * Constructs a regular expression matcher with no initial program.
- * This is likely to be an uncommon practice, but is still supported.
- */
- public RE()
- {
- this((REProgram)null, MATCH_NORMAL);
- }
-
- /**
- * Converts a 'simplified' regular expression to a full regular expression
- * @param pattern The pattern to convert
- * @return The full regular expression
- */
- public static String simplePatternToFullRegularExpression(String pattern)
- {
- StringBuffer buf = new StringBuffer();
- for (int i = 0; i < pattern.length(); i++)
- {
- char c = pattern.charAt(i);
- switch (c)
- {
- case '*':
- buf.append(".*");
- break;
-
- case '.':
- case '[':
- case ']':
- case '\\':
- case '+':
- case '?':
- case '{':
- case '}':
- case '$':
- case '^':
- case '|':
- case '(':
- case ')':
- buf.append('\\');
- default:
- buf.append(c);
- break;
- }
- }
- return buf.toString();
- }
-
- /**
- * Sets match behaviour flags which alter the way RE does matching.
- * @param matchFlags One or more of the RE match behaviour flags (RE.MATCH_*):
- *
- * <pre>
- *
- * MATCH_NORMAL // Normal (case-sensitive) matching
- * MATCH_CASEINDEPENDENT // Case folded comparisons
- * MATCH_MULTILINE // Newline matches as BOL/EOL
- *
- * </pre>
- *
- */
- public void setMatchFlags(int matchFlags)
- {
- this.matchFlags = matchFlags;
- }
-
- /**
- * Returns the current match behaviour flags.
- * @return Current match behaviour flags (RE.MATCH_*).
- *
- * <pre>
- *
- * MATCH_NORMAL // Normal (case-sensitive) matching
- * MATCH_CASEINDEPENDENT // Case folded comparisons
- * MATCH_MULTILINE // Newline matches as BOL/EOL
- *
- * </pre>
- *
- * @see #setMatchFlags
- *
- */
- public int getMatchFlags()
- {
- return matchFlags;
- }
-
- /**
- * Sets the current regular expression program used by this matcher object.
- * @param program Regular expression program compiled by RECompiler.
- * @see RECompiler
- * @see REProgram
- * @see recompile
- */
- public void setProgram(REProgram program)
- {
- this.program = program;
- }
-
- /**
- * Returns the current regular expression program in use by this matcher object.
- * @return Regular expression program
- * @see #setProgram
- */
- public REProgram getProgram()
- {
- return program;
- }
-
- /**
- * Returns the number of parenthesized subexpressions available after a successful match.
- * @return Number of available parenthesized subexpressions
- */
- public int getParenCount()
- {
- return parenCount;
- }
-
- /**
- * Gets the contents of a parenthesized subexpression after a successful match.
- * @param which Nesting level of subexpression
- * @return String
- */
- public String getParen(int which)
- {
- int start;
- if (which < parenCount && (start = getParenStart(which)) >= 0)
- {
- return search.substring(start, getParenEnd(which));
- }
- return null;
- }
-
- /**
- * Returns the start index of a given paren level.
- * @param which Nesting level of subexpression
- * @return String index
- */
- public final int getParenStart(int which)
- {
- if (which < parenCount)
- {
- switch (which)
- {
- case 0:
- return start0;
-
- case 1:
- return start1;
-
- case 2:
- return start2;
-
- default:
- if (startn == null)
- {
- allocParens();
- }
- return startn[which];
- }
- }
- return -1;
- }
-
- /**
- * Returns the end index of a given paren level.
- * @param which Nesting level of subexpression
- * @return String index
- */
- public final int getParenEnd(int which)
- {
- if (which < parenCount)
- {
- switch (which)
- {
- case 0:
- return end0;
-
- case 1:
- return end1;
-
- case 2:
- return end2;
-
- default:
- if (endn == null)
- {
- allocParens();
- }
- return endn[which];
- }
- }
- return -1;
- }
-
- /**
- * Returns the length of a given paren level.
- * @param which Nesting level of subexpression
- * @return Number of characters in the parenthesized subexpression
- */
- public final int getParenLength(int which)
- {
- if (which < parenCount)
- {
- return getParenEnd(which) - getParenStart(which);
- }
- return -1;
- }
-
- /**
- * Sets the start of a paren level
- * @param which Which paren level
- * @param i Index in input array
- */
- protected final void setParenStart(int which, int i)
- {
- if (which < parenCount)
- {
- switch (which)
- {
- case 0:
- start0 = i;
- break;
-
- case 1:
- start1 = i;
- break;
-
- case 2:
- start2 = i;
- break;
-
- default:
- if (startn == null)
- {
- allocParens();
- }
- startn[which] = i;
- break;
- }
- }
- }
-
- /**
- * Sets the end of a paren level
- * @param which Which paren level
- * @param i Index in input array
- */
- protected final void setParenEnd(int which, int i)
- {
- if (which < parenCount)
- {
- switch (which)
- {
- case 0:
- end0 = i;
- break;
-
- case 1:
- end1 = i;
- break;
-
- case 2:
- end2 = i;
- break;
-
- default:
- if (endn == null)
- {
- allocParens();
- }
- endn[which] = i;
- break;
- }
- }
- }
-
- /**
- * Throws an Error representing an internal error condition probably resulting
- * from a bug in the regular expression compiler (or possibly data corruption).
- * In practice, this should be very rare.
- * @param s Error description
- */
- protected void internalError(String s) throws Error
- {
- throw new Error("RE internal error: " + s);
- }
-
- /**
- * Performs lazy allocation of subexpression arrays
- */
- private final void allocParens()
- {
- // Allocate arrays for subexpressions
- startn = new int[maxParen];
- endn = new int[maxParen];
-
- // Set sub-expression pointers to invalid values
- for (int i = 0; i < maxParen; i++)
- {
- startn[i] = -1;
- endn[i] = -1;
- }
- }
-
- /**
- * Try to match a string against a subset of nodes in the program
- * @param firstNode Node to start at in program
- * @param lastNode Last valid node (used for matching a subexpression without
- * matching the rest of the program as well).
- * @param idxStart Starting position in character array
- * @return Final input array index if match succeeded. -1 if not.
- */
- protected int matchNodes(int firstNode, int lastNode, int idxStart)
- {
- // Our current place in the string
- int idx = idxStart;
-
- // Loop while node is valid
- int next, opcode, opdata;
- int idxNew;
- char[] instruction = program.instruction;
- for (int node = firstNode; node < lastNode; )
- {
- opcode = instruction[node + offsetOpcode];
- next = node + (short)instruction[node + offsetNext];
- opdata = instruction[node + offsetOpdata];
-
- switch (opcode)
- {
- case OP_RELUCTANTMAYBE:
- {
- int once = 0;
- do
- {
- // Try to match the rest without using the reluctant subexpr
- if ((idxNew = matchNodes(next, maxNode, idx)) != -1)
- {
- return idxNew;
- }
- }
- while ((once++ == 0) && (idx = matchNodes(node + nodeSize, next, idx)) != -1);
- return -1;
- }
-
- case OP_RELUCTANTPLUS:
- while ((idx = matchNodes(node + nodeSize, next, idx)) != -1)
- {
- // Try to match the rest without using the reluctant subexpr
- if ((idxNew = matchNodes(next, maxNode, idx)) != -1)
- {
- return idxNew;
- }
- }
- return -1;
-
- case OP_RELUCTANTSTAR:
- do
- {
- // Try to match the rest without using the reluctant subexpr
- if ((idxNew = matchNodes(next, maxNode, idx)) != -1)
- {
- return idxNew;
- }
- }
- while ((idx = matchNodes(node + nodeSize, next, idx)) != -1);
- return -1;
-
- case OP_OPEN:
-
- // Match subexpression
- if ((program.flags & REProgram.OPT_HASBACKREFS) != 0)
- {
- startBackref[opdata] = idx;
- }
- if ((idxNew = matchNodes(next, maxNode, idx)) != -1)
- {
- // Increase valid paren count
- if ((opdata + 1) > parenCount)
- {
- parenCount = opdata + 1;
- }
-
- // Don't set paren if already set later on
- if (getParenStart(opdata) == -1)
- {
- setParenStart(opdata, idx);
- }
- }
- return idxNew;
-
- case OP_CLOSE:
-
- // Done matching subexpression
- if ((program.flags & REProgram.OPT_HASBACKREFS) != 0)
- {
- endBackref[opdata] = idx;
- }
- if ((idxNew = matchNodes(next, maxNode, idx)) != -1)
- {
- // Increase valid paren count
- if ((opdata + 1) > parenCount)
- {
- parenCount = opdata + 1;
- }
-
- // Don't set paren if already set later on
- if (getParenEnd(opdata) == -1)
- {
- setParenEnd(opdata, idx);
- }
- }
- return idxNew;
-
- case OP_BACKREF:
- {
- // Get the start and end of the backref
- int s = startBackref[opdata];
- int e = endBackref[opdata];
-
- // We don't know the backref yet
- if (s == -1 || e == -1)
- {
- return -1;
- }
-
- // The backref is empty size
- if (s == e)
- {
- break;
- }
-
- // Get the length of the backref
- int l = e - s;
-
- // If there's not enough input left, give up.
- if (search.isEnd(idx + l - 1))
- {
- return -1;
- }
-
- // Case fold the backref?
- if ((matchFlags & MATCH_CASEINDEPENDENT) != 0)
- {
- // Compare backref to input, case-folding as we go
- for (int i = 0; i < l; i++)
- {
- if (Character.toLowerCase(search.charAt(idx++)) != Character.toLowerCase(search.charAt(s + i)))
- {
- return -1;
- }
- }
- }
- else
- {
- // Compare backref to input
- for (int i = 0; i < l; i++)
- {
- if (search.charAt(idx++) != search.charAt(s + i))
- {
- return -1;
- }
- }
- }
- }
- break;
-
- case OP_BOL:
-
- // Fail if we're not at the start of the string
- if (idx != 0)
- {
- // If we're multiline matching, we could still be at the start of a line
- if ((matchFlags & MATCH_MULTILINE) == MATCH_MULTILINE)
- {
- // If not at start of line, give up
- if (idx <= 0 || !isNewline(idx - 1)) {
- return -1;
- } else {
- break;
- }
- }
- return -1;
- }
- break;
-
- case OP_EOL:
-
- // If we're not at the end of string
- if (!search.isEnd(0) && !search.isEnd(idx))
- {
- // If we're multi-line matching
- if ((matchFlags & MATCH_MULTILINE) == MATCH_MULTILINE)
- {
- // Give up if we're not at the end of a line
- if (! isNewline(idx)) {
- return -1;
- } else {
- break;
- }
- }
- return -1;
- }
- break;
-
- case OP_ESCAPE:
-
- // Which escape?
- switch (opdata)
- {
- // Word boundary match
- case E_NBOUND:
- case E_BOUND:
- {
- char cLast = ((idx == getParenStart(0)) ? '\n' : search.charAt(idx - 1));
- char cNext = ((search.isEnd(idx)) ? '\n' : search.charAt(idx));
- if ((Character.isLetterOrDigit(cLast) == Character.isLetterOrDigit(cNext)) == (opdata == E_BOUND))
- {
- return -1;
- }
- }
- break;
-
- // Alpha-numeric, digit, space, javaLetter, javaLetterOrDigit
- case E_ALNUM:
- case E_NALNUM:
- case E_DIGIT:
- case E_NDIGIT:
- case E_SPACE:
- case E_NSPACE:
-
- // Give up if out of input
- if (search.isEnd(idx))
- {
- return -1;
- }
-
- // Switch on escape
- switch (opdata)
- {
- case E_ALNUM:
- case E_NALNUM:
- if (!(Character.isLetterOrDigit(search.charAt(idx)) == (opdata == E_ALNUM)))
- {
- return -1;
- }
- break;
-
- case E_DIGIT:
- case E_NDIGIT:
- if (!(Character.isDigit(search.charAt(idx)) == (opdata == E_DIGIT)))
- {
- return -1;
- }
- break;
-
- case E_SPACE:
- case E_NSPACE:
- if (!(Character.isWhitespace(search.charAt(idx)) == (opdata == E_SPACE)))
- {
- return -1;
- }
- break;
- }
- idx++;
- break;
-
- default:
- internalError("Unrecognized escape '" + opdata + "'");
- }
- break;
-
- case OP_ANY:
-
- if((matchFlags & MATCH_SINGLELINE) == MATCH_SINGLELINE) {
- // Match anything
- if(search.isEnd(idx))
- {
- return -1;
- }
- idx++;
- break;
- }
- else
- {
- // Match anything but a newline
- if (search.isEnd(idx) || search.charAt(idx++) == '\n')
- {
- return -1;
- }
- break;
- }
-
- case OP_ATOM:
- {
- // Match an atom value
- if (search.isEnd(idx))
- {
- return -1;
- }
-
- // Get length of atom and starting index
- int lenAtom = opdata;
- int startAtom = node + nodeSize;
-
- // Give up if not enough input remains to have a match
- if (search.isEnd(lenAtom + idx - 1))
- {
- return -1;
- }
-
- // Match atom differently depending on casefolding flag
- if ((matchFlags & MATCH_CASEINDEPENDENT) != 0)
- {
- for (int i = 0; i < lenAtom; i++)
- {
- if (Character.toLowerCase(search.charAt(idx++)) != Character.toLowerCase(instruction[startAtom + i]))
- {
- return -1;
- }
- }
- }
- else
- {
- for (int i = 0; i < lenAtom; i++)
- {
- if (search.charAt(idx++) != instruction[startAtom + i])
- {
- return -1;
- }
- }
- }
- }
- break;
-
- case OP_POSIXCLASS:
- {
- // Out of input?
- if (search.isEnd(idx))
- {
- return -1;
- }
-
- switch (opdata)
- {
- case POSIX_CLASS_ALNUM:
- if (!Character.isLetterOrDigit(search.charAt(idx)))
- {
- return -1;
- }
- break;
-
- case POSIX_CLASS_ALPHA:
- if (!Character.isLetter(search.charAt(idx)))
- {
- return -1;
- }
- break;
-
- case POSIX_CLASS_DIGIT:
- if (!Character.isDigit(search.charAt(idx)))
- {
- return -1;
- }
- break;
-
- case POSIX_CLASS_BLANK: // JWL - bugbug: is this right??
- if (!Character.isSpaceChar(search.charAt(idx)))
- {
- return -1;
- }
- break;
-
- case POSIX_CLASS_SPACE:
- if (!Character.isWhitespace(search.charAt(idx)))
- {
- return -1;
- }
- break;
-
- case POSIX_CLASS_CNTRL:
- if (Character.getType(search.charAt(idx)) != Character.CONTROL)
- {
- return -1;
- }
- break;
-
- case POSIX_CLASS_GRAPH: // JWL - bugbug???
- switch (Character.getType(search.charAt(idx)))
- {
- case Character.MATH_SYMBOL:
- case Character.CURRENCY_SYMBOL:
- case Character.MODIFIER_SYMBOL:
- case Character.OTHER_SYMBOL:
- break;
-
- default:
- return -1;
- }
- break;
-
- case POSIX_CLASS_LOWER:
- if (Character.getType(search.charAt(idx)) != Character.LOWERCASE_LETTER)
- {
- return -1;
- }
- break;
-
- case POSIX_CLASS_UPPER:
- if (Character.getType(search.charAt(idx)) != Character.UPPERCASE_LETTER)
- {
- return -1;
- }
- break;
-
- case POSIX_CLASS_PRINT:
- if (Character.getType(search.charAt(idx)) == Character.CONTROL)
- {
- return -1;
- }
- break;
-
- case POSIX_CLASS_PUNCT:
- {
- int type = Character.getType(search.charAt(idx));
- switch(type)
- {
- case Character.DASH_PUNCTUATION:
- case Character.START_PUNCTUATION:
- case Character.END_PUNCTUATION:
- case Character.CONNECTOR_PUNCTUATION:
- case Character.OTHER_PUNCTUATION:
- break;
-
- default:
- return -1;
- }
- }
- break;
-
- case POSIX_CLASS_XDIGIT: // JWL - bugbug??
- {
- boolean isXDigit = ((search.charAt(idx) >= '0' && search.charAt(idx) <= '9') ||
- (search.charAt(idx) >= 'a' && search.charAt(idx) <= 'f') ||
- (search.charAt(idx) >= 'A' && search.charAt(idx) <= 'F'));
- if (!isXDigit)
- {
- return -1;
- }
- }
- break;
-
- case POSIX_CLASS_JSTART:
- if (!Character.isJavaIdentifierStart(search.charAt(idx)))
- {
- return -1;
- }
- break;
-
- case POSIX_CLASS_JPART:
- if (!Character.isJavaIdentifierPart(search.charAt(idx)))
- {
- return -1;
- }
- break;
-
- default:
- internalError("Bad posix class");
- break;
- }
-
- // Matched.
- idx++;
- }
- break;
-
- case OP_ANYOF:
- {
- // Out of input?
- if (search.isEnd(idx))
- {
- return -1;
- }
-
- // Get character to match against character class and maybe casefold
- char c = search.charAt(idx);
- boolean caseFold = (matchFlags & MATCH_CASEINDEPENDENT) != 0;
- if (caseFold)
- {
- c = Character.toLowerCase(c);
- }
-
- // Loop through character class checking our match character
- int idxRange = node + nodeSize;
- int idxEnd = idxRange + (opdata * 2);
- boolean match = false;
- for (int i = idxRange; i < idxEnd; )
- {
- // Get start, end and match characters
- char s = instruction[i++];
- char e = instruction[i++];
-
- // Fold ends of range and match character
- if (caseFold)
- {
- s = Character.toLowerCase(s);
- e = Character.toLowerCase(e);
- }
-
- // If the match character is in range, break out
- if (c >= s && c <= e)
- {
- match = true;
- break;
- }
- }
-
- // Fail if we didn't match the character class
- if (!match)
- {
- return -1;
- }
- idx++;
- }
- break;
-
- case OP_BRANCH:
- {
- // Check for choices
- if (instruction[next + offsetOpcode] != OP_BRANCH)
- {
- // If there aren't any other choices, just evaluate this branch.
- node += nodeSize;
- continue;
- }
-
- // Try all available branches
- short nextBranch;
- do
- {
- // Try matching the branch against the string
- if ((idxNew = matchNodes(node + nodeSize, maxNode, idx)) != -1)
- {
- return idxNew;
- }
-
- // Go to next branch (if any)
- nextBranch = (short)instruction[node + offsetNext];
- node += nextBranch;
- }
- while (nextBranch != 0 && (instruction[node + offsetOpcode] == OP_BRANCH));
-
- // Failed to match any branch!
- return -1;
- }
-
- case OP_NOTHING:
- case OP_GOTO:
-
- // Just advance to the next node without doing anything
- break;
-
- case OP_END:
-
- // Match has succeeded!
- setParenEnd(0, idx);
- return idx;
-
- default:
-
- // Corrupt program
- internalError("Invalid opcode '" + opcode + "'");
- }
-
- // Advance to the next node in the program
- node = next;
- }
-
- // We "should" never end up here
- internalError("Corrupt program");
- return -1;
- }
-
- /**
- * Match the current regular expression program against the current
- * input string, starting at index i of the input string. This method
- * is only meant for internal use.
- * @param i The input string index to start matching at
- * @return True if the input matched the expression
- */
- protected boolean matchAt(int i)
- {
- // Initialize start pointer, paren cache and paren count
- start0 = -1;
- end0 = -1;
- start1 = -1;
- end1 = -1;
- start2 = -1;
- end2 = -1;
- startn = null;
- endn = null;
- parenCount = 1;
- setParenStart(0, i);
-
- // Allocate backref arrays (unless optimizations indicate otherwise)
- if ((program.flags & REProgram.OPT_HASBACKREFS) != 0)
- {
- startBackref = new int[maxParen];
- endBackref = new int[maxParen];
- }
-
- // Match against string
- int idx;
- if ((idx = matchNodes(0, maxNode, i)) != -1)
- {
- setParenEnd(0, idx);
- return true;
- }
-
- // Didn't match
- parenCount = 0;
- return false;
- }
-
- /**
- * Matches the current regular expression program against a character array,
- * starting at a given index.
- * @param search String to match against
- * @param i Index to start searching at
- * @return True if string matched
- */
- public boolean match(String search, int i)
- {
- return match(new StringCharacterIterator(search), i);
- }
-
- /**
- * Matches the current regular expression program against a character array,
- * starting at a given index.
- * @param search String to match against
- * @param i Index to start searching at
- * @return True if string matched
- */
- public boolean match(CharacterIterator search, int i)
- {
- // There is no compiled program to search with!
- if (program == null)
- {
- // This should be uncommon enough to be an error case rather
- // than an exception (which would have to be handled everywhere)
- internalError("No RE program to run!");
- }
-
- // Save string to search
- this.search = search;
-
- // Can we optimize the search by looking for a prefix string?
- if (program.prefix == null)
- {
- // Unprefixed matching must try for a match at each character
- for ( ;! search.isEnd(i - 1); i++)
- {
- // Try a match at index i
- if (matchAt(i))
- {
- return true;
- }
- }
- return false;
- }
- else
- {
- // Prefix-anchored matching is possible
- boolean caseIndependent = (matchFlags & MATCH_CASEINDEPENDENT) != 0;
- char[] prefix = program.prefix;
- for ( ;! search.isEnd(i + prefix.length - 1); i++)
- {
- // If the first character of the prefix matches
- boolean match = false;
- if (caseIndependent)
- match = Character.toLowerCase(search.charAt(i)) == Character.toLowerCase(prefix[0]);
- else
- match = search.charAt(i) == prefix[0];
- if (match)
- {
- // Save first character position
- int firstChar = i++;
- int k;
- for (k = 1; k < prefix.length; )
- {
- // If there's a mismatch of any character in the prefix, give up
- if (caseIndependent)
- match = Character.toLowerCase(search.charAt(i++)) == Character.toLowerCase(prefix[k++]);
- else
- match = search.charAt(i++) == prefix[k++];
- if (!match)
- {
- break;
- }
- }
-
- // See if the whole prefix string matched
- if (k == prefix.length)
- {
- // We matched the full prefix at firstChar, so try it
- if (matchAt(firstChar))
- {
- return true;
- }
- }
-
- // Match failed, reset i to continue the search
- i = firstChar;
- }
- }
- return false;
- }
- }
-
- /**
- * Matches the current regular expression program against a String.
- * @param search String to match against
- * @return True if string matched
- */
- public boolean match(String search)
- {
- return match(search, 0);
- }
-
- /**
- * Splits a string into an array of strings on regular expression boundaries.
- * This function works the same way as the Perl function of the same name.
- * Given a regular expression of "[ab]+" and a string to split of
- * "xyzzyababbayyzabbbab123", the result would be the array of Strings
- * "[xyzzy, yyz, 123]".
- * @param s String to split on this regular exression
- * @return Array of strings
- */
- public String[] split(String s)
- {
- // Create new vector
- Vector v = new Vector();
-
- // Start at position 0 and search the whole string
- int pos = 0;
- int len = s.length();
-
- // Try a match at each position
- while (pos < len && match(s, pos))
- {
- // Get start of match
- int start = getParenStart(0);
-
- // Get end of match
- int newpos = getParenEnd(0);
-
- // Check if no progress was made
- if (newpos == pos)
- {
- v.addElement(s.substring(pos, start + 1));
- newpos++;
- }
- else
- {
- v.addElement(s.substring(pos, start));
- }
-
- // Move to new position
- pos = newpos;
- }
-
- // Push remainder if it's not empty
- String remainder = s.substring(pos);
- if (remainder.length() != 0)
- {
- v.addElement(remainder);
- }
-
- // Return vector as an array of strings
- String[] ret = new String[v.size()];
- v.copyInto(ret);
- return ret;
- }
-
- /**
- * Flag bit that indicates that subst should replace all occurrences of this
- * regular expression.
- */
- public static final int REPLACE_ALL = 0x0000;
-
- /**
- * Flag bit that indicates that subst should only replace the first occurrence
- * of this regular expression.
- */
- public static final int REPLACE_FIRSTONLY = 0x0001;
-
- /**
- * Substitutes a string for this regular expression in another string.
- * This method works like the Perl function of the same name.
- * Given a regular expression of "a*b", a String to substituteIn of
- * "aaaabfooaaabgarplyaaabwackyb" and the substitution String "-", the
- * resulting String returned by subst would be "-foo-garply-wacky-".
- * @param substituteIn String to substitute within
- * @param substitution String to substitute for all matches of this regular expression.
- * @return The string substituteIn with zero or more occurrences of the current
- * regular expression replaced with the substitution String (if this regular
- * expression object doesn't match at any position, the original String is returned
- * unchanged).
- */
- public String subst(String substituteIn, String substitution)
- {
- return subst(substituteIn, substitution, REPLACE_ALL);
- }
-
- /**
- * Substitutes a string for this regular expression in another string.
- * This method works like the Perl function of the same name.
- * Given a regular expression of "a*b", a String to substituteIn of
- * "aaaabfooaaabgarplyaaabwackyb" and the substitution String "-", the
- * resulting String returned by subst would be "-foo-garply-wacky-".
- * @param substituteIn String to substitute within
- * @param substitution String to substitute for matches of this regular expression
- * @param flags One or more bitwise flags from REPLACE_*. If the REPLACE_FIRSTONLY
- * flag bit is set, only the first occurrence of this regular expression is replaced.
- * If the bit is not set (REPLACE_ALL), all occurrences of this pattern will be
- * replaced.
- * @return The string substituteIn with zero or more occurrences of the current
- * regular expression replaced with the substitution String (if this regular
- * expression object doesn't match at any position, the original String is returned
- * unchanged).
- */
- public String subst(String substituteIn, String substitution, int flags)
- {
- // String to return
- StringBuffer ret = new StringBuffer();
-
- // Start at position 0 and search the whole string
- int pos = 0;
- int len = substituteIn.length();
-
- // Try a match at each position
- while (pos < len && match(substituteIn, pos))
- {
- // Append string before match
- ret.append(substituteIn.substring(pos, getParenStart(0)));
-
- // Append substitution
- ret.append(substitution);
-
- // Move forward, skipping past match
- int newpos = getParenEnd(0);
-
- // We always want to make progress!
- if (newpos == pos)
- {
- newpos++;
- }
-
- // Try new position
- pos = newpos;
-
- // Break out if we're only supposed to replace one occurrence
- if ((flags & REPLACE_FIRSTONLY) != 0)
- {
- break;
- }
- }
-
- // If there's remaining input, append it
- if (pos < len)
- {
- ret.append(substituteIn.substring(pos));
- }
-
- // Return string buffer as string
- return ret.toString();
- }
-
- /**
- * Returns an array of Strings, whose toString representation matches a regular
- * expression. This method works like the Perl function of the same name. Given
- * a regular expression of "a*b" and an array of String objects of [foo, aab, zzz,
- * aaaab], the array of Strings returned by grep would be [aab, aaaab].
- * @param search Array of Objects to search
- * @return Array of Objects whose toString value matches this regular expression.
- */
- public String[] grep(Object[] search)
- {
- // Create new vector to hold return items
- Vector v = new Vector();
-
- // Traverse array of objects
- for (int i = 0; i < search.length; i++)
- {
- // Get next object as a string
- String s = search[i].toString();
-
- // If it matches this regexp, add it to the list
- if (match(s))
- {
- v.addElement(s);
- }
- }
-
- // Return vector as an array of strings
- String[] ret = new String[v.size()];
- v.copyInto(ret);
- return ret;
- }
-
- /** @return true if at the i-th position in the 'search' a newline ends */
- private boolean isNewline(int i) {
-
- if (i < NEWLINE.length() - 1) {
- return false;
- }
-
- if (search.charAt(i) == '\n') {
- return true;
- }
-
- for (int j = NEWLINE.length() - 1; j >= 0; j--, i--) {
- if (NEWLINE.charAt(j) != search.charAt(i)) {
- return false;
- }
- }
- return true;
- }
- }