- /*
- * The Apache Software License, Version 1.1
- *
- *
- * Copyright (c) 1999-2002 The Apache Software Foundation. All rights
- * reserved.
- *
- * Redistribution and use in source and binary forms, with or without
- * modification, are permitted provided that the following conditions
- * are met:
- *
- * 1. Redistributions of source code must retain the above copyright
- * notice, this list of conditions and the following disclaimer.
- *
- * 2. Redistributions in binary form must reproduce the above copyright
- * notice, this list of conditions and the following disclaimer in
- * the documentation and/or other materials provided with the
- * distribution.
- *
- * 3. The end-user documentation included with the redistribution,
- * if any, must include the following acknowledgment:
- * "This product includes software developed by the
- * Apache Software Foundation (http://www.apache.org/)."
- * Alternately, this acknowledgment may appear in the software itself,
- * if and wherever such third-party acknowledgments normally appear.
- *
- * 4. The names "Xerces" and "Apache Software Foundation" must
- * not be used to endorse or promote products derived from this
- * software without prior written permission. For written
- * permission, please contact apache@apache.org.
- *
- * 5. Products derived from this software may not be called "Apache",
- * nor may "Apache" appear in their name, without prior written
- * permission of the Apache Software Foundation.
- *
- * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
- * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
- * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
- * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
- * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
- * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
- * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
- * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
- * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
- * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
- * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
- * SUCH DAMAGE.
- * ====================================================================
- *
- * This software consists of voluntary contributions made by many
- * individuals on behalf of the Apache Software Foundation and was
- * originally based on software copyright (c) 1999, International
- * Business Machines, Inc., http://www.apache.org. For more
- * information on the Apache Software Foundation, please see
- * <http://www.apache.org/>.
- */
-
- package com.sun.org.apache.xerces.internal.impl.xpath.regex;
-
- import java.util.Vector;
- import java.util.Hashtable;
-
- /**
- * This class represents a node in the parse tree of a regular expression.
- *
- * @version $Id: Token.java,v 1.7 2003/02/25 14:43:13 sandygao Exp $
- */
- class Token implements java.io.Serializable {
- static final boolean COUNTTOKENS = true;
- static int tokens = 0;
-
- static final int CHAR = 0; // Literal char
- static final int DOT = 11; // .
- static final int CONCAT = 1; // XY
- static final int UNION = 2; // X|Y|Z
- static final int CLOSURE = 3; // X*
- static final int RANGE = 4; // [a-zA-Z] etc.
- static final int NRANGE = 5; // [^a-zA-Z] etc.
- static final int PAREN = 6; // (X) or (?:X)
- static final int EMPTY = 7; //
- static final int ANCHOR = 8; // ^ $ \b \B \< \> \A \Z \z
- static final int NONGREEDYCLOSURE = 9; // *? +?
- static final int STRING = 10; // strings
- static final int BACKREFERENCE = 12; // back references
- static final int LOOKAHEAD = 20; // (?=...)
- static final int NEGATIVELOOKAHEAD = 21; // (?!...)
- static final int LOOKBEHIND = 22; // (?<=...)
- static final int NEGATIVELOOKBEHIND = 23; // (?<!...)
- static final int INDEPENDENT = 24; // (?>...)
- static final int MODIFIERGROUP = 25; // (?ims-ims:...)
- static final int CONDITION = 26; // (?(...)yes|no)
-
- static final int UTF16_MAX = 0x10ffff;
-
- int type;
-
- static Token token_dot;
- static Token token_0to9;
- static Token token_wordchars;
- static Token token_not_0to9;
- static Token token_not_wordchars;
- static Token token_spaces;
- static Token token_not_spaces;
- static Token token_empty;
- static Token token_linebeginning;
- static Token token_linebeginning2;
- static Token token_lineend;
- static Token token_stringbeginning;
- static Token token_stringend;
- static Token token_stringend2;
- static Token token_wordedge;
- static Token token_not_wordedge;
- static Token token_wordbeginning;
- static Token token_wordend;
- static {
-     // Shared instance handed out by createEmpty().
-     token_empty = new Token(EMPTY);
-
-     // Anchor tokens; the character argument identifies the anchor kind.
-     token_linebeginning = createAnchor('^');
-     token_linebeginning2 = createAnchor('@');
-     token_lineend = createAnchor('$');
-     token_stringbeginning = createAnchor('A');
-     token_stringend = createAnchor('z');
-     token_stringend2 = createAnchor('Z');
-     token_wordedge = createAnchor('b');
-     token_not_wordedge = createAnchor('B');
-     token_wordbeginning = createAnchor('<');
-     token_wordend = createAnchor('>');
-
-     token_dot = new Token(DOT);
-
-     // ASCII digits.
-     Token digits = createRange();
-     digits.addRange('0', '9');
-     token_0to9 = digits;
-
-     // ASCII word characters: digits, letters and underscore.
-     Token wordChars = createRange();
-     wordChars.addRange('0', '9');
-     wordChars.addRange('A', 'Z');
-     wordChars.addRange('_', '_');
-     wordChars.addRange('a', 'z');
-     token_wordchars = wordChars;
-
-     // Tab, line feed, form feed, carriage return and space.
-     Token spaces = createRange();
-     spaces.addRange('\t', '\t');
-     spaces.addRange('\n', '\n');
-     spaces.addRange('\f', '\f');
-     spaces.addRange('\r', '\r');
-     spaces.addRange(' ', ' ');
-     token_spaces = spaces;
-
-     // Negated counterparts of the three ranges above.
-     token_not_0to9 = complementRanges(token_0to9);
-     token_not_wordchars = complementRanges(token_wordchars);
-     token_not_spaces = complementRanges(token_spaces);
- }
-
- /**
-  * Creates a {@link Token.ParenToken} of the given type around {@code child},
-  * passing 0 as the third constructor argument.
-  */
- static Token.ParenToken createLook(int type, Token child) {
-     if (COUNTTOKENS) {
-         Token.tokens += 1;
-     }
-     return new Token.ParenToken(type, child, 0);
- }
- /**
-  * Creates a PAREN token around {@code child} with the given paren number.
-  */
- static Token.ParenToken createParen(Token child, int pnumber) {
-     if (COUNTTOKENS) {
-         Token.tokens += 1;
-     }
-     return new Token.ParenToken(Token.PAREN, child, pnumber);
- }
- /**
-  * Creates a CLOSURE token (X*) over {@code tok}.
-  */
- static Token.ClosureToken createClosure(Token tok) {
-     if (COUNTTOKENS) {
-         Token.tokens += 1;
-     }
-     return new Token.ClosureToken(Token.CLOSURE, tok);
- }
- /**
-  * Creates a NONGREEDYCLOSURE token (*? +?) over {@code tok}.
-  */
- static Token.ClosureToken createNGClosure(Token tok) {
-     if (COUNTTOKENS) {
-         Token.tokens += 1;
-     }
-     return new Token.ClosureToken(Token.NONGREEDYCLOSURE, tok);
- }
- /**
-  * Creates a {@link Token.ConcatToken} from the two given tokens.
-  */
- static Token.ConcatToken createConcat(Token tok1, Token tok2) {
-     if (COUNTTOKENS) {
-         Token.tokens += 1;
-     }
-     return new Token.ConcatToken(tok1, tok2);
- }
- /**
-  * Creates a CONCAT-typed token with no children yet. It is deliberately
-  * backed by {@link Token.UnionToken}; this is not a bug.
-  */
- static Token.UnionToken createConcat() {
-     if (COUNTTOKENS) {
-         Token.tokens += 1;
-     }
-     return new Token.UnionToken(Token.CONCAT); // *** It is not a bug.
- }
- /**
-  * Creates a UNION token (X|Y|Z) with no children yet.
-  */
- static Token.UnionToken createUnion() {
-     if (COUNTTOKENS) {
-         Token.tokens += 1;
-     }
-     return new Token.UnionToken(Token.UNION);
- }
- static Token createEmpty() {
- return Token.token_empty;
- }
- /**
-  * Creates an empty RANGE token ([a-zA-Z] etc.).
-  */
- static RangeToken createRange() {
-     if (COUNTTOKENS) {
-         Token.tokens += 1;
-     }
-     return new RangeToken(Token.RANGE);
- }
- /**
-  * Creates an empty NRANGE token ([^a-zA-Z] etc.).
-  */
- static RangeToken createNRange() {
-     if (COUNTTOKENS) {
-         Token.tokens += 1;
-     }
-     return new RangeToken(Token.NRANGE);
- }
- /**
-  * Creates a CHAR token for the literal character {@code ch}.
-  */
- static Token.CharToken createChar(int ch) {
-     if (COUNTTOKENS) {
-         Token.tokens += 1;
-     }
-     return new Token.CharToken(Token.CHAR, ch);
- }
- /**
-  * Creates an ANCHOR token; {@code ch} identifies which anchor it is.
-  */
- static private Token.CharToken createAnchor(int ch) {
-     if (COUNTTOKENS) {
-         Token.tokens += 1;
-     }
-     return new Token.CharToken(Token.ANCHOR, ch);
- }
- /**
-  * Creates a BACKREFERENCE token for group {@code refno}; it carries no string.
-  */
- static Token.StringToken createBackReference(int refno) {
-     if (COUNTTOKENS) {
-         Token.tokens += 1;
-     }
-     return new Token.StringToken(Token.BACKREFERENCE, null, refno);
- }
- /**
-  * Creates a STRING token for {@code str}, with reference number 0.
-  */
- static Token.StringToken createString(String str) {
-     if (COUNTTOKENS) {
-         Token.tokens += 1;
-     }
-     return new Token.StringToken(Token.STRING, str, 0);
- }
- /**
-  * Creates a modifier-group token ((?ims-ims:...)) around {@code child},
-  * forwarding {@code add} and {@code mask} to the ModifierToken constructor.
-  */
- static Token.ModifierToken createModifierGroup(Token child, int add, int mask) {
-     if (COUNTTOKENS) {
-         Token.tokens += 1;
-     }
-     return new Token.ModifierToken(child, add, mask);
- }
- /**
-  * Creates a CONDITION token ((?(...)yes|no)) from a reference number, a
-  * condition token and the yes/no patterns.
-  */
- static Token.ConditionToken createCondition(int refno, Token condition,
-                                             Token yespat, Token nopat) {
-     if (COUNTTOKENS) {
-         Token.tokens += 1;
-     }
-     return new Token.ConditionToken(refno, condition, yespat, nopat);
- }
-
- /**
-  * Creates a token of the given type (one of the type constants of this class).
-  */
- protected Token(int type) {
-     this.type = type;
- }
-
- /**
-  * Returns the number of children. This base implementation always
-  * reports 0.
-  */
- int size() {
-     return 0;
- }
- /**
-  * Returns the child at {@code index}. This base implementation has no
-  * children and always returns null.
-  */
- Token getChild(int index) {
-     return null;
- }
- /**
-  * Adds a child token. Not supported by this base implementation.
-  *
-  * @throws RuntimeException always
-  */
- void addChild(Token tok) {
-     throw new RuntimeException("Not supported.");
- }
-
- // for RANGE or NRANGE
- /**
-  * Adds the range {@code start}..{@code end}; meant for RANGE or NRANGE
-  * tokens. Not supported by this base implementation.
-  *
-  * @throws RuntimeException always
-  */
- protected void addRange(int start, int end) {
-     throw new RuntimeException("Not supported.");
- }
- /**
-  * Range operation for RANGE or NRANGE tokens. Not supported by this base
-  * implementation.
-  *
-  * @throws RuntimeException always
-  */
- protected void sortRanges() {
-     throw new RuntimeException("Not supported.");
- }
- /**
-  * Range operation for RANGE or NRANGE tokens. Not supported by this base
-  * implementation.
-  *
-  * @throws RuntimeException always
-  */
- protected void compactRanges() {
-     throw new RuntimeException("Not supported.");
- }
- /**
-  * Range operation taking another token; meant for RANGE or NRANGE tokens.
-  * Not supported by this base implementation.
-  *
-  * @throws RuntimeException always
-  */
- protected void mergeRanges(Token tok) {
-     throw new RuntimeException("Not supported.");
- }
- /**
-  * Range operation taking another token; meant for RANGE or NRANGE tokens.
-  * Not supported by this base implementation.
-  *
-  * @throws RuntimeException always
-  */
- protected void subtractRanges(Token tok) {
-     throw new RuntimeException("Not supported.");
- }
- /**
-  * Range operation taking another token; meant for RANGE or NRANGE tokens.
-  * Not supported by this base implementation.
-  *
-  * @throws RuntimeException always
-  */
- protected void intersectRanges(Token tok) {
-     throw new RuntimeException("Not supported.");
- }
- static Token complementRanges(Token tok) {
- return RangeToken.complementRanges(tok);
- }
-
-
- /** For CLOSURE tokens; this base implementation ignores the value. */
- void setMin(int min) {
-     // intentionally empty
- }
- /** For CLOSURE tokens; this base implementation ignores the value. */
- void setMax(int max) {
-     // intentionally empty
- }
- /** For CLOSURE tokens; this base implementation returns -1. */
- int getMin() {
-     return -1;
- }
- /** For CLOSURE tokens; this base implementation returns -1. */
- int getMax() {
-     return -1;
- }
- /** For STRING tokens; this base implementation returns 0. */
- int getReferenceNumber() {
-     return 0;
- }
- /** For STRING tokens; this base implementation returns null. */
- String getString() {
-     return null;
- }
-
- /** Returns the paren number; this base implementation returns 0. */
- int getParenNumber() {
-     return 0;
- }
- /** Returns the character of this token; this base implementation returns -1. */
- int getChar() {
-     return -1;
- }
-
- /**
-  * Returns the textual form of this token using options value 0.
-  */
- public String toString() {
-     final int noOptions = 0;
-     return this.toString(noOptions);
- }
- /**
-  * Returns "." for a DOT token and the empty string for every other type;
-  * {@code options} is not used by this base implementation.
-  */
- public String toString(int options) {
-     if (this.type == Token.DOT) {
-         return ".";
-     }
-     return "";
- }
-
- /**
-  * Computes a lower bound on the number of characters a match of this
-  * token consumes.
-  *
-  * @return the minimum length; 0 for tokens that can match without
-  *         consuming input
-  * @throws RuntimeException if the token type is not one of the known types
-  */
- final int getMinLength() {
-     switch (this.type) {
-       case CONCAT: {
-         // A sequence needs at least the sum of what its parts need.
-         int total = 0;
-         for (int i = 0; i < this.size(); i ++)
-             total += this.getChild(i).getMinLength();
-         return total;
-       }
-
-       case CONDITION:
-         // With a single child (presumably no "no" pattern) the condition
-         // can be satisfied without consuming the child, which is also how
-         // analyzeFirstCharacter() treats this case. Reporting the child's
-         // length here would overestimate the lower bound.
-         if (this.size() == 1)
-             return 0;
-         // fall through: with both alternatives present take the shorter one
-       case UNION: {
-         if (this.size() == 0)
-             return 0;
-         int shortest = this.getChild(0).getMinLength();
-         for (int i = 1; i < this.size(); i ++) {
-             int candidate = this.getChild(i).getMinLength();
-             if (candidate < shortest) shortest = candidate;
-         }
-         return shortest;
-       }
-
-       case CLOSURE:
-       case NONGREEDYCLOSURE:
-         // A negative minimum means no lower repeat count was set.
-         if (this.getMin() >= 0)
-             return this.getMin() * this.getChild(0).getMinLength();
-         return 0;
-
-       case EMPTY:
-       case ANCHOR:
-         return 0;
-
-       case DOT:
-       case CHAR:
-       case RANGE:
-       case NRANGE:
-         return 1;
-
-       case INDEPENDENT:
-       case PAREN:
-       case MODIFIERGROUP:
-         return this.getChild(0).getMinLength();
-
-       case BACKREFERENCE:
-         return 0;                           // *******
-
-       case STRING:
-         return this.getString().length();
-
-       case LOOKAHEAD:
-       case NEGATIVELOOKAHEAD:
-       case LOOKBEHIND:
-       case NEGATIVELOOKBEHIND:
-         return 0;                           // ***** Really?
-
-       default:
-         throw new RuntimeException("Token#getMinLength(): Invalid Type: "+this.type);
-     }
- }
-
- /**
-  * Computes an upper bound on the number of characters a match of this
-  * token consumes.
-  *
-  * @return the maximum length, or -1 when no finite bound is known
-  * @throws RuntimeException if the token type is not one of the known types
-  */
- final int getMaxLength() {
-     switch (this.type) {
-       case CONCAT: {
-         int total = 0;
-         for (int i = 0; i < this.size(); i ++) {
-             int part = this.getChild(i).getMaxLength();
-             if (part < 0) return -1;        // one unbounded part makes the whole unbounded
-             total += part;
-         }
-         return total;
-       }
-
-       case CONDITION:
-       case UNION: {
-         if (this.size() == 0)
-             return 0;
-         int longest = this.getChild(0).getMaxLength();
-         for (int i = 1; longest >= 0 && i < this.size(); i ++) {
-             int candidate = this.getChild(i).getMaxLength();
-             if (candidate < 0) {            // infinity
-                 longest = -1;
-                 break;
-             }
-             if (candidate > longest) longest = candidate;
-         }
-         return longest < 0 ? -1 : longest;
-       }
-
-       case CLOSURE:
-       case NONGREEDYCLOSURE: {
-         int repeat = this.getMax();
-         if (repeat < 0)
-             return -1;                      // no upper repeat count
-         int each = this.getChild(0).getMaxLength();
-         if (repeat == 0 || each == 0)
-             return 0;
-         if (each < 0)
-             return -1;                      // unbounded child: report the -1 sentinel, not repeat * -1
-         // Multiply in long so that a large repeat count cannot wrap
-         // around into a bogus small or negative bound.
-         long product = (long)repeat * (long)each;
-         return product > Integer.MAX_VALUE ? -1 : (int)product;
-       }
-
-       case EMPTY:
-       case ANCHOR:
-         return 0;
-
-       case CHAR:
-         return 1;
-       case DOT:
-       case RANGE:
-       case NRANGE:
-         return 2;
-
-       case INDEPENDENT:
-       case PAREN:
-       case MODIFIERGROUP:
-         return this.getChild(0).getMaxLength();
-
-       case BACKREFERENCE:
-         return -1;                          // ******
-
-       case STRING:
-         return this.getString().length();
-
-       case LOOKAHEAD:
-       case NEGATIVELOOKAHEAD:
-       case LOOKBEHIND:
-       case NEGATIVELOOKBEHIND:
-         return 0;                           // ***** Really?
-
-       default:
-         throw new RuntimeException("Token#getMaxLength(): Invalid Type: "+this.type);
-     }
- }
-
- static final int FC_CONTINUE = 0;
- static final int FC_TERMINAL = 1;
- static final int FC_ANY = 2;
- /**
-  * Tells whether every bit of {@code flag} is set in {@code options}.
-  */
- private static final boolean isSet(int options, int flag) {
-     final int common = options & flag;
-     return common == flag;
- }
- /**
-  * Collects into {@code result} the characters with which a match of this
-  * token can begin.
-  *
-  * @param result  range token that receives the candidate first characters
-  * @param options option flags; IGNORE_CASE widens the collected characters,
-  *                and a MODIFIERGROUP adjusts the flags for its child
-  * @return {@link #FC_TERMINAL} when the first character is fully described
-  *         by what was added to {@code result}; {@link #FC_CONTINUE} when
-  *         this token may consume nothing, so the following token has to be
-  *         analyzed as well; {@link #FC_ANY} when any character may come
-  *         first
-  * @throws RuntimeException if the token type is not one of the known types
-  */
- final int analyzeFirstCharacter(RangeToken result, int options) {
-     switch (this.type) {
-       case CONCAT: {
-         // Walk the sequence until a child settles the first character.
-         int ret = FC_CONTINUE;
-         for (int i = 0; i < this.size(); i ++) {
-             ret = this.getChild(i).analyzeFirstCharacter(result, options);
-             if (ret != FC_CONTINUE)
-                 break;
-         }
-         return ret;
-       }
-
-       case UNION: {
-         if (this.size() == 0)
-             return FC_CONTINUE;
-         /*
-          *  a|b|c -> FC_TERMINAL
-          *  a|.|c -> FC_ANY
-          *  a|b|  -> FC_CONTINUE
-          */
-         int ret = FC_CONTINUE;
-         boolean hasEmpty = false;
-         for (int i = 0; i < this.size(); i ++) {
-             ret = this.getChild(i).analyzeFirstCharacter(result, options);
-             if (ret == FC_ANY)
-                 break;
-             else if (ret == FC_CONTINUE)
-                 hasEmpty = true;
-         }
-         return hasEmpty ? FC_CONTINUE : ret;
-       }
-
-       case CONDITION: {
-         int first = this.getChild(0).analyzeFirstCharacter(result, options);
-         if (this.size() == 1) return FC_CONTINUE;
-         if (first == FC_ANY) return first;
-         int second = this.getChild(1).analyzeFirstCharacter(result, options);
-         if (second == FC_ANY) return second;
-         return first == FC_CONTINUE || second == FC_CONTINUE ? FC_CONTINUE : FC_TERMINAL;
-       }
-
-       case CLOSURE:
-       case NONGREEDYCLOSURE:
-         // The child contributes its first characters, but whatever it
-         // reports the closure itself yields FC_CONTINUE.
-         this.getChild(0).analyzeFirstCharacter(result, options);
-         return FC_CONTINUE;
-
-       case EMPTY:
-       case ANCHOR:
-         return FC_CONTINUE;
-
-       case CHAR: {
-         int ch = this.getChar();
-         result.addRange(ch, ch);
-         if (ch < 0x10000 && isSet(options, RegularExpression.IGNORE_CASE)) {
-             ch = Character.toUpperCase((char)ch);
-             result.addRange(ch, ch);
-             ch = Character.toLowerCase((char)ch);
-             result.addRange(ch, ch);
-         }
-         return FC_TERMINAL;
-       }
-
-       case DOT:
-         // DOT consumes one character (getMinLength() reports 1), so it
-         // must not answer FC_CONTINUE: a CONCAT would then take the first
-         // characters of the *following* token as those of the whole
-         // pattern. Handle it like BACKREFERENCE: every character is a
-         // candidate and no optimization is possible. The full range is a
-         // superset of what DOT matches in either line mode.
-         result.addRange(0, UTF16_MAX);
-         return FC_ANY;
-
-       case RANGE:
-         if (isSet(options, RegularExpression.IGNORE_CASE)) {
-             result.mergeRanges(((RangeToken)this).getCaseInsensitiveToken());
-         } else {
-             result.mergeRanges(this);
-         }
-         return FC_TERMINAL;
-
-       case NRANGE:                          // ****
-         if (isSet(options, RegularExpression.IGNORE_CASE)) {
-             result.mergeRanges(Token.complementRanges(((RangeToken)this).getCaseInsensitiveToken()));
-         } else {
-             result.mergeRanges(Token.complementRanges(this));
-         }
-         return FC_TERMINAL;
-
-       case INDEPENDENT:
-       case PAREN:
-         return this.getChild(0).analyzeFirstCharacter(result, options);
-
-       case MODIFIERGROUP: {
-         // Apply the group's added flags, then clear the masked ones.
-         int inner = options;
-         inner |= ((ModifierToken)this).getOptions();
-         inner &= ~((ModifierToken)this).getOptionsMask();
-         return this.getChild(0).analyzeFirstCharacter(result, inner);
-       }
-
-       case BACKREFERENCE:
-         result.addRange(0, UTF16_MAX);      // **** We can not optimize.
-         return FC_ANY;
-
-       case STRING: {
-         String str = this.getString();
-         if (str.length() == 0)
-             return FC_CONTINUE;             // consumes nothing; avoids charAt(0) on ""
-         int cha = str.charAt(0);
-         int ch2;
-         // Combine a leading surrogate pair into one code point.
-         if (REUtil.isHighSurrogate(cha)
-             && str.length() >= 2
-             && REUtil.isLowSurrogate((ch2 = str.charAt(1))))
-             cha = REUtil.composeFromSurrogates(cha, ch2);
-         result.addRange(cha, cha);
-         if (cha < 0x10000 && isSet(options, RegularExpression.IGNORE_CASE)) {
-             cha = Character.toUpperCase((char)cha);
-             result.addRange(cha, cha);
-             cha = Character.toLowerCase((char)cha);
-             result.addRange(cha, cha);
-         }
-         return FC_TERMINAL;
-       }
-
-       case LOOKAHEAD:
-       case NEGATIVELOOKAHEAD:
-       case LOOKBEHIND:
-       case NEGATIVELOOKBEHIND:
-         return FC_CONTINUE;
-
-       default:
-         throw new RuntimeException("Token#analyzeFirstCharacter(): Invalid Type: "+this.type);
-     }
- }
-
- private final boolean isShorterThan(Token tok) {
- if (tok == null) return false;
- /*
- int mylength;
- if (this.type == STRING) mylength = this.getString().length();
- else if (this.type == CHAR) mylength = this.getChar() >= 0x10000 ? 2 : 1;
- else throw new RuntimeException("Internal Error: Illegal type: "+this.type);
- int otherlength;
- if (tok.type == STRING) otherlength = tok.getString().length();
- else if (tok.type == CHAR) otherlength = tok.getChar() >= 0x10000 ? 2 : 1;
- else throw new RuntimeException("Internal Error: Illegal type: "+tok.type);
- */
- int mylength;
- if (this.type == STRING) mylength = this.getString().length();
- else throw new RuntimeException("Internal Error: Illegal type: "+this.type);
- int otherlength;
- if (tok.type == STRING) otherlength = tok.getString().length();
- else throw new RuntimeException("Internal Error: Illegal type: "+tok.type);
- return mylength < otherlength;
- }
-
- /**
-  * Result holder for {@code findFixedString}: the token that was found
-  * (or null) and the options that were in effect for it.
-  */
- static class FixedStringContainer {
-     /** The token found, or null when there is none. */
-     Token token = null;
-     /** The options recorded together with {@link #token}. */
-     int options = 0;
-
-     FixedStringContainer() {
-         // fields keep their initial values
-     }
- }
-
- /**
-  * Searches this token for a STRING token and reports it through
-  * {@code container}. For a CONCAT the longest string among the children
-  * is kept, the earliest one winning ties; token types that cannot supply
-  * a string set {@code container.token} to null.
-  *
-  * @param container receives the token found and the options that applied to it
-  * @param options   option flags in effect; a MODIFIERGROUP adjusts them for its child
-  * @throws RuntimeException if the token type is not one of the known types
-  */
- final void findFixedString(FixedStringContainer container, int options) {
-     switch (this.type) {
-       case CONCAT: {
-         Token best = null;
-         int bestOptions = 0;
-         int index = 0;
-         while (index < this.size()) {
-             this.getChild(index).findFixedString(container, options);
-             // Replace the candidate only when the new one is strictly longer.
-             if (best == null || best.isShorterThan(container.token)) {
-                 best = container.token;
-                 bestOptions = container.options;
-             }
-             index ++;
-         }
-         container.token = best;
-         container.options = bestOptions;
-         return;
-       }
-
-       case CHAR:                            // Ignore CHAR tokens.
-       case UNION:
-       case CLOSURE:
-       case NONGREEDYCLOSURE:
-       case EMPTY:
-       case ANCHOR:
-       case RANGE:
-       case DOT:
-       case NRANGE:
-       case BACKREFERENCE:
-       case LOOKAHEAD:
-       case NEGATIVELOOKAHEAD:
-       case LOOKBEHIND:
-       case NEGATIVELOOKBEHIND:
-       case CONDITION:
-         container.token = null;
-         return;
-
-       case STRING:
-         container.token = this;
-         container.options = options;
-         return;
-
-       case INDEPENDENT:
-       case PAREN:
-         this.getChild(0).findFixedString(container, options);
-         return;
-
-       case MODIFIERGROUP: {
-         // Apply the group's added flags, then clear the masked ones.
-         int inner = options | ((ModifierToken)this).getOptions();
-         inner &= ~((ModifierToken)this).getOptionsMask();
-         this.getChild(0).findFixedString(container, inner);
-         return;
-       }
-
-       default:
-         throw new RuntimeException("Token#findFixedString(): Invalid Type: "+this.type);
-     }
- }
-
- /**
-  * Tests whether this token matches the character {@code ch}. Not
-  * supported by this base implementation.
-  *
-  * @throws RuntimeException always, reporting the token type
-  */
- boolean match(int ch) {
-     throw new RuntimeException("NFAArrow#match(): Internal error: "+this.type);
- }
-
- // ------------------------------------------------------
- private final static Hashtable categories = new Hashtable();
- private final static Hashtable categories2 = new Hashtable();
- private static final String[] categoryNames = {
- "Cn", "Lu", "Ll", "Lt", "Lm", "Lo", "Mn", "Me", "Mc", "Nd",
- "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", null, "Co", "Cs",
- "Pd", "Ps", "Pe", "Pc", "Po", "Sm", "Sc", "Sk", "So", // 28
- "Pi", "Pf", // 29, 30
- "L", "M", "N", "Z", "C", "P", "S", // 31-37
- };
-
- // Schema Rec. {Datatypes} - Punctuation
- static final int CHAR_INIT_QUOTE = 29; // Pi - initial quote
- static final int CHAR_FINAL_QUOTE = 30; // Pf - final quote
- static final int CHAR_LETTER = 31;
- static final int CHAR_MARK = 32;
- static final int CHAR_NUMBER = 33;
- static final int CHAR_SEPARATOR = 34;
- static final int CHAR_OTHER = 35;
- static final int CHAR_PUNCTUATION = 36;
- static final int CHAR_SYMBOL = 37;
-
- // Block names in Unicode 3.1 that are supported by the XML Schema REC
- private static final String[] blockNames = {
- /*0000..007F;*/ "Basic Latin",
- /*0080..00FF;*/ "Latin-1 Supplement",
- /*0100..017F;*/ "Latin Extended-A",
- /*0180..024F;*/ "Latin Extended-B",
- /*0250..02AF;*/ "IPA Extensions",
- /*02B0..02FF;*/ "Spacing Modifier Letters",
- /*0300..036F;*/ "Combining Diacritical Marks",
- /*0370..03FF;*/ "Greek",
- /*0400..04FF;*/ "Cyrillic",
- /*0530..058F;*/ "Armenian",
- /*0590..05FF;*/ "Hebrew",
- /*0600..06FF;*/ "Arabic",
- /*0700..074F;*/ "Syriac",
- /*0780..07BF;*/ "Thaana",
- /*0900..097F;*/ "Devanagari",
- /*0980..09FF;*/ "Bengali",
- /*0A00..0A7F;*/ "Gurmukhi",
- /*0A80..0AFF;*/ "Gujarati",
- /*0B00..0B7F;*/ "Oriya",
- /*0B80..0BFF;*/ "Tamil",
- /*0C00..0C7F;*/ "Telugu",
- /*0C80..0CFF;*/ "Kannada",
- /*0D00..0D7F;*/ "Malayalam",
- /*0D80..0DFF;*/ "Sinhala",
- /*0E00..0E7F;*/ "Thai",
- /*0E80..0EFF;*/ "Lao",
- /*0F00..0FFF;*/ "Tibetan",
- /*1000..109F;*/ "Myanmar",
- /*10A0..10FF;*/ "Georgian",
- /*1100..11FF;*/ "Hangul Jamo",
- /*1200..137F;*/ "Ethiopic",
- /*13A0..13FF;*/ "Cherokee",
- /*1400..167F;*/ "Unified Canadian Aboriginal Syllabics",
- /*1680..169F;*/ "Ogham",
- /*16A0..16FF;*/ "Runic",
- /*1780..17FF;*/ "Khmer",
- /*1800..18AF;*/ "Mongolian",
- /*1E00..1EFF;*/ "Latin Extended Additional",
- /*1F00..1FFF;*/ "Greek Extended",
- /*2000..206F;*/ "General Punctuation",
- /*2070..209F;*/ "Superscripts and Subscripts",
- /*20A0..20CF;*/ "Currency Symbols",
- /*20D0..20FF;*/ "Combining Marks for Symbols",
- /*2100..214F;*/ "Letterlike Symbols",
- /*2150..218F;*/ "Number Forms",
- /*2190..21FF;*/ "Arrows",
- /*2200..22FF;*/ "Mathematical Operators",
- /*2300..23FF;*/ "Miscellaneous Technical",
- /*2400..243F;*/ "Control Pictures",
- /*2440..245F;*/ "Optical Character Recognition",
- /*2460..24FF;*/ "Enclosed Alphanumerics",
- /*2500..257F;*/ "Box Drawing",
- /*2580..259F;*/ "Block Elements",
- /*25A0..25FF;*/ "Geometric Shapes",
- /*2600..26FF;*/ "Miscellaneous Symbols",
- /*2700..27BF;*/ "Dingbats",
- /*2800..28FF;*/ "Braille Patterns",
- /*2E80..2EFF;*/ "CJK Radicals Supplement",
- /*2F00..2FDF;*/ "Kangxi Radicals",
- /*2FF0..2FFF;*/ "Ideographic Description Characters",
- /*3000..303F;*/ "CJK Symbols and Punctuation",
- /*3040..309F;*/ "Hiragana",
- /*30A0..30FF;*/ "Katakana",
- /*3100..312F;*/ "Bopomofo",
- /*3130..318F;*/ "Hangul Compatibility Jamo",
- /*3190..319F;*/ "Kanbun",
- /*31A0..31BF;*/ "Bopomofo Extended",
- /*3200..32FF;*/ "Enclosed CJK Letters and Months",
- /*3300..33FF;*/ "CJK Compatibility",
- /*3400..4DB5;*/ "CJK Unified Ideographs Extension A",
- /*4E00..9FFF;*/ "CJK Unified Ideographs",
- /*A000..A48F;*/ "Yi Syllables",
- /*A490..A4CF;*/ "Yi Radicals",
- /*AC00..D7A3;*/ "Hangul Syllables",
- /*E000..F8FF;*/ "Private Use",
- /*F900..FAFF;*/ "CJK Compatibility Ideographs",
- /*FB00..FB4F;*/ "Alphabetic Presentation Forms",
- /*FB50..FDFF;*/ "Arabic Presentation Forms-A",
- /*FE20..FE2F;*/ "Combining Half Marks",
- /*FE30..FE4F;*/ "CJK Compatibility Forms",
- /*FE50..FE6F;*/ "Small Form Variants",
- /*FE70..FEFE;*/ "Arabic Presentation Forms-B",
- /*FEFF..FEFF;*/ "Specials",
- /*FF00..FFEF;*/ "Halfwidth and Fullwidth Forms",
- //missing Specials add manually
- /*10300..1032F;*/ "Old Italic", // 84
- /*10330..1034F;*/ "Gothic",
- /*10400..1044F;*/ "Deseret",
- /*1D000..1D0FF;*/ "Byzantine Musical Symbols",
- /*1D100..1D1FF;*/ "Musical Symbols",
- /*1D400..1D7FF;*/ "Mathematical Alphanumeric Symbols",
- /*20000..2A6D6;*/ "CJK Unified Ideographs Extension B",
- /*2F800..2FA1F;*/ "CJK Compatibility Ideographs Supplement",
- /*E0000..E007F;*/ "Tags",
- //missing 2 private use add manually
-
- };
- //ADD THOSE MANUALLY
- //F0000..FFFFD; "Private Use",
- //100000..10FFFD; "Private Use"
- //FFF0..FFFD; "Specials",
- static final String blockRanges =
- "\u0000\u007F\u0080\u00FF\u0100\u017F\u0180\u024F\u0250\u02AF\u02B0\u02FF\u0300\u036F"
- +"\u0370\u03FF\u0400\u04FF\u0530\u058F\u0590\u05FF\u0600\u06FF\u0700\u074F\u0780\u07BF"
- +"\u0900\u097F\u0980\u09FF\u0A00\u0A7F\u0A80\u0AFF\u0B00\u0B7F\u0B80\u0BFF\u0C00\u0C7F\u0C80\u0CFF"
- +"\u0D00\u0D7F\u0D80\u0DFF\u0E00\u0E7F\u0E80\u0EFF\u0F00\u0FFF\u1000\u109F\u10A0\u10FF\u1100\u11FF"
- +"\u1200\u137F\u13A0\u13FF\u1400\u167F\u1680\u169F\u16A0\u16FF\u1780\u17FF\u1800\u18AF\u1E00\u1EFF"
- +"\u1F00\u1FFF\u2000\u206F\u2070\u209F\u20A0\u20CF\u20D0\u20FF\u2100\u214F\u2150\u218F\u2190\u21FF\u2200\u22FF"
- +"\u2300\u23FF\u2400\u243F\u2440\u245F\u2460\u24FF\u2500\u257F\u2580\u259F\u25A0\u25FF\u2600\u26FF\u2700\u27BF"
- +"\u2800\u28FF\u2E80\u2EFF\u2F00\u2FDF\u2FF0\u2FFF\u3000\u303F\u3040\u309F\u30A0\u30FF\u3100\u312F\u3130\u318F"
- +"\u3190\u319F\u31A0\u31BF\u3200\u32FF\u3300\u33FF\u3400\u4DB5\u4E00\u9FFF\uA000\uA48F\uA490\uA4CF"
- +"\uAC00\uD7A3\uE000\uF8FF\uF900\uFAFF\uFB00\uFB4F\uFB50\uFDFF"
- +"\uFE20\uFE2F\uFE30\uFE4F\uFE50\uFE6F\uFE70\uFEFE\uFEFF\uFEFF\uFF00\uFFEF";
- static final int[] nonBMPBlockRanges = {
- 0x10300, 0x1032F, // 84
- 0x10330, 0x1034F,
- 0x10400, 0x1044F,
- 0x1D000, 0x1D0FF,
- 0x1D100, 0x1D1FF,
- 0x1D400, 0x1D7FF,
- 0x20000, 0x2A6D6,
- 0x2F800, 0x2FA1F,
- 0xE0000, 0xE007F
- };
- private static final int NONBMP_BLOCK_START = 84;
-
- static protected RangeToken getRange(String name, boolean positive) {
- if (Token.categories.size() == 0) {
- synchronized (Token.categories) {
- Token[] ranges = new Token[Token.categoryNames.length];
- for (int i = 0; i < ranges.length; i ++) {
- ranges[i] = Token.createRange();
- }
- int type;
- for (int i = 0; i < 0x10000; i ++) {
- type = Character.getType((char)i);
- if (type == Character.START_PUNCTUATION ||
- type == Character.END_PUNCTUATION) {
- //build table of Pi values
- if (i == 0x00AB || i == 0x2018 || i == 0x201B || i == 0x201C ||
- i == 0x201F || i == 0x2039) {
- type = CHAR_INIT_QUOTE;
- }
- //build table of Pf values
- if (i == 0x00BB || i == 0x2019 || i == 0x201D || i == 0x203A ) {
- type = CHAR_FINAL_QUOTE;
- }
- }
- ranges[type].addRange(i, i);
- switch (type) {
- case Character.UPPERCASE_LETTER:
- case Character.LOWERCASE_LETTER:
- case Character.TITLECASE_LETTER:
- case Character.MODIFIER_LETTER:
- case Character.OTHER_LETTER:
- type = CHAR_LETTER;
- break;
- case Character.NON_SPACING_MARK:
- case Character.COMBINING_SPACING_MARK:
- case Character.ENCLOSING_MARK:
- type = CHAR_MARK;
- break;
- case Character.DECIMAL_DIGIT_NUMBER:
- case Character.LETTER_NUMBER:
- case Character.OTHER_NUMBER:
- type = CHAR_NUMBER;
- break;
- case Character.SPACE_SEPARATOR:
- case Character.LINE_SEPARATOR:
- case Character.PARAGRAPH_SEPARATOR:
- type = CHAR_SEPARATOR;
- break;
- case Character.CONTROL:
- case Character.FORMAT:
- case Character.SURROGATE:
- case Character.PRIVATE_USE:
- case Character.UNASSIGNED:
- type = CHAR_OTHER;
- break;
- case Character.CONNECTOR_PUNCTUATION:
- case Character.DASH_PUNCTUATION:
- case Character.START_PUNCTUATION:
- case Character.END_PUNCTUATION:
- case CHAR_INIT_QUOTE:
- case CHAR_FINAL_QUOTE:
- case Character.OTHER_PUNCTUATION:
- type = CHAR_PUNCTUATION;
- break;
- case Character.MATH_SYMBOL:
- case Character.CURRENCY_SYMBOL:
- case Character.MODIFIER_SYMBOL:
- case Character.OTHER_SYMBOL:
- type = CHAR_SYMBOL;
- break;
- default:
- throw new RuntimeException("com.sun.org.apache.xerces.internal.utils.regex.Token#getRange(): Unknown Unicode category: "+type);
- }
- ranges[type].addRange(i, i);
- } // for all characters
- ranges[Character.UNASSIGNED].addRange(0x10000, Token.UTF16_MAX);
-
- for (int i = 0; i < ranges.length; i ++) {
- if (Token.categoryNames[i] != null) {
- if (i == Character.UNASSIGNED) { // Unassigned
- ranges[i].addRange(0x10000, Token.UTF16_MAX);
- }
- Token.categories.put(Token.categoryNames[i], ranges[i]);
- Token.categories2.put(Token.categoryNames[i],
- Token.complementRanges(ranges[i]));
- }
- }
- //REVISIT: do we really need to support block names as in Unicode 3.1
- // or we can just create all the names in IsBLOCKNAME format (XML Schema REC)?
- //
- StringBuffer buffer = new StringBuffer(50);
- for (int i = 0; i < Token.blockNames.length; i ++) {
- Token r1 = Token.createRange();
- int location;
- if (i < NONBMP_BLOCK_START) {
- location = i*2;
- int rstart = Token.blockRanges.charAt(location);
- int rend = Token.blockRanges.charAt(location+1);
- //DEBUGING
- //System.out.println(n+" " +Integer.toHexString(rstart)
- // +"-"+ Integer.toHexString(rend));
- r1.addRange(rstart, rend);
- } else {
- location = (i - NONBMP_BLOCK_START) * 2;
- r1.addRange(Token.nonBMPBlockRanges[location],
- Token.nonBMPBlockRanges[location + 1]);
- }
- String n = Token.blockNames[i];
- if (n.equals("Specials"))
- r1.addRange(0xfff0, 0xfffd);
- if (n.equals("Private Use")) {
- r1.addRange(0xF0000,0xFFFFD);
- r1.addRange(0x100000,0x10FFFD);
- }
- Token.categories.put(n, r1);
- Token.categories2.put(n, Token.complementRanges(r1));
- buffer.setLength(0);
- buffer.append("Is");
- if (n.indexOf(' ') >= 0) {
- for (int ci = 0; ci < n.length(); ci ++)
- if (n.charAt(ci) != ' ') buffer.append((char)n.charAt(ci));
- }
- else {
- buffer.append(n);
- }
- Token.setAlias(buffer.toString(), n, true);
- }
-
- // TR#18 1.2
- Token.setAlias("ASSIGNED", "Cn", false);
- Token.setAlias("UNASSIGNED", "Cn", true);
- Token all = Token.createRange();
- all.addRange(0, Token.UTF16_MAX);
- Token.categories.put("ALL", all);
- Token.categories2.put("ALL", Token.complementRanges(all));
- Token.registerNonXS("ASSIGNED");
- Token.registerNonXS("UNASSIGNED");
- Token.registerNonXS("ALL");
-
- Token isalpha = Token.createRange();
- isalpha.mergeRanges(ranges[Character.UPPERCASE_LETTER]); // Lu
- isalpha.mergeRanges(ranges[Character.LOWERCASE_LETTER]); // Ll
- isalpha.mergeRanges(ranges[Character.OTHER_LETTER]); // Lo
- Token.categories.put("IsAlpha", isalpha);
- Token.categories2.put("IsAlpha", Token.complementRanges(isalpha));
- Token.registerNonXS("IsAlpha");
-
- Token isalnum = Token.createRange();
- isalnum.mergeRanges(isalpha); // Lu Ll Lo
- isalnum.mergeRanges(ranges[Character.DECIMAL_DIGIT_NUMBER]); // Nd
- Token.categories.put("IsAlnum", isalnum);
- Token.categories2.put("IsAlnum", Token.complementRanges(isalnum));
- Token.registerNonXS("IsAlnum");
-
- Token isspace = Token.createRange();
- isspace.mergeRanges(Token.token_spaces);
- isspace.mergeRanges(ranges[CHAR_SEPARATOR]); // Z
- Token.categories.put("IsSpace", isspace);
- Token.categories2.put("IsSpace", Token.complementRanges(isspace));
- Token.registerNonXS("IsSpace");
-
- Token isword = Token.createRange();
- isword.mergeRanges(isalnum); // Lu Ll Lo Nd
- isword.addRange('_', '_');
- Token.categories.put("IsWord", isword);
- Token.categories2.put("IsWord", Token.complementRanges(isword));
- Token.registerNonXS("IsWord");
-
- Token isascii = Token.createRange();
- isascii.addRange(0, 127);
- Token.categories.put("IsASCII", isascii);
- Token.categories2.put("IsASCII", Token.complementRanges(isascii));
- Token.registerNonXS("IsASCII");
-
- Token isnotgraph = Token.createRange();
- isnotgraph.mergeRanges(ranges[CHAR_OTHER]);
- isnotgraph.addRange(' ', ' ');
- Token.categories.put("IsGraph", Token.complementRanges(isnotgraph));
- Token.categories2.put("IsGraph", isnotgraph);
- Token.registerNonXS("IsGraph");
-
- Token isxdigit = Token.createRange();
- isxdigit.addRange('0', '9');
- isxdigit.addRange('A', 'F');
- isxdigit.addRange('a', 'f');
- Token.categories.put("IsXDigit", Token.complementRanges(isxdigit));
- Token.categories2.put("IsXDigit", isxdigit);
- Token.registerNonXS("IsXDigit");
-
- Token.setAlias("IsDigit", "Nd", true);
- Token.setAlias("IsUpper", "Lu", true);
- Token.setAlias("IsLower", "Ll", true);
- Token.setAlias("IsCntrl", "C", true);
- Token.setAlias("IsPrint", "C", false);
- Token.setAlias("IsPunct", "P", true);
- Token.registerNonXS("IsDigit");
- Token.registerNonXS("IsUpper");
- Token.registerNonXS("IsLower");
- Token.registerNonXS("IsCntrl");
- Token.registerNonXS("IsPrint");
- Token.registerNonXS("IsPunct");
-
- Token.setAlias("alpha", "IsAlpha", true);
- Token.setAlias("alnum", "IsAlnum", true);
- Token.setAlias("ascii", "IsASCII", true);
- Token.setAlias("cntrl", "IsCntrl", true);
- Token.setAlias("digit", "IsDigit", true);
- Token.setAlias("graph", "IsGraph", true);
- Token.setAlias("lower", "IsLower", true);
- Token.setAlias("print", "IsPrint", true);
- Token.setAlias("punct", "IsPunct", true);
- Token.setAlias("space", "IsSpace", true);
- Token.setAlias("upper", "IsUpper", true);
- Token.setAlias("word", "IsWord", true); // Perl extension
- Token.setAlias("xdigit", "IsXDigit", true);
- Token.registerNonXS("alpha");
- Token.registerNonXS("alnum");
- Token.registerNonXS("ascii");
- Token.registerNonXS("cntrl");
- Token.registerNonXS("digit");
- Token.registerNonXS("graph");
- Token.registerNonXS("lower");
- Token.registerNonXS("print");
- Token.registerNonXS("punct");
- Token.registerNonXS("space");
- Token.registerNonXS("upper");
- Token.registerNonXS("word");
- Token.registerNonXS("xdigit");
- } // synchronized
- } // if null
- RangeToken tok = positive ? (RangeToken)Token.categories.get(name)
- : (RangeToken)Token.categories2.get(name);
- //if (tok == null) System.out.println(name);
- return tok;
- }
- /**
-  * Looks up a named range, optionally hiding names registered as non-XS.
-  *
-  * @param name     name handed to the two-argument getRange()
-  * @param positive handed unchanged to the two-argument getRange()
-  * @param xs       when true, a name recorded by registerNonXS() yields null
-  * @return the range from the two-argument lookup, or null when that lookup
-  *         found nothing or the name was filtered out
-  */
- static protected RangeToken getRange(String name, boolean positive, boolean xs) {
-     RangeToken found = Token.getRange(name, positive);
-     if (found == null || !xs)
-         return found;
-     return Token.isRegisterNonXS(name) ? null : found;
- }
-
- static Hashtable nonxs = null;
- /**
-  * Records a name in the non-XS registry, creating the table on first use.
-  * No locking is done here: according to the original note this method is
-  * called only by getRange(), so it need not be MT-safe on its own.
-  */
- static protected void registerNonXS(String name) {
-     Hashtable registry = Token.nonxs;
-     if (registry == null) {
-         registry = new Hashtable();
-         Token.nonxs = registry;
-     }
-     registry.put(name, name);
- }
- /**
-  * Tells whether a name has been recorded by registerNonXS().
-  *
-  * @return false when nothing has been registered yet or the name is absent
-  */
- static protected boolean isRegisterNonXS(String name) {
-     Hashtable registry = Token.nonxs;
-     return registry != null && registry.containsKey(name);
- }
-
- /**
-  * Registers newName as another name for an existing entry of the two
-  * lookup tables (categories serves positive lookups, categories2 negative).
-  *
-  * @param newName  alias to add to both tables
-  * @param name     existing name whose two entries are reused
-  * @param positive true to copy the entries as they are; false to cross them
-  *                 over, so the alias matches what name excludes
-  */
- private static void setAlias(String newName, String name, boolean positive) {
-     Token fromCategories = (Token)Token.categories.get(name);
-     Token fromCategories2 = (Token)Token.categories2.get(name);
-     if (!positive) {
-         // Inverted alias: each table receives the other table's entry.
-         Token.categories2.put(newName, fromCategories);
-         Token.categories.put(newName, fromCategories2);
-         return;
-     }
-     Token.categories.put(newName, fromCategories);
-     Token.categories2.put(newName, fromCategories2);
- }
-
- // ------------------------------------------------------
-
- // The eleven characters collected into the "virama" range of the grapheme
- // pattern, in this order:
- //   U+094D DEVANAGARI SIGN VIRAMA      U+09CD BENGALI SIGN VIRAMA
- //   U+0A4D GURMUKHI SIGN VIRAMA        U+0ACD GUJARATI SIGN VIRAMA
- //   U+0B4D ORIYA SIGN VIRAMA           U+0BCD TAMIL SIGN VIRAMA
- //   U+0C4D TELUGU SIGN VIRAMA          U+0CCD KANNADA SIGN VIRAMA
- //   U+0D4D MALAYALAM SIGN VIRAMA
- //   U+0E3A THAI CHARACTER PHINTHU      (old name: THAI VOWEL SIGN PHINTHU)
- //   U+0F84 TIBETAN MARK HALANTA        (old name: TIBETAN VIRAMA)
- // The UnicodeData excerpt this list was taken from gives every one of them
- // as category Mn with combining class 9.
- static final String viramaString =
-     "\u094D\u09CD\u0A4D\u0ACD\u0B4D\u0BCD\u0C4D\u0CCD\u0D4D\u0E3A\u0F84";
-
- static private Token token_grapheme = null;
- /**
-  * Returns the shared grapheme pattern, building and caching it on the
-  * first call. The method is synchronized, so the cache is filled once.
-  *
-  * The pattern is the concatenation of
-  *   - an optional base character: [{ASSIGNED}] minus [{M}] and [{C}], and
-  *   - a closure over the union of
-  *       (a virama character followed by the "L" range) and
-  *       (the "M" range plus U+1160..U+11FF and U+FF9E..U+FF9F).
-  *
-  * @return the cached pattern token
-  */
- static synchronized Token getGraphemePattern() {
-     if (Token.token_grapheme != null)
-         return Token.token_grapheme;
-
-     Token base_char = Token.createRange();  // [{ASSIGNED}]-[{M},{C}]
-     base_char.mergeRanges(Token.getRange("ASSIGNED", true));
-     base_char.subtractRanges(Token.getRange("M", true));
-     base_char.subtractRanges(Token.getRange("C", true));
-
-     Token virama = Token.createRange();
-     for (int i = 0;  i < Token.viramaString.length();  i ++) {
-         int ch = Token.viramaString.charAt(i);
-         // Add the character itself. The earlier code added the loop index,
-         // which put code points 0..10 into the range instead of the viramas.
-         virama.addRange(ch, ch);
-     }
-
-     Token combiner_wo_virama = Token.createRange();
-     combiner_wo_virama.mergeRanges(Token.getRange("M", true));
-     combiner_wo_virama.addRange(0x1160, 0x11ff);  // hangul_medial and hangul_final
-     combiner_wo_virama.addRange(0xff9e, 0xff9f);  // extras
-
-     Token left = Token.createUnion();             // base_char?
-     left.addChild(base_char);
-     left.addChild(Token.token_empty);
-
-     Token extenders = Token.createUnion();
-     extenders.addChild(Token.createConcat(virama, Token.getRange("L", true)));
-     extenders.addChild(combiner_wo_virama);
-
-     Token pattern = Token.createConcat(left, Token.createClosure(extenders));
-
-     Token.token_grapheme = pattern;
-     return Token.token_grapheme;
- }
-
- /**
-  * Cache for the combining character sequence token (as in Perl 5.6);
-  * null until getCombiningCharacterSequence() has built it.
-  */
- static private Token token_ccs = null;
- /**
-  * Returns the token for a combining character sequence, \PM followed by
-  * \pM*, building it on the first call and reusing it afterwards.
-  */
- static synchronized Token getCombiningCharacterSequence() {
-     if (Token.token_ccs == null) {
-         Token marks = Token.createClosure(Token.getRange("M", true));  // \pM*
-         Token nonMark = Token.getRange("M", false);                     // \PM
-         Token.token_ccs = Token.createConcat(nonMark, marks);           // \PM + \pM*
-     }
-     return Token.token_ccs;
- }
-
- // ------------------------------------------------------
-
- // ------------------------------------------------------
- /**
-  * Parse-tree node that carries a string and a reference number.
-  * A node of type BACKREFERENCE prints as a backslash followed by the
-  * reference number; a node of any other type prints as its quoted string.
-  */
- static class StringToken extends Token implements java.io.Serializable {
-     String string;
-     int refNumber;
-
-     StringToken(int type, String str, int n) {
-         super(type);
-         this.string = str;
-         this.refNumber = n;
-     }
-
-     /** Returns the reference number given at construction. */
-     int getReferenceNumber() {
-         return this.refNumber;
-     }
-     /** Returns the string currently held by this node. */
-     String getString() {
-         return this.string;
-     }
-
-     public String toString(int options) {
-         return this.type == BACKREFERENCE
-             ? "\\" + this.refNumber
-             : REUtil.quoteMeta(this.string);
-     }
- }
-
- /**
- * This class represents a node in parse tree.
- */
- static class ConcatToken extends Token implements java.io.Serializable {
- Token child;
- Token child2;
-
- ConcatToken(Token t1, Token t2) {
- super(Token.CONCAT);
- this.child = t1;
- this.child2 = t2;
- }
-
- int size() {
- return 2;
- }
- Token getChild(int index) {
- return index == 0 ? this.child : this.child2;
- }
-
- public String toString(int options) {
- String ret;
- if (this.child2.type == CLOSURE && this.child2.getChild(0) == this.child) {
- ret = this.child.toString(options)+"+";
- } else if (this.child2.type == NONGREEDYCLOSURE && this.child2.getChild(0) == this.child) {
- ret = this.child.toString(options)+"+?";
- } else
- ret = this.child.toString(options)+this.child2.toString(options);
- return ret;
- }
- }
-
- /**
- * This class represents a node in parse tree.
- */
- static class CharToken extends Token implements java.io.Serializable {
- int chardata;
-
- CharToken(int type, int ch) {
- super(type);
- this.chardata = ch;
- }
-
- int getChar() {
- return this.chardata;
- }
-
- public String toString(int options) {
- String ret;
- switch (this.type) {
- case CHAR:
- switch (this.chardata) {
- case '|': case '*': case '+': case '?':
- case '(': case ')': case '.': case '[':
- case '{': case '\\':
- ret = "\\"+(char)this.chardata;
- break;
- case '\f': ret = "\\f"; break;
- case '\n': ret = "\\n"; break;
- case '\r': ret = "\\r"; break;
- case '\t': ret = "\\t"; break;
- case 0x1b: ret = "\\e"; break;
- //case 0x0b: ret = "\\v"; break;
- default:
- if (this.chardata >= 0x10000) {
- String pre = "0"+Integer.toHexString(this.chardata);
- ret = "\\v"+pre.substring(pre.length()-6, pre.length());
- } else
- ret = ""+(char)this.chardata;
- }
- break;
-
- case ANCHOR:
- if (this == Token.token_linebeginning || this == Token.token_lineend)
- ret = ""+(char)this.chardata;
- else
- ret = "\\"+(char)this.chardata;
- break;
-
- default:
- ret = null;
- }
- return ret;
- }
-
- boolean match(int ch) {
- if (this.type == CHAR) {
- return ch == this.chardata;
- } else
- throw new RuntimeException("NFAArrow#match(): Internal error: "+this.type);
- }
- }
-
- /**
- * This class represents a node in parse tree.
- */
- static class ClosureToken extends Token implements java.io.Serializable {
- int min;
- int max;
- Token child;
-
- ClosureToken(int type, Token tok) {
- super(type);
- this.child = tok;
- this.setMin(-1);
- this.setMax(-1);
- }
-
- int size() {
- return 1;
- }
- Token getChild(int index) {
- return this.child;
- }
-
- final void setMin(int min) {
- this.min = min;
- }
- final void setMax(int max) {
- this.max = max;
- }
- final int getMin() {
- return this.min;
- }
- final int getMax() {
- return this.max;
- }
-
- public String toString(int options) {
- String ret;
- if (this.type == CLOSURE) {
- if (this.getMin() < 0 && this.getMax() < 0) {
- ret = this.child.toString(options)+"*";
- } else if (this.getMin() == this.getMax()) {
- ret = this.child.toString(options)+"{"+this.getMin()+"}";
- } else if (this.getMin() >= 0 && this.getMax() >= 0) {
- ret = this.child.toString(options)+"{"+this.getMin()+","+this.getMax()+"}";
- } else if (this.getMin() >= 0 && this.getMax() < 0) {
- ret = this.child.toString(options)+"{"+this.getMin()+",}";
- } else
- throw new RuntimeException("Token#toString(): CLOSURE "
- +this.getMin()+", "+this.getMax());
- } else {
- if (this.getMin() < 0 && this.getMax() < 0) {
- ret = this.child.toString(options)+"*?";
- } else if (this.getMin() == this.getMax()) {
- ret = this.child.toString(options)+"{"+this.getMin()+"}?";
- } else if (this.getMin() >= 0 && this.getMax() >= 0) {
- ret = this.child.toString(options)+"{"+this.getMin()+","+this.getMax()+"}?";
- } else if (this.getMin() >= 0 && this.getMax() < 0) {
- ret = this.child.toString(options)+"{"+this.getMin()+",}?";
- } else
- throw new RuntimeException("Token#toString(): NONGREEDYCLOSURE "
- +this.getMin()+", "+this.getMax());
- }
- return ret;
- }
- }
-
- /**
-  * Parse-tree node for a parenthesized child: a plain group (type PAREN),
-  * a look-ahead or look-behind, or an independent group.
-  */
- static class ParenToken extends Token implements java.io.Serializable {
-     Token child;
-     int parennumber;
-
-     ParenToken(int type, Token tok, int paren) {
-         super(type);
-         this.child = tok;
-         this.parennumber = paren;
-     }
-
-     int size() {
-         return 1;
-     }
-     /** Returns the single child; the index is not looked at. */
-     Token getChild(int index) {
-         return this.child;
-     }
-
-     int getParenNumber() {
-         return this.parennumber;
-     }
-
-     /**
-      * Prints the child between the opening sequence for this node's type
-      * and ")". A PAREN with number 0 opens with "(?:", any other PAREN
-      * with "(".
-      *
-      * @return the text, or null when the type is none of the handled ones
-      */
-     public String toString(int options) {
-         String opening;
-         switch (this.type) {
-           case PAREN:
-             opening = this.parennumber == 0 ? "(?:" : "(";
-             break;
-           case LOOKAHEAD:
-             opening = "(?=";
-             break;
-           case NEGATIVELOOKAHEAD:
-             opening = "(?!";
-             break;
-           case LOOKBEHIND:
-             opening = "(?<=";
-             break;
-           case NEGATIVELOOKBEHIND:
-             opening = "(?<!";
-             break;
-           case INDEPENDENT:
-             opening = "(?>";
-             break;
-           default:
-             return null;
-         }
-         return opening + this.child.toString(options) + ")";
-     }
- }
-
- /**
-  * Parse-tree node for a conditional group:
-  * (?(condition)yes-pattern|no-pattern)
-  *
-  * The condition is either a reference number (used when refNumber is
-  * positive) or a condition token. The no-pattern may be null.
-  */
- static class ConditionToken extends Token implements java.io.Serializable {
-     int refNumber;
-     Token condition;
-     Token yes;
-     Token no;
-     ConditionToken(int refno, Token cond, Token yespat, Token nopat) {
-         super(Token.CONDITION);
-         this.refNumber = refno;
-         this.condition = cond;
-         this.yes = yespat;
-         this.no = nopat;
-     }
-     /** Returns 1 when there is no no-pattern, otherwise 2. */
-     int size() {
-         return this.no == null ? 1 : 2;
-     }
-     /**
-      * Returns the yes-pattern for index 0 and the no-pattern for index 1.
-      *
-      * @throws RuntimeException for any other index
-      */
-     Token getChild(int index) {
-         if (index == 0)  return this.yes;
-         if (index == 1)  return this.no;
-         throw new RuntimeException("Internal Error: "+index);
-     }
-
-     /**
-      * Prints the conditional group. The options argument is now forwarded
-      * to the condition and to both branches, as the other token classes do
-      * for their children; the earlier code relied on implicit string
-      * conversion, which never received it.
-      */
-     public String toString(int options) {
-         StringBuffer sb = new StringBuffer();
-         if (this.refNumber > 0) {
-             sb.append("(?(").append(this.refNumber).append(")");
-         } else if (this.condition.type == Token.ANCHOR) {
-             // An anchor condition is wrapped in its own parentheses.
-             sb.append("(?(").append(this.condition.toString(options)).append(")");
-         } else {
-             sb.append("(?").append(this.condition.toString(options));
-         }
-
-         sb.append(this.yes.toString(options));
-         if (this.no != null) {
-             sb.append("|").append(this.no.toString(options));
-         }
-         sb.append(")");
-         return sb.toString();
-     }
- }
-
- /**
-  * Parse-tree node for a modifier group:
-  * (?ims-ims: .... )
-  *
-  * add holds the option bits written before the '-', mask the option bits
-  * written after it.
-  */
- static class ModifierToken extends Token implements java.io.Serializable {
-     Token child;
-     int add;
-     int mask;
-
-     ModifierToken(Token tok, int add, int mask) {
-         super(Token.MODIFIERGROUP);
-         this.child = tok;
-         this.add = add;
-         this.mask = mask;
-     }
-
-     int size() {
-         return 1;
-     }
-     /** Returns the single child; the index is not looked at. */
-     Token getChild(int index) {
-         return this.child;
-     }
-
-     int getOptions() {
-         return this.add;
-     }
-     int getOptionsMask() {
-         return this.mask;
-     }
-
-     /**
-      * Prints the group as "(?" + added options + "-" + masked options +
-      * ":" + child + ")". The "-" appears only when mask is non-zero.
-      * Without it the masked options ran straight on from the added ones,
-      * so the printed text no longer told the two sets apart.
-      */
-     public String toString(int options) {
-         return "(?"
-             +(this.add == 0 ? "" : REUtil.createOptionString(this.add))
-             +(this.mask == 0 ? "" : "-"+REUtil.createOptionString(this.mask))
-             +":"
-             +this.child.toString(options)
-             +")";
-     }
- }
-
- /**
-  * Parse-tree node with a variable number of children, used for UNION and
-  * for CONCAT. The child list is created on the first addChild() call, so
-  * a node without children has a null list; size() and toString() both
-  * accept that state.
-  */
- static class UnionToken extends Token implements java.io.Serializable {
-     Vector children;
-
-     UnionToken(int type) {
-         super(type);
-     }
-
-     /**
-      * Appends a child; null is ignored.
-      * A UNION simply stores the child. Otherwise (CONCAT) a CONCAT child
-      * is flattened into this node, and a CHAR or STRING child that follows
-      * a CHAR or STRING child is merged with it into one STRING token.
-      */
-     void addChild(Token tok) {
-         if (tok == null)  return;
-         if (this.children == null)  this.children = new Vector();
-         if (this.type == UNION) {
-             this.children.addElement(tok);
-             return;
-         }
-         // This is CONCAT, and new child is CONCAT.
-         if (tok.type == CONCAT) {
-             for (int i = 0;  i < tok.size();  i ++)
-                 this.addChild(tok.getChild(i)); // Recursion
-             return;
-         }
-         int size = this.children.size();
-         if (size == 0) {
-             this.children.addElement(tok);
-             return;
-         }
-         Token previous = (Token)this.children.elementAt(size-1);
-         boolean previousIsText = previous.type == CHAR || previous.type == STRING;
-         boolean tokIsText = tok.type == CHAR || tok.type == STRING;
-         if (!(previousIsText && tokIsText)) {
-             this.children.addElement(tok);
-             return;
-         }
-
-         // Merge the last child and the new one into a single STRING token.
-         StringBuffer buffer;
-         // A CHAR may need two chars (a surrogate pair).
-         int nextMaxLength = (tok.type == CHAR ? 2 : tok.getString().length());
-         if (previous.type == CHAR) {        // Replace previous token by STRING
-             buffer = new StringBuffer(2 + nextMaxLength);
-             int ch = previous.getChar();
-             if (ch >= 0x10000)
-                 buffer.append(REUtil.decomposeToSurrogates(ch));
-             else
-                 buffer.append((char)ch);
-             previous = Token.createString(null);
-             this.children.setElementAt(previous, size-1);
-         } else {                            // STRING
-             buffer = new StringBuffer(previous.getString().length() + nextMaxLength);
-             buffer.append(previous.getString());
-         }
-
-         if (tok.type == CHAR) {
-             int ch = tok.getChar();
-             if (ch >= 0x10000)
-                 buffer.append(REUtil.decomposeToSurrogates(ch));
-             else
-                 buffer.append((char)ch);
-         } else {
-             buffer.append(tok.getString());
-         }
-
-         ((StringToken)previous).string = new String(buffer);
-     }
-
-     int size() {
-         return this.children == null ? 0 : this.children.size();
-     }
-     Token getChild(int index) {
-         return (Token)this.children.elementAt(index);
-     }
-
-     /**
-      * Prints this node as pattern text.
-      * CONCAT: children one after the other, except that "x" followed by a
-      * closure over the same token object prints as "x+" (or "x+?").
-      * UNION: children joined by "|", except that a pair with an EMPTY
-      * second child prints as "x?" and a pair with an EMPTY first child as
-      * "x??".
-      * A node without children prints as the empty string; the child count
-      * is taken from size(), so a null child list is safe here.
-      */
-     public String toString(int options) {
-         final int count = this.size();
-         if (this.type == CONCAT) {
-             if (count == 2) {
-                 Token ch = this.getChild(0);
-                 Token ch2 = this.getChild(1);
-                 if (ch2.type == CLOSURE && ch2.getChild(0) == ch) {
-                     return ch.toString(options)+"+";
-                 } else if (ch2.type == NONGREEDYCLOSURE && ch2.getChild(0) == ch) {
-                     return ch.toString(options)+"+?";
-                 }
-                 return ch.toString(options)+ch2.toString(options);
-             }
-             StringBuffer sb = new StringBuffer();
-             for (int i = 0;  i < count;  i ++) {
-                 sb.append(this.getChild(i).toString(options));
-             }
-             return new String(sb);
-         }
-
-         if (count == 0)
-             return "";
-         if (count == 2 && this.getChild(1).type == EMPTY) {
-             return this.getChild(0).toString(options)+"?";
-         }
-         if (count == 2 && this.getChild(0).type == EMPTY) {
-             return this.getChild(1).toString(options)+"??";
-         }
-         StringBuffer sb = new StringBuffer();
-         sb.append(this.getChild(0).toString(options));
-         for (int i = 1;  i < count;  i ++) {
-             sb.append('|');
-             sb.append(this.getChild(i).toString(options));
-         }
-         return new String(sb);
-     }
- }
- }