1. /*
  2. * The Apache Software License, Version 1.1
  3. *
  4. *
  5. * Copyright (c) 1999-2002 The Apache Software Foundation. All rights
  6. * reserved.
  7. *
  8. * Redistribution and use in source and binary forms, with or without
  9. * modification, are permitted provided that the following conditions
  10. * are met:
  11. *
  12. * 1. Redistributions of source code must retain the above copyright
  13. * notice, this list of conditions and the following disclaimer.
  14. *
  15. * 2. Redistributions in binary form must reproduce the above copyright
  16. * notice, this list of conditions and the following disclaimer in
  17. * the documentation and/or other materials provided with the
  18. * distribution.
  19. *
  20. * 3. The end-user documentation included with the redistribution,
  21. * if any, must include the following acknowledgment:
  22. * "This product includes software developed by the
  23. * Apache Software Foundation (http://www.apache.org/)."
  24. * Alternately, this acknowledgment may appear in the software itself,
  25. * if and wherever such third-party acknowledgments normally appear.
  26. *
  27. * 4. The names "Xerces" and "Apache Software Foundation" must
  28. * not be used to endorse or promote products derived from this
  29. * software without prior written permission. For written
  30. * permission, please contact apache@apache.org.
  31. *
  32. * 5. Products derived from this software may not be called "Apache",
  33. * nor may "Apache" appear in their name, without prior written
  34. * permission of the Apache Software Foundation.
  35. *
  36. * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
  37. * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  38. * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  39. * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
  40. * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  41. * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  42. * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
  43. * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  44. * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  45. * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  46. * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  47. * SUCH DAMAGE.
  48. * ====================================================================
  49. *
  50. * This software consists of voluntary contributions made by many
  51. * individuals on behalf of the Apache Software Foundation and was
  52. * originally based on software copyright (c) 1999, International
  53. * Business Machines, Inc., http://www.apache.org. For more
  54. * information on the Apache Software Foundation, please see
  55. * <http://www.apache.org/>.
  56. */
  57. package com.sun.org.apache.xerces.internal.impl.xpath.regex;
  58. import java.util.Vector;
  59. import java.util.Hashtable;
  60. /**
  61. * This class represents a node in parse tree.
  62. *
  63. * @version $Id: Token.java,v 1.7 2003/02/25 14:43:13 sandygao Exp $
  64. */
  65. class Token implements java.io.Serializable {
  66. static final boolean COUNTTOKENS = true;
  67. static int tokens = 0;
  68. static final int CHAR = 0; // Literal char
  69. static final int DOT = 11; // .
  70. static final int CONCAT = 1; // XY
  71. static final int UNION = 2; // X|Y|Z
  72. static final int CLOSURE = 3; // X*
  73. static final int RANGE = 4; // [a-zA-Z] etc.
  74. static final int NRANGE = 5; // [^a-zA-Z] etc.
  75. static final int PAREN = 6; // (X) or (?:X)
  76. static final int EMPTY = 7; //
  77. static final int ANCHOR = 8; // ^ $ \b \B \< \> \A \Z \z
  78. static final int NONGREEDYCLOSURE = 9; // *? +?
  79. static final int STRING = 10; // strings
  80. static final int BACKREFERENCE = 12; // back references
  81. static final int LOOKAHEAD = 20; // (?=...)
  82. static final int NEGATIVELOOKAHEAD = 21; // (?!...)
  83. static final int LOOKBEHIND = 22; // (?<=...)
  84. static final int NEGATIVELOOKBEHIND = 23; // (?<!...)
  85. static final int INDEPENDENT = 24; // (?>...)
  86. static final int MODIFIERGROUP = 25; // (?ims-ims:...)
  87. static final int CONDITION = 26; // (?(...)yes|no)
  88. static final int UTF16_MAX = 0x10ffff;
  89. int type;
  90. static Token token_dot;
  91. static Token token_0to9;
  92. static Token token_wordchars;
  93. static Token token_not_0to9;
  94. static Token token_not_wordchars;
  95. static Token token_spaces;
  96. static Token token_not_spaces;
  97. static Token token_empty;
  98. static Token token_linebeginning;
  99. static Token token_linebeginning2;
  100. static Token token_lineend;
  101. static Token token_stringbeginning;
  102. static Token token_stringend;
  103. static Token token_stringend2;
  104. static Token token_wordedge;
  105. static Token token_not_wordedge;
  106. static Token token_wordbeginning;
  107. static Token token_wordend;
  108. static {
  109. Token.token_empty = new Token(Token.EMPTY);
  110. Token.token_linebeginning = Token.createAnchor('^');
  111. Token.token_linebeginning2 = Token.createAnchor('@');
  112. Token.token_lineend = Token.createAnchor('$');
  113. Token.token_stringbeginning = Token.createAnchor('A');
  114. Token.token_stringend = Token.createAnchor('z');
  115. Token.token_stringend2 = Token.createAnchor('Z');
  116. Token.token_wordedge = Token.createAnchor('b');
  117. Token.token_not_wordedge = Token.createAnchor('B');
  118. Token.token_wordbeginning = Token.createAnchor('<');
  119. Token.token_wordend = Token.createAnchor('>');
  120. Token.token_dot = new Token(Token.DOT);
  121. Token.token_0to9 = Token.createRange();
  122. Token.token_0to9.addRange('0', '9');
  123. Token.token_wordchars = Token.createRange();
  124. Token.token_wordchars.addRange('0', '9');
  125. Token.token_wordchars.addRange('A', 'Z');
  126. Token.token_wordchars.addRange('_', '_');
  127. Token.token_wordchars.addRange('a', 'z');
  128. Token.token_spaces = Token.createRange();
  129. Token.token_spaces.addRange('\t', '\t');
  130. Token.token_spaces.addRange('\n', '\n');
  131. Token.token_spaces.addRange('\f', '\f');
  132. Token.token_spaces.addRange('\r', '\r');
  133. Token.token_spaces.addRange(' ', ' ');
  134. Token.token_not_0to9 = Token.complementRanges(Token.token_0to9);
  135. Token.token_not_wordchars = Token.complementRanges(Token.token_wordchars);
  136. Token.token_not_spaces = Token.complementRanges(Token.token_spaces);
  137. }
  138. static Token.ParenToken createLook(int type, Token child) {
  139. if (COUNTTOKENS) Token.tokens ++;
  140. return new Token.ParenToken(type, child, 0);
  141. }
  142. static Token.ParenToken createParen(Token child, int pnumber) {
  143. if (COUNTTOKENS) Token.tokens ++;
  144. return new Token.ParenToken(Token.PAREN, child, pnumber);
  145. }
  146. static Token.ClosureToken createClosure(Token tok) {
  147. if (COUNTTOKENS) Token.tokens ++;
  148. return new Token.ClosureToken(Token.CLOSURE, tok);
  149. }
  150. static Token.ClosureToken createNGClosure(Token tok) {
  151. if (COUNTTOKENS) Token.tokens ++;
  152. return new Token.ClosureToken(Token.NONGREEDYCLOSURE, tok);
  153. }
  154. static Token.ConcatToken createConcat(Token tok1, Token tok2) {
  155. if (COUNTTOKENS) Token.tokens ++;
  156. return new Token.ConcatToken(tok1, tok2);
  157. }
  158. static Token.UnionToken createConcat() {
  159. if (COUNTTOKENS) Token.tokens ++;
  160. return new Token.UnionToken(Token.CONCAT); // *** It is not a bug.
  161. }
  162. static Token.UnionToken createUnion() {
  163. if (COUNTTOKENS) Token.tokens ++;
  164. return new Token.UnionToken(Token.UNION);
  165. }
  166. static Token createEmpty() {
  167. return Token.token_empty;
  168. }
  169. static RangeToken createRange() {
  170. if (COUNTTOKENS) Token.tokens ++;
  171. return new RangeToken(Token.RANGE);
  172. }
  173. static RangeToken createNRange() {
  174. if (COUNTTOKENS) Token.tokens ++;
  175. return new RangeToken(Token.NRANGE);
  176. }
  177. static Token.CharToken createChar(int ch) {
  178. if (COUNTTOKENS) Token.tokens ++;
  179. return new Token.CharToken(Token.CHAR, ch);
  180. }
  181. static private Token.CharToken createAnchor(int ch) {
  182. if (COUNTTOKENS) Token.tokens ++;
  183. return new Token.CharToken(Token.ANCHOR, ch);
  184. }
  185. static Token.StringToken createBackReference(int refno) {
  186. if (COUNTTOKENS) Token.tokens ++;
  187. return new Token.StringToken(Token.BACKREFERENCE, null, refno);
  188. }
  189. static Token.StringToken createString(String str) {
  190. if (COUNTTOKENS) Token.tokens ++;
  191. return new Token.StringToken(Token.STRING, str, 0);
  192. }
  193. static Token.ModifierToken createModifierGroup(Token child, int add, int mask) {
  194. if (COUNTTOKENS) Token.tokens ++;
  195. return new Token.ModifierToken(child, add, mask);
  196. }
  197. static Token.ConditionToken createCondition(int refno, Token condition,
  198. Token yespat, Token nopat) {
  199. if (COUNTTOKENS) Token.tokens ++;
  200. return new Token.ConditionToken(refno, condition, yespat, nopat);
  201. }
  /**
   * Creates a token with the given type code.
   *
   * @param type one of the token type constants
   */
  protected Token(int type) {
    this.type = type;
  }
  205. /**
  206. * A number of children.
  207. */
  208. int size() {
  209. return 0;
  210. }
  211. Token getChild(int index) {
  212. return null;
  213. }
  214. void addChild(Token tok) {
  215. throw new RuntimeException("Not supported.");
  216. }
  217. // for RANGE or NRANGE
  218. protected void addRange(int start, int end) {
  219. throw new RuntimeException("Not supported.");
  220. }
  221. protected void sortRanges() {
  222. throw new RuntimeException("Not supported.");
  223. }
  224. protected void compactRanges() {
  225. throw new RuntimeException("Not supported.");
  226. }
  227. protected void mergeRanges(Token tok) {
  228. throw new RuntimeException("Not supported.");
  229. }
  230. protected void subtractRanges(Token tok) {
  231. throw new RuntimeException("Not supported.");
  232. }
  233. protected void intersectRanges(Token tok) {
  234. throw new RuntimeException("Not supported.");
  235. }
  236. static Token complementRanges(Token tok) {
  237. return RangeToken.complementRanges(tok);
  238. }
  239. void setMin(int min) { // for CLOSURE
  240. }
  241. void setMax(int max) { // for CLOSURE
  242. }
  243. int getMin() { // for CLOSURE
  244. return -1;
  245. }
  246. int getMax() { // for CLOSURE
  247. return -1;
  248. }
  249. int getReferenceNumber() { // for STRING
  250. return 0;
  251. }
  252. String getString() { // for STRING
  253. return null;
  254. }
  255. int getParenNumber() {
  256. return 0;
  257. }
  258. int getChar() {
  259. return -1;
  260. }
  261. public String toString() {
  262. return this.toString(0);
  263. }
  264. public String toString(int options) {
  265. return this.type == Token.DOT ? "." : "";
  266. }
  267. /**
  268. * How many characters are needed?
  269. */
  270. final int getMinLength() {
  271. switch (this.type) {
  272. case CONCAT:
  273. int sum = 0;
  274. for (int i = 0; i < this.size(); i ++)
  275. sum += this.getChild(i).getMinLength();
  276. return sum;
  277. case CONDITION:
  278. case UNION:
  279. if (this.size() == 0)
  280. return 0;
  281. int ret = this.getChild(0).getMinLength();
  282. for (int i = 1; i < this.size(); i ++) {
  283. int min = this.getChild(i).getMinLength();
  284. if (min < ret) ret = min;
  285. }
  286. return ret;
  287. case CLOSURE:
  288. case NONGREEDYCLOSURE:
  289. if (this.getMin() >= 0)
  290. return this.getMin() * this.getChild(0).getMinLength();
  291. return 0;
  292. case EMPTY:
  293. case ANCHOR:
  294. return 0;
  295. case DOT:
  296. case CHAR:
  297. case RANGE:
  298. case NRANGE:
  299. return 1;
  300. case INDEPENDENT:
  301. case PAREN:
  302. case MODIFIERGROUP:
  303. return this.getChild(0).getMinLength();
  304. case BACKREFERENCE:
  305. return 0; // *******
  306. case STRING:
  307. return this.getString().length();
  308. case LOOKAHEAD:
  309. case NEGATIVELOOKAHEAD:
  310. case LOOKBEHIND:
  311. case NEGATIVELOOKBEHIND:
  312. return 0; // ***** Really?
  313. default:
  314. throw new RuntimeException("Token#getMinLength(): Invalid Type: "+this.type);
  315. }
  316. }
  317. final int getMaxLength() {
  318. switch (this.type) {
  319. case CONCAT:
  320. int sum = 0;
  321. for (int i = 0; i < this.size(); i ++) {
  322. int d = this.getChild(i).getMaxLength();
  323. if (d < 0) return -1;
  324. sum += d;
  325. }
  326. return sum;
  327. case CONDITION:
  328. case UNION:
  329. if (this.size() == 0)
  330. return 0;
  331. int ret = this.getChild(0).getMaxLength();
  332. for (int i = 1; ret >= 0 && i < this.size(); i ++) {
  333. int max = this.getChild(i).getMaxLength();
  334. if (max < 0) { // infinity
  335. ret = -1;
  336. break;
  337. }
  338. if (max > ret) ret = max;
  339. }
  340. return ret;
  341. case CLOSURE:
  342. case NONGREEDYCLOSURE:
  343. if (this.getMax() >= 0)
  344. // When this.child.getMaxLength() < 0,
  345. // this returns minus value
  346. return this.getMax() * this.getChild(0).getMaxLength();
  347. return -1;
  348. case EMPTY:
  349. case ANCHOR:
  350. return 0;
  351. case CHAR:
  352. return 1;
  353. case DOT:
  354. case RANGE:
  355. case NRANGE:
  356. return 2;
  357. case INDEPENDENT:
  358. case PAREN:
  359. case MODIFIERGROUP:
  360. return this.getChild(0).getMaxLength();
  361. case BACKREFERENCE:
  362. return -1; // ******
  363. case STRING:
  364. return this.getString().length();
  365. case LOOKAHEAD:
  366. case NEGATIVELOOKAHEAD:
  367. case LOOKBEHIND:
  368. case NEGATIVELOOKBEHIND:
  369. return 0; // ***** Really?
  370. default:
  371. throw new RuntimeException("Token#getMaxLength(): Invalid Type: "+this.type);
  372. }
  373. }
  374. static final int FC_CONTINUE = 0;
  375. static final int FC_TERMINAL = 1;
  376. static final int FC_ANY = 2;
  377. private static final boolean isSet(int options, int flag) {
  378. return (options & flag) == flag;
  379. }
  380. final int analyzeFirstCharacter(RangeToken result, int options) {
  381. switch (this.type) {
  382. case CONCAT:
  383. int ret = FC_CONTINUE;
  384. for (int i = 0; i < this.size(); i ++)
  385. if ((ret = this.getChild(i).analyzeFirstCharacter(result, options)) != FC_CONTINUE)
  386. break;
  387. return ret;
  388. case UNION:
  389. if (this.size() == 0)
  390. return FC_CONTINUE;
  391. /*
  392. * a|b|c -> FC_TERMINAL
  393. * a|.|c -> FC_ANY
  394. * a|b| -> FC_CONTINUE
  395. */
  396. int ret2 = FC_CONTINUE;
  397. boolean hasEmpty = false;
  398. for (int i = 0; i < this.size(); i ++) {
  399. ret2 = this.getChild(i).analyzeFirstCharacter(result, options);
  400. if (ret2 == FC_ANY)
  401. break;
  402. else if (ret2 == FC_CONTINUE)
  403. hasEmpty = true;
  404. }
  405. return hasEmpty ? FC_CONTINUE : ret2;
  406. case CONDITION:
  407. int ret3 = this.getChild(0).analyzeFirstCharacter(result, options);
  408. if (this.size() == 1) return FC_CONTINUE;
  409. if (ret3 == FC_ANY) return ret3;
  410. int ret4 = this.getChild(1).analyzeFirstCharacter(result, options);
  411. if (ret4 == FC_ANY) return ret4;
  412. return ret3 == FC_CONTINUE || ret4 == FC_CONTINUE ? FC_CONTINUE : FC_TERMINAL;
  413. case CLOSURE:
  414. case NONGREEDYCLOSURE:
  415. this.getChild(0).analyzeFirstCharacter(result, options);
  416. return FC_CONTINUE;
  417. case EMPTY:
  418. case ANCHOR:
  419. return FC_CONTINUE;
  420. case CHAR:
  421. int ch = this.getChar();
  422. result.addRange(ch, ch);
  423. if (ch < 0x10000 && isSet(options, RegularExpression.IGNORE_CASE)) {
  424. ch = Character.toUpperCase((char)ch);
  425. result.addRange(ch, ch);
  426. ch = Character.toLowerCase((char)ch);
  427. result.addRange(ch, ch);
  428. }
  429. return FC_TERMINAL;
  430. case DOT: // ****
  431. if (isSet(options, RegularExpression.SINGLE_LINE)) {
  432. return FC_CONTINUE; // **** We can not optimize.
  433. } else {
  434. return FC_CONTINUE;
  435. /*
  436. result.addRange(0, RegularExpression.LINE_FEED-1);
  437. result.addRange(RegularExpression.LINE_FEED+1, RegularExpression.CARRIAGE_RETURN-1);
  438. result.addRange(RegularExpression.CARRIAGE_RETURN+1,
  439. RegularExpression.LINE_SEPARATOR-1);
  440. result.addRange(RegularExpression.PARAGRAPH_SEPARATOR+1, UTF16_MAX);
  441. return 1;
  442. */
  443. }
  444. case RANGE:
  445. if (isSet(options, RegularExpression.IGNORE_CASE)) {
  446. result.mergeRanges(((RangeToken)this).getCaseInsensitiveToken());
  447. } else {
  448. result.mergeRanges(this);
  449. }
  450. return FC_TERMINAL;
  451. case NRANGE: // ****
  452. if (isSet(options, RegularExpression.IGNORE_CASE)) {
  453. result.mergeRanges(Token.complementRanges(((RangeToken)this).getCaseInsensitiveToken()));
  454. } else {
  455. result.mergeRanges(Token.complementRanges(this));
  456. }
  457. return FC_TERMINAL;
  458. case INDEPENDENT:
  459. case PAREN:
  460. return this.getChild(0).analyzeFirstCharacter(result, options);
  461. case MODIFIERGROUP:
  462. options |= ((ModifierToken)this).getOptions();
  463. options &= ~((ModifierToken)this).getOptionsMask();
  464. return this.getChild(0).analyzeFirstCharacter(result, options);
  465. case BACKREFERENCE:
  466. result.addRange(0, UTF16_MAX); // **** We can not optimize.
  467. return FC_ANY;
  468. case STRING:
  469. int cha = this.getString().charAt(0);
  470. int ch2;
  471. if (REUtil.isHighSurrogate(cha)
  472. && this.getString().length() >= 2
  473. && REUtil.isLowSurrogate((ch2 = this.getString().charAt(1))))
  474. cha = REUtil.composeFromSurrogates(cha, ch2);
  475. result.addRange(cha, cha);
  476. if (cha < 0x10000 && isSet(options, RegularExpression.IGNORE_CASE)) {
  477. cha = Character.toUpperCase((char)cha);
  478. result.addRange(cha, cha);
  479. cha = Character.toLowerCase((char)cha);
  480. result.addRange(cha, cha);
  481. }
  482. return FC_TERMINAL;
  483. case LOOKAHEAD:
  484. case NEGATIVELOOKAHEAD:
  485. case LOOKBEHIND:
  486. case NEGATIVELOOKBEHIND:
  487. return FC_CONTINUE;
  488. default:
  489. throw new RuntimeException("Token#analyzeHeadCharacter(): Invalid Type: "+this.type);
  490. }
  491. }
  492. private final boolean isShorterThan(Token tok) {
  493. if (tok == null) return false;
  494. /*
  495. int mylength;
  496. if (this.type == STRING) mylength = this.getString().length();
  497. else if (this.type == CHAR) mylength = this.getChar() >= 0x10000 ? 2 : 1;
  498. else throw new RuntimeException("Internal Error: Illegal type: "+this.type);
  499. int otherlength;
  500. if (tok.type == STRING) otherlength = tok.getString().length();
  501. else if (tok.type == CHAR) otherlength = tok.getChar() >= 0x10000 ? 2 : 1;
  502. else throw new RuntimeException("Internal Error: Illegal type: "+tok.type);
  503. */
  504. int mylength;
  505. if (this.type == STRING) mylength = this.getString().length();
  506. else throw new RuntimeException("Internal Error: Illegal type: "+this.type);
  507. int otherlength;
  508. if (tok.type == STRING) otherlength = tok.getString().length();
  509. else throw new RuntimeException("Internal Error: Illegal type: "+tok.type);
  510. return mylength < otherlength;
  511. }
  512. static class FixedStringContainer {
  513. Token token = null;
  514. int options = 0;
  515. FixedStringContainer() {
  516. }
  517. }
  518. final void findFixedString(FixedStringContainer container, int options) {
  519. switch (this.type) {
  520. case CONCAT:
  521. Token prevToken = null;
  522. int prevOptions = 0;
  523. for (int i = 0; i < this.size(); i ++) {
  524. this.getChild(i).findFixedString(container, options);
  525. if (prevToken == null || prevToken.isShorterThan(container.token)) {
  526. prevToken = container.token;
  527. prevOptions = container.options;
  528. }
  529. }
  530. container.token = prevToken;
  531. container.options = prevOptions;
  532. return;
  533. case UNION:
  534. case CLOSURE:
  535. case NONGREEDYCLOSURE:
  536. case EMPTY:
  537. case ANCHOR:
  538. case RANGE:
  539. case DOT:
  540. case NRANGE:
  541. case BACKREFERENCE:
  542. case LOOKAHEAD:
  543. case NEGATIVELOOKAHEAD:
  544. case LOOKBEHIND:
  545. case NEGATIVELOOKBEHIND:
  546. case CONDITION:
  547. container.token = null;
  548. return;
  549. case CHAR: // Ignore CHAR tokens.
  550. container.token = null; // **
  551. return; // **
  552. case STRING:
  553. container.token = this;
  554. container.options = options;
  555. return;
  556. case INDEPENDENT:
  557. case PAREN:
  558. this.getChild(0).findFixedString(container, options);
  559. return;
  560. case MODIFIERGROUP:
  561. options |= ((ModifierToken)this).getOptions();
  562. options &= ~((ModifierToken)this).getOptionsMask();
  563. this.getChild(0).findFixedString(container, options);
  564. return;
  565. default:
  566. throw new RuntimeException("Token#findFixedString(): Invalid Type: "+this.type);
  567. }
  568. }
  569. boolean match(int ch) {
  570. throw new RuntimeException("NFAArrow#match(): Internal error: "+this.type);
  571. }
  572. // ------------------------------------------------------
  573. private final static Hashtable categories = new Hashtable();
  574. private final static Hashtable categories2 = new Hashtable();
  575. private static final String[] categoryNames = {
  576. "Cn", "Lu", "Ll", "Lt", "Lm", "Lo", "Mn", "Me", "Mc", "Nd",
  577. "Nl", "No", "Zs", "Zl", "Zp", "Cc", "Cf", null, "Co", "Cs",
  578. "Pd", "Ps", "Pe", "Pc", "Po", "Sm", "Sc", "Sk", "So", // 28
  579. "Pi", "Pf", // 29, 30
  580. "L", "M", "N", "Z", "C", "P", "S", // 31-37
  581. };
  582. // Schema Rec. {Datatypes} - Punctuation
  583. static final int CHAR_INIT_QUOTE = 29; // Pi - initial quote
  584. static final int CHAR_FINAL_QUOTE = 30; // Pf - final quote
  585. static final int CHAR_LETTER = 31;
  586. static final int CHAR_MARK = 32;
  587. static final int CHAR_NUMBER = 33;
  588. static final int CHAR_SEPARATOR = 34;
  589. static final int CHAR_OTHER = 35;
  590. static final int CHAR_PUNCTUATION = 36;
  591. static final int CHAR_SYMBOL = 37;
  592. //blockNames in UNICODE 3.1 that supported by XML Schema REC
  593. private static final String[] blockNames = {
  594. /*0000..007F;*/ "Basic Latin",
  595. /*0080..00FF;*/ "Latin-1 Supplement",
  596. /*0100..017F;*/ "Latin Extended-A",
  597. /*0180..024F;*/ "Latin Extended-B",
  598. /*0250..02AF;*/ "IPA Extensions",
  599. /*02B0..02FF;*/ "Spacing Modifier Letters",
  600. /*0300..036F;*/ "Combining Diacritical Marks",
  601. /*0370..03FF;*/ "Greek",
  602. /*0400..04FF;*/ "Cyrillic",
  603. /*0530..058F;*/ "Armenian",
  604. /*0590..05FF;*/ "Hebrew",
  605. /*0600..06FF;*/ "Arabic",
  606. /*0700..074F;*/ "Syriac",
  607. /*0780..07BF;*/ "Thaana",
  608. /*0900..097F;*/ "Devanagari",
  609. /*0980..09FF;*/ "Bengali",
  610. /*0A00..0A7F;*/ "Gurmukhi",
  611. /*0A80..0AFF;*/ "Gujarati",
  612. /*0B00..0B7F;*/ "Oriya",
  613. /*0B80..0BFF;*/ "Tamil",
  614. /*0C00..0C7F;*/ "Telugu",
  615. /*0C80..0CFF;*/ "Kannada",
  616. /*0D00..0D7F;*/ "Malayalam",
  617. /*0D80..0DFF;*/ "Sinhala",
  618. /*0E00..0E7F;*/ "Thai",
  619. /*0E80..0EFF;*/ "Lao",
  620. /*0F00..0FFF;*/ "Tibetan",
  621. /*1000..109F;*/ "Myanmar",
  622. /*10A0..10FF;*/ "Georgian",
  623. /*1100..11FF;*/ "Hangul Jamo",
  624. /*1200..137F;*/ "Ethiopic",
  625. /*13A0..13FF;*/ "Cherokee",
  626. /*1400..167F;*/ "Unified Canadian Aboriginal Syllabics",
  627. /*1680..169F;*/ "Ogham",
  628. /*16A0..16FF;*/ "Runic",
  629. /*1780..17FF;*/ "Khmer",
  630. /*1800..18AF;*/ "Mongolian",
  631. /*1E00..1EFF;*/ "Latin Extended Additional",
  632. /*1F00..1FFF;*/ "Greek Extended",
  633. /*2000..206F;*/ "General Punctuation",
  634. /*2070..209F;*/ "Superscripts and Subscripts",
  635. /*20A0..20CF;*/ "Currency Symbols",
  636. /*20D0..20FF;*/ "Combining Marks for Symbols",
  637. /*2100..214F;*/ "Letterlike Symbols",
  638. /*2150..218F;*/ "Number Forms",
  639. /*2190..21FF;*/ "Arrows",
  640. /*2200..22FF;*/ "Mathematical Operators",
  641. /*2300..23FF;*/ "Miscellaneous Technical",
  642. /*2400..243F;*/ "Control Pictures",
  643. /*2440..245F;*/ "Optical Character Recognition",
  644. /*2460..24FF;*/ "Enclosed Alphanumerics",
  645. /*2500..257F;*/ "Box Drawing",
  646. /*2580..259F;*/ "Block Elements",
  647. /*25A0..25FF;*/ "Geometric Shapes",
  648. /*2600..26FF;*/ "Miscellaneous Symbols",
  649. /*2700..27BF;*/ "Dingbats",
  650. /*2800..28FF;*/ "Braille Patterns",
  651. /*2E80..2EFF;*/ "CJK Radicals Supplement",
  652. /*2F00..2FDF;*/ "Kangxi Radicals",
  653. /*2FF0..2FFF;*/ "Ideographic Description Characters",
  654. /*3000..303F;*/ "CJK Symbols and Punctuation",
  655. /*3040..309F;*/ "Hiragana",
  656. /*30A0..30FF;*/ "Katakana",
  657. /*3100..312F;*/ "Bopomofo",
  658. /*3130..318F;*/ "Hangul Compatibility Jamo",
  659. /*3190..319F;*/ "Kanbun",
  660. /*31A0..31BF;*/ "Bopomofo Extended",
  661. /*3200..32FF;*/ "Enclosed CJK Letters and Months",
  662. /*3300..33FF;*/ "CJK Compatibility",
  663. /*3400..4DB5;*/ "CJK Unified Ideographs Extension A",
  664. /*4E00..9FFF;*/ "CJK Unified Ideographs",
  665. /*A000..A48F;*/ "Yi Syllables",
  666. /*A490..A4CF;*/ "Yi Radicals",
  667. /*AC00..D7A3;*/ "Hangul Syllables",
  668. /*E000..F8FF;*/ "Private Use",
  669. /*F900..FAFF;*/ "CJK Compatibility Ideographs",
  670. /*FB00..FB4F;*/ "Alphabetic Presentation Forms",
  671. /*FB50..FDFF;*/ "Arabic Presentation Forms-A",
  672. /*FE20..FE2F;*/ "Combining Half Marks",
  673. /*FE30..FE4F;*/ "CJK Compatibility Forms",
  674. /*FE50..FE6F;*/ "Small Form Variants",
  675. /*FE70..FEFE;*/ "Arabic Presentation Forms-B",
  676. /*FEFF..FEFF;*/ "Specials",
  677. /*FF00..FFEF;*/ "Halfwidth and Fullwidth Forms",
  678. //missing Specials add manually
  679. /*10300..1032F;*/ "Old Italic", // 84
  680. /*10330..1034F;*/ "Gothic",
  681. /*10400..1044F;*/ "Deseret",
  682. /*1D000..1D0FF;*/ "Byzantine Musical Symbols",
  683. /*1D100..1D1FF;*/ "Musical Symbols",
  684. /*1D400..1D7FF;*/ "Mathematical Alphanumeric Symbols",
  685. /*20000..2A6D6;*/ "CJK Unified Ideographs Extension B",
  686. /*2F800..2FA1F;*/ "CJK Compatibility Ideographs Supplement",
  687. /*E0000..E007F;*/ "Tags",
  688. //missing 2 private use add manually
  689. };
  690. //ADD THOSE MANUALLY
  691. //F0000..FFFFD; "Private Use",
  692. //100000..10FFFD; "Private Use"
  693. //FFF0..FFFD; "Specials",
  694. static final String blockRanges =
  695. "\u0000\u007F\u0080\u00FF\u0100\u017F\u0180\u024F\u0250\u02AF\u02B0\u02FF\u0300\u036F"
  696. +"\u0370\u03FF\u0400\u04FF\u0530\u058F\u0590\u05FF\u0600\u06FF\u0700\u074F\u0780\u07BF"
  697. +"\u0900\u097F\u0980\u09FF\u0A00\u0A7F\u0A80\u0AFF\u0B00\u0B7F\u0B80\u0BFF\u0C00\u0C7F\u0C80\u0CFF"
  698. +"\u0D00\u0D7F\u0D80\u0DFF\u0E00\u0E7F\u0E80\u0EFF\u0F00\u0FFF\u1000\u109F\u10A0\u10FF\u1100\u11FF"
  699. +"\u1200\u137F\u13A0\u13FF\u1400\u167F\u1680\u169F\u16A0\u16FF\u1780\u17FF\u1800\u18AF\u1E00\u1EFF"
  700. +"\u1F00\u1FFF\u2000\u206F\u2070\u209F\u20A0\u20CF\u20D0\u20FF\u2100\u214F\u2150\u218F\u2190\u21FF\u2200\u22FF"
  701. +"\u2300\u23FF\u2400\u243F\u2440\u245F\u2460\u24FF\u2500\u257F\u2580\u259F\u25A0\u25FF\u2600\u26FF\u2700\u27BF"
  702. +"\u2800\u28FF\u2E80\u2EFF\u2F00\u2FDF\u2FF0\u2FFF\u3000\u303F\u3040\u309F\u30A0\u30FF\u3100\u312F\u3130\u318F"
  703. +"\u3190\u319F\u31A0\u31BF\u3200\u32FF\u3300\u33FF\u3400\u4DB5\u4E00\u9FFF\uA000\uA48F\uA490\uA4CF"
  704. +"\uAC00\uD7A3\uE000\uF8FF\uF900\uFAFF\uFB00\uFB4F\uFB50\uFDFF"
  705. +"\uFE20\uFE2F\uFE30\uFE4F\uFE50\uFE6F\uFE70\uFEFE\uFEFF\uFEFF\uFF00\uFFEF";
  706. static final int[] nonBMPBlockRanges = {
  707. 0x10300, 0x1032F, // 84
  708. 0x10330, 0x1034F,
  709. 0x10400, 0x1044F,
  710. 0x1D000, 0x1D0FF,
  711. 0x1D100, 0x1D1FF,
  712. 0x1D400, 0x1D7FF,
  713. 0x20000, 0x2A6D6,
  714. 0x2F800, 0x2FA1F,
  715. 0xE0000, 0xE007F
  716. };
  717. private static final int NONBMP_BLOCK_START = 84;
  718. static protected RangeToken getRange(String name, boolean positive) {
  719. if (Token.categories.size() == 0) {
  720. synchronized (Token.categories) {
  721. Token[] ranges = new Token[Token.categoryNames.length];
  722. for (int i = 0; i < ranges.length; i ++) {
  723. ranges[i] = Token.createRange();
  724. }
  725. int type;
  726. for (int i = 0; i < 0x10000; i ++) {
  727. type = Character.getType((char)i);
  728. if (type == Character.START_PUNCTUATION ||
  729. type == Character.END_PUNCTUATION) {
  730. //build table of Pi values
  731. if (i == 0x00AB || i == 0x2018 || i == 0x201B || i == 0x201C ||
  732. i == 0x201F || i == 0x2039) {
  733. type = CHAR_INIT_QUOTE;
  734. }
  735. //build table of Pf values
  736. if (i == 0x00BB || i == 0x2019 || i == 0x201D || i == 0x203A ) {
  737. type = CHAR_FINAL_QUOTE;
  738. }
  739. }
  740. ranges[type].addRange(i, i);
  741. switch (type) {
  742. case Character.UPPERCASE_LETTER:
  743. case Character.LOWERCASE_LETTER:
  744. case Character.TITLECASE_LETTER:
  745. case Character.MODIFIER_LETTER:
  746. case Character.OTHER_LETTER:
  747. type = CHAR_LETTER;
  748. break;
  749. case Character.NON_SPACING_MARK:
  750. case Character.COMBINING_SPACING_MARK:
  751. case Character.ENCLOSING_MARK:
  752. type = CHAR_MARK;
  753. break;
  754. case Character.DECIMAL_DIGIT_NUMBER:
  755. case Character.LETTER_NUMBER:
  756. case Character.OTHER_NUMBER:
  757. type = CHAR_NUMBER;
  758. break;
  759. case Character.SPACE_SEPARATOR:
  760. case Character.LINE_SEPARATOR:
  761. case Character.PARAGRAPH_SEPARATOR:
  762. type = CHAR_SEPARATOR;
  763. break;
  764. case Character.CONTROL:
  765. case Character.FORMAT:
  766. case Character.SURROGATE:
  767. case Character.PRIVATE_USE:
  768. case Character.UNASSIGNED:
  769. type = CHAR_OTHER;
  770. break;
  771. case Character.CONNECTOR_PUNCTUATION:
  772. case Character.DASH_PUNCTUATION:
  773. case Character.START_PUNCTUATION:
  774. case Character.END_PUNCTUATION:
  775. case CHAR_INIT_QUOTE:
  776. case CHAR_FINAL_QUOTE:
  777. case Character.OTHER_PUNCTUATION:
  778. type = CHAR_PUNCTUATION;
  779. break;
  780. case Character.MATH_SYMBOL:
  781. case Character.CURRENCY_SYMBOL:
  782. case Character.MODIFIER_SYMBOL:
  783. case Character.OTHER_SYMBOL:
  784. type = CHAR_SYMBOL;
  785. break;
  786. default:
  787. throw new RuntimeException("com.sun.org.apache.xerces.internal.utils.regex.Token#getRange(): Unknown Unicode category: "+type);
  788. }
  789. ranges[type].addRange(i, i);
  790. } // for all characters
  791. ranges[Character.UNASSIGNED].addRange(0x10000, Token.UTF16_MAX);
  792. for (int i = 0; i < ranges.length; i ++) {
  793. if (Token.categoryNames[i] != null) {
  794. if (i == Character.UNASSIGNED) { // Unassigned
  795. ranges[i].addRange(0x10000, Token.UTF16_MAX);
  796. }
  797. Token.categories.put(Token.categoryNames[i], ranges[i]);
  798. Token.categories2.put(Token.categoryNames[i],
  799. Token.complementRanges(ranges[i]));
  800. }
  801. }
  802. //REVISIT: do we really need to support block names as in Unicode 3.1
  803. // or we can just create all the names in IsBLOCKNAME format (XML Schema REC)?
  804. //
  805. StringBuffer buffer = new StringBuffer(50);
  806. for (int i = 0; i < Token.blockNames.length; i ++) {
  807. Token r1 = Token.createRange();
  808. int location;
  809. if (i < NONBMP_BLOCK_START) {
  810. location = i*2;
  811. int rstart = Token.blockRanges.charAt(location);
  812. int rend = Token.blockRanges.charAt(location+1);
  813. //DEBUGING
  814. //System.out.println(n+" " +Integer.toHexString(rstart)
  815. // +"-"+ Integer.toHexString(rend));
  816. r1.addRange(rstart, rend);
  817. } else {
  818. location = (i - NONBMP_BLOCK_START) * 2;
  819. r1.addRange(Token.nonBMPBlockRanges[location],
  820. Token.nonBMPBlockRanges[location + 1]);
  821. }
  822. String n = Token.blockNames[i];
  823. if (n.equals("Specials"))
  824. r1.addRange(0xfff0, 0xfffd);
  825. if (n.equals("Private Use")) {
  826. r1.addRange(0xF0000,0xFFFFD);
  827. r1.addRange(0x100000,0x10FFFD);
  828. }
  829. Token.categories.put(n, r1);
  830. Token.categories2.put(n, Token.complementRanges(r1));
  831. buffer.setLength(0);
  832. buffer.append("Is");
  833. if (n.indexOf(' ') >= 0) {
  834. for (int ci = 0; ci < n.length(); ci ++)
  835. if (n.charAt(ci) != ' ') buffer.append((char)n.charAt(ci));
  836. }
  837. else {
  838. buffer.append(n);
  839. }
  840. Token.setAlias(buffer.toString(), n, true);
  841. }
  842. // TR#18 1.2
  843. Token.setAlias("ASSIGNED", "Cn", false);
  844. Token.setAlias("UNASSIGNED", "Cn", true);
  845. Token all = Token.createRange();
  846. all.addRange(0, Token.UTF16_MAX);
  847. Token.categories.put("ALL", all);
  848. Token.categories2.put("ALL", Token.complementRanges(all));
  849. Token.registerNonXS("ASSIGNED");
  850. Token.registerNonXS("UNASSIGNED");
  851. Token.registerNonXS("ALL");
  852. Token isalpha = Token.createRange();
  853. isalpha.mergeRanges(ranges[Character.UPPERCASE_LETTER]); // Lu
  854. isalpha.mergeRanges(ranges[Character.LOWERCASE_LETTER]); // Ll
  855. isalpha.mergeRanges(ranges[Character.OTHER_LETTER]); // Lo
  856. Token.categories.put("IsAlpha", isalpha);
  857. Token.categories2.put("IsAlpha", Token.complementRanges(isalpha));
  858. Token.registerNonXS("IsAlpha");
  859. Token isalnum = Token.createRange();
  860. isalnum.mergeRanges(isalpha); // Lu Ll Lo
  861. isalnum.mergeRanges(ranges[Character.DECIMAL_DIGIT_NUMBER]); // Nd
  862. Token.categories.put("IsAlnum", isalnum);
  863. Token.categories2.put("IsAlnum", Token.complementRanges(isalnum));
  864. Token.registerNonXS("IsAlnum");
  865. Token isspace = Token.createRange();
  866. isspace.mergeRanges(Token.token_spaces);
  867. isspace.mergeRanges(ranges[CHAR_SEPARATOR]); // Z
  868. Token.categories.put("IsSpace", isspace);
  869. Token.categories2.put("IsSpace", Token.complementRanges(isspace));
  870. Token.registerNonXS("IsSpace");
  871. Token isword = Token.createRange();
  872. isword.mergeRanges(isalnum); // Lu Ll Lo Nd
  873. isword.addRange('_', '_');
  874. Token.categories.put("IsWord", isword);
  875. Token.categories2.put("IsWord", Token.complementRanges(isword));
  876. Token.registerNonXS("IsWord");
  877. Token isascii = Token.createRange();
  878. isascii.addRange(0, 127);
  879. Token.categories.put("IsASCII", isascii);
  880. Token.categories2.put("IsASCII", Token.complementRanges(isascii));
  881. Token.registerNonXS("IsASCII");
  882. Token isnotgraph = Token.createRange();
  883. isnotgraph.mergeRanges(ranges[CHAR_OTHER]);
  884. isnotgraph.addRange(' ', ' ');
  885. Token.categories.put("IsGraph", Token.complementRanges(isnotgraph));
  886. Token.categories2.put("IsGraph", isnotgraph);
  887. Token.registerNonXS("IsGraph");
  888. Token isxdigit = Token.createRange();
  889. isxdigit.addRange('0', '9');
  890. isxdigit.addRange('A', 'F');
  891. isxdigit.addRange('a', 'f');
  892. Token.categories.put("IsXDigit", Token.complementRanges(isxdigit));
  893. Token.categories2.put("IsXDigit", isxdigit);
  894. Token.registerNonXS("IsXDigit");
  895. Token.setAlias("IsDigit", "Nd", true);
  896. Token.setAlias("IsUpper", "Lu", true);
  897. Token.setAlias("IsLower", "Ll", true);
  898. Token.setAlias("IsCntrl", "C", true);
  899. Token.setAlias("IsPrint", "C", false);
  900. Token.setAlias("IsPunct", "P", true);
  901. Token.registerNonXS("IsDigit");
  902. Token.registerNonXS("IsUpper");
  903. Token.registerNonXS("IsLower");
  904. Token.registerNonXS("IsCntrl");
  905. Token.registerNonXS("IsPrint");
  906. Token.registerNonXS("IsPunct");
  907. Token.setAlias("alpha", "IsAlpha", true);
  908. Token.setAlias("alnum", "IsAlnum", true);
  909. Token.setAlias("ascii", "IsASCII", true);
  910. Token.setAlias("cntrl", "IsCntrl", true);
  911. Token.setAlias("digit", "IsDigit", true);
  912. Token.setAlias("graph", "IsGraph", true);
  913. Token.setAlias("lower", "IsLower", true);
  914. Token.setAlias("print", "IsPrint", true);
  915. Token.setAlias("punct", "IsPunct", true);
  916. Token.setAlias("space", "IsSpace", true);
  917. Token.setAlias("upper", "IsUpper", true);
  918. Token.setAlias("word", "IsWord", true); // Perl extension
  919. Token.setAlias("xdigit", "IsXDigit", true);
  920. Token.registerNonXS("alpha");
  921. Token.registerNonXS("alnum");
  922. Token.registerNonXS("ascii");
  923. Token.registerNonXS("cntrl");
  924. Token.registerNonXS("digit");
  925. Token.registerNonXS("graph");
  926. Token.registerNonXS("lower");
  927. Token.registerNonXS("print");
  928. Token.registerNonXS("punct");
  929. Token.registerNonXS("space");
  930. Token.registerNonXS("upper");
  931. Token.registerNonXS("word");
  932. Token.registerNonXS("xdigit");
  933. } // synchronized
  934. } // if null
  935. RangeToken tok = positive ? (RangeToken)Token.categories.get(name)
  936. : (RangeToken)Token.categories2.get(name);
  937. //if (tok == null) System.out.println(name);
  938. return tok;
  939. }
  940. static protected RangeToken getRange(String name, boolean positive, boolean xs) {
  941. RangeToken range = Token.getRange(name, positive);
  942. if (xs && range != null && Token.isRegisterNonXS(name))
  943. range = null;
  944. return range;
  945. }
  /**
   * Names recorded by registerNonXS(). Stays null until the first name is
   * registered; each name is stored as both key and value.
   */
  static Hashtable nonxs = null;
  947. /**
  948. * This method is called by only getRange().
  949. * So this method need not MT-safe.
  950. */
  951. static protected void registerNonXS(String name) {
  952. if (Token.nonxs == null)
  953. Token.nonxs = new Hashtable();
  954. Token.nonxs.put(name, name);
  955. }
  956. static protected boolean isRegisterNonXS(String name) {
  957. if (Token.nonxs == null)
  958. return false;
  959. //DEBUG
  960. //System.err.println("isRegisterNonXS: "+name);
  961. return Token.nonxs.containsKey(name);
  962. }
  963. private static void setAlias(String newName, String name, boolean positive) {
  964. Token t1 = (Token)Token.categories.get(name);
  965. Token t2 = (Token)Token.categories2.get(name);
  966. if (positive) {
  967. Token.categories.put(newName, t1);
  968. Token.categories2.put(newName, t2);
  969. } else {
  970. Token.categories2.put(newName, t1);
  971. Token.categories.put(newName, t2);
  972. }
  973. }
  974. // ------------------------------------------------------
  /**
   * One character per virama-like sign. getGraphemePattern() loops over this
   * string while it builds its virama range. The trailing comment on each
   * entry looks like the matching record from the Unicode character database
   * (name, category Mn, combining class 9, ...).
   */
  static final String viramaString =
      "\u094D"// ;DEVANAGARI SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
      +"\u09CD"//;BENGALI SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
      +"\u0A4D"//;GURMUKHI SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
      +"\u0ACD"//;GUJARATI SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
      +"\u0B4D"//;ORIYA SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
      +"\u0BCD"//;TAMIL SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
      +"\u0C4D"//;TELUGU SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
      +"\u0CCD"//;KANNADA SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
      +"\u0D4D"//;MALAYALAM SIGN VIRAMA;Mn;9;ON;;;;;N;;;;;
      +"\u0E3A"//;THAI CHARACTER PHINTHU;Mn;9;ON;;;;;N;THAI VOWEL SIGN PHINTHU;;;;
      +"\u0F84";//;TIBETAN MARK HALANTA;Mn;9;ON;;;;;N;TIBETAN VIRAMA;;;;
  987. static private Token token_grapheme = null;
  988. static synchronized Token getGraphemePattern() {
  989. if (Token.token_grapheme != null)
  990. return Token.token_grapheme;
  991. Token base_char = Token.createRange(); // [{ASSIGNED}]-[{M},{C}]
  992. base_char.mergeRanges(Token.getRange("ASSIGNED", true));
  993. base_char.subtractRanges(Token.getRange("M", true));
  994. base_char.subtractRanges(Token.getRange("C", true));
  995. Token virama = Token.createRange();
  996. for (int i = 0; i < Token.viramaString.length(); i ++) {
  997. int ch = viramaString.charAt(i);
  998. virama.addRange(i, i);
  999. }
  1000. Token combiner_wo_virama = Token.createRange();
  1001. combiner_wo_virama.mergeRanges(Token.getRange("M", true));
  1002. combiner_wo_virama.addRange(0x1160, 0x11ff); // hangul_medial and hangul_final
  1003. combiner_wo_virama.addRange(0xff9e, 0xff9f); // extras
  1004. Token left = Token.createUnion(); // base_char?
  1005. left.addChild(base_char);
  1006. left.addChild(Token.token_empty);
  1007. Token foo = Token.createUnion();
  1008. foo.addChild(Token.createConcat(virama, Token.getRange("L", true)));
  1009. foo.addChild(combiner_wo_virama);
  1010. foo = Token.createClosure(foo);
  1011. foo = Token.createConcat(left, foo);
  1012. Token.token_grapheme = foo;
  1013. return Token.token_grapheme;
  1014. }
  1015. /**
  1016. * Combing Character Sequence in Perl 5.6.
  1017. */
  1018. static private Token token_ccs = null;
  1019. static synchronized Token getCombiningCharacterSequence() {
  1020. if (Token.token_ccs != null)
  1021. return Token.token_ccs;
  1022. Token foo = Token.createClosure(Token.getRange("M", true)); // \pM*
  1023. foo = Token.createConcat(Token.getRange("M", false), foo); // \PM + \pM*
  1024. Token.token_ccs = foo;
  1025. return Token.token_ccs;
  1026. }
  1027. // ------------------------------------------------------
  1028. // ------------------------------------------------------
  1029. /**
  1030. * This class represents a node in parse tree.
  1031. */
  1032. static class StringToken extends Token implements java.io.Serializable {
  1033. String string;
  1034. int refNumber;
  1035. StringToken(int type, String str, int n) {
  1036. super(type);
  1037. this.string = str;
  1038. this.refNumber = n;
  1039. }
  1040. int getReferenceNumber() { // for STRING
  1041. return this.refNumber;
  1042. }
  1043. String getString() { // for STRING
  1044. return this.string;
  1045. }
  1046. public String toString(int options) {
  1047. if (this.type == BACKREFERENCE)
  1048. return "\\"+this.refNumber;
  1049. else
  1050. return REUtil.quoteMeta(this.string);
  1051. }
  1052. }
  1053. /**
  1054. * This class represents a node in parse tree.
  1055. */
  1056. static class ConcatToken extends Token implements java.io.Serializable {
  1057. Token child;
  1058. Token child2;
  1059. ConcatToken(Token t1, Token t2) {
  1060. super(Token.CONCAT);
  1061. this.child = t1;
  1062. this.child2 = t2;
  1063. }
  1064. int size() {
  1065. return 2;
  1066. }
  1067. Token getChild(int index) {
  1068. return index == 0 ? this.child : this.child2;
  1069. }
  1070. public String toString(int options) {
  1071. String ret;
  1072. if (this.child2.type == CLOSURE && this.child2.getChild(0) == this.child) {
  1073. ret = this.child.toString(options)+"+";
  1074. } else if (this.child2.type == NONGREEDYCLOSURE && this.child2.getChild(0) == this.child) {
  1075. ret = this.child.toString(options)+"+?";
  1076. } else
  1077. ret = this.child.toString(options)+this.child2.toString(options);
  1078. return ret;
  1079. }
  1080. }
  1081. /**
  1082. * This class represents a node in parse tree.
  1083. */
  1084. static class CharToken extends Token implements java.io.Serializable {
  1085. int chardata;
  1086. CharToken(int type, int ch) {
  1087. super(type);
  1088. this.chardata = ch;
  1089. }
  1090. int getChar() {
  1091. return this.chardata;
  1092. }
  1093. public String toString(int options) {
  1094. String ret;
  1095. switch (this.type) {
  1096. case CHAR:
  1097. switch (this.chardata) {
  1098. case '|': case '*': case '+': case '?':
  1099. case '(': case ')': case '.': case '[':
  1100. case '{': case '\\':
  1101. ret = "\\"+(char)this.chardata;
  1102. break;
  1103. case '\f': ret = "\\f"; break;
  1104. case '\n': ret = "\\n"; break;
  1105. case '\r': ret = "\\r"; break;
  1106. case '\t': ret = "\\t"; break;
  1107. case 0x1b: ret = "\\e"; break;
  1108. //case 0x0b: ret = "\\v"; break;
  1109. default:
  1110. if (this.chardata >= 0x10000) {
  1111. String pre = "0"+Integer.toHexString(this.chardata);
  1112. ret = "\\v"+pre.substring(pre.length()-6, pre.length());
  1113. } else
  1114. ret = ""+(char)this.chardata;
  1115. }
  1116. break;
  1117. case ANCHOR:
  1118. if (this == Token.token_linebeginning || this == Token.token_lineend)
  1119. ret = ""+(char)this.chardata;
  1120. else
  1121. ret = "\\"+(char)this.chardata;
  1122. break;
  1123. default:
  1124. ret = null;
  1125. }
  1126. return ret;
  1127. }
  1128. boolean match(int ch) {
  1129. if (this.type == CHAR) {
  1130. return ch == this.chardata;
  1131. } else
  1132. throw new RuntimeException("NFAArrow#match(): Internal error: "+this.type);
  1133. }
  1134. }
  1135. /**
  1136. * This class represents a node in parse tree.
  1137. */
  1138. static class ClosureToken extends Token implements java.io.Serializable {
  1139. int min;
  1140. int max;
  1141. Token child;
  1142. ClosureToken(int type, Token tok) {
  1143. super(type);
  1144. this.child = tok;
  1145. this.setMin(-1);
  1146. this.setMax(-1);
  1147. }
  1148. int size() {
  1149. return 1;
  1150. }
  1151. Token getChild(int index) {
  1152. return this.child;
  1153. }
  1154. final void setMin(int min) {
  1155. this.min = min;
  1156. }
  1157. final void setMax(int max) {
  1158. this.max = max;
  1159. }
  1160. final int getMin() {
  1161. return this.min;
  1162. }
  1163. final int getMax() {
  1164. return this.max;
  1165. }
  1166. public String toString(int options) {
  1167. String ret;
  1168. if (this.type == CLOSURE) {
  1169. if (this.getMin() < 0 && this.getMax() < 0) {
  1170. ret = this.child.toString(options)+"*";
  1171. } else if (this.getMin() == this.getMax()) {
  1172. ret = this.child.toString(options)+"{"+this.getMin()+"}";
  1173. } else if (this.getMin() >= 0 && this.getMax() >= 0) {
  1174. ret = this.child.toString(options)+"{"+this.getMin()+","+this.getMax()+"}";
  1175. } else if (this.getMin() >= 0 && this.getMax() < 0) {
  1176. ret = this.child.toString(options)+"{"+this.getMin()+",}";
  1177. } else
  1178. throw new RuntimeException("Token#toString(): CLOSURE "
  1179. +this.getMin()+", "+this.getMax());
  1180. } else {
  1181. if (this.getMin() < 0 && this.getMax() < 0) {
  1182. ret = this.child.toString(options)+"*?";
  1183. } else if (this.getMin() == this.getMax()) {
  1184. ret = this.child.toString(options)+"{"+this.getMin()+"}?";
  1185. } else if (this.getMin() >= 0 && this.getMax() >= 0) {
  1186. ret = this.child.toString(options)+"{"+this.getMin()+","+this.getMax()+"}?";
  1187. } else if (this.getMin() >= 0 && this.getMax() < 0) {
  1188. ret = this.child.toString(options)+"{"+this.getMin()+",}?";
  1189. } else
  1190. throw new RuntimeException("Token#toString(): NONGREEDYCLOSURE "
  1191. +this.getMin()+", "+this.getMax());
  1192. }
  1193. return ret;
  1194. }
  1195. }
  1196. /**
  1197. * This class represents a node in parse tree.
  1198. */
  1199. static class ParenToken extends Token implements java.io.Serializable {
  1200. Token child;
  1201. int parennumber;
  1202. ParenToken(int type, Token tok, int paren) {
  1203. super(type);
  1204. this.child = tok;
  1205. this.parennumber = paren;
  1206. }
  1207. int size() {
  1208. return 1;
  1209. }
  1210. Token getChild(int index) {
  1211. return this.child;
  1212. }
  1213. int getParenNumber() {
  1214. return this.parennumber;
  1215. }
  1216. public String toString(int options) {
  1217. String ret = null;
  1218. switch (this.type) {
  1219. case PAREN:
  1220. if (this.parennumber == 0) {
  1221. ret = "(?:"+this.child.toString(options)+")";
  1222. } else {
  1223. ret = "("+this.child.toString(options)+")";
  1224. }
  1225. break;
  1226. case LOOKAHEAD:
  1227. ret = "(?="+this.child.toString(options)+")";
  1228. break;
  1229. case NEGATIVELOOKAHEAD:
  1230. ret = "(?!"+this.child.toString(options)+")";
  1231. break;
  1232. case LOOKBEHIND:
  1233. ret = "(?<="+this.child.toString(options)+")";
  1234. break;
  1235. case NEGATIVELOOKBEHIND:
  1236. ret = "(?<!"+this.child.toString(options)+")";
  1237. break;
  1238. case INDEPENDENT:
  1239. ret = "(?>"+this.child.toString(options)+")";
  1240. break;
  1241. }
  1242. return ret;
  1243. }
  1244. }
  1245. /**
  1246. * (?(condition)yes-pattern|no-pattern)
  1247. */
  1248. static class ConditionToken extends Token implements java.io.Serializable {
  1249. int refNumber;
  1250. Token condition;
  1251. Token yes;
  1252. Token no;
  1253. ConditionToken(int refno, Token cond, Token yespat, Token nopat) {
  1254. super(Token.CONDITION);
  1255. this.refNumber = refno;
  1256. this.condition = cond;
  1257. this.yes = yespat;
  1258. this.no = nopat;
  1259. }
  1260. int size() {
  1261. return this.no == null ? 1 : 2;
  1262. }
  1263. Token getChild(int index) {
  1264. if (index == 0) return this.yes;
  1265. if (index == 1) return this.no;
  1266. throw new RuntimeException("Internal Error: "+index);
  1267. }
  1268. public String toString(int options) {
  1269. String ret;
  1270. if (refNumber > 0) {
  1271. ret = "(?("+refNumber+")";
  1272. } else if (this.condition.type == Token.ANCHOR) {
  1273. ret = "(?("+this.condition+")";
  1274. } else {
  1275. ret = "(?"+this.condition;
  1276. }
  1277. if (this.no == null) {
  1278. ret += this.yes+")";
  1279. } else {
  1280. ret += this.yes+"|"+this.no+")";
  1281. }
  1282. return ret;
  1283. }
  1284. }
  1285. /**
  1286. * (ims-ims: .... )
  1287. */
  1288. static class ModifierToken extends Token implements java.io.Serializable {
  1289. Token child;
  1290. int add;
  1291. int mask;
  1292. ModifierToken(Token tok, int add, int mask) {
  1293. super(Token.MODIFIERGROUP);
  1294. this.child = tok;
  1295. this.add = add;
  1296. this.mask = mask;
  1297. }
  1298. int size() {
  1299. return 1;
  1300. }
  1301. Token getChild(int index) {
  1302. return this.child;
  1303. }
  1304. int getOptions() {
  1305. return this.add;
  1306. }
  1307. int getOptionsMask() {
  1308. return this.mask;
  1309. }
  1310. public String toString(int options) {
  1311. return "(?"
  1312. +(this.add == 0 ? "" : REUtil.createOptionString(this.add))
  1313. +(this.mask == 0 ? "" : REUtil.createOptionString(this.mask))
  1314. +":"
  1315. +this.child.toString(options)
  1316. +")";
  1317. }
  1318. }
  1319. /**
  1320. * This class represents a node in parse tree.
  1321. * for UNION or CONCAT.
  1322. */
  1323. static class UnionToken extends Token implements java.io.Serializable {
  1324. Vector children;
  1325. UnionToken(int type) {
  1326. super(type);
  1327. }
  1328. void addChild(Token tok) {
  1329. if (tok == null) return;
  1330. if (this.children == null) this.children = new Vector();
  1331. if (this.type == UNION) {
  1332. this.children.addElement(tok);
  1333. return;
  1334. }
  1335. // This is CONCAT, and new child is CONCAT.
  1336. if (tok.type == CONCAT) {
  1337. for (int i = 0; i < tok.size(); i ++)
  1338. this.addChild(tok.getChild(i)); // Recursion
  1339. return;
  1340. }
  1341. int size = this.children.size();
  1342. if (size == 0) {
  1343. this.children.addElement(tok);
  1344. return;
  1345. }
  1346. Token previous = (Token)this.children.elementAt(size-1);
  1347. if (!((previous.type == CHAR || previous.type == STRING)
  1348. && (tok.type == CHAR || tok.type == STRING))) {
  1349. this.children.addElement(tok);
  1350. return;
  1351. }
  1352. //System.err.println("Merge '"+previous+"' and '"+tok+"'.");
  1353. StringBuffer buffer;
  1354. int nextMaxLength = (tok.type == CHAR ? 2 : tok.getString().length());
  1355. if (previous.type == CHAR) { // Replace previous token by STRING
  1356. buffer = new StringBuffer(2 + nextMaxLength);
  1357. int ch = previous.getChar();
  1358. if (ch >= 0x10000)
  1359. buffer.append(REUtil.decomposeToSurrogates(ch));
  1360. else
  1361. buffer.append((char)ch);
  1362. previous = Token.createString(null);
  1363. this.children.setElementAt(previous, size-1);
  1364. } else { // STRING
  1365. buffer = new StringBuffer(previous.getString().length() + nextMaxLength);
  1366. buffer.append(previous.getString());
  1367. }
  1368. if (tok.type == CHAR) {
  1369. int ch = tok.getChar();
  1370. if (ch >= 0x10000)
  1371. buffer.append(REUtil.decomposeToSurrogates(ch));
  1372. else
  1373. buffer.append((char)ch);
  1374. } else {
  1375. buffer.append(tok.getString());
  1376. }
  1377. ((StringToken)previous).string = new String(buffer);
  1378. }
  1379. int size() {
  1380. return this.children == null ? 0 : this.children.size();
  1381. }
  1382. Token getChild(int index) {
  1383. return (Token)this.children.elementAt(index);
  1384. }
  1385. public String toString(int options) {
  1386. String ret;
  1387. if (this.type == CONCAT) {
  1388. if (this.children.size() == 2) {
  1389. Token ch = this.getChild(0);
  1390. Token ch2 = this.getChild(1);
  1391. if (ch2.type == CLOSURE && ch2.getChild(0) == ch) {
  1392. ret = ch.toString(options)+"+";
  1393. } else if (ch2.type == NONGREEDYCLOSURE && ch2.getChild(0) == ch) {
  1394. ret = ch.toString(options)+"+?";
  1395. } else
  1396. ret = ch.toString(options)+ch2.toString(options);
  1397. } else {
  1398. StringBuffer sb = new StringBuffer();
  1399. for (int i = 0; i < this.children.size(); i ++) {
  1400. sb.append(((Token)this.children.elementAt(i)).toString(options));
  1401. }
  1402. ret = new String(sb);
  1403. }
  1404. return ret;
  1405. }
  1406. if (this.children.size() == 2 && this.getChild(1).type == EMPTY) {
  1407. ret = this.getChild(0).toString(options)+"?";
  1408. } else if (this.children.size() == 2
  1409. && this.getChild(0).type == EMPTY) {
  1410. ret = this.getChild(1).toString(options)+"??";
  1411. } else {
  1412. StringBuffer sb = new StringBuffer();
  1413. sb.append(((Token)this.children.elementAt(0)).toString(options));
  1414. for (int i = 1; i < this.children.size(); i ++) {
  1415. sb.append((char)'|');
  1416. sb.append(((Token)this.children.elementAt(i)).toString(options));
  1417. }
  1418. ret = new String(sb);
  1419. }
  1420. return ret;
  1421. }
  1422. }
  1423. }