1. /*
  2. * The Apache Software License, Version 1.1
  3. *
  4. *
  5. * Copyright (c) 1999-2002 The Apache Software Foundation. All rights
  6. * reserved.
  7. *
  8. * Redistribution and use in source and binary forms, with or without
  9. * modification, are permitted provided that the following conditions
  10. * are met:
  11. *
  12. * 1. Redistributions of source code must retain the above copyright
  13. * notice, this list of conditions and the following disclaimer.
  14. *
  15. * 2. Redistributions in binary form must reproduce the above copyright
  16. * notice, this list of conditions and the following disclaimer in
  17. * the documentation and/or other materials provided with the
  18. * distribution.
  19. *
  20. * 3. The end-user documentation included with the redistribution,
  21. * if any, must include the following acknowledgment:
  22. * "This product includes software developed by the
  23. * Apache Software Foundation (http://www.apache.org/)."
  24. * Alternately, this acknowledgment may appear in the software itself,
  25. * if and wherever such third-party acknowledgments normally appear.
  26. *
  27. * 4. The names "Xerces" and "Apache Software Foundation" must
  28. * not be used to endorse or promote products derived from this
  29. * software without prior written permission. For written
  30. * permission, please contact apache@apache.org.
  31. *
  32. * 5. Products derived from this software may not be called "Apache",
  33. * nor may "Apache" appear in their name, without prior written
  34. * permission of the Apache Software Foundation.
  35. *
  36. * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
  37. * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  38. * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  39. * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
  40. * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  41. * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  42. * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
  43. * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  44. * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  45. * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  46. * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  47. * SUCH DAMAGE.
  48. * ====================================================================
  49. *
  50. * This software consists of voluntary contributions made by many
  51. * individuals on behalf of the Apache Software Foundation and was
  52. * originally based on software copyright (c) 1999, International
  53. * Business Machines, Inc., http://www.apache.org. For more
  54. * information on the Apache Software Foundation, please see
  55. * <http://www.apache.org/>.
  56. */
  57. package com.sun.org.apache.xerces.internal.impl.xpath.regex;
  58. /**
  59. * This class represents a character class such as [a-z] or a period.
  60. *
  61. * @version $Id: RangeToken.java,v 1.4 2002/08/09 15:18:17 neilg Exp $
  62. */
  63. final class RangeToken extends Token implements java.io.Serializable {
  64. int[] ranges;
  65. boolean sorted;
  66. boolean compacted;
  67. RangeToken icaseCache = null;
  68. int[] map = null;
  69. int nonMapIndex;
  70. RangeToken(int type) {
  71. super(type);
  72. this.setSorted(false);
  73. }
  74. // for RANGE or NRANGE
  75. protected void addRange(int start, int end) {
  76. this.icaseCache = null;
  77. //System.err.println("Token#addRange(): "+start+" "+end);
  78. int r1, r2;
  79. if (start <= end) {
  80. r1 = start;
  81. r2 = end;
  82. } else {
  83. r1 = end;
  84. r2 = start;
  85. }
  86. int pos = 0;
  87. if (this.ranges == null) {
  88. this.ranges = new int[2];
  89. this.ranges[0] = r1;
  90. this.ranges[1] = r2;
  91. this.setSorted(true);
  92. } else {
  93. pos = this.ranges.length;
  94. if (this.ranges[pos-1]+1 == r1) {
  95. this.ranges[pos-1] = r2;
  96. return;
  97. }
  98. int[] temp = new int[pos+2];
  99. System.arraycopy(this.ranges, 0, temp, 0, pos);
  100. this.ranges = temp;
  101. if (this.ranges[pos-1] >= r1)
  102. this.setSorted(false);
  103. this.ranges[pos++] = r1;
  104. this.ranges[pos] = r2;
  105. if (!this.sorted)
  106. this.sortRanges();
  107. }
  108. }
  109. private final boolean isSorted() {
  110. return this.sorted;
  111. }
  112. private final void setSorted(boolean sort) {
  113. this.sorted = sort;
  114. if (!sort) this.compacted = false;
  115. }
  116. private final boolean isCompacted() {
  117. return this.compacted;
  118. }
  119. private final void setCompacted() {
  120. this.compacted = true;
  121. }
  122. protected void sortRanges() {
  123. if (this.isSorted())
  124. return;
  125. if (this.ranges == null)
  126. return;
  127. //System.err.println("Do sorting: "+this.ranges.length);
  128. // Bubble sort
  129. // Why? -- In many cases,
  130. // this.ranges has few elements.
  131. for (int i = this.ranges.length-4; i >= 0; i -= 2) {
  132. for (int j = 0; j <= i; j += 2) {
  133. if (this.ranges[j] > this.ranges[j+2]
  134. || this.ranges[j] == this.ranges[j+2] && this.ranges[j+1] > this.ranges[j+3]) {
  135. int tmp;
  136. tmp = this.ranges[j+2];
  137. this.ranges[j+2] = this.ranges[j];
  138. this.ranges[j] = tmp;
  139. tmp = this.ranges[j+3];
  140. this.ranges[j+3] = this.ranges[j+1];
  141. this.ranges[j+1] = tmp;
  142. }
  143. }
  144. }
  145. this.setSorted(true);
  146. }
  147. /**
  148. * this.ranges is sorted.
  149. */
  150. protected void compactRanges() {
  151. boolean DEBUG = false;
  152. if (this.ranges == null || this.ranges.length <= 2)
  153. return;
  154. if (this.isCompacted())
  155. return;
  156. int base = 0; // Index of writing point
  157. int target = 0; // Index of processing point
  158. while (target < this.ranges.length) {
  159. if (base != target) {
  160. this.ranges[base] = this.ranges[target++];
  161. this.ranges[base+1] = this.ranges[target++];
  162. } else
  163. target += 2;
  164. int baseend = this.ranges[base+1];
  165. while (target < this.ranges.length) {
  166. if (baseend+1 < this.ranges[target])
  167. break;
  168. if (baseend+1 == this.ranges[target]) {
  169. if (DEBUG)
  170. System.err.println("Token#compactRanges(): Compaction: ["+this.ranges[base]
  171. +", "+this.ranges[base+1]
  172. +"], ["+this.ranges[target]
  173. +", "+this.ranges[target+1]
  174. +"] -> ["+this.ranges[base]
  175. +", "+this.ranges[target+1]
  176. +"]");
  177. this.ranges[base+1] = this.ranges[target+1];
  178. baseend = this.ranges[base+1];
  179. target += 2;
  180. } else if (baseend >= this.ranges[target+1]) {
  181. if (DEBUG)
  182. System.err.println("Token#compactRanges(): Compaction: ["+this.ranges[base]
  183. +", "+this.ranges[base+1]
  184. +"], ["+this.ranges[target]
  185. +", "+this.ranges[target+1]
  186. +"] -> ["+this.ranges[base]
  187. +", "+this.ranges[base+1]
  188. +"]");
  189. target += 2;
  190. } else if (baseend < this.ranges[target+1]) {
  191. if (DEBUG)
  192. System.err.println("Token#compactRanges(): Compaction: ["+this.ranges[base]
  193. +", "+this.ranges[base+1]
  194. +"], ["+this.ranges[target]
  195. +", "+this.ranges[target+1]
  196. +"] -> ["+this.ranges[base]
  197. +", "+this.ranges[target+1]
  198. +"]");
  199. this.ranges[base+1] = this.ranges[target+1];
  200. baseend = this.ranges[base+1];
  201. target += 2;
  202. } else {
  203. throw new RuntimeException("Token#compactRanges(): Internel Error: ["
  204. +this.ranges[base]
  205. +","+this.ranges[base+1]
  206. +"] ["+this.ranges[target]
  207. +","+this.ranges[target+1]+"]");
  208. }
  209. } // while
  210. base += 2;
  211. }
  212. if (base != this.ranges.length) {
  213. int[] result = new int[base];
  214. System.arraycopy(this.ranges, 0, result, 0, base);
  215. this.ranges = result;
  216. }
  217. this.setCompacted();
  218. }
  219. protected void mergeRanges(Token token) {
  220. RangeToken tok = (RangeToken)token;
  221. this.sortRanges();
  222. tok.sortRanges();
  223. if (tok.ranges == null)
  224. return;
  225. this.icaseCache = null;
  226. this.setSorted(true);
  227. if (this.ranges == null) {
  228. this.ranges = new int[tok.ranges.length];
  229. System.arraycopy(tok.ranges, 0, this.ranges, 0, tok.ranges.length);
  230. return;
  231. }
  232. int[] result = new int[this.ranges.length+tok.ranges.length];
  233. for (int i = 0, j = 0, k = 0; i < this.ranges.length || j < tok.ranges.length;) {
  234. if (i >= this.ranges.length) {
  235. result[k++] = tok.ranges[j++];
  236. result[k++] = tok.ranges[j++];
  237. } else if (j >= tok.ranges.length) {
  238. result[k++] = this.ranges[i++];
  239. result[k++] = this.ranges[i++];
  240. } else if (tok.ranges[j] < this.ranges[i]
  241. || tok.ranges[j] == this.ranges[i] && tok.ranges[j+1] < this.ranges[i+1]) {
  242. result[k++] = tok.ranges[j++];
  243. result[k++] = tok.ranges[j++];
  244. } else {
  245. result[k++] = this.ranges[i++];
  246. result[k++] = this.ranges[i++];
  247. }
  248. }
  249. this.ranges = result;
  250. }
  251. protected void subtractRanges(Token token) {
  252. if (token.type == NRANGE) {
  253. this.intersectRanges(token);
  254. return;
  255. }
  256. RangeToken tok = (RangeToken)token;
  257. if (tok.ranges == null || this.ranges == null)
  258. return;
  259. this.icaseCache = null;
  260. this.sortRanges();
  261. this.compactRanges();
  262. tok.sortRanges();
  263. tok.compactRanges();
  264. //System.err.println("Token#substractRanges(): Entry: "+this.ranges.length+", "+tok.ranges.length);
  265. int[] result = new int[this.ranges.length+tok.ranges.length];
  266. int wp = 0, src = 0, sub = 0;
  267. while (src < this.ranges.length && sub < tok.ranges.length) {
  268. int srcbegin = this.ranges[src];
  269. int srcend = this.ranges[src+1];
  270. int subbegin = tok.ranges[sub];
  271. int subend = tok.ranges[sub+1];
  272. if (srcend < subbegin) { // Not overlapped
  273. // src: o-----o
  274. // sub: o-----o
  275. // res: o-----o
  276. // Reuse sub
  277. result[wp++] = this.ranges[src++];
  278. result[wp++] = this.ranges[src++];
  279. } else if (srcend >= subbegin
  280. && srcbegin <= subend) { // Overlapped
  281. // src: o--------o
  282. // sub: o----o
  283. // sub: o----o
  284. // sub: o----o
  285. // sub: o------------o
  286. if (subbegin <= srcbegin && srcend <= subend) {
  287. // src: o--------o
  288. // sub: o------------o
  289. // res: empty
  290. // Reuse sub
  291. src += 2;
  292. } else if (subbegin <= srcbegin) {
  293. // src: o--------o
  294. // sub: o----o
  295. // res: o-----o
  296. // Reuse src(=res)
  297. this.ranges[src] = subend+1;
  298. sub += 2;
  299. } else if (srcend <= subend) {
  300. // src: o--------o
  301. // sub: o----o
  302. // res: o-----o
  303. // Reuse sub
  304. result[wp++] = srcbegin;
  305. result[wp++] = subbegin-1;
  306. src += 2;
  307. } else {
  308. // src: o--------o
  309. // sub: o----o
  310. // res: o-o o-o
  311. // Reuse src(=right res)
  312. result[wp++] = srcbegin;
  313. result[wp++] = subbegin-1;
  314. this.ranges[src] = subend+1;
  315. sub += 2;
  316. }
  317. } else if (subend < srcbegin) {
  318. // Not overlapped
  319. // src: o-----o
  320. // sub: o----o
  321. sub += 2;
  322. } else {
  323. throw new RuntimeException("Token#subtractRanges(): Internal Error: ["+this.ranges[src]
  324. +","+this.ranges[src+1]
  325. +"] - ["+tok.ranges[sub]
  326. +","+tok.ranges[sub+1]
  327. +"]");
  328. }
  329. }
  330. while (src < this.ranges.length) {
  331. result[wp++] = this.ranges[src++];
  332. result[wp++] = this.ranges[src++];
  333. }
  334. this.ranges = new int[wp];
  335. System.arraycopy(result, 0, this.ranges, 0, wp);
  336. // this.ranges is sorted and compacted.
  337. }
  338. /**
  339. * @param tok Ignore whether it is NRANGE or not.
  340. */
  341. protected void intersectRanges(Token token) {
  342. RangeToken tok = (RangeToken)token;
  343. if (tok.ranges == null || this.ranges == null)
  344. return;
  345. this.icaseCache = null;
  346. this.sortRanges();
  347. this.compactRanges();
  348. tok.sortRanges();
  349. tok.compactRanges();
  350. int[] result = new int[this.ranges.length+tok.ranges.length];
  351. int wp = 0, src1 = 0, src2 = 0;
  352. while (src1 < this.ranges.length && src2 < tok.ranges.length) {
  353. int src1begin = this.ranges[src1];
  354. int src1end = this.ranges[src1+1];
  355. int src2begin = tok.ranges[src2];
  356. int src2end = tok.ranges[src2+1];
  357. if (src1end < src2begin) { // Not overlapped
  358. // src1: o-----o
  359. // src2: o-----o
  360. // res: empty
  361. // Reuse src2
  362. src1 += 2;
  363. } else if (src1end >= src2begin
  364. && src1begin <= src2end) { // Overlapped
  365. // src1: o--------o
  366. // src2: o----o
  367. // src2: o----o
  368. // src2: o----o
  369. // src2: o------------o
  370. if (src2begin <= src2begin && src1end <= src2end) {
  371. // src1: o--------o
  372. // src2: o------------o
  373. // res: o--------o
  374. // Reuse src2
  375. result[wp++] = src1begin;
  376. result[wp++] = src1end;
  377. src1 += 2;
  378. } else if (src2begin <= src1begin) {
  379. // src1: o--------o
  380. // src2: o----o
  381. // res: o--o
  382. // Reuse the rest of src1
  383. result[wp++] = src1begin;
  384. result[wp++] = src2end;
  385. this.ranges[src1] = src2end+1;
  386. src2 += 2;
  387. } else if (src1end <= src2end) {
  388. // src1: o--------o
  389. // src2: o----o
  390. // res: o--o
  391. // Reuse src2
  392. result[wp++] = src2begin;
  393. result[wp++] = src1end;
  394. src1 += 2;
  395. } else {
  396. // src1: o--------o
  397. // src2: o----o
  398. // res: o----o
  399. // Reuse the rest of src1
  400. result[wp++] = src2begin;
  401. result[wp++] = src2end;
  402. this.ranges[src1] = src2end+1;
  403. }
  404. } else if (src2end < src1begin) {
  405. // Not overlapped
  406. // src1: o-----o
  407. // src2: o----o
  408. src2 += 2;
  409. } else {
  410. throw new RuntimeException("Token#intersectRanges(): Internal Error: ["
  411. +this.ranges[src1]
  412. +","+this.ranges[src1+1]
  413. +"] & ["+tok.ranges[src2]
  414. +","+tok.ranges[src2+1]
  415. +"]");
  416. }
  417. }
  418. while (src1 < this.ranges.length) {
  419. result[wp++] = this.ranges[src1++];
  420. result[wp++] = this.ranges[src1++];
  421. }
  422. this.ranges = new int[wp];
  423. System.arraycopy(result, 0, this.ranges, 0, wp);
  424. // this.ranges is sorted and compacted.
  425. }
  426. /**
  427. * for RANGE: Creates complement.
  428. * for NRANGE: Creates the same meaning RANGE.
  429. */
  430. static Token complementRanges(Token token) {
  431. if (token.type != RANGE && token.type != NRANGE)
  432. throw new IllegalArgumentException("Token#complementRanges(): must be RANGE: "+token.type);
  433. RangeToken tok = (RangeToken)token;
  434. tok.sortRanges();
  435. tok.compactRanges();
  436. int len = tok.ranges.length+2;
  437. if (tok.ranges[0] == 0)
  438. len -= 2;
  439. int last = tok.ranges[tok.ranges.length-1];
  440. if (last == UTF16_MAX)
  441. len -= 2;
  442. RangeToken ret = Token.createRange();
  443. ret.ranges = new int[len];
  444. int wp = 0;
  445. if (tok.ranges[0] > 0) {
  446. ret.ranges[wp++] = 0;
  447. ret.ranges[wp++] = tok.ranges[0]-1;
  448. }
  449. for (int i = 1; i < tok.ranges.length-2; i += 2) {
  450. ret.ranges[wp++] = tok.ranges[i]+1;
  451. ret.ranges[wp++] = tok.ranges[i+1]-1;
  452. }
  453. if (last != UTF16_MAX) {
  454. ret.ranges[wp++] = last+1;
  455. ret.ranges[wp] = UTF16_MAX;
  456. }
  457. ret.setCompacted();
  458. return ret;
  459. }
  460. synchronized RangeToken getCaseInsensitiveToken() {
  461. if (this.icaseCache != null)
  462. return this.icaseCache;
  463. RangeToken uppers = this.type == Token.RANGE ? Token.createRange() : Token.createNRange();
  464. for (int i = 0; i < this.ranges.length; i += 2) {
  465. for (int ch = this.ranges[i]; ch <= this.ranges[i+1]; ch ++) {
  466. if (ch > 0xffff)
  467. uppers.addRange(ch, ch);
  468. else {
  469. char uch = Character.toUpperCase((char)ch);
  470. uppers.addRange(uch, uch);
  471. }
  472. }
  473. }
  474. RangeToken lowers = this.type == Token.RANGE ? Token.createRange() : Token.createNRange();
  475. for (int i = 0; i < uppers.ranges.length; i += 2) {
  476. for (int ch = uppers.ranges[i]; ch <= uppers.ranges[i+1]; ch ++) {
  477. if (ch > 0xffff)
  478. lowers.addRange(ch, ch);
  479. else {
  480. char uch = Character.toUpperCase((char)ch);
  481. lowers.addRange(uch, uch);
  482. }
  483. }
  484. }
  485. lowers.mergeRanges(uppers);
  486. lowers.mergeRanges(this);
  487. lowers.compactRanges();
  488. this.icaseCache = lowers;
  489. return lowers;
  490. }
  491. void dumpRanges() {
  492. System.err.print("RANGE: ");
  493. if (this.ranges == null)
  494. System.err.println(" NULL");
  495. for (int i = 0; i < this.ranges.length; i += 2) {
  496. System.err.print("["+this.ranges[i]+","+this.ranges[i+1]+"] ");
  497. }
  498. System.err.println("");
  499. }
  500. boolean match(int ch) {
  501. if (this.map == null) this.createMap();
  502. boolean ret;
  503. if (this.type == RANGE) {
  504. if (ch < MAPSIZE)
  505. return (this.map[ch32] & (1<<(ch&0x1f))) != 0;
  506. ret = false;
  507. for (int i = this.nonMapIndex; i < this.ranges.length; i += 2) {
  508. if (this.ranges[i] <= ch && ch <= this.ranges[i+1])
  509. return true;
  510. }
  511. } else {
  512. if (ch < MAPSIZE)
  513. return (this.map[ch32] & (1<<(ch&0x1f))) == 0;
  514. ret = true;
  515. for (int i = this.nonMapIndex; i < this.ranges.length; i += 2) {
  516. if (this.ranges[i] <= ch && ch <= this.ranges[i+1])
  517. return false;
  518. }
  519. }
  520. return ret;
  521. }
  522. private static final int MAPSIZE = 256;
  523. private void createMap() {
  524. int asize = MAPSIZE32; // 32 is the number of bits in `int'.
  525. this.map = new int[asize];
  526. this.nonMapIndex = this.ranges.length;
  527. for (int i = 0; i < asize; i ++) this.map[i] = 0;
  528. for (int i = 0; i < this.ranges.length; i += 2) {
  529. int s = this.ranges[i];
  530. int e = this.ranges[i+1];
  531. if (s < MAPSIZE) {
  532. for (int j = s; j <= e && j < MAPSIZE; j ++)
  533. this.map[j32] |= 1<<(j&0x1f); // s&0x1f : 0-31
  534. } else {
  535. this.nonMapIndex = i;
  536. break;
  537. }
  538. if (e >= MAPSIZE) {
  539. this.nonMapIndex = i;
  540. break;
  541. }
  542. }
  543. //for (int i = 0; i < asize; i ++) System.err.println("Map: "+Integer.toString(this.map[i], 16));
  544. }
  545. public String toString(int options) {
  546. String ret;
  547. if (this.type == RANGE) {
  548. if (this == Token.token_dot)
  549. ret = ".";
  550. else if (this == Token.token_0to9)
  551. ret = "\\d";
  552. else if (this == Token.token_wordchars)
  553. ret = "\\w";
  554. else if (this == Token.token_spaces)
  555. ret = "\\s";
  556. else {
  557. StringBuffer sb = new StringBuffer();
  558. sb.append("[");
  559. for (int i = 0; i < this.ranges.length; i += 2) {
  560. if ((options & RegularExpression.SPECIAL_COMMA) != 0 && i > 0) sb.append(",");
  561. if (this.ranges[i] == this.ranges[i+1]) {
  562. sb.append(escapeCharInCharClass(this.ranges[i]));
  563. } else {
  564. sb.append(escapeCharInCharClass(this.ranges[i]));
  565. sb.append((char)'-');
  566. sb.append(escapeCharInCharClass(this.ranges[i+1]));
  567. }
  568. }
  569. sb.append("]");
  570. ret = sb.toString();
  571. }
  572. } else {
  573. if (this == Token.token_not_0to9)
  574. ret = "\\D";
  575. else if (this == Token.token_not_wordchars)
  576. ret = "\\W";
  577. else if (this == Token.token_not_spaces)
  578. ret = "\\S";
  579. else {
  580. StringBuffer sb = new StringBuffer();
  581. sb.append("[^");
  582. for (int i = 0; i < this.ranges.length; i += 2) {
  583. if ((options & RegularExpression.SPECIAL_COMMA) != 0 && i > 0) sb.append(",");
  584. if (this.ranges[i] == this.ranges[i+1]) {
  585. sb.append(escapeCharInCharClass(this.ranges[i]));
  586. } else {
  587. sb.append(escapeCharInCharClass(this.ranges[i]));
  588. sb.append('-');
  589. sb.append(escapeCharInCharClass(this.ranges[i+1]));
  590. }
  591. }
  592. sb.append("]");
  593. ret = sb.toString();
  594. }
  595. }
  596. return ret;
  597. }
  598. private static String escapeCharInCharClass(int ch) {
  599. String ret;
  600. switch (ch) {
  601. case '[': case ']': case '-': case '^':
  602. case ',': case '\\':
  603. ret = "\\"+(char)ch;
  604. break;
  605. case '\f': ret = "\\f"; break;
  606. case '\n': ret = "\\n"; break;
  607. case '\r': ret = "\\r"; break;
  608. case '\t': ret = "\\t"; break;
  609. case 0x1b: ret = "\\e"; break;
  610. //case 0x0b: ret = "\\v"; break;
  611. default:
  612. if (ch < 0x20) {
  613. String pre = "0"+Integer.toHexString(ch);
  614. ret = "\\x"+pre.substring(pre.length()-2, pre.length());
  615. } else if (ch >= 0x10000) {
  616. String pre = "0"+Integer.toHexString(ch);
  617. ret = "\\v"+pre.substring(pre.length()-6, pre.length());
  618. } else
  619. ret = ""+(char)ch;
  620. }
  621. return ret;
  622. }
  623. }