1. /*
  2. * The Apache Software License, Version 1.1
  3. *
  4. *
  5. * Copyright (c) 1999-2003 The Apache Software Foundation. All rights
  6. * reserved.
  7. *
  8. * Redistribution and use in source and binary forms, with or without
  9. * modification, are permitted provided that the following conditions
  10. * are met:
  11. *
  12. * 1. Redistributions of source code must retain the above copyright
  13. * notice, this list of conditions and the following disclaimer.
  14. *
  15. * 2. Redistributions in binary form must reproduce the above copyright
  16. * notice, this list of conditions and the following disclaimer in
  17. * the documentation and/or other materials provided with the
  18. * distribution.
  19. *
  20. * 3. The end-user documentation included with the redistribution,
  21. * if any, must include the following acknowledgment:
  22. * "This product includes software developed by the
  23. * Apache Software Foundation (http://www.apache.org/)."
  24. * Alternately, this acknowledgment may appear in the software itself,
  25. * if and wherever such third-party acknowledgments normally appear.
  26. *
  27. * 4. The names "Xerces" and "Apache Software Foundation" must
  28. * not be used to endorse or promote products derived from this
  29. * software without prior written permission. For written
  30. * permission, please contact apache@apache.org.
  31. *
  32. * 5. Products derived from this software may not be called "Apache",
  33. * nor may "Apache" appear in their name, without prior written
  34. * permission of the Apache Software Foundation.
  35. *
  36. * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
  37. * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  38. * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  39. * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
  40. * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  41. * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  42. * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
  43. * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  44. * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  45. * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  46. * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  47. * SUCH DAMAGE.
  48. * ====================================================================
  49. *
  50. * This software consists of voluntary contributions made by many
  51. * individuals on behalf of the Apache Software Foundation and was
  52. * originally based on software copyright (c) 1999, International
  53. * Business Machines, Inc., http://www.apache.org. For more
  54. * information on the Apache Software Foundation, please see
  55. * <http://www.apache.org/>.
  56. */
  57. package com.sun.org.apache.xerces.internal.impl.xpath.regex;
  58. import java.util.Locale;
  59. import java.util.MissingResourceException;
  60. import java.util.ResourceBundle;
  61. import java.util.Vector;
  62. /**
  63. * A Regular Expression Parser.
  64. *
  65. * @version $Id: RegexParser.java,v 1.8 2003/03/25 14:47:06 sandygao Exp $
  66. */
  67. class RegexParser {
  // Token types. next() stores one of these in nexttoken and read() hands it back to the
  // parse*/process* methods, which dispatch on it.
  68. static final int T_CHAR = 0; // ordinary character; next() leaves the character in chardata
  69. static final int T_EOF = 1; // end of pattern; next() sets chardata to -1
  70. static final int T_OR = 2; // '|'
  71. static final int T_STAR = 3; // '*'
  72. static final int T_PLUS = 4; // '+'
  73. static final int T_QUESTION = 5; // '?'
  74. static final int T_LPAREN = 6; // '('
  75. static final int T_RPAREN = 7; // ')'
  76. static final int T_DOT = 8; // '.'
  77. static final int T_LBRACKET = 9; // '['
  78. static final int T_BACKSOLIDUS = 10; // '\' ; chardata holds the character that follows it
  79. static final int T_CARET = 11; // '^'
  80. static final int T_DOLLAR = 12; // '$'
  81. static final int T_LPAREN2 = 13; // '(?:'
  82. static final int T_LOOKAHEAD = 14; // '(?='
  83. static final int T_NEGATIVELOOKAHEAD = 15; // '(?!'
  84. static final int T_LOOKBEHIND = 16; // '(?<='
  85. static final int T_NEGATIVELOOKBEHIND = 17; // '(?<!'
  86. static final int T_INDEPENDENT = 18; // '(?>'
  87. static final int T_SET_OPERATIONS = 19; // '(?['
  88. static final int T_POSIX_CHARCLASS_START = 20; // '[:' in a character class
  89. static final int T_COMMENT = 21; // '(?#' ; next() consumes the comment up to and including ')'
  90. static final int T_MODIFIERS = 22; // '(?' [\-,a-z,A-Z] ; offset is left on the first modifier character
  91. static final int T_CONDITION = 23; // '(?('
  92. static final int T_XMLSCHEMA_CC_SUBTRACTION = 24; // '-[' in a character class
  93. static class ReferencePosition {
  94. int refNumber;
  95. int position;
  96. ReferencePosition(int n, int pos) {
  97. this.refNumber = n;
  98. this.position = pos;
  99. }
  100. }
  // ---- Parser state; parse() resets the per-pattern fields on every call. ----
  // Index into regex of the next character next() will read.
  101. int offset;
  // Pattern being parsed (after extended-comment stripping when EXTENDED_COMMENT is set).
  102. String regex;
  // Cached regex.length().
  103. int regexlen;
  // Option flags passed to parse(); tested through isSet().
  104. int options;
  // Message bundle used by ex() to build ParseException texts.
  105. ResourceBundle resources;
  // Character data of the current token; -1 at end of pattern.
  106. int chardata;
  // Type of the current token (one of the T_* constants); returned by read().
  107. int nexttoken;
  // Lexing contexts for next(). S_INBRACKETS is active while a character class is parsed.
  108. static protected final int S_NORMAL = 0;
  109. static protected final int S_INBRACKETS = 1;
  // NOTE(review): S_INXBRACKETS is not used in the visible part of this file - confirm its users.
  110. static protected final int S_INXBRACKETS = 2;
  111. int context = S_NORMAL;
  // Number that will be given to the next capturing group; starts at 1.
  112. int parennumber = 1;
  // Set once a back-reference or a numbered condition has been seen.
  113. boolean hasBackReferences;
  // Lazily created list of ReferencePosition entries, validated at the end of parse().
  114. Vector references = null;
  /** Creates a parser whose error messages use the default locale. */
  public RegexParser() {
    this(Locale.getDefault());
  }

  /**
   * Creates a parser whose error messages use the given locale.
   *
   * @param locale locale passed on to {@link #setLocale(Locale)}
   */
  public RegexParser(Locale locale) {
    setLocale(locale);
  }
  121. public void setLocale(Locale locale) {
  122. try {
  123. this.resources = ResourceBundle.getBundle("com.sun.org.apache.xerces.internal.impl.xpath.regex.message", locale);
  124. } catch (MissingResourceException mre) {
  125. throw new RuntimeException("Installation Problem??? Couldn't load messages: "
  126. +mre.getMessage());
  127. }
  128. }
  129. final ParseException ex(String key, int loc) {
  130. return new ParseException(this.resources.getString(key), loc);
  131. }
  132. private final boolean isSet(int flag) {
  133. return (this.options & flag) == flag;
  134. }
  135. synchronized Token parse(String regex, int options) throws ParseException {
  136. this.options = options;
  137. this.offset = 0;
  138. this.setContext(S_NORMAL);
  139. this.parennumber = 1;
  140. this.hasBackReferences = false;
  141. this.regex = regex;
  142. if (this.isSet(RegularExpression.EXTENDED_COMMENT))
  143. this.regex = REUtil.stripExtendedComment(this.regex);
  144. this.regexlen = this.regex.length();
  145. this.next();
  146. Token ret = this.parseRegex();
  147. if (this.offset != this.regexlen)
  148. throw ex("parser.parse.1", this.offset);
  149. if (this.references != null) {
  150. for (int i = 0; i < this.references.size(); i ++) {
  151. ReferencePosition position = (ReferencePosition)this.references.elementAt(i);
  152. if (this.parennumber <= position.refNumber)
  153. throw ex("parser.parse.2", position.position);
  154. }
  155. this.references.removeAllElements();
  156. }
  157. return ret;
  158. }
  159. /*
  160. public RegularExpression createRegex(String regex, int options) throws ParseException {
  161. Token tok = this.parse(regex, options);
  162. return new RegularExpression(regex, tok, this.parennumber, this.hasBackReferences, options);
  163. }
  164. */
  165. protected final void setContext(int con) {
  166. this.context = con;
  167. }
  168. final int read() {
  169. return this.nexttoken;
  170. }
  // Reads the next token starting at this.offset. The token type is stored in nexttoken and
  // the associated character in chardata; this.offset is advanced past everything consumed.
  // Malformed input is reported by throwing the exception built by ex().
  171. final void next() {
  // End of pattern: report T_EOF with chardata == -1.
  172. if (this.offset >= this.regexlen) {
  173. this.chardata = -1;
  174. this.nexttoken = T_EOF;
  175. return;
  176. }
  177. int ret;
  178. int ch = this.regex.charAt(this.offset++);
  179. this.chardata = ch;
  180. if (this.context == S_INBRACKETS) {
  181. // In a character class, this.chardata has one character, that is to say,
  182. // a pair of surrogates is composed and stored to this.chardata.
  183. switch (ch) {
  184. case '\\':
  // chardata becomes the escaped character; a trailing backslash is an error.
  185. ret = T_BACKSOLIDUS;
  186. if (this.offset >= this.regexlen)
  187. throw ex("parser.next.1", this.offset-1);
  188. this.chardata = this.regex.charAt(this.offset++);
  189. break;
  190. case '-':
  // "-[" is a subtraction operator only in XML Schema mode; the '[' is consumed too.
  191. if (this.isSet(RegularExpression.XMLSCHEMA_MODE)
  192. && this.offset < this.regexlen && this.regex.charAt(this.offset) == '[') {
  193. this.offset++;
  194. ret = T_XMLSCHEMA_CC_SUBTRACTION;
  195. } else
  196. ret = T_CHAR;
  197. break;
  198. case '[':
  // "[:" starts a POSIX class name, but only outside XML Schema mode.
  199. if (!this.isSet(RegularExpression.XMLSCHEMA_MODE)
  200. && this.offset < this.regexlen && this.regex.charAt(this.offset) == ':') {
  201. this.offset++;
  202. ret = T_POSIX_CHARCLASS_START;
  203. break;
  204. } // Otherwise falls through to default
  205. default:
  // Combine a high/low surrogate pair into a single code point in chardata.
  206. if (REUtil.isHighSurrogate(ch) && this.offset < this.regexlen) {
  207. int low = this.regex.charAt(this.offset);
  208. if (REUtil.isLowSurrogate(low)) {
  209. this.chardata = REUtil.composeFromSurrogates(ch, low);
  210. this.offset ++;
  211. }
  212. }
  213. ret = T_CHAR;
  214. }
  215. this.nexttoken = ret;
  216. return;
  217. }
  // Normal context (outside a character class).
  218. switch (ch) {
  219. case '|': ret = T_OR; break;
  220. case '*': ret = T_STAR; break;
  221. case '+': ret = T_PLUS; break;
  222. case '?': ret = T_QUESTION; break;
  223. case ')': ret = T_RPAREN; break;
  224. case '.': ret = T_DOT; break;
  225. case '[': ret = T_LBRACKET; break;
  226. case '^': ret = T_CARET; break;
  227. case '$': ret = T_DOLLAR; break;
  228. case '(':
  // A plain '(' unless it is followed by '?', in which case the next character(s) pick
  // the group kind.
  229. ret = T_LPAREN;
  230. if (this.offset >= this.regexlen)
  231. break;
  232. if (this.regex.charAt(this.offset) != '?')
  233. break;
  234. if (++this.offset >= this.regexlen)
  235. throw ex("parser.next.2", this.offset-1);
  236. ch = this.regex.charAt(this.offset++);
  237. switch (ch) {
  238. case ':': ret = T_LPAREN2; break;
  239. case '=': ret = T_LOOKAHEAD; break;
  240. case '!': ret = T_NEGATIVELOOKAHEAD; break;
  241. case '[': ret = T_SET_OPERATIONS; break;
  242. case '>': ret = T_INDEPENDENT; break;
  243. case '<':
  // "(?<" must continue with '=' or '!' (look-behind).
  244. if (this.offset >= this.regexlen)
  245. throw ex("parser.next.2", this.offset-3);
  246. ch = this.regex.charAt(this.offset++);
  247. if (ch == '=') {
  248. ret = T_LOOKBEHIND;
  249. } else if (ch == '!') {
  250. ret = T_NEGATIVELOOKBEHIND;
  251. } else
  252. throw ex("parser.next.3", this.offset-3);
  253. break;
  254. case '#':
  // "(?#...)": skip to the closing ')'. If the pattern ends first, ch is not ')' and the
  // comment is reported as unterminated.
  255. while (this.offset < this.regexlen) {
  256. ch = this.regex.charAt(this.offset++);
  257. if (ch == ')') break;
  258. }
  259. if (ch != ')')
  260. throw ex("parser.next.4", this.offset-1);
  261. ret = T_COMMENT;
  262. break;
  263. default:
  264. if (ch == '-' || 'a' <= ch && ch <= 'z' || 'A' <= ch && ch <= 'Z') {// Options
  // Step back so that this.offset points at the first modifier character again.
  265. this.offset --;
  266. ret = T_MODIFIERS;
  267. break;
  268. } else if (ch == '(') { // conditional
  269. ret = T_CONDITION; // this.offset points just past the second '('.
  270. break;
  271. }
  272. throw ex("parser.next.2", this.offset-2);
  273. }
  274. break;
  275. case '\\':
  // chardata becomes the escaped character; a trailing backslash is an error.
  276. ret = T_BACKSOLIDUS;
  277. if (this.offset >= this.regexlen)
  278. throw ex("parser.next.1", this.offset-1);
  279. this.chardata = this.regex.charAt(this.offset++);
  280. break;
  281. default:
  282. ret = T_CHAR;
  283. }
  284. this.nexttoken = ret;
  285. }
  286. /**
  287. * regex ::= term (`|` term)*
  288. * term ::= factor+
  289. * factor ::= ('^' | '$' | '\A' | '\Z' | '\z' | '\b' | '\B' | '\<' | '\>'
  290. * | atom (('*' | '+' | '?' | minmax ) '?'? )?)
  291. * | '(?=' regex ')' | '(?!' regex ')' | '(?<=' regex ')' | '(?<!' regex ')'
  292. * atom ::= char | '.' | range | '(' regex ')' | '(?:' regex ')' | '\' [0-9]
  293. * | '\w' | '\W' | '\d' | '\D' | '\s' | '\S' | category-block
  294. */
  295. Token parseRegex() throws ParseException {
  296. Token tok = this.parseTerm();
  297. Token parent = null;
  298. while (this.read() == T_OR) {
  299. this.next(); // '|'
  300. if (parent == null) {
  301. parent = Token.createUnion();
  302. parent.addChild(tok);
  303. tok = parent;
  304. }
  305. tok.addChild(this.parseTerm());
  306. }
  307. return tok;
  308. }
  309. /**
  310. * term ::= factor+
  311. */
  312. Token parseTerm() throws ParseException {
  313. int ch = this.read();
  314. if (ch == T_OR || ch == T_RPAREN || ch == T_EOF) {
  315. return Token.createEmpty();
  316. } else {
  317. Token tok = this.parseFactor();
  318. Token concat = null;
  319. while ((ch = this.read()) != T_OR && ch != T_RPAREN && ch != T_EOF) {
  320. if (concat == null) {
  321. concat = Token.createConcat();
  322. concat.addChild(tok);
  323. tok = concat;
  324. }
  325. concat.addChild(this.parseFactor());
  326. //tok = Token.createConcat(tok, this.parseFactor());
  327. }
  328. return tok;
  329. }
  330. }
  331. // ----------------------------------------------------------------
  332. Token processCaret() throws ParseException {
  333. this.next();
  334. return Token.token_linebeginning;
  335. }
  336. Token processDollar() throws ParseException {
  337. this.next();
  338. return Token.token_lineend;
  339. }
  340. Token processLookahead() throws ParseException {
  341. this.next();
  342. Token tok = Token.createLook(Token.LOOKAHEAD, this.parseRegex());
  343. if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1);
  344. this.next(); // ')'
  345. return tok;
  346. }
  347. Token processNegativelookahead() throws ParseException {
  348. this.next();
  349. Token tok = Token.createLook(Token.NEGATIVELOOKAHEAD, this.parseRegex());
  350. if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1);
  351. this.next(); // ')'
  352. return tok;
  353. }
  354. Token processLookbehind() throws ParseException {
  355. this.next();
  356. Token tok = Token.createLook(Token.LOOKBEHIND, this.parseRegex());
  357. if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1);
  358. this.next(); // ')'
  359. return tok;
  360. }
  361. Token processNegativelookbehind() throws ParseException {
  362. this.next();
  363. Token tok = Token.createLook(Token.NEGATIVELOOKBEHIND, this.parseRegex());
  364. if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1);
  365. this.next(); // ')'
  366. return tok;
  367. }
  368. Token processBacksolidus_A() throws ParseException {
  369. this.next();
  370. return Token.token_stringbeginning;
  371. }
  372. Token processBacksolidus_Z() throws ParseException {
  373. this.next();
  374. return Token.token_stringend2;
  375. }
  376. Token processBacksolidus_z() throws ParseException {
  377. this.next();
  378. return Token.token_stringend;
  379. }
  380. Token processBacksolidus_b() throws ParseException {
  381. this.next();
  382. return Token.token_wordedge;
  383. }
  384. Token processBacksolidus_B() throws ParseException {
  385. this.next();
  386. return Token.token_not_wordedge;
  387. }
  388. Token processBacksolidus_lt() throws ParseException {
  389. this.next();
  390. return Token.token_wordbeginning;
  391. }
  392. Token processBacksolidus_gt() throws ParseException {
  393. this.next();
  394. return Token.token_wordend;
  395. }
  396. Token processStar(Token tok) throws ParseException {
  397. this.next();
  398. if (this.read() == T_QUESTION) {
  399. this.next();
  400. return Token.createNGClosure(tok);
  401. } else
  402. return Token.createClosure(tok);
  403. }
  404. Token processPlus(Token tok) throws ParseException {
  405. // X+ -> XX*
  406. this.next();
  407. if (this.read() == T_QUESTION) {
  408. this.next();
  409. return Token.createConcat(tok, Token.createNGClosure(tok));
  410. } else
  411. return Token.createConcat(tok, Token.createClosure(tok));
  412. }
  413. Token processQuestion(Token tok) throws ParseException {
  414. // X? -> X|
  415. this.next();
  416. Token par = Token.createUnion();
  417. if (this.read() == T_QUESTION) {
  418. this.next();
  419. par.addChild(Token.createEmpty());
  420. par.addChild(tok);
  421. } else {
  422. par.addChild(tok);
  423. par.addChild(Token.createEmpty());
  424. }
  425. return par;
  426. }
  427. boolean checkQuestion(int off) {
  428. return off < this.regexlen && this.regex.charAt(off) == '?';
  429. }
  430. Token processParen() throws ParseException {
  431. this.next();
  432. int p = this.parennumber++;
  433. Token tok = Token.createParen(this.parseRegex(), p);
  434. if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1);
  435. this.next(); // Skips ')'
  436. return tok;
  437. }
  438. Token processParen2() throws ParseException {
  439. this.next();
  440. Token tok = Token.createParen(this.parseRegex(), 0);
  441. if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1);
  442. this.next(); // Skips ')'
  443. return tok;
  444. }
  445. Token processCondition() throws ParseException {
  446. // this.offset points the next of '('
  447. if (this.offset+1 >= this.regexlen) throw ex("parser.factor.4", this.offset);
  448. // Parses a condition.
  449. int refno = -1;
  450. Token condition = null;
  451. int ch = this.regex.charAt(this.offset);
  452. if ('1' <= ch && ch <= '9') {
  453. refno = ch-'0';
  454. this.hasBackReferences = true;
  455. if (this.references == null) this.references = new Vector();
  456. this.references.addElement(new ReferencePosition(refno, this.offset));
  457. this.offset ++;
  458. if (this.regex.charAt(this.offset) != ')') throw ex("parser.factor.1", this.offset);
  459. this.offset ++;
  460. } else {
  461. if (ch == '?') this.offset --; // Points '('.
  462. this.next();
  463. condition = this.parseFactor();
  464. switch (condition.type) {
  465. case Token.LOOKAHEAD:
  466. case Token.NEGATIVELOOKAHEAD:
  467. case Token.LOOKBEHIND:
  468. case Token.NEGATIVELOOKBEHIND:
  469. break;
  470. case Token.ANCHOR:
  471. if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1);
  472. break;
  473. default:
  474. throw ex("parser.factor.5", this.offset);
  475. }
  476. }
  477. // Parses yes/no-patterns.
  478. this.next();
  479. Token yesPattern = this.parseRegex();
  480. Token noPattern = null;
  481. if (yesPattern.type == Token.UNION) {
  482. if (yesPattern.size() != 2) throw ex("parser.factor.6", this.offset);
  483. noPattern = yesPattern.getChild(1);
  484. yesPattern = yesPattern.getChild(0);
  485. }
  486. if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1);
  487. this.next();
  488. return Token.createCondition(refno, condition, yesPattern, noPattern);
  489. }
  490. Token processModifiers() throws ParseException {
  491. // this.offset points the next of '?'.
  492. // modifiers ::= [imsw]* ('-' [imsw]*)? ':'
  493. int add = 0, mask = 0, ch = -1;
  494. while (this.offset < this.regexlen) {
  495. ch = this.regex.charAt(this.offset);
  496. int v = REUtil.getOptionValue(ch);
  497. if (v == 0) break; // '-' or ':'?
  498. add |= v;
  499. this.offset ++;
  500. }
  501. if (this.offset >= this.regexlen) throw ex("parser.factor.2", this.offset-1);
  502. if (ch == '-') {
  503. this.offset ++;
  504. while (this.offset < this.regexlen) {
  505. ch = this.regex.charAt(this.offset);
  506. int v = REUtil.getOptionValue(ch);
  507. if (v == 0) break; // ':'?
  508. mask |= v;
  509. this.offset ++;
  510. }
  511. if (this.offset >= this.regexlen) throw ex("parser.factor.2", this.offset-1);
  512. }
  513. Token tok;
  514. if (ch == ':') {
  515. this.offset ++;
  516. this.next();
  517. tok = Token.createModifierGroup(this.parseRegex(), add, mask);
  518. if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1);
  519. this.next();
  520. } else if (ch == ')') { // such as (?-i)
  521. this.offset ++;
  522. this.next();
  523. tok = Token.createModifierGroup(this.parseRegex(), add, mask);
  524. } else
  525. throw ex("parser.factor.3", this.offset);
  526. return tok;
  527. }
  528. Token processIndependent() throws ParseException {
  529. this.next();
  530. Token tok = Token.createLook(Token.INDEPENDENT, this.parseRegex());
  531. if (this.read() != T_RPAREN) throw ex("parser.factor.1", this.offset-1);
  532. this.next(); // Skips ')'
  533. return tok;
  534. }
  535. Token processBacksolidus_c() throws ParseException {
  536. int ch2; // Must be in 0x0040-0x005f
  537. if (this.offset >= this.regexlen
  538. || ((ch2 = this.regex.charAt(this.offset++)) & 0xffe0) != 0x0040)
  539. throw ex("parser.atom.1", this.offset-1);
  540. this.next();
  541. return Token.createChar(ch2-0x40);
  542. }
  543. Token processBacksolidus_C() throws ParseException {
  544. throw ex("parser.process.1", this.offset);
  545. }
  546. Token processBacksolidus_i() throws ParseException {
  547. Token tok = Token.createChar('i');
  548. this.next();
  549. return tok;
  550. }
  551. Token processBacksolidus_I() throws ParseException {
  552. throw ex("parser.process.1", this.offset);
  553. }
  554. Token processBacksolidus_g() throws ParseException {
  555. this.next();
  556. return Token.getGraphemePattern();
  557. }
  558. Token processBacksolidus_X() throws ParseException {
  559. this.next();
  560. return Token.getCombiningCharacterSequence();
  561. }
  562. Token processBackreference() throws ParseException {
  563. int refnum = this.chardata-'0';
  564. Token tok = Token.createBackReference(refnum);
  565. this.hasBackReferences = true;
  566. if (this.references == null) this.references = new Vector();
  567. this.references.addElement(new ReferencePosition(refnum, this.offset-2));
  568. this.next();
  569. return tok;
  570. }
  571. // ----------------------------------------------------------------
  572. /**
  573. * factor ::= ('^' | '$' | '\A' | '\Z' | '\z' | '\b' | '\B' | '\<' | '\>'
  574. * | atom (('*' | '+' | '?' | minmax ) '?'? )?)
  575. * | '(?=' regex ')' | '(?!' regex ')' | '(?<=' regex ')' | '(?<!' regex ')'
  576. * | '(?#' [^)]* ')'
  577. * minmax ::= '{' min (',' max?)? '}'
  578. * min ::= [0-9]+
  579. * max ::= [0-9]+
  580. */
  581. Token parseFactor() throws ParseException {
  582. int ch = this.read();
  583. Token tok;
  584. switch (ch) {
  585. case T_CARET: return this.processCaret();
  586. case T_DOLLAR: return this.processDollar();
  587. case T_LOOKAHEAD: return this.processLookahead();
  588. case T_NEGATIVELOOKAHEAD: return this.processNegativelookahead();
  589. case T_LOOKBEHIND: return this.processLookbehind();
  590. case T_NEGATIVELOOKBEHIND: return this.processNegativelookbehind();
  591. case T_COMMENT:
  592. this.next();
  593. return Token.createEmpty();
  594. case T_BACKSOLIDUS:
  595. switch (this.chardata) {
  596. case 'A': return this.processBacksolidus_A();
  597. case 'Z': return this.processBacksolidus_Z();
  598. case 'z': return this.processBacksolidus_z();
  599. case 'b': return this.processBacksolidus_b();
  600. case 'B': return this.processBacksolidus_B();
  601. case '<': return this.processBacksolidus_lt();
  602. case '>': return this.processBacksolidus_gt();
  603. }
  604. // through down
  605. }
  606. tok = this.parseAtom();
  607. ch = this.read();
  608. switch (ch) {
  609. case T_STAR: return this.processStar(tok);
  610. case T_PLUS: return this.processPlus(tok);
  611. case T_QUESTION: return this.processQuestion(tok);
  612. case T_CHAR:
  613. if (this.chardata == '{' && this.offset < this.regexlen) {
  614. int off = this.offset; // this.offset -> next of '{'
  615. int min = 0, max = -1;
  616. if ((ch = this.regex.charAt(off++)) >= '0' && ch <= '9') {
  617. min = ch -'0';
  618. while (off < this.regexlen
  619. && (ch = this.regex.charAt(off++)) >= '0' && ch <= '9') {
  620. min = min*10 +ch-'0';
  621. if (min < 0)
  622. throw ex("parser.quantifier.5", this.offset);
  623. }
  624. }
  625. else {
  626. throw ex("parser.quantifier.1", this.offset);
  627. }
  628. max = min;
  629. if (ch == ',') {
  630. if (off >= this.regexlen) {
  631. throw ex("parser.quantifier.3", this.offset);
  632. }
  633. else if ((ch = this.regex.charAt(off++)) >= '0' && ch <= '9') {
  634. max = ch -'0'; // {min,max}
  635. while (off < this.regexlen
  636. && (ch = this.regex.charAt(off++)) >= '0'
  637. && ch <= '9') {
  638. max = max*10 +ch-'0';
  639. if (max < 0)
  640. throw ex("parser.quantifier.5", this.offset);
  641. }
  642. if (min > max)
  643. throw ex("parser.quantifier.4", this.offset);
  644. }
  645. else { // assume {min,}
  646. max = -1;
  647. }
  648. }
  649. if (ch != '}')
  650. throw ex("parser.quantifier.2", this.offset);
  651. if (this.checkQuestion(off)) { // off -> next of '}'
  652. tok = Token.createNGClosure(tok);
  653. this.offset = off+1;
  654. } else {
  655. tok = Token.createClosure(tok);
  656. this.offset = off;
  657. }
  658. tok.setMin(min);
  659. tok.setMax(max);
  660. //System.err.println("CLOSURE: "+min+", "+max);
  661. this.next();
  662. }
  663. }
  664. return tok;
  665. }
  666. /**
  667. * atom ::= char | '.' | char-class | '(' regex ')' | '(?:' regex ')' | '\' [0-9]
  668. * | '\w' | '\W' | '\d' | '\D' | '\s' | '\S' | category-block
  669. * | '(?>' regex ')'
  670. * char ::= '\\' | '\' [efnrt] | bmp-code | character-1
  671. */
  672. Token parseAtom() throws ParseException {
  673. int ch = this.read();
  674. Token tok = null;
  675. switch (ch) {
  676. case T_LPAREN: return this.processParen();
  677. case T_LPAREN2: return this.processParen2(); // '(?:'
  678. case T_CONDITION: return this.processCondition(); // '(?('
  679. case T_MODIFIERS: return this.processModifiers(); // (?modifiers ... )
  680. case T_INDEPENDENT: return this.processIndependent();
  681. case T_DOT:
  682. this.next(); // Skips '.'
  683. tok = Token.token_dot;
  684. break;
  685. /**
  686. * char-class ::= '[' ( '^'? range ','?)+ ']'
  687. * range ::= '\d' | '\w' | '\s' | category-block | range-char
  688. * | range-char '-' range-char
  689. * range-char ::= '\[' | '\]' | '\\' | '\' [,-efnrtv] | bmp-code | character-2
  690. * bmp-char ::= '\' 'u' [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]
  691. */
  692. case T_LBRACKET: return this.parseCharacterClass(true);
  693. case T_SET_OPERATIONS: return this.parseSetOperations();
  694. case T_BACKSOLIDUS:
  695. switch (this.chardata) {
  696. case 'd': case 'D':
  697. case 'w': case 'W':
  698. case 's': case 'S':
  699. tok = this.getTokenForShorthand(this.chardata);
  700. this.next();
  701. return tok;
  702. case 'e': case 'f': case 'n': case 'r':
  703. case 't': case 'u': case 'v': case 'x':
  704. {
  705. int ch2 = this.decodeEscaped();
  706. if (ch2 < 0x10000) {
  707. tok = Token.createChar(ch2);
  708. } else {
  709. tok = Token.createString(REUtil.decomposeToSurrogates(ch2));
  710. }
  711. }
  712. break;
  713. case 'c': return this.processBacksolidus_c();
  714. case 'C': return this.processBacksolidus_C();
  715. case 'i': return this.processBacksolidus_i();
  716. case 'I': return this.processBacksolidus_I();
  717. case 'g': return this.processBacksolidus_g();
  718. case 'X': return this.processBacksolidus_X();
  719. case '1': case '2': case '3': case '4':
  720. case '5': case '6': case '7': case '8': case '9':
  721. return this.processBackreference();
  722. case 'P':
  723. case 'p':
  724. int pstart = this.offset;
  725. tok = processBacksolidus_pP(this.chardata);
  726. if (tok == null) throw this.ex("parser.atom.5", pstart);
  727. break;
  728. default:
  729. tok = Token.createChar(this.chardata);
  730. }
  731. this.next();
  732. break;
  733. case T_CHAR:
  734. if (this.chardata == ']' || this.chardata == '{' || this.chardata == '}')
  735. throw this.ex("parser.atom.4", this.offset-1);
  736. tok = Token.createChar(this.chardata);
  737. int high = this.chardata;
  738. this.next();
  739. if (REUtil.isHighSurrogate(high)
  740. && this.read() == T_CHAR && REUtil.isLowSurrogate(this.chardata)) {
  741. char[] sur = new char[2];
  742. sur[0] = (char)high;
  743. sur[1] = (char)this.chardata;
  744. tok = Token.createParen(Token.createString(new String(sur)), 0);
  745. this.next();
  746. }
  747. break;
  748. default:
  749. throw this.ex("parser.atom.4", this.offset-1);
  750. }
  751. return tok;
  752. }
  753. protected RangeToken processBacksolidus_pP(int c) throws ParseException {
  754. this.next();
  755. if (this.read() != T_CHAR || this.chardata != '{')
  756. throw this.ex("parser.atom.2", this.offset-1);
  757. // handle category escape
  758. boolean positive = c == 'p';
  759. int namestart = this.offset;
  760. int nameend = this.regex.indexOf('}', namestart);
  761. if (nameend < 0)
  762. throw this.ex("parser.atom.3", this.offset);
  763. String pname = this.regex.substring(namestart, nameend);
  764. this.offset = nameend+1;
  765. return Token.getRange(pname, positive, this.isSet(RegularExpression.XMLSCHEMA_MODE));
  766. }
  767. int processCIinCharacterClass(RangeToken tok, int c) {
  768. return this.decodeEscaped();
  769. }
  770. /**
  771. * char-class ::= '[' ( '^'? range ','?)+ ']'
  772. * range ::= '\d' | '\w' | '\s' | category-block | range-char
  773. * | range-char '-' range-char
  774. * range-char ::= '\[' | '\]' | '\\' | '\' [,-efnrtv] | bmp-code | character-2
  775. * bmp-code ::= '\' 'u' [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F] [0-9a-fA-F]
  776. */
  // Parses a character class. The lexer is switched to S_INBRACKETS for the members and back
  // to S_NORMAL before the closing ']' is skipped.
  //   useNrange - when true, a class starting with '^' is returned as a Token.createNRange()
  //               token; when false, the negation is computed here by subtracting the members
  //               from the full range 0..Token.UTF16_MAX (parseSetOperations passes false).
  // Returns the sorted and compacted range token. Throws the ex() error for an unterminated
  // class, an unknown category/POSIX name or a malformed POSIX class.
  777. protected RangeToken parseCharacterClass(boolean useNrange) throws ParseException {
  778. this.setContext(S_INBRACKETS);
  779. this.next(); // '['
  780. boolean nrange = false;
  781. RangeToken base = null;
  782. RangeToken tok;
  783. if (this.read() == T_CHAR && this.chardata == '^') {
  784. nrange = true;
  785. this.next(); // '^'
  786. if (useNrange) {
  787. tok = Token.createNRange();
  788. } else {
  // Collect the members in tok; they are subtracted from base after the loop.
  789. base = Token.createRange();
  790. base.addRange(0, Token.UTF16_MAX);
  791. tok = Token.createRange();
  792. }
  793. } else {
  794. tok = Token.createRange();
  795. }
  796. int type;
  797. boolean firstloop = true;
  798. while ((type = this.read()) != T_EOF) {
  // ']' closes the class, except as the very first member where it is a literal.
  799. if (type == T_CHAR && this.chardata == ']' && !firstloop)
  800. break;
  801. firstloop = false;
  802. int c = this.chardata;
  // end == true: the member was already merged into tok and cannot start an "a-b" range.
  803. boolean end = false;
  804. if (type == T_BACKSOLIDUS) {
  805. switch (c) {
  806. case 'd': case 'D':
  807. case 'w': case 'W':
  808. case 's': case 'S':
  809. tok.mergeRanges(this.getTokenForShorthand(c));
  810. end = true;
  811. break;
  812. case 'i': case 'I':
  813. case 'c': case 'C':
  // A negative result is treated as "already handled"; otherwise c is a plain character.
  814. c = this.processCIinCharacterClass(tok, c);
  815. if (c < 0) end = true;
  816. break;
  817. case 'p':
  818. case 'P':
  819. int pstart = this.offset;
  820. RangeToken tok2 = this.processBacksolidus_pP(c);
  821. if (tok2 == null) throw this.ex("parser.atom.5", pstart);
  822. tok.mergeRanges(tok2);
  823. end = true;
  824. break;
  825. default:
  826. c = this.decodeEscaped();
  827. } // \ + c
  828. } // backsolidus
  829. // POSIX Character class such as [:alnum:]
  830. else if (type == T_POSIX_CHARCLASS_START) {
  // The name runs from this.offset to the next ':'; a leading '^' negates it, and the ':'
  // must be followed directly by ']'.
  831. int nameend = this.regex.indexOf(':', this.offset);
  832. if (nameend < 0) throw this.ex("parser.cc.1", this.offset);
  833. boolean positive = true;
  834. if (this.regex.charAt(this.offset) == '^') {
  835. this.offset ++;
  836. positive = false;
  837. }
  838. String name = this.regex.substring(this.offset, nameend);
  839. RangeToken range = Token.getRange(name, positive,
  840. this.isSet(RegularExpression.XMLSCHEMA_MODE));
  841. if (range == null) throw this.ex("parser.cc.3", this.offset);
  842. tok.mergeRanges(range);
  843. end = true;
  844. if (nameend+1 >= this.regexlen || this.regex.charAt(nameend+1) != ']')
  845. throw this.ex("parser.cc.1", nameend);
  846. this.offset = nameend+2;
  847. }
  848. this.next();
  849. if (!end) { // if not shorthands...
  850. if (this.read() != T_CHAR || this.chardata != '-') { // Here is no '-'.
  851. tok.addRange(c, c);
  852. } else {
  853. this.next(); // Skips '-'
  854. if ((type = this.read()) == T_EOF) throw this.ex("parser.cc.2", this.offset);
  855. if (type == T_CHAR && this.chardata == ']') {
  // "c-]": both c and '-' are literals; the ']' is left for the loop head to see.
  856. tok.addRange(c, c);
  857. tok.addRange('-', '-');
  858. } else {
  859. int rangeend = this.chardata;
  860. if (type == T_BACKSOLIDUS)
  861. rangeend = this.decodeEscaped();
  862. this.next();
  863. tok.addRange(c, rangeend);
  864. }
  865. }
  866. }
  // With SPECIAL_COMMA set, a ',' after a member is skipped as a separator.
  867. if (this.isSet(RegularExpression.SPECIAL_COMMA)
  868. && this.read() == T_CHAR && this.chardata == ',')
  869. this.next();
  870. }
  // The loop ended without meeting the closing ']'.
  871. if (this.read() == T_EOF)
  872. throw this.ex("parser.cc.2", this.offset);
  873. if (!useNrange && nrange) {
  874. base.subtractRanges(tok);
  875. tok = base;
  876. }
  877. tok.sortRanges();
  878. tok.compactRanges();
  879. //tok.dumpRanges();
  880. /*
  881. if (this.isSet(RegularExpression.IGNORE_CASE))
  882. tok = RangeToken.createCaseInsensitiveToken(tok);
  883. */
  // Back to normal lexing before the token after ']' is read.
  884. this.setContext(S_NORMAL);
  885. this.next(); // Skips ']'
  886. return tok;
  887. }
  888. /**
  889. * '(?[' ... ']' (('-' | '+' | '&') '[' ... ']')? ')'
  890. */
  891. protected RangeToken parseSetOperations() throws ParseException {
  892. RangeToken tok = this.parseCharacterClass(false);
  893. int type;
  894. while ((type = this.read()) != T_RPAREN) {
  895. int ch = this.chardata;
  896. if (type == T_CHAR && (ch == '-' || ch == '&')
  897. || type == T_PLUS) {
  898. this.next();
  899. if (this.read() != T_LBRACKET) throw ex("parser.ope.1", this.offset-1);
  900. RangeToken t2 = this.parseCharacterClass(false);
  901. if (type == T_PLUS)
  902. tok.mergeRanges(t2);
  903. else if (ch == '-')
  904. tok.subtractRanges(t2);
  905. else if (ch == '&')
  906. tok.intersectRanges(t2);
  907. else
  908. throw new RuntimeException("ASSERT");
  909. } else {
  910. throw ex("parser.ope.2", this.offset-1);
  911. }
  912. }
  913. this.next();
  914. return tok;
  915. }
  916. Token getTokenForShorthand(int ch) {
  917. Token tok;
  918. switch (ch) {
  919. case 'd':
  920. tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY)
  921. ? Token.getRange("Nd", true) : Token.token_0to9;
  922. break;
  923. case 'D':
  924. tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY)
  925. ? Token.getRange("Nd", false) : Token.token_not_0to9;
  926. break;
  927. case 'w':
  928. tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY)
  929. ? Token.getRange("IsWord", true) : Token.token_wordchars;
  930. break;
  931. case 'W':
  932. tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY)
  933. ? Token.getRange("IsWord", false) : Token.token_not_wordchars;
  934. break;
  935. case 's':
  936. tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY)
  937. ? Token.getRange("IsSpace", true) : Token.token_spaces;
  938. break;
  939. case 'S':
  940. tok = this.isSet(RegularExpression.USE_UNICODE_CATEGORY)
  941. ? Token.getRange("IsSpace", false) : Token.token_not_spaces;
  942. break;
  943. default:
  944. throw new RuntimeException("Internal Error: shorthands: \\u"+Integer.toString(ch, 16));
  945. }
  946. return tok;
  947. }
  948. /**
  949. */
  950. int decodeEscaped() throws ParseException {
  951. if (this.read() != T_BACKSOLIDUS) throw ex("parser.next.1", this.offset-1);
  952. int c = this.chardata;
  953. switch (c) {
  954. case 'e': c = 0x1b; break; // ESCAPE U+001B
  955. case 'f': c = '\f'; break; // FORM FEED U+000C
  956. case 'n': c = '\n'; break; // LINE FEED U+000A
  957. case 'r': c = '\r'; break; // CRRIAGE RETURN U+000D
  958. case 't': c = '\t'; break; // HORIZONTAL TABULATION U+0009
  959. //case 'v': c = 0x0b; break; // VERTICAL TABULATION U+000B
  960. case 'x':
  961. this.next();
  962. if (this.read() != T_CHAR) throw ex("parser.descape.1", this.offset-1);
  963. if (this.chardata == '{') {
  964. int v1 = 0;
  965. int uv = 0;
  966. do {
  967. this.next();
  968. if (this.read() != T_CHAR) throw ex("parser.descape.1", this.offset-1);
  969. if ((v1 = hexChar(this.chardata)) < 0)
  970. break;
  971. if (uv > uv*16) throw ex("parser.descape.2", this.offset-1);
  972. uv = uv*16+v1;
  973. } while (true);
  974. if (this.chardata != '}') throw ex("parser.descape.3", this.offset-1);
  975. if (uv > Token.UTF16_MAX) throw ex("parser.descape.4", this.offset-1);
  976. c = uv;
  977. } else {
  978. int v1 = 0;
  979. if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
  980. throw ex("parser.descape.1", this.offset-1);
  981. int uv = v1;
  982. this.next();
  983. if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
  984. throw ex("parser.descape.1", this.offset-1);
  985. uv = uv*16+v1;
  986. c = uv;
  987. }
  988. break;
  989. case 'u':
  990. int v1 = 0;
  991. this.next();
  992. if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
  993. throw ex("parser.descape.1", this.offset-1);
  994. int uv = v1;
  995. this.next();
  996. if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
  997. throw ex("parser.descape.1", this.offset-1);
  998. uv = uv*16+v1;
  999. this.next();
  1000. if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
  1001. throw ex("parser.descape.1", this.offset-1);
  1002. uv = uv*16+v1;
  1003. this.next();
  1004. if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
  1005. throw ex("parser.descape.1", this.offset-1);
  1006. uv = uv*16+v1;
  1007. c = uv;
  1008. break;
  1009. case 'v':
  1010. this.next();
  1011. if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
  1012. throw ex("parser.descape.1", this.offset-1);
  1013. uv = v1;
  1014. this.next();
  1015. if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
  1016. throw ex("parser.descape.1", this.offset-1);
  1017. uv = uv*16+v1;
  1018. this.next();
  1019. if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
  1020. throw ex("parser.descape.1", this.offset-1);
  1021. uv = uv*16+v1;
  1022. this.next();
  1023. if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
  1024. throw ex("parser.descape.1", this.offset-1);
  1025. uv = uv*16+v1;
  1026. this.next();
  1027. if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
  1028. throw ex("parser.descape.1", this.offset-1);
  1029. uv = uv*16+v1;
  1030. this.next();
  1031. if (this.read() != T_CHAR || (v1 = hexChar(this.chardata)) < 0)
  1032. throw ex("parser.descape.1", this.offset-1);
  1033. uv = uv*16+v1;
  1034. if (uv > Token.UTF16_MAX) throw ex("parser.descappe.4", this.offset-1);
  1035. c = uv;
  1036. break;
  1037. case 'A':
  1038. case 'Z':
  1039. case 'z':
  1040. throw ex("parser.descape.5", this.offset-2);
  1041. default:
  1042. }
  1043. return c;
  1044. }
  1045. static private final int hexChar(int ch) {
  1046. if (ch < '0') return -1;
  1047. if (ch > 'f') return -1;
  1048. if (ch <= '9') return ch-'0';
  1049. if (ch < 'A') return -1;
  1050. if (ch <= 'F') return ch-'A'+10;
  1051. if (ch < 'a') return -1;
  1052. return ch-'a'+10;
  1053. }
  1054. }