1. package com.sun.org.apache.regexp.internal;
  2. /*
  3. * ====================================================================
  4. *
  5. * The Apache Software License, Version 1.1
  6. *
  7. * Copyright (c) 1999 The Apache Software Foundation. All rights
  8. * reserved.
  9. *
  10. * Redistribution and use in source and binary forms, with or without
  11. * modification, are permitted provided that the following conditions
  12. * are met:
  13. *
  14. * 1. Redistributions of source code must retain the above copyright
  15. * notice, this list of conditions and the following disclaimer.
  16. *
  17. * 2. Redistributions in binary form must reproduce the above copyright
  18. * notice, this list of conditions and the following disclaimer in
  19. * the documentation and/or other materials provided with the
  20. * distribution.
  21. *
  22. * 3. The end-user documentation included with the redistribution, if
  23. * any, must include the following acknowlegement:
  24. * "This product includes software developed by the
  25. * Apache Software Foundation (http://www.apache.org/)."
  26. * Alternately, this acknowlegement may appear in the software itself,
  27. * if and wherever such third-party acknowlegements normally appear.
  28. *
  29. * 4. The names "The Jakarta Project", "Jakarta-Regexp", and "Apache Software
  30. * Foundation" must not be used to endorse or promote products derived
  31. * from this software without prior written permission. For written
  32. * permission, please contact apache@apache.org.
  33. *
  34. * 5. Products derived from this software may not be called "Apache"
  35. * nor may "Apache" appear in their names without prior written
  36. * permission of the Apache Group.
  37. *
  38. * THIS SOFTWARE IS PROVIDED ``AS IS'' AND ANY EXPRESSED OR IMPLIED
  39. * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
  40. * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
  41. * DISCLAIMED. IN NO EVENT SHALL THE APACHE SOFTWARE FOUNDATION OR
  42. * ITS CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL,
  43. * SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT
  44. * LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF
  45. * USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND
  46. * ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
  47. * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT
  48. * OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  49. * SUCH DAMAGE.
  50. * ====================================================================
  51. *
  52. * This software consists of voluntary contributions made by many
  53. * individuals on behalf of the Apache Software Foundation. For more
  54. * information on the Apache Software Foundation, please see
  55. * <http://www.apache.org/>.
  56. *
  57. */
  58. import java.util.Vector;
  59. /**
  60. * RE is an efficient, lightweight regular expression evaluator/matcher class.
  61. * Regular expressions are pattern descriptions which enable sophisticated matching of
  62. * strings. In addition to being able to match a string against a pattern, you
  63. * can also extract parts of the match. This is especially useful in text parsing!
  64. * Details on the syntax of regular expression patterns are given below.
  65. *
  66. * <p>
  67. *
  68. * To compile a regular expression (RE), you can simply construct an RE matcher
  69. * object from the string specification of the pattern, like this:
  70. *
  71. * <pre>
  72. *
  73. * RE r = new RE("a*b");
  74. *
  75. * </pre>
  76. *
  77. * <p>
  78. *
  79. * Once you have done this, you can call either of the RE.match methods to
  80. * perform matching on a String. For example:
  81. *
  82. * <pre>
  83. *
  84. * boolean matched = r.match("aaaab");
  85. *
  86. * </pre>
  87. *
  88. * will cause the boolean matched to be set to true because the
  89. * pattern "a*b" matches the string "aaaab".
  90. *
  91. * <p>
  92. * If you were interested in the <i>number</i> of a's which matched the first
  93. * part of our example expression, you could change the expression to
  94. * "(a*)b". Then when you compiled the expression and matched it against
  95. * something like "xaaaab", you would get results like this:
  96. *
  97. * <pre>
  98. *
  99. * RE r = new RE("(a*)b"); // Compile expression
  100. * boolean matched = r.match("xaaaab"); // Match against "xaaaab"
  101. *
  102. * <br>
  103. *
  104. * String wholeExpr = r.getParen(0); // wholeExpr will be 'aaaab'
  105. * String insideParens = r.getParen(1); // insideParens will be 'aaaa'
  106. *
  107. * <br>
  108. *
  109. * int startWholeExpr = getParenStart(0); // startWholeExpr will be index 1
  110. * int endWholeExpr = getParenEnd(0); // endWholeExpr will be index 6
  111. * int lenWholeExpr = getParenLength(0); // lenWholeExpr will be 5
  112. *
  113. * <br>
  114. *
  115. * int startInside = getParenStart(1); // startInside will be index 1
  116. * int endInside = getParenEnd(1); // endInside will be index 5
  117. * int lenInside = getParenLength(1); // lenInside will be 4
  118. *
  119. * </pre>
  120. *
  121. * You can also refer to the contents of a parenthesized expression within
  122. * a regular expression itself. This is called a 'backreference'. The first
  123. * backreference in a regular expression is denoted by \1, the second by \2
  124. * and so on. So the expression:
  125. *
  126. * <pre>
  127. *
  128. * ([0-9]+)=\1
  129. *
  130. * </pre>
  131. *
  132. * will match any string of the form n=n (like 0=0 or 2=2).
  133. *
  134. * <p>
  135. *
  136. * The full regular expression syntax accepted by RE is described here:
  137. *
  138. * <pre>
  139. *
  140. * <br>
  141. *
  142. * <b><font face=times roman>Characters</font></b>
  143. *
  144. * <br>
  145. *
  146. * <i>unicodeChar</i> Matches any identical unicode character
  147. * \ Used to quote a meta-character (like '*')
  148. * \\ Matches a single '\' character
  149. * \0nnn Matches a given octal character
  150. * \xhh Matches a given 8-bit hexadecimal character
  151. * \\uhhhh Matches a given 16-bit hexadecimal character
  152. * \t Matches an ASCII tab character
  153. * \n Matches an ASCII newline character
  154. * \r Matches an ASCII return character
  155. * \f Matches an ASCII form feed character
  156. *
  157. * <br>
  158. *
  159. * <b><font face=times roman>Character Classes</font></b>
  160. *
  161. * <br>
  162. *
  163. * [abc] Simple character class
  164. * [a-zA-Z] Character class with ranges
  165. * [^abc] Negated character class
  166. *
  167. * <br>
  168. *
  169. * <b><font face=times roman>Standard POSIX Character Classes</font></b>
  170. *
  171. * <br>
  172. *
  173. * [:alnum:] Alphanumeric characters.
  174. * [:alpha:] Alphabetic characters.
  175. * [:blank:] Space and tab characters.
  176. * [:cntrl:] Control characters.
  177. * [:digit:] Numeric characters.
  178. * [:graph:] Characters that are printable and are also visible. (A space is printable, but not visible, while an `a' is both.)
  179. * [:lower:] Lower-case alphabetic characters.
  180. * [:print:] Printable characters (characters that are not control characters.)
  181. * [:punct:] Punctuation characters (characters that are not letter, digits, control characters, or space characters).
  182. * [:space:] Space characters (such as space, tab, and formfeed, to name a few).
  183. * [:upper:] Upper-case alphabetic characters.
  184. * [:xdigit:] Characters that are hexadecimal digits.
  185. *
  186. * <br>
  187. *
  188. * <b><font face=times roman>Non-standard POSIX-style Character Classes</font></b>
  189. *
  190. * <br>
  191. *
  192. * [:javastart:] Start of a Java identifier
  193. * [:javapart:] Part of a Java identifier
  194. *
  195. * <br>
  196. *
  197. * <b><font face=times roman>Predefined Classes</font></b>
  198. *
  199. * <br>
  200. *
  201. * . Matches any character other than newline
  202. * \w Matches a "word" character (alphanumeric plus "_")
  203. * \W Matches a non-word character
  204. * \s Matches a whitespace character
  205. * \S Matches a non-whitespace character
  206. * \d Matches a digit character
  207. * \D Matches a non-digit character
  208. *
  209. * <br>
  210. *
  211. * <b><font face=times roman>Boundary Matchers</font></b>
  212. *
  213. * <br>
  214. *
  215. * ^ Matches only at the beginning of a line
  216. * $ Matches only at the end of a line
  217. * \b Matches only at a word boundary
  218. * \B Matches only at a non-word boundary
  219. *
  220. * <br>
  221. *
  222. * <b><font face=times roman>Greedy Closures</font></b>
  223. *
  224. * <br>
  225. *
  226. * A* Matches A 0 or more times (greedy)
  227. * A+ Matches A 1 or more times (greedy)
  228. * A? Matches A 1 or 0 times (greedy)
  229. * A{n} Matches A exactly n times (greedy)
  230. * A{n,} Matches A at least n times (greedy)
  231. * A{n,m} Matches A at least n but not more than m times (greedy)
  232. *
  233. * <br>
  234. *
  235. * <b><font face=times roman>Reluctant Closures</font></b>
  236. *
  237. * <br>
  238. *
  239. * A*? Matches A 0 or more times (reluctant)
  240. * A+? Matches A 1 or more times (reluctant)
  241. * A?? Matches A 0 or 1 times (reluctant)
  242. *
  243. * <br>
  244. *
  245. * <b><font face=times roman>Logical Operators</font></b>
  246. *
  247. * <br>
  248. *
  249. * AB Matches A followed by B
  250. * A|B Matches either A or B
  251. * (A) Used for subexpression grouping
  252. *
  253. * <br>
  254. *
  255. * <b><font face=times roman>Backreferences</font></b>
  256. *
  257. * <br>
  258. *
  259. * \1 Backreference to 1st parenthesized subexpression
  260. * \2 Backreference to 2nd parenthesized subexpression
  261. * \3 Backreference to 3rd parenthesized subexpression
  262. * \4 Backreference to 4th parenthesized subexpression
  263. * \5 Backreference to 5th parenthesized subexpression
  264. * \6 Backreference to 6th parenthesized subexpression
  265. * \7 Backreference to 7th parenthesized subexpression
  266. * \8 Backreference to 8th parenthesized subexpression
  267. * \9 Backreference to 9th parenthesized subexpression
  268. *
  269. * <br>
  270. *
  271. * </pre>
  272. *
  273. * <p>
  274. *
  275. * All closure operators (+, *, ?, {m,n}) are greedy by default, meaning that they
  276. * match as many elements of the string as possible without causing the overall
  277. * match to fail. If you want a closure to be reluctant (non-greedy), you can
  278. * simply follow it with a '?'. A reluctant closure will match as few elements
  279. * of the string as possible when finding matches. {m,n} closures don't currently
  280. * support reluctancy.
  281. *
  282. * <p>
  283. *
  284. * RE runs programs compiled by the RECompiler class. But the RE matcher class
  285. * does not include the actual regular expression compiler for reasons of
  286. * efficiency. In fact, if you want to pre-compile one or more regular expressions,
  287. * the 'recompile' class can be invoked from the command line to produce compiled
  288. * output like this:
  289. *
  290. * <pre>
  291. *
  292. * // Pre-compiled regular expression "a*b"
  293. * char[] re1Instructions =
  294. * {
  295. * 0x007c, 0x0000, 0x001a, 0x007c, 0x0000, 0x000d, 0x0041,
  296. * 0x0001, 0x0004, 0x0061, 0x007c, 0x0000, 0x0003, 0x0047,
  297. * 0x0000, 0xfff6, 0x007c, 0x0000, 0x0003, 0x004e, 0x0000,
  298. * 0x0003, 0x0041, 0x0001, 0x0004, 0x0062, 0x0045, 0x0000,
  299. * 0x0000,
  300. * };
  301. *
  302. * <br>
  303. *
  304. * REProgram re1 = new REProgram(re1Instructions);
  305. *
  306. * </pre>
  307. *
  308. * You can then construct a regular expression matcher (RE) object from the pre-compiled
  309. * expression re1 and thus avoid the overhead of compiling the expression at runtime.
  310. * If you require more dynamic regular expressions, you can construct a single RECompiler
  311. * object and re-use it to compile each expression. Similarly, you can change the
  312. * program run by a given matcher object at any time. However, RE and RECompiler are
  313. * not threadsafe (for efficiency reasons, and because requiring thread safety in this
  314. * class is deemed to be a rare requirement), so you will need to construct a separate
  315. * compiler or matcher object for each thread (unless you do thread synchronization
  316. * yourself).
  317. *
  318. * </pre>
  319. * <br><p><br>
  320. *
  321. * <font color=red>
  322. * <i>ISSUES:</i>
  323. *
  324. * <ul>
  325. * <li>com.weusours.util.re is not currently compatible with all standard POSIX regcomp flags
  326. * <li>com.weusours.util.re does not support POSIX equivalence classes ([=foo=] syntax) (I18N/locale issue)
  327. * <li>com.weusours.util.re does not support nested POSIX character classes (definitely should, but not completely trivial)
  328. * <li>com.weusours.util.re Does not support POSIX character collation concepts ([.foo.] syntax) (I18N/locale issue)
  329. * <li>Should there be different matching styles (simple, POSIX, Perl etc?)
  330. * <li>Should RE support character iterators (for backwards RE matching!)?
  331. * <li>Should RE support reluctant {m,n} closures (does anyone care)?
  332. * <li>Not *all* possibilities are considered for greediness when backreferences
  333. * are involved (as POSIX suggests should be the case). The POSIX RE
  334. * "(ac*)c*d[ac]*\1", when matched against "acdacaa" should yield a match
  335. * of acdacaa where \1 is "a". This is not the case in this RE package,
  336. * and actually Perl doesn't go to this extent either! Until someone
  337. * actually complains about this, I'm not sure it's worth "fixing".
  338. * If it ever is fixed, test #137 in RETest.txt should be updated.
  339. * </ul>
  340. *
  341. * </font>
  342. *
  343. * @see recompile
  344. * @see RECompiler
  345. *
  346. * @author <a href="mailto:jonl@muppetlabs.com">Jonathan Locke</a>
  347. * @version $Id: RE.java,v 1.6 2000/08/22 17:19:38 jon Exp $
  348. */
  349. public class RE
  350. {
  351. /**
  352. * Specifies normal, case-sensitive matching behaviour.
  353. */
  354. public static final int MATCH_NORMAL = 0x0000;
  355. /**
  356. * Flag to indicate that matching should be case-independent (folded)
  357. */
  358. public static final int MATCH_CASEINDEPENDENT = 0x0001;
  359. /**
  360. * Newlines should match as BOL/EOL (^ and $)
  361. */
  362. public static final int MATCH_MULTILINE = 0x0002;
  363. /**
  364. * Consider all input a single body of text - newlines are matched by .
  365. */
  366. public static final int MATCH_SINGLELINE = 0x0004;
  367. /************************************************
  368. * *
  369. * The format of a node in a program is: *
  370. * *
  371. * [ OPCODE ] [ OPDATA ] [ OPNEXT ] [ OPERAND ] *
  372. * *
  373. * char OPCODE - instruction *
  374. * char OPDATA - modifying data *
  375. * char OPNEXT - next node (relative offset) *
  376. * *
  377. ************************************************/
  378. // Opcode Char Opdata/Operand Meaning
  379. // ---------- ---------- --------------- --------------------------------------------------
  380. static final char OP_END = 'E'; // end of program
  381. static final char OP_BOL = '^'; // match only if at beginning of line
  382. static final char OP_EOL = '$'; // match only if at end of line
  383. static final char OP_ANY = '.'; // match any single character except newline
  384. static final char OP_ANYOF = '['; // count/ranges match any char in the list of ranges
  385. static final char OP_BRANCH = '|'; // node match this alternative or the next one
  386. static final char OP_ATOM = 'A'; // length/string length of string followed by string itself
  387. static final char OP_STAR = '*'; // node kleene closure
  388. static final char OP_PLUS = '+'; // node positive closure
  389. static final char OP_MAYBE = '?'; // node optional closure
  390. static final char OP_ESCAPE = '\\'; // escape special escape code char class (escape is E_* code)
  391. static final char OP_OPEN = '('; // number nth opening paren
  392. static final char OP_CLOSE = ')'; // number nth closing paren
  393. static final char OP_BACKREF = '#'; // number reference nth already matched parenthesized string
  394. static final char OP_GOTO = 'G'; // nothing but a (back-)pointer
  395. static final char OP_NOTHING = 'N'; // match null string such as in '(a|)'
  396. static final char OP_RELUCTANTSTAR = '8'; // none/expr reluctant '*' (mnemonic for char is unshifted '*')
  397. static final char OP_RELUCTANTPLUS = '='; // none/expr reluctant '+' (mnemonic for char is unshifted '+')
  398. static final char OP_RELUCTANTMAYBE = '/'; // none/expr reluctant '?' (mnemonic for char is unshifted '?')
  399. static final char OP_POSIXCLASS = 'P'; // classid one of the posix character classes
  400. // Escape codes
  401. static final char E_ALNUM = 'w'; // Alphanumeric
  402. static final char E_NALNUM = 'W'; // Non-alphanumeric
  403. static final char E_BOUND = 'b'; // Word boundary
  404. static final char E_NBOUND = 'B'; // Non-word boundary
  405. static final char E_SPACE = 's'; // Whitespace
  406. static final char E_NSPACE = 'S'; // Non-whitespace
  407. static final char E_DIGIT = 'd'; // Digit
  408. static final char E_NDIGIT = 'D'; // Non-digit
  409. // Posix character classes
  410. static final char POSIX_CLASS_ALNUM = 'w'; // Alphanumerics
  411. static final char POSIX_CLASS_ALPHA = 'a'; // Alphabetics
  412. static final char POSIX_CLASS_BLANK = 'b'; // Blanks
  413. static final char POSIX_CLASS_CNTRL = 'c'; // Control characters
  414. static final char POSIX_CLASS_DIGIT = 'd'; // Digits
  415. static final char POSIX_CLASS_GRAPH = 'g'; // Graphic characters
  416. static final char POSIX_CLASS_LOWER = 'l'; // Lowercase characters
  417. static final char POSIX_CLASS_PRINT = 'p'; // Printable characters
  418. static final char POSIX_CLASS_PUNCT = '!'; // Punctuation
  419. static final char POSIX_CLASS_SPACE = 's'; // Spaces
  420. static final char POSIX_CLASS_UPPER = 'u'; // Uppercase characters
  421. static final char POSIX_CLASS_XDIGIT = 'x'; // Hexadecimal digits
  422. static final char POSIX_CLASS_JSTART = 'j'; // Java identifier start
  423. static final char POSIX_CLASS_JPART = 'k'; // Java identifier part
  424. // Limits
  425. static final int maxNode = 65536; // Maximum number of nodes in a program
  426. static final int maxParen = 16; // Number of paren pairs (only 9 can be backrefs)
  427. // Node layout constants
  428. static final int offsetOpcode = 0; // Opcode offset (first character)
  429. static final int offsetOpdata = 1; // Opdata offset (second char)
  430. static final int offsetNext = 2; // Next index offset (third char)
  431. static final int nodeSize = 3; // Node size (in chars)
  432. /** Line Separator */
  433. static final String NEWLINE = System.getProperty("line.separator");
  434. // State of current program
  435. REProgram program; // Compiled regular expression 'program'
  436. CharacterIterator search; // The string being matched against
  437. int idx; // Current index in string being searched
  438. int matchFlags; // Match behaviour flags
  439. // Parenthesized subexpressions
  440. int parenCount; // Number of subexpressions matched (num open parens + 1)
  441. int start0; // Cache of start[0]
  442. int end0; // Cache of start[0]
  443. int start1; // Cache of start[1]
  444. int end1; // Cache of start[1]
  445. int start2; // Cache of start[2]
  446. int end2; // Cache of start[2]
  447. int[] startn; // Lazy-alloced array of sub-expression starts
  448. int[] endn; // Lazy-alloced array of sub-expression ends
  449. // Backreferences
  450. int[] startBackref; // Lazy-alloced array of backref starts
  451. int[] endBackref; // Lazy-alloced array of backref ends
  452. /**
  453. * Constructs a regular expression matcher from a String by compiling it
  454. * using a new instance of RECompiler. If you will be compiling many
  455. * expressions, you may prefer to use a single RECompiler object instead.
  456. * @param pattern The regular expression pattern to compile.
  457. * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
  458. * @see RECompiler
  459. * @see recompile
  460. */
  461. public RE(String pattern) throws RESyntaxException
  462. {
  463. this(pattern, MATCH_NORMAL);
  464. }
  465. /**
  466. * Constructs a regular expression matcher from a String by compiling it
  467. * using a new instance of RECompiler. If you will be compiling many
  468. * expressions, you may prefer to use a single RECompiler object instead.
  469. * @param pattern The regular expression pattern to compile.
  470. * @param matchFlags The matching style
  471. * @exception RESyntaxException Thrown if the regular expression has invalid syntax.
  472. * @see RECompiler
  473. * @see recompile
  474. */
  475. public RE(String pattern, int matchFlags) throws RESyntaxException
  476. {
  477. this(new RECompiler().compile(pattern));
  478. setMatchFlags(matchFlags);
  479. }
  480. /**
  481. * Construct a matcher for a pre-compiled regular expression from program
  482. * (bytecode) data. Permits special flags to be passed in to modify matching
  483. * behaviour.
  484. * @param program Compiled regular expression program (see RECompiler and/or recompile)
  485. * @param matchFlags One or more of the RE match behaviour flags (RE.MATCH_*):
  486. *
  487. * <pre>
  488. *
  489. * MATCH_NORMAL // Normal (case-sensitive) matching
  490. * MATCH_CASEINDEPENDENT // Case folded comparisons
  491. * MATCH_MULTILINE // Newline matches as BOL/EOL
  492. *
  493. * </pre>
  494. *
  495. * @see RECompiler
  496. * @see REProgram
  497. * @see recompile
  498. */
  499. public RE(REProgram program, int matchFlags)
  500. {
  501. setProgram(program);
  502. setMatchFlags(matchFlags);
  503. }
  504. /**
  505. * Construct a matcher for a pre-compiled regular expression from program
  506. * (bytecode) data.
  507. * @param program Compiled regular expression program
  508. * @see RECompiler
  509. * @see recompile
  510. */
  511. public RE(REProgram program)
  512. {
  513. this(program, MATCH_NORMAL);
  514. }
  515. /**
  516. * Constructs a regular expression matcher with no initial program.
  517. * This is likely to be an uncommon practice, but is still supported.
  518. */
  519. public RE()
  520. {
  521. this((REProgram)null, MATCH_NORMAL);
  522. }
  523. /**
  524. * Converts a 'simplified' regular expression to a full regular expression
  525. * @param pattern The pattern to convert
  526. * @return The full regular expression
  527. */
  528. public static String simplePatternToFullRegularExpression(String pattern)
  529. {
  530. StringBuffer buf = new StringBuffer();
  531. for (int i = 0; i < pattern.length(); i++)
  532. {
  533. char c = pattern.charAt(i);
  534. switch (c)
  535. {
  536. case '*':
  537. buf.append(".*");
  538. break;
  539. case '.':
  540. case '[':
  541. case ']':
  542. case '\\':
  543. case '+':
  544. case '?':
  545. case '{':
  546. case '}':
  547. case '$':
  548. case '^':
  549. case '|':
  550. case '(':
  551. case ')':
  552. buf.append('\\');
  553. default:
  554. buf.append(c);
  555. break;
  556. }
  557. }
  558. return buf.toString();
  559. }
  560. /**
  561. * Sets match behaviour flags which alter the way RE does matching.
  562. * @param matchFlags One or more of the RE match behaviour flags (RE.MATCH_*):
  563. *
  564. * <pre>
  565. *
  566. * MATCH_NORMAL // Normal (case-sensitive) matching
  567. * MATCH_CASEINDEPENDENT // Case folded comparisons
  568. * MATCH_MULTILINE // Newline matches as BOL/EOL
  569. *
  570. * </pre>
  571. *
  572. */
  573. public void setMatchFlags(int matchFlags)
  574. {
  575. this.matchFlags = matchFlags;
  576. }
  577. /**
  578. * Returns the current match behaviour flags.
  579. * @return Current match behaviour flags (RE.MATCH_*).
  580. *
  581. * <pre>
  582. *
  583. * MATCH_NORMAL // Normal (case-sensitive) matching
  584. * MATCH_CASEINDEPENDENT // Case folded comparisons
  585. * MATCH_MULTILINE // Newline matches as BOL/EOL
  586. *
  587. * </pre>
  588. *
  589. * @see #setMatchFlags
  590. *
  591. */
  592. public int getMatchFlags()
  593. {
  594. return matchFlags;
  595. }
  596. /**
  597. * Sets the current regular expression program used by this matcher object.
  598. * @param program Regular expression program compiled by RECompiler.
  599. * @see RECompiler
  600. * @see REProgram
  601. * @see recompile
  602. */
  603. public void setProgram(REProgram program)
  604. {
  605. this.program = program;
  606. }
  607. /**
  608. * Returns the current regular expression program in use by this matcher object.
  609. * @return Regular expression program
  610. * @see #setProgram
  611. */
  612. public REProgram getProgram()
  613. {
  614. return program;
  615. }
  616. /**
  617. * Returns the number of parenthesized subexpressions available after a successful match.
  618. * @return Number of available parenthesized subexpressions
  619. */
  620. public int getParenCount()
  621. {
  622. return parenCount;
  623. }
  624. /**
  625. * Gets the contents of a parenthesized subexpression after a successful match.
  626. * @param which Nesting level of subexpression
  627. * @return String
  628. */
  629. public String getParen(int which)
  630. {
  631. int start;
  632. if (which < parenCount && (start = getParenStart(which)) >= 0)
  633. {
  634. return search.substring(start, getParenEnd(which));
  635. }
  636. return null;
  637. }
  638. /**
  639. * Returns the start index of a given paren level.
  640. * @param which Nesting level of subexpression
  641. * @return String index
  642. */
  643. public final int getParenStart(int which)
  644. {
  645. if (which < parenCount)
  646. {
  647. switch (which)
  648. {
  649. case 0:
  650. return start0;
  651. case 1:
  652. return start1;
  653. case 2:
  654. return start2;
  655. default:
  656. if (startn == null)
  657. {
  658. allocParens();
  659. }
  660. return startn[which];
  661. }
  662. }
  663. return -1;
  664. }
  665. /**
  666. * Returns the end index of a given paren level.
  667. * @param which Nesting level of subexpression
  668. * @return String index
  669. */
  670. public final int getParenEnd(int which)
  671. {
  672. if (which < parenCount)
  673. {
  674. switch (which)
  675. {
  676. case 0:
  677. return end0;
  678. case 1:
  679. return end1;
  680. case 2:
  681. return end2;
  682. default:
  683. if (endn == null)
  684. {
  685. allocParens();
  686. }
  687. return endn[which];
  688. }
  689. }
  690. return -1;
  691. }
  692. /**
  693. * Returns the length of a given paren level.
  694. * @param which Nesting level of subexpression
  695. * @return Number of characters in the parenthesized subexpression
  696. */
  697. public final int getParenLength(int which)
  698. {
  699. if (which < parenCount)
  700. {
  701. return getParenEnd(which) - getParenStart(which);
  702. }
  703. return -1;
  704. }
  705. /**
  706. * Sets the start of a paren level
  707. * @param which Which paren level
  708. * @param i Index in input array
  709. */
  710. protected final void setParenStart(int which, int i)
  711. {
  712. if (which < parenCount)
  713. {
  714. switch (which)
  715. {
  716. case 0:
  717. start0 = i;
  718. break;
  719. case 1:
  720. start1 = i;
  721. break;
  722. case 2:
  723. start2 = i;
  724. break;
  725. default:
  726. if (startn == null)
  727. {
  728. allocParens();
  729. }
  730. startn[which] = i;
  731. break;
  732. }
  733. }
  734. }
  735. /**
  736. * Sets the end of a paren level
  737. * @param which Which paren level
  738. * @param i Index in input array
  739. */
  740. protected final void setParenEnd(int which, int i)
  741. {
  742. if (which < parenCount)
  743. {
  744. switch (which)
  745. {
  746. case 0:
  747. end0 = i;
  748. break;
  749. case 1:
  750. end1 = i;
  751. break;
  752. case 2:
  753. end2 = i;
  754. break;
  755. default:
  756. if (endn == null)
  757. {
  758. allocParens();
  759. }
  760. endn[which] = i;
  761. break;
  762. }
  763. }
  764. }
  765. /**
  766. * Throws an Error representing an internal error condition probably resulting
  767. * from a bug in the regular expression compiler (or possibly data corruption).
  768. * In practice, this should be very rare.
  769. * @param s Error description
  770. */
  771. protected void internalError(String s) throws Error
  772. {
  773. throw new Error("RE internal error: " + s);
  774. }
  775. /**
  776. * Performs lazy allocation of subexpression arrays
  777. */
  778. private final void allocParens()
  779. {
  780. // Allocate arrays for subexpressions
  781. startn = new int[maxParen];
  782. endn = new int[maxParen];
  783. // Set sub-expression pointers to invalid values
  784. for (int i = 0; i < maxParen; i++)
  785. {
  786. startn[i] = -1;
  787. endn[i] = -1;
  788. }
  789. }
  790. /**
  791. * Try to match a string against a subset of nodes in the program
  792. * @param firstNode Node to start at in program
  793. * @param lastNode Last valid node (used for matching a subexpression without
  794. * matching the rest of the program as well).
  795. * @param idxStart Starting position in character array
  796. * @return Final input array index if match succeeded. -1 if not.
  797. */
  798. protected int matchNodes(int firstNode, int lastNode, int idxStart)
  799. {
  800. // Our current place in the string
  801. int idx = idxStart;
  802. // Loop while node is valid
  803. int next, opcode, opdata;
  804. int idxNew;
  805. char[] instruction = program.instruction;
  806. for (int node = firstNode; node < lastNode; )
  807. {
  808. opcode = instruction[node + offsetOpcode];
  809. next = node + (short)instruction[node + offsetNext];
  810. opdata = instruction[node + offsetOpdata];
  811. switch (opcode)
  812. {
  813. case OP_RELUCTANTMAYBE:
  814. {
  815. int once = 0;
  816. do
  817. {
  818. // Try to match the rest without using the reluctant subexpr
  819. if ((idxNew = matchNodes(next, maxNode, idx)) != -1)
  820. {
  821. return idxNew;
  822. }
  823. }
  824. while ((once++ == 0) && (idx = matchNodes(node + nodeSize, next, idx)) != -1);
  825. return -1;
  826. }
  827. case OP_RELUCTANTPLUS:
  828. while ((idx = matchNodes(node + nodeSize, next, idx)) != -1)
  829. {
  830. // Try to match the rest without using the reluctant subexpr
  831. if ((idxNew = matchNodes(next, maxNode, idx)) != -1)
  832. {
  833. return idxNew;
  834. }
  835. }
  836. return -1;
  837. case OP_RELUCTANTSTAR:
  838. do
  839. {
  840. // Try to match the rest without using the reluctant subexpr
  841. if ((idxNew = matchNodes(next, maxNode, idx)) != -1)
  842. {
  843. return idxNew;
  844. }
  845. }
  846. while ((idx = matchNodes(node + nodeSize, next, idx)) != -1);
  847. return -1;
  848. case OP_OPEN:
  849. // Match subexpression
  850. if ((program.flags & REProgram.OPT_HASBACKREFS) != 0)
  851. {
  852. startBackref[opdata] = idx;
  853. }
  854. if ((idxNew = matchNodes(next, maxNode, idx)) != -1)
  855. {
  856. // Increase valid paren count
  857. if ((opdata + 1) > parenCount)
  858. {
  859. parenCount = opdata + 1;
  860. }
  861. // Don't set paren if already set later on
  862. if (getParenStart(opdata) == -1)
  863. {
  864. setParenStart(opdata, idx);
  865. }
  866. }
  867. return idxNew;
  868. case OP_CLOSE:
  869. // Done matching subexpression
  870. if ((program.flags & REProgram.OPT_HASBACKREFS) != 0)
  871. {
  872. endBackref[opdata] = idx;
  873. }
  874. if ((idxNew = matchNodes(next, maxNode, idx)) != -1)
  875. {
  876. // Increase valid paren count
  877. if ((opdata + 1) > parenCount)
  878. {
  879. parenCount = opdata + 1;
  880. }
  881. // Don't set paren if already set later on
  882. if (getParenEnd(opdata) == -1)
  883. {
  884. setParenEnd(opdata, idx);
  885. }
  886. }
  887. return idxNew;
  888. case OP_BACKREF:
  889. {
  890. // Get the start and end of the backref
  891. int s = startBackref[opdata];
  892. int e = endBackref[opdata];
  893. // We don't know the backref yet
  894. if (s == -1 || e == -1)
  895. {
  896. return -1;
  897. }
  898. // The backref is empty size
  899. if (s == e)
  900. {
  901. break;
  902. }
  903. // Get the length of the backref
  904. int l = e - s;
  905. // If there's not enough input left, give up.
  906. if (search.isEnd(idx + l - 1))
  907. {
  908. return -1;
  909. }
  910. // Case fold the backref?
  911. if ((matchFlags & MATCH_CASEINDEPENDENT) != 0)
  912. {
  913. // Compare backref to input, case-folding as we go
  914. for (int i = 0; i < l; i++)
  915. {
  916. if (Character.toLowerCase(search.charAt(idx++)) != Character.toLowerCase(search.charAt(s + i)))
  917. {
  918. return -1;
  919. }
  920. }
  921. }
  922. else
  923. {
  924. // Compare backref to input
  925. for (int i = 0; i < l; i++)
  926. {
  927. if (search.charAt(idx++) != search.charAt(s + i))
  928. {
  929. return -1;
  930. }
  931. }
  932. }
  933. }
  934. break;
  935. case OP_BOL:
  936. // Fail if we're not at the start of the string
  937. if (idx != 0)
  938. {
  939. // If we're multiline matching, we could still be at the start of a line
  940. if ((matchFlags & MATCH_MULTILINE) == MATCH_MULTILINE)
  941. {
  942. // If not at start of line, give up
  943. if (idx <= 0 || !isNewline(idx - 1)) {
  944. return -1;
  945. } else {
  946. break;
  947. }
  948. }
  949. return -1;
  950. }
  951. break;
  952. case OP_EOL:
  953. // If we're not at the end of string
  954. if (!search.isEnd(0) && !search.isEnd(idx))
  955. {
  956. // If we're multi-line matching
  957. if ((matchFlags & MATCH_MULTILINE) == MATCH_MULTILINE)
  958. {
  959. // Give up if we're not at the end of a line
  960. if (! isNewline(idx)) {
  961. return -1;
  962. } else {
  963. break;
  964. }
  965. }
  966. return -1;
  967. }
  968. break;
  969. case OP_ESCAPE:
  970. // Which escape?
  971. switch (opdata)
  972. {
  973. // Word boundary match
  974. case E_NBOUND:
  975. case E_BOUND:
  976. {
  977. char cLast = ((idx == getParenStart(0)) ? '\n' : search.charAt(idx - 1));
  978. char cNext = ((search.isEnd(idx)) ? '\n' : search.charAt(idx));
  979. if ((Character.isLetterOrDigit(cLast) == Character.isLetterOrDigit(cNext)) == (opdata == E_BOUND))
  980. {
  981. return -1;
  982. }
  983. }
  984. break;
  985. // Alpha-numeric, digit, space, javaLetter, javaLetterOrDigit
  986. case E_ALNUM:
  987. case E_NALNUM:
  988. case E_DIGIT:
  989. case E_NDIGIT:
  990. case E_SPACE:
  991. case E_NSPACE:
  992. // Give up if out of input
  993. if (search.isEnd(idx))
  994. {
  995. return -1;
  996. }
  997. // Switch on escape
  998. switch (opdata)
  999. {
  1000. case E_ALNUM:
  1001. case E_NALNUM:
  1002. if (!(Character.isLetterOrDigit(search.charAt(idx)) == (opdata == E_ALNUM)))
  1003. {
  1004. return -1;
  1005. }
  1006. break;
  1007. case E_DIGIT:
  1008. case E_NDIGIT:
  1009. if (!(Character.isDigit(search.charAt(idx)) == (opdata == E_DIGIT)))
  1010. {
  1011. return -1;
  1012. }
  1013. break;
  1014. case E_SPACE:
  1015. case E_NSPACE:
  1016. if (!(Character.isWhitespace(search.charAt(idx)) == (opdata == E_SPACE)))
  1017. {
  1018. return -1;
  1019. }
  1020. break;
  1021. }
  1022. idx++;
  1023. break;
  1024. default:
  1025. internalError("Unrecognized escape '" + opdata + "'");
  1026. }
  1027. break;
  1028. case OP_ANY:
  1029. if((matchFlags & MATCH_SINGLELINE) == MATCH_SINGLELINE) {
  1030. // Match anything
  1031. if(search.isEnd(idx))
  1032. {
  1033. return -1;
  1034. }
  1035. idx++;
  1036. break;
  1037. }
  1038. else
  1039. {
  1040. // Match anything but a newline
  1041. if (search.isEnd(idx) || search.charAt(idx++) == '\n')
  1042. {
  1043. return -1;
  1044. }
  1045. break;
  1046. }
  1047. case OP_ATOM:
  1048. {
  1049. // Match an atom value
  1050. if (search.isEnd(idx))
  1051. {
  1052. return -1;
  1053. }
  1054. // Get length of atom and starting index
  1055. int lenAtom = opdata;
  1056. int startAtom = node + nodeSize;
  1057. // Give up if not enough input remains to have a match
  1058. if (search.isEnd(lenAtom + idx - 1))
  1059. {
  1060. return -1;
  1061. }
  1062. // Match atom differently depending on casefolding flag
  1063. if ((matchFlags & MATCH_CASEINDEPENDENT) != 0)
  1064. {
  1065. for (int i = 0; i < lenAtom; i++)
  1066. {
  1067. if (Character.toLowerCase(search.charAt(idx++)) != Character.toLowerCase(instruction[startAtom + i]))
  1068. {
  1069. return -1;
  1070. }
  1071. }
  1072. }
  1073. else
  1074. {
  1075. for (int i = 0; i < lenAtom; i++)
  1076. {
  1077. if (search.charAt(idx++) != instruction[startAtom + i])
  1078. {
  1079. return -1;
  1080. }
  1081. }
  1082. }
  1083. }
  1084. break;
  1085. case OP_POSIXCLASS:
  1086. {
  1087. // Out of input?
  1088. if (search.isEnd(idx))
  1089. {
  1090. return -1;
  1091. }
  1092. switch (opdata)
  1093. {
  1094. case POSIX_CLASS_ALNUM:
  1095. if (!Character.isLetterOrDigit(search.charAt(idx)))
  1096. {
  1097. return -1;
  1098. }
  1099. break;
  1100. case POSIX_CLASS_ALPHA:
  1101. if (!Character.isLetter(search.charAt(idx)))
  1102. {
  1103. return -1;
  1104. }
  1105. break;
  1106. case POSIX_CLASS_DIGIT:
  1107. if (!Character.isDigit(search.charAt(idx)))
  1108. {
  1109. return -1;
  1110. }
  1111. break;
  1112. case POSIX_CLASS_BLANK: // JWL - bugbug: is this right??
  1113. if (!Character.isSpaceChar(search.charAt(idx)))
  1114. {
  1115. return -1;
  1116. }
  1117. break;
  1118. case POSIX_CLASS_SPACE:
  1119. if (!Character.isWhitespace(search.charAt(idx)))
  1120. {
  1121. return -1;
  1122. }
  1123. break;
  1124. case POSIX_CLASS_CNTRL:
  1125. if (Character.getType(search.charAt(idx)) != Character.CONTROL)
  1126. {
  1127. return -1;
  1128. }
  1129. break;
  1130. case POSIX_CLASS_GRAPH: // JWL - bugbug???
  1131. switch (Character.getType(search.charAt(idx)))
  1132. {
  1133. case Character.MATH_SYMBOL:
  1134. case Character.CURRENCY_SYMBOL:
  1135. case Character.MODIFIER_SYMBOL:
  1136. case Character.OTHER_SYMBOL:
  1137. break;
  1138. default:
  1139. return -1;
  1140. }
  1141. break;
  1142. case POSIX_CLASS_LOWER:
  1143. if (Character.getType(search.charAt(idx)) != Character.LOWERCASE_LETTER)
  1144. {
  1145. return -1;
  1146. }
  1147. break;
  1148. case POSIX_CLASS_UPPER:
  1149. if (Character.getType(search.charAt(idx)) != Character.UPPERCASE_LETTER)
  1150. {
  1151. return -1;
  1152. }
  1153. break;
  1154. case POSIX_CLASS_PRINT:
  1155. if (Character.getType(search.charAt(idx)) == Character.CONTROL)
  1156. {
  1157. return -1;
  1158. }
  1159. break;
  1160. case POSIX_CLASS_PUNCT:
  1161. {
  1162. int type = Character.getType(search.charAt(idx));
  1163. switch(type)
  1164. {
  1165. case Character.DASH_PUNCTUATION:
  1166. case Character.START_PUNCTUATION:
  1167. case Character.END_PUNCTUATION:
  1168. case Character.CONNECTOR_PUNCTUATION:
  1169. case Character.OTHER_PUNCTUATION:
  1170. break;
  1171. default:
  1172. return -1;
  1173. }
  1174. }
  1175. break;
  1176. case POSIX_CLASS_XDIGIT: // JWL - bugbug??
  1177. {
  1178. boolean isXDigit = ((search.charAt(idx) >= '0' && search.charAt(idx) <= '9') ||
  1179. (search.charAt(idx) >= 'a' && search.charAt(idx) <= 'f') ||
  1180. (search.charAt(idx) >= 'A' && search.charAt(idx) <= 'F'));
  1181. if (!isXDigit)
  1182. {
  1183. return -1;
  1184. }
  1185. }
  1186. break;
  1187. case POSIX_CLASS_JSTART:
  1188. if (!Character.isJavaIdentifierStart(search.charAt(idx)))
  1189. {
  1190. return -1;
  1191. }
  1192. break;
  1193. case POSIX_CLASS_JPART:
  1194. if (!Character.isJavaIdentifierPart(search.charAt(idx)))
  1195. {
  1196. return -1;
  1197. }
  1198. break;
  1199. default:
  1200. internalError("Bad posix class");
  1201. break;
  1202. }
  1203. // Matched.
  1204. idx++;
  1205. }
  1206. break;
  1207. case OP_ANYOF:
  1208. {
  1209. // Out of input?
  1210. if (search.isEnd(idx))
  1211. {
  1212. return -1;
  1213. }
  1214. // Get character to match against character class and maybe casefold
  1215. char c = search.charAt(idx);
  1216. boolean caseFold = (matchFlags & MATCH_CASEINDEPENDENT) != 0;
  1217. if (caseFold)
  1218. {
  1219. c = Character.toLowerCase(c);
  1220. }
  1221. // Loop through character class checking our match character
  1222. int idxRange = node + nodeSize;
  1223. int idxEnd = idxRange + (opdata * 2);
  1224. boolean match = false;
  1225. for (int i = idxRange; i < idxEnd; )
  1226. {
  1227. // Get start, end and match characters
  1228. char s = instruction[i++];
  1229. char e = instruction[i++];
  1230. // Fold ends of range and match character
  1231. if (caseFold)
  1232. {
  1233. s = Character.toLowerCase(s);
  1234. e = Character.toLowerCase(e);
  1235. }
  1236. // If the match character is in range, break out
  1237. if (c >= s && c <= e)
  1238. {
  1239. match = true;
  1240. break;
  1241. }
  1242. }
  1243. // Fail if we didn't match the character class
  1244. if (!match)
  1245. {
  1246. return -1;
  1247. }
  1248. idx++;
  1249. }
  1250. break;
  1251. case OP_BRANCH:
  1252. {
  1253. // Check for choices
  1254. if (instruction[next + offsetOpcode] != OP_BRANCH)
  1255. {
  1256. // If there aren't any other choices, just evaluate this branch.
  1257. node += nodeSize;
  1258. continue;
  1259. }
  1260. // Try all available branches
  1261. short nextBranch;
  1262. do
  1263. {
  1264. // Try matching the branch against the string
  1265. if ((idxNew = matchNodes(node + nodeSize, maxNode, idx)) != -1)
  1266. {
  1267. return idxNew;
  1268. }
  1269. // Go to next branch (if any)
  1270. nextBranch = (short)instruction[node + offsetNext];
  1271. node += nextBranch;
  1272. }
  1273. while (nextBranch != 0 && (instruction[node + offsetOpcode] == OP_BRANCH));
  1274. // Failed to match any branch!
  1275. return -1;
  1276. }
  1277. case OP_NOTHING:
  1278. case OP_GOTO:
  1279. // Just advance to the next node without doing anything
  1280. break;
  1281. case OP_END:
  1282. // Match has succeeded!
  1283. setParenEnd(0, idx);
  1284. return idx;
  1285. default:
  1286. // Corrupt program
  1287. internalError("Invalid opcode '" + opcode + "'");
  1288. }
  1289. // Advance to the next node in the program
  1290. node = next;
  1291. }
  1292. // We "should" never end up here
  1293. internalError("Corrupt program");
  1294. return -1;
  1295. }
  1296. /**
  1297. * Match the current regular expression program against the current
  1298. * input string, starting at index i of the input string. This method
  1299. * is only meant for internal use.
  1300. * @param i The input string index to start matching at
  1301. * @return True if the input matched the expression
  1302. */
  1303. protected boolean matchAt(int i)
  1304. {
  1305. // Initialize start pointer, paren cache and paren count
  1306. start0 = -1;
  1307. end0 = -1;
  1308. start1 = -1;
  1309. end1 = -1;
  1310. start2 = -1;
  1311. end2 = -1;
  1312. startn = null;
  1313. endn = null;
  1314. parenCount = 1;
  1315. setParenStart(0, i);
  1316. // Allocate backref arrays (unless optimizations indicate otherwise)
  1317. if ((program.flags & REProgram.OPT_HASBACKREFS) != 0)
  1318. {
  1319. startBackref = new int[maxParen];
  1320. endBackref = new int[maxParen];
  1321. }
  1322. // Match against string
  1323. int idx;
  1324. if ((idx = matchNodes(0, maxNode, i)) != -1)
  1325. {
  1326. setParenEnd(0, idx);
  1327. return true;
  1328. }
  1329. // Didn't match
  1330. parenCount = 0;
  1331. return false;
  1332. }
  1333. /**
  1334. * Matches the current regular expression program against a character array,
  1335. * starting at a given index.
  1336. * @param search String to match against
  1337. * @param i Index to start searching at
  1338. * @return True if string matched
  1339. */
  1340. public boolean match(String search, int i)
  1341. {
  1342. return match(new StringCharacterIterator(search), i);
  1343. }
  1344. /**
  1345. * Matches the current regular expression program against a character array,
  1346. * starting at a given index.
  1347. * @param search String to match against
  1348. * @param i Index to start searching at
  1349. * @return True if string matched
  1350. */
  1351. public boolean match(CharacterIterator search, int i)
  1352. {
  1353. // There is no compiled program to search with!
  1354. if (program == null)
  1355. {
  1356. // This should be uncommon enough to be an error case rather
  1357. // than an exception (which would have to be handled everywhere)
  1358. internalError("No RE program to run!");
  1359. }
  1360. // Save string to search
  1361. this.search = search;
  1362. // Can we optimize the search by looking for a prefix string?
  1363. if (program.prefix == null)
  1364. {
  1365. // Unprefixed matching must try for a match at each character
  1366. for ( ;! search.isEnd(i - 1); i++)
  1367. {
  1368. // Try a match at index i
  1369. if (matchAt(i))
  1370. {
  1371. return true;
  1372. }
  1373. }
  1374. return false;
  1375. }
  1376. else
  1377. {
  1378. // Prefix-anchored matching is possible
  1379. boolean caseIndependent = (matchFlags & MATCH_CASEINDEPENDENT) != 0;
  1380. char[] prefix = program.prefix;
  1381. for ( ;! search.isEnd(i + prefix.length - 1); i++)
  1382. {
  1383. // If the first character of the prefix matches
  1384. boolean match = false;
  1385. if (caseIndependent)
  1386. match = Character.toLowerCase(search.charAt(i)) == Character.toLowerCase(prefix[0]);
  1387. else
  1388. match = search.charAt(i) == prefix[0];
  1389. if (match)
  1390. {
  1391. // Save first character position
  1392. int firstChar = i++;
  1393. int k;
  1394. for (k = 1; k < prefix.length; )
  1395. {
  1396. // If there's a mismatch of any character in the prefix, give up
  1397. if (caseIndependent)
  1398. match = Character.toLowerCase(search.charAt(i++)) == Character.toLowerCase(prefix[k++]);
  1399. else
  1400. match = search.charAt(i++) == prefix[k++];
  1401. if (!match)
  1402. {
  1403. break;
  1404. }
  1405. }
  1406. // See if the whole prefix string matched
  1407. if (k == prefix.length)
  1408. {
  1409. // We matched the full prefix at firstChar, so try it
  1410. if (matchAt(firstChar))
  1411. {
  1412. return true;
  1413. }
  1414. }
  1415. // Match failed, reset i to continue the search
  1416. i = firstChar;
  1417. }
  1418. }
  1419. return false;
  1420. }
  1421. }
  1422. /**
  1423. * Matches the current regular expression program against a String.
  1424. * @param search String to match against
  1425. * @return True if string matched
  1426. */
  1427. public boolean match(String search)
  1428. {
  1429. return match(search, 0);
  1430. }
  1431. /**
  1432. * Splits a string into an array of strings on regular expression boundaries.
  1433. * This function works the same way as the Perl function of the same name.
  1434. * Given a regular expression of "[ab]+" and a string to split of
  1435. * "xyzzyababbayyzabbbab123", the result would be the array of Strings
  1436. * "[xyzzy, yyz, 123]".
  1437. * @param s String to split on this regular exression
  1438. * @return Array of strings
  1439. */
  1440. public String[] split(String s)
  1441. {
  1442. // Create new vector
  1443. Vector v = new Vector();
  1444. // Start at position 0 and search the whole string
  1445. int pos = 0;
  1446. int len = s.length();
  1447. // Try a match at each position
  1448. while (pos < len && match(s, pos))
  1449. {
  1450. // Get start of match
  1451. int start = getParenStart(0);
  1452. // Get end of match
  1453. int newpos = getParenEnd(0);
  1454. // Check if no progress was made
  1455. if (newpos == pos)
  1456. {
  1457. v.addElement(s.substring(pos, start + 1));
  1458. newpos++;
  1459. }
  1460. else
  1461. {
  1462. v.addElement(s.substring(pos, start));
  1463. }
  1464. // Move to new position
  1465. pos = newpos;
  1466. }
  1467. // Push remainder if it's not empty
  1468. String remainder = s.substring(pos);
  1469. if (remainder.length() != 0)
  1470. {
  1471. v.addElement(remainder);
  1472. }
  1473. // Return vector as an array of strings
  1474. String[] ret = new String[v.size()];
  1475. v.copyInto(ret);
  1476. return ret;
  1477. }
  1478. /**
  1479. * Flag bit that indicates that subst should replace all occurrences of this
  1480. * regular expression.
  1481. */
  1482. public static final int REPLACE_ALL = 0x0000;
  1483. /**
  1484. * Flag bit that indicates that subst should only replace the first occurrence
  1485. * of this regular expression.
  1486. */
  1487. public static final int REPLACE_FIRSTONLY = 0x0001;
  1488. /**
  1489. * Substitutes a string for this regular expression in another string.
  1490. * This method works like the Perl function of the same name.
  1491. * Given a regular expression of "a*b", a String to substituteIn of
  1492. * "aaaabfooaaabgarplyaaabwackyb" and the substitution String "-", the
  1493. * resulting String returned by subst would be "-foo-garply-wacky-".
  1494. * @param substituteIn String to substitute within
  1495. * @param substitution String to substitute for all matches of this regular expression.
  1496. * @return The string substituteIn with zero or more occurrences of the current
  1497. * regular expression replaced with the substitution String (if this regular
  1498. * expression object doesn't match at any position, the original String is returned
  1499. * unchanged).
  1500. */
  1501. public String subst(String substituteIn, String substitution)
  1502. {
  1503. return subst(substituteIn, substitution, REPLACE_ALL);
  1504. }
  1505. /**
  1506. * Substitutes a string for this regular expression in another string.
  1507. * This method works like the Perl function of the same name.
  1508. * Given a regular expression of "a*b", a String to substituteIn of
  1509. * "aaaabfooaaabgarplyaaabwackyb" and the substitution String "-", the
  1510. * resulting String returned by subst would be "-foo-garply-wacky-".
  1511. * @param substituteIn String to substitute within
  1512. * @param substitution String to substitute for matches of this regular expression
  1513. * @param flags One or more bitwise flags from REPLACE_*. If the REPLACE_FIRSTONLY
  1514. * flag bit is set, only the first occurrence of this regular expression is replaced.
  1515. * If the bit is not set (REPLACE_ALL), all occurrences of this pattern will be
  1516. * replaced.
  1517. * @return The string substituteIn with zero or more occurrences of the current
  1518. * regular expression replaced with the substitution String (if this regular
  1519. * expression object doesn't match at any position, the original String is returned
  1520. * unchanged).
  1521. */
  1522. public String subst(String substituteIn, String substitution, int flags)
  1523. {
  1524. // String to return
  1525. StringBuffer ret = new StringBuffer();
  1526. // Start at position 0 and search the whole string
  1527. int pos = 0;
  1528. int len = substituteIn.length();
  1529. // Try a match at each position
  1530. while (pos < len && match(substituteIn, pos))
  1531. {
  1532. // Append string before match
  1533. ret.append(substituteIn.substring(pos, getParenStart(0)));
  1534. // Append substitution
  1535. ret.append(substitution);
  1536. // Move forward, skipping past match
  1537. int newpos = getParenEnd(0);
  1538. // We always want to make progress!
  1539. if (newpos == pos)
  1540. {
  1541. newpos++;
  1542. }
  1543. // Try new position
  1544. pos = newpos;
  1545. // Break out if we're only supposed to replace one occurrence
  1546. if ((flags & REPLACE_FIRSTONLY) != 0)
  1547. {
  1548. break;
  1549. }
  1550. }
  1551. // If there's remaining input, append it
  1552. if (pos < len)
  1553. {
  1554. ret.append(substituteIn.substring(pos));
  1555. }
  1556. // Return string buffer as string
  1557. return ret.toString();
  1558. }
  1559. /**
  1560. * Returns an array of Strings, whose toString representation matches a regular
  1561. * expression. This method works like the Perl function of the same name. Given
  1562. * a regular expression of "a*b" and an array of String objects of [foo, aab, zzz,
  1563. * aaaab], the array of Strings returned by grep would be [aab, aaaab].
  1564. * @param search Array of Objects to search
  1565. * @return Array of Objects whose toString value matches this regular expression.
  1566. */
  1567. public String[] grep(Object[] search)
  1568. {
  1569. // Create new vector to hold return items
  1570. Vector v = new Vector();
  1571. // Traverse array of objects
  1572. for (int i = 0; i < search.length; i++)
  1573. {
  1574. // Get next object as a string
  1575. String s = search[i].toString();
  1576. // If it matches this regexp, add it to the list
  1577. if (match(s))
  1578. {
  1579. v.addElement(s);
  1580. }
  1581. }
  1582. // Return vector as an array of strings
  1583. String[] ret = new String[v.size()];
  1584. v.copyInto(ret);
  1585. return ret;
  1586. }
  1587. /** @return true if at the i-th position in the 'search' a newline ends */
  1588. private boolean isNewline(int i) {
  1589. if (i < NEWLINE.length() - 1) {
  1590. return false;
  1591. }
  1592. if (search.charAt(i) == '\n') {
  1593. return true;
  1594. }
  1595. for (int j = NEWLINE.length() - 1; j >= 0; j--, i--) {
  1596. if (NEWLINE.charAt(j) != search.charAt(i)) {
  1597. return false;
  1598. }
  1599. }
  1600. return true;
  1601. }
  1602. }