1. /*
  2. * @(#)Matcher.java 1.58 04/06/28
  3. *
  4. * Copyright 2004 Sun Microsystems, Inc. All rights reserved.
  5. * SUN PROPRIETARY/CONFIDENTIAL. Use is subject to license terms.
  6. */
  7. package java.util.regex;
  8. /**
  9. * An engine that performs match operations on a {@linkplain
  10. * java.lang.CharSequence character sequence} by interpreting a {@link Pattern}.
  11. *
  12. * <p> A matcher is created from a pattern by invoking the pattern's {@link
  13. * Pattern#matcher matcher} method. Once created, a matcher can be used to
  14. * perform three different kinds of match operations:
  15. *
  16. * <ul>
  17. *
  18. * <li><p> The {@link #matches matches} method attempts to match the entire
  19. * input sequence against the pattern. </p></li>
  20. *
  21. * <li><p> The {@link #lookingAt lookingAt} method attempts to match the
  22. * input sequence, starting at the beginning, against the pattern. </p></li>
  23. *
  24. * <li><p> The {@link #find find} method scans the input sequence looking for
  25. * the next subsequence that matches the pattern. </p></li>
  26. *
  27. * </ul>
  28. *
  29. * <p> Each of these methods returns a boolean indicating success or failure.
  30. * More information about a successful match can be obtained by querying the
  31. * state of the matcher.
  32. *
  33. * <p> A matcher finds matches in a subset of its input called the
  34. * <i>region</i>. By default, the region contains all of the matcher's input.
  35. * The region can be modified via the {@link #region region} method and queried
  36. * via the {@link #regionStart regionStart} and {@link #regionEnd regionEnd}
  37. * methods. The way that the region boundaries interact with some pattern
  38. * constructs can be changed. See {@link #useAnchoringBounds
  39. * useAnchoringBounds} and {@link #useTransparentBounds useTransparentBounds}
  40. * for more details.
  41. *
  42. * <p> This class also defines methods for replacing matched subsequences with
  43. * new strings whose contents can, if desired, be computed from the match
  44. * result. The {@link #appendReplacement appendReplacement} and {@link
  45. * #appendTail appendTail} methods can be used in tandem in order to collect
  46. * the result into an existing string buffer, or the more convenient {@link
  47. * #replaceAll replaceAll} method can be used to create a string in which every
  48. * matching subsequence in the input sequence is replaced.
  49. *
  50. * <p> The explicit state of a matcher includes the start and end indices of
  51. * the most recent successful match. It also includes the start and end
  52. * indices of the input subsequence captured by each <a
  53. * href="Pattern.html#cg">capturing group</a> in the pattern as well as a total
  54. * count of such subsequences. As a convenience, methods are also provided for
  55. * returning these captured subsequences in string form.
  56. *
  57. * <p> The explicit state of a matcher is initially undefined; attempting to
  58. * query any part of it before a successful match will cause an {@link
  59. * IllegalStateException} to be thrown. The explicit state of a matcher is
  60. * recomputed by every match operation.
  61. *
  62. * <p> The implicit state of a matcher includes the input character sequence as
  63. * well as the <i>append position</i>, which is initially zero and is updated
  64. * by the {@link #appendReplacement appendReplacement} method.
  65. *
  66. * <p> A matcher may be reset explicitly by invoking its {@link #reset()}
  67. * method or, if a new input sequence is desired, its {@link
  68. * #reset(java.lang.CharSequence) reset(CharSequence)} method. Resetting a
  69. * matcher discards its explicit state information and sets the append position
  70. * to zero.
  71. *
  72. * <p> Instances of this class are not safe for use by multiple concurrent
  73. * threads. </p>
  74. *
  75. *
  76. * @author Mike McCloskey
  77. * @author Mark Reinhold
  78. * @author JSR-51 Expert Group
  79. * @version 1.58, 04/06/28
  80. * @since 1.4
  81. * @spec JSR-51
  82. */
  83. public final class Matcher implements MatchResult {
  84. /**
  85. * The Pattern object that created this Matcher.
  86. */
  87. Pattern parentPattern;
  88. /**
  89. * The storage used by groups. They may contain invalid values if
  90. * a group was skipped during the matching.
  91. */
  92. int[] groups;
  93. /**
  94. * The range within the sequence that is to be matched. Anchors
  95. * will match at these "hard" boundaries. Changing the region
  96. * changes these values.
  97. */
  98. int from, to;
  99. /**
  100. * The original string being matched.
  101. */
  102. CharSequence text;
  103. /**
  104. * Matcher state used by the last node. NOANCHOR is used when a
  105. * match does not have to consume all of the input. ENDANCHOR is
  106. * the mode used for matching all the input.
  107. */
  108. static final int ENDANCHOR = 1;
  109. static final int NOANCHOR = 0;
  110. int acceptMode = NOANCHOR;
  111. /**
  112. * The range of string that last matched the pattern. If the last
  113. * match failed then first is -1; last initially holds 0 then it
  114. * holds the index of the end of the last match (which is where the
  115. * next search starts).
  116. */
  117. int first = -1, last = 0;
  118. /**
  119. * The end index of what matched in the last match operation.
  120. */
  121. int oldLast = -1;
  122. /**
  123. * The index of the last position appended in a substitution.
  124. */
  125. int lastAppendPosition = 0;
  126. /**
  127. * Storage used by nodes to tell what repetition they are on in
  128. * a pattern, and where groups begin. The nodes themselves are stateless,
  129. * so they rely on this field to hold state during a match.
  130. */
  131. int[] locals;
  132. /**
  133. * Boolean indicating whether or not more input could change
  134. * the results of the last match.
  135. *
  136. * If hitEnd is true, and a match was found, then more input
  137. * might cause a different match to be found.
  138. * If hitEnd is true and a match was not found, then more
  139. * input could cause a match to be found.
  140. * If hitEnd is false and a match was found, then more input
  141. * will not change the match.
  142. * If hitEnd is false and a match was not found, then more
  143. * input will not cause a match to be found.
  144. */
  145. boolean hitEnd;
  146. /**
  147. * Boolean indicating whether or not more input could change
  148. * a positive match into a negative one.
  149. *
  150. * If requireEnd is true, and a match was found, then more
  151. * input could cause the match to be lost.
  152. * If requireEnd is false and a match was found, then more
  153. * input might change the match but the match won't be lost.
  154. * If a match was not found, then requireEnd has no meaning.
  155. */
  156. boolean requireEnd;
  157. /**
  158. * If transparentBounds is true then the boundaries of this
  159. * matcher's region are transparent to lookahead, lookbehind,
  160. * and boundary matching constructs that try to see beyond them.
  161. */
  162. boolean transparentBounds = false;
  163. /**
  164. * If anchoringBounds is true then the boundaries of this
  165. * matcher's region match anchors such as ^ and $.
  166. */
  167. boolean anchoringBounds = true;
  /**
   * Package-private no-argument constructor. It performs no set-up: every
   * field keeps its declared initial value, so no pattern, input or group
   * storage is attached.
   */
  Matcher() {}
  173. /**
  174. * All matchers have the state used by Pattern during a match.
  175. */
  176. Matcher(Pattern parent, CharSequence text) {
  177. this.parentPattern = parent;
  178. this.text = text;
  179. // Allocate state storage
  180. int parentGroupCount = Math.max(parent.capturingGroupCount, 10);
  181. groups = new int[parentGroupCount * 2];
  182. locals = new int[parent.localCount];
  183. // Put fields into initial states
  184. reset();
  185. }
  186. /**
  187. * Returns the pattern that is interpreted by this matcher.
  188. *
  189. * @return The pattern for which this matcher was created
  190. */
  191. public Pattern pattern() {
  192. return parentPattern;
  193. }
  194. /**
  195. * Returns the match state of this matcher as a {@link MatchResult}.
  196. * The result is unaffected by subsequent operations performed upon this
  197. * matcher.
  198. *
  199. * @return a <code>MatchResult</code> with the state of this matcher
  200. */
  201. public MatchResult toMatchResult() {
  202. Matcher result = new Matcher(this.parentPattern, text.toString());
  203. result.first = this.first;
  204. result.last = this.last;
  205. result.groups = (int[])(this.groups.clone());
  206. return result;
  207. }
  208. /**
  209. * Changes the <tt>Pattern</tt> that this <tt>Matcher</tt> uses to
  210. * find matches with.
  211. *
  212. * <p> This method causes this matcher to lose information
  213. * about the groups of the last match that occurred. The
  214. * matcher's position in the input is maintained and its
  215. * last append position is unaffected.</p>
  216. *
  217. * @param newPattern
  218. * The new pattern used by this matcher
  219. * @return This matcher
  220. * @throws IllegalArgumentException
  221. * If newPattern is <tt>null</tt>
  222. * @since 1.5
  223. */
  224. public Matcher usePattern(Pattern newPattern) {
  225. if (newPattern == null)
  226. throw new IllegalArgumentException("Pattern cannot be null");
  227. parentPattern = newPattern;
  228. // Reallocate state storage
  229. int parentGroupCount = Math.max(newPattern.capturingGroupCount, 10);
  230. groups = new int[parentGroupCount * 2];
  231. locals = new int[newPattern.localCount];
  232. for (int i = 0; i < groups.length; i++)
  233. groups[i] = -1;
  234. for (int i = 0; i < locals.length; i++)
  235. locals[i] = -1;
  236. return this;
  237. }
  238. /**
  239. * Resets this matcher.
  240. *
  241. * <p> Resetting a matcher discards all of its explicit state information
  242. * and sets its append position to zero. The matcher's region is set to the
  243. * default region, which is its entire character sequence. The anchoring
  244. * and transparency of this matcher's region boundaries are unaffected.
  245. *
  246. * @return This matcher
  247. */
  248. public Matcher reset() {
  249. first = -1;
  250. last = 0;
  251. oldLast = -1;
  252. for(int i=0; i<groups.length; i++)
  253. groups[i] = -1;
  254. for(int i=0; i<locals.length; i++)
  255. locals[i] = -1;
  256. lastAppendPosition = 0;
  257. from = 0;
  258. to = getTextLength();
  259. return this;
  260. }
  261. /**
  262. * Resets this matcher with a new input sequence.
  263. *
  264. * <p> Resetting a matcher discards all of its explicit state information
  265. * and sets its append position to zero. The matcher's region is set to
  266. * the default region, which is its entire character sequence. The
  267. * anchoring and transparency of this matcher's region boundaries are
  268. * unaffected.
  269. *
  270. * @param input
  271. * The new input character sequence
  272. *
  273. * @return This matcher
  274. */
  275. public Matcher reset(CharSequence input) {
  276. text = input;
  277. return reset();
  278. }
  279. /**
  280. * Returns the start index of the previous match. </p>
  281. *
  282. * @return The index of the first character matched
  283. *
  284. * @throws IllegalStateException
  285. * If no match has yet been attempted,
  286. * or if the previous match operation failed
  287. */
  288. public int start() {
  289. if (first < 0)
  290. throw new IllegalStateException("No match available");
  291. return first;
  292. }
  293. /**
  294. * Returns the start index of the subsequence captured by the given group
  295. * during the previous match operation.
  296. *
  297. * <p> <a href="Pattern.html#cg">Capturing groups</a> are indexed from left
  298. * to right, starting at one. Group zero denotes the entire pattern, so
  299. * the expression <i>m.</i><tt>start(0)</tt> is equivalent to
  300. * <i>m.</i><tt>start()</tt>. </p>
  301. *
  302. * @param group
  303. * The index of a capturing group in this matcher's pattern
  304. *
  305. * @return The index of the first character captured by the group,
  306. * or <tt>-1</tt> if the match was successful but the group
  307. * itself did not match anything
  308. *
  309. * @throws IllegalStateException
  310. * If no match has yet been attempted,
  311. * or if the previous match operation failed
  312. *
  313. * @throws IndexOutOfBoundsException
  314. * If there is no capturing group in the pattern
  315. * with the given index
  316. */
  317. public int start(int group) {
  318. if (first < 0)
  319. throw new IllegalStateException("No match available");
  320. if (group > groupCount())
  321. throw new IndexOutOfBoundsException("No group " + group);
  322. return groups[group * 2];
  323. }
  324. /**
  325. * Returns the offset after the last character matched. </p>
  326. *
  327. * @return The offset after the last character matched
  328. *
  329. * @throws IllegalStateException
  330. * If no match has yet been attempted,
  331. * or if the previous match operation failed
  332. */
  333. public int end() {
  334. if (first < 0)
  335. throw new IllegalStateException("No match available");
  336. return last;
  337. }
  338. /**
  339. * Returns the offset after the last character of the subsequence
  340. * captured by the given group during the previous match operation.
  341. *
  342. * <p> <a href="Pattern.html#cg">Capturing groups</a> are indexed from left
  343. * to right, starting at one. Group zero denotes the entire pattern, so
  344. * the expression <i>m.</i><tt>end(0)</tt> is equivalent to
  345. * <i>m.</i><tt>end()</tt>. </p>
  346. *
  347. * @param group
  348. * The index of a capturing group in this matcher's pattern
  349. *
  350. * @return The offset after the last character captured by the group,
  351. * or <tt>-1</tt> if the match was successful
  352. * but the group itself did not match anything
  353. *
  354. * @throws IllegalStateException
  355. * If no match has yet been attempted,
  356. * or if the previous match operation failed
  357. *
  358. * @throws IndexOutOfBoundsException
  359. * If there is no capturing group in the pattern
  360. * with the given index
  361. */
  362. public int end(int group) {
  363. if (first < 0)
  364. throw new IllegalStateException("No match available");
  365. if (group > groupCount())
  366. throw new IndexOutOfBoundsException("No group " + group);
  367. return groups[group * 2 + 1];
  368. }
  369. /**
  370. * Returns the input subsequence matched by the previous match.
  371. *
  372. * <p> For a matcher <i>m</i> with input sequence <i>s</i>,
  373. * the expressions <i>m.</i><tt>group()</tt> and
  374. * <i>s.</i><tt>substring(</tt><i>m.</i><tt>start(),</tt> <i>m.</i><tt>end())</tt>
  375. * are equivalent. </p>
  376. *
  377. * <p> Note that some patterns, for example <tt>a*</tt>, match the empty
  378. * string. This method will return the empty string when the pattern
  379. * successfully matches the empty string in the input. </p>
  380. *
  381. * @return The (possibly empty) subsequence matched by the previous match,
  382. * in string form
  383. *
  384. * @throws IllegalStateException
  385. * If no match has yet been attempted,
  386. * or if the previous match operation failed
  387. */
  388. public String group() {
  389. return group(0);
  390. }
  391. /**
  392. * Returns the input subsequence captured by the given group during the
  393. * previous match operation.
  394. *
  395. * <p> For a matcher <i>m</i>, input sequence <i>s</i>, and group index
  396. * <i>g</i>, the expressions <i>m.</i><tt>group(</tt><i>g</i><tt>)</tt> and
  397. * <i>s.</i><tt>substring(</tt><i>m.</i><tt>start(</tt><i>g</i><tt>),</tt> <i>m.</i><tt>end(</tt><i>g</i><tt>))</tt>
  398. * are equivalent. </p>
  399. *
  400. * <p> <a href="Pattern.html#cg">Capturing groups</a> are indexed from left
  401. * to right, starting at one. Group zero denotes the entire pattern, so
  402. * the expression <tt>m.group(0)</tt> is equivalent to <tt>m.group()</tt>.
  403. * </p>
  404. *
  405. * <p> If the match was successful but the group specified failed to match
  406. * any part of the input sequence, then <tt>null</tt> is returned. Note
  407. * that some groups, for example <tt>(a*)</tt>, match the empty string.
  408. * This method will return the empty string when such a group successfully
  409. * matches the empty string in the input. </p>
  410. *
  411. * @param group
  412. * The index of a capturing group in this matcher's pattern
  413. *
  414. * @return The (possibly empty) subsequence captured by the group
  415. * during the previous match, or <tt>null</tt> if the group
  416. * failed to match part of the input
  417. *
  418. * @throws IllegalStateException
  419. * If no match has yet been attempted,
  420. * or if the previous match operation failed
  421. *
  422. * @throws IndexOutOfBoundsException
  423. * If there is no capturing group in the pattern
  424. * with the given index
  425. */
  426. public String group(int group) {
  427. if (first < 0)
  428. throw new IllegalStateException("No match found");
  429. if (group < 0 || group > groupCount())
  430. throw new IndexOutOfBoundsException("No group " + group);
  431. if ((groups[group*2] == -1) || (groups[group*2+1] == -1))
  432. return null;
  433. return getSubSequence(groups[group * 2], groups[group * 2 + 1]).toString();
  434. }
  435. /**
  436. * Returns the number of capturing groups in this matcher's pattern.
  437. *
  438. * <p> Group zero denotes the entire pattern by convention. It is not
  439. * included in this count.
  440. *
  441. * <p> Any non-negative integer smaller than or equal to the value
  442. * returned by this method is guaranteed to be a valid group index for
  443. * this matcher. </p>
  444. *
  445. * @return The number of capturing groups in this matcher's pattern
  446. */
  447. public int groupCount() {
  448. return parentPattern.capturingGroupCount - 1;
  449. }
  450. /**
  451. * Attempts to match the entire region against the pattern.
  452. *
  453. * <p> If the match succeeds then more information can be obtained via the
  454. * <tt>start</tt>, <tt>end</tt>, and <tt>group</tt> methods. </p>
  455. *
  456. * @return <tt>true</tt> if, and only if, the entire region sequence
  457. * matches this matcher's pattern
  458. */
  459. public boolean matches() {
  460. return match(from, ENDANCHOR);
  461. }
  462. /**
  463. * Attempts to find the next subsequence of the input sequence that matches
  464. * the pattern.
  465. *
  466. * <p> This method starts at the beginning of this matcher's region, or, if
  467. * a previous invocation of the method was successful and the matcher has
  468. * not since been reset, at the first character not matched by the previous
  469. * match.
  470. *
  471. * <p> If the match succeeds then more information can be obtained via the
  472. * <tt>start</tt>, <tt>end</tt>, and <tt>group</tt> methods. </p>
  473. *
  474. * @return <tt>true</tt> if, and only if, a subsequence of the input
  475. * sequence matches this matcher's pattern
  476. */
  477. public boolean find() {
  478. int nextSearchIndex = last;
  479. if (nextSearchIndex == first)
  480. nextSearchIndex++;
  481. // If next search starts before region, start it at region
  482. if (nextSearchIndex < from)
  483. nextSearchIndex = from;
  484. // If next search starts beyond region then it fails
  485. if (nextSearchIndex > to) {
  486. for (int i = 0; i < groups.length; i++)
  487. groups[i] = -1;
  488. return false;
  489. }
  490. return search(nextSearchIndex);
  491. }
  492. /**
  493. * Resets this matcher and then attempts to find the next subsequence of
  494. * the input sequence that matches the pattern, starting at the specified
  495. * index.
  496. *
  497. * <p> If the match succeeds then more information can be obtained via the
  498. * <tt>start</tt>, <tt>end</tt>, and <tt>group</tt> methods, and subsequent
  499. * invocations of the {@link #find()} method will start at the first
  500. * character not matched by this match. </p>
  501. *
  502. * @throws IndexOutOfBoundsException
  503. * If start is less than zero or if start is greater than the
  504. * length of the input sequence.
  505. *
  506. * @return <tt>true</tt> if, and only if, a subsequence of the input
  507. * sequence starting at the given index matches this matcher's
  508. * pattern
  509. */
  510. public boolean find(int start) {
  511. int limit = getTextLength();
  512. if ((start < 0) || (start > limit))
  513. throw new IndexOutOfBoundsException("Illegal start index");
  514. reset();
  515. return search(start);
  516. }
  517. /**
  518. * Attempts to match the input sequence, starting at the beginning of the
  519. * region, against the pattern.
  520. *
  521. * <p> Like the {@link #matches matches} method, this method always starts
  522. * at the beginning of the region; unlike that method, it does not
  523. * require that the entire region be matched.
  524. *
  525. * <p> If the match succeeds then more information can be obtained via the
  526. * <tt>start</tt>, <tt>end</tt>, and <tt>group</tt> methods. </p>
  527. *
  528. * @return <tt>true</tt> if, and only if, a prefix of the input
  529. * sequence matches this matcher's pattern
  530. */
  531. public boolean lookingAt() {
  532. return match(from, NOANCHOR);
  533. }
  534. /**
  535. * Returns a literal replacement <code>String</code> for the specified
  536. * <code>String</code>.
  537. *
  538. * This method produces a <code>String</code> that will work
  539. * use as a literal replacement <code>s</code> in the
  540. * <code>appendReplacement</code> method of the {@link Matcher} class.
  541. * The <code>String</code> produced will match the sequence of characters
  542. * in <code>s</code> treated as a literal sequence. Slashes ('\') and
  543. * dollar signs ('$') will be given no special meaning.
  544. *
  545. * @param s The string to be literalized
  546. * @return A literal string replacement
  547. * @since 1.5
  548. */
  549. public static String quoteReplacement(String s) {
  550. if ((s.indexOf('\\') == -1) && (s.indexOf('$') == -1))
  551. return s;
  552. StringBuffer sb = new StringBuffer();
  553. for (int i=0; i<s.length(); i++) {
  554. char c = s.charAt(i);
  555. if (c == '\\') {
  556. sb.append('\\'); sb.append('\\');
  557. } else if (c == '$') {
  558. sb.append('\\'); sb.append('$');
  559. } else {
  560. sb.append(c);
  561. }
  562. }
  563. return sb.toString();
  564. }
  565. /**
  566. * Implements a non-terminal append-and-replace step.
  567. *
  568. * <p> This method performs the following actions: </p>
  569. *
  570. * <ol>
  571. *
  572. * <li><p> It reads characters from the input sequence, starting at the
  573. * append position, and appends them to the given string buffer. It
  574. * stops after reading the last character preceding the previous match,
  575. * that is, the character at index {@link
  576. * #start()} <tt>-</tt> <tt>1</tt>. </p></li>
  577. *
  578. * <li><p> It appends the given replacement string to the string buffer.
  579. * </p></li>
  580. *
  581. * <li><p> It sets the append position of this matcher to the index of
  582. * the last character matched, plus one, that is, to {@link #end()}.
  583. * </p></li>
  584. *
  585. * </ol>
  586. *
  587. * <p> The replacement string may contain references to subsequences
  588. * captured during the previous match: Each occurrence of
  589. * <tt>$</tt><i>g</i><tt></tt> will be replaced by the result of
  590. * evaluating {@link #group(int) group}<tt>(</tt><i>g</i><tt>)</tt>.
  591. * The first number after the <tt>$</tt> is always treated as part of
  592. * the group reference. Subsequent numbers are incorporated into g if
  593. * they would form a legal group reference. Only the numerals '0'
  594. * through '9' are considered as potential components of the group
  595. * reference. If the second group matched the string <tt>"foo"</tt>, for
  596. * example, then passing the replacement string <tt>"$2bar"</tt> would
  597. * cause <tt>"foobar"</tt> to be appended to the string buffer. A dollar
  598. * sign (<tt>$</tt>) may be included as a literal in the replacement
  599. * string by preceding it with a backslash (<tt>\$</tt>).
  600. *
  601. * <p> Note that backslashes (<tt>\</tt>) and dollar signs (<tt>$</tt>) in
  602. * the replacement string may cause the results to be different than if it
  603. * were being treated as a literal replacement string. Dollar signs may be
  604. * treated as references to captured subsequences as described above, and
  605. * backslashes are used to escape literal characters in the replacement
  606. * string.
  607. *
  608. * <p> This method is intended to be used in a loop together with the
  609. * {@link #appendTail appendTail} and {@link #find find} methods. The
  610. * following code, for example, writes <tt>one dog two dogs in the
  611. * yard</tt> to the standard-output stream: </p>
  612. *
  613. * <blockquote><pre>
  614. * Pattern p = Pattern.compile("cat");
  615. * Matcher m = p.matcher("one cat two cats in the yard");
  616. * StringBuffer sb = new StringBuffer();
  617. * while (m.find()) {
  618. * m.appendReplacement(sb, "dog");
  619. * }
  620. * m.appendTail(sb);
  621. * System.out.println(sb.toString());</pre></blockquote>
  622. *
  623. * @param sb
  624. * The target string buffer
  625. *
  626. * @param replacement
  627. * The replacement string
  628. *
  629. * @return This matcher
  630. *
  631. * @throws IllegalStateException
  632. * If no match has yet been attempted,
  633. * or if the previous match operation failed
  634. *
  635. * @throws IndexOutOfBoundsException
  636. * If the replacement string refers to a capturing group
  637. * that does not exist in the pattern
  638. */
  639. public Matcher appendReplacement(StringBuffer sb, String replacement) {
  640. // If no match, return error
  641. if (first < 0)
  642. throw new IllegalStateException("No match available");
  643. // Process substitution string to replace group references with groups
  644. int cursor = 0;
  645. String s = replacement;
  646. StringBuffer result = new StringBuffer();
  647. while (cursor < replacement.length()) {
  648. char nextChar = replacement.charAt(cursor);
  649. if (nextChar == '\\') {
  650. cursor++;
  651. nextChar = replacement.charAt(cursor);
  652. result.append(nextChar);
  653. cursor++;
  654. } else if (nextChar == '$') {
  655. // Skip past $
  656. cursor++;
  657. // The first number is always a group
  658. int refNum = (int)replacement.charAt(cursor) - '0';
  659. if ((refNum < 0)||(refNum > 9))
  660. throw new IllegalArgumentException(
  661. "Illegal group reference");
  662. cursor++;
  663. // Capture the largest legal group string
  664. boolean done = false;
  665. while (!done) {
  666. if (cursor >= replacement.length()) {
  667. break;
  668. }
  669. int nextDigit = replacement.charAt(cursor) - '0';
  670. if ((nextDigit < 0)||(nextDigit > 9)) { // not a number
  671. break;
  672. }
  673. int newRefNum = (refNum * 10) + nextDigit;
  674. if (groupCount() < newRefNum) {
  675. done = true;
  676. } else {
  677. refNum = newRefNum;
  678. cursor++;
  679. }
  680. }
  681. // Append group
  682. if (group(refNum) != null)
  683. result.append(group(refNum));
  684. } else {
  685. result.append(nextChar);
  686. cursor++;
  687. }
  688. }
  689. // Append the intervening text
  690. sb.append(getSubSequence(lastAppendPosition, first));
  691. // Append the match substitution
  692. sb.append(result.toString());
  693. lastAppendPosition = last;
  694. return this;
  695. }
  696. /**
  697. * Implements a terminal append-and-replace step.
  698. *
  699. * <p> This method reads characters from the input sequence, starting at
  700. * the append position, and appends them to the given string buffer. It is
  701. * intended to be invoked after one or more invocations of the {@link
  702. * #appendReplacement appendReplacement} method in order to copy the
  703. * remainder of the input sequence. </p>
  704. *
  705. * @param sb
  706. * The target string buffer
  707. *
  708. * @return The target string buffer
  709. */
  710. public StringBuffer appendTail(StringBuffer sb) {
  711. sb.append(getSubSequence(lastAppendPosition, getTextLength()).toString());
  712. return sb;
  713. }
  714. /**
  715. * Replaces every subsequence of the input sequence that matches the
  716. * pattern with the given replacement string.
  717. *
  718. * <p> This method first resets this matcher. It then scans the input
  719. * sequence looking for matches of the pattern. Characters that are not
  720. * part of any match are appended directly to the result string; each match
  721. * is replaced in the result by the replacement string. The replacement
  722. * string may contain references to captured subsequences as in the {@link
  723. * #appendReplacement appendReplacement} method.
  724. *
  725. * <p> Note that backslashes (<tt>\</tt>) and dollar signs (<tt>$</tt>) in
  726. * the replacement string may cause the results to be different than if it
  727. * were being treated as a literal replacement string. Dollar signs may be
  728. * treated as references to captured subsequences as described above, and
  729. * backslashes are used to escape literal characters in the replacement
  730. * string.
  731. *
  732. * <p> Given the regular expression <tt>a*b</tt>, the input
  733. * <tt>"aabfooaabfooabfoob"</tt>, and the replacement string
  734. * <tt>"-"</tt>, an invocation of this method on a matcher for that
  735. * expression would yield the string <tt>"-foo-foo-foo-"</tt>.
  736. *
  737. * <p> Invoking this method changes this matcher's state. If the matcher
  738. * is to be used in further matching operations then it should first be
  739. * reset. </p>
  740. *
  741. * @param replacement
  742. * The replacement string
  743. *
  744. * @return The string constructed by replacing each matching subsequence
  745. * by the replacement string, substituting captured subsequences
  746. * as needed
  747. */
  748. public String replaceAll(String replacement) {
  749. reset();
  750. boolean result = find();
  751. if (result) {
  752. StringBuffer sb = new StringBuffer();
  753. do {
  754. appendReplacement(sb, replacement);
  755. result = find();
  756. } while (result);
  757. appendTail(sb);
  758. return sb.toString();
  759. }
  760. return text.toString();
  761. }
  762. /**
  763. * Replaces the first subsequence of the input sequence that matches the
  764. * pattern with the given replacement string.
  765. *
  766. * <p> This method first resets this matcher. It then scans the input
  767. * sequence looking for a match of the pattern. Characters that are not
  768. * part of the match are appended directly to the result string; the match
  769. * is replaced in the result by the replacement string. The replacement
  770. * string may contain references to captured subsequences as in the {@link
  771. * #appendReplacement appendReplacement} method.
  772. *
  773. * <p> Given the regular expression <tt>dog</tt>, the input
  774. * <tt>"zzzdogzzzdogzzz"</tt>, and the replacement string
  775. * <tt>"cat"</tt>, an invocation of this method on a matcher for that
  776. * expression would yield the string <tt>"zzzcatzzzdogzzz"</tt>. </p>
  777. *
  778. * <p> Invoking this method changes this matcher's state. If the matcher
  779. * is to be used in further matching operations then it should first be
  780. * reset. </p>
  781. *
  782. * @param replacement
  783. * The replacement string
   * @return The string constructed by replacing the first matching
   * subsequence by the replacement string, substituting captured
   * subsequences as needed
   * @throws NullPointerException if <code>replacement</code> is null.
  788. */
  789. public String replaceFirst(String replacement) {
  790. if (replacement == null)
  791. throw new NullPointerException("replacement");
  792. StringBuffer sb = new StringBuffer();
  793. reset();
  794. if (find())
  795. appendReplacement(sb, replacement);
  796. appendTail(sb);
  797. return sb.toString();
  798. }
  799. /**
  800. * Sets the limits of this matcher's region. The region is the part of the
  801. * input sequence that will be searched to find a match. Invoking this
  802. * method resets the matcher, and then sets the region to start at the
  803. * index specified by the <code>start</code> parameter and end at the
  804. * index specified by the <code>end</code> parameter.
  805. *
  806. * <p>Depending on the transparency and anchoring being used (see
  807. * {@link #useTransparentBounds useTransparentBounds} and
  808. * {@link #useAnchoringBounds useAnchoringBounds}), certain constructs such
  809. * as anchors may behave differently at or around the boundaries of the
  810. * region.
  811. *
  812. * @param start
  813. * The index to start searching at (inclusive)
  814. * @param end
  815. * The index to end searching at (exclusive)
  816. * @throws IndexOutOfBoundsException
  817. * If start or end is less than zero, if
  818. * start is greater than the length of the input sequence, if
  819. * end is greater than the length of the input sequence, or if
  820. * start is greater than end.
  821. * @return this matcher
  822. * @since 1.5
  823. */
  824. public Matcher region(int start, int end) {
  825. if ((start < 0) || (start > getTextLength()))
  826. throw new IndexOutOfBoundsException("start");
  827. if ((end < 0) || (end > getTextLength()))
  828. throw new IndexOutOfBoundsException("end");
  829. if (start > end)
  830. throw new IndexOutOfBoundsException("start > end");
  831. reset();
  832. from = start;
  833. to = end;
  834. return this;
  835. }
  836. /**
  837. * Reports the start index of this matcher's region. The
  838. * searches this matcher conducts are limited to finding matches
  839. * within {@link #regionStart regionStart} (inclusive) and
  840. * {@link #regionEnd regionEnd} (exclusive).
  841. *
  842. * @return The starting point of this matcher's region
  843. * @since 1.5
  844. */
  845. public int regionStart() {
  846. return from;
  847. }
  848. /**
  849. * Reports the end index (exclusive) of this matcher's region.
  850. * The searches this matcher conducts are limited to finding matches
  851. * within {@link #regionStart regionStart} (inclusive) and
  852. * {@link #regionEnd regionEnd} (exclusive).
  853. *
  854. * @return the ending point of this matcher's region
  855. * @since 1.5
  856. */
  857. public int regionEnd() {
  858. return to;
  859. }
  860. /**
  861. * Queries the transparency of region bounds for this matcher.
  862. *
  863. * <p> This method returns <tt>true</tt> if this matcher uses
  864. * <i>transparent</i> bounds, <tt>false</tt> if it uses <i>opaque</i>
  865. * bounds.
  866. *
  867. * <p> See {@link #useTransparentBounds useTransparentBounds} for a
  868. * description of transparent and opaque bounds.
  869. *
  870. * <p> By default, a matcher uses opaque region boundaries.
  871. *
  872. * @return <tt>true</tt> iff this matcher is using transparent bounds,
  873. * <tt>false</tt> otherwise.
  874. * @see java.util.regex.Matcher#useTransparentBounds(boolean)
  875. * @since 1.5
  876. */
  877. public boolean hasTransparentBounds() {
  878. return transparentBounds;
  879. }
  880. /**
  881. * Sets the transparency of region bounds for this matcher.
  882. *
  883. * <p> Invoking this method with an argument of <tt>true</tt> will set this
  884. * matcher to use <i>transparent</i> bounds. If the boolean
  885. * argument is <tt>false</tt>, then <i>opaque</i> bounds will be used.
  886. *
  887. * <p> Using transparent bounds, the boundaries of this
  888. * matcher's region are transparent to lookahead, lookbehind,
  889. * and boundary matching constructs. Those constructs can see beyond the
  890. * boundaries of the region to see if a match is appropriate.
  891. *
  892. * <p> Using opaque bounds, the boundaries of this matcher's
  893. * region are opaque to lookahead, lookbehind, and boundary matching
  894. * constructs that may try to see beyond them. Those constructs cannot
  895. * look past the boundaries so they will fail to match anything outside
  896. * of the region.
  897. *
  898. * <p> By default, a matcher uses opaque bounds.
  899. *
  900. * @param b a boolean indicating whether to use opaque or transparent
  901. * regions
  902. * @return this matcher
  903. * @see java.util.regex.Matcher#hasTransparentBounds
  904. * @since 1.5
  905. */
  906. public Matcher useTransparentBounds(boolean b) {
  907. transparentBounds = b;
  908. return this;
  909. }
  910. /**
  911. * Queries the anchoring of region bounds for this matcher.
  912. *
  913. * <p> This method returns <tt>true</tt> if this matcher uses
  914. * <i>anchoring</i> bounds, <tt>false</tt> otherwise.
  915. *
  916. * <p> See {@link #useAnchoringBounds useAnchoringBounds} for a
  917. * description of anchoring bounds.
  918. *
  919. * <p> By default, a matcher uses anchoring region boundaries.
  920. *
  921. * @return <tt>true</tt> iff this matcher is using anchoring bounds,
  922. * <tt>false</tt> otherwise.
  923. * @see java.util.regex.Matcher#useAnchoringBounds(boolean)
  924. * @since 1.5
  925. */
  926. public boolean hasAnchoringBounds() {
  927. return anchoringBounds;
  928. }
  929. /**
  930. * Sets the anchoring of region bounds for this matcher.
  931. *
  932. * <p> Invoking this method with an argument of <tt>true</tt> will set this
  933. * matcher to use <i>anchoring</i> bounds. If the boolean
  934. * argument is <tt>false</tt>, then <i>non-anchoring</i> bounds will be
  935. * used.
  936. *
  937. * <p> Using anchoring bounds, the boundaries of this
  938. * matcher's region match anchors such as ^ and $.
  939. *
  940. * <p> Without anchoring bounds, the boundaries of this
  941. * matcher's region will not match anchors such as ^ and $.
  942. *
  943. * <p> By default, a matcher uses anchoring region boundaries.
  944. *
  945. * @param b a boolean indicating whether or not to use anchoring bounds.
  946. * @return this matcher
  947. * @see java.util.regex.Matcher#hasAnchoringBounds
  948. * @since 1.5
  949. */
  950. public Matcher useAnchoringBounds(boolean b) {
  951. anchoringBounds = b;
  952. return this;
  953. }
  954. /**
  955. * <p>Returns the string representation of this matcher. The
  956. * string representation of a <code>Matcher</code> contains information
  957. * that may be useful for debugging. The exact format is unspecified.
  958. *
  959. * @return The string representation of this matcher
  960. * @since 1.5
  961. */
  962. public String toString() {
  963. StringBuffer sb = new StringBuffer();
  964. sb.append("java.util.regex.Matcher");
  965. sb.append("[pattern=" + pattern());
  966. sb.append(" region=");
  967. sb.append(regionStart() + "," + regionEnd());
  968. sb.append(" lastmatch=");
  969. if ((first >= 0) && (group() != null)) {
  970. sb.append(group());
  971. }
  972. sb.append("]");
  973. return sb.toString();
  974. }
  975. /**
  976. * <p>Returns true if the end of input was hit by the search engine in
  977. * the last match operation performed by this matcher.
  978. *
  979. * <p>When this method returns true, then it is possible that more input
  980. * would have changed the result of the last search.
  981. *
  982. * @return true iff the end of input was hit in the last match; false
  983. * otherwise
  984. * @since 1.5
  985. */
  986. public boolean hitEnd() {
  987. return hitEnd;
  988. }
  989. /**
  990. * <p>Returns true if more input could change a positive match into a
  991. * negative one.
  992. *
  993. * <p>If this method returns true, and a match was found, then more
  994. * input could cause the match to be lost. If this method returns false
  995. * and a match was found, then more input might change the match but the
  996. * match won't be lost. If a match was not found, then requireEnd has no
  997. * meaning.
  998. *
  999. * @return true iff more input could change a positive match into a
  1000. * negative one.
  1001. * @since 1.5
  1002. */
  1003. public boolean requireEnd() {
  1004. return requireEnd;
  1005. }
  1006. /**
  1007. * Initiates a search to find a Pattern within the given bounds.
  1008. * The groups are filled with default values and the match of the root
  1009. * of the state machine is called. The state machine will hold the state
  1010. * of the match as it proceeds in this matcher.
  1011. *
   * Matcher.from is not set here, because it is the "hard" boundary
   * of the start of the search, to which anchors are set. The from param
  1014. * is the "soft" boundary of the start of the search, meaning that the
  1015. * regex tries to match at that index but ^ won't match there. Subsequent
  1016. * calls to the search methods start at a new "soft" boundary which is
  1017. * the end of the previous match.
  1018. */
  1019. boolean search(int from) {
  1020. this.hitEnd = false;
  1021. this.requireEnd = false;
  1022. from = from < 0 ? 0 : from;
  1023. this.first = from;
  1024. this.oldLast = oldLast < 0 ? from : oldLast;
  1025. for (int i = 0; i < groups.length; i++)
  1026. groups[i] = -1;
  1027. acceptMode = NOANCHOR;
  1028. boolean result = parentPattern.root.match(this, from, text);
  1029. if (!result)
  1030. this.first = -1;
  1031. this.oldLast = this.last;
  1032. return result;
  1033. }
  1034. /**
  1035. * Initiates a search for an anchored match to a Pattern within the given
  1036. * bounds. The groups are filled with default values and the match of the
  1037. * root of the state machine is called. The state machine will hold the
  1038. * state of the match as it proceeds in this matcher.
  1039. */
  1040. boolean match(int from, int anchor) {
  1041. this.hitEnd = false;
  1042. this.requireEnd = false;
  1043. from = from < 0 ? 0 : from;
  1044. this.first = from;
  1045. this.oldLast = oldLast < 0 ? from : oldLast;
  1046. for (int i = 0; i < groups.length; i++)
  1047. groups[i] = -1;
  1048. acceptMode = anchor;
  1049. boolean result = parentPattern.matchRoot.match(this, from, text);
  1050. if (!result)
  1051. this.first = -1;
  1052. this.oldLast = this.last;
  1053. return result;
  1054. }
  1055. /**
  1056. * Returns the end index of the text.
  1057. *
  1058. * @return the index after the last character in the text
  1059. */
  1060. int getTextLength() {
  1061. return text.length();
  1062. }
  1063. /**
   * Returns the subsequence of this Matcher's input in the specified range.
   *
   * @param beginIndex the beginning index, inclusive
   * @param endIndex the ending index, exclusive
   * @return The requested subsequence of this Matcher's input
  1069. */
  1070. CharSequence getSubSequence(int beginIndex, int endIndex) {
  1071. return text.subSequence(beginIndex, endIndex);
  1072. }
  1073. /**
  1074. * Returns this Matcher's input character at index i.
  1075. *
  1076. * @return A char from the specified index
  1077. */
  1078. char charAt(int i) {
  1079. return text.charAt(i);
  1080. }
  1081. }